;******************************************************************************
;* x86 optimizations for PNG decoding
;*
;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_255

SECTION .text

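; add_bytes_l2: per-byte sum of two rows, dst[i] = src1[i] + src2[i] (mod 256),
; used by the PNG decoder when undoing its byte-oriented filters. A scalar C
; sketch of the same operation (illustrative only, not part of this file):
;
;     for (i = 0; i < w; i++)
;         dst[i] = src1[i] + src2[i];
;
; The macro below does this in three tiers: a wide vector loop covering
; 2*mmsize bytes per iteration, an 8-byte mmx loop on xmm builds, and a
; scalar loop for whatever is left.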
; %1 = nr. of xmm registers used
%macro ADD_BYTES_FN 1
cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
%if ARCH_X86_64
    movsxd             waq, wad
%endif
    xor                 iq, iq

    ; vector loop, 2*mmsize bytes per iteration
    mov                 wq, waq
    and                waq, ~(mmsize*2-1)  ; wa = w rounded down to a multiple of 2*mmsize
    jmp .end_v
.loop_v:
    movu                m0, [src2q+iq]
    movu                m1, [src2q+iq+mmsize]
    paddb               m0, [src1q+iq]
    paddb               m1, [src1q+iq+mmsize]
    movu  [dstq+iq        ], m0
    movu  [dstq+iq+mmsize ], m1
    add                 iq, mmsize*2
.end_v:
    cmp                 iq, waq
    jl .loop_v
%if mmsize == 16
    ; mmx loop to handle the remaining 8..15 bytes
    mov                waq, wq
    and                waq, ~7
    jmp .end_l
.loop_l:
    movq               mm0, [src1q+iq]
    paddb              mm0, [src2q+iq]
    movq  [dstq+iq        ], mm0
    add                 iq, 8
.end_l:
    cmp                 iq, waq
    jl .loop_l
%endif
| |
| ; scalar loop for leftover |
| jmp .end_s |
| .loop_s: |
| mov wab, [src1q+iq] |
| add wab, [src2q+iq] |
| mov [dstq+iq], wab |
| inc iq |
| .end_s: |
| cmp iq, wq |
| jl .loop_s |
| REP_RET |
| %endmacro |

; the mmx version is only needed on x86_32; x86_64 always has sse2
%if ARCH_X86_32
INIT_MMX mmx
ADD_BYTES_FN 0
%endif

INIT_XMM sse2
ADD_BYTES_FN 2

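; The Paeth predictor, as defined in the PNG specification (RFC 2083): given
; the left (a), above (b) and upper-left (c) neighbours of a byte, predict
; p = a + b - c and select whichever of a, b, c is closest to p, breaking
; ties in the order a, b, c:
;
;     p  = a + b - c
;     pa = abs(p - a)    ; == abs(b - c)
;     pb = abs(p - b)    ; == abs(a - c)
;     pc = abs(p - c)    ; == abs(a + b - 2*c)
;     pred = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c
;     dst  = src + pred
;
; The code below computes pa/pb/pc directly from those differences, so the
; intermediate p is never materialized.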
%macro ADD_PAETH_PRED_FN 1
cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr
%if ARCH_X86_64
    movsxd            bppq, bppd
    movsxd              wq, wd
%endif
    lea               endq, [dstq+wq-(mmsize/2-1)] ; last dst position the loop runs for
    sub               topq, dstq                   ; top and src become offsets
    sub               srcq, dstq                   ; relative to dst
    sub               dstq, bppq                   ; step back so [dstq] reads the left neighbour
    pxor                m7, m7                     ; m7 = 0, used to widen bytes to words

    PUSH              dstq
    lea              cntrq, [bppq-1]
    shr              cntrq, 2 + mmsize/16          ; passes per pixel = cntrq+1, mmsize/2 bytes each
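; Each pass of .bpp_loop handles mmsize/2 bytes of every pixel in the row,
; starting at byte offset cntrq*(mmsize/2) within the pixel; cntrq counts
; down to 0, so pixels wider than mmsize/2 bytes take multiple passes.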
.bpp_loop:
    lea               dstq, [dstq+cntrq*(mmsize/2)] ; byte offset within the pixel for this pass
    movh                m0, [dstq]                 ; m0 = left neighbour (a)
    movh                m1, [topq+dstq]            ; m1 = above (b)
    punpcklbw           m0, m7                     ; widen bytes to words
    punpcklbw           m1, m7
    add               dstq, bppq                   ; advance to the first pixel of the row
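; inside .loop: m0 = left (a), m1 = above (b), both as words; m2 picks up
; the upper-left (c) from the previous iteration's b; m7 stays zero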
.loop:
    mova                m2, m1                     ; c = previous b
    movh                m1, [topq+dstq]            ; load new b
    mova                m3, m2
    punpcklbw           m1, m7
    mova                m4, m2
    psubw               m3, m1                     ; m3 = c - b      (pa before abs)
    psubw               m4, m0                     ; m4 = c - a      (pb before abs)
    mova                m5, m3
    paddw               m5, m4                     ; m5 = 2c - a - b (pc before abs)
%if cpuflag(ssse3)
    pabsw               m3, m3                     ; pa = abs(b - c)
    pabsw               m4, m4                     ; pb = abs(a - c)
    pabsw               m5, m5                     ; pc = abs(a + b - 2c)
%else ; !cpuflag(ssse3)
    psubw               m7, m5                     ; emulate pabsw as max(x, -x)
    pmaxsw              m5, m7
    pxor                m6, m6
    pxor                m7, m7
    psubw               m6, m3
    psubw               m7, m4
    pmaxsw              m3, m6
    pmaxsw              m4, m7
    pxor                m7, m7                     ; restore m7 = 0
%endif ; cpuflag(ssse3)
    mova                m6, m4
    pminsw              m6, m5                     ; m6 = min(pb, pc)
    pcmpgtw             m3, m6                     ; m3 = pa > min(pb, pc), i.e. "not a"
    pcmpgtw             m4, m5                     ; m4 = pb > pc
    mova                m6, m4
    pand                m4, m3                     ; m4 = mask for selecting c
    pandn               m6, m3                     ; m6 = mask for selecting b
    pandn               m3, m0                     ; m3 = a where a is selected
    movh                m0, [srcq+dstq]            ; load the filtered source bytes
    pand                m6, m1                     ; m6 = b where b is selected
    pand                m2, m4                     ; m2 = c where c is selected
    punpcklbw           m0, m7
    paddw               m0, m6                     ; dst = src + selected predictor
    paddw               m3, m2
    paddw               m0, m3
    pand                m0, [pw_255]               ; mod-256 wrap (packuswb would saturate)
    mova                m3, m0
    packuswb            m3, m3
    movh            [dstq], m3
    add               dstq, bppq
    cmp               dstq, endq
    jle .loop                                      ; <=: with bpp=3 the last pixel sits exactly at endq

    mov               dstq, [rsp]                  ; rewind dst for the next pass
    dec              cntrq
    jge .bpp_loop
    POP               dstq
    RET
%endmacro

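; two variants: the ssse3 one gets pabsw for the absolute differences; both
; run on mmx registers (INIT_MMX), handling mmsize/2 = 4 bytes per step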
INIT_MMX mmxext
ADD_PAETH_PRED_FN 0

INIT_MMX ssse3
ADD_PAETH_PRED_FN 0