| ;***************************************************************************** |
| ;* x86-optimized functions for removegrain filter |
| ;* |
| ;* Copyright (C) 2015 James Darnley |
| ;* |
| ;* This file is part of FFmpeg. |
| ;* |
| ;* FFmpeg is free software; you can redistribute it and/or modify |
| ;* it under the terms of the GNU General Public License as published by |
| ;* the Free Software Foundation; either version 2 of the License, or |
| ;* (at your option) any later version. |
| ;* |
| ;* FFmpeg is distributed in the hope that it will be useful, |
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| ;* GNU General Public License for more details. |
| ;* |
| ;* You should have received a copy of the GNU General Public License along |
| ;* with FFmpeg; if not, write to the Free Software Foundation, Inc., |
| ;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| ;***************************************************************************** |
| |
| ; column: -1 0 +1 |
| ; row -1: a1 a2 a3 |
| ; row 0: a4 c a5 |
| ; row +1: a6 a7 a8 |
| |
| %include "libavutil/x86/x86util.asm" |
| |
| SECTION_RODATA 32 |
| |
| pw_4: times 16 dw 4 |
| pw_8: times 16 dw 8 |
| pw_div9: times 16 dw ((1<<16)+4)/9 |
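; ((1<<16)+4)/9 evaluates to 7282, a fixed-point reciprocal of 9: pmulhuw
; by it computes (x*7282) >> 16, which equals x/9 for the sums of at most
; 9*255+4 that mode 20 feeds it.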
| |
| SECTION .text |
| |
| ;*** Preprocessor helpers |
| |
| %define a1 srcq+stride_n-1 |
| %define a2 srcq+stride_n |
| %define a3 srcq+stride_n+1 |
| %define a4 srcq-1 |
| %define c srcq |
| %define a5 srcq+1 |
| %define a6 srcq+stride_p-1 |
| %define a7 srcq+stride_p |
| %define a8 srcq+stride_p+1 |
| |
| ; %1 dest simd register |
| ; %2 source memory location |
| ; %3 zero location (simd register/memory) |
| %macro LOAD 3 |
| movh %1, %2 |
| punpcklbw %1, %3 |
| %endmacro |
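; LOAD widens 8 packed bytes to 8 words, which is why the word-based
; functions below advance by mmsize/2 pixels per iteration.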
| |
| %macro LOAD_SQUARE 0 |
| movu m1, [a1] |
| movu m2, [a2] |
| movu m3, [a3] |
| movu m4, [a4] |
| movu m0, [c] |
| movu m5, [a5] |
| movu m6, [a6] |
| movu m7, [a7] |
| movu m8, [a8] |
| %endmacro |
| |
| ; %1 zero location (simd register/memory) |
| %macro LOAD_SQUARE_16 1 |
| LOAD m1, [a1], %1 |
| LOAD m2, [a2], %1 |
| LOAD m3, [a3], %1 |
| LOAD m4, [a4], %1 |
| LOAD m0, [c], %1 |
| LOAD m5, [a5], %1 |
| LOAD m6, [a6], %1 |
| LOAD m7, [a7], %1 |
| LOAD m8, [a8], %1 |
| %endmacro |
| |
; %1 data type suffix for pmin/pmax (ub = unsigned byte, sw = signed word)
; %2 simd register to hold minimums
; %3 simd register to hold maximums
; %4 temp location (simd register/memory)
%macro SORT_PAIR 4
    mova %4, %2
    pmin%1 %2, %3
    pmax%1 %3, %4
%endmacro
| |
| %macro SORT_AXIS 0 |
| SORT_PAIR ub, m1, m8, m9 |
| SORT_PAIR ub, m2, m7, m10 |
| SORT_PAIR ub, m3, m6, m11 |
| SORT_PAIR ub, m4, m5, m12 |
| %endmacro |

%macro SORT_AXIS_16 0
| SORT_PAIR sw, m1, m8, m9 |
| SORT_PAIR sw, m2, m7, m10 |
| SORT_PAIR sw, m3, m6, m11 |
| SORT_PAIR sw, m4, m5, m12 |
| %endmacro |
| |
; This is a full bubble sort of m1..m8: afterwards m1 holds the smallest
; value and m8 the largest. The loop doesn't need to do all the iterations;
; it could stop when the right pixels are in the right registers.
| %macro SORT_SQUARE 0 |
| %assign k 7 |
| %rep 7 |
| %assign i 1 |
| %assign j 2 |
| %rep k |
| SORT_PAIR ub, m %+ i , m %+ j , m9 |
| %assign i i+1 |
| %assign j j+1 |
| %endrep |
| %assign k k-1 |
| %endrep |
| %endmacro |
| |
| ; %1 dest simd register |
| ; %2 source (simd register/memory) |
| ; %3 temp simd register |
| %macro ABS_DIFF 3 |
| mova %3, %2 |
| psubusb %3, %1 |
| psubusb %1, %2 |
| por %1, %3 |
| %endmacro |
| |
| ; %1 dest simd register |
| ; %2 source (simd register/memory) |
| ; %3 temp simd register |
| %macro ABS_DIFF_W 3 |
| mova %3, %2 |
| psubusw %3, %1 |
| psubusw %1, %2 |
| por %1, %3 |
| %endmacro |
| |
| ; %1 simd register that holds the "false" values and will hold the result |
| ; %2 simd register that holds the "true" values |
; %3 location (simd register/memory) that holds the mask
| %macro BLEND 3 |
| %if cpuflag(avx2) |
| vpblendvb %1, %1, %2, %3 |
| %else |
| pand %2, %3 |
| pandn %3, %1 |
| por %3, %2 |
| SWAP %1, %3 |
| %endif |
| %endmacro |
| |
;*** Functions
| |
| INIT_XMM sse2 |
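; Mode 1: clip the centre pixel to the full range of its 8 neighbours.
; A scalar sketch of what each iteration computes per pixel (reference
; only, not assembled):
;     dst = av_clip(c, min(a1..a8), max(a1..a8));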
| cglobal rg_fl_mode_1, 4, 5, 3, 0, dst, src, stride, pixels |
| mov r4q, strideq |
| neg r4q |
| %define stride_p strideq |
| %define stride_n r4q |
| |
| .loop: |
| movu m0, [a1] |
| mova m1, m0 |
| |
| movu m2, [a2] |
| pmaxub m0, m2 |
| pminub m1, m2 |
| |
| movu m2, [a3] |
| pmaxub m0, m2 |
| pminub m1, m2 |
| |
| movu m2, [a4] |
| pmaxub m0, m2 |
| pminub m1, m2 |
| |
| movu m2, [a5] |
| pmaxub m0, m2 |
| pminub m1, m2 |
| |
| movu m2, [a6] |
| pmaxub m0, m2 |
| pminub m1, m2 |
| |
| movu m2, [a7] |
| pmaxub m0, m2 |
| pminub m1, m2 |
| |
| movu m2, [a8] |
| pmaxub m0, m2 |
| pminub m1, m2 |
| |
| movu m2, [c] |
| pminub m2, m0 |
| pmaxub m2, m1 |
| |
| movu [dstq], m2 |
| add srcq, mmsize |
| add dstq, mmsize |
| sub pixelsd, mmsize |
| jg .loop |
| RET |
| |
| %if ARCH_X86_64 |
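; Mode 2: clip the centre pixel to the 2nd-smallest and 2nd-largest of its
; neighbours. Scalar sketch (reference only):
;     sort a1..a8 ascending into n1..n8;
;     dst = av_clip(c, n2, n7);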
| cglobal rg_fl_mode_2, 4, 5, 10, 0, dst, src, stride, pixels |
| mov r4q, strideq |
| neg r4q |
| %define stride_p strideq |
| %define stride_n r4q |
| |
| .loop: |
| LOAD_SQUARE |
| SORT_SQUARE |
| |
| CLIPUB m0, m2, m7 |
| |
| movu [dstq], m0 |
| add srcq, mmsize |
| add dstq, mmsize |
| sub pixelsd, mmsize |
| jg .loop |
| RET |
| |
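; Mode 3: as mode 2, but clip to the 3rd-smallest and 3rd-largest:
;     dst = av_clip(c, n3, n6);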
| cglobal rg_fl_mode_3, 4, 5, 10, 0, dst, src, stride, pixels |
| mov r4q, strideq |
| neg r4q |
| %define stride_p strideq |
| %define stride_n r4q |
| |
| .loop: |
| LOAD_SQUARE |
| SORT_SQUARE |
| |
| CLIPUB m0, m3, m6 |
| |
| movu [dstq], m0 |
| add srcq, mmsize |
| add dstq, mmsize |
| sub pixelsd, mmsize |
| jg .loop |
| RET |
| |
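; Mode 4: as mode 2, but clip to the two median values:
;     dst = av_clip(c, n4, n5);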
| cglobal rg_fl_mode_4, 4, 5, 10, 0, dst, src, stride, pixels |
| mov r4q, strideq |
| neg r4q |
| %define stride_p strideq |
| %define stride_n r4q |
| |
| .loop: |
| LOAD_SQUARE |
| SORT_SQUARE |
| |
| CLIPUB m0, m4, m5 |
| |
| movu [dstq], m0 |
| add srcq, mmsize |
| add dstq, mmsize |
| sub pixelsd, mmsize |
| jg .loop |
| RET |
| |
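; Mode 5: clip the centre pixel along each of the four axes through the
; square, (a1,a8), (a2,a7), (a3,a6), (a4,a5), and keep the clip that
; changes it the least. Scalar sketch (reference only):
;     clipN = av_clip(c, min(pairN), max(pairN));
;     cN    = |c - clipN|;
;     dst   = the clipN with the smallest cN;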
| cglobal rg_fl_mode_5, 4, 5, 13, 0, dst, src, stride, pixels |
| mov r4q, strideq |
| neg r4q |
| %define stride_p strideq |
| %define stride_n r4q |
| |
| .loop: |
| LOAD_SQUARE |
| SORT_AXIS |
| |
| mova m9, m0 |
| mova m10, m0 |
| mova m11, m0 |
| mova m12, m0 |
| |
| CLIPUB m9, m1, m8 |
| CLIPUB m10, m2, m7 |
| CLIPUB m11, m3, m6 |
| CLIPUB m12, m4, m5 |
| |
| mova m8, m9 ; clip1 |
| mova m7, m10 ; clip2 |
| mova m6, m11 ; clip3 |
| mova m5, m12 ; clip4 |
| |
| ABS_DIFF m9, m0, m1 ; c1 |
| ABS_DIFF m10, m0, m2 ; c2 |
| ABS_DIFF m11, m0, m3 ; c3 |
| ABS_DIFF m12, m0, m4 ; c4 |
| |
| pminub m9, m10 |
| pminub m9, m11 |
| pminub m9, m12 ; mindiff |
| |
| pcmpeqb m10, m9 |
| pcmpeqb m11, m9 |
| pcmpeqb m12, m9 |
| |
    ; Notice the blend order: the result starts as clip1 and is overwritten
    ; by clip3, then clip2, then clip4 wherever their diff equals mindiff,
    ; so the later ones take precedence on ties.
| BLEND m8, m6, m11 |
| BLEND m8, m7, m10 |
| BLEND m8, m5, m12 |
| |
| movu [dstq], m8 |
| add srcq, mmsize |
| add dstq, mmsize |
| sub pixelsd, mmsize |
| jg .loop |
| RET |
| |
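; Mode 6: like mode 5, but each axis is scored by twice its centre
; difference plus its spread. Scalar sketch (reference only):
;     dN  = max(pairN) - min(pairN);
;     cN  = 2*|c - clipN| + dN;
;     dst = the clipN with the smallest cN;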
| cglobal rg_fl_mode_6, 4, 5, 16, 0, dst, src, stride, pixels |
| mov r4q, strideq |
| neg r4q |
| %define stride_p strideq |
| %define stride_n r4q |
| |
| ; Some register saving suggestions: the zero can be somewhere other than a |
| ; register, the center pixels could be on the stack. |
| |
| pxor m15, m15 |
| .loop: |
| LOAD_SQUARE_16 m15 |
| SORT_AXIS_16 |
| |
| mova m9, m0 |
| mova m10, m0 |
| mova m11, m0 |
| mova m12, m0 |
| CLIPW m9, m1, m8 ; clip1 |
| CLIPW m10, m2, m7 ; clip2 |
| CLIPW m11, m3, m6 ; clip3 |
| CLIPW m12, m4, m5 ; clip4 |
| |
| psubw m8, m1 ; d1 |
| psubw m7, m2 ; d2 |
| psubw m6, m3 ; d3 |
| psubw m5, m4 ; d4 |
| |
| mova m1, m9 |
| mova m2, m10 |
| mova m3, m11 |
| mova m4, m12 |
| ABS_DIFF_W m1, m0, m13 |
| ABS_DIFF_W m2, m0, m14 |
| ABS_DIFF_W m3, m0, m13 |
| ABS_DIFF_W m4, m0, m14 |
| psllw m1, 1 |
| psllw m2, 1 |
| psllw m3, 1 |
| psllw m4, 1 |
| paddw m1, m8 ; c1 |
| paddw m2, m7 ; c2 |
| paddw m3, m6 ; c3 |
| paddw m4, m5 ; c4 |
| ; As the differences (d1..d4) can only be positive, there is no need to |
| ; clip to zero. Also, the maximum positive value is less than 768. |
| |
| pminsw m1, m2 |
| pminsw m1, m3 |
| pminsw m1, m4 |
| |
| pcmpeqw m2, m1 |
| pcmpeqw m3, m1 |
| pcmpeqw m4, m1 |
| |
| BLEND m9, m11, m3 |
| BLEND m9, m10, m2 |
| BLEND m9, m12, m4 |
| packuswb m9, m9 |
| |
| movh [dstq], m9 |
| add srcq, mmsize/2 |
| add dstq, mmsize/2 |
| sub pixelsd, mmsize/2 |
| jg .loop |
| RET |
| |
| ; This is just copy-pasted straight from mode 6 with the left shifts removed. |
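; Mode 7 weights the centre difference and the spread equally:
;     cN = |c - clipN| + dN;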
| cglobal rg_fl_mode_7, 4, 5, 16, 0, dst, src, stride, pixels |
| mov r4q, strideq |
| neg r4q |
| %define stride_p strideq |
| %define stride_n r4q |
| |
| ; Can this be done without unpacking? |
| |
| pxor m15, m15 |
| .loop: |
| LOAD_SQUARE_16 m15 |
| SORT_AXIS_16 |
| |
| mova m9, m0 |
| mova m10, m0 |
| mova m11, m0 |
| mova m12, m0 |
| CLIPW m9, m1, m8 ; clip1 |
| CLIPW m10, m2, m7 ; clip2 |
| CLIPW m11, m3, m6 ; clip3 |
| CLIPW m12, m4, m5 ; clip4 |
| |
| psubw m8, m1 ; d1 |
| psubw m7, m2 ; d2 |
| psubw m6, m3 ; d3 |
| psubw m5, m4 ; d4 |
| |
| mova m1, m9 |
| mova m2, m10 |
| mova m3, m11 |
| mova m4, m12 |
| ABS_DIFF_W m1, m0, m13 |
| ABS_DIFF_W m2, m0, m14 |
| ABS_DIFF_W m3, m0, m13 |
| ABS_DIFF_W m4, m0, m14 |
| paddw m1, m8 ; c1 |
| paddw m2, m7 ; c2 |
| paddw m3, m6 ; c3 |
| paddw m4, m5 ; c4 |
| |
| pminsw m1, m2 |
| pminsw m1, m3 |
| pminsw m1, m4 |
| |
| pcmpeqw m2, m1 |
| pcmpeqw m3, m1 |
| pcmpeqw m4, m1 |
| |
| BLEND m9, m11, m3 |
| BLEND m9, m10, m2 |
| BLEND m9, m12, m4 |
| packuswb m9, m9 |
| |
| movh [dstq], m9 |
| add srcq, mmsize/2 |
| add dstq, mmsize/2 |
| sub pixelsd, mmsize/2 |
| jg .loop |
| RET |
| |
; This is just copy-pasted straight from mode 6, except the doubling moves
; from the centre differences to the line differences.
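; Mode 8 reverses the mode 6 weighting:
;     cN = |c - clipN| + 2*dN;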
| cglobal rg_fl_mode_8, 4, 5, 16, 0, dst, src, stride, pixels |
| mov r4q, strideq |
| neg r4q |
| %define stride_p strideq |
| %define stride_n r4q |
| |
| pxor m15, m15 |
| .loop: |
| LOAD_SQUARE_16 m15 |
| SORT_AXIS_16 |
| |
| mova m9, m0 |
| mova m10, m0 |
| mova m11, m0 |
| mova m12, m0 |
| CLIPW m9, m1, m8 ; clip1 |
| CLIPW m10, m2, m7 ; clip2 |
| CLIPW m11, m3, m6 ; clip3 |
| CLIPW m12, m4, m5 ; clip4 |
| |
| psubw m8, m1 ; d1 |
| psubw m7, m2 ; d2 |
| psubw m6, m3 ; d3 |
| psubw m5, m4 ; d4 |
| psllw m8, 1 |
| psllw m7, 1 |
| psllw m6, 1 |
| psllw m5, 1 |
| |
| mova m1, m9 |
| mova m2, m10 |
| mova m3, m11 |
| mova m4, m12 |
| ABS_DIFF_W m1, m0, m13 |
| ABS_DIFF_W m2, m0, m14 |
| ABS_DIFF_W m3, m0, m13 |
| ABS_DIFF_W m4, m0, m14 |
| paddw m1, m8 ; c1 |
    paddw m2, m7 ; c2
    paddw m3, m6 ; c3
    paddw m4, m5 ; c4
| ; As the differences (d1..d4) can only be positive, there is no need to |
| ; clip to zero. Also, the maximum positive value is less than 768. |
| |
| pminsw m1, m2 |
| pminsw m1, m3 |
| pminsw m1, m4 |
| |
| pcmpeqw m2, m1 |
| pcmpeqw m3, m1 |
| pcmpeqw m4, m1 |
| |
| BLEND m9, m11, m3 |
| BLEND m9, m10, m2 |
| BLEND m9, m12, m4 |
| packuswb m9, m9 |
| |
| movh [dstq], m9 |
| add srcq, mmsize/2 |
| add dstq, mmsize/2 |
| sub pixelsd, mmsize/2 |
| jg .loop |
| RET |
| |
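; Mode 9: clip along the axis with the smallest spread; the centre pixel
; plays no part in choosing it. Scalar sketch (reference only):
;     dN  = max(pairN) - min(pairN);
;     dst = av_clip(c, min(pairN), max(pairN)) for the smallest dN;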
| cglobal rg_fl_mode_9, 4, 5, 13, 0, dst, src, stride, pixels |
| mov r4q, strideq |
| neg r4q |
| %define stride_p strideq |
| %define stride_n r4q |
| |
| .loop: |
| LOAD_SQUARE |
| SORT_AXIS |
| |
| mova m9, m0 |
| mova m10, m0 |
| mova m11, m0 |
| mova m12, m0 |
| CLIPUB m9, m1, m8 ; clip1 |
| CLIPUB m10, m2, m7 ; clip2 |
| CLIPUB m11, m3, m6 ; clip3 |
| CLIPUB m12, m4, m5 ; clip4 |
| |
| psubb m8, m1 ; d1 |
| psubb m7, m2 ; d2 |
| psubb m6, m3 ; d3 |
| psubb m5, m4 ; d4 |
| |
| pminub m8, m7 |
| pminub m8, m6 |
| pminub m8, m5 |
| |
| pcmpeqb m7, m8 |
| pcmpeqb m6, m8 |
| pcmpeqb m5, m8 |
| |
| BLEND m9, m11, m6 |
| BLEND m9, m10, m7 |
| BLEND m9, m12, m5 |
| |
| movu [dstq], m9 |
| add srcq, mmsize |
| add dstq, mmsize |
| sub pixelsd, mmsize |
| jg .loop |
| RET |
| %endif |
| |
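; Mode 10: replace the centre pixel with its closest neighbour in value.
; Scalar sketch (reference only):
;     dst = the aN with the smallest |c - aN|,
;     tested in the order a4, a5, a1, a3, a2, a6, a8, a7 (later ties win).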
| cglobal rg_fl_mode_10, 4, 5, 8, 0, dst, src, stride, pixels |
| mov r4q, strideq |
| neg r4q |
| %define stride_p strideq |
| %define stride_n r4q |
| |
| .loop: |
| movu m0, [c] |
| |
| movu m1, [a4] |
| mova m2, m1 |
| ABS_DIFF m1, m0, m7 |
| |
| movu m3, [a5] ; load pixel |
| mova m4, m3 |
| ABS_DIFF m4, m0, m7 ; absolute difference from center |
| pminub m1, m4 ; mindiff |
| pcmpeqb m4, m1 ; if (difference == mindiff) |
| BLEND m2, m3, m4 ; return pixel |
| |
| movu m5, [a1] |
| mova m6, m5 |
| ABS_DIFF m6, m0, m7 |
| pminub m1, m6 |
| pcmpeqb m6, m1 |
| BLEND m2, m5, m6 |
| |
| movu m3, [a3] |
| mova m4, m3 |
| ABS_DIFF m4, m0, m7 |
| pminub m1, m4 |
| pcmpeqb m4, m1 |
| BLEND m2, m3, m4 |
| |
| movu m5, [a2] |
| mova m6, m5 |
| ABS_DIFF m6, m0, m7 |
| pminub m1, m6 |
| pcmpeqb m6, m1 |
| BLEND m2, m5, m6 |
| |
| movu m3, [a6] |
| mova m4, m3 |
| ABS_DIFF m4, m0, m7 |
| pminub m1, m4 |
| pcmpeqb m4, m1 |
| BLEND m2, m3, m4 |
| |
| movu m5, [a8] |
| mova m6, m5 |
| ABS_DIFF m6, m0, m7 |
| pminub m1, m6 |
| pcmpeqb m6, m1 |
| BLEND m2, m5, m6 |
| |
| movu m3, [a7] |
| mova m4, m3 |
| ABS_DIFF m4, m0, m7 |
| pminub m1, m4 |
| pcmpeqb m4, m1 |
| BLEND m2, m3, m4 |
| |
| movu [dstq], m2 |
| add srcq, mmsize |
| add dstq, mmsize |
| sub pixelsd, mmsize |
| jg .loop |
| RET |
| |
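; Modes 11 and 12: a 3x3 blur with the kernel [1 2 1; 2 4 2; 1 2 1]/16:
;     dst = (4*c + 2*(a2+a4+a5+a7) + a1+a3+a6+a8 + 8) >> 4;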
| cglobal rg_fl_mode_11_12, 4, 5, 7, 0, dst, src, stride, pixels |
| mov r4q, strideq |
| neg r4q |
| %define stride_p strideq |
| %define stride_n r4q |
| |
| pxor m0, m0 |
| .loop: |
| LOAD m1, [c], m0 |
| LOAD m2, [a2], m0 |
| LOAD m3, [a4], m0 |
| LOAD m4, [a5], m0 |
| LOAD m5, [a7], m0 |
| |
| psllw m1, 2 |
| paddw m2, m3 |
| paddw m4, m5 |
| paddw m2, m4 |
| psllw m2, 1 |
| |
| LOAD m3, [a1], m0 |
| LOAD m4, [a3], m0 |
| LOAD m5, [a6], m0 |
| LOAD m6, [a8], m0 |
| paddw m1, m2 |
| paddw m3, m4 |
| paddw m5, m6 |
| paddw m1, m3 |
| paddw m1, m5 |
| |
| paddw m1, [pw_8] |
| psraw m1, 4 |
| |
| packuswb m1, m1 |
| |
| movh [dstq], m1 |
| add srcq, mmsize/2 |
| add dstq, mmsize/2 |
| sub pixelsd, mmsize/2 |
| jg .loop |
| RET |
| |
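; Modes 13 and 14: interpolate from the opposing pair that differs the
; least. Scalar sketch (reference only):
;     dN  = |p - q| for the pairs (a1,a8), (a3,a6), (a2,a7);
;     dst = (p + q + 1) >> 1 for the pair with the smallest dN;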
| cglobal rg_fl_mode_13_14, 4, 5, 8, 0, dst, src, stride, pixels |
| mov r4q, strideq |
| neg r4q |
| %define stride_p strideq |
| %define stride_n r4q |
| |
| .loop: |
| movu m1, [a1] |
| movu m2, [a8] |
| mova m0, m1 |
| pavgb m1, m2 |
| ABS_DIFF m0, m2, m6 |
| |
| movu m3, [a3] |
| movu m4, [a6] |
| mova m5, m3 |
| pavgb m3, m4 |
| ABS_DIFF m5, m4, m7 |
| pminub m0, m5 |
| pcmpeqb m5, m0 |
| BLEND m1, m3, m5 |
| |
| movu m2, [a2] |
| movu m3, [a7] |
| mova m4, m2 |
| pavgb m2, m3 |
| ABS_DIFF m4, m3, m6 |
| pminub m0, m4 |
| pcmpeqb m4, m0 |
| BLEND m1, m2, m4 |
| |
| movu [dstq], m1 |
| add srcq, mmsize |
| add dstq, mmsize |
| sub pixelsd, mmsize |
| jg .loop |
| RET |
| |
| %if ARCH_X86_64 |
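; Modes 15 and 16: clip a weighted average of the two rows into the
; opposing pair that differs the least. Scalar sketch (reference only):
;     avg = (2*(a2 + a7) + a1 + a3 + a6 + a8 + 4) >> 3;
;     dN  = |p - q| for the pairs (a1,a8), (a2,a7), (a3,a6);
;     dst = av_clip(avg, min(pairN), max(pairN)) for the smallest dN;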
| cglobal rg_fl_mode_15_16, 4, 5, 16, 0, dst, src, stride, pixels |
| mov r4q, strideq |
| neg r4q |
| %define stride_p strideq |
| %define stride_n r4q |
| |
| pxor m15, m15 |
| .loop: |
| LOAD_SQUARE_16 m15 |
| |
| mova m9, m1 |
| mova m10, m2 |
| mova m11, m3 |
| ABS_DIFF_W m9, m8, m12 |
| ABS_DIFF_W m10, m7, m13 |
| ABS_DIFF_W m11, m6, m14 |
| pminsw m9, m10 |
| pminsw m9, m11 |
| pcmpeqw m10, m9 |
| pcmpeqw m11, m9 |
| |
| mova m12, m2 |
| mova m13, m1 |
| mova m14, m6 |
| paddw m12, m7 |
| psllw m12, 1 |
| paddw m13, m3 |
| paddw m14, m8 |
| paddw m12, [pw_4] |
| paddw m13, m14 |
| paddw m12, m13 |
| psrlw m12, 3 |
| |
| SORT_PAIR ub, m1, m8, m0 |
| SORT_PAIR ub, m2, m7, m9 |
| SORT_PAIR ub, m3, m6, m14 |
| mova m4, m12 |
| mova m5, m12 |
| CLIPW m4, m1, m8 |
| CLIPW m5, m2, m7 |
| CLIPW m12, m3, m6 |
| |
| BLEND m4, m12, m11 |
| BLEND m4, m5, m10 |
| packuswb m4, m4 |
| |
| movh [dstq], m4 |
| add srcq, mmsize/2 |
| add dstq, mmsize/2 |
| sub pixelsd, mmsize/2 |
| jg .loop |
| RET |
| |
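; Mode 17: clip the centre between the largest pair minimum and the
; smallest pair maximum. Scalar sketch (reference only):
;     l   = max over the four pairs of min(pairN);
;     u   = min over the four pairs of max(pairN);
;     dst = av_clip(c, min(l, u), max(l, u));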
| cglobal rg_fl_mode_17, 4, 5, 9, 0, dst, src, stride, pixels |
| mov r4q, strideq |
| neg r4q |
| %define stride_p strideq |
| %define stride_n r4q |
| |
| .loop: |
| LOAD_SQUARE |
| SORT_AXIS |
| |
| pmaxub m1, m2 |
| pmaxub m3, m4 |
| |
| pminub m8, m7 |
| pminub m5, m6 |
| |
| pmaxub m1, m3 |
| pminub m8, m5 |
| |
| mova m2, m1 |
| pminub m1, m8 |
| pmaxub m8, m2 |
| |
| CLIPUB m0, m1, m8 |
| |
| movu [dstq], m0 |
| add srcq, mmsize |
| add dstq, mmsize |
| sub pixelsd, mmsize |
| jg .loop |
| RET |
| |
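; Mode 18: clip along the axis whose farther endpoint is closest to the
; centre. Scalar sketch (reference only):
;     dN  = max(|p - c|, |q - c|) for each pair (p, q);
;     dst = av_clip(c, min(pairN), max(pairN)) for the smallest dN;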
| cglobal rg_fl_mode_18, 4, 5, 16, 0, dst, src, stride, pixels |
| mov r4q, strideq |
| neg r4q |
| %define stride_p strideq |
| %define stride_n r4q |
| |
| .loop: |
| LOAD_SQUARE |
| |
| mova m9, m1 |
| mova m10, m8 |
| ABS_DIFF m9, m0, m11 |
| ABS_DIFF m10, m0, m12 |
| pmaxub m9, m10 ; m9 = d1 |
| |
| mova m10, m2 |
| mova m11, m7 |
| ABS_DIFF m10, m0, m12 |
| ABS_DIFF m11, m0, m13 |
| pmaxub m10, m11 ; m10 = d2 |
| |
| mova m11, m3 |
| mova m12, m6 |
| ABS_DIFF m11, m0, m13 |
| ABS_DIFF m12, m0, m14 |
| pmaxub m11, m12 ; m11 = d3 |
| |
| mova m12, m4 |
| mova m13, m5 |
| ABS_DIFF m12, m0, m14 |
| ABS_DIFF m13, m0, m15 |
| pmaxub m12, m13 ; m12 = d4 |
| |
| mova m13, m9 |
| pminub m13, m10 |
| pminub m13, m11 |
| pminub m13, m12 ; m13 = mindiff |
| |
| pcmpeqb m10, m13 |
| pcmpeqb m11, m13 |
| pcmpeqb m12, m13 |
| |
| mova m13, m0 |
| mova m14, m1 |
| pminub m1, m8 |
| pmaxub m8, m14 |
| CLIPUB m13, m1, m8 ; m13 = ret...d1 |
| |
| mova m14, m0 |
| mova m15, m3 |
| pminub m3, m6 |
| pmaxub m6, m15 |
| CLIPUB m14, m3, m6 |
| pand m14, m11 |
| pandn m11, m13 |
| por m14, m11 ; m14 = ret...d3 |
| |
| mova m15, m0 |
| mova m1, m2 |
| pminub m2, m7 |
| pmaxub m7, m1 |
| CLIPUB m15, m2, m7 |
| pand m15, m10 |
| pandn m10, m14 |
| por m15, m10 ; m15 = ret...d2 |
| |
| mova m1, m0 |
| mova m2, m4 |
| pminub m4, m5 |
| pmaxub m5, m2 |
| CLIPUB m1, m4, m5 |
| pand m1, m12 |
| pandn m12, m15 |
    por m1, m12 ; m1 = ret...d4
| |
| movu [dstq], m1 |
| add srcq, mmsize |
| add dstq, mmsize |
| sub pixelsd, mmsize |
| jg .loop |
| RET |
| %endif |
| |
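; Mode 19: mean of the 8 neighbours, excluding the centre:
;     dst = (a1+a2+a3+a4+a5+a6+a7+a8 + 4) >> 3;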
| cglobal rg_fl_mode_19, 4, 5, 7, 0, dst, src, stride, pixels |
| mov r4q, strideq |
| neg r4q |
| %define stride_p strideq |
| %define stride_n r4q |
| |
| pxor m0, m0 |
| .loop: |
| LOAD m1, [a1], m0 |
| LOAD m2, [a2], m0 |
| paddw m1, m2 |
| |
| LOAD m3, [a3], m0 |
| LOAD m4, [a4], m0 |
| paddw m3, m4 |
| |
| LOAD m5, [a5], m0 |
| LOAD m6, [a6], m0 |
| paddw m5, m6 |
| |
| LOAD m2, [a7], m0 |
| LOAD m4, [a8], m0 |
| paddw m2, m4 |
| |
| paddw m1, m3 |
| paddw m2, m5 |
| paddw m1, m2 |
| |
| paddw m1, [pw_4] |
| psraw m1, 3 |
| |
| packuswb m1, m1 |
| |
| movh [dstq], m1 |
| add srcq, mmsize/2 |
| add dstq, mmsize/2 |
| sub pixelsd, mmsize/2 |
| jg .loop |
| RET |
| |
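; Mode 20: mean of all 9 pixels, dividing by 9 with the pw_div9 reciprocal:
;     dst = (a1+a2+a3+a4+a5+a6+a7+a8 + c + 4) / 9;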
| cglobal rg_fl_mode_20, 4, 5, 7, 0, dst, src, stride, pixels |
| mov r4q, strideq |
| neg r4q |
| %define stride_p strideq |
| %define stride_n r4q |
| |
| pxor m0, m0 |
| .loop: |
| LOAD m1, [a1], m0 |
| LOAD m2, [a2], m0 |
| paddw m1, m2 |
| |
| LOAD m3, [a3], m0 |
| LOAD m4, [a4], m0 |
| paddw m3, m4 |
| |
| LOAD m5, [a5], m0 |
| LOAD m6, [a6], m0 |
| paddw m5, m6 |
| |
| LOAD m2, [a7], m0 |
| LOAD m4, [a8], m0 |
| paddw m2, m4 |
| |
| LOAD m6, [c], m0 |
| paddw m1, m3 |
| paddw m2, m5 |
| paddw m6, [pw_4] |
| |
| paddw m1, m2 |
| paddw m1, m6 |
| |
| pmulhuw m1, [pw_div9] |
| |
| packuswb m1, m1 |
| |
| movh [dstq], m1 |
| add srcq, mmsize/2 |
| add dstq, mmsize/2 |
| sub pixelsd, mmsize/2 |
| jg .loop |
| RET |
| |
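; Mode 21: clip the centre to the averages of the four opposing pairs; the
; lower bound uses round-down averages, the upper bound round-up (pavgb)
; averages. Scalar sketch (reference only):
;     loN = (p + q) >> 1;  hiN = (p + q + 1) >> 1;  for each pair (p, q)
;     dst = av_clip(c, min(lo1..lo4), max(hi1..hi4));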
| cglobal rg_fl_mode_21, 4, 5, 8, 0, dst, src, stride, pixels |
| mov r4q, strideq |
| neg r4q |
| %define stride_p strideq |
| %define stride_n r4q |
| |
| pxor m0, m0 |
| .loop: |
    movu m1, [a1]
    movu m2, [a8]
    pavgb m7, m1, m2 ; round-up average of the pair
    punpckhbw m3, m1, m0
    punpcklbw m1, m0
    punpckhbw m4, m2, m0
    punpcklbw m2, m0
    paddw m3, m4
    paddw m1, m2
    psrlw m3, 1
    psrlw m1, 1
    packuswb m1, m3 ; round-down average of the pair
| |
| movu m2, [a2] |
| movu m3, [a7] |
| pavgb m6, m2, m3 |
| punpckhbw m4, m2, m0 |
| punpcklbw m2, m0 |
| punpckhbw m5, m3, m0 |
| punpcklbw m3, m0 |
| paddw m4, m5 |
| paddw m2, m3 |
| psrlw m4, 1 |
| psrlw m2, 1 |
| packuswb m2, m4 |
| |
    pminub m1, m2 ; running minimum of the round-down averages
    pmaxub m7, m6 ; running maximum of the round-up averages
| |
| movu m2, [a3] |
| movu m3, [a6] |
| pavgb m6, m2, m3 |
| punpckhbw m4, m2, m0 |
| punpcklbw m2, m0 |
| punpckhbw m5, m3, m0 |
| punpcklbw m3, m0 |
| paddw m4, m5 |
| paddw m2, m3 |
| psrlw m4, 1 |
| psrlw m2, 1 |
| packuswb m2, m4 |
| |
| pminub m1, m2 |
| pmaxub m7, m6 |
| |
| movu m2, [a4] |
| movu m3, [a5] |
| pavgb m6, m2, m3 |
| punpckhbw m4, m2, m0 |
| punpcklbw m2, m0 |
| punpckhbw m5, m3, m0 |
| punpcklbw m3, m0 |
| paddw m4, m5 |
| paddw m2, m3 |
| psrlw m4, 1 |
| psrlw m2, 1 |
| packuswb m2, m4 |
| |
| pminub m1, m2 |
| pmaxub m7, m6 |
| |
| movu m3, [c] |
| CLIPUB m3, m1, m7 |
| |
| movu [dstq], m3 |
| add srcq, mmsize |
| add dstq, mmsize |
| sub pixelsd, mmsize |
| jg .loop |
| RET |
| |
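; Mode 22: like mode 21, but both bounds use the round-up pavgb averages:
;     avgN = (p + q + 1) >> 1;
;     dst  = av_clip(c, min(avg1..avg4), max(avg1..avg4));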
| cglobal rg_fl_mode_22, 4, 5, 8, 0, dst, src, stride, pixels |
| mov r4q, strideq |
| neg r4q |
| %define stride_p strideq |
| %define stride_n r4q |
| |
| .loop: |
| movu m0, [a1] |
| movu m1, [a8] |
| pavgb m0, m1 |
| movu m2, [a2] |
| movu m3, [a7] |
| pavgb m2, m3 |
| movu m4, [a3] |
| movu m5, [a6] |
| pavgb m4, m5 |
| movu m6, [a4] |
| movu m7, [a5] |
| pavgb m6, m7 |
| |
| mova m1, m0 |
| mova m3, m2 |
| mova m5, m4 |
| mova m7, m6 |
| pminub m0, m2 |
| pminub m4, m6 |
| pmaxub m1, m3 |
| pmaxub m5, m7 |
| pminub m0, m4 |
| pmaxub m1, m5 |
| |
| movu m2, [c] |
| CLIPUB m2, m0, m1 |
| |
| movu [dstq], m2 |
| add srcq, mmsize |
| add dstq, mmsize |
| sub pixelsd, mmsize |
| jg .loop |
| RET |
| |
| %if ARCH_X86_64 |
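; Mode 23: pull the centre toward the nearest pair range, by at most that
; pair's spread. Scalar sketch (reference only, read off the code below):
;     linediffN = max(pairN) - min(pairN);
;     d   = max(0, max over N of min(min(pairN) - c, linediffN));
;     u   = max(0, max over N of min(c - max(pairN), linediffN));
;     dst = c + d - u;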
| cglobal rg_fl_mode_23, 4, 5, 16, 0, dst, src, stride, pixels |
| mov r4q, strideq |
| neg r4q |
| %define stride_p strideq |
| %define stride_n r4q |
| |
| pxor m15, m15 |
| .loop: |
| LOAD_SQUARE_16 m15 |
| SORT_AXIS_16 |
| |
| mova m9, m8 |
| mova m10, m7 |
| mova m11, m6 |
| mova m12, m5 |
| psubw m9, m1 ; linediff1 |
| psubw m10, m2 ; linediff2 |
| psubw m11, m3 ; linediff3 |
| psubw m12, m4 ; linediff4 |
| |
| psubw m1, m0 |
| psubw m2, m0 |
| psubw m3, m0 |
| psubw m4, m0 |
| pminsw m1, m9 ; d1 |
| pminsw m2, m10 ; d2 |
| pminsw m3, m11 ; d3 |
| pminsw m4, m12 ; d4 |
| pmaxsw m1, m2 |
| pmaxsw m3, m4 |
| pmaxsw m1, m3 |
| pmaxsw m1, m15 ; d |
| |
| mova m13, m0 |
| mova m14, m0 |
| mova m2, m0 |
| mova m4, m0 |
| psubw m13, m8 |
| psubw m14, m7 |
| psubw m2, m6 |
| psubw m4, m5 |
| pminsw m9, m13 ; u1 |
| pminsw m10, m14 ; u2 |
| pminsw m11, m2 ; u3 |
| pminsw m12, m4 ; u4 |
| pmaxsw m9, m10 |
| pmaxsw m11, m12 |
| pmaxsw m9, m11 |
| pmaxsw m9, m15 ; u |
| |
| paddw m0, m1 |
| psubw m0, m9 |
| packuswb m0, m0 |
| |
| movh [dstq], m0 |
| add srcq, mmsize/2 |
| add dstq, mmsize/2 |
| sub pixelsd, mmsize/2 |
| jg .loop |
| RET |
| |
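; Mode 24: like mode 23, but each correction is additionally capped by the
; distance to the pair's other bound. Scalar sketch (reference only):
;     tdN = min(pairN) - c;    dN = min(tdN, linediffN - tdN);
;     tuN = c - max(pairN);    uN = min(tuN, linediffN - tuN);
;     dst = c + max(0, d1..d4) - max(0, u1..u4);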
| cglobal rg_fl_mode_24, 4, 5, 16, mmsize, dst, src, stride, pixels |
| mov r4q, strideq |
| neg r4q |
| %define stride_p strideq |
| %define stride_n r4q |
| |
| pxor m15, m15 |
| .loop: |
| LOAD_SQUARE_16 m15 |
| mova [rsp], m0 |
| SORT_AXIS_16 |
| |
| mova m9, m8 |
| mova m10, m7 |
| mova m11, m6 |
| mova m12, m5 |
| psubw m9, m1 ; linediff1 |
| psubw m10, m2 ; linediff2 |
| psubw m11, m3 ; linediff3 |
| psubw m12, m4 ; linediff4 |
| |
| psubw m1, [rsp] ; td1 |
| psubw m2, [rsp] ; td2 |
| psubw m3, [rsp] ; td3 |
| psubw m4, [rsp] ; td4 |
| mova m0, m9 |
| mova m13, m10 |
| mova m14, m11 |
| mova m15, m12 |
| psubw m0, m1 |
| psubw m13, m2 |
| psubw m14, m3 |
| psubw m15, m4 |
| pminsw m1, m0 ; d1 |
| pminsw m2, m13 ; d2 |
| pminsw m3, m14 ; d3 |
| pminsw m4, m15 ; d4 |
| pmaxsw m1, m2 |
| pmaxsw m3, m4 |
| |
| mova m0, [rsp] |
| mova m13, [rsp] |
| mova m14, [rsp] |
| mova m15, [rsp] |
| psubw m0, m8 ; tu1 |
| psubw m13, m7 ; tu2 |
| psubw m14, m6 ; tu3 |
| psubw m15, m5 ; tu4 |
| psubw m9, m0 |
| psubw m10, m13 |
| psubw m11, m14 |
| psubw m12, m15 |
| pminsw m9, m0 ; u1 |
| pminsw m10, m13 ; u2 |
| pminsw m11, m14 ; u3 |
| pminsw m12, m15 ; u4 |
| pmaxsw m9, m10 |
| pmaxsw m11, m12 |
| |
| pmaxsw m1, m3 ; d without max(d,0) |
| pmaxsw m9, m11 ; u without max(u,0) |
| pxor m15, m15 |
| pmaxsw m1, m15 |
| pmaxsw m9, m15 |
| |
| mova m0, [rsp] |
| paddw m0, m1 |
| psubw m0, m9 |
| packuswb m0, m0 |
| |
| movh [dstq], m0 |
| add srcq, mmsize/2 |
| add dstq, mmsize/2 |
| sub pixelsd, mmsize/2 |
| jg .loop |
| RET |
| %endif |