;*****************************************************************************
;* x86-optimized functions for colorspace filter
;*
;* Copyright (C) 2016 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_1: times 8 dw 1
pw_2: times 8 dw 2
pw_4: times 8 dw 4
pw_8: times 8 dw 8
pw_16: times 8 dw 16
pw_64: times 8 dw 64
pw_128: times 8 dw 128
pw_256: times 8 dw 256
pw_512: times 8 dw 512
pw_1023: times 8 dw 1023
pw_1024: times 8 dw 1024
pw_2048: times 8 dw 2048
pw_4095: times 8 dw 4095
pw_8192: times 8 dw 8192
pw_16384: times 8 dw 16384
pd_1: times 4 dd 1
pd_2: times 4 dd 2
pd_128: times 4 dd 128
pd_512: times 4 dd 512
pd_2048: times 4 dd 2048
pd_8192: times 4 dd 8192
pd_32768: times 4 dd 32768
pd_131072: times 4 dd 131072

SECTION .text

; void ff_yuv2yuv_420p8to8_sse2(uint8_t *yuv_out[3], ptrdiff_t yuv_out_stride[3],
; uint8_t *yuv_in[3], ptrdiff_t yuv_in_stride[3],
; int w, int h, const int16_t yuv2yuv_coeffs[3][3][8],
; const int16_t yuv_offset[2][8])
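;
; A scalar model of the per-pixel math implemented below (a sketch for
; orientation only; sh, rnd and the offsets mirror the %%-assigns in the
; macro, and cyy/cyu/... name the yuv2yuv_coeffs entries):
;   sh = 14 + in_depth - out_depth, rnd = 1 << (sh - 1)
;   y_out = (cyy * (y - yoff_in) + cyu * (u - uvoff_in) + cyv * (v - uvoff_in)
;            + (yoff_out << sh) + rnd) >> sh
;   u_out = (cuu * (u - uvoff_in) + cuv * (v - uvoff_in)
;            + (uvoff_out << sh) + rnd) >> sh
;   v_out = (cvu * (u - uvoff_in) + cvv * (v - uvoff_in)
;            + (uvoff_out << sh) + rnd) >> sh
; results are then clamped to the output range (packuswb or CLIPW); chroma
; output intentionally takes no luma input (coeffs [1][0] and [2][0] are
; never loaded)
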
%if ARCH_X86_64
%macro YUV2YUV_FN 4 ; in_bitdepth, out_bitdepth, log2_chroma_w (horiz), log2_chroma_h (vert)
%assign %%sh (14 + %1 - %2)
%assign %%rnd (1 << (%%sh - 1))
%assign %%uvinoff (128 << (%1 - 8))
%assign %%uvoutoff (128 << (%2 - 8))
%if %3 == 0
%assign %%ss 444
%elif %4 == 0
%assign %%ss 422
%else ; %4 == 1
%assign %%ss 420
%endif ; %3/%4
%if %2 != 8
%assign %%maxval (1 << %2) - 1
%endif ; %2 != 8
%assign %%ypsh %%sh - 1
%if %%ypsh > 14
%assign %%yoffsh %%ypsh - 13
%assign %%ypsh 14
%else
%assign %%yoffsh 1
%endif
%assign %%yprnd (1 << (%%yoffsh - 1))
%assign %%ypmul (1 << %%ypsh)
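; the y scaling is split so that both pmaddwd operands fit in signed 16 bits:
; ypmul = 1 << ypsh with ypsh capped at 14, and the output y offset is
; pre-shifted left by yoffsh, keeping ypsh + yoffsh == sh overall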
cglobal yuv2yuv_ %+ %%ss %+ p%1to%2, 8, 14, 16, 0 - (4 * mmsize), \
yo, yos, yi, yis, w, h, c, yoff, ui, vi, uo, vo
%if %3 == 1
inc wd
sar wd, 1
%if %4 == 1
inc hd
sar hd, 1
%endif ; %4 == 1
%endif ; %3 == 1
mov [rsp+3*mmsize+0], wd
mov [rsp+3*mmsize+4], hd
mova m10, [cq] ; cy
pxor m11, m11
mova m12, [pd_ %+ %%uvoutoff]
pslld m12, %%sh
paddd m12, [pd_ %+ %%rnd] ; (uv_off_out << sh) + rnd
mova m13, [pw_ %+ %%uvinoff]
mova m14, [yoffq+ 0] ; y_off_in
mova m15, [yoffq+16] ; y_off_out
%if %%yoffsh != 0
psllw m15, %%yoffsh
%endif
paddw m15, [pw_ %+ %%yprnd]
punpcklwd m10, m15 ; cy, y_off_out
mova m15, [pw_ %+ %%ypmul]
movh m0, [cq+1*16] ; cyu
movh m1, [cq+2*16] ; cyv
movh m2, [cq+4*16] ; cuu
movh m3, [cq+5*16] ; cuv
movh m4, [cq+7*16] ; cvu
movh m5, [cq+8*16] ; cvv
punpcklwd m0, m1
punpcklwd m2, m3
punpcklwd m4, m5
mova [rsp+0*mmsize], m0
mova [rsp+1*mmsize], m2
mova [rsp+2*mmsize], m4
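; stack layout from here on: [rsp+0*mmsize] = (cyu, cyv) word pairs,
; [rsp+1*mmsize] = (cuu, cuv), [rsp+2*mmsize] = (cvu, cvv),
; [rsp+3*mmsize+0]/[+4] = w/h loop counters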
DEFINE_ARGS yo, yos, yi, yis, ui, vi, uo, vo, uis, vis, uos, vos, x, tmp
mov uiq, [yiq+gprsize*1]
mov viq, [yiq+gprsize*2]
mov yiq, [yiq+gprsize*0]
mov uoq, [yoq+gprsize*1]
mov voq, [yoq+gprsize*2]
mov yoq, [yoq+gprsize*0]
mov uisq, [yisq+gprsize*1]
mov visq, [yisq+gprsize*2]
mov yisq, [yisq+gprsize*0]
mov uosq, [yosq+gprsize*1]
mov vosq, [yosq+gprsize*2]
mov yosq, [yosq+gprsize*0]
.loop_v:
xor xq, xq
.loop_h:
%if %4 == 1
lea tmpq, [yiq+yisq]
%endif ; %4 == 1
%if %1 == 8
movu m0, [yiq+xq*(1<<%3)] ; y00/01
%if %4 == 1
movu m2, [tmpq+xq*2] ; y10/11
%endif ; %4 == 1
%if %3 == 1
movh m4, [uiq+xq] ; u
movh m5, [viq+xq] ; v
%else ; %3 != 1
movu m4, [uiq+xq] ; u
movu m5, [viq+xq] ; v
%endif ; %3 ==/!= 1
punpckhbw m1, m0, m11
punpcklbw m0, m11
%if %4 == 1
punpckhbw m3, m2, m11
punpcklbw m2, m11
%endif ; %4 == 1
%if %3 == 0
punpckhbw m2, m4, m11
punpckhbw m3, m5, m11
%endif ; %3 == 0
punpcklbw m4, m11
punpcklbw m5, m11
%else ; %1 != 8
movu m0, [yiq+xq*(2<<%3)] ; y00/01
movu m1, [yiq+xq*(2<<%3)+mmsize] ; y00/01
%if %4 == 1
movu m2, [tmpq+xq*4] ; y10/11
movu m3, [tmpq+xq*4+mmsize] ; y10/11
%endif ; %4 == 1
movu m4, [uiq+xq*2] ; u
movu m5, [viq+xq*2] ; v
%if %3 == 0
movu m2, [uiq+xq*2+mmsize]
movu m3, [viq+xq*2+mmsize]
%endif ; %3 == 0
%endif ; %1 ==/!= 8
psubw m0, m14
psubw m1, m14
%if %4 == 1
psubw m2, m14
psubw m3, m14
%endif ; %4 == 1
psubw m4, m13
psubw m5, m13
%if %3 == 0
psubw m2, m13
psubw m3, m13
%endif ; %3 == 0
SBUTTERFLY wd, 4, 5, 6
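; m4/m5 now hold interleaved (u, v) word pairs for the pmaddwd dot products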
pmaddwd m6, m4, [rsp+1*mmsize]
pmaddwd m7, m5, [rsp+1*mmsize]
%if %3 == 0
SBUTTERFLY wd, 2, 3, 8
pmaddwd m8, m2, [rsp+1*mmsize]
pmaddwd m9, m3, [rsp+1*mmsize]
%else ; %3 != 0
pmaddwd m8, m4, [rsp+2*mmsize]
pmaddwd m9, m5, [rsp+2*mmsize]
%endif ; %3 ==/!= 0
paddd m6, m12
paddd m7, m12
paddd m8, m12
paddd m9, m12
psrad m6, %%sh
psrad m7, %%sh
psrad m8, %%sh
psrad m9, %%sh
packssdw m6, m7
packssdw m8, m9
%if %2 == 8
packuswb m6, m8
%if %3 == 0
movu [uoq+xq], m6
%else ; %3 != 0
movh [uoq+xq], m6
movhps [voq+xq], m6
%endif ; %3 ==/!= 0
%else ; %2 != 8
CLIPW m6, m11, [pw_ %+ %%maxval]
CLIPW m8, m11, [pw_ %+ %%maxval]
movu [uoq+xq*2], m6
%if %3 == 0
movu [uoq+xq*2+mmsize], m8
%else ; %3 != 0
movu [voq+xq*2], m8
%endif ; %3 ==/!= 0
%endif ; %2 ==/!= 8
%if %3 == 0
pmaddwd m6, m4, [rsp+2*mmsize]
pmaddwd m7, m5, [rsp+2*mmsize]
pmaddwd m8, m2, [rsp+2*mmsize]
pmaddwd m9, m3, [rsp+2*mmsize]
paddd m6, m12
paddd m7, m12
paddd m8, m12
paddd m9, m12
psrad m6, %%sh
psrad m7, %%sh
psrad m8, %%sh
psrad m9, %%sh
packssdw m6, m7
packssdw m8, m9
%if %2 == 8
packuswb m6, m8
movu [voq+xq], m6
%else ; %2 != 8
CLIPW m6, m11, [pw_ %+ %%maxval]
CLIPW m8, m11, [pw_ %+ %%maxval]
movu [voq+xq*2], m6
movu [voq+xq*2+mmsize], m8
%endif ; %2 ==/!= 8
%endif ; %3 == 0
pmaddwd m4, [rsp+0*mmsize]
pmaddwd m5, [rsp+0*mmsize] ; uv_val
%if %3 == 0
pmaddwd m2, [rsp+0*mmsize]
pmaddwd m3, [rsp+0*mmsize]
%endif ; %3 == 0
; unpack y pixels with m15 (shifted round + offset), then multiply
; by m10, add uv pixels, and we're done!
%if %3 == 1
punpckhdq m8, m4, m4
punpckldq m4, m4
punpckhdq m9, m5, m5
punpckldq m5, m5
%else ; %3 != 1
SWAP 8, 5, 2
SWAP 3, 9
%endif ; %3 ==/!= 1
%if %4 == 1
punpckhwd m6, m2, m15
punpcklwd m2, m15
punpckhwd m7, m3, m15
punpcklwd m3, m15
pmaddwd m2, m10
pmaddwd m6, m10
pmaddwd m3, m10
pmaddwd m7, m10
paddd m2, m4
paddd m6, m8
paddd m3, m5
paddd m7, m9
psrad m2, %%sh
psrad m6, %%sh
psrad m3, %%sh
psrad m7, %%sh
packssdw m2, m6
packssdw m3, m7
lea tmpq, [yoq+yosq]
%if %2 == 8
packuswb m2, m3
movu [tmpq+xq*2], m2
%else ; %2 != 8
CLIPW m2, m11, [pw_ %+ %%maxval]
CLIPW m3, m11, [pw_ %+ %%maxval]
movu [tmpq+xq*4], m2
movu [tmpq+xq*4+mmsize], m3
%endif ; %2 ==/!= 8
%endif ; %4 == 1
punpckhwd m6, m0, m15
punpcklwd m0, m15
punpckhwd m7, m1, m15
punpcklwd m1, m15
pmaddwd m0, m10
pmaddwd m6, m10
pmaddwd m1, m10
pmaddwd m7, m10
paddd m0, m4
paddd m6, m8
paddd m1, m5
paddd m7, m9
psrad m0, %%sh
psrad m6, %%sh
psrad m1, %%sh
psrad m7, %%sh
packssdw m0, m6
packssdw m1, m7
%if %2 == 8
packuswb m0, m1
movu [yoq+xq*(1<<%3)], m0
%else ; %2 != 8
CLIPW m0, m11, [pw_ %+ %%maxval]
CLIPW m1, m11, [pw_ %+ %%maxval]
movu [yoq+xq*(2<<%3)], m0
movu [yoq+xq*(2<<%3)+mmsize], m1
%endif ; %2 ==/!= 8
add xq, mmsize >> %3
cmp xd, dword [rsp+3*mmsize+0]
jl .loop_h
%if %4 == 1
lea yiq, [yiq+yisq*2]
lea yoq, [yoq+yosq*2]
%else ; %4 != 1
add yiq, yisq
add yoq, yosq
%endif ; %4 ==/!= 1
add uiq, uisq
add viq, visq
add uoq, uosq
add voq, vosq
dec dword [rsp+3*mmsize+4]
jg .loop_v
RET
%endmacro
%macro YUV2YUV_FNS 2 ; ss_w, ss_h
YUV2YUV_FN 8, 8, %1, %2
YUV2YUV_FN 10, 8, %1, %2
YUV2YUV_FN 12, 8, %1, %2
YUV2YUV_FN 8, 10, %1, %2
YUV2YUV_FN 10, 10, %1, %2
YUV2YUV_FN 12, 10, %1, %2
YUV2YUV_FN 8, 12, %1, %2
YUV2YUV_FN 10, 12, %1, %2
YUV2YUV_FN 12, 12, %1, %2
%endmacro
INIT_XMM sse2
YUV2YUV_FNS 0, 0
YUV2YUV_FNS 1, 0
YUV2YUV_FNS 1, 1

; void ff_yuv2rgb_420p8_sse2(int16_t *rgb[3], ptrdiff_t rgb_stride,
; uint8_t *yuv[3], ptrdiff_t yuv_stride[3],
; int w, int h, const int16_t yuv2rgb_coeffs[3][3][8],
; const int16_t yuv_offset[8])
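;
; A scalar model of the conversion below (sketch only; sh, rnd and uvoff
; mirror the %%-assigns, and cy/crv/... name the yuv2rgb_coeffs entries):
;   sh = depth - 1, rnd = 1 << (sh - 1), uvoff = 1 << (depth - 1)
;   y' = cy * (y - yoff) + rnd
;   r = (y' + crv * (v - uvoff)) >> sh
;   g = (y' + cgu * (u - uvoff) + cgv * (v - uvoff)) >> sh
;   b = (y' + cbu * (u - uvoff)) >> sh
; r/g/b are written as int16_t intermediates, saturated by packssdw
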
%macro YUV2RGB_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert)
%assign %%sh (%1 - 1)
%assign %%rnd (1 << (%%sh - 1))
%assign %%uvoff (1 << (%1 - 1))
%if %2 == 0
%assign %%ss 444
%elif %3 == 0
%assign %%ss 422
%else ; %3 == 1
%assign %%ss 420
%endif ; %2/%3
cglobal yuv2rgb_ %+ %%ss %+ p%1, 8, 14, 16, 0 - 8 * mmsize, \
rgb, rgbs, yuv, yuvs, ww, h, c, yoff
%if %2 == 1
inc wwd
sar wwd, 1
%endif ; %2 == 1
%if %3 == 1
inc hd
sar hd, 1
%endif ; %3 == 1
pxor m11, m11
mova m15, [yoffq] ; yoff
movh m14, [cq+ 0] ; cy
movh m10, [cq+ 32] ; crv
movh m13, [cq+112] ; cbu
movh m12, [cq+ 64] ; cgu
movh m9, [cq+ 80] ; cgv
punpcklwd m14, [pw_ %+ %%rnd] ; cy, rnd
punpcklwd m13, m11 ; cbu, 0
punpcklwd m11, m10 ; 0, crv
punpcklwd m12, m9 ; cgu, cgv
mova [rsp+0*mmsize], m11
mova [rsp+1*mmsize], m12
mova [rsp+2*mmsize], m13
mova [rsp+3*mmsize], m14
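; stack layout: [rsp+0*mmsize] = (0, crv) word pairs, [rsp+1*mmsize] =
; (cgu, cgv), [rsp+2*mmsize] = (cbu, 0), [rsp+3*mmsize] = (cy, rnd); for
; 4:2:0, [rsp+4..7*mmsize] later cache the bottom row's cy*(y - yoff) + rnd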
pxor m14, m14
DEFINE_ARGS r, rgbs, y, ys, ww, h, g, b, u, v, us, vs, x, tmp
mov gq, [rq+1*gprsize]
mov bq, [rq+2*gprsize]
mov rq, [rq+0*gprsize]
mov uq, [yq+1*gprsize]
mov vq, [yq+2*gprsize]
mov yq, [yq+0*gprsize]
mov usq, [ysq+1*gprsize]
mov vsq, [ysq+2*gprsize]
mov ysq, [ysq+0*gprsize]
.loop_v:
xor xq, xq
.loop_h:
%if %3 == 1
lea tmpq, [yq+ysq]
%endif ; %3 == 1
%if %1 == 8
movu m0, [yq+xq*(1<<%2)]
%if %3 == 1
movu m2, [tmpq+xq*2]
%endif ; %3 == 1
%if %2 == 1
movh m4, [uq+xq]
movh m5, [vq+xq]
%else ; %2 != 1
movu m4, [uq+xq]
movu m5, [vq+xq]
%endif ; %2 ==/!= 1
punpckhbw m1, m0, m14
punpcklbw m0, m14
%if %3 == 1
punpckhbw m3, m2, m14
punpcklbw m2, m14
%endif ; %3 == 1
%if %2 == 0
punpckhbw m2, m4, m14
punpckhbw m3, m5, m14
%endif ; %2 == 0
punpcklbw m4, m14
punpcklbw m5, m14
%else ; %1 != 8
movu m0, [yq+xq*(2<<%2)]
movu m1, [yq+xq*(2<<%2)+mmsize]
%if %3 == 1
movu m2, [tmpq+xq*4]
movu m3, [tmpq+xq*4+mmsize]
%endif ; %3 == 1
movu m4, [uq+xq*2]
movu m5, [vq+xq*2]
%if %2 == 0
movu m2, [uq+xq*2+mmsize]
movu m3, [vq+xq*2+mmsize]
%endif ; %2 == 0
%endif ; %1 ==/!= 8
psubw m0, m15
psubw m1, m15
%if %3 == 1
psubw m2, m15
psubw m3, m15
%endif ; %3 == 1
psubw m4, [pw_ %+ %%uvoff]
psubw m5, [pw_ %+ %%uvoff]
SBUTTERFLY wd, 4, 5, 6
%if %2 == 0
psubw m2, [pw_ %+ %%uvoff]
psubw m3, [pw_ %+ %%uvoff]
SBUTTERFLY wd, 2, 3, 6
%endif ; %2 == 0
; calculate y+rnd full-resolution [0-3,6-9]
punpckhwd m6, m0, [pw_1] ; y, 1
punpcklwd m0, [pw_1] ; y, 1
punpckhwd m7, m1, [pw_1] ; y, 1
punpcklwd m1, [pw_1] ; y, 1
pmaddwd m0, [rsp+3*mmsize]
pmaddwd m6, [rsp+3*mmsize]
pmaddwd m1, [rsp+3*mmsize]
pmaddwd m7, [rsp+3*mmsize]
%if %3 == 1
punpckhwd m8, m2, [pw_1] ; y, 1
punpcklwd m2, [pw_1] ; y, 1
punpckhwd m9, m3, [pw_1] ; y, 1
punpcklwd m3, [pw_1] ; y, 1
pmaddwd m2, [rsp+3*mmsize]
pmaddwd m8, [rsp+3*mmsize]
pmaddwd m3, [rsp+3*mmsize]
pmaddwd m9, [rsp+3*mmsize]
mova [rsp+4*mmsize], m2
mova [rsp+5*mmsize], m8
mova [rsp+6*mmsize], m3
mova [rsp+7*mmsize], m9
%endif ; %3 == 1
; calculate r offsets (un-subsampled, then duplicate)
pmaddwd m10, m4, [rsp+0*mmsize]
%if %2 == 1
pmaddwd m12, m5, [rsp+0*mmsize]
punpckhdq m11, m10, m10
punpckldq m10, m10
punpckhdq m13, m12, m12
punpckldq m12, m12
%else ; %2 != 1
pmaddwd m11, m5, [rsp+0*mmsize]
pmaddwd m12, m2, [rsp+0*mmsize]
pmaddwd m13, m3, [rsp+0*mmsize]
%endif ; %2 ==/!= 1
%if %3 == 1
paddd m2, m10, [rsp+4*mmsize]
paddd m3, m11, [rsp+5*mmsize]
paddd m8, m12, [rsp+6*mmsize]
paddd m9, m13, [rsp+7*mmsize]
%endif
paddd m10, m0
paddd m11, m6
paddd m12, m1
paddd m13, m7
%if %3 == 1
psrad m2, %%sh
psrad m3, %%sh
psrad m8, %%sh
psrad m9, %%sh
%endif ; %3 == 1
psrad m10, %%sh
psrad m11, %%sh
psrad m12, %%sh
psrad m13, %%sh
%if %3 == 1
lea tmpq, [rq+rgbsq*2]
packssdw m2, m3
packssdw m8, m9
mova [tmpq+xq*4], m2
mova [tmpq+xq*4+mmsize], m8
%endif ; %3 == 1
packssdw m10, m11
packssdw m12, m13
mova [rq+xq*(2 << %2)], m10
mova [rq+xq*(2 << %2)+mmsize], m12
; calculate g offsets (un-subsampled, then duplicate)
pmaddwd m10, m4, [rsp+1*mmsize]
%if %2 == 1
pmaddwd m12, m5, [rsp+1*mmsize]
punpckhdq m11, m10, m10
punpckldq m10, m10
punpckhdq m13, m12, m12
punpckldq m12, m12
%else ; %2 != 1
pmaddwd m11, m5, [rsp+1*mmsize]
pmaddwd m12, m2, [rsp+1*mmsize]
pmaddwd m13, m3, [rsp+1*mmsize]
%endif ; %2 ==/!= 1
%if %3 == 1
paddd m2, m10, [rsp+4*mmsize]
paddd m3, m11, [rsp+5*mmsize]
paddd m8, m12, [rsp+6*mmsize]
paddd m9, m13, [rsp+7*mmsize]
%endif ; %3 == 1
paddd m10, m0
paddd m11, m6
paddd m12, m1
paddd m13, m7
%if %3 == 1
psrad m2, %%sh
psrad m3, %%sh
psrad m8, %%sh
psrad m9, %%sh
%endif ; %3 == 1
psrad m10, %%sh
psrad m11, %%sh
psrad m12, %%sh
psrad m13, %%sh
%if %3 == 1
lea tmpq, [gq+rgbsq*2]
packssdw m2, m3
packssdw m8, m9
mova [tmpq+xq*4], m2
mova [tmpq+xq*4+mmsize], m8
%endif ; %3 == 1
packssdw m10, m11
packssdw m12, m13
mova [gq+xq*(2 << %2)], m10
mova [gq+xq*(2 << %2)+mmsize], m12
; calculate b offsets (un-subsampled, then duplicate)
pmaddwd m4, [rsp+2*mmsize]
pmaddwd m5, [rsp+2*mmsize]
%if %2 == 1
punpckhdq m2, m4, m4
punpckldq m4, m4
punpckhdq m3, m5, m5
punpckldq m5, m5
%else ; %2 != 1
pmaddwd m2, [rsp+2*mmsize]
pmaddwd m3, [rsp+2*mmsize]
SWAP 2, 5
%endif ; %2 ==/!= 1
paddd m0, m4
paddd m6, m2
paddd m1, m5
paddd m7, m3
%if %3 == 1
paddd m4, [rsp+4*mmsize]
paddd m2, [rsp+5*mmsize]
paddd m5, [rsp+6*mmsize]
paddd m3, [rsp+7*mmsize]
%endif ; %3 == 1
psrad m0, %%sh
psrad m6, %%sh
psrad m1, %%sh
psrad m7, %%sh
%if %3 == 1
psrad m4, %%sh
psrad m2, %%sh
psrad m5, %%sh
psrad m3, %%sh
%endif ; %3 == 1
packssdw m0, m6
packssdw m1, m7
mova [bq+xq*(2 << %2)], m0
mova [bq+xq*(2 << %2)+mmsize], m1
%if %3 == 1
lea tmpq, [bq+rgbsq*2]
packssdw m4, m2
packssdw m5, m3
mova [tmpq+xq*4], m4
mova [tmpq+xq*4+mmsize], m5
%endif ; %3 == 1
add xd, mmsize >> %2
cmp xd, wwd
jl .loop_h
lea rq, [rq+rgbsq*(2 << %3)]
lea gq, [gq+rgbsq*(2 << %3)]
lea bq, [bq+rgbsq*(2 << %3)]
%if %3 == 1
lea yq, [yq+ysq*2]
%else ; %3 != 1
add yq, ysq
%endif ; %3 ==/!= 1
add uq, usq
add vq, vsq
dec hd
jg .loop_v
RET
%endmacro
%macro YUV2RGB_FNS 2 ; ss_w, ss_h
YUV2RGB_FN 8, %1, %2
YUV2RGB_FN 10, %1, %2
YUV2RGB_FN 12, %1, %2
%endmacro
INIT_XMM sse2
YUV2RGB_FNS 0, 0
YUV2RGB_FNS 1, 0
YUV2RGB_FNS 1, 1
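
; void ff_rgb2yuv_420p8_sse2(uint8_t *yuv_out[3], ptrdiff_t yuv_stride[3],
;                            int16_t *rgb_in[3], ptrdiff_t rgb_stride,
;                            int w, int h, const int16_t rgb2yuv_coeffs[3][3][8],
;                            const int16_t yuv_offset[8])
;
; (prototype inferred from the cglobal arguments below)
;
; A scalar model of the conversion (sketch only; sh, rnd and uvoff mirror
; the %%-assigns, and cry/cgy/... name the rgb2yuv_coeffs entries):
;   sh = 29 - depth, rnd = 1 << (sh - 1), uvoff = 128 << (depth - 8)
;   y_out = (cry*r + cgy*g + cby*b + (yoff  << sh) + rnd) >> sh
;   u_out = (cru*r + cgu*g + cbu*b + (uvoff << sh) + rnd) >> sh
;   v_out = (crv*r + cgv*g + cbv*b + (uvoff << sh) + rnd) >> sh
; for 4:2:0 and 4:2:2, u/v are computed from a box average of the covered
; r/g/b samples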
%macro RGB2YUV_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert)
%assign %%sh 29 - %1
%assign %%rnd (1 << (%%sh - 15))
%assign %%uvrnd ((128 << (%1 - 8)) << (%%sh - 14))
%if %1 != 8
%assign %%maxval ((1 << %1) - 1)
%endif ; %1 != 8
%if %2 == 0
%assign %%ss 444
%elif %3 == 0
%assign %%ss 422
%else ; %3 == 1
%assign %%ss 420
%endif ; %2/%3
cglobal rgb2yuv_ %+ %%ss %+ p%1, 8, 14, 16, 0 - 6 * mmsize, \
yuv, yuvs, rgb, rgbs, ww, h, c, off
%if %2 == 1
inc wwd
sar wwd, 1
%endif ; %2 == 1
%if %3 == 1
inc hd
sar hd, 1
%endif ; %3 == 1
; prepare coeffs
movh m8, [offq]
movh m9, [pw_ %+ %%uvrnd]
psllw m8, %%sh - 14
paddw m9, [pw_ %+ %%rnd]
paddw m8, [pw_ %+ %%rnd]
movh m0, [cq+ 0]
movh m1, [cq+ 16]
movh m2, [cq+ 32]
movh m3, [cq+ 48]
movh m4, [cq+ 64]
movh m5, [cq+ 80]
movh m6, [cq+112]
movh m7, [cq+128]
punpcklwd m0, m1
punpcklwd m2, m8
punpcklwd m3, m4
punpcklwd m4, m5, m9
punpcklwd m5, m6
punpcklwd m7, m9
mova [rsp+0*mmsize], m0 ; cry, cgy
mova [rsp+1*mmsize], m2 ; cby, off + rnd
mova [rsp+2*mmsize], m3 ; cru, cgu
mova [rsp+3*mmsize], m4 ; cburv, uvoff + rnd
mova [rsp+4*mmsize], m5 ; cburv, cgv
mova [rsp+5*mmsize], m7 ; cbv, uvoff + rnd
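; note: [cq+80] is cbu, but it is also used as crv ("cburv") in the v row:
; the supported rgb2yuv matrices have cbu == crv (both represent 0.5), so a
; single coefficient pair serves both dot products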
DEFINE_ARGS y, ys, r, rgbs, ww, h, u, v, us, vs, g, b, tmp, x
mov gq, [rq+gprsize*1]
mov bq, [rq+gprsize*2]
mov rq, [rq+gprsize*0]
mov uq, [yq+gprsize*1]
mov vq, [yq+gprsize*2]
mov yq, [yq+gprsize*0]
mov usq, [ysq+gprsize*1]
mov vsq, [ysq+gprsize*2]
mov ysq, [ysq+gprsize*0]
pxor m15, m15
.loop_v:
xor xd, xd
.loop_h:
; top line y
mova m0, [rq+xq*(2<<%2)]
mova m3, [rq+xq*(2<<%2)+mmsize]
mova m1, [gq+xq*(2<<%2)]
mova m4, [gq+xq*(2<<%2)+mmsize]
mova m2, [bq+xq*(2<<%2)]
mova m5, [bq+xq*(2<<%2)+mmsize]
punpcklwd m6, m0, m1
punpckhwd m7, m0, m1
punpcklwd m8, m3, m4
punpckhwd m9, m3, m4
punpcklwd m10, m2, [pw_16384]
punpckhwd m11, m2, [pw_16384]
punpcklwd m12, m5, [pw_16384]
punpckhwd m13, m5, [pw_16384]
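; (b, 16384) pairs against (cby, off'): with off' = (yoff << (sh - 14)) +
; (1 << (sh - 15)), pmaddwd gives cby*b + (yoff << sh) + rnd in one step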
pmaddwd m6, [rsp+0*mmsize]
pmaddwd m7, [rsp+0*mmsize]
pmaddwd m8, [rsp+0*mmsize]
pmaddwd m9, [rsp+0*mmsize]
pmaddwd m10, [rsp+1*mmsize]
pmaddwd m11, [rsp+1*mmsize]
pmaddwd m12, [rsp+1*mmsize]
pmaddwd m13, [rsp+1*mmsize]
paddd m6, m10
paddd m7, m11
paddd m8, m12
paddd m9, m13
psrad m6, %%sh
psrad m7, %%sh
psrad m8, %%sh
psrad m9, %%sh
packssdw m6, m7
packssdw m8, m9
%if %1 == 8
packuswb m6, m8
movu [yq+xq*(1<<%2)], m6
%else
CLIPW m6, m15, [pw_ %+ %%maxval]
CLIPW m8, m15, [pw_ %+ %%maxval]
movu [yq+xq*(2<<%2)], m6
movu [yq+xq*(2<<%2)+mmsize], m8
%endif
%if %2 == 1
; horizontally subsample the cached r/g/b data: pmaddwd by pw_1 sums each
; pixel pair
pmaddwd m0, [pw_1]
pmaddwd m1, [pw_1]
pmaddwd m2, [pw_1]
pmaddwd m3, [pw_1]
pmaddwd m4, [pw_1]
pmaddwd m5, [pw_1]
%if %3 == 1
; bottom line y, r/g portion only
lea tmpq, [rgbsq+xq*2]
mova m6, [rq+tmpq*2]
mova m9, [rq+tmpq*2+mmsize]
mova m7, [gq+tmpq*2]
mova m10, [gq+tmpq*2+mmsize]
mova m8, [bq+tmpq*2]
mova m11, [bq+tmpq*2+mmsize]
punpcklwd m12, m6, m7
punpckhwd m13, m6, m7
punpcklwd m14, m9, m10
punpckhwd m15, m9, m10
; release two more registers
pmaddwd m6, [pw_1]
pmaddwd m7, [pw_1]
pmaddwd m9, [pw_1]
pmaddwd m10, [pw_1]
paddd m0, m6
paddd m3, m9
paddd m1, m7
paddd m4, m10
; bottom line y, b/rnd portion only
punpcklwd m6, m8, [pw_16384]
punpckhwd m7, m8, [pw_16384]
punpcklwd m9, m11, [pw_16384]
punpckhwd m10, m11, [pw_16384]
pmaddwd m12, [rsp+0*mmsize]
pmaddwd m13, [rsp+0*mmsize]
pmaddwd m14, [rsp+0*mmsize]
pmaddwd m15, [rsp+0*mmsize]
pmaddwd m6, [rsp+1*mmsize]
pmaddwd m7, [rsp+1*mmsize]
pmaddwd m9, [rsp+1*mmsize]
pmaddwd m10, [rsp+1*mmsize]
paddd m12, m6
paddd m13, m7
paddd m14, m9
paddd m15, m10
psrad m12, %%sh
psrad m13, %%sh
psrad m14, %%sh
psrad m15, %%sh
packssdw m12, m13
packssdw m14, m15
lea tmpq, [yq+ysq]
%if %1 == 8
packuswb m12, m14
movu [tmpq+xq*2], m12
%else
pxor m15, m15
CLIPW m12, m15, [pw_ %+ %%maxval]
CLIPW m14, m15, [pw_ %+ %%maxval]
movu [tmpq+xq*4], m12
movu [tmpq+xq*4+mmsize], m14
%endif
; complete subsampling of r/g/b pixels for u/v
pmaddwd m8, [pw_1]
pmaddwd m11, [pw_1]
paddd m2, m8
paddd m5, m11
paddd m0, [pd_2]
paddd m1, [pd_2]
paddd m2, [pd_2]
paddd m3, [pd_2]
paddd m4, [pd_2]
paddd m5, [pd_2]
psrad m0, 2
psrad m1, 2
psrad m2, 2
psrad m3, 2
psrad m4, 2
psrad m5, 2
%else ; %3 != 1
paddd m0, [pd_1]
paddd m1, [pd_1]
paddd m2, [pd_1]
paddd m3, [pd_1]
paddd m4, [pd_1]
paddd m5, [pd_1]
psrad m0, 1
psrad m1, 1
psrad m2, 1
psrad m3, 1
psrad m4, 1
psrad m5, 1
%endif ; %3 ==/!= 1
packssdw m0, m3
packssdw m1, m4
packssdw m2, m5
%endif ; %2 == 1
; convert u/v pixels
SBUTTERFLY wd, 0, 1, 6
punpckhwd m6, m2, [pw_16384]
punpcklwd m2, [pw_16384]
pmaddwd m7, m0, [rsp+2*mmsize]
pmaddwd m8, m1, [rsp+2*mmsize]
pmaddwd m9, m2, [rsp+3*mmsize]
pmaddwd m10, m6, [rsp+3*mmsize]
pmaddwd m0, [rsp+4*mmsize]
pmaddwd m1, [rsp+4*mmsize]
pmaddwd m2, [rsp+5*mmsize]
pmaddwd m6, [rsp+5*mmsize]
paddd m7, m9
paddd m8, m10
paddd m0, m2
paddd m1, m6
psrad m7, %%sh
psrad m8, %%sh
psrad m0, %%sh
psrad m1, %%sh
packssdw m7, m8
packssdw m0, m1
%if %2 == 1
%if %1 == 8
packuswb m7, m0
movh [uq+xq], m7
movhps [vq+xq], m7
%else
CLIPW m7, m15, [pw_ %+ %%maxval]
CLIPW m0, m15, [pw_ %+ %%maxval]
movu [uq+xq*2], m7
movu [vq+xq*2], m0
%endif
%else ; %2 != 1
; second set of u/v pixels
SBUTTERFLY wd, 3, 4, 6
punpckhwd m6, m5, [pw_16384]
punpcklwd m5, [pw_16384]
pmaddwd m8, m3, [rsp+2*mmsize]
pmaddwd m9, m4, [rsp+2*mmsize]
pmaddwd m10, m5, [rsp+3*mmsize]
pmaddwd m11, m6, [rsp+3*mmsize]
pmaddwd m3, [rsp+4*mmsize]
pmaddwd m4, [rsp+4*mmsize]
pmaddwd m5, [rsp+5*mmsize]
pmaddwd m6, [rsp+5*mmsize]
paddd m8, m10
paddd m9, m11
paddd m3, m5
paddd m4, m6
psrad m8, %%sh
psrad m9, %%sh
psrad m3, %%sh
psrad m4, %%sh
packssdw m8, m9
packssdw m3, m4
%if %1 == 8
packuswb m7, m8
packuswb m0, m3
movu [uq+xq], m7
movu [vq+xq], m0
%else
CLIPW m7, m15, [pw_ %+ %%maxval]
CLIPW m0, m15, [pw_ %+ %%maxval]
CLIPW m8, m15, [pw_ %+ %%maxval]
CLIPW m3, m15, [pw_ %+ %%maxval]
movu [uq+xq*2], m7
movu [uq+xq*2+mmsize], m8
movu [vq+xq*2], m0
movu [vq+xq*2+mmsize], m3
%endif
%endif ; %2 ==/!= 1
add xq, mmsize >> %2
cmp xd, wwd
jl .loop_h
%if %3 == 0
add yq, ysq
%else ; %3 != 0
lea yq, [yq+ysq*2]
%endif ; %3 ==/!= 0
add uq, usq
add vq, vsq
lea rq, [rq+rgbsq*(2<<%3)]
lea gq, [gq+rgbsq*(2<<%3)]
lea bq, [bq+rgbsq*(2<<%3)]
dec hd
jg .loop_v
RET
%endmacro
%macro RGB2YUV_FNS 2 ; ss_w, ss_h
RGB2YUV_FN 8, %1, %2
RGB2YUV_FN 10, %1, %2
RGB2YUV_FN 12, %1, %2
%endmacro
INIT_XMM sse2
RGB2YUV_FNS 0, 0
RGB2YUV_FNS 1, 0
RGB2YUV_FNS 1, 1

; void ff_multiply3x3_sse2(int16_t *data[3], ptrdiff_t stride,
; int w, int h, const int16_t coeff[3][3][8])
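;
; A scalar model (sketch; d0/d1/d2 are the three int16_t planes and c the
; 14-bit fixed-point coeff matrix): computed in place for each pixel,
;   out[i] = (c[i][0]*d0 + c[i][1]*d1 + c[i][2]*d2 + 8192) >> 14, i = 0..2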
INIT_XMM sse2
cglobal multiply3x3, 5, 7, 16, data, stride, ww, h, c
movh m0, [cq+ 0]
movh m1, [cq+ 32]
movh m2, [cq+ 48]
movh m3, [cq+ 80]
movh m4, [cq+ 96]
movh m5, [cq+128]
punpcklwd m0, [cq+ 16]
punpcklwd m1, [pw_8192]
punpcklwd m2, [cq+ 64]
punpcklwd m3, [pw_8192]
punpcklwd m4, [cq+112]
punpcklwd m5, [pw_8192]
DEFINE_ARGS data0, stride, ww, h, data1, data2, x
shl strideq, 1
mov data1q, [data0q+gprsize*1]
mov data2q, [data0q+gprsize*2]
mov data0q, [data0q+gprsize*0]
.loop_v:
xor xd, xd
.loop_h:
mova m6, [data0q+xq*2]
mova m7, [data1q+xq*2]
mova m8, [data2q+xq*2]
SBUTTERFLY wd, 6, 7, 9
punpckhwd m9, m8, [pw_1]
punpcklwd m8, [pw_1]
pmaddwd m10, m6, m0
pmaddwd m11, m7, m0
pmaddwd m12, m8, m1
pmaddwd m13, m9, m1
paddd m10, m12
paddd m11, m13
psrad m10, 14
psrad m11, 14
pmaddwd m12, m6, m2
pmaddwd m13, m7, m2
pmaddwd m14, m8, m3
pmaddwd m15, m9, m3
paddd m12, m14
paddd m13, m15
psrad m12, 14
psrad m13, 14
pmaddwd m6, m4
pmaddwd m7, m4
pmaddwd m8, m5
pmaddwd m9, m5
paddd m6, m8
paddd m7, m9
psrad m6, 14
psrad m7, 14
packssdw m10, m11
packssdw m12, m13
packssdw m6, m7
mova [data0q+xq*2], m10
mova [data1q+xq*2], m12
mova [data2q+xq*2], m6
add xd, mmsize / 2
cmp xd, wwd
jl .loop_h
add data0q, strideq
add data1q, strideq
add data2q, strideq
dec hd
jg .loop_v
RET
%endif ; ARCH_X86_64