| ;***************************************************************************** |
| ;* x86-optimized functions for stereo3d filter |
| ;* |
| ;* Copyright (C) 2015 Paul B Mahol |
| ;* |
| ;* This file is part of FFmpeg. |
| ;* |
| ;* FFmpeg is free software; you can redistribute it and/or |
| ;* modify it under the terms of the GNU Lesser General Public |
| ;* License as published by the Free Software Foundation; either |
| ;* version 2.1 of the License, or (at your option) any later version. |
| ;* |
| ;* FFmpeg is distributed in the hope that it will be useful, |
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| ;* Lesser General Public License for more details. |
| ;* |
| ;* You should have received a copy of the GNU Lesser General Public |
| ;* License along with FFmpeg; if not, write to the Free Software |
| ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| ;***************************************************************************** |
| |
| %include "libavutil/x86/x86util.asm" |
| |
SECTION_RODATA

; pshufb masks. A -1 selector writes a zero byte, so the ex_* masks both
; extract and zero-extend in one instruction.
;
; ex_r/ex_g/ex_b: from 4 packed 3-byte pixels (rgbrgbrgbrgb...), pull the
;   r, g or b byte of each pixel into the low byte of its own dword
;   (byte -> u32 zero-extension, ready for pmulld).
;
; shuf: inverse repack. After the packusdw/packuswb sequence in .loop the
;   12 result bytes sit channel-planar as rrrrggggbbbb; this mask
;   interleaves them back to rgbrgbrgbrgb. The top 4 output bytes are
;   zeroed and are never stored (only 12 bytes are written per iteration).

shuf: db 0, 4, 8, 1,5, 9, 2, 6,10,3, 7,11,-1,-1,-1,-1
ex_r: db 0,-1,-1,-1,3,-1,-1,-1,6,-1,-1,-1, 9,-1,-1,-1
ex_g: db 1,-1,-1,-1,4,-1,-1,-1,7,-1,-1,-1,10,-1,-1,-1
ex_b: db 2,-1,-1,-1,5,-1,-1,-1,8,-1,-1,-1,11,-1,-1,-1
| |
SECTION .text

;-----------------------------------------------------------------------------
; anaglyph(dst, lsrc, rsrc, dst_linesize, l_linesize, r_linesize,
;          width, height, ana_matrix_r, ana_matrix_g, ana_matrix_b)
;
; Prototype reconstructed from the cglobal argument list — confirm exact C
; types against the caller. Combines a left and a right packed-RGB image
; (3 bytes/pixel, consistent with the 12-byte advance per 4 pixels below)
; into one anaglyph image. For each output channel, with m = that
; channel's 6-entry matrix:
;
;   out = (L.r*m[0] + L.g*m[1] + L.b*m[2]
;        + R.r*m[3] + R.g*m[4] + R.b*m[5]) >> 16
;
; i.e. the coefficients carry 16 fractional bits (see the psrld 16 before
; packing). Each matrix is read as 24 bytes: movu (4 dwords) + movq
; (2 dwords).
;
; Stack slots (mmsize each) hold every coefficient broadcast to all 4
; dword lanes:
;   slots  0-5  : red-output matrix
;   slots  6-11 : green-output matrix
;   slots 12-17 : blue-output matrix (x86_32 only; x86_64 keeps the blue
;                 coefficients live in m8-m13 instead)
;-----------------------------------------------------------------------------
INIT_XMM sse4
%if ARCH_X86_64
cglobal anaglyph, 6, 10, 14, 2*6*mmsize, dst, lsrc, rsrc, dst_linesize, l_linesize, r_linesize, width, height, o, cnt
; The width/height registers are borrowed as scratch for the three matrix
; pointers (stack args r8m/r9m/r10m); width/height are reloaded from their
; stack homes once the coefficient setup is done.
%define ana_matrix_rq r6q
%define ana_matrix_gq r7q
%define ana_matrix_bq r8q

%else ; ARCH_X86_32
; Only the first 3 args are auto-loaded into registers; everything else is
; fetched from the caller's stack (rNm/rNmp) as needed.
%if HAVE_ALIGNED_STACK
cglobal anaglyph, 3, 7, 8, 2*9*mmsize, dst, lsrc, rsrc, dst_linesize, l_linesize, o, cnt
%else
cglobal anaglyph, 3, 6, 8, 2*9*mmsize, dst, lsrc, rsrc, dst_linesize, o, cnt
%define l_linesizeq r4mp
%endif ; HAVE_ALIGNED_STACK
%define ana_matrix_rq r3q
%define ana_matrix_gq r4q
%define ana_matrix_bq r5q
%define r_linesizeq r5mp
%define widthd r6mp
%define heightd r7mp
; x86_32 has only 8 xmm registers: emulate m8-m13 (the blue coefficients)
; as the last six stack slots, used as pmulld memory operands in the loop.
%define m8 [rsp+mmsize*12]
%define m9 [rsp+mmsize*13]
%define m10 [rsp+mmsize*14]
%define m11 [rsp+mmsize*15]
%define m12 [rsp+mmsize*16]
%define m13 [rsp+mmsize*17]
%endif ; ARCH

; Broadcast each of the 6 red-matrix coefficients into all 4 lanes of an
; xmm register and spill them to stack slots 0-5.
mov ana_matrix_rq, r8m
mov ana_matrix_gq, r9m
mov ana_matrix_bq, r10m
movu m3, [ana_matrix_rq+ 0]      ; coefficients 0-3
movq m5, [ana_matrix_rq+16]      ; coefficients 4-5
pshufd m0, m3, q0000
pshufd m1, m3, q1111
pshufd m2, m3, q2222
pshufd m3, m3, q3333
pshufd m4, m5, q0000
pshufd m5, m5, q1111
mova [rsp+mmsize*0], m0
mova [rsp+mmsize*1], m1
mova [rsp+mmsize*2], m2
mova [rsp+mmsize*3], m3
mova [rsp+mmsize*4], m4
mova [rsp+mmsize*5], m5

; Same for the green matrix -> stack slots 6-11.
movu m3, [ana_matrix_gq+ 0]
movq m5, [ana_matrix_gq+16]
pshufd m0, m3, q0000
pshufd m1, m3, q1111
pshufd m2, m3, q2222
pshufd m3, m3, q3333
pshufd m4, m5, q0000
pshufd m5, m5, q1111
mova [rsp+mmsize*6 ], m0
mova [rsp+mmsize*7 ], m1
mova [rsp+mmsize*8 ], m2
mova [rsp+mmsize*9 ], m3
mova [rsp+mmsize*10], m4
mova [rsp+mmsize*11], m5

%if ARCH_X86_64
; Blue matrix stays resident in m8-m13; reclaim width/height now that the
; matrix pointers are no longer needed.
movu m11, [ana_matrix_bq+ 0]
movq m13, [ana_matrix_bq+16]
pshufd m8, m11, q0000
pshufd m9, m11, q1111
pshufd m10, m11, q2222
pshufd m11, m11, q3333
pshufd m12, m13, q0000
pshufd m13, m13, q1111
mov widthd, dword widthm
mov heightd, dword heightm
%else
; Blue matrix -> stack slots 12-17 (aliased as m8-m13 by the %defines
; above, so the loop body below is shared between both architectures).
movu m3, [ana_matrix_bq+ 0]
movq m5, [ana_matrix_bq+16]
pshufd m0, m3, q0000
pshufd m1, m3, q1111
pshufd m2, m3, q2222
pshufd m3, m3, q3333
pshufd m4, m5, q0000
pshufd m5, m5, q1111
mova [rsp+mmsize*12], m0
mova [rsp+mmsize*13], m1
mova [rsp+mmsize*14], m2
mova [rsp+mmsize*15], m3
mova [rsp+mmsize*16], m4
mova [rsp+mmsize*17], m5
mov dst_linesizeq, r3m
%if HAVE_ALIGNED_STACK
mov l_linesizeq, r4m
%endif
%endif ; ARCH

.nextrow:
mov od, widthd                   ; o = pixels left in this row
xor cntd, cntd                   ; cnt = byte offset into the row

; Each .loop iteration produces 4 output pixels (12 bytes).
; NOTE(review): the movu loads read 16 bytes but only 12 are consumed, so
; the last iteration of a row reads up to 4 bytes past the final pixel —
; presumably linesize padding covers this; confirm with the caller.
.loop:
; Pass 1 — red output channel, accumulated in m1.
; ex_r/ex_g/ex_b expand each pixel's channel byte to a u32 lane.
movu m3, [lsrcq+cntq]
pshufb m1, m3, [ex_r]
pshufb m2, m3, [ex_g]
pshufb m3, [ex_b]
movu m0, [rsrcq+cntq]
pshufb m4, m0, [ex_r]
pshufb m5, m0, [ex_g]
pshufb m0, [ex_b]
pmulld m1, [rsp+mmsize*0]
pmulld m2, [rsp+mmsize*1]
pmulld m3, [rsp+mmsize*2]
pmulld m4, [rsp+mmsize*3]
pmulld m5, [rsp+mmsize*4]
pmulld m0, [rsp+mmsize*5]
paddd m1, m2
paddd m3, m4
paddd m5, m0
paddd m1, m3
paddd m1, m5

; Pass 2 — green output channel, accumulated in m7.
; m1 (red) and m7 (green) are deliberately kept live through pass 3.
movu m3, [lsrcq+cntq]
pshufb m7, m3, [ex_r]
pshufb m2, m3, [ex_g]
pshufb m3, [ex_b]
movu m0, [rsrcq+cntq]
pshufb m4, m0, [ex_r]
pshufb m5, m0, [ex_g]
pshufb m0, [ex_b]
pmulld m7, [rsp+mmsize*6]
pmulld m2, [rsp+mmsize*7]
pmulld m3, [rsp+mmsize*8]
pmulld m4, [rsp+mmsize*9]
pmulld m5, [rsp+mmsize*10]
pmulld m0, [rsp+mmsize*11]
paddd m7, m2
paddd m3, m4
paddd m5, m0
paddd m7, m3
paddd m7, m5

; Pass 3 — blue output channel, accumulated in m2, using the blue
; coefficients in m8-m13 (real registers on x86_64, stack on x86_32).
movu m4, [lsrcq+cntq]
pshufb m2, m4, [ex_r]
pshufb m3, m4, [ex_g]
pshufb m4, [ex_b]
movu m0, [rsrcq+cntq]
pshufb m5, m0, [ex_r]
pshufb m6, m0, [ex_g]
pshufb m0, [ex_b]
pmulld m2, m8
pmulld m3, m9
pmulld m4, m10
pmulld m5, m11
pmulld m6, m12
pmulld m0, m13
paddd m2, m3
paddd m4, m5
paddd m6, m0
paddd m2, m4
paddd m2, m6

; Drop the 16 fractional bits of the fixed-point accumulators.
psrld m1, 16
psrld m7, 16
psrld m2, 16

; dword -> word -> byte with unsigned saturation, then interleave:
; r0..r3 g0..g3 b0..b3 (planar) -> rgbrgbrgbrgb via the shuf mask.
packusdw m1, m7
packusdw m2, m2
packuswb m1, m2
pshufb m1, [shuf]

; Store exactly 12 bytes (8 + 4); the 4 zeroed tail bytes are discarded.
movq [dstq+cntq+0], m1
psrldq m1, 8
movd [dstq+cntq+8], m1
add cntd, 12                     ; 4 pixels * 3 bytes
sub od, 4                        ; 4 pixels done
jg .loop

; Advance all three planes one line and loop over rows.
add dstq, dst_linesizeq
add lsrcq, l_linesizeq
add rsrcq, r_linesizeq
sub heightd, 1
jg .nextrow
REP_RET