| ;***************************************************************************** |
| ;* x86-optimized functions for afir filter |
| ;* Copyright (c) 2017 Paul B Mahol |
| ;* |
| ;* This file is part of FFmpeg. |
| ;* |
| ;* FFmpeg is free software; you can redistribute it and/or |
| ;* modify it under the terms of the GNU Lesser General Public |
| ;* License as published by the Free Software Foundation; either |
| ;* version 2.1 of the License, or (at your option) any later version. |
| ;* |
| ;* FFmpeg is distributed in the hope that it will be useful, |
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| ;* Lesser General Public License for more details. |
| ;* |
| ;* You should have received a copy of the GNU Lesser General Public |
| ;* License along with FFmpeg; if not, write to the Free Software |
| ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| ;****************************************************************************** |
| |
| %include "libavutil/x86/x86util.asm" |
| |
| SECTION .text |
| |
| ;------------------------------------------------------------------------------ |
| ; void ff_fcmul_add(float *sum, const float *t, const float *c, int len) |
| ; |
| ; Complex multiply-accumulate on interleaved (re, im) single-precision data: |
| ; sum[n] += t[n] * c[n] for every complex element covered by the vector loop, |
| ; followed by one scalar, real-only sum[] += t[] * c[] on the float at which |
| ; the loop stopped. |
| ; |
| ; Every loop pass consumes 2*mmsize bytes and the body always runs at least |
| ; once, so the scalar step lands on float index 2*len only when len*8 is a |
| ; non-zero multiple of 2*mmsize. |
| ; NOTE(review): callers presumably guarantee that, together with mmsize-aligned |
| ; buffers (movaps is used on c and sum) - confirm against the C init code. |
| ;------------------------------------------------------------------------------ |
| |
| %macro FCMUL_ADD 0 |
| cglobal fcmul_add, 4,4,6, sum, t, c, len |
| ; len complex elements -> byte count (8 bytes each); on x86-64 the 32-bit |
| ; write also zero-extends into lenq. |
| shl lend, 3 |
| ; Point all three arrays at their end so a single negative byte offset, |
| ; counting up towards zero, indexes them in the loop. |
| add sumq, lenq |
| add cq, lenq |
| add tq, lenq |
| neg lenq |
| ALIGN 16 |
| .loop: |
| ; c as stored: (cre, cim) pairs, two vectors per pass |
| movaps m0, [cq + lenq] |
| movaps m1, [cq + lenq+mmsize] |
| ; even floats of t duplicated: (tre, tre) |
| movsldup m2, [tq + lenq] |
| movsldup m3, [tq + lenq+mmsize] |
| ; odd floats of t duplicated: (tim, tim) |
| movshdup m4, [tq + lenq] |
| movshdup m5, [tq + lenq+mmsize] |
| ; (tre*cre, tre*cim) |
| mulps m2, m2, m0 |
| mulps m3, m3, m1 |
| ; 0xb1 swaps the two floats of every pair: c becomes (cim, cre) |
| shufps m0, m0, m0, 0xb1 |
| shufps m1, m1, m1, 0xb1 |
| ; (tim*cim, tim*cre) |
| mulps m4, m4, m0 |
| mulps m5, m5, m1 |
| ; addsubps subtracts in even lanes and adds in odd lanes: |
| ; (tre*cre - tim*cim, tre*cim + tim*cre) = t * c |
| addsubps m2, m2, m4 |
| addsubps m3, m3, m5 |
| ; accumulate into sum; both halves are read before either is written back |
| addps m2, m2, [sumq + lenq] |
| addps m3, m3, [sumq + lenq+mmsize] |
| movaps [sumq + lenq], m2 |
| movaps [sumq + lenq+mmsize], m3 |
| ; advance by the two vectors just processed; loop while the offset is negative |
| add lenq, mmsize*2 |
| jl .loop |
| ; scalar tail: one real-only multiply-accumulate at the final offset |
| movss xm0, [tq + lenq] |
| mulss xm0, [cq + lenq] |
| addss xm0, [sumq + lenq] |
| movss [sumq + lenq], xm0 |
| RET |
| %endmacro |
| |
| ; Instantiate the kernel twice: XMM registers with SSE3, then YMM registers |
| ; with AVX. |
| INIT_XMM sse3 |
| FCMUL_ADD |
| INIT_YMM avx |
| FCMUL_ADD |