; libavcodec/x86/sbcdsp.asm - manifest_repos/ffmpeg - Git at Google

 ;******************************************************************************
 ;* SIMD optimized SBC encoder DSP functions
 ;*
 ;* Copyright (C) 2017  Aurelien Jacobs <aurel@gnuage.org>
 ;* Copyright (C) 2008-2010  Nokia Corporation
 ;* Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
 ;* Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
 ;* Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
 ;*
 ;* This file is part of FFmpeg.
 ;*
 ;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
 ;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
 ;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************

 %include "libavutil/x86/x86util.asm"

 SECTION_RODATA

 ; Two packed dwords, each 0x8000.  It is added to the stage-one
 ; accumulators of sbc_analyze_4/8 before they are shifted right
 ; arithmetically by 16 (a round-to-nearest bias), and it is the initial
 ; OR value in sbc_calc_scalefactors, so the operand handed to bsr there
 ; is never zero.
 scale_mask: times 2 dd 0x8000    ; 1 << (SBC_PROTO_FIXED_SCALE - 1)

 SECTION .text

 ; NIDN op, dst, src
 ; Assemble "op dst, src" only when dst and src are not the identical
 ; operand text (as compared by %ifidn); otherwise assemble nothing.
 %macro NIDN 3
 %ifidn %2, %3
     ; same operand on both sides: the instruction is omitted
 %else
     %1            %2, %3
 %endif
 %endmacro

 ; ANALYZE_MAC out1, out2, in1, in2, tmp1, tmp2, add1, add2, offset
 ;
 ; One multiply-accumulate step against the table addressed by constsq:
 ;   tmp1 = in1                      (copy omitted when tmp1 is in1)
 ;   tmp2 = in2                      (copy omitted when tmp2 is in2)
 ;   tmp1 = pmaddwd(tmp1, [constsq+offset])
 ;   tmp2 = pmaddwd(tmp2, [constsq+offset+8])
 ;   out1 += add1                    (add omitted when out1 is add1)
 ;   out2 += add2                    (add omitted when out2 is add2)
 ; Passing the same register for out/add therefore makes the product
 ; itself the new accumulator value.  constsq must be defined by the
 ; enclosing function.
 %macro ANALYZE_MAC 9 ; out1, out2, in1, in2, tmp1, tmp2, add1, add2, offset
 %ifnidn %5, %3
     movq          %5, %3
 %endif
 %ifnidn %6, %4
     movq          %6, %4
 %endif
     pmaddwd       %5, [constsq+%9]
     pmaddwd       %6, [constsq+%9+8]
 %ifnidn %1, %7
     paddd         %1, %7
 %endif
 %ifnidn %2, %8
     paddd         %2, %8
 %endif
 %endmacro

 ; ANALYZE_MAC_IN out1, out2, tmp1, tmp2, add1, add2, offset
 ; Memory-input form of ANALYZE_MAC: the two inputs are the quadwords at
 ; [inq+offset] and [inq+offset+8], and the same byte offset is forwarded
 ; as the offset into the constant table.  inq must be defined by the
 ; enclosing function.
 %macro ANALYZE_MAC_IN 7 ; out1, out2, tmp1, tmp2, add1, add2, offset
     ANALYZE_MAC   %1, %2, [inq+%7], [inq+%7+8], %3, %4, %5, %6, %7
 %endmacro

 ; ANALYZE_MAC_REG out1, out2, in, tmp1, tmp2, offset, pack
 ; Register-input form of ANALYZE_MAC: "in" is used for both inputs and
 ; tmp1/tmp2 double as the addends, so the two products of "in" with the
 ; constants at offset / offset+8 are added to out1 / out2 (or become
 ; them when out and tmp are the same register).
 ; When the last argument is the literal token "pack", "in" is first
 ; shifted right arithmetically by 16 and packed to signed-saturated
 ; words in place, which modifies "in".  Any other token skips that step.
 %macro ANALYZE_MAC_REG 7 ; out1, out2, in, tmp1, tmp2, offset, pack
 %ifidn %7, pack
     psrad         %3, 16    ; SBC_PROTO_FIXED_SCALE
     packssdw      %3, %3
 %endif
     ANALYZE_MAC   %1, %2, %3, %3, %4, %5, %4, %5, %6
 %endmacro

 ;*******************************************************************
 ;void ff_sbc_analyze_4(const int16_t *in, int32_t *out, const int16_t *consts);
 ;
 ; Stage 1: five pmaddwd multiply-accumulate steps over bytes 0..79 of
 ;          both in and consts; the four dword sums are kept in m0/m1
 ;          and start from the scale_mask rounding bias.
 ; Stage 2: each pair of sums is shifted right by 16, packed to words
 ;          and multiplied against consts bytes 80..111; the resulting
 ;          four dwords are written to out[0..3].
 ; No emms is executed in this function.
 ;*******************************************************************
 INIT_MMX mmx
 cglobal sbc_analyze_4, 3, 3, 4, in, out, consts
     ; First step: the products themselves land in m0/m1, then the
     ; rounding bias is added.
     ANALYZE_MAC_IN   m0, m1, m0, m1, [scale_mask], [scale_mask], 0

     ; Remaining four steps at byte offsets 16, 32, 48, 64: products go
     ; through m2/m3 and are accumulated into m0/m1.
 %assign sbc4_off 16
 %rep 4
     ANALYZE_MAC_IN   m0, m1, m2, m3, m2, m3, sbc4_off
     %assign sbc4_off sbc4_off+16
 %endrep
 %undef sbc4_off

     ; Second stage.  m0 is both input and out1, so its packed value is
     ; copied to m2 before being overwritten by the first product.
     ANALYZE_MAC_REG  m0, m2, m0, m0, m2, 80, pack
     ; m1's two products are added on top of m0/m2 (m3 is the scratch copy).
     ANALYZE_MAC_REG  m0, m2, m1, m1, m3, 96, pack

     movq          [outq  ], m0
     movq          [outq+8], m2

     RET


 ;*******************************************************************
 ;void ff_sbc_analyze_8(const int16_t *in, int32_t *out, const int16_t *consts);
 ;
 ; Stage 1: ten pmaddwd multiply-accumulate steps over bytes 0..159 of
 ;          both in and consts.  Steps alternate between two accumulator
 ;          pairs (m0/m1 and m2/m3), each started from the scale_mask
 ;          rounding bias, giving eight dword sums.
 ; Stage 2: the sums are shifted right by 16 and packed to words, then
 ;          multiplied against consts bytes 160..287; eight dwords are
 ;          written to out[0..7].
 ; No emms is executed in this function.
 ; NOTE(review): the cglobal line declares 4 vector registers although
 ; m0-m7 are used; presumably irrelevant for MMX - confirm against x86inc.
 ;*******************************************************************
 INIT_MMX mmx
 cglobal sbc_analyze_8, 3, 3, 4, in, out, consts
     ; Seed both accumulator pairs: products land directly in the
     ; accumulators, then the rounding bias is added.
     ANALYZE_MAC_IN   m0, m1, m0, m1, [scale_mask], [scale_mask],  0
     ANALYZE_MAC_IN   m2, m3, m2, m3, [scale_mask], [scale_mask], 16

     ; Four more rounds of two steps each (byte offsets 32..144):
     ; even steps accumulate into m0/m1 via m4/m5, odd steps into
     ; m2/m3 via m6/m7.
 %assign sbc8_off 32
 %rep 4
     ANALYZE_MAC_IN   m0, m1, m4, m5, m4, m5, sbc8_off
     ANALYZE_MAC_IN   m2, m3, m6, m7, m6, m7, sbc8_off+16
     %assign sbc8_off sbc8_off+32
 %endrep
 %undef sbc8_off

     ; Second stage, first half: pack m0..m3 in place and build out[0..3]
     ; in m4/m5 (m6/m7 are scratch for the m1..m3 terms).
     ANALYZE_MAC_REG  m4, m5, m0, m4, m5, 160, pack
     ANALYZE_MAC_REG  m4, m5, m1, m6, m7, 192, pack
     ANALYZE_MAC_REG  m4, m5, m2, m6, m7, 224, pack
     ANALYZE_MAC_REG  m4, m5, m3, m6, m7, 256, pack

     movq          [outq  ], m4
     movq          [outq+8], m5

     ; Second stage, second half: m0..m3 are already packed ("no"), and
     ; each is consumed as its own first product; out[4..7] is built in
     ; m0/m5.
     ANALYZE_MAC_REG  m0, m5, m0, m0, m5, 176, no
     ANALYZE_MAC_REG  m0, m5, m1, m1, m7, 208, no
     ANALYZE_MAC_REG  m0, m5, m2, m2, m7, 240, no
     ANALYZE_MAC_REG  m0, m5, m3, m3, m7, 272, no

     movq          [outq+16], m0
     movq          [outq+24], m5

     RET


 ;*******************************************************************
 ;void ff_sbc_calc_scalefactors(int32_t sb_sample_f[16][2][8],
 ;                              uint32_t scale_factor[2][8],
 ;                              int blocks, int channels, int subbands)
 ;
 ; Handles two adjacent int32 columns per outer iteration, walking from
 ; the last column pair down to byte offset 0.  For each column, every
 ; block contributes (|x| - 1) for a non-zero sample x, or 0 for x == 0;
 ; the contributions are ORed together on top of the 0x8000 seed and the
 ; stored scale factor is (index of the highest set bit) - 15.
 ; The seed keeps the bsr operand non-zero.
 ;*******************************************************************
 INIT_MMX mmx
 cglobal sbc_calc_scalefactors, 5, 7, 4, sb_sample_f, scale_factor, blocks, channels, subbands, ptr, blk
     ; subbands = byte size of the columns to scan:
     ; 4 * subbands, doubled when channels >= 2
     shl           subbandsd, 2
     movq          m3, [scale_mask]         ; per-column OR seed (0x8000)
     cmp           channelsd, 2
     jl            .next_pair
     shl           subbandsd, 1

 .next_pair:
     sub           subbandsq, 8             ; step back one pair of int32 columns
     lea           ptrq, [sb_sample_fq + subbandsq]

     ; blk = (blocks - 1) * 64, the byte offset of the last block
     ; (64 bytes = one [2][8] int32 block per the prototype)
     lea           blkq, [blocksq - 1]
     shl           blkd, 6

     movq          m0, m3                   ; restart the OR accumulator
 .next_block:
     pxor          m2, m2
     movq          m1, [ptrq+blkq]
     pcmpgtd       m1, m2                   ; m1 = -1 where x > 0, else 0
     paddd         m1, [ptrq+blkq]          ; x - 1 for x > 0, x otherwise
     pcmpgtd       m2, m1                   ; m2 = -1 where that is negative
     pxor          m1, m2                   ; complement negatives: |x| - 1 (0 for x == 0)

     por           m0, m1

     sub           blkq, 64
     jns           .next_block              ; continue until the offset goes below 0

     ; low dword -> first column of the pair
     movd          blkd, m0
     bsr           blkd, blkd
     sub           blkd, 15    ; SCALE_OUT_BITS
     mov           [scale_factorq + subbandsq], blkd

     ; high dword -> second column of the pair
     psrlq         m0,   32
     movd          blkd, m0
     bsr           blkd, blkd
     sub           blkd, 15    ; SCALE_OUT_BITS
     mov           [scale_factorq + subbandsq + 4], blkd

     test          subbandsq, subbandsq
     jg            .next_pair               ; more column pairs below this one

     emms
     RET
	;******************************************************************************
	;* SIMD optimized SBC encoder DSP functions
	;*
	;* Copyright (C) 2017 Aurelien Jacobs <aurel@gnuage.org>
	;* Copyright (C) 2008-2010 Nokia Corporation
	;* Copyright (C) 2004-2010 Marcel Holtmann <marcel@holtmann.org>
	;* Copyright (C) 2004-2005 Henryk Ploetz <henryk@ploetzli.ch>
	;* Copyright (C) 2005-2006 Brad Midgley <bmidgley@xmission.com>
	;*
	;* This file is part of FFmpeg.
	;*
	;* FFmpeg is free software; you can redistribute it and/or
	;* modify it under the terms of the GNU Lesser General Public
	;* License as published by the Free Software Foundation; either
	;* version 2.1 of the License, or (at your option) any later version.
	;*
	;* FFmpeg is distributed in the hope that it will be useful,
	;* but WITHOUT ANY WARRANTY; without even the implied warranty of
	;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	;* Lesser General Public License for more details.
	;*
	;* You should have received a copy of the GNU Lesser General Public
	;* License along with FFmpeg; if not, write to the Free Software
	;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	;******************************************************************************

	%include "libavutil/x86/x86util.asm"

	SECTION_RODATA

	; Two packed dwords, each 0x8000.  It is added to the stage-one
	; accumulators of sbc_analyze_4/8 before they are shifted right
	; arithmetically by 16 (a round-to-nearest bias), and it is the initial
	; OR value in sbc_calc_scalefactors, so the operand handed to bsr there
	; is never zero.
	scale_mask: times 2 dd 0x8000 ; 1 << (SBC_PROTO_FIXED_SCALE - 1)

	SECTION .text

	; NIDN op, dst, src
	; Assemble "op dst, src" only when dst and src are not the identical
	; operand text (as compared by %ifidn); otherwise assemble nothing.
	%macro NIDN 3
	%ifidn %2, %3
	    ; same operand on both sides: the instruction is omitted
	%else
	    %1            %2, %3
	%endif
	%endmacro

	; ANALYZE_MAC out1, out2, in1, in2, tmp1, tmp2, add1, add2, offset
	;
	; One multiply-accumulate step against the table addressed by constsq:
	;   tmp1 = in1                      (copy omitted when tmp1 is in1)
	;   tmp2 = in2                      (copy omitted when tmp2 is in2)
	;   tmp1 = pmaddwd(tmp1, [constsq+offset])
	;   tmp2 = pmaddwd(tmp2, [constsq+offset+8])
	;   out1 += add1                    (add omitted when out1 is add1)
	;   out2 += add2                    (add omitted when out2 is add2)
	; Passing the same register for out/add therefore makes the product
	; itself the new accumulator value.  constsq must be defined by the
	; enclosing function.
	%macro ANALYZE_MAC 9 ; out1, out2, in1, in2, tmp1, tmp2, add1, add2, offset
	%ifnidn %5, %3
	    movq          %5, %3
	%endif
	%ifnidn %6, %4
	    movq          %6, %4
	%endif
	    pmaddwd       %5, [constsq+%9]
	    pmaddwd       %6, [constsq+%9+8]
	%ifnidn %1, %7
	    paddd         %1, %7
	%endif
	%ifnidn %2, %8
	    paddd         %2, %8
	%endif
	%endmacro

	; ANALYZE_MAC_IN out1, out2, tmp1, tmp2, add1, add2, offset
	; Memory-input form of ANALYZE_MAC: the two inputs are the quadwords at
	; [inq+offset] and [inq+offset+8], and the same byte offset is forwarded
	; as the offset into the constant table.  inq must be defined by the
	; enclosing function.
	%macro ANALYZE_MAC_IN 7 ; out1, out2, tmp1, tmp2, add1, add2, offset
	ANALYZE_MAC %1, %2, [inq+%7], [inq+%7+8], %3, %4, %5, %6, %7
	%endmacro

	; ANALYZE_MAC_REG out1, out2, in, tmp1, tmp2, offset, pack
	; Register-input form of ANALYZE_MAC: "in" is used for both inputs and
	; tmp1/tmp2 double as the addends, so the two products of "in" with the
	; constants at offset / offset+8 are added to out1 / out2 (or become
	; them when out and tmp are the same register).
	; When the last argument is the literal token "pack", "in" is first
	; shifted right arithmetically by 16 and packed to signed-saturated
	; words in place, which modifies "in".  Any other token skips that step.
	%macro ANALYZE_MAC_REG 7 ; out1, out2, in, tmp1, tmp2, offset, pack
	%ifidn %7, pack
	psrad %3, 16 ; SBC_PROTO_FIXED_SCALE
	packssdw %3, %3
	%endif
	ANALYZE_MAC %1, %2, %3, %3, %4, %5, %4, %5, %6
	%endmacro

	;*******************************************************************
	;void ff_sbc_analyze_4(const int16_t *in, int32_t *out, const int16_t *consts);
	;
	; Stage 1: five pmaddwd multiply-accumulate steps over bytes 0..79 of
	;          both in and consts; the four dword sums are kept in m0/m1
	;          and start from the scale_mask rounding bias.
	; Stage 2: each pair of sums is shifted right by 16, packed to words
	;          and multiplied against consts bytes 80..111; the resulting
	;          four dwords are written to out[0..3].
	; No emms is executed in this function.
	;*******************************************************************
	INIT_MMX mmx
	cglobal sbc_analyze_4, 3, 3, 4, in, out, consts
	    ; First step: the products themselves land in m0/m1, then the
	    ; rounding bias is added.
	    ANALYZE_MAC_IN   m0, m1, m0, m1, [scale_mask], [scale_mask], 0

	    ; Remaining four steps at byte offsets 16, 32, 48, 64: products go
	    ; through m2/m3 and are accumulated into m0/m1.
	%assign sbc4_off 16
	%rep 4
	    ANALYZE_MAC_IN   m0, m1, m2, m3, m2, m3, sbc4_off
	    %assign sbc4_off sbc4_off+16
	%endrep
	%undef sbc4_off

	    ; Second stage.  m0 is both input and out1, so its packed value is
	    ; copied to m2 before being overwritten by the first product.
	    ANALYZE_MAC_REG  m0, m2, m0, m0, m2, 80, pack
	    ; m1's two products are added on top of m0/m2 (m3 is the scratch copy).
	    ANALYZE_MAC_REG  m0, m2, m1, m1, m3, 96, pack

	    movq          [outq  ], m0
	    movq          [outq+8], m2

	    RET


	;*******************************************************************
	;void ff_sbc_analyze_8(const int16_t *in, int32_t *out, const int16_t *consts);
	;
	; Stage 1: ten pmaddwd multiply-accumulate steps over bytes 0..159 of
	;          both in and consts.  Steps alternate between two accumulator
	;          pairs (m0/m1 and m2/m3), each started from the scale_mask
	;          rounding bias, giving eight dword sums.
	; Stage 2: the sums are shifted right by 16 and packed to words, then
	;          multiplied against consts bytes 160..287; eight dwords are
	;          written to out[0..7].
	; No emms is executed in this function.
	; NOTE(review): the cglobal line declares 4 vector registers although
	; m0-m7 are used; presumably irrelevant for MMX - confirm against x86inc.
	;*******************************************************************
	INIT_MMX mmx
	cglobal sbc_analyze_8, 3, 3, 4, in, out, consts
	    ; Seed both accumulator pairs: products land directly in the
	    ; accumulators, then the rounding bias is added.
	    ANALYZE_MAC_IN   m0, m1, m0, m1, [scale_mask], [scale_mask],  0
	    ANALYZE_MAC_IN   m2, m3, m2, m3, [scale_mask], [scale_mask], 16

	    ; Four more rounds of two steps each (byte offsets 32..144):
	    ; even steps accumulate into m0/m1 via m4/m5, odd steps into
	    ; m2/m3 via m6/m7.
	%assign sbc8_off 32
	%rep 4
	    ANALYZE_MAC_IN   m0, m1, m4, m5, m4, m5, sbc8_off
	    ANALYZE_MAC_IN   m2, m3, m6, m7, m6, m7, sbc8_off+16
	    %assign sbc8_off sbc8_off+32
	%endrep
	%undef sbc8_off

	    ; Second stage, first half: pack m0..m3 in place and build out[0..3]
	    ; in m4/m5 (m6/m7 are scratch for the m1..m3 terms).
	    ANALYZE_MAC_REG  m4, m5, m0, m4, m5, 160, pack
	    ANALYZE_MAC_REG  m4, m5, m1, m6, m7, 192, pack
	    ANALYZE_MAC_REG  m4, m5, m2, m6, m7, 224, pack
	    ANALYZE_MAC_REG  m4, m5, m3, m6, m7, 256, pack

	    movq          [outq  ], m4
	    movq          [outq+8], m5

	    ; Second stage, second half: m0..m3 are already packed ("no"), and
	    ; each is consumed as its own first product; out[4..7] is built in
	    ; m0/m5.
	    ANALYZE_MAC_REG  m0, m5, m0, m0, m5, 176, no
	    ANALYZE_MAC_REG  m0, m5, m1, m1, m7, 208, no
	    ANALYZE_MAC_REG  m0, m5, m2, m2, m7, 240, no
	    ANALYZE_MAC_REG  m0, m5, m3, m3, m7, 272, no

	    movq          [outq+16], m0
	    movq          [outq+24], m5

	    RET


	;*******************************************************************
	;void ff_sbc_calc_scalefactors(int32_t sb_sample_f[16][2][8],
	;                              uint32_t scale_factor[2][8],
	;                              int blocks, int channels, int subbands)
	;
	; Handles two adjacent int32 columns per outer iteration, walking from
	; the last column pair down to byte offset 0.  For each column, every
	; block contributes (|x| - 1) for a non-zero sample x, or 0 for x == 0;
	; the contributions are ORed together on top of the 0x8000 seed and the
	; stored scale factor is (index of the highest set bit) - 15.
	; The seed keeps the bsr operand non-zero.
	;*******************************************************************
	INIT_MMX mmx
	cglobal sbc_calc_scalefactors, 5, 7, 4, sb_sample_f, scale_factor, blocks, channels, subbands, ptr, blk
	    ; subbands = byte size of the columns to scan:
	    ; 4 * subbands, doubled when channels >= 2
	    shl           subbandsd, 2
	    movq          m3, [scale_mask]         ; per-column OR seed (0x8000)
	    cmp           channelsd, 2
	    jl            .next_pair
	    shl           subbandsd, 1

	.next_pair:
	    sub           subbandsq, 8             ; step back one pair of int32 columns
	    lea           ptrq, [sb_sample_fq + subbandsq]

	    ; blk = (blocks - 1) * 64, the byte offset of the last block
	    ; (64 bytes = one [2][8] int32 block per the prototype)
	    lea           blkq, [blocksq - 1]
	    shl           blkd, 6

	    movq          m0, m3                   ; restart the OR accumulator
	.next_block:
	    pxor          m2, m2
	    movq          m1, [ptrq+blkq]
	    pcmpgtd       m1, m2                   ; m1 = -1 where x > 0, else 0
	    paddd         m1, [ptrq+blkq]          ; x - 1 for x > 0, x otherwise
	    pcmpgtd       m2, m1                   ; m2 = -1 where that is negative
	    pxor          m1, m2                   ; complement negatives: |x| - 1 (0 for x == 0)

	    por           m0, m1

	    sub           blkq, 64
	    jns           .next_block              ; continue until the offset goes below 0

	    ; low dword -> first column of the pair
	    movd          blkd, m0
	    bsr           blkd, blkd
	    sub           blkd, 15    ; SCALE_OUT_BITS
	    mov           [scale_factorq + subbandsq], blkd

	    ; high dword -> second column of the pair
	    psrlq         m0,   32
	    movd          blkd, m0
	    bsr           blkd, blkd
	    sub           blkd, 15    ; SCALE_OUT_BITS
	    mov           [scale_factorq + subbandsq + 4], blkd

	    test          subbandsq, subbandsq
	    jg            .next_pair               ; more column pairs below this one

	    emms
	    RET