; jaspertv_gpl_out/lib/ffmpeg/ffmpeg/libavcodec/x86/mlpdsp.asm - nest-client-apple-tv/5.9.0/ffmpeg - Git at Google

 ;******************************************************************************
 ;* SIMD-optimized MLP DSP functions
 ;* Copyright (c) 2014 James Almer <jamrial@gmail.com>
 ;*
 ;* This file is part of FFmpeg.
 ;*
 ;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
 ;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
 ;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************

 %include "libavutil/x86/x86util.asm"

 SECTION .text

 %if ARCH_X86_64

 ; SHLX dst, cnt
 ;   Shift dst left, in place, by the count held in register cnt.
 ;     %1 = destination register, written with its size suffix (e.g. noiseq)
 ;     %2 = x86inc register name WITHOUT a size suffix; this macro appends
 ;          "b" (plain shl) or "q" (BMI2 shlx) itself.
 ;   The plain shl form only takes its variable count in cl, so on non-BMI2
 ;   builds the caller must have mapped %2 onto rcx.
 %macro SHLX 2
 %if notcpuflag(bmi2)
     shl          %1, %2b
 %else
     shlx         %1, %1, %2q
 %endif
 %endmacro

 ; REMATRIX
 ;   Multiplies the eight 32-bit values at [samplesq] by the eight 32-bit
 ;   values at [coeffsq] (signed 32x32 -> 64 bit, pmuldq) and accumulates the
 ;   products into two 64-bit lanes of xm0. The caller still has to add the
 ;   high qword of xm0 to the low one to obtain the full sum.
 ;   Uses m1-m3 as scratch (plus m4 on the non-AVX2 path). Loads are movdqa,
 ;   i.e. both pointers must satisfy the aligned-load requirement.
 %macro REMATRIX 0
     movdqa        m0, [samplesq]                    ; first register-width of samples
     movdqa        m1, [coeffsq ]                    ; matching coefficients
     pshufd        m2, m0, q2301                     ; swap dwords in each qword: odd samples -> even lanes
     pshufd        m3, m1, q2301                     ; same for the coefficients
     pmuldq        m0, m1                            ; products of the even-indexed elements
     pmuldq        m3, m2                            ; products of the odd-indexed elements
     paddq         m0, m3                            ; per-qword partial sums
 %if cpuflag(avx2)
     ; 256-bit registers already covered all eight elements: fold the upper
     ; 128-bit half onto the lower one.
     vextracti128 xm1, m0, 1
     paddq        xm0, xm1
 %else
     ; 128-bit registers: repeat the multiply-accumulate for elements 4..7.
     movdqa        m1, [samplesq + 16]
     movdqa        m2, [coeffsq  + 16]
     pshufd        m3, m1, q2301
     pshufd        m4, m2, q2301
     pmuldq        m1, m2
     pmuldq        m4, m3
     paddq         m0, m1
     paddq         m0, m4
 %endif
 %endmacro

 ; LOOP_END
 ;   Tail of one iteration of the loops that add no noise.
 ;   On entry xm0 holds two 64-bit partial sums. They are folded into one
 ;   64-bit accumulator, which is then shifted, masked, offset by the current
 ;   bypassed_lsbs byte and stored to samples[dest_ch] (dest_chq is a byte
 ;   offset). Both row pointers are advanced by one row afterwards.
 ;   The final cmp deliberately leaves the flags for the jne that follows the
 ;   macro invocation: ZF is set once blsbs_ptr has reached cnt.
 ;   Clobbers xm1, accum, blsbs and the flags.
 %macro LOOP_END 0
     pshufd       xm1, xm0, q0032                    ; bring the high qword of xm0 down to the low qword of xm1
     paddq        xm0, xm1                           ; low qword of xm0 = complete 64-bit sum
     movq      accumq, xm0
     movzx     blsbsd, byte [blsbs_ptrq]             ; load *bypassed_lsbs
     sar       accumq, 14                            ; accum >>= 14
     and       accumd, maskd                         ; accum &= mask
     add       accumd, blsbsd                        ; accum += *bypassed_lsbs
     mov   [samplesq + dest_chq], accumd             ; samples[dest_ch] = accum
     add   blsbs_ptrq, 8                             ; bypassed_lsbs += MAX_CHANNELS;
     add     samplesq, 32                            ; samples += MAX_CHANNELS;
     cmp   blsbs_ptrq, cntq                          ; reached the end pointer? (flags consumed by caller)
 %endmacro

 ; LOOP_SHIFT_END
 ;   Tail of one iteration of the loops that add noise.
 ;   Same as LOOP_END, but before the >> 14 a sign-extended byte from
 ;   noise_buffer, shifted left by the count in mns, is added to the 64-bit
 ;   accumulator, and the noise index is stepped.
 ;   By the time this macro runs the caller has already turned
 ;     ausp into access_unit_size_pow2 - 1, and
 ;     mns  into matrix_noise_shift + 7,
 ;   so the register names no longer match the raw argument values.
 ;   The final cmp leaves the flags for the jne that follows the macro.
 ;   Clobbers xm1, accum, noise, index and the flags.
 %macro LOOP_SHIFT_END 0
     pshufd       xm1, xm0, q0032                    ; bring the high qword of xm0 down to the low qword of xm1
     paddq        xm0, xm1                           ; low qword of xm0 = complete 64-bit sum
     movq      accumq, xm0
     and       indexd, auspd                         ; index &= access_unit_size_pow2 - 1 (ausp was decremented by the caller)
     movsx     noiseq, byte [noise_bufferq + indexq] ; load noise_buffer[index], sign-extended to 64 bits
     add       indexd, index2d                       ; index += index2
     SHLX      noiseq, mns                           ; noise <<= matrix_noise_shift + 7 (mns already holds the sum)
     add       accumq, noiseq                        ; accum += shifted noise
     movzx     noised, byte [blsbs_ptrq]             ; load *bypassed_lsbs (reuse tmp noise register)
     sar       accumq, 14                            ; accum >>= 14
     and       accumd, maskd                         ; accum &= mask
     add       accumd, noised                        ; accum += *bypassed_lsbs
     mov   [samplesq + dest_chq], accumd             ; samples[dest_ch] = accum
     add   blsbs_ptrq, 8                             ; bypassed_lsbs += MAX_CHANNELS;
     add     samplesq, 32                            ; samples += MAX_CHANNELS;
     cmp   blsbs_ptrq, cntq                          ; reached the end pointer? (flags consumed by caller)
 %endmacro

 ;void ff_mlp_rematrix_channel(int32_t *samples, const int32_t *coeffs,
 ;                             const uint8_t *bypassed_lsbs, const int8_t *noise_buffer,
 ;                             int index, unsigned int dest_ch, uint16_t blockpos,
 ;                             unsigned int maxchan, int matrix_noise_shift,
 ;                             int access_unit_size_pow2, int32_t mask)
 ;
 ; For every row from bypassed_lsbs up to bypassed_lsbs + blockpos * 8
 ; (8 bytes of bypassed_lsbs and 32 bytes of samples per row):
 ;     accum  = sum of samples[i] * coeffs[i]               (64-bit)
 ;     accum += noise_buffer[index] << (matrix_noise_shift + 7)
 ;                                        (only when matrix_noise_shift != 0)
 ;     samples[dest_ch] = ((accum >> 14) & mask) + *bypassed_lsbs
 ; In the noise case index is masked with access_unit_size_pow2 - 1 before each
 ; load and advanced by 2 * (initial index) + 1 afterwards.
 ;
 ; Four loop bodies exist: .loop8/.loop4 without noise and
 ; .loop8_shift/.loop4_shift with noise. The "4" variants multiply only the
 ; first four sample/coefficient pairs; they are chosen when maxchan < 4.
 ;
 ; Register naming: the names passed to cglobal label r0..r11 in order. No
 ; argument is loaded automatically (argument count 0), so stack arguments are
 ; fetched by hand through the <name>m / rNm aliases. r3 carries the
 ; noise_buffer argument but is called blsbs at first because the no-noise
 ; loops only use it as scratch for the loaded bypassed_lsbs byte. The .shift
 ; path renames registers with DEFINE_ARGS, which is why some instructions
 ; there use raw rN names.
 ;
 ; NOTE(review): every loop is bottom-tested (cmp/jne on pointer equality), so
 ; the body runs at least once; presumably callers never pass blockpos == 0 -
 ; confirm.
 ; NOTE(review): maxchan is declared unsigned but tested with the signed jl;
 ; the two agree for all values below 2^31.
 %macro MLP_REMATRIX_CHANNEL 0
 cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \
                                         index, dest_ch, blockpos, maxchan, mns, \
                                         accum, mask, cnt
     mov         mnsd, mnsm                          ; load matrix_noise_shift
     movzx  blockposq, word blockposm                ; load and zero extend blockpos (16bit)
     mov     maxchand, maxchanm                      ; load maxchan
     mov        maskd, maskm                         ; load mask
 %if WIN64
     mov     dest_chd, dest_chm                      ; load dest_chd (not needed on UNIX64)
 %endif
     shl     dest_chd, 2                             ; dest_ch *= 4: byte offset of an int32 inside a sample row
     lea         cntq, [blsbs_ptrq + blockposq*8]    ; cnt = end pointer = bypassed_lsbs + blockpos * MAX_CHANNELS
     test        mnsd, mnsd                          ; is matrix_noise_shift != 0?
     jne .shift                                      ; jump if true
     cmp     maxchand, 4                             ; is maxchan < 4?
     jl .loop4                                       ; jump if true

 align 16
 .loop8:
     ; Process 5 or more channels
     REMATRIX
     LOOP_END
     jne .loop8                                      ; flags come from the cmp at the end of LOOP_END
     RET

 align 16
 .loop4:
     ; Process up to 4 channels
     ; Same multiply-accumulate as REMATRIX, restricted to the first 128 bits.
     movdqa       xm0, [samplesq]
     movdqa       xm1, [coeffsq ]
     pshufd       xm2, xm0, q2301
     pshufd       xm3, xm1, q2301
     pmuldq       xm0, xm1
     pmuldq       xm3, xm2
     paddq        xm0, xm3
     LOOP_END
     jne .loop4
     RET

 .shift:
 %if WIN64
     mov       indexd, indexm         ; load index (not needed on UNIX64)
 %endif
     mov          r9d, r9m            ; load access_unit_size_pow2 (r9 is renamed from accum to ausp below)
 %if cpuflag(bmi2)
     ; bmi2 has shift functions that accept any gpr, not just cl, so keep things in place.
     ; Only r6 (blockpos -> accum), r7 (maxchan -> index2), r9 (-> ausp) and
     ; r3 (blsbs -> noise_buffer) change name; r12 becomes the noise scratch.
     DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \
                 index, dest_ch, accum, index2, mns, \
                 ausp, mask, cnt, noise
     add         mnsd, 7              ; matrix_noise_shift += 7
 %else ; sse4
     ; r6 held blockpos, which is dead now that cnt has been computed.
     mov           r6, rcx            ; move rcx elsewhere so we can use cl for matrix_noise_shift
 %if WIN64
     ; r0 = rcx
     ; samples moves to r6, r0 becomes mns, r8 becomes accum.
     DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, index, dest_ch, samples, \
                 index2, accum, ausp, mask, cnt, noise
 %else ; UNIX64
     ; r3 = rcx
     ; noise_buffer moves to r6, r3 becomes mns, r8 becomes accum.
     DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, index, dest_ch, noise_buffer, \
                 index2, accum, ausp, mask, cnt, noise
 %endif
     ; r8 still contains the matrix_noise_shift value loaded at function entry.
     lea         mnsd, [r8 + 7]       ; rcx = matrix_noise_shift + 7
 %endif ; cpuflag
     sub        auspd, 1              ; access_unit_size_pow2 -= 1
     cmp          r7d, 4              ; is maxchan < 4? (raw r7: the name maxchan was replaced by index2)
     lea      index2q, [indexq*2 + 1] ; index2 = 2 * index + 1; overwrites maxchan, lea keeps the cmp flags
     jl .loop4_shift                  ; jump if maxchan < 4

 align 16
 .loop8_shift:
     ; Process 5 or more channels
     REMATRIX
     LOOP_SHIFT_END
     jne .loop8_shift                 ; flags come from the cmp at the end of LOOP_SHIFT_END
     RET

 align 16
 .loop4_shift:
     ; Process up to 4 channels
     ; Same multiply-accumulate as REMATRIX, restricted to the first 128 bits.
     movdqa       xm0, [samplesq]
     movdqa       xm1, [coeffsq ]
     pshufd       xm2, xm0, q2301
     pshufd       xm3, xm1, q2301
     pmuldq       xm0, xm1
     pmuldq       xm3, xm2
     paddq        xm0, xm3
     LOOP_SHIFT_END
     jne .loop4_shift
     RET
 %endmacro

 ; Expand the function body once per instruction set.
 ; With INIT_XMM sse4 the cpuflag(avx2)/cpuflag(bmi2) tests inside the macros
 ; are false, so the 128-bit code paths and the cl-based shl are assembled.
 ; NOTE(review): x86inc presumably appends the cpu name to the exported symbol
 ; (..._sse4 / ..._avx2) - confirm against the C init code.
 INIT_XMM sse4
 MLP_REMATRIX_CHANNEL
 ; The AVX2 + BMI2 version is only assembled when the build reports that the
 ; assembler supports AVX2 (HAVE_AVX2_EXTERNAL).
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2, bmi2
 MLP_REMATRIX_CHANNEL
 %endif

 %endif ; ARCH_X86_64
	;******************************************************************************
	;* SIMD-optimized MLP DSP functions
	;* Copyright (c) 2014 James Almer <jamrial@gmail.com>
	;*
	;* This file is part of FFmpeg.
	;*
	;* FFmpeg is free software; you can redistribute it and/or
	;* modify it under the terms of the GNU Lesser General Public
	;* License as published by the Free Software Foundation; either
	;* version 2.1 of the License, or (at your option) any later version.
	;*
	;* FFmpeg is distributed in the hope that it will be useful,
	;* but WITHOUT ANY WARRANTY; without even the implied warranty of
	;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	;* Lesser General Public License for more details.
	;*
	;* You should have received a copy of the GNU Lesser General Public
	;* License along with FFmpeg; if not, write to the Free Software
	;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	;******************************************************************************

	%include "libavutil/x86/x86util.asm"

	SECTION .text

	%if ARCH_X86_64

	; SHLX dst, cnt
	; In-place left shift of dst by the count stored in register cnt.
	; %1 = destination register including its size suffix (e.g. noiseq)
	; %2 = x86inc register name without size suffix; "b" or "q" is appended here
	; The non-BMI2 form is a plain shl, whose variable count must live in cl,
	; so the caller has to map %2 onto rcx in that configuration.
	%macro SHLX 2
	%if notcpuflag(bmi2)
	shl %1, %2b
	%else
	shlx %1, %1, %2q
	%endif
	%endmacro

	; REMATRIX
	; Signed 32x32 -> 64 bit multiply of the eight dwords at [samplesq] with the
	; eight dwords at [coeffsq]; the products are accumulated into the two
	; 64-bit lanes of xm0 (the caller adds the two lanes together).
	; Scratch: m1-m3, plus m4 when AVX2 is not available.
	; All loads are aligned loads (movdqa).
	%macro REMATRIX 0
	movdqa m0, [samplesq] ; first register-width of samples
	movdqa m1, [coeffsq ] ; matching coefficients
	pshufd m2, m0, q2301 ; odd-indexed samples moved into the even lanes
	pshufd m3, m1, q2301 ; odd-indexed coefficients moved into the even lanes
	pmuldq m0, m1 ; even-indexed products
	pmuldq m3, m2 ; odd-indexed products
	paddq m0, m3 ; per-qword partial sums
	%if cpuflag(avx2)
	; ymm registers already hold all eight elements: fold upper half onto lower
	vextracti128 xm1, m0, 1
	paddq xm0, xm1
	%else
	; xmm registers: second pass over elements 4..7
	movdqa m1, [samplesq + 16]
	movdqa m2, [coeffsq + 16]
	pshufd m3, m1, q2301
	pshufd m4, m2, q2301
	pmuldq m1, m2
	pmuldq m4, m3
	paddq m0, m1
	paddq m0, m4
	%endif
	%endmacro

	; LOOP_END
	; Tail of one iteration of the loops that add no noise.
	; Folds the two 64-bit partial sums in xm0 into one accumulator, applies
	; >> 14 and the mask, adds the current bypassed_lsbs byte, stores the result
	; to samples[dest_ch] (dest_chq is a byte offset) and advances both row
	; pointers. The closing cmp leaves the flags for the jne that follows the
	; macro invocation.
	; Clobbers xm1, accum, blsbs and the flags.
	%macro LOOP_END 0
	pshufd xm1, xm0, q0032 ; high qword of xm0 -> low qword of xm1
	paddq xm0, xm1 ; low qword of xm0 = complete 64-bit sum
	movq accumq, xm0
	movzx blsbsd, byte [blsbs_ptrq] ; load *bypassed_lsbs
	sar accumq, 14 ; accum >>= 14
	and accumd, maskd ; accum &= mask
	add accumd, blsbsd ; accum += *bypassed_lsbs
	mov [samplesq + dest_chq], accumd ; samples[dest_ch] = accum
	add blsbs_ptrq, 8 ; bypassed_lsbs += MAX_CHANNELS;
	add samplesq, 32 ; samples += MAX_CHANNELS;
	cmp blsbs_ptrq, cntq ; reached the end pointer? (flags consumed by caller)
	%endmacro

	; LOOP_SHIFT_END
	; Tail of one iteration of the loops that add noise.
	; Like LOOP_END, but a sign-extended noise_buffer byte, shifted left by the
	; count in mns, is added to the 64-bit accumulator before the >> 14, and the
	; noise index is stepped. When this macro runs the caller has already
	; changed ausp to access_unit_size_pow2 - 1 and mns to
	; matrix_noise_shift + 7.
	; Clobbers xm1, accum, noise, index and the flags; the closing cmp feeds the
	; jne that follows the macro invocation.
	%macro LOOP_SHIFT_END 0
	pshufd xm1, xm0, q0032 ; high qword of xm0 -> low qword of xm1
	paddq xm0, xm1 ; low qword of xm0 = complete 64-bit sum
	movq accumq, xm0
	and indexd, auspd ; index &= access_unit_size_pow2 - 1 (ausp was decremented by the caller)
	movsx noiseq, byte [noise_bufferq + indexq] ; load noise_buffer[index], sign-extended to 64 bits
	add indexd, index2d ; index += index2
	SHLX noiseq, mns ; noise <<= matrix_noise_shift + 7 (mns already holds the sum)
	add accumq, noiseq ; accum += shifted noise
	movzx noised, byte [blsbs_ptrq] ; load *bypassed_lsbs (reuse tmp noise register)
	sar accumq, 14 ; accum >>= 14
	and accumd, maskd ; accum &= mask
	add accumd, noised ; accum += *bypassed_lsbs
	mov [samplesq + dest_chq], accumd ; samples[dest_ch] = accum
	add blsbs_ptrq, 8 ; bypassed_lsbs += MAX_CHANNELS;
	add samplesq, 32 ; samples += MAX_CHANNELS;
	cmp blsbs_ptrq, cntq ; reached the end pointer? (flags consumed by caller)
	%endmacro

	;void ff_mlp_rematrix_channel(int32_t *samples, const int32_t *coeffs,
	;                             const uint8_t *bypassed_lsbs, const int8_t *noise_buffer,
	;                             int index, unsigned int dest_ch, uint16_t blockpos,
	;                             unsigned int maxchan, int matrix_noise_shift,
	;                             int access_unit_size_pow2, int32_t mask)
	;
	; For every row from bypassed_lsbs up to bypassed_lsbs + blockpos * 8
	; (8 bytes of bypassed_lsbs and 32 bytes of samples per row):
	;     accum  = sum of samples[i] * coeffs[i]               (64-bit)
	;     accum += noise_buffer[index] << (matrix_noise_shift + 7)
	;                                        (only when matrix_noise_shift != 0)
	;     samples[dest_ch] = ((accum >> 14) & mask) + *bypassed_lsbs
	; In the noise case index is masked with access_unit_size_pow2 - 1 before
	; each load and advanced by 2 * (initial index) + 1 afterwards.
	;
	; .loop8/.loop4 handle the no-noise case, .loop8_shift/.loop4_shift the
	; noise case; the "4" variants multiply only the first four
	; sample/coefficient pairs and are chosen when maxchan < 4.
	;
	; The names passed to cglobal label r0..r11 in order; no argument is loaded
	; automatically, so stack arguments are fetched through the <name>m / rNm
	; aliases. r3 carries noise_buffer but is called blsbs at first because the
	; no-noise loops only use it as scratch. DEFINE_ARGS renames registers in
	; the .shift path, hence the raw rN names there.
	;
	; NOTE(review): all loops are bottom-tested, so the body runs at least once;
	; presumably callers never pass blockpos == 0 - confirm.
	%macro MLP_REMATRIX_CHANNEL 0
	cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \
	index, dest_ch, blockpos, maxchan, mns, \
	accum, mask, cnt
	mov mnsd, mnsm ; load matrix_noise_shift
	movzx blockposq, word blockposm ; load and zero extend blockpos (16bit)
	mov maxchand, maxchanm ; load maxchan
	mov maskd, maskm ; load mask
	%if WIN64
	mov dest_chd, dest_chm ; load dest_chd (not needed on UNIX64)
	%endif
	shl dest_chd, 2 ; dest_ch *= 4: byte offset of an int32 inside a sample row
	lea cntq, [blsbs_ptrq + blockposq*8] ; cnt = end pointer = bypassed_lsbs + blockpos * MAX_CHANNELS
	test mnsd, mnsd ; is matrix_noise_shift != 0?
	jne .shift ; jump if true
	cmp maxchand, 4 ; is maxchan < 4?
	jl .loop4 ; jump if true

	align 16
	.loop8:
	; Process 5 or more channels
	REMATRIX
	LOOP_END
	jne .loop8 ; flags come from the cmp at the end of LOOP_END
	RET

	align 16
	.loop4:
	; Process up to 4 channels
	; Same multiply-accumulate as REMATRIX, restricted to the first 128 bits.
	movdqa xm0, [samplesq]
	movdqa xm1, [coeffsq ]
	pshufd xm2, xm0, q2301
	pshufd xm3, xm1, q2301
	pmuldq xm0, xm1
	pmuldq xm3, xm2
	paddq xm0, xm3
	LOOP_END
	jne .loop4
	RET

	.shift:
	%if WIN64
	mov indexd, indexm ; load index (not needed on UNIX64)
	%endif
	mov r9d, r9m ; load access_unit_size_pow2 (r9 is renamed from accum to ausp below)
	%if cpuflag(bmi2)
	; bmi2 has shift functions that accept any gpr, not just cl, so keep things in place.
	DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \
	index, dest_ch, accum, index2, mns, \
	ausp, mask, cnt, noise
	add mnsd, 7 ; matrix_noise_shift += 7
	%else ; sse4
	; r6 held blockpos, which is dead now that cnt has been computed.
	mov r6, rcx ; move rcx elsewhere so we can use cl for matrix_noise_shift
	%if WIN64
	; r0 = rcx: samples moves to r6, r0 becomes mns, r8 becomes accum
	DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, index, dest_ch, samples, \
	index2, accum, ausp, mask, cnt, noise
	%else ; UNIX64
	; r3 = rcx: noise_buffer moves to r6, r3 becomes mns, r8 becomes accum
	DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, index, dest_ch, noise_buffer, \
	index2, accum, ausp, mask, cnt, noise
	%endif
	; r8 still contains the matrix_noise_shift value loaded at function entry.
	lea mnsd, [r8 + 7] ; rcx = matrix_noise_shift + 7
	%endif ; cpuflag
	sub auspd, 1 ; access_unit_size_pow2 -= 1
	cmp r7d, 4 ; is maxchan < 4? (raw r7: the name maxchan was replaced by index2)
	; Scale factor restored: the address must be indexq*2 + 1, not the
	; undefined symbol "indexq2". lea leaves the cmp flags intact for the jl.
	lea index2q, [indexq*2 + 1] ; index2 = 2 * index + 1;
	jl .loop4_shift ; jump if maxchan < 4

	align 16
	.loop8_shift:
	; Process 5 or more channels
	REMATRIX
	LOOP_SHIFT_END
	jne .loop8_shift ; flags come from the cmp at the end of LOOP_SHIFT_END
	RET

	align 16
	.loop4_shift:
	; Process up to 4 channels
	; Same multiply-accumulate as REMATRIX, restricted to the first 128 bits.
	movdqa xm0, [samplesq]
	movdqa xm1, [coeffsq ]
	pshufd xm2, xm0, q2301
	pshufd xm3, xm1, q2301
	pmuldq xm0, xm1
	pmuldq xm3, xm2
	paddq xm0, xm3
	LOOP_SHIFT_END
	jne .loop4_shift
	RET
	%endmacro

	; Expand the function body once per instruction set.
	; With INIT_XMM sse4 the cpuflag(avx2)/cpuflag(bmi2) tests inside the macros
	; are false, so the 128-bit code paths and the cl-based shl are assembled.
	; NOTE(review): x86inc presumably appends the cpu name to the exported
	; symbol (..._sse4 / ..._avx2) - confirm against the C init code.
	INIT_XMM sse4
	MLP_REMATRIX_CHANNEL
	; The AVX2 + BMI2 version is only assembled when the build reports that the
	; assembler supports AVX2 (HAVE_AVX2_EXTERNAL).
	%if HAVE_AVX2_EXTERNAL
	INIT_YMM avx2, bmi2
	MLP_REMATRIX_CHANNEL
	%endif

	%endif ; ARCH_X86_64