nest-open-source / nest-cam / 4320010 / libopus / 73ff3db2d25392f4f8c12c5d73c839447eccd500 / . / libopus / celt / arm / celt_pitch_xcorr_arm_gnu.s

.syntax unified | |

@ Copyright (c) 2007-2008 CSIRO | |

@ Copyright (c) 2007-2009 Xiph.Org Foundation | |

@ Copyright (c) 2013 Parrot | |

@ Written by AurĂ©lien Zanelli | |

@ | |

@ Redistribution and use in source and binary forms, with or without | |

@ modification, are permitted provided that the following conditions | |

@ are met: | |

@ | |

@ - Redistributions of source code must retain the above copyright | |

@ notice, this list of conditions and the following disclaimer. | |

@ | |

@ - Redistributions in binary form must reproduce the above copyright | |

@ notice, this list of conditions and the following disclaimer in the | |

@ documentation and/or other materials provided with the distribution. | |

@ | |

@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |

@ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |

@ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |

@ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER | |

@ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |

@ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |

@ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |

@ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | |

@ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | |

@ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |

@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |

.text; .p2align 2; .arch armv7-a | |

.fpu neon | |

.object_arch armv4t | |

.include "celt/arm/armopts_gnu.s" | |

.if OPUS_ARM_MAY_HAVE_EDSP | |

.global celt_pitch_xcorr_edsp | |

.endif | |

.if OPUS_ARM_MAY_HAVE_NEON | |

.global celt_pitch_xcorr_neon | |

.endif | |

.if OPUS_ARM_MAY_HAVE_NEON | |

@ Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3 | |

; xcorr_kernel_neon: @ PROC | |

xcorr_kernel_neon_start: | |

@ input: | |

@ r3 = int len | |

@ r4 = opus_val16 *x | |

@ r5 = opus_val16 *y | |

@ q0 = opus_val32 sum[4] | |

@ output: | |

@ q0 = opus_val32 sum[4] | |

@ preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15 | |

@ internal usage: | |

@ r12 = int j | |

@ d3 = y_3|y_2|y_1|y_0 | |

@ q2 = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4 | |

@ q3 = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0 | |

@ q8 = scratch | |

@ | |

@ Load y[0...3] | |

@ This requires len>0 to always be valid (which we assert in the C code). | |

VLD1.16 {d5}, [r5]! | |

SUBS r12, r3, #8 | |

BLE xcorr_kernel_neon_process4 | |

@ Process 8 samples at a time. | |

@ This loop loads one y value more than we actually need. Therefore we have to | |

@ stop as soon as there are 8 or fewer samples left (instead of 7), to avoid | |

@ reading past the end of the array. | |

xcorr_kernel_neon_process8: | |

@ This loop has 19 total instructions (10 cycles to issue, minimum), with | |

@ - 2 cycles of ARM insrtuctions, | |

@ - 10 cycles of load/store/byte permute instructions, and | |

@ - 9 cycles of data processing instructions. | |

@ On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the | |

@ latter two categories, meaning the whole loop should run in 10 cycles per | |

@ iteration, barring cache misses. | |

@ | |

@ Load x[0...7] | |

VLD1.16 {d6, d7}, [r4]! | |

@ Unlike VMOV, VAND is a data processsing instruction (and doesn't get | |

@ assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1. | |

VAND d3, d5, d5 | |

SUBS r12, r12, #8 | |

@ Load y[4...11] | |

VLD1.16 {d4, d5}, [r5]! | |

VMLAL.S16 q0, d3, d6[0] | |

VEXT.16 d16, d3, d4, #1 | |

VMLAL.S16 q0, d4, d7[0] | |

VEXT.16 d17, d4, d5, #1 | |

VMLAL.S16 q0, d16, d6[1] | |

VEXT.16 d16, d3, d4, #2 | |

VMLAL.S16 q0, d17, d7[1] | |

VEXT.16 d17, d4, d5, #2 | |

VMLAL.S16 q0, d16, d6[2] | |

VEXT.16 d16, d3, d4, #3 | |

VMLAL.S16 q0, d17, d7[2] | |

VEXT.16 d17, d4, d5, #3 | |

VMLAL.S16 q0, d16, d6[3] | |

VMLAL.S16 q0, d17, d7[3] | |

BGT xcorr_kernel_neon_process8 | |

@ Process 4 samples here if we have > 4 left (still reading one extra y value). | |

xcorr_kernel_neon_process4: | |

ADDS r12, r12, #4 | |

BLE xcorr_kernel_neon_process2 | |

@ Load x[0...3] | |

VLD1.16 d6, [r4]! | |

@ Use VAND since it's a data processing instruction again. | |

VAND d4, d5, d5 | |

SUB r12, r12, #4 | |

@ Load y[4...7] | |

VLD1.16 d5, [r5]! | |

VMLAL.S16 q0, d4, d6[0] | |

VEXT.16 d16, d4, d5, #1 | |

VMLAL.S16 q0, d16, d6[1] | |

VEXT.16 d16, d4, d5, #2 | |

VMLAL.S16 q0, d16, d6[2] | |

VEXT.16 d16, d4, d5, #3 | |

VMLAL.S16 q0, d16, d6[3] | |

@ Process 2 samples here if we have > 2 left (still reading one extra y value). | |

xcorr_kernel_neon_process2: | |

ADDS r12, r12, #2 | |

BLE xcorr_kernel_neon_process1 | |

@ Load x[0...1] | |

VLD2.16 {d6[],d7[]}, [r4]! | |

@ Use VAND since it's a data processing instruction again. | |

VAND d4, d5, d5 | |

SUB r12, r12, #2 | |

@ Load y[4...5] | |

VLD1.32 {d5[]}, [r5]! | |

VMLAL.S16 q0, d4, d6 | |

VEXT.16 d16, d4, d5, #1 | |

@ Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI | |

@ instead of VEXT, since it's a data-processing instruction. | |

VSRI.64 d5, d4, #32 | |

VMLAL.S16 q0, d16, d7 | |

@ Process 1 sample using the extra y value we loaded above. | |

xcorr_kernel_neon_process1: | |

@ Load next *x | |

VLD1.16 {d6[]}, [r4]! | |

ADDS r12, r12, #1 | |

@ y[0...3] are left in d5 from prior iteration(s) (if any) | |

VMLAL.S16 q0, d5, d6 | |

MOVLE pc, lr | |

@ Now process 1 last sample, not reading ahead. | |

@ Load last *y | |

VLD1.16 {d4[]}, [r5]! | |

VSRI.64 d4, d5, #16 | |

@ Load last *x | |

VLD1.16 {d6[]}, [r4]! | |

VMLAL.S16 q0, d4, d6 | |

MOV pc, lr | |

.size xcorr_kernel_neon, .-xcorr_kernel_neon @ ENDP | |

@ opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y, | |

@ opus_val32 *xcorr, int len, int max_pitch) | |

; celt_pitch_xcorr_neon: @ PROC | |

@ input: | |

@ r0 = opus_val16 *_x | |

@ r1 = opus_val16 *_y | |

@ r2 = opus_val32 *xcorr | |

@ r3 = int len | |

@ output: | |

@ r0 = int maxcorr | |

@ internal usage: | |

@ r4 = opus_val16 *x (for xcorr_kernel_neon()) | |

@ r5 = opus_val16 *y (for xcorr_kernel_neon()) | |

@ r6 = int max_pitch | |

@ r12 = int j | |

@ q15 = int maxcorr[4] (q15 is not used by xcorr_kernel_neon()) | |

STMFD sp!, {r4-r6, lr} | |

LDR r6, [sp, #16] | |

VMOV.S32 q15, #1 | |

@ if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done | |

SUBS r6, r6, #4 | |

BLT celt_pitch_xcorr_neon_process4_done | |

celt_pitch_xcorr_neon_process4: | |

@ xcorr_kernel_neon parameters: | |

@ r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0} | |

MOV r4, r0 | |

MOV r5, r1 | |

VEOR q0, q0, q0 | |

@ xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3. | |

@ So we don't save/restore any other registers. | |

BL xcorr_kernel_neon_start | |

SUBS r6, r6, #4 | |

VST1.32 {q0}, [r2]! | |

@ _y += 4 | |

ADD r1, r1, #8 | |

VMAX.S32 q15, q15, q0 | |

@ if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done | |

BGE celt_pitch_xcorr_neon_process4 | |

@ We have less than 4 sums left to compute. | |

celt_pitch_xcorr_neon_process4_done: | |

ADDS r6, r6, #4 | |

@ Reduce maxcorr to a single value | |

VMAX.S32 d30, d30, d31 | |

VPMAX.S32 d30, d30, d30 | |

@ if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done | |

BLE celt_pitch_xcorr_neon_done | |

@ Now compute each remaining sum one at a time. | |

celt_pitch_xcorr_neon_process_remaining: | |

MOV r4, r0 | |

MOV r5, r1 | |

VMOV.I32 q0, #0 | |

SUBS r12, r3, #8 | |

BLT celt_pitch_xcorr_neon_process_remaining4 | |

@ Sum terms 8 at a time. | |

celt_pitch_xcorr_neon_process_remaining_loop8: | |

@ Load x[0...7] | |

VLD1.16 {q1}, [r4]! | |

@ Load y[0...7] | |

VLD1.16 {q2}, [r5]! | |

SUBS r12, r12, #8 | |

VMLAL.S16 q0, d4, d2 | |

VMLAL.S16 q0, d5, d3 | |

BGE celt_pitch_xcorr_neon_process_remaining_loop8 | |

@ Sum terms 4 at a time. | |

celt_pitch_xcorr_neon_process_remaining4: | |

ADDS r12, r12, #4 | |

BLT celt_pitch_xcorr_neon_process_remaining4_done | |

@ Load x[0...3] | |

VLD1.16 {d2}, [r4]! | |

@ Load y[0...3] | |

VLD1.16 {d3}, [r5]! | |

SUB r12, r12, #4 | |

VMLAL.S16 q0, d3, d2 | |

celt_pitch_xcorr_neon_process_remaining4_done: | |

@ Reduce the sum to a single value. | |

VADD.S32 d0, d0, d1 | |

VPADDL.S32 d0, d0 | |

ADDS r12, r12, #4 | |

BLE celt_pitch_xcorr_neon_process_remaining_loop_done | |

@ Sum terms 1 at a time. | |

celt_pitch_xcorr_neon_process_remaining_loop1: | |

VLD1.16 {d2[]}, [r4]! | |

VLD1.16 {d3[]}, [r5]! | |

SUBS r12, r12, #1 | |

VMLAL.S16 q0, d2, d3 | |

BGT celt_pitch_xcorr_neon_process_remaining_loop1 | |

celt_pitch_xcorr_neon_process_remaining_loop_done: | |

VST1.32 {d0[0]}, [r2]! | |

VMAX.S32 d30, d30, d0 | |

SUBS r6, r6, #1 | |

@ _y++ | |

ADD r1, r1, #2 | |

@ if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining | |

BGT celt_pitch_xcorr_neon_process_remaining | |

celt_pitch_xcorr_neon_done: | |

VMOV.32 r0, d30[0] | |

LDMFD sp!, {r4-r6, pc} | |

.size celt_pitch_xcorr_neon, .-celt_pitch_xcorr_neon @ ENDP | |

.endif | |

.if OPUS_ARM_MAY_HAVE_EDSP | |

@ This will get used on ARMv7 devices without NEON, so it has been optimized | |

@ to take advantage of dual-issuing where possible. | |

; xcorr_kernel_edsp: @ PROC | |

xcorr_kernel_edsp_start: | |

@ input: | |

@ r3 = int len | |

@ r4 = opus_val16 *_x (must be 32-bit aligned) | |

@ r5 = opus_val16 *_y (must be 32-bit aligned) | |

@ r6...r9 = opus_val32 sum[4] | |

@ output: | |

@ r6...r9 = opus_val32 sum[4] | |

@ preserved: r0-r5 | |

@ internal usage | |

@ r2 = int j | |

@ r12,r14 = opus_val16 x[4] | |

@ r10,r11 = opus_val16 y[4] | |

STMFD sp!, {r2,r4,r5,lr} | |

LDR r10, [r5], #4 @ Load y[0...1] | |

SUBS r2, r3, #4 @ j = len-4 | |

LDR r11, [r5], #4 @ Load y[2...3] | |

BLE xcorr_kernel_edsp_process4_done | |

LDR r12, [r4], #4 @ Load x[0...1] | |

@ Stall | |

xcorr_kernel_edsp_process4: | |

@ The multiplies must issue from pipeline 0, and can't dual-issue with each | |

@ other. Every other instruction here dual-issues with a multiply, and is | |

@ thus "free". There should be no stalls in the body of the loop. | |

SMLABB r6, r12, r10, r6 @ sum[0] = MAC16_16(sum[0],x_0,y_0) | |

LDR r14, [r4], #4 @ Load x[2...3] | |

SMLABT r7, r12, r10, r7 @ sum[1] = MAC16_16(sum[1],x_0,y_1) | |

SUBS r2, r2, #4 @ j-=4 | |

SMLABB r8, r12, r11, r8 @ sum[2] = MAC16_16(sum[2],x_0,y_2) | |

SMLABT r9, r12, r11, r9 @ sum[3] = MAC16_16(sum[3],x_0,y_3) | |

SMLATT r6, r12, r10, r6 @ sum[0] = MAC16_16(sum[0],x_1,y_1) | |

LDR r10, [r5], #4 @ Load y[4...5] | |

SMLATB r7, r12, r11, r7 @ sum[1] = MAC16_16(sum[1],x_1,y_2) | |

SMLATT r8, r12, r11, r8 @ sum[2] = MAC16_16(sum[2],x_1,y_3) | |

SMLATB r9, r12, r10, r9 @ sum[3] = MAC16_16(sum[3],x_1,y_4) | |

LDRGT r12, [r4], #4 @ Load x[0...1] | |

SMLABB r6, r14, r11, r6 @ sum[0] = MAC16_16(sum[0],x_2,y_2) | |

SMLABT r7, r14, r11, r7 @ sum[1] = MAC16_16(sum[1],x_2,y_3) | |

SMLABB r8, r14, r10, r8 @ sum[2] = MAC16_16(sum[2],x_2,y_4) | |

SMLABT r9, r14, r10, r9 @ sum[3] = MAC16_16(sum[3],x_2,y_5) | |

SMLATT r6, r14, r11, r6 @ sum[0] = MAC16_16(sum[0],x_3,y_3) | |

LDR r11, [r5], #4 @ Load y[6...7] | |

SMLATB r7, r14, r10, r7 @ sum[1] = MAC16_16(sum[1],x_3,y_4) | |

SMLATT r8, r14, r10, r8 @ sum[2] = MAC16_16(sum[2],x_3,y_5) | |

SMLATB r9, r14, r11, r9 @ sum[3] = MAC16_16(sum[3],x_3,y_6) | |

BGT xcorr_kernel_edsp_process4 | |

xcorr_kernel_edsp_process4_done: | |

ADDS r2, r2, #4 | |

BLE xcorr_kernel_edsp_done | |

LDRH r12, [r4], #2 @ r12 = *x++ | |

SUBS r2, r2, #1 @ j-- | |

@ Stall | |

SMLABB r6, r12, r10, r6 @ sum[0] = MAC16_16(sum[0],x,y_0) | |

LDRHGT r14, [r4], #2 @ r14 = *x++ | |

SMLABT r7, r12, r10, r7 @ sum[1] = MAC16_16(sum[1],x,y_1) | |

SMLABB r8, r12, r11, r8 @ sum[2] = MAC16_16(sum[2],x,y_2) | |

SMLABT r9, r12, r11, r9 @ sum[3] = MAC16_16(sum[3],x,y_3) | |

BLE xcorr_kernel_edsp_done | |

SMLABT r6, r14, r10, r6 @ sum[0] = MAC16_16(sum[0],x,y_1) | |

SUBS r2, r2, #1 @ j-- | |

SMLABB r7, r14, r11, r7 @ sum[1] = MAC16_16(sum[1],x,y_2) | |

LDRH r10, [r5], #2 @ r10 = y_4 = *y++ | |

SMLABT r8, r14, r11, r8 @ sum[2] = MAC16_16(sum[2],x,y_3) | |

LDRHGT r12, [r4], #2 @ r12 = *x++ | |

SMLABB r9, r14, r10, r9 @ sum[3] = MAC16_16(sum[3],x,y_4) | |

BLE xcorr_kernel_edsp_done | |

SMLABB r6, r12, r11, r6 @ sum[0] = MAC16_16(sum[0],tmp,y_2) | |

CMP r2, #1 @ j-- | |

SMLABT r7, r12, r11, r7 @ sum[1] = MAC16_16(sum[1],tmp,y_3) | |

LDRH r2, [r5], #2 @ r2 = y_5 = *y++ | |

SMLABB r8, r12, r10, r8 @ sum[2] = MAC16_16(sum[2],tmp,y_4) | |

LDRHGT r14, [r4] @ r14 = *x | |

SMLABB r9, r12, r2, r9 @ sum[3] = MAC16_16(sum[3],tmp,y_5) | |

BLE xcorr_kernel_edsp_done | |

SMLABT r6, r14, r11, r6 @ sum[0] = MAC16_16(sum[0],tmp,y_3) | |

LDRH r11, [r5] @ r11 = y_6 = *y | |

SMLABB r7, r14, r10, r7 @ sum[1] = MAC16_16(sum[1],tmp,y_4) | |

SMLABB r8, r14, r2, r8 @ sum[2] = MAC16_16(sum[2],tmp,y_5) | |

SMLABB r9, r14, r11, r9 @ sum[3] = MAC16_16(sum[3],tmp,y_6) | |

xcorr_kernel_edsp_done: | |

LDMFD sp!, {r2,r4,r5,pc} | |

.size xcorr_kernel_edsp, .-xcorr_kernel_edsp @ ENDP | |

; celt_pitch_xcorr_edsp: @ PROC | |

@ input: | |

@ r0 = opus_val16 *_x (must be 32-bit aligned) | |

@ r1 = opus_val16 *_y (only needs to be 16-bit aligned) | |

@ r2 = opus_val32 *xcorr | |

@ r3 = int len | |

@ output: | |

@ r0 = maxcorr | |

@ internal usage | |

@ r4 = opus_val16 *x | |

@ r5 = opus_val16 *y | |

@ r6 = opus_val32 sum0 | |

@ r7 = opus_val32 sum1 | |

@ r8 = opus_val32 sum2 | |

@ r9 = opus_val32 sum3 | |

@ r1 = int max_pitch | |

@ r12 = int j | |

STMFD sp!, {r4-r11, lr} | |

MOV r5, r1 | |

LDR r1, [sp, #36] | |

MOV r4, r0 | |

TST r5, #3 | |

@ maxcorr = 1 | |

MOV r0, #1 | |

BEQ celt_pitch_xcorr_edsp_process1u_done | |

@ Compute one sum at the start to make y 32-bit aligned. | |

SUBS r12, r3, #4 | |

@ r14 = sum = 0 | |

MOV r14, #0 | |

LDRH r8, [r5], #2 | |

BLE celt_pitch_xcorr_edsp_process1u_loop4_done | |

LDR r6, [r4], #4 | |

MOV r8, r8, LSL #16 | |

celt_pitch_xcorr_edsp_process1u_loop4: | |

LDR r9, [r5], #4 | |

SMLABT r14, r6, r8, r14 @ sum = MAC16_16(sum, x_0, y_0) | |

LDR r7, [r4], #4 | |

SMLATB r14, r6, r9, r14 @ sum = MAC16_16(sum, x_1, y_1) | |

LDR r8, [r5], #4 | |

SMLABT r14, r7, r9, r14 @ sum = MAC16_16(sum, x_2, y_2) | |

SUBS r12, r12, #4 @ j-=4 | |

SMLATB r14, r7, r8, r14 @ sum = MAC16_16(sum, x_3, y_3) | |

LDRGT r6, [r4], #4 | |

BGT celt_pitch_xcorr_edsp_process1u_loop4 | |

MOV r8, r8, LSR #16 | |

celt_pitch_xcorr_edsp_process1u_loop4_done: | |

ADDS r12, r12, #4 | |

celt_pitch_xcorr_edsp_process1u_loop1: | |

LDRHGE r6, [r4], #2 | |

@ Stall | |

SMLABBGE r14, r6, r8, r14 @ sum = MAC16_16(sum, *x, *y) | |

SUBSGE r12, r12, #1 | |

LDRHGT r8, [r5], #2 | |

BGT celt_pitch_xcorr_edsp_process1u_loop1 | |

@ Restore _x | |

SUB r4, r4, r3, LSL #1 | |

@ Restore and advance _y | |

SUB r5, r5, r3, LSL #1 | |

@ maxcorr = max(maxcorr, sum) | |

CMP r0, r14 | |

ADD r5, r5, #2 | |

MOVLT r0, r14 | |

SUBS r1, r1, #1 | |

@ xcorr[i] = sum | |

STR r14, [r2], #4 | |

BLE celt_pitch_xcorr_edsp_done | |

celt_pitch_xcorr_edsp_process1u_done: | |

@ if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2 | |

SUBS r1, r1, #4 | |

BLT celt_pitch_xcorr_edsp_process2 | |

celt_pitch_xcorr_edsp_process4: | |

@ xcorr_kernel_edsp parameters: | |

@ r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0} | |

MOV r6, #0 | |

MOV r7, #0 | |

MOV r8, #0 | |

MOV r9, #0 | |

BL xcorr_kernel_edsp_start @ xcorr_kernel_edsp(_x, _y+i, xcorr+i, len) | |

@ maxcorr = max(maxcorr, sum0, sum1, sum2, sum3) | |

CMP r0, r6 | |

@ _y+=4 | |

ADD r5, r5, #8 | |

MOVLT r0, r6 | |

CMP r0, r7 | |

MOVLT r0, r7 | |

CMP r0, r8 | |

MOVLT r0, r8 | |

CMP r0, r9 | |

MOVLT r0, r9 | |

STMIA r2!, {r6-r9} | |

SUBS r1, r1, #4 | |

BGE celt_pitch_xcorr_edsp_process4 | |

celt_pitch_xcorr_edsp_process2: | |

ADDS r1, r1, #2 | |

BLT celt_pitch_xcorr_edsp_process1a | |

SUBS r12, r3, #4 | |

@ {r10, r11} = {sum0, sum1} = {0, 0} | |

MOV r10, #0 | |

MOV r11, #0 | |

LDR r8, [r5], #4 | |

BLE celt_pitch_xcorr_edsp_process2_loop_done | |

LDR r6, [r4], #4 | |

LDR r9, [r5], #4 | |

celt_pitch_xcorr_edsp_process2_loop4: | |

SMLABB r10, r6, r8, r10 @ sum0 = MAC16_16(sum0, x_0, y_0) | |

LDR r7, [r4], #4 | |

SMLABT r11, r6, r8, r11 @ sum1 = MAC16_16(sum1, x_0, y_1) | |

SUBS r12, r12, #4 @ j-=4 | |

SMLATT r10, r6, r8, r10 @ sum0 = MAC16_16(sum0, x_1, y_1) | |

LDR r8, [r5], #4 | |

SMLATB r11, r6, r9, r11 @ sum1 = MAC16_16(sum1, x_1, y_2) | |

LDRGT r6, [r4], #4 | |

SMLABB r10, r7, r9, r10 @ sum0 = MAC16_16(sum0, x_2, y_2) | |

SMLABT r11, r7, r9, r11 @ sum1 = MAC16_16(sum1, x_2, y_3) | |

SMLATT r10, r7, r9, r10 @ sum0 = MAC16_16(sum0, x_3, y_3) | |

LDRGT r9, [r5], #4 | |

SMLATB r11, r7, r8, r11 @ sum1 = MAC16_16(sum1, x_3, y_4) | |

BGT celt_pitch_xcorr_edsp_process2_loop4 | |

celt_pitch_xcorr_edsp_process2_loop_done: | |

ADDS r12, r12, #2 | |

BLE celt_pitch_xcorr_edsp_process2_1 | |

LDR r6, [r4], #4 | |

@ Stall | |

SMLABB r10, r6, r8, r10 @ sum0 = MAC16_16(sum0, x_0, y_0) | |

LDR r9, [r5], #4 | |

SMLABT r11, r6, r8, r11 @ sum1 = MAC16_16(sum1, x_0, y_1) | |

SUB r12, r12, #2 | |

SMLATT r10, r6, r8, r10 @ sum0 = MAC16_16(sum0, x_1, y_1) | |

MOV r8, r9 | |

SMLATB r11, r6, r9, r11 @ sum1 = MAC16_16(sum1, x_1, y_2) | |

celt_pitch_xcorr_edsp_process2_1: | |

LDRH r6, [r4], #2 | |

ADDS r12, r12, #1 | |

@ Stall | |

SMLABB r10, r6, r8, r10 @ sum0 = MAC16_16(sum0, x_0, y_0) | |

LDRHGT r7, [r4], #2 | |

SMLABT r11, r6, r8, r11 @ sum1 = MAC16_16(sum1, x_0, y_1) | |

BLE celt_pitch_xcorr_edsp_process2_done | |

LDRH r9, [r5], #2 | |

SMLABT r10, r7, r8, r10 @ sum0 = MAC16_16(sum0, x_0, y_1) | |

SMLABB r11, r7, r9, r11 @ sum1 = MAC16_16(sum1, x_0, y_2) | |

celt_pitch_xcorr_edsp_process2_done: | |

@ Restore _x | |

SUB r4, r4, r3, LSL #1 | |

@ Restore and advance _y | |

SUB r5, r5, r3, LSL #1 | |

@ maxcorr = max(maxcorr, sum0) | |

CMP r0, r10 | |

ADD r5, r5, #2 | |

MOVLT r0, r10 | |

SUB r1, r1, #2 | |

@ maxcorr = max(maxcorr, sum1) | |

CMP r0, r11 | |

@ xcorr[i] = sum | |

STR r10, [r2], #4 | |

MOVLT r0, r11 | |

STR r11, [r2], #4 | |

celt_pitch_xcorr_edsp_process1a: | |

ADDS r1, r1, #1 | |

BLT celt_pitch_xcorr_edsp_done | |

SUBS r12, r3, #4 | |

@ r14 = sum = 0 | |

MOV r14, #0 | |

BLT celt_pitch_xcorr_edsp_process1a_loop_done | |

LDR r6, [r4], #4 | |

LDR r8, [r5], #4 | |

LDR r7, [r4], #4 | |

LDR r9, [r5], #4 | |

celt_pitch_xcorr_edsp_process1a_loop4: | |

SMLABB r14, r6, r8, r14 @ sum = MAC16_16(sum, x_0, y_0) | |

SUBS r12, r12, #4 @ j-=4 | |

SMLATT r14, r6, r8, r14 @ sum = MAC16_16(sum, x_1, y_1) | |

LDRGE r6, [r4], #4 | |

SMLABB r14, r7, r9, r14 @ sum = MAC16_16(sum, x_2, y_2) | |

LDRGE r8, [r5], #4 | |

SMLATT r14, r7, r9, r14 @ sum = MAC16_16(sum, x_3, y_3) | |

LDRGE r7, [r4], #4 | |

LDRGE r9, [r5], #4 | |

BGE celt_pitch_xcorr_edsp_process1a_loop4 | |

celt_pitch_xcorr_edsp_process1a_loop_done: | |

ADDS r12, r12, #2 | |

LDRGE r6, [r4], #4 | |

LDRGE r8, [r5], #4 | |

@ Stall | |

SMLABBGE r14, r6, r8, r14 @ sum = MAC16_16(sum, x_0, y_0) | |

SUBGE r12, r12, #2 | |

SMLATTGE r14, r6, r8, r14 @ sum = MAC16_16(sum, x_1, y_1) | |

ADDS r12, r12, #1 | |

LDRHGE r6, [r4], #2 | |

LDRHGE r8, [r5], #2 | |

@ Stall | |

SMLABBGE r14, r6, r8, r14 @ sum = MAC16_16(sum, *x, *y) | |

@ maxcorr = max(maxcorr, sum) | |

CMP r0, r14 | |

@ xcorr[i] = sum | |

STR r14, [r2], #4 | |

MOVLT r0, r14 | |

celt_pitch_xcorr_edsp_done: | |

LDMFD sp!, {r4-r11, pc} | |

.size celt_pitch_xcorr_edsp, .-celt_pitch_xcorr_edsp @ ENDP | |

.endif | |

@ END: | |

.section .note.GNU-stack,"",%progbits |