; blob: a81aed77b04a008022dbeffeb48d72de09b65203 (repository browser header, kept as a comment)
; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
; http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version horizontal part of
;-- h264bsdInterpolateMid functions
;--
;-------------------------------------------------------------------------------
;// Stack-alignment build attributes are emitted for every build except the
;// one that defines H264DEC_WINASM.
IF :DEF: H264DEC_WINASM
;// We don't use REQUIRE8 and PRESERVE8 for winasm
ELSE
REQUIRE8
PRESERVE8
ENDIF
AREA |.text|, CODE
;// Register allocation
;//
;// r0-r2 carry the three inputs of h264bsdInterpolateMidHorPart. The x_*
;// registers each hold two samples, one per 16-bit halfword; the name gives
;// the byte lanes of the loaded word as <top halfword>_<bottom halfword>.
ref RN 0 ;// pointer to current position in reference image
mb RN 1 ;// pointer to current position in interpolated mb
count RN 2 ;// bit-packed width and count values
x_2_0 RN 4 ;// bytes 2 and 0 of a loaded word, zero-extended to halfwords
x_3_1 RN 5 ;// raw loaded word, then its bytes 3 and 1 as halfwords
x_6_4 RN 6 ;// bytes 2 and 0 of the following word, as halfwords
x_7_5 RN 7 ;// raw following word, then its bytes 3 and 1 as halfwords
tmp1 RN 8 ;// accumulator for the 1st output of a group of four
tmp2 RN 9 ;// accumulator for the 2nd output of a group of four
tmp3 RN 10 ;// accumulator for the 3rd output; also scratch for partWidth-1
tmp4 RN 11 ;// accumulator for the 4th output of a group of four
mult_20_01 RN 12 ;// [20, 1]
;// r14 is the link register; the routine pushes lr on entry and returns by
;// popping it into pc, so r14 is free to hold a constant in between.
mult_20_m5 RN 14 ;// [20, -5]
EXPORT h264bsdInterpolateMidHorPart
;// Horizontal filter approach
;//
;// Basic idea in horizontal filtering is to adjust coefficients
;// like below. Calculation is done with 16-bit maths.
;//
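;// Each bracket shows the two sample indices held in one register as
;// [ top-halfword  bottom-halfword ]; the number below an index is the
;// filter tap applied to that sample for the given output.
;//
;// Reg     x_2_0      x_3_1      x_6_4      x_7_5      x_2_0
;//       [  2  0 ]  [  3  1 ]  [  6  4 ]  [  7  5 ]  [ 10  8 ] ...
;// y_0 =   20  1      20 -5         -5          1
;// y_1 =   -5         20  1       1 20         -5
;// y_2 =    1         -5         -5 20       1 20
;// y_3 =               1         20 -5      -5 20          1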
;//-----------------------------------------------------------------------------
;// h264bsdInterpolateMidHorPart
;//
;// Runs the 6-tap filter (1, -5, 20, 20, -5, 1) along each row of the
;// reference image and stores every raw filter sum as a 32-bit word at mb.
;// No rounding, shifting or clipping is done in this routine.
;//
;// In:   r0 (ref)   reference image pointer. The first two words of a row are
;//                  read from ref-8 and ref-4, later words from ref onwards.
;//       r1 (mb)    output pointer; one 32-bit word is stored per output and
;//                  rows follow each other without any gap.
;//       r2 (count) bit-packed parameters, see below.
;// Out:  no result register is set up before returning.
;// Regs: r4-r11 and lr are saved on entry and restored on exit; r12 is
;//       overwritten.
;//
;// NOTE(review): the sample numbering (byte 0 = leftmost sample) assumes
;// little-endian word loads, and ref is not known to be word aligned, so
;// unaligned LDR support is presumably required - confirm for the target.
;//-----------------------------------------------------------------------------
h264bsdInterpolateMidHorPart
STMFD sp!, {r4-r11, lr}
;// Layout of the count register as it is used by the code below:
;// [31:28] loop_x, running column counter; reloaded with partWidth-1 at
;//         loop_y, so it has to be zero on entry
;// [27:20] running row counter, decremented by one per row; loop_y is
;//         executed (initial value + 1) times
;//         (presumably number of rows - 1 - verify against the caller)
;// [19:16] partWidth-1
;// [15:00] width, added to ref once per row as a signed 16-bit value
;//
;// partWidth has to be a multiple of 4: outputs are produced in groups of
;// four, and the row bookkeeping at next_y relies on loop_x being 0xF after
;// the final borrow.
LDR mult_20_01, = 0x00140001
LDR mult_20_m5, = 0x0014FFFB
AND tmp3, count, #0x000F0000 ;// partWidth-1
;// Start of a row. tmp3 holds partWidth-1 in bits [19:16], either from the
;// AND above or from the AND at next_y.
loop_y
LDR x_3_1, [ref, #-8]
ADD count, count, tmp3, LSL #12 ;// loop_x = partWidth-1
LDR x_7_5, [ref, #-4]
;// Split the two loaded words into even/odd byte lanes, one sample per halfword
UXTB16 x_2_0, x_3_1
UXTB16 x_3_1, x_3_1, ROR #8
UXTB16 x_6_4, x_7_5
;// Each pass through loop_x produces two groups of four outputs. The second
;// group repeats the first with the roles of (x_2_0, x_3_1) and
;// (x_6_4, x_7_5) exchanged, so no register moves are needed.
loop_x
UXTB16 x_7_5, x_7_5, ROR #8
;// tmp1..tmp4 accumulate y_0..y_3 of the coefficient table
SMUAD tmp1, x_2_0, mult_20_01
SMULTB tmp2, x_2_0, mult_20_m5
SMULTB tmp3, x_2_0, mult_20_01
SMULTB tmp4, x_3_1, mult_20_01
SMLAD tmp1, x_3_1, mult_20_m5, tmp1
SMLAD tmp2, x_3_1, mult_20_01, tmp2
SMLATB tmp3, x_3_1, mult_20_m5, tmp3
;// x_3_1 is not needed any more for this group: fetch the next four samples
LDR x_3_1, [ref], #4
SMLAD tmp4, x_6_4, mult_20_m5, tmp4
SMLABB tmp1, x_6_4, mult_20_m5, tmp1
SMLADX tmp2, x_6_4, mult_20_01, tmp2
SMLADX tmp3, x_6_4, mult_20_m5, tmp3
SMLADX tmp4, x_7_5, mult_20_m5, tmp4
SMLABB tmp1, x_7_5, mult_20_01, tmp1
SMLABB tmp2, x_7_5, mult_20_m5, tmp2
;// Even bytes of the new word; its byte 0 supplies the last tap of y_3
UXTB16 x_2_0, x_3_1
SMLADX tmp3, x_7_5, mult_20_01, tmp3
SMLABB tmp4, x_2_0, mult_20_01, tmp4
;// loop_x -= 4. Carry is cleared when this borrows, i.e. the row is done.
;// The stores below do not change the flags, so BCC still tests this SUBS.
SUBS count, count, #4<<28
STR tmp1, [mb], #4
STR tmp2, [mb], #4
STR tmp3, [mb], #4
STR tmp4, [mb], #4
BCC next_y
;// Second group of four outputs, register roles exchanged
UXTB16 x_3_1, x_3_1, ROR #8
SMUAD tmp1, x_6_4, mult_20_01
SMULTB tmp2, x_6_4, mult_20_m5
SMULTB tmp3, x_6_4, mult_20_01
SMULTB tmp4, x_7_5, mult_20_01
SMLAD tmp1, x_7_5, mult_20_m5, tmp1
SMLAD tmp2, x_7_5, mult_20_01, tmp2
SMLATB tmp3, x_7_5, mult_20_m5, tmp3
;// x_7_5 is not needed any more for this group: fetch the next four samples
LDR x_7_5, [ref], #4
SMLAD tmp4, x_2_0, mult_20_m5, tmp4
SMLABB tmp1, x_2_0, mult_20_m5, tmp1
SMLADX tmp2, x_2_0, mult_20_01, tmp2
SMLADX tmp3, x_2_0, mult_20_m5, tmp3
SMLADX tmp4, x_3_1, mult_20_m5, tmp4
SMLABB tmp1, x_3_1, mult_20_01, tmp1
SMLABB tmp2, x_3_1, mult_20_m5, tmp2
;// Even bytes of the new word; the odd bytes are unpacked at loop_x
UXTB16 x_6_4, x_7_5
SMLADX tmp3, x_3_1, mult_20_01, tmp3
SMLABB tmp4, x_6_4, mult_20_01, tmp4
;// Same flag handling as for the first group: BCS tests this SUBS
SUBS count, count, #4<<28
STR tmp1, [mb], #4
STR tmp2, [mb], #4
STR tmp3, [mb], #4
STR tmp4, [mb], #4
BCS loop_x
;// Row finished. Carry is clear on both paths that reach this label, and
;// neither AND nor SMLABB changes it, so the SBC below subtracts one extra.
next_y
AND tmp3, count, #0x000F0000 ;// partWidth-1
SMLABB ref, count, mult_20_01, ref ;// +width
SBC ref, ref, tmp3, LSR #16 ;// -(partWidth-1)-1
;// +(1<<28) undoes the loop_x borrow, -(1<<20) counts one row down. When the
;// row counter is used up the borrow makes the result negative and the loop
;// ends.
ADDS count, count, #(1<<28)-(1<<20)
BGE loop_y
LDMFD sp!, {r4-r11, pc}
END