; blob: 634a4846b9d4f8e58d5da4d7e99bd9e43cdfd316 [file] [log] [blame]
; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
; http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateChromaHor function
;--
;-------------------------------------------------------------------------------
;// Build configuration, code area, register aliases and linkage.
;// armasm treats anything that starts in the first column as a symbol, so the
;// directives below are indented; only the RN alias names stay flush-left.
    IF :DEF: H264DEC_WINASM
        ;// We don't use REQUIRE8 and PRESERVE8 for winasm
    ELSE
        REQUIRE8
        PRESERVE8
    ENDIF

    AREA    |.text|, CODE

;// h264bsdInterpolateChromaHor register allocation
;//
;// Several names alias the same physical register, so writing one of them
;// clobbers every other name listed for that register:
;//   r0  ref / ptrA        r1  mb / block        r2  x0 / count
;//   r3  y0 / valX         r5  height / tmp7     r6  chrPW / tmp8
;//   r7  tmp1 / chrPH      r14 c32 / xFrac (r14 is lr)

ref     RN 0
ptrA    RN 0

mb      RN 1
block   RN 1

x0      RN 2
count   RN 2

y0      RN 3
valX    RN 3

width   RN 4

height  RN 5
tmp7    RN 5

chrPW   RN 6
tmp8    RN 6

tmp1    RN 7
chrPH   RN 7

tmp2    RN 8

tmp3    RN 9

tmp4    RN 10

tmp5    RN 11

tmp6    RN 12

c32     RN 14
xFrac   RN 14

;// Function exports and imports

    IMPORT  h264bsdFillBlock

    EXPORT  h264bsdInterpolateChromaHor
;//-----------------------------------------------------------------------------
;// h264bsdInterpolateChromaHor
;//
;// Horizontal bilinear interpolation of two chroma planes (labelled Cb and Cr
;// below). Every output pel is computed from a source pel A and its right-hand
;// neighbour B as
;//
;//     out = ((8 - xFrac) * 8 * A + xFrac * 8 * B + 32) >> 6
;//
;// which equals ((8 - xFrac) * A + xFrac * B + 4) >> 3.
;//
;// The second source plane is read at ref + width * height. Output rows are
;// written 8 bytes apart; the second plane's output starts at
;// predPartChroma + 64.
;//
;// Function arguments (offsets are relative to sp after the prologue)
;//
;//  u8 *ref,                   : 0xc4   (r0, saved by the prologue)
;//  u8 *predPartChroma,        : 0xc8   (r1, saved by the prologue)
;//  i32 x0,                    : 0xcc   (r2, saved by the prologue)
;//  i32 y0,                    : 0xd0   (r3, saved by the prologue)
;//  u32 width,                 : 0xf8   (caller's stack)
;//  u32 height,                : 0xfc   (caller's stack)
;//  u32 xFrac,                 : 0x100  (caller's stack)
;//  u32 chromaPartWidth,       : 0x104  (caller's stack)
;//  u32 chromaPartHeight       : 0x108  (caller's stack)
;//
;// Stack frame after the prologue
;//
;//  sp + 0x00 .. 0x13  : five outgoing stack arguments for h264bsdFillBlock
;//  sp + 0x1c .. 0xc3  : "block", scratch buffer for the padded source window
;//  sp + 0xc4 .. 0xd3  : saved r0-r3 (the register arguments)
;//  sp + 0xd4 .. 0xf7  : saved r4-r11, lr
;//
;// Notes
;//  - The loops handle 2x2 pels per iteration, so chromaPartWidth and
;//    chromaPartHeight are assumed to be even; the 8-byte output stride and
;//    the 4-bit counter fields suggest they are at most 8 - TODO confirm
;//    against the callers.
;//  - r14 (lr) is used as a general-purpose register (xFrac, then c32); the
;//    return address is restored from the stack by the epilogue.
;//  - Values kept in r4-r8 across the calls rely on h264bsdFillBlock
;//    preserving r4-r11 (callee-saved under the AAPCS).
;//-----------------------------------------------------------------------------

h264bsdInterpolateChromaHor
    STMFD   sp!, {r0-r11,lr}            ;// save args r0-r3, r4-r11 and lr (0x34 bytes)
    SUB     sp, sp, #0xc4               ;// FillBlock argument slots + block buffer

    LDR     chrPW, [sp, #0x104]         ;// chromaPartWidth
    LDR     width, [sp, #0xf8]          ;// width

    ;// Does the (chromaPartWidth+1) x chromaPartHeight source window lie
    ;// completely inside the width x height plane?
    CMP     x0, #0
    BLT     do_fill                     ;// x0 < 0

    ADD     tmp6, x0, chrPW             ;// tmp6 = x0 + chromaPartWidth
    ADD     tmp6, tmp6, #1              ;// tmp6 = x0 + chromaPartWidth + 1
    CMP     tmp6, width
    BHI     do_fill                     ;// x0 + chromaPartWidth + 1 > width (unsigned)

    CMP     y0, #0
    BLT     do_fill                     ;// y0 < 0
    LDR     chrPH, [sp, #0x108]         ;// chromaPartHeight
    LDR     height, [sp, #0xfc]         ;// height
    ADD     tmp6, y0, chrPH             ;// tmp6 = y0 + chromaPartHeight
    CMP     tmp6, height
    BLS     skip_fill                   ;// window fits: read straight from ref

    ;// Window crosses a plane edge: let h264bsdFillBlock build both source
    ;// windows in the stack buffer, then interpolate from that buffer.
    ;// r0 (ref), r2 (x0) and r3 (y0) still hold the incoming arguments here.
do_fill
    LDR     chrPH, [sp, #0x108]         ;// chromaPartHeight
    LDR     height, [sp, #0xfc]         ;// height
    ADD     tmp8, chrPW, #1             ;// tmp8 = chromaPartWidth+1 (tmp8 and chrPW are both r6)
    MOV     tmp2, tmp8                  ;// tmp2 = chromaPartWidth+1
    STMIA   sp, {width,height,tmp8,chrPH,tmp2}  ;// stack args 5..9 of the call
    ADD     block, sp, #0x1c            ;// r1 = block
    BL      h264bsdFillBlock            ;// first plane

    ;// r0-r3 are not preserved by the call: reload them and rewrite the
    ;// outgoing argument slots before the second call.
    LDR     x0, [sp, #0xcc]
    LDR     y0, [sp, #0xd0]
    LDR     ref, [sp, #0xc4]            ;// ref
    STMIA   sp, {width,height,tmp8,chrPH,tmp2}
    ADD     block, sp, #0x1c            ;// block
    MLA     ref, height, width, ref     ;// ref += width * height (second plane)
    MLA     block, chrPH, tmp8, block   ;// block += chromaPartHeight * (chromaPartWidth+1)
    BL      h264bsdFillBlock            ;// second plane

    ;// From here on the stack buffer is the reference picture: origin (0,0),
    ;// width = chromaPartWidth+1, height = chromaPartHeight. The saved
    ;// argument slots are overwritten so the Cr pass reloads the new values.
    MOV     x0, #0                      ;// x0 = 0
    MOV     y0, #0                      ;// y0 = 0
    STR     x0, [sp, #0xcc]
    STR     y0, [sp, #0xd0]
    ADD     ref, sp, #0x1c              ;// ref = block
    STR     ref, [sp, #0xc4]            ;// ref

    STR     chrPH, [sp, #0xfc]          ;// height = chromaPartHeight
    STR     tmp8, [sp, #0xf8]           ;// width  = chromaPartWidth+1
    MOV     width, tmp8
    SUB     chrPW, chrPW, #1            ;// r6 back to chromaPartWidth (undo the +1 above)

skip_fill
    MLA     tmp3, y0, width, x0         ;// tmp3 = y0*width+x0
    LDR     xFrac, [sp, #0x100]         ;// xFrac
    ADD     ptrA, ref, tmp3             ;// ptrA = ref + y0*width+x0
    RSB     valX, xFrac, #8             ;// valX = 8-xFrac (valX replaces y0 in r3)

    LDR     mb, [sp, #0xc8]             ;// predPartChroma

    ;// pack values to count register
    ;// [31:28] loop_x (chromaPartWidth-1)
    ;// [27:24] loop_y (chromaPartHeight-1)
    ;// [23:20] chromaPartWidth-1
    ;// [19:16] chromaPartHeight-1
    ;// [15:00] nothing (zero)
    ;//
    ;// count shares r2 with x0, so the first write is a MOV rather than an
    ;// ADD: this keeps stale x0 bits out of the packed fields.
    ;// chrPH must be read before tmp1 is written (both are r7).

    SUB     tmp2, chrPH, #1             ;// chromaPartHeight-1
    SUB     tmp1, chrPW, #1             ;// chromaPartWidth-1
    MOV     count, tmp2, LSL #16        ;// chromaPartHeight-1
    ADD     count, count, tmp2, LSL #24 ;// loop_y
    ADD     count, count, tmp1, LSL #20 ;// chromaPartWidth-1
    AND     tmp2, count, #0x00F00000    ;// loop_x reload value
    PKHBT   valX, valX, xFrac, LSL #16  ;// |xFrac|valX |
    MOV     valX, valX, LSL #3          ;// multiply by 8 in advance
    MOV     c32, #32                    ;// rounding term; overwrites xFrac (both r14)

    ;///////////////////////////////////////////////////////////////////////////
    ;// Cb
    ;///////////////////////////////////////////////////////////////////////////

    ;// 2x2 pels per iteration
    ;// bilinear horizontal interpolation
    ;//
    ;// Row 1 is read at [ptrA], row 2 at [ptrA + width]. PKHBT packs a pel
    ;// (low halfword) with its right neighbour (high halfword); SMLAD then
    ;// forms low*(8-xFrac)*8 + high*xFrac*8 + 32.
    ;// The loop body overwrites tmp7 (= height, r5) and tmp8 (= chrPW, r6).

loop1_y
    ADD     count, count, tmp2, LSL #8  ;// loop_x = chromaPartWidth-1
    LDRB    tmp1, [ptrA, width]         ;// row 2, first pel
    LDRB    tmp2, [ptrA], #1            ;// row 1, first pel

loop1_x
    LDRB    tmp3, [ptrA, width]
    LDRB    tmp4, [ptrA], #1

    PKHBT   tmp5, tmp1, tmp3, LSL #16
    PKHBT   tmp6, tmp2, tmp4, LSL #16

    LDRB    tmp1, [ptrA, width]
    LDRB    tmp2, [ptrA], #1

    SMLAD   tmp5, tmp5, valX, c32       ;// multiply
    SMLAD   tmp6, tmp6, valX, c32       ;// multiply

    PKHBT   tmp7, tmp3, tmp1, LSL #16
    PKHBT   tmp8, tmp4, tmp2, LSL #16

    SMLAD   tmp7, tmp7, valX, c32       ;// multiply
    SMLAD   tmp8, tmp8, valX, c32       ;// multiply

    MOV     tmp5, tmp5, LSR #6          ;// scale down
    STRB    tmp5, [mb,#8]               ;// store row 2 col 1

    MOV     tmp6, tmp6, LSR #6          ;// scale down
    STRB    tmp6, [mb],#1               ;// store row 1 col 1

    MOV     tmp7, tmp7, LSR #6          ;// scale down
    STRB    tmp7, [mb,#8]               ;// store row 2 col 2

    MOV     tmp8, tmp8, LSR #6          ;// scale down
    STRB    tmp8, [mb],#1               ;// store row 1 col 2

    SUBS    count, count, #2<<28        ;// loop_x -= 2
    BCS     loop1_x                     ;// repeat until the subtraction borrows

    AND     tmp2, count, #0x00F00000    ;// (chromaPartWidth-1) << 20

    ;// Advance both pointers by two rows. ADDS is expected to leave C clear
    ;// (no pointer wrap), so each SBC subtracts (chromaPartWidth-1) + 1.
    ;// The plain ADD in between does not touch the flags.
    ADDS    mb, mb, #16
    SBC     mb, mb, tmp2, LSR #20       ;// mb   += 16 - chromaPartWidth
    ADD     ptrA, ptrA, width, LSL #1
    SBC     ptrA, ptrA, tmp2, LSR #20
    SUB     ptrA, ptrA, #1              ;// ptrA += 2*width - (chromaPartWidth+1)

    ;// loop_x is 0xF after the borrow, so adding 0xE<<24 performs
    ;// loop_y -= 2: the result is non-negative while rows remain.
    ADDS    count, count, #0xE << 24
    BGE     loop1_y

    ;///////////////////////////////////////////////////////////////////////////
    ;// Cr
    ;///////////////////////////////////////////////////////////////////////////
    ;// Reload what the Cb pass clobbered. These slots hold either the
    ;// caller's values or the stack-buffer values written after do_fill.
    LDR     height, [sp,#0xfc]          ;// height
    LDR     ref, [sp, #0xc4]            ;// ref
    LDR     tmp1, [sp, #0xd0]           ;// y0
    LDR     tmp2, [sp, #0xcc]           ;// x0
    LDR     mb, [sp, #0xc8]             ;// predPartChroma

    ADD     tmp1, height, tmp1          ;// height + y0
    MLA     tmp3, tmp1, width, tmp2     ;// (height + y0)*width + x0
    ADD     ptrA, ref, tmp3             ;// second plane source
    ADD     mb, mb, #64                 ;// second plane destination

    AND     count, count, #0x00FFFFFF   ;// clear loop_x / loop_y
    AND     tmp1, count, #0x000F0000    ;// chromaPartHeight-1
    ADD     count, count, tmp1, LSL #8  ;// loop_y = chromaPartHeight-1
    AND     tmp2, count, #0x00F00000    ;// loop_x reload value

    ;// 2x2 pels per iteration
    ;// bilinear horizontal interpolation
    ;// (same loop structure as the Cb pass)
loop2_y
    ADD     count, count, tmp2, LSL #8  ;// loop_x = chromaPartWidth-1
    LDRB    tmp1, [ptrA, width]         ;// row 2, first pel
    LDRB    tmp2, [ptrA], #1            ;// row 1, first pel

loop2_x
    LDRB    tmp3, [ptrA, width]
    LDRB    tmp4, [ptrA], #1

    PKHBT   tmp5, tmp1, tmp3, LSL #16
    PKHBT   tmp6, tmp2, tmp4, LSL #16

    LDRB    tmp1, [ptrA, width]
    LDRB    tmp2, [ptrA], #1

    SMLAD   tmp5, tmp5, valX, c32       ;// multiply
    SMLAD   tmp6, tmp6, valX, c32       ;// multiply

    PKHBT   tmp7, tmp3, tmp1, LSL #16
    PKHBT   tmp8, tmp4, tmp2, LSL #16

    SMLAD   tmp7, tmp7, valX, c32       ;// multiply
    SMLAD   tmp8, tmp8, valX, c32       ;// multiply

    MOV     tmp5, tmp5, LSR #6          ;// scale down
    STRB    tmp5, [mb,#8]               ;// store row 2 col 1

    MOV     tmp6, tmp6, LSR #6          ;// scale down
    STRB    tmp6, [mb],#1               ;// store row 1 col 1

    MOV     tmp7, tmp7, LSR #6          ;// scale down
    STRB    tmp7, [mb,#8]               ;// store row 2 col 2

    MOV     tmp8, tmp8, LSR #6          ;// scale down
    STRB    tmp8, [mb],#1               ;// store row 1 col 2

    SUBS    count, count, #2<<28        ;// loop_x -= 2
    BCS     loop2_x                     ;// repeat until the subtraction borrows

    AND     tmp2, count, #0x00F00000    ;// (chromaPartWidth-1) << 20

    ;// Two-row pointer advance; see the flag note in the Cb pass.
    ADDS    mb, mb, #16
    SBC     mb, mb, tmp2, LSR #20       ;// mb   += 16 - chromaPartWidth
    ADD     ptrA, ptrA, width, LSL #1
    SBC     ptrA, ptrA, tmp2, LSR #20
    SUB     ptrA, ptrA, #1              ;// ptrA += 2*width - (chromaPartWidth+1)

    ADDS    count, count, #0xE << 24    ;// loop_y -= 2
    BGE     loop2_y

    ADD     sp,sp,#0xd4                 ;// drop the 0xc4 frame and the saved r0-r3
    LDMFD   sp!, {r4-r11,pc}            ;// restore r4-r11 and return

    END