; blob: 7420ad3cf6403853826e91a51df16fb22d46106c (source-browser page header; not part of the program)
; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
; http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateChromaHorVer
;-- function
;--
;-------------------------------------------------------------------------------
;// Assembler preamble.
;// When H264DEC_WINASM is defined, the stack-alignment attributes are left out;
;// otherwise the object is marked as requiring (REQUIRE8) and preserving
;// (PRESERVE8) 8-byte stack alignment.
IF :DEF: H264DEC_WINASM
;// We don't use REQUIRE8 and PRESERVE8 for winasm
ELSE
REQUIRE8
PRESERVE8
ENDIF
;// Everything that follows is placed in the |.text| code area.
AREA |.text|, CODE
;// h264bsdInterpolateChromaHorVer register allocation
;//
;// Several names deliberately alias the same physical register. Each pair is
;// safe only because the first value is dead before the second one is written;
;// keep that in mind when reordering code in the function.

;// r0: incoming reference pointer, later the running source pointer
ref RN 0
ptrA RN 0
;// r1: running destination pointer (predPartChroma); also used to pass the
;// temporary block buffer address to h264bsdFillBlock
mb RN 1
block RN 1
;// r2: x0 is last needed for the source address computation; afterwards r2
;// holds the packed loop counters
x0 RN 2
count RN 2
;// r3: y0 is last needed for the source address computation; afterwards r3
;// holds the packed vertical weights |yFrac|8-yFrac|
y0 RN 3
valY RN 3
;// r4: source row stride
width RN 4
;// r5: height is only needed outside the pixel loops (it is reloaded from the
;// stack before the Cr pass); inside the loops r5 is a scratch register
tmp4 RN 5
height RN 5
;// r6-r8: scratch
tmp1 RN 6
tmp2 RN 7
tmp3 RN 8
;// r9: horizontal weight 8-xFrac
valX RN 9
;// r10/r11: partition size until it has been packed into count, then scratch
tmp5 RN 10
chrPW RN 10
tmp6 RN 11
chrPH RN 11
;// r12: horizontal weight xFrac
xFrac RN 12
;// r14 (lr, saved in the prologue): yFrac until it has been packed into valY,
;// then the rounding constant 32
c32 RN 14
yFrac RN 14
;// function exports and imports
IMPORT h264bsdFillBlock
EXPORT h264bsdInterpolateChromaHorVer
;// h264bsdInterpolateChromaHorVer
;//
;// Bilinear (horizontal + vertical) interpolation of one chroma partition.
;// The Cb partition is written to predPartChroma and the Cr partition to
;// predPartChroma+64; destination rows are 8 bytes apart. Every output pel is
;//
;//   ( (8-xFrac)*((8-yFrac)*A + yFrac*C)
;//   +     xFrac*((8-yFrac)*B + yFrac*D) + 32 ) >> 6
;//
;// where A,B are horizontally adjacent source pels and C,D the pels one source
;// row below them. The Cr source plane is addressed at ref + width*height.
;//
;// If the (chromaPartWidth+1) x (chromaPartHeight+1) source window is not
;// completely inside the width x height plane, both planes are first copied
;// into a local buffer with h264bsdFillBlock and interpolation reads from that
;// buffer instead.
;//
;// The pixel loops produce 2x2 pels per pass.
;// NOTE(review): this presumes chromaPartWidth and chromaPartHeight are even
;// and at most 8 (4-bit counters, 8-byte destination rows) - confirm with the
;// callers.
;//
;// Function arguments (offset from sp after the prologue)
;//
;// u8 *ref, : 0xc4
;// u8 *predPartChroma, : 0xc8
;// i32 x0, : 0xcc
;// i32 y0, : 0xd0
;// u32 width, : 0xf8
;// u32 height, : 0xfc
;// u32 xFrac, : 0x100
;// u32 yFrac, : 0x104
;// u32 chromaPartWidth, : 0x108
;// u32 chromaPartHeight : 0x10c
;//
;// Stack frame after the prologue
;//
;// 0x00-0x13 : five outgoing stack arguments for h264bsdFillBlock
;// 0x1c-0xc3 : local block buffer (Cb window followed by Cr window)
;// 0xc4-0xd0 : saved r0-r3 (ref, predPartChroma, x0, y0)
;// 0xd4-0xf0 : saved r4-r11
;// 0xf4      : saved lr
;// 0xf8-...  : caller's stack arguments (see above)
h264bsdInterpolateChromaHorVer
    STMFD   sp!, {r0-r11,lr}
    SUB     sp, sp, #0xc4

    LDR     chrPW, [sp, #0x108]         ;// chromaPartWidth
    LDR     width, [sp, #0xf8]          ;// width

    ;// Does the source window lie completely inside the plane?
    CMP     x0, #0
    BLT     do_fill                     ;// x0 < 0

    ADD     tmp1, x0, chrPW             ;// tmp1 = x0 + chromaPartWidth
    ADD     tmp1, tmp1, #1              ;// tmp1 = x0 + chromaPartWidth + 1
    CMP     tmp1, width
    BHI     do_fill                     ;// x0 + chromaPartWidth + 1 > width

    CMP     y0, #0
    BLT     do_fill                     ;// y0 < 0
    LDR     chrPH, [sp, #0x10c]         ;// chromaPartHeight
    LDR     height, [sp, #0xfc]         ;// height
    ADD     tmp1, y0, chrPH             ;// tmp1 = y0 + chromaPartHeight
    ADD     tmp1, tmp1, #1              ;// tmp1 = y0 + chromaPartHeight + 1
    CMP     tmp1, height
    BLS     skip_fill                   ;// window fits, read directly from ref

do_fill
    ;// chrPH/height are reloaded here because the early exits above reach
    ;// this label before they have been loaded.
    LDR     chrPH, [sp, #0x10c]         ;// chromaPartHeight
    LDR     height, [sp, #0xfc]         ;// height
    ADD     tmp3, chrPW, #1             ;// tmp3 = chromaPartWidth + 1
    ADD     tmp1, chrPW, #1             ;// tmp1 = chromaPartWidth + 1
    ADD     tmp2, chrPH, #1             ;// tmp2 = chromaPartHeight + 1

    ;// First call (Cb): r0 = ref, r2 = x0, r3 = y0 still hold the incoming
    ;// arguments. The five stack words are width, height and the window
    ;// width, height and width again (presumably block width, block height
    ;// and fill scan length - confirm against h264bsdFillBlock's prototype).
    STMIA   sp,{width,height,tmp1,tmp2,tmp3}
    ADD     block, sp, #0x1c            ;// block
    BL      h264bsdFillBlock

    ;// Second call (Cr): r0-r3 were not preserved across the call, so reload
    ;// them and store the stack arguments again.
    LDR     x0, [sp, #0xcc]
    LDR     y0, [sp, #0xd0]
    LDR     ref, [sp, #0xc4]            ;// ref
    STMIA   sp,{width,height,tmp1,tmp2,tmp3}
    ADD     block, sp, #0x1c            ;// block
    MLA     ref, height, width, ref     ;// ref += width * height
    MLA     block, tmp2, tmp1, block    ;// block += (chromaPW+1)*(chromaPH+1)
    BL      h264bsdFillBlock

    ;// From here on the local buffer is the reference picture: origin (0,0),
    ;// dimensions (chromaPartWidth+1) x (chromaPartHeight+1). The saved
    ;// argument slots are patched as well because the Cr pass reloads them.
    MOV     x0, #0                      ;// x0 = 0
    MOV     y0, #0                      ;// y0 = 0
    STR     x0, [sp, #0xcc]
    STR     y0, [sp, #0xd0]
    ADD     ref, sp, #0x1c              ;// ref = block
    STR     ref, [sp, #0xc4]            ;// ref

    STR     tmp2, [sp, #0xfc]           ;// height = chromaPartHeight + 1
    STR     tmp1, [sp, #0xf8]           ;// width  = chromaPartWidth + 1
    MOV     width, tmp1

skip_fill
    MLA     tmp3, y0, width, x0         ;// tmp3 = y0*width + x0
    LDR     yFrac, [sp, #0x104]         ;// yFrac
    LDR     xFrac, [sp, #0x100]         ;// xFrac
    ADD     ptrA, ref, tmp3             ;// ptrA = ref + y0*width + x0
    RSB     valX, xFrac, #8             ;// valX = 8 - xFrac
    RSB     valY, yFrac, #8             ;// valY = 8 - yFrac (y0 is dead now)

    LDR     mb, [sp, #0xc8]             ;// predPartChroma

    ;// pack values to count register
    ;// [31:28] loop_x (chromaPartWidth-1)
    ;// [27:24] loop_y (chromaPartHeight-1)
    ;// [23:20] chromaPartWidth-1
    ;// [19:16] chromaPartHeight-1
    ;// [15:00] nothing
    ;//
    ;// chrPW/chrPH share registers with tmp5/tmp6 and are overwritten by the
    ;// pixel loops, so the sizes are kept in count. count shares r2 with x0,
    ;// which may still hold x0 here, so the first write is a MOV: the packed
    ;// fields must not depend on whatever x0 was.
    SUB     tmp2, chrPH, #1             ;// chromaPartHeight-1
    SUB     tmp1, chrPW, #1             ;// chromaPartWidth-1
    MOV     count, tmp2, LSL #16        ;// chromaPartHeight-1
    ADD     count, count, tmp2, LSL #24 ;// loop_y
    ADD     count, count, tmp1, LSL #20 ;// chromaPartWidth-1
    AND     tmp2, count, #0x00F00000    ;// loop_x seed, inserted at loop1_y
    PKHBT   valY, valY, yFrac, LSL #16  ;// |yFrac|valY |
    MOV     c32, #32                    ;// rounding term (yFrac is dead now)

    ;///////////////////////////////////////////////////////////////////////////
    ;// Cb
    ;///////////////////////////////////////////////////////////////////////////

    ;// 2x2 pels per iteration
    ;// bilinear vertical and horizontal interpolation

loop1_y
    ;// Vertically filter the leftmost column for the two output rows.
    LDRB    tmp1, [ptrA]
    LDRB    tmp3, [ptrA, width]
    LDRB    tmp5, [ptrA, width, LSL #1]

    PKHBT   tmp1, tmp1, tmp3, LSL #16   ;// |t3|t1|
    PKHBT   tmp3, tmp3, tmp5, LSL #16   ;// |t5|t3|

    SMUAD   tmp1, tmp1, valY            ;// t1=(t1*valY + t3*yFrac)
    SMUAD   tmp3, tmp3, valY            ;// t3=(t3*valY + t5*yFrac)

    ADD     count, count, tmp2, LSL #8  ;// loop_x = chromaPartWidth-1
loop1_x
    ;// first: new column becomes the right neighbour (weight xFrac) of the
    ;// column held in t1/t3 (weight valX)
    LDRB    tmp2, [ptrA, #1]!
    LDRB    tmp4, [ptrA, width]
    LDRB    tmp6, [ptrA, width, LSL #1]

    PKHBT   tmp2, tmp2, tmp4, LSL #16   ;// |t4|t2|
    PKHBT   tmp4, tmp4, tmp6, LSL #16   ;// |t6|t4|

    SMUAD   tmp2, tmp2, valY            ;// t2=(t2*valY + t4*yFrac)
    MLA     tmp5, tmp1, valX, c32       ;// t5=t1*valX+32
    MLA     tmp5, tmp2, xFrac, tmp5     ;// t5=t2*xFrac+t5

    SMUAD   tmp4, tmp4, valY            ;// t4=(t4*valY + t6*yFrac)
    MLA     tmp6, tmp3, valX, c32       ;// t6=t3*valX+32
    MLA     tmp6, tmp4, xFrac, tmp6     ;// t6=t4*xFrac+t6

    MOV     tmp6, tmp6, LSR #6          ;// scale down
    STRB    tmp6, [mb, #8]              ;// store pixel (lower row)
    MOV     tmp5, tmp5, LSR #6          ;// scale down
    STRB    tmp5, [mb], #1              ;// store pixel (upper row)

    ;// second: roles swap - the column in t2/t4 is now the left neighbour
    ;// (weight valX) and the newly loaded column the right one (weight xFrac)
    LDRB    tmp1, [ptrA, #1]!
    LDRB    tmp3, [ptrA, width]
    LDRB    tmp5, [ptrA, width, LSL #1]

    PKHBT   tmp1, tmp1, tmp3, LSL #16   ;// |t3|t1|
    PKHBT   tmp3, tmp3, tmp5, LSL #16   ;// |t5|t3|

    SMUAD   tmp1, tmp1, valY            ;// t1=(t1*valY + t3*yFrac)
    MLA     tmp5, tmp1, xFrac, c32      ;// t5=t1*xFrac+32
    MLA     tmp5, tmp2, valX, tmp5      ;// t5=t2*valX+t5

    SMUAD   tmp3, tmp3, valY            ;// t3=(t3*valY + t5*yFrac)
    MLA     tmp6, tmp3, xFrac, c32      ;// t6=t3*xFrac+32
    MLA     tmp6, tmp4, valX, tmp6      ;// t6=t4*valX+t6

    MOV     tmp6, tmp6, LSR #6          ;// scale down
    STRB    tmp6, [mb, #8]              ;// store pixel (lower row)
    MOV     tmp5, tmp5, LSR #6          ;// scale down
    STRB    tmp5, [mb], #1              ;// store pixel (upper row)

    SUBS    count, count, #2<<28        ;// loop_x -= 2
    BCS     loop1_x                     ;// repeat until loop_x borrows

    AND     tmp2, count, #0x00F00000    ;// chromaPartWidth-1 (also next seed)

    ;// Step both pointers back to the start of the row and two rows down.
    ;// ADDS is expected to leave carry clear, so each SBC subtracts
    ;// (chromaPartWidth-1)+1; the ADD/SBC in between do not touch the flags.
    ADDS    mb, mb, #16
    SBC     mb, mb, tmp2, LSR #20       ;// mb += 2*8 - chromaPartWidth
    ADD     ptrA, ptrA, width, LSL #1
    SBC     ptrA, ptrA, tmp2, LSR #20   ;// ptrA += 2*width - chromaPartWidth

    ;// loop_y -= 2: adding 0xE to the loop_y nibble carries into the
    ;// underflowed loop_x nibble unless loop_y itself underflows, in which
    ;// case the result stays negative and the loop ends.
    ADDS    count, count, #0xE << 24
    BGE     loop1_y

    ;///////////////////////////////////////////////////////////////////////////
    ;// Cr
    ;///////////////////////////////////////////////////////////////////////////
    ;// Everything below r4 and the tmp registers was reused by the Cb pass,
    ;// so the arguments are reloaded from their saved slots.
    LDR     height, [sp,#0xfc]          ;// height
    LDR     ref, [sp, #0xc4]            ;// ref
    LDR     tmp1, [sp, #0xd0]           ;// y0
    LDR     tmp2, [sp, #0xcc]           ;// x0
    LDR     mb, [sp, #0xc8]             ;// predPartChroma

    ADD     tmp1, height, tmp1          ;// height + y0: skip the Cb plane
    MLA     tmp3, tmp1, width, tmp2     ;// (height+y0)*width + x0
    ADD     ptrA, ref, tmp3
    ADD     mb, mb, #64                 ;// Cr output follows 64 bytes of Cb

    ;// Rebuild the loop counters from the preserved size fields.
    AND     count, count, #0x00FFFFFF   ;// clear loop_x/loop_y
    AND     tmp1, count, #0x000F0000    ;// chromaPartHeight-1
    ADD     count, count, tmp1, LSL #8  ;// loop_y
    AND     tmp2, count, #0x00F00000    ;// loop_x seed, inserted at loop2_y

    ;// 2x2 pels per iteration
    ;// bilinear vertical and horizontal interpolation
    ;// (same structure as the Cb loop above)
loop2_y
    LDRB    tmp1, [ptrA]
    LDRB    tmp3, [ptrA, width]
    LDRB    tmp5, [ptrA, width, LSL #1]

    PKHBT   tmp1, tmp1, tmp3, LSL #16   ;// |t3|t1|
    PKHBT   tmp3, tmp3, tmp5, LSL #16   ;// |t5|t3|

    SMUAD   tmp1, tmp1, valY            ;// t1=(t1*valY + t3*yFrac)
    SMUAD   tmp3, tmp3, valY            ;// t3=(t3*valY + t5*yFrac)

    ADD     count, count, tmp2, LSL #8  ;// loop_x = chromaPartWidth-1
loop2_x
    ;// first
    LDRB    tmp2, [ptrA, #1]!
    LDRB    tmp4, [ptrA, width]
    LDRB    tmp6, [ptrA, width, LSL #1]

    PKHBT   tmp2, tmp2, tmp4, LSL #16   ;// |t4|t2|
    PKHBT   tmp4, tmp4, tmp6, LSL #16   ;// |t6|t4|

    SMUAD   tmp2, tmp2, valY            ;// t2=(t2*valY + t4*yFrac)
    MLA     tmp5, tmp1, valX, c32       ;// t5=t1*valX+32
    MLA     tmp5, tmp2, xFrac, tmp5     ;// t5=t2*xFrac+t5

    SMUAD   tmp4, tmp4, valY            ;// t4=(t4*valY + t6*yFrac)
    MLA     tmp6, tmp3, valX, c32       ;// t6=t3*valX+32
    MLA     tmp6, tmp4, xFrac, tmp6     ;// t6=t4*xFrac+t6

    MOV     tmp6, tmp6, LSR #6          ;// scale down
    STRB    tmp6, [mb, #8]              ;// store pixel (lower row)
    MOV     tmp5, tmp5, LSR #6          ;// scale down
    STRB    tmp5, [mb], #1              ;// store pixel (upper row)

    ;// second
    LDRB    tmp1, [ptrA, #1]!
    LDRB    tmp3, [ptrA, width]
    LDRB    tmp5, [ptrA, width, LSL #1]

    PKHBT   tmp1, tmp1, tmp3, LSL #16   ;// |t3|t1|
    PKHBT   tmp3, tmp3, tmp5, LSL #16   ;// |t5|t3|

    SMUAD   tmp1, tmp1, valY            ;// t1=(t1*valY + t3*yFrac)
    MLA     tmp5, tmp1, xFrac, c32      ;// t5=t1*xFrac+32
    MLA     tmp5, tmp2, valX, tmp5      ;// t5=t2*valX+t5

    SMUAD   tmp3, tmp3, valY            ;// t3=(t3*valY + t5*yFrac)
    MLA     tmp6, tmp3, xFrac, c32      ;// t6=t3*xFrac+32
    MLA     tmp6, tmp4, valX, tmp6      ;// t6=t4*valX+t6

    MOV     tmp6, tmp6, LSR #6          ;// scale down
    STRB    tmp6, [mb, #8]              ;// store pixel (lower row)
    MOV     tmp5, tmp5, LSR #6          ;// scale down
    STRB    tmp5, [mb], #1              ;// store pixel (upper row)

    SUBS    count, count, #2<<28        ;// loop_x -= 2
    BCS     loop2_x                     ;// repeat until loop_x borrows

    AND     tmp2, count, #0x00F00000    ;// chromaPartWidth-1 (also next seed)

    ADDS    mb, mb, #16
    SBC     mb, mb, tmp2, LSR #20       ;// mb += 2*8 - chromaPartWidth
    ADD     ptrA, ptrA, width, LSL #1
    SBC     ptrA, ptrA, tmp2, LSR #20   ;// ptrA += 2*width - chromaPartWidth

    ADDS    count, count, #0xE << 24    ;// loop_y -= 2
    BGE     loop2_y

    ;// Drop the 0xc4 bytes of locals plus the four saved argument registers,
    ;// then restore r4-r11 and return through the saved lr.
    ADD     sp,sp,#0xd4
    LDMFD   sp!,{r4-r11,pc}
END