; blob: af9df1bb5db7e384ad904f5a43d5b4703ff179df
; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
; http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateChromaVer function
;--
;-------------------------------------------------------------------------------
IF :DEF: H264DEC_WINASM
;// We don't use REQUIRE8 and PRESERVE8 for winasm
ELSE
REQUIRE8
PRESERVE8
ENDIF
AREA |.text|, CODE
;// h264bsdInterpolateChromaVer register allocation
;//
;// Several symbolic names alias one physical register and are live at
;// different times: ref/ptrA = r0, mb/block = r1, x0/count = r2,
;// y0/valY = r3, height/tmp7 = r5, chrPW/tmp8 = r6, tmp5/chrPH = r11,
;// c32/yFrac = r14.
ref RN 0
ptrA RN 0
mb RN 1
block RN 1
x0 RN 2
count RN 2
y0 RN 3
valY RN 3
width RN 4
height RN 5
tmp7 RN 5
chrPW RN 6
tmp8 RN 6
tmp1 RN 7
tmp2 RN 8
tmp3 RN 9
tmp4 RN 10
tmp5 RN 11
chrPH RN 11
tmp6 RN 12
c32 RN 14
yFrac RN 14
;// Function exports and imports
IMPORT h264bsdFillBlock
EXPORT h264bsdInterpolateChromaVer
;// Function arguments
;//
;// The offsets below are relative to sp AFTER the prologue: the STMFD
;// spills r0-r3 (ref, predPartChroma, x0, y0) at [sp,#0xc4..0xd0]
;// (0xc4 bytes of locals + 13 saved registers), and the caller's
;// stack arguments therefore start at [sp,#0xf8].
;//
;// u8 *ref, : 0xc4
;// u8 *predPartChroma, : 0xc8
;// i32 x0, : 0xcc
;// i32 y0, : 0xd0
;// u32 width, : 0xf8
;// u32 height, : 0xfc
;// u32 yFrac, : 0x100
;// u32 chromaPartWidth, : 0x104
;// u32 chromaPartHeight : 0x108
;//
;// h264bsdInterpolateChromaVer
;//
;// Vertical-only bilinear interpolation of one chroma partition,
;// run twice: once for the Cb plane, once for the Cr plane.
;// For each output pixel, with A and B vertically adjacent reference
;// pixels, the result is the standard rounded blend
;//     out = ((8 - yFrac) * A + yFrac * B + 4) >> 3
;// The code pre-multiplies both weights by 8, adds 32 and shifts
;// right by 6, which is arithmetically identical.
;// If the partition (plus the one extra reference row needed below
;// it) is not fully inside the picture, h264bsdFillBlock first builds
;// a padded copy in the stack buffer at [sp,#0x1c] and interpolation
;// then reads from that copy instead of the frame.
h264bsdInterpolateChromaVer
STMFD sp!, {r0-r11,lr}
SUB sp, sp, #0xc4
LDR chrPW, [sp, #0x104] ;// chromaPartWidth
LDR width, [sp, #0xf8] ;// width
;// Bounds check: fall into do_fill if any edge of the partition
;// (including the extra bottom row) lies outside the picture.
CMP x0, #0
BLT do_fill
ADD tmp1, x0, chrPW ;// tmp1 = x0+ chromaPartWidth
CMP tmp1, width ;// x0+chromaPartWidth > width
BHI do_fill
CMP y0, #0
BLT do_fill
LDR chrPH, [sp, #0x108] ;// chromaPartHeight
LDR height, [sp, #0xfc] ;// height
ADD tmp1, y0, chrPH ;// tmp1 = y0 + chromaPartHeight
ADD tmp1, tmp1, #1 ;// tmp1 = y0 + chromaPartHeight + 1 (extra row)
CMP tmp1, height
BLS skip_fill
;// Out-of-picture case: call h264bsdFillBlock twice, once per chroma
;// plane, to build an edge-padded (chrPW)x(chrPH+1) copy of each
;// plane in the stack buffer at [sp,#0x1c].
;// NOTE(review): r0-r3 still hold (ref, predPartChroma, x0, y0) from
;// entry for the first call; the five words stored at [sp,#0]
;// presumably are h264bsdFillBlock's stack arguments
;// (width, height, blockWidth, blockHeight, fillWidth) -- verify
;// against h264bsdFillBlock's prototype.
do_fill
LDR chrPH, [sp, #0x108] ;// chromaPartHeight
LDR height, [sp, #0xfc] ;// height
ADD tmp1, chrPH, #1 ;// tmp1 = chromaPartHeight+1
MOV tmp2, chrPW ;// tmp2 = chromaPartWidth
STMIA sp,{width,height,chrPW,tmp1,tmp2} ;// stack args for fill call
ADD block, sp, #0x1c ;// block = stack fill buffer
BL h264bsdFillBlock
;// Second call fills the Cr half; reload the r0-r3 arguments the
;// first call clobbered (tmp1/tmp2 live in callee-saved r7/r8).
LDR x0, [sp, #0xcc]
LDR y0, [sp, #0xd0]
LDR ref, [sp, #0xc4] ;// ref
STMIA sp,{width,height,chrPW,tmp1,tmp2} ;// same stack args again
ADD block, sp, #0x1c ;// block
MLA ref, height, width, ref ;// ref += width * height;
MLA block, chrPW, tmp1, block;// block + (chromaPW)*(chromaPH+1)
BL h264bsdFillBlock
;// Redirect interpolation to the padded copy: origin (0,0), source is
;// the stack buffer, dimensions become (chrPW) x (chrPH+1).
MOV x0, #0 ;// x0 = 0
MOV y0, #0 ;// y0 = 0
STR x0, [sp, #0xcc]
STR y0, [sp, #0xd0]
ADD ref, sp, #0x1c ;// ref = block
STR ref, [sp, #0xc4] ;// ref
STR tmp1, [sp, #0xfc] ;// height = chromaPartHeight+1
STR chrPW, [sp, #0xf8] ;// width = chromaPartWidth
MOV width, chrPW
skip_fill
MLA tmp3, y0, width, x0 ;// tmp3 = y0*width+x0
LDR yFrac, [sp, #0x100] ;// yFrac
ADD ptrA, ref, tmp3 ;// ptrA = ref + y0*width+x0
RSB valY, yFrac, #8 ;// valY = 8-yFrac
LDR mb, [sp, #0xc8] ;// predPartChroma
;// pack values to count register
;// [31:28] loop_x (chromaPartWidth-1)
;// [27:24] loop_y (chromaPartHeight-1)
;// [23:20] chromaPartWidth-1
;// [19:16] chromaPartHeight-1
;// [15:00] nothing
;// (low halfword keeps whatever x0 left in r2; it is never used)
SUB tmp2, chrPH, #1 ;// chromaPartHeight-1
SUB tmp1, chrPW, #1 ;// chromaPartWidth-1
ADD count, count, tmp2, LSL #16 ;// chromaPartHeight-1
ADD count, count, tmp2, LSL #24 ;// loop_y
ADD count, count, tmp1, LSL #20 ;// chromaPartWidth-1
AND tmp2, count, #0x00F00000 ;// loop_x reload value
;// Pack both weights, pre-scaled by 8, into one register for SMLAD:
;// valY = | yFrac*8 | (8-yFrac)*8 |
PKHBT valY, valY, yFrac, LSL #16 ;// |yFrac|valY |
MOV valY, valY, LSL #3 ;// multiply by 8 in advance
MOV c32, #32 ;// rounding constant for the >>6
;///////////////////////////////////////////////////////////////////////////
;// Cb
;///////////////////////////////////////////////////////////////////////////
;// 2x2 pels per iteration
;// bilinear vertical interpolation
;//
;// Loop bookkeeping: the x counter [31:28] is reloaded from the
;// stored chromaPartWidth-1 field at the top of each row pair and
;// decremented by 2 per iteration (two columns); borrow (carry clear)
;// ends the inner loop.  NOTE(review): the outer-loop exit rides on
;// the N flag of the packed ADDS #0xE<<24 (-2 mod 16 in the loop_y
;// nibble) -- intricate but long-proven code, left untouched.
loop1_y
ADD count, count, tmp2, LSL #8 ;// reload loop_x from [23:20]
loop1_x
;// Process 2x2 block: for each column load rows 1..3 so both output
;// rows get their A/B pixel pairs.
LDRB tmp2, [ptrA,width] ;// 2 row, 1 col
LDRB tmp3, [ptrA,width, LSL #1] ;// 3 row, 1 col
LDRB tmp1, [ptrA],#1 ;// 1 row, 1 col
LDRB tmp5, [ptrA,width] ;// 2 row, 2 col
LDRB tmp6, [ptrA,width, LSL #1] ;// 3 row, 2 col
LDRB tmp4, [ptrA],#1 ;// 1 row, 2 col
PKHBT tmp1, tmp1, tmp2, LSL #16 ;// |B|A|
PKHBT tmp2, tmp2, tmp3, LSL #16 ;// |C|B|
PKHBT tmp4, tmp4, tmp5, LSL #16 ;// |B|A|
SMLAD tmp7, tmp2, valY, c32 ;// 8*((8-yFrac)*B + yFrac*C) + 32
PKHBT tmp5, tmp5, tmp6, LSL #16 ;// |C|B|
SMLAD tmp2, tmp1, valY, c32 ;// 8*((8-yFrac)*A + yFrac*B) + 32
SMLAD tmp8, tmp5, valY, c32 ;// multiply
SMLAD tmp5, tmp4, valY, c32 ;// multiply
MOV tmp7, tmp7, LSR #6 ;// scale down
STRB tmp7, [mb,#8] ;// store row 2 col 1 (pred stride is 8)
MOV tmp2, tmp2, LSR #6 ;// scale down
STRB tmp2, [mb],#1 ;// store row 1 col 1
MOV tmp8, tmp8, LSR #6 ;// scale down
STRB tmp8, [mb,#8] ;// store row 2 col 2
MOV tmp5, tmp5, LSR #6 ;// scale down
STRB tmp5, [mb],#1 ;// store row 1 col 2
SUBS count, count, #2<<28 ;// loop_x -= 2 (two columns done)
BCS loop1_x ;// continue until borrow
;// Advance to the next pair of rows.  ADDS leaves carry clear (no
;// 32-bit wrap), so each SBC subtracts (chrPW-1)+1 = chrPW:
;// mb += 16 - chrPW (net +16 = two 8-wide pred rows),
;// ptrA += 2*width - chrPW (net two source rows).
AND tmp2, count, #0x00F00000
ADDS mb, mb, #16
SBC mb, mb, tmp2, LSR #20
ADD ptrA, ptrA, width, LSL #1
SBC ptrA, ptrA, tmp2, LSR #20
ADDS count, count, #0xE << 24 ;// loop_y -= 2
BGE loop1_y
;///////////////////////////////////////////////////////////////////////////
;// Cr
;///////////////////////////////////////////////////////////////////////////
;// Same interpolation for the Cr plane, which directly follows the
;// Cb plane in ref (offset height*width, matching the fill path) and
;// in predPartChroma (offset 64 = 8x8 Cb block).
LDR height, [sp,#0xfc] ;// height
LDR ref, [sp, #0xc4] ;// ref
LDR tmp1, [sp, #0xd0] ;// y0
LDR tmp2, [sp, #0xcc] ;// x0
LDR mb, [sp, #0xc8] ;// predPartChroma
ADD tmp1, height, tmp1 ;// y0 + height
MLA tmp3, tmp1, width, tmp2 ;// (y0+height)*width + x0
ADD ptrA, ref, tmp3 ;// ptrA -> Cr source
ADD mb, mb, #64 ;// mb -> Cr block of predPartChroma
;// Rebuild the loop counters from the preserved low fields of count.
AND count, count, #0x00FFFFFF ;// clear spent loop_x field
AND tmp1, count, #0x000F0000 ;// chromaPartHeight-1
ADD count, count, tmp1, LSL #8 ;// reload loop_y
AND tmp2, count, #0x00F00000 ;// loop_x reload value
;// 2x2 pels per iteration
;// bilinear vertical interpolation
loop2_y
ADD count, count, tmp2, LSL #8 ;// reload loop_x
loop2_x
;// Process 2x2 block (identical scheme to loop1_x above)
LDRB tmp2, [ptrA,width] ;// 2 row, 1 col
LDRB tmp3, [ptrA,width, LSL #1] ;// 3 row, 1 col
LDRB tmp1, [ptrA],#1 ;// 1 row, 1 col
LDRB tmp5, [ptrA,width] ;// 2 row, 2 col
LDRB tmp6, [ptrA,width, LSL #1] ;// 3 row, 2 col
LDRB tmp4, [ptrA],#1 ;// 1 row, 2 col
PKHBT tmp1, tmp1, tmp2, LSL #16 ;// |B|A|
PKHBT tmp2, tmp2, tmp3, LSL #16 ;// |C|B|
PKHBT tmp4, tmp4, tmp5, LSL #16 ;// |B|A|
SMLAD tmp7, tmp2, valY, c32 ;// multiply
PKHBT tmp5, tmp5, tmp6, LSL #16 ;// |C|B|
SMLAD tmp2, tmp1, valY, c32 ;// multiply
SMLAD tmp8, tmp5, valY, c32 ;// multiply
SMLAD tmp5, tmp4, valY, c32 ;// multiply
MOV tmp7, tmp7, LSR #6 ;// scale down
STRB tmp7, [mb,#8] ;// store row 2 col 1
MOV tmp2, tmp2, LSR #6 ;// scale down
STRB tmp2, [mb],#1 ;// store row 1 col 1
MOV tmp8, tmp8, LSR #6 ;// scale down
STRB tmp8, [mb,#8] ;// store row 2 col 2
MOV tmp5, tmp5, LSR #6 ;// scale down
STRB tmp5, [mb],#1 ;// store row 1 col 2
SUBS count, count, #2<<28 ;// loop_x -= 2
BCS loop2_x
AND tmp2, count, #0x00F00000
ADDS mb, mb, #16
SBC mb, mb, tmp2, LSR #20 ;// mb += 16 - chrPW
ADD ptrA, ptrA, width, LSL #1
SBC ptrA, ptrA, tmp2, LSR #20 ;// ptrA += 2*width - chrPW
ADDS count, count, #0xE << 24 ;// loop_y -= 2
BGE loop2_y
;// Epilogue: 0xd4 = 0xc4 locals + 0x10 to skip the spilled r0-r3,
;// then restore callee-saved r4-r11 and return via pc.
ADD sp,sp,#0xd4
LDMFD sp!, {r4-r11,pc}
END