 ;// Source: av/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s - nest-cam/4320010/av - Git at Google

 ;//
 ;// Copyright (C) 2007-2008 ARM Limited
 ;//
 ;// Licensed under the Apache License, Version 2.0 (the "License");
 ;// you may not use this file except in compliance with the License.
 ;// You may obtain a copy of the License at
 ;//
 ;//      http://www.apache.org/licenses/LICENSE-2.0
 ;//
 ;// Unless required by applicable law or agreed to in writing, software
 ;// distributed under the License is distributed on an "AS IS" BASIS,
 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ;// See the License for the specific language governing permissions and
 ;// limitations under the License.
 ;//
 ;//
 ;//
 ;// File Name:  omxVCM4P10_TransformDequantLumaDCFromPair_s.s
 ;// OpenMAX DL: v1.0.2
 ;// Revision:   12290
 ;// Date:       Wednesday, April 9, 2008
 ;//
 ;//
 ;//
 ;//
 ;// Description:
 ;// H.264 inverse quantize and transform module
 ;//
 ;//

 ;// Include standard headers

         INCLUDE omxtypes_s.h
         INCLUDE armCOMM_s.h

 ;// Import/Export symbols required from/to other files
 ;// (For example tables)

         IMPORT armVCM4P10_UnpackBlock4x4
         IMPORT armVCM4P10_QPDivTable
         IMPORT armVCM4P10_VMatrixQPModTable

         M_VARIANTS CortexA8

 ;// Set debugging level
 ;//DEBUG_ON    SETL {TRUE}


 ;//----------------------------------------------------------------------
 ;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
 ;//
 ;// Purpose:
 ;//   In-place 4x4 add/subtract butterfly transform of sixteen signed
 ;//   16-bit values, followed by scaling of every result t:
 ;//       out = (t * (Scale << Shift) + 2) >> 2      (narrowed to 16 bits)
 ;//   where Shift = armVCM4P10_QPDivTable[QP] and
 ;//         Scale = armVCM4P10_VMatrixQPModTable[QP],
 ;//   both fetched as sign-extended bytes.
 ;//
 ;// Inputs:
 ;//   r0 (pData) - address of the 16 halfwords; read once with VLD4 and
 ;//                overwritten once with VST1
 ;//   r1 (QP)    - byte offset into both look-up tables
 ;//                (presumably the H.264 QP, 0..51 - TODO confirm against
 ;//                the table definitions)
 ;//
 ;// Outputs:
 ;//   The 16 halfwords at pData are replaced by the result. This code does
 ;//   not write a return value to r0.
 ;//
 ;// Registers written by the body: r2-r5, D0-D7, Q3-Q6 (= D6-D13).
 ;// NOTE(review): M_START/M_END are macros from armCOMM_s.h; the "r5,d13"
 ;// arguments presumably make them save/restore r4-r5 and d8-d13 - confirm.
 ;//----------------------------------------------------------------------

 ;// Guarding implementation by the processor name

     IF  CortexA8

 ;//Input Registers
 pData               RN  0
 QP                  RN  1


 ;//Local Scratch Registers

 ;// ARM Registers

 pQPDivTable         RN  2
 pQPModTable         RN  3
 Shift               RN  4
 Scale               RN  5

 ;// NEON Registers
 ;// Several aliases below name the same physical register; the instruction
 ;// order in the body relies on each value being dead before its register
 ;// is reused under another alias.

 ;// Packed input values (four signed halfwords per D register)
 dIn0                DN  D0.S16
 dIn1                DN  D1.S16
 dIn2                DN  D2.S16
 dIn3                DN  D3.S16

 ;// Intermediate sums/differences of the first butterfly pass
 dRowSum1            DN  D4.S16
 dRowSum2            DN  D5.S16
 dRowDiff1           DN  D6.S16
 dRowDiff2           DN  D7.S16

 ;// Results of the first pass (written back over D0-D3)
 dRowOp0             DN  D0.S16
 dRowOp1                DN  D1.S16
 dRowOp2                DN  D2.S16
 dRowOp3                DN  D3.S16
 ;// Q views of D0:D1 and D2:D3, used for the 32-bit step of the transpose
 qRowOp01            QN  Q0.32
 qRowOp23            QN  Q1.32

 ;// Intermediate sums/differences of the second butterfly pass (D4-D7 again)
 dColSum1            DN  D4.S16
 dColSum2            DN  D5.S16
 dColDiff1           DN  D6.S16
 dColDiff2           DN  D7.S16

 ;// Results of the second pass (written back over D0-D3)
 dColOp0             DN  D0.S16
 dColOp1                DN  D1.S16
 dColOp2                DN  D2.S16
 dColOp3                DN  D3.S16

 ;// Temporary scratch variables
 ;// dScale shares D5 with dColSum2, and qRound0 (Q3) overlays D6:D7, i.e.
 ;// dColDiff1/dColDiff2. qRound1..qRound3 occupy D8-D13.

 dScale              DN  D5.S16
 qRound0             QN  Q3.S32
 qRound1             QN  Q4.S32
 qRound2             QN  Q5.S32
 qRound3             QN  Q6.S32

 ;// Transformed and scaled output values (D0-D3)
 dOut0               DN  D0.S16
 dOut1                DN  D1.S16
 dOut2                DN  D2.S16
 dOut3                DN  D3.S16


     ;// Allocate stack memory required by the function
     ;// (none is requested here)


     ;// Write function header
     M_START armVCM4P10_InvTransformDequantLumaDC4x4,r5,d13

     ;******************************************************************
     ;// The strategy used in implementing the transform is as follows:*
     ;// Load the 4x4 block into 4 D-registers, de-interleaved by VLD4, *
     ;//   which delivers the matrix already transposed                *
     ;// Perform the row operations (on columns) using SIMD            *
     ;// Transpose the 4x4 result matrix                               *
     ;// Perform the column operations                                 *
     ;// Scale, round, narrow and store the 16 results                 *
     ;******************************************************************

         ;// Load all 16 halfwords in transposed form: VLD4 de-interleaves
         ;// with a stride of four, so dIn<k> receives elements k, k+4,
         ;// k+8 and k+12.

         VLD4    {dIn0,dIn1,dIn2,dIn3},[pData]
         LDR     pQPDivTable, =armVCM4P10_QPDivTable        ;// r2 = base address of the shift table
         LDR     pQPModTable, =armVCM4P10_VMatrixQPModTable ;// r3 = base address of the scale table

         ;****************************************
         ;// Row Operations (Performed on columns)
         ;****************************************
         ;// The scale factor is computed with ARM instructions that are
         ;// interleaved with the NEON instructions in order to dual issue.
         ;// Do not regroup these lines without re-checking the schedule.

         VADD    dRowSum1,dIn0,dIn1
         VADD    dRowSum2,dIn2,dIn3
         VSUB    dRowDiff1,dIn0,dIn1
         LDRSB   Shift, [pQPDivTable, QP]               ;// ARM CODE: Shift = (signed byte) pQPDivTable[QP]
         VSUB    dRowDiff2,dIn2,dIn3
         LDRSB   Scale, [pQPModTable, QP]               ;// ARM CODE: Scale = (signed byte) pQPModTable[QP]
         VADD    dRowOp0,dRowSum1,dRowSum2
         VSUB    dRowOp1,dRowSum1,dRowSum2
         VSUB    dRowOp2,dRowDiff1,dRowDiff2
         LSL     Scale, Scale, Shift                    ;// ARM CODE: Scale = Scale << Shift
         VADD    dRowOp3,dRowDiff1,dRowDiff2

         ;****************************************
         ;// Transpose the resultant matrix:
         ;// 16-bit swaps inside each register pair, then a 32-bit swap
         ;// between the two Q registers.
         ;****************************************

         VTRN    dRowOp0,dRowOp1
         VTRN    dRowOp2,dRowOp3
         VTRN    qRowOp01,qRowOp23

         ;****************************************
         ;// Column Operations (same butterfly as the row pass)
         ;****************************************

         VADD    dColSum1,dRowOp0,dRowOp1
         VADD    dColSum2,dRowOp2,dRowOp3
         VSUB    dColDiff1,dRowOp0,dRowOp1
         VSUB    dColDiff2,dRowOp2,dRowOp3
         VADD    dColOp0,dColSum1,dColSum2
         VSUB    dColOp1,dColSum1,dColSum2
         VSUB    dColOp2,dColDiff1,dColDiff2
         VADD    dColOp3,dColDiff1,dColDiff2

         ;//----------------------------------------------------------------------
         ;//
         ;// <Dequantize> improves on the C reference code.
         ;// The Shift>=0 and Shift<0 cases are both covered by one code path:
         ;// instead of subtracting 2 from Shift as the C reference does, the
         ;// code performs Scale << Shift once at the beginning and applies a
         ;// constant right shift by 2 after the multiplication. The rounding
         ;// value is therefore always 2.
         ;//
         ;// Doing this avoids the branches otherwise required and also
         ;// reduces the code size substantially.
         ;//
         ;//----------------------------------------------------------------------


         ;// D5 is free here: dColSum2 was last read by the VSUB producing dColOp1.
         VDUP    dScale, Scale                            ;// ARM -> NEON: replicate Scale into all four 16-bit lanes


         ;// Q3 overlays dColDiff1/dColDiff2, which are dead after the VADD
         ;// producing dColOp3.
         VMOV    qRound0,#2                               ;// Preload each 32-bit accumulator lane with the rounding value
         VMOV    qRound1,#2
         VMOV    qRound2,#2
         VMOV    qRound3,#2

         VMLAL   qRound0,dColOp0,dScale                   ;// 32-bit lanes: value * Scale + Round
         VMLAL   qRound1,dColOp1,dScale
         VMLAL   qRound2,dColOp2,dScale
         VMLAL   qRound3,dColOp3,dScale

         VSHRN   dOut0,qRound0,#2                          ;// Right shift by 2 and narrow to 16 bits, i.e. (OMX_S16)Value
         VSHRN   dOut1,qRound1,#2
         VSHRN   dOut2,qRound2,#2
         VSHRN   dOut3,qRound3,#2

         ;***************************
         ;// Store all 16 results back to pData (contiguous, no interleave)
         ;***************************

         VST1  {dOut0,dOut1,dOut2,dOut3}, [pData]


         ;// Set return value
         ;// (nothing is written to r0 here)

         ;// Write function tail
         M_END

     ENDIF                                                           ;//CORTEXA8


 ;//----------------------------------------------------------------------
 ;// Function: omxVCM4P10_TransformDequantLumaDCFromPair
 ;//
 ;// Two-stage wrapper:
 ;//   1. armVCM4P10_UnpackBlock4x4 is called with r0 (ppSrc) and r1 (pDst)
 ;//      exactly as they were received.
 ;//   2. armVCM4P10_InvTransformDequantLumaDC4x4 is called with r0 = pDst
 ;//      and r1 = the QP value that arrived in r2.
 ;//
 ;// In:   r0 = ppSrc, r1 = pDst, r2 = QP
 ;// Out:  r0 = OMX_Sts_NoErr, unconditionally (no argument checks are
 ;//       performed in this routine)
 ;//----------------------------------------------------------------------

 ;// Argument registers on entry
 ppSrc               RN  0
 pDst                RN  1
 qpIn                RN  2

 ;// Return value register
 result              RN  0

 ;// Registers that carry pDst and QP across the first call
 pDstKeep            RN  4
 qpKeep              RN  5

 ;// Argument registers for the second call
 pBlockArg           RN  0
 qpArg               RN  1

 ;// Implementation is only provided for the CortexA8 variant

     IF CortexA8

         ;// NOTE(review): the "r5" argument presumably asks the armCOMM_s.h
         ;// macro to preserve r4-r5 (plus the return address) - confirm.
         M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5

         ;// Stage 1: r0/r1 are passed through untouched; park the values
         ;// needed afterwards in r4/r5 first.
         MOV     qpKeep,qpIn                         ;// r5 = QP
         MOV     pDstKeep,pDst                       ;// r4 = pDst
         BL      armVCM4P10_UnpackBlock4x4

         ;// Stage 2: rebuild the (pData, QP) argument pair from r4/r5.
         MOV     qpArg,qpKeep                        ;// r1 = QP
         MOV     pBlockArg,pDstKeep                  ;// r0 = pDst
         BL      armVCM4P10_InvTransformDequantLumaDC4x4

         ;// Report success
         MOV     result,#OMX_Sts_NoErr

         M_END

     ENDIF                                                           ;// CortexA8


     END
	;//
	;// Copyright (C) 2007-2008 ARM Limited
	;//
	;// Licensed under the Apache License, Version 2.0 (the "License");
	;// you may not use this file except in compliance with the License.
	;// You may obtain a copy of the License at
	;//
	;// http://www.apache.org/licenses/LICENSE-2.0
	;//
	;// Unless required by applicable law or agreed to in writing, software
	;// distributed under the License is distributed on an "AS IS" BASIS,
	;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	;// See the License for the specific language governing permissions and
	;// limitations under the License.
	;//
	;//
	;//
	;// File Name: omxVCM4P10_TransformDequantLumaDCFromPair_s.s
	;// OpenMAX DL: v1.0.2
	;// Revision: 12290
	;// Date: Wednesday, April 9, 2008
	;//
	;//
	;//
	;//
	;// Description:
	;// H.264 inverse quantize and transform module
	;//
	;//

	;// Include standard headers

	INCLUDE omxtypes_s.h
	INCLUDE armCOMM_s.h

	;// Import/Export symbols required from/to other files
	;// (For example tables)

	IMPORT armVCM4P10_UnpackBlock4x4
	IMPORT armVCM4P10_QPDivTable
	IMPORT armVCM4P10_VMatrixQPModTable

	M_VARIANTS CortexA8

	;// Set debugging level
	;//DEBUG_ON SETL {TRUE}


	;//----------------------------------------------------------------------
	;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
	;//
	;// NOTE(review): an END directive appears earlier in this file and this
	;// text repeats the listing that precedes it; armasm stops reading at
	;// END, so this copy is presumably never assembled - confirm and drop
	;// the duplicate.
	;//
	;// Purpose:
	;//   In-place 4x4 add/subtract butterfly transform of sixteen signed
	;//   16-bit values, followed by scaling of every result t:
	;//       out = (t * (Scale << Shift) + 2) >> 2      (narrowed to 16 bits)
	;//   where Shift = armVCM4P10_QPDivTable[QP] and
	;//         Scale = armVCM4P10_VMatrixQPModTable[QP],
	;//   both fetched as sign-extended bytes.
	;//
	;// Inputs:
	;//   r0 (pData) - address of the 16 halfwords; read once with VLD4 and
	;//                overwritten once with VST1
	;//   r1 (QP)    - byte offset into both look-up tables
	;//                (presumably the H.264 QP, 0..51 - TODO confirm against
	;//                the table definitions)
	;//
	;// Outputs:
	;//   The 16 halfwords at pData are replaced by the result. This code does
	;//   not write a return value to r0.
	;//
	;// Registers written by the body: r2-r5, D0-D7, Q3-Q6 (= D6-D13).
	;// NOTE(review): M_START/M_END are macros from armCOMM_s.h; the "r5,d13"
	;// arguments presumably make them save/restore r4-r5 and d8-d13 - confirm.
	;//----------------------------------------------------------------------

	;// Guarding implementation by the processor name

	IF CortexA8

	;//Input Registers
	pData RN 0
	QP RN 1


	;//Local Scratch Registers

	;// ARM Registers

	pQPDivTable RN 2
	pQPModTable RN 3
	Shift RN 4
	Scale RN 5

	;// NEON Registers
	;// Several aliases below name the same physical register; the instruction
	;// order in the body relies on each value being dead before its register
	;// is reused under another alias.

	;// Packed input values (four signed halfwords per D register)
	dIn0 DN D0.S16
	dIn1 DN D1.S16
	dIn2 DN D2.S16
	dIn3 DN D3.S16

	;// Intermediate sums/differences of the first butterfly pass
	dRowSum1 DN D4.S16
	dRowSum2 DN D5.S16
	dRowDiff1 DN D6.S16
	dRowDiff2 DN D7.S16

	;// Results of the first pass (written back over D0-D3)
	dRowOp0 DN D0.S16
	dRowOp1 DN D1.S16
	dRowOp2 DN D2.S16
	dRowOp3 DN D3.S16
	;// Q views of D0:D1 and D2:D3, used for the 32-bit step of the transpose
	qRowOp01 QN Q0.32
	qRowOp23 QN Q1.32

	;// Intermediate sums/differences of the second butterfly pass (D4-D7 again)
	dColSum1 DN D4.S16
	dColSum2 DN D5.S16
	dColDiff1 DN D6.S16
	dColDiff2 DN D7.S16

	;// Results of the second pass (written back over D0-D3)
	dColOp0 DN D0.S16
	dColOp1 DN D1.S16
	dColOp2 DN D2.S16
	dColOp3 DN D3.S16

	;// Temporary scratch variables
	;// dScale shares D5 with dColSum2, and qRound0 (Q3) overlays D6:D7, i.e.
	;// dColDiff1/dColDiff2. qRound1..qRound3 occupy D8-D13.

	dScale DN D5.S16
	qRound0 QN Q3.S32
	qRound1 QN Q4.S32
	qRound2 QN Q5.S32
	qRound3 QN Q6.S32

	;// Transformed and scaled output values (D0-D3)
	dOut0 DN D0.S16
	dOut1 DN D1.S16
	dOut2 DN D2.S16
	dOut3 DN D3.S16


	;// Allocate stack memory required by the function
	;// (none is requested here)


	;// Write function header
	M_START armVCM4P10_InvTransformDequantLumaDC4x4,r5,d13

	;******************************************************************
	;// The strategy used in implementing the transform is as follows:
	;// Load the 4x4 block into 4 D-registers, de-interleaved by VLD4,
	;//   which delivers the matrix already transposed
	;// Perform the row operations (on columns) using SIMD
	;// Transpose the 4x4 result matrix
	;// Perform the column operations
	;// Scale, round, narrow and store the 16 results
	;******************************************************************

	;// Load all 16 halfwords in transposed form: VLD4 de-interleaves with a
	;// stride of four, so dIn<k> receives elements k, k+4, k+8 and k+12.

	VLD4 {dIn0,dIn1,dIn2,dIn3},[pData]
	LDR pQPDivTable, =armVCM4P10_QPDivTable ;// r2 = base address of the shift table
	LDR pQPModTable, =armVCM4P10_VMatrixQPModTable ;// r3 = base address of the scale table

	;****************************************
	;// Row Operations (Performed on columns)
	;****************************************
	;// The scale factor is computed with ARM instructions that are
	;// interleaved with the NEON instructions in order to dual issue.
	;// Do not regroup these lines without re-checking the schedule.

	VADD dRowSum1,dIn0,dIn1
	VADD dRowSum2,dIn2,dIn3
	VSUB dRowDiff1,dIn0,dIn1
	LDRSB Shift, [pQPDivTable, QP] ;// ARM CODE: Shift = (signed byte) pQPDivTable[QP]
	VSUB dRowDiff2,dIn2,dIn3
	LDRSB Scale, [pQPModTable, QP] ;// ARM CODE: Scale = (signed byte) pQPModTable[QP]
	VADD dRowOp0,dRowSum1,dRowSum2
	VSUB dRowOp1,dRowSum1,dRowSum2
	VSUB dRowOp2,dRowDiff1,dRowDiff2
	LSL Scale, Scale, Shift ;// ARM CODE: Scale = Scale << Shift
	VADD dRowOp3,dRowDiff1,dRowDiff2

	;****************************************
	;// Transpose the resultant matrix:
	;// 16-bit swaps inside each register pair, then a 32-bit swap
	;// between the two Q registers.
	;****************************************

	VTRN dRowOp0,dRowOp1
	VTRN dRowOp2,dRowOp3
	VTRN qRowOp01,qRowOp23

	;****************************************
	;// Column Operations (same butterfly as the row pass)
	;****************************************

	VADD dColSum1,dRowOp0,dRowOp1
	VADD dColSum2,dRowOp2,dRowOp3
	VSUB dColDiff1,dRowOp0,dRowOp1
	VSUB dColDiff2,dRowOp2,dRowOp3
	VADD dColOp0,dColSum1,dColSum2
	VSUB dColOp1,dColSum1,dColSum2
	VSUB dColOp2,dColDiff1,dColDiff2
	VADD dColOp3,dColDiff1,dColDiff2

	;//----------------------------------------------------------------------
	;//
	;// <Dequantize> improves on the C reference code.
	;// The Shift>=0 and Shift<0 cases are both covered by one code path:
	;// instead of subtracting 2 from Shift as the C reference does, the
	;// code performs Scale << Shift once at the beginning and applies a
	;// constant right shift by 2 after the multiplication. The rounding
	;// value is therefore always 2.
	;//
	;// Doing this avoids the branches otherwise required and also
	;// reduces the code size substantially.
	;//
	;//----------------------------------------------------------------------


	;// D5 is free here: dColSum2 was last read by the VSUB producing dColOp1.
	VDUP dScale, Scale ;// ARM -> NEON: replicate Scale into all four 16-bit lanes


	;// Q3 overlays dColDiff1/dColDiff2, which are dead after the VADD
	;// producing dColOp3.
	VMOV qRound0,#2 ;// Preload each 32-bit accumulator lane with the rounding value
	VMOV qRound1,#2
	VMOV qRound2,#2
	VMOV qRound3,#2

	VMLAL qRound0,dColOp0,dScale ;// 32-bit lanes: value * Scale + Round
	VMLAL qRound1,dColOp1,dScale
	VMLAL qRound2,dColOp2,dScale
	VMLAL qRound3,dColOp3,dScale

	VSHRN dOut0,qRound0,#2 ;// Right shift by 2 and narrow to 16 bits, i.e. (OMX_S16)Value
	VSHRN dOut1,qRound1,#2
	VSHRN dOut2,qRound2,#2
	VSHRN dOut3,qRound3,#2

	;***************************
	;// Store all 16 results back to pData (contiguous, no interleave)
	;***************************

	VST1 {dOut0,dOut1,dOut2,dOut3}, [pData]


	;// Set return value
	;// (nothing is written to r0 here)

	;// Write function tail
	M_END

	ENDIF ;//CORTEXA8



	;//----------------------------------------------------------------------
	;// Function: omxVCM4P10_TransformDequantLumaDCFromPair
	;//
	;// Two-stage wrapper:
	;//   1. armVCM4P10_UnpackBlock4x4 is called with r0 (ppSrc) and r1 (pDst)
	;//      exactly as they were received.
	;//   2. armVCM4P10_InvTransformDequantLumaDC4x4 is called with r0 = pDst
	;//      and r1 = the QP value that arrived in r2.
	;//
	;// In:   r0 = ppSrc, r1 = pDst, r2 = QP
	;// Out:  r0 = OMX_Sts_NoErr, unconditionally (no argument checks are
	;//       performed in this routine)
	;//----------------------------------------------------------------------

	;// Argument registers on entry
	ppSrc RN 0
	pDst RN 1
	qpIn RN 2

	;// Return value register
	result RN 0

	;// Registers that carry pDst and QP across the first call
	pDstKeep RN 4
	qpKeep RN 5

	;// Argument registers for the second call
	pBlockArg RN 0
	qpArg RN 1

	;// Implementation is only provided for the CortexA8 variant

	IF CortexA8

	;// NOTE(review): the "r5" argument presumably asks the armCOMM_s.h
	;// macro to preserve r4-r5 (plus the return address) - confirm.
	M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5

	;// Stage 1: r0/r1 are passed through untouched; park the values
	;// needed afterwards in r4/r5 first.
	MOV qpKeep,qpIn ;// r5 = QP
	MOV pDstKeep,pDst ;// r4 = pDst
	BL armVCM4P10_UnpackBlock4x4

	;// Stage 2: rebuild the (pData, QP) argument pair from r4/r5.
	MOV qpArg,qpKeep ;// r1 = QP
	MOV pBlockArg,pDstKeep ;// r0 = pDst
	BL armVCM4P10_InvTransformDequantLumaDC4x4

	;// Report success
	MOV result,#OMX_Sts_NoErr

	M_END

	ENDIF ;// CortexA8


	END