av/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s - nest-cam/4320010/av - Git at Google

 ;//
 ;// Copyright (C) 2007-2008 ARM Limited
 ;//
 ;// Licensed under the Apache License, Version 2.0 (the "License");
 ;// you may not use this file except in compliance with the License.
 ;// You may obtain a copy of the License at
 ;//
 ;//      http://www.apache.org/licenses/LICENSE-2.0
 ;//
 ;// Unless required by applicable law or agreed to in writing, software
 ;// distributed under the License is distributed on an "AS IS" BASIS,
 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ;// See the License for the specific language governing permissions and
 ;// limitations under the License.
 ;//
 ;//
 ;//
 ;// File Name:  omxVCM4P10_TransformDequantLumaDCFromPair_s.s
 ;// OpenMAX DL: v1.0.2
 ;// Revision:   9641
 ;// Date:       Thursday, February 7, 2008
 ;//
 ;//
 ;//
 ;//
 ;// Description:
 ;// H.264 inverse quantize and transform module
 ;//
 ;//

 ;// Include standard headers

         INCLUDE omxtypes_s.h
         INCLUDE armCOMM_s.h

 ;// Import/Export symbols required from/to other files
 ;// (For example tables)

         IMPORT armVCM4P10_UnpackBlock4x4
         IMPORT armVCM4P10_QPDivTable
         IMPORT armVCM4P10_VMatrixQPModTable

         M_VARIANTS ARM1136JS

 ;// Set debugging level
 ;//DEBUG_ON    SETL {TRUE}


 ;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4


 ;// Guarding implementation by the processor name

     IF  ARM1136JS


 ;//Input Registers
 pData               RN  0
 QP                  RN  1

 ;//Output Registers


 ;//Local Scratch Registers

 ;// Packed Input pixels
 in00                RN  2                   ;// Src[0] & Src[1]
 in02                RN  3                   ;// Src[2] & Src[3]
 in10                RN  4                   ;// Src[4] & Src[5]
 in12                RN  5                   ;// Src[6] & Src[7]
 in20                RN  6                   ;// Src[8] & Src[9]
 in22                RN  7                   ;// Src[10] & Src[11]
 in30                RN  8                   ;// Src[12] & Src[13]
 in32                RN  9                   ;// Src[14] & Src[15]

 ;// Transpose for Row operations (Rows to cols)
 trRow00             RN  2
 trRow10             RN  10
 trRow02             RN  3
 trRow12             RN  5
 trRow20             RN  11
 trRow30             RN  12
 trRow32             RN  14
 trRow22             RN  7

 ;// Intermediate calculations
 rowSum1             RN  4
 rowSum2             RN  6
 rowDiff1            RN  8
 rowDiff2            RN  9


 ;// Row operated pixels
 rowOp00             RN  2
 rowOp10             RN  10
 rowOp20             RN  11
 rowOp30             RN  12
 rowOp02             RN  3
 rowOp12             RN  5
 rowOp22             RN  7
 rowOp32             RN  14

 ;// Transpose for colulmn operations
 trCol00             RN  2
 trCol02             RN  3
 trCol10             RN  4
 trCol12             RN  5
 trCol20             RN  6
 trCol22             RN  7
 trCol30             RN  8
 trCol32             RN  9

 ;// Intermediate calculations
 colSum1             RN  10
 colSum2             RN  11
 colDiff1            RN  12
 colDiff2            RN  14


 ;// Coloumn operated pixels
 colOp00             RN  2
 colOp02             RN  3
 colOp10             RN  4
 colOp12             RN  5
 colOp20             RN  6
 colOp22             RN  7
 colOp30             RN  8
 colOp32             RN  9

 ;// Temporary scratch varaibles
 pQPDivTable         RN  0
 pQPModTable         RN  11
 Shift               RN  10
 Scale               RN  14
 Round               RN  0

 temp1               RN  10
 temp2                RN  11
 temp3               RN  12
 temp4               RN  1


 ;// InvTransformed and Dequantized pixels
 out00               RN  2
 out02               RN  3
 out10               RN  4
 out12               RN  5
 out20               RN  6
 out22               RN  7
 out30               RN  8
 out32               RN  9


     ;// Allocate stack memory required by the function
         M_ALLOC4    pDataOnStack, 4

     ;// Write function header
         M_START armVCM4P10_InvTransformDequantLumaDC4x4,r11

         ;******************************************************************
         ;// The strategy used in implementing the transform is as follows:*
         ;// Load the 4x4 block into 8 registers                           *
         ;// Transpose the 4x4 matrix                                      *
         ;// Perform the row operations (on columns) using SIMD            *
         ;// Transpose the 4x4 result matrix                               *
         ;// Perform the coloumn operations                                *
         ;// Store the 4x4 block at one go                                 *
         ;******************************************************************

         ;// Load all the 4x4 pixels

         LDMIA   pData,{in00,in02,in10,in12,in20,in22,in30,in32}

         ;//*****************************************************************
         ;//
         ;// Transpose the matrix inorder to perform row ops as coloumn ops
         ;// Input:   in[][] = original matrix
         ;// Output:  trRow[][]= transposed matrix
         ;// Step1: Obtain the LL part of the transposed matrix
         ;// Step2: Obtain the HL part
         ;// step3: Obtain the LH part
         ;// Step4: Obtain the HH part
         ;//
         ;//*****************************************************************

         ;// LL 2x2 transposed matrix
         ;//   d0 d1 - -
         ;//   d4 d5 - -
         ;//   -  -  - -
         ;//   -  -  - -

         PKHTB   trRow10,in10,in00,ASR #16               ;// [5 4] = [f5:f1]
         PKHBT   trRow00,in00,in10,LSL #16               ;// [1 0] = [f4:f0]

         ;// HL 2x2 transposed matrix
         ;//    -   -   - -
         ;//    -   -   - -
         ;//    d8  d9  - -
         ;//   d12 d13  - -


          PKHTB   trRow30,in12,in02,ASR #16              ;// [13 12] = [7 3]
          PKHBT   trRow20,in02,in12,LSL #16              ;// [9 8] = [6 2]

         ;// LH 2x2 transposed matrix
         ;//   - - d2 d3
         ;//   - - d6 d7
         ;//   - - -  -
         ;//   - - -  -

         PKHBT   trRow02,in20,in30,LSL #16               ;// [3 2] = [f12:f8]
         PKHTB   trRow12,in30,in20,ASR #16               ;// [7 6] = [f13:f9]


         ;// HH 2x2 transposed matrix
         ;//    - -   -   -
         ;//    - -   -   -
         ;//    - -  d10 d11
         ;//    - -  d14 d15

         PKHTB   trRow32,in32,in22,ASR #16               ;// [15 14] = [15 11]
         PKHBT   trRow22,in22,in32,LSL #16               ;// [11 10] = [14 10]


         ;****************************************
         ;// Row Operations (Performed on columns)
         ;****************************************


         ;// SIMD operations on first two columns(two rows of the original matrix)

         SADD16      rowSum1,trRow00,trRow10                ;// (c0+c1)
         SADD16      rowSum2,trRow20,trRow30                ;// (c2+c3)
         SSUB16      rowDiff1,trRow00,trRow10               ;// (c0-c1)
         SSUB16      rowDiff2,trRow20,trRow30               ;// (c2-c3)
         SADD16      rowOp00,rowSum1,rowSum2                ;// (c0+c1+c2+c3)
         SSUB16      rowOp10,rowSum1,rowSum2                ;// (c0+c1-c2-c3)
         SSUB16      rowOp20,rowDiff1,rowDiff2              ;// (c0-c1-c2+c3)
         SADD16      rowOp30,rowDiff1,rowDiff2              ;// (c0-c1+c2-c3)


         ;// SIMD operations on next two columns(next two rows of the original matrix)

         SADD16      rowSum1,trRow02,trRow12                ;// (c0+c1)
         SADD16      rowSum2,trRow22,trRow32                ;// (c2+c3)
         SSUB16      rowDiff1,trRow02,trRow12               ;// (c0-c1)
         SSUB16      rowDiff2,trRow22,trRow32               ;// (c2-c3)
         SADD16      rowOp02,rowSum1,rowSum2                ;// (c0+c1+c2+c3)
         SSUB16      rowOp12,rowSum1,rowSum2                ;// (c0+c1-c2-c3)
         SSUB16      rowOp22,rowDiff1,rowDiff2              ;// (c0-c1-c2+c3)
         SADD16      rowOp32,rowDiff1,rowDiff2              ;// (c0-c1+c2-c3)


         ;*****************************************************************
         ;// Transpose the resultant matrix
         ;// Input:  rowOp[][]
         ;// Output: trCol[][]
         ;*****************************************************************

         ;// LL 2x2 transposed matrix
         ;//   d0 d1 - -
         ;//   d4 d5 - -
         ;//   -  -  - -
         ;//   -  -  - -

         PKHTB   trCol10,rowOp10,rowOp00,ASR #16           ;// [5 4] = [f5:f1]
         PKHBT   trCol00,rowOp00,rowOp10,LSL #16           ;// [1 0] = [f4:f0]

         ;// HL 2x2 transposed matrix
         ;//    -   -   - -
         ;//    -   -   - -
         ;//    d8  d9  - -
         ;//   d12 d13  - -


          PKHTB   trCol30,rowOp12,rowOp02,ASR #16          ;// [13 12] = [7 3]
          PKHBT   trCol20,rowOp02,rowOp12,LSL #16          ;// [9 8] = [6 2]

         ;// LH 2x2 transposed matrix
         ;//   - - d2 d3
         ;//   - - d6 d7
         ;//   - - -  -
         ;//   - - -  -

         PKHBT   trCol02,rowOp20,rowOp30,LSL #16           ;// [3 2] = [f12:f8]
         PKHTB   trCol12,rowOp30,rowOp20,ASR #16           ;// [7 6] = [f13:f9]


         ;// HH 2x2 transposed matrix
         ;//    - -   -   -
         ;//    - -   -   -
         ;//    - -  d10 d11
         ;//    - -  d14 d15

         PKHTB   trCol32,rowOp32,rowOp22,ASR #16            ;// [15 14] = [15 11]
         PKHBT   trCol22,rowOp22,rowOp32,LSL #16            ;// [11 10] = [14 10]


         ;*******************************
         ;// Coloumn Operations
         ;*******************************

         ;//--------------------------------------------------------------------------------------
         ;// Store pData(RN0) on stack and restore it only at the final store back
         ;// This frees up a register (RN0) which is used to reduce number of intermediate stalls
         ;//--------------------------------------------------------------------------------------
         M_STR       pData,pDataOnStack


         ;// SIMD operations on first two columns(two rows of the original matrix)

         SADD16      colSum1,trCol00,trCol10                ;// (c0+c1)
         SADD16      colSum2,trCol20,trCol30                ;// (c2+c3)
         SSUB16      colDiff1,trCol00,trCol10               ;// (c0-c1)
         SSUB16      colDiff2,trCol20,trCol30               ;// (c2-c3)
         SADD16      colOp00,colSum1,colSum2                ;// (c0+c1+c2+c3)
         SSUB16      colOp10,colSum1,colSum2                ;// (c0+c1-c2-c3)
         SSUB16      colOp20,colDiff1,colDiff2              ;// (c0-c1-c2+c3)
         SADD16      colOp30,colDiff1,colDiff2              ;// (c0-c1+c2-c3)


         ;// SIMD operations on next two columns(next two rows of the original matrix)

         LDR         pQPDivTable, =armVCM4P10_QPDivTable    ;// QP Division look-up-table base pointer
         SADD16      colSum1,trCol02,trCol12                ;// (c0+c1)
         SADD16      colSum2,trCol22,trCol32                ;// (c2+c3)
         SSUB16      colDiff1,trCol02,trCol12               ;// (c0-c1)
         SSUB16      colDiff2,trCol22,trCol32               ;// (c2-c3)
         SADD16      colOp02,colSum1,colSum2                ;// (c0+c1+c2+c3)
         SSUB16      colOp12,colSum1,colSum2                ;// (c0+c1-c2-c3)
         LDR         pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer
         LDRSB       Shift, [pQPDivTable, QP]               ;// Shift = pQPDivTable[QP]
         SSUB16      colOp22,colDiff1,colDiff2              ;// (c0-c1-c2+c3)
         SADD16      colOp32,colDiff1,colDiff2              ;// (c0-c1+c2-c3)


         LDRSB       Scale, [pQPModTable, QP]               ;// Scale = pQPModTable[QP]

         ;//----------------------------------------------------------------------
         ;//
         ;// <Dequantize> improves on the c-reference code
         ;// Both the  cases i.e., Shift>=0 and Shift<0 cases are covered together
         ;// We do not subtract 2 from Shift as in C reference, instead perform a
         ;// Scale << Shift once in the beginning and do a right shift by a
         ;// constant 2 after the Multiplication. The value of Round would be 2
         ;//
         ;// By doing this we aviod the Branches required and also
         ;// reduce the code size substantially
         ;//
         ;//----------------------------------------------------------------------

         MOV         Round, #2                               ;// Round = 2
         LSL         Scale, Scale, Shift                     ;// Scale = Scale << Shift


         ;// Row 1
         SMLABB  temp1, colOp00, Scale, Round                ;// Temp1 = B(c0w0) * Scale + Round
         SMLABB  temp3, colOp02, Scale, Round                ;// Temp3 = B(c1w0) * Scale + Round
         SMLATB  temp2, colOp00, Scale, Round                ;// Temp2 = T(c0w0) * Scale + Round
         SMLATB  temp4, colOp02, Scale, Round                ;// Temp4 = T(c1w0) * Scale + Round

         ASR     temp1, temp1, #2                            ;// Temp1 = Temp1 >> 2
         ASR     temp3, temp3, #2                            ;// Temp3 = Temp3 >> 2
         PKHBT   out00,  temp1, temp2, LSL #14               ;// c0w0  = | Temp2 | Temp1 |
         PKHBT   out02,  temp3, temp4, LSL #14               ;// c1w0  = | Temp2 | Temp1 |


         ;// Row 2
         SMLABB  temp1, colOp10, Scale, Round                ;// Temp1 = B(c0w0) * Scale + Round
         SMLABB  temp3, colOp12, Scale, Round                ;// Temp3 = B(c1w0) * Scale + Round
         SMLATB  temp2, colOp10, Scale, Round                ;// Temp2 = T(c0w0) * Scale + Round
         SMLATB  temp4, colOp12, Scale, Round                ;// Temp4 = T(c1w0) * Scale + Round

         ASR     temp1, temp1, #2                            ;// Temp1 = Temp1 >> 2
         ASR     temp3, temp3, #2                            ;// Temp3 = Temp3 >> 2
         PKHBT   out10,  temp1, temp2, LSL #14               ;// c0w0  = | Temp2 | Temp1 |
         PKHBT   out12,  temp3, temp4, LSL #14               ;// c1w0  = | Temp2 | Temp1 |

         ;// Row 3
         SMLABB  temp1, colOp20, Scale, Round                ;// Temp1 = B(c0w0) * Scale + Round
         SMLABB  temp3, colOp22, Scale, Round                ;// Temp3 = B(c1w0) * Scale + Round
         SMLATB  temp2, colOp20, Scale, Round                ;// Temp2 = T(c0w0) * Scale + Round
         SMLATB  temp4, colOp22, Scale, Round                ;// Temp4 = T(c1w0) * Scale + Round

         ASR     temp1, temp1, #2                            ;// Temp1 = Temp1 >> 2
         ASR     temp3, temp3, #2                            ;// Temp3 = Temp3 >> 2
         PKHBT   out20,  temp1, temp2, LSL #14               ;// c0w0  = | Temp2 | Temp1 |
         PKHBT   out22,  temp3, temp4, LSL #14               ;// c1w0  = | Temp2 | Temp1 |

         ;// Row 4
         SMLABB  temp1, colOp30, Scale, Round                ;// Temp1 = B(c0w0) * Scale + Round
         SMLABB  temp3, colOp32, Scale, Round                ;// Temp3 = B(c1w0) * Scale + Round
         SMLATB  temp2, colOp30, Scale, Round                ;// Temp2 = T(c0w0) * Scale + Round
         SMLATB  temp4, colOp32, Scale, Round                ;// Temp4 = T(c1w0) * Scale + Round

         M_LDR   pData,pDataOnStack                          ;// Restore pData pointer from stack
         ASR     temp1, temp1, #2                            ;// Temp1 = Temp1 >> 2
         ASR     temp3, temp3, #2                            ;// Temp3 = Temp3 >> 2
         PKHBT   out30,  temp1, temp2, LSL #14               ;// c0w0  = | Temp2 | Temp1 |
         PKHBT   out32,  temp3, temp4, LSL #14               ;// c1w0  = | Temp2 | Temp1 |


         ;***************************
         ;// Store all the 4x4 pixels
         ;***************************

 store_coeff

         STMIA   pData,{out00,out02,out10,out12,out20,out22,out30,out32}


         ;// Set return value


         ;// Write function tail
         M_END

     ENDIF                                                           ;//ARM1136JS


 ;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4

 ;// Guarding implementation by the processor name


 ;// Function: omxVCM4P10_TransformDequantLumaDCFromPair

 ;//Input Registers
 ppSrc               RN  0
 pDst                RN  1
 QPR2                RN  2

 ;//Output Registers
 result              RN  0

 ;//Local Scratch Registers
 pDstR4              RN  4
 pDstR0              RN  0
 QPR1                RN  1
 QPR5                RN  5

 ;// Guarding implementation by the processor name

     IF ARM1136JS

     ;// Allocate stack memory required by the function


     ;// Write function header
         M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5

         MOV     pDstR4,pDst                         ;// Saving register r1
         MOV     QPR5,QPR2                           ;// Saving register r2
         BL      armVCM4P10_UnpackBlock4x4

         MOV     pDstR0,pDstR4                       ;// Setting up register r0
         MOV     QPR1,QPR5                           ;// Setting up register r1
         BL      armVCM4P10_InvTransformDequantLumaDC4x4


         ;// Set return value
         MOV     result,#OMX_Sts_NoErr

         ;// Write function tail
         M_END


     ENDIF                                                           ;//ARM1136JS


     END
	;//
	;// Copyright (C) 2007-2008 ARM Limited
	;//
	;// Licensed under the Apache License, Version 2.0 (the "License");
	;// you may not use this file except in compliance with the License.
	;// You may obtain a copy of the License at
	;//
	;// http://www.apache.org/licenses/LICENSE-2.0
	;//
	;// Unless required by applicable law or agreed to in writing, software
	;// distributed under the License is distributed on an "AS IS" BASIS,
	;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	;// See the License for the specific language governing permissions and
	;// limitations under the License.
	;//
	;//
	;//
	;// File Name: omxVCM4P10_TransformDequantLumaDCFromPair_s.s
	;// OpenMAX DL: v1.0.2
	;// Revision: 9641
	;// Date: Thursday, February 7, 2008
	;//
	;//
	;//
	;//
	;// Description:
	;// H.264 inverse quantize and transform module
	;//
	;//

	;// Include standard headers

	INCLUDE omxtypes_s.h
	INCLUDE armCOMM_s.h

	;// Import/Export symbols required from/to other files
	;// (For example tables)

	IMPORT armVCM4P10_UnpackBlock4x4
	IMPORT armVCM4P10_QPDivTable
	IMPORT armVCM4P10_VMatrixQPModTable

	M_VARIANTS ARM1136JS

	;// Set debugging level
	;//DEBUG_ON SETL {TRUE}


	;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4


	;// Guarding implementation by the processor name

	IF ARM1136JS


	;//Input Registers
	pData RN 0
	QP RN 1

	;//Output Registers


	;//Local Scratch Registers

	;// Packed Input pixels
	in00 RN 2 ;// Src[0] & Src[1]
	in02 RN 3 ;// Src[2] & Src[3]
	in10 RN 4 ;// Src[4] & Src[5]
	in12 RN 5 ;// Src[6] & Src[7]
	in20 RN 6 ;// Src[8] & Src[9]
	in22 RN 7 ;// Src[10] & Src[11]
	in30 RN 8 ;// Src[12] & Src[13]
	in32 RN 9 ;// Src[14] & Src[15]

	;// Transpose for Row operations (Rows to cols)
	trRow00 RN 2
	trRow10 RN 10
	trRow02 RN 3
	trRow12 RN 5
	trRow20 RN 11
	trRow30 RN 12
	trRow32 RN 14
	trRow22 RN 7

	;// Intermediate calculations
	rowSum1 RN 4
	rowSum2 RN 6
	rowDiff1 RN 8
	rowDiff2 RN 9


	;// Row operated pixels
	rowOp00 RN 2
	rowOp10 RN 10
	rowOp20 RN 11
	rowOp30 RN 12
	rowOp02 RN 3
	rowOp12 RN 5
	rowOp22 RN 7
	rowOp32 RN 14

	;// Transpose for colulmn operations
	trCol00 RN 2
	trCol02 RN 3
	trCol10 RN 4
	trCol12 RN 5
	trCol20 RN 6
	trCol22 RN 7
	trCol30 RN 8
	trCol32 RN 9

	;// Intermediate calculations
	colSum1 RN 10
	colSum2 RN 11
	colDiff1 RN 12
	colDiff2 RN 14


	;// Coloumn operated pixels
	colOp00 RN 2
	colOp02 RN 3
	colOp10 RN 4
	colOp12 RN 5
	colOp20 RN 6
	colOp22 RN 7
	colOp30 RN 8
	colOp32 RN 9

	;// Temporary scratch varaibles
	pQPDivTable RN 0
	pQPModTable RN 11
	Shift RN 10
	Scale RN 14
	Round RN 0

	temp1 RN 10
	temp2 RN 11
	temp3 RN 12
	temp4 RN 1



	;// InvTransformed and Dequantized pixels
	out00 RN 2
	out02 RN 3
	out10 RN 4
	out12 RN 5
	out20 RN 6
	out22 RN 7
	out30 RN 8
	out32 RN 9




	;// Allocate stack memory required by the function
	M_ALLOC4 pDataOnStack, 4

	;// Write function header
	M_START armVCM4P10_InvTransformDequantLumaDC4x4,r11

	;******************************************************************
	;// The strategy used in implementing the transform is as follows:*
	;// Load the 4x4 block into 8 registers *
	;// Transpose the 4x4 matrix *
	;// Perform the row operations (on columns) using SIMD *
	;// Transpose the 4x4 result matrix *
	;// Perform the coloumn operations *
	;// Store the 4x4 block at one go *
	;******************************************************************

	;// Load all the 4x4 pixels

	LDMIA pData,{in00,in02,in10,in12,in20,in22,in30,in32}

	;//*****************************************************************
	;//
	;// Transpose the matrix inorder to perform row ops as coloumn ops
	;// Input: in[][] = original matrix
	;// Output: trRow[][]= transposed matrix
	;// Step1: Obtain the LL part of the transposed matrix
	;// Step2: Obtain the HL part
	;// step3: Obtain the LH part
	;// Step4: Obtain the HH part
	;//
	;//*****************************************************************

	;// LL 2x2 transposed matrix
	;// d0 d1 - -
	;// d4 d5 - -
	;// - - - -
	;// - - - -

	PKHTB trRow10,in10,in00,ASR #16 ;// [5 4] = [f5:f1]
	PKHBT trRow00,in00,in10,LSL #16 ;// [1 0] = [f4:f0]

	;// HL 2x2 transposed matrix
	;// - - - -
	;// - - - -
	;// d8 d9 - -
	;// d12 d13 - -


	PKHTB trRow30,in12,in02,ASR #16 ;// [13 12] = [7 3]
	PKHBT trRow20,in02,in12,LSL #16 ;// [9 8] = [6 2]

	;// LH 2x2 transposed matrix
	;// - - d2 d3
	;// - - d6 d7
	;// - - - -
	;// - - - -

	PKHBT trRow02,in20,in30,LSL #16 ;// [3 2] = [f12:f8]
	PKHTB trRow12,in30,in20,ASR #16 ;// [7 6] = [f13:f9]




	;// HH 2x2 transposed matrix
	;// - - - -
	;// - - - -
	;// - - d10 d11
	;// - - d14 d15

	PKHTB trRow32,in32,in22,ASR #16 ;// [15 14] = [15 11]
	PKHBT trRow22,in22,in32,LSL #16 ;// [11 10] = [14 10]


	;****************************************
	;// Row Operations (Performed on columns)
	;****************************************


	;// SIMD operations on first two columns(two rows of the original matrix)

	SADD16 rowSum1,trRow00,trRow10 ;// (c0+c1)
	SADD16 rowSum2,trRow20,trRow30 ;// (c2+c3)
	SSUB16 rowDiff1,trRow00,trRow10 ;// (c0-c1)
	SSUB16 rowDiff2,trRow20,trRow30 ;// (c2-c3)
	SADD16 rowOp00,rowSum1,rowSum2 ;// (c0+c1+c2+c3)
	SSUB16 rowOp10,rowSum1,rowSum2 ;// (c0+c1-c2-c3)
	SSUB16 rowOp20,rowDiff1,rowDiff2 ;// (c0-c1-c2+c3)
	SADD16 rowOp30,rowDiff1,rowDiff2 ;// (c0-c1+c2-c3)


	;// SIMD operations on next two columns(next two rows of the original matrix)

	SADD16 rowSum1,trRow02,trRow12 ;// (c0+c1)
	SADD16 rowSum2,trRow22,trRow32 ;// (c2+c3)
	SSUB16 rowDiff1,trRow02,trRow12 ;// (c0-c1)
	SSUB16 rowDiff2,trRow22,trRow32 ;// (c2-c3)
	SADD16 rowOp02,rowSum1,rowSum2 ;// (c0+c1+c2+c3)
	SSUB16 rowOp12,rowSum1,rowSum2 ;// (c0+c1-c2-c3)
	SSUB16 rowOp22,rowDiff1,rowDiff2 ;// (c0-c1-c2+c3)
	SADD16 rowOp32,rowDiff1,rowDiff2 ;// (c0-c1+c2-c3)



	;*****************************************************************
	;// Transpose the resultant matrix
	;// Input: rowOp[][]
	;// Output: trCol[][]
	;*****************************************************************

	;// LL 2x2 transposed matrix
	;// d0 d1 - -
	;// d4 d5 - -
	;// - - - -
	;// - - - -

	PKHTB trCol10,rowOp10,rowOp00,ASR #16 ;// [5 4] = [f5:f1]
	PKHBT trCol00,rowOp00,rowOp10,LSL #16 ;// [1 0] = [f4:f0]

	;// HL 2x2 transposed matrix
	;// - - - -
	;// - - - -
	;// d8 d9 - -
	;// d12 d13 - -


	PKHTB trCol30,rowOp12,rowOp02,ASR #16 ;// [13 12] = [7 3]
	PKHBT trCol20,rowOp02,rowOp12,LSL #16 ;// [9 8] = [6 2]

	;// LH 2x2 transposed matrix
	;// - - d2 d3
	;// - - d6 d7
	;// - - - -
	;// - - - -

	PKHBT trCol02,rowOp20,rowOp30,LSL #16 ;// [3 2] = [f12:f8]
	PKHTB trCol12,rowOp30,rowOp20,ASR #16 ;// [7 6] = [f13:f9]




	;// HH 2x2 transposed matrix
	;// - - - -
	;// - - - -
	;// - - d10 d11
	;// - - d14 d15

	PKHTB trCol32,rowOp32,rowOp22,ASR #16 ;// [15 14] = [15 11]
	PKHBT trCol22,rowOp22,rowOp32,LSL #16 ;// [11 10] = [14 10]


	;*******************************
	;// Coloumn Operations
	;*******************************

	;//--------------------------------------------------------------------------------------
	;// Store pData(RN0) on stack and restore it only at the final store back
	;// This frees up a register (RN0) which is used to reduce number of intermediate stalls
	;//--------------------------------------------------------------------------------------
	M_STR pData,pDataOnStack


	;// SIMD operations on first two columns(two rows of the original matrix)

	SADD16 colSum1,trCol00,trCol10 ;// (c0+c1)
	SADD16 colSum2,trCol20,trCol30 ;// (c2+c3)
	SSUB16 colDiff1,trCol00,trCol10 ;// (c0-c1)
	SSUB16 colDiff2,trCol20,trCol30 ;// (c2-c3)
	SADD16 colOp00,colSum1,colSum2 ;// (c0+c1+c2+c3)
	SSUB16 colOp10,colSum1,colSum2 ;// (c0+c1-c2-c3)
	SSUB16 colOp20,colDiff1,colDiff2 ;// (c0-c1-c2+c3)
	SADD16 colOp30,colDiff1,colDiff2 ;// (c0-c1+c2-c3)


	;// SIMD operations on next two columns(next two rows of the original matrix)

	LDR pQPDivTable, =armVCM4P10_QPDivTable ;// QP Division look-up-table base pointer
	SADD16 colSum1,trCol02,trCol12 ;// (c0+c1)
	SADD16 colSum2,trCol22,trCol32 ;// (c2+c3)
	SSUB16 colDiff1,trCol02,trCol12 ;// (c0-c1)
	SSUB16 colDiff2,trCol22,trCol32 ;// (c2-c3)
	SADD16 colOp02,colSum1,colSum2 ;// (c0+c1+c2+c3)
	SSUB16 colOp12,colSum1,colSum2 ;// (c0+c1-c2-c3)
	LDR pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer
	LDRSB Shift, [pQPDivTable, QP] ;// Shift = pQPDivTable[QP]
	SSUB16 colOp22,colDiff1,colDiff2 ;// (c0-c1-c2+c3)
	SADD16 colOp32,colDiff1,colDiff2 ;// (c0-c1+c2-c3)


	LDRSB Scale, [pQPModTable, QP] ;// Scale = pQPModTable[QP]

	;//----------------------------------------------------------------------
	;//
	;// <Dequantize> improves on the c-reference code
	;// Both the cases i.e., Shift>=0 and Shift<0 cases are covered together
	;// We do not subtract 2 from Shift as in C reference, instead perform a
	;// Scale << Shift once in the beginning and do a right shift by a
	;// constant 2 after the Multiplication. The value of Round would be 2
	;//
	;// By doing this we aviod the Branches required and also
	;// reduce the code size substantially
	;//
	;//----------------------------------------------------------------------

	MOV Round, #2 ;// Round = 2
	LSL Scale, Scale, Shift ;// Scale = Scale << Shift


	;// Row 1
	SMLABB temp1, colOp00, Scale, Round ;// Temp1 = B(c0w0) * Scale + Round
	SMLABB temp3, colOp02, Scale, Round ;// Temp3 = B(c1w0) * Scale + Round
	SMLATB temp2, colOp00, Scale, Round ;// Temp2 = T(c0w0) * Scale + Round
	SMLATB temp4, colOp02, Scale, Round ;// Temp4 = T(c1w0) * Scale + Round

	ASR temp1, temp1, #2 ;// Temp1 = Temp1 >> 2
	ASR temp3, temp3, #2 ;// Temp3 = Temp3 >> 2
	PKHBT out00, temp1, temp2, LSL #14 ;// c0w0 = \| Temp2 \| Temp1 \|
	PKHBT out02, temp3, temp4, LSL #14 ;// c1w0 = \| Temp2 \| Temp1 \|


	;// Row 2
	SMLABB temp1, colOp10, Scale, Round ;// Temp1 = B(c0w0) * Scale + Round
	SMLABB temp3, colOp12, Scale, Round ;// Temp3 = B(c1w0) * Scale + Round
	SMLATB temp2, colOp10, Scale, Round ;// Temp2 = T(c0w0) * Scale + Round
	SMLATB temp4, colOp12, Scale, Round ;// Temp4 = T(c1w0) * Scale + Round

	ASR temp1, temp1, #2 ;// Temp1 = Temp1 >> 2
	ASR temp3, temp3, #2 ;// Temp3 = Temp3 >> 2
	PKHBT out10, temp1, temp2, LSL #14 ;// c0w0 = \| Temp2 \| Temp1 \|
	PKHBT out12, temp3, temp4, LSL #14 ;// c1w0 = \| Temp2 \| Temp1 \|

	;// Row 3
	SMLABB temp1, colOp20, Scale, Round ;// Temp1 = B(c0w0) * Scale + Round
	SMLABB temp3, colOp22, Scale, Round ;// Temp3 = B(c1w0) * Scale + Round
	SMLATB temp2, colOp20, Scale, Round ;// Temp2 = T(c0w0) * Scale + Round
	SMLATB temp4, colOp22, Scale, Round ;// Temp4 = T(c1w0) * Scale + Round

	ASR temp1, temp1, #2 ;// Temp1 = Temp1 >> 2
	ASR temp3, temp3, #2 ;// Temp3 = Temp3 >> 2
	PKHBT out20, temp1, temp2, LSL #14 ;// c0w0 = \| Temp2 \| Temp1 \|
	PKHBT out22, temp3, temp4, LSL #14 ;// c1w0 = \| Temp2 \| Temp1 \|

	;// Row 4
	SMLABB temp1, colOp30, Scale, Round ;// Temp1 = B(c0w0) * Scale + Round
	SMLABB temp3, colOp32, Scale, Round ;// Temp3 = B(c1w0) * Scale + Round
	SMLATB temp2, colOp30, Scale, Round ;// Temp2 = T(c0w0) * Scale + Round
	SMLATB temp4, colOp32, Scale, Round ;// Temp4 = T(c1w0) * Scale + Round

	M_LDR pData,pDataOnStack ;// Restore pData pointer from stack
	ASR temp1, temp1, #2 ;// Temp1 = Temp1 >> 2
	ASR temp3, temp3, #2 ;// Temp3 = Temp3 >> 2
	PKHBT out30, temp1, temp2, LSL #14 ;// c0w0 = \| Temp2 \| Temp1 \|
	PKHBT out32, temp3, temp4, LSL #14 ;// c1w0 = \| Temp2 \| Temp1 \|



	;***************************
	;// Store all the 4x4 pixels
	;***************************

	store_coeff

	STMIA pData,{out00,out02,out10,out12,out20,out22,out30,out32}



	;// Set return value


	;// Write function tail
	M_END

	ENDIF ;//ARM1136JS


	;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4

	;// Guarding implementation by the processor name




	;// Function: omxVCM4P10_TransformDequantLumaDCFromPair

	;//Input Registers
	ppSrc RN 0
	pDst RN 1
	QPR2 RN 2

	;//Output Registers
	result RN 0

	;//Local Scratch Registers
	pDstR4 RN 4
	pDstR0 RN 0
	QPR1 RN 1
	QPR5 RN 5

	;// Guarding implementation by the processor name

	IF ARM1136JS

	;// Allocate stack memory required by the function


	;// Write function header
	M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5

	MOV pDstR4,pDst ;// Saving register r1
	MOV QPR5,QPR2 ;// Saving register r2
	BL armVCM4P10_UnpackBlock4x4

	MOV pDstR0,pDstR4 ;// Setting up register r0
	MOV QPR1,QPR5 ;// Setting up register r1
	BL armVCM4P10_InvTransformDequantLumaDC4x4


	;// Set return value
	MOV result,#OMX_Sts_NoErr

	;// Write function tail
	M_END


	ENDIF ;//ARM1136JS


	END