 ;// av/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s - nest-cam/4320010/av - Git at Google

 ;//
 ;// Copyright (C) 2007 ARM Limited
 ;//
 ;// Licensed under the Apache License, Version 2.0 (the "License");
 ;// you may not use this file except in compliance with the License.
 ;// You may obtain a copy of the License at
 ;//
 ;//      http://www.apache.org/licenses/LICENSE-2.0
 ;//
 ;// Unless required by applicable law or agreed to in writing, software
 ;// distributed under the License is distributed on an "AS IS" BASIS,
 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ;// See the License for the specific language governing permissions and
 ;// limitations under the License.
 ;//
 ;//
 ;//
 ;// Description:
 ;// H.264 inverse quantize and transform module
 ;//
 ;//


 ;// Include standard headers

         INCLUDE omxtypes_s.h
         INCLUDE armCOMM_s.h

 ;// Import symbols required from other files
 ;// (For example tables)

         IMPORT armVCM4P10_UnpackBlock4x4
         IMPORT armVCM4P10_TransformResidual4x4
         IMPORT armVCM4P10_QPDivTable
         IMPORT armVCM4P10_VMatrixU16
         IMPORT armVCM4P10_QPModuloTable

     M_VARIANTS ARM1136JS, ARM1136JS_U

 ;// Set debugging level
 ;//DEBUG_ON    SETL {TRUE}


 ;// Static Function: armVCM4P10_DequantLumaAC4x4   (ARM1136JS variant)
 ;//
 ;// Purpose:
 ;//   Scales a 4x4 block of 16-bit coefficients in place:
 ;//       pSrcDst[i] = pSrcDst[i] * (pVRow[col(i)] << Shift)      i = 0..15
 ;//   pVRow  = armVCM4P10_VMatrixU16 + armVCM4P10_QPModuloTable[QP]  (byte offset)
 ;//   Shift  = armVCM4P10_QPDivTable[QP]
 ;//   col(i) = the fixed 0/1/2 pattern spelled out in the per-row comments below.
 ;//   Only the low 16 bits of each 32-bit product are kept (PKHBT packing);
 ;//   nothing is saturated.
 ;//
 ;// In:   r0 = pSrcDst  16 halfword coefficients, read/written as 8 words
 ;//                     with a single LDMIA / STMIA
 ;//       r1 = QP       byte index into both QP tables
 ;// Out:  the block at pSrcDst is overwritten; no return value is set.
 ;//
 ;// This variant fetches the three V-matrix entries with halfword loads (LDRH).
 ;// NOTE(review): LDMIA/STMIA presumably need pSrcDst to be word aligned -
 ;// confirm against the callers.

 ;// Guarding implementation by the processor name

     IF  ARM1136JS

 ;//Input Registers
 pSrcDst       RN  0
 QP            RN  1


 ;//Output Registers
 ;// (none - the result is written back through pSrcDst)


 ;//Local Scratch Registers
 ;// Several aliases below share a physical register with an earlier value.
 ;// Each one is first written only after the last read of the value it
 ;// replaces, so the instruction order in the body matters.
 pQPdiv          RN  4
 pQPmod          RN  5
 pVRow           RN  2
 QPmod           RN  6
 shift           RN  3
 rowLuma01       RN  1                                ;// reuses r1 (QP)
 rowLuma23       RN  4                                ;// reuses r4 (pQPdiv)

 SrcDst00        RN  5                                ;// reuses r5 (pQPmod)
 SrcDst02        RN  6                                ;// reuses r6 (QPmod)
 SrcDst10        RN  7
 SrcDst12        RN  8
 SrcDst20        RN  9
 SrcDst22        RN  10
 SrcDst30        RN  11
 SrcDst32        RN  12

 temp1           RN  2                                ;// reuses r2 (pVRow)
 temp2           RN  3                                ;// reuses r3 (shift)
 temp3           RN  14                               ;// r14 (lr) used as scratch


         ;// Allocate stack memory required by the function
         ;// (none - everything is kept in registers)

         ;// Write function header
         ;// NOTE(review): M_START/M_END are macros from armCOMM_s.h (not visible
         ;// here). The r11 argument presumably asks for r4-r11 to be saved, and
         ;// since r14 is clobbered below the macro pair must also preserve the
         ;// return address - confirm.
         M_START armVCM4P10_DequantLumaAC4x4,r11

         ;// Base addresses of the three imported lookup tables
         LDR    pQPmod,=armVCM4P10_QPModuloTable
         LDR    pQPdiv,=armVCM4P10_QPDivTable
         LDR    pVRow,=armVCM4P10_VMatrixU16

         LDRSB  QPmod,[pQPmod,QP]                    ;// (QP%6) * 6 : byte offset of the V-matrix row
         LDRSB  shift,[pQPdiv,QP]                    ;// Shift = QP / 6

         ;// Fetch the three 16-bit row entries a,b,c; the first load also
         ;// advances pVRow by QPmod (pre-indexed with writeback).
         LDRH    rowLuma01,[pVRow,QPmod]!             ;// rowLuma01 = [00|0a]
         LDRH    temp3,[pVRow,#2]                     ;// temp3     = [00|0b]
         LDRH    rowLuma23,[pVRow,#4]                 ;// rowLuma23 = [00|0c]
         ORR     rowLuma01,rowLuma01,temp3,LSL #16    ;// rowLuma01 = [0b|0a]

         ;// Load all the 16 'src' values (two halfwords per register)
         LDMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}


         ;//*********************************************************************************************
         ;//
         ;// 'Shift' ranges between [0,8]
         ;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
         ;// (the shifted bottom halfword must not spill into the top halfword)
         ;//
         ;//*********************************************************************************************

         LSL    rowLuma01,rowLuma01,shift
         LSL    rowLuma23,rowLuma23,shift


         ;//**********************************************************************************************
         ;//
         ;// The idea is to unroll the loop completely
         ;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
         ;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
         ;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
         ;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
         ;// We first calculate pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift, which fits into 16 bits (above)
         ;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
         ;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
         ;//
         ;// We then pack the two 16 bit multiplication results into a word and store in one go
         ;//
         ;// SMULxy picks the Top/Bottom halfword of each operand: x selects the half of the
         ;// coefficient register, y the half of the rowLuma register.
         ;// Each odd-position product goes to a temp first, because the SMULBB that follows
         ;// overwrites the coefficient register with the even-position product.
         ;//
         ;//**********************************************************************************************


         ;// Row 1  (V columns 0,2,0,2)


         SMULTB  temp1,SrcDst00,rowLuma23                    ;// pSrcDst[1] * (pVRow[2]<<Shift)
         SMULBB  SrcDst00,SrcDst00,rowLuma01                 ;// pSrcDst[0] * (pVRow[0]<<Shift)

         SMULTB  temp2,SrcDst02,rowLuma23                    ;// pSrcDst[3] * (pVRow[2]<<Shift)
         SMULBB  SrcDst02,SrcDst02,rowLuma01                 ;// pSrcDst[2] * (pVRow[0]<<Shift)

         PKHBT   SrcDst00,SrcDst00,temp1,LSL #16             ;// Pack the first two product values


         ;// Row 2  (V columns 2,1,2,1)
         SMULTT  temp1,SrcDst10,rowLuma01                    ;// pSrcDst[5] * (pVRow[1]<<Shift)
         SMULBB  SrcDst10,SrcDst10,rowLuma23                 ;// pSrcDst[4] * (pVRow[2]<<Shift)

         PKHBT   SrcDst02,SrcDst02,temp2,LSL #16             ;// Pack the next two product values
         SMULTT  temp2,SrcDst12,rowLuma01                    ;// pSrcDst[7] * (pVRow[1]<<Shift)
         SMULBB  SrcDst12,SrcDst12,rowLuma23                    ;// pSrcDst[6] * (pVRow[2]<<Shift)

         PKHBT   SrcDst10,SrcDst10,temp1,LSL #16             ;// Pack the next two product values


         ;// Row 3  (V columns 0,2,0,2)

         SMULTB  temp1,SrcDst20,rowLuma23                    ;// pSrcDst[9] * (pVRow[2]<<Shift)
         SMULBB  SrcDst20,SrcDst20,rowLuma01                    ;// pSrcDst[8] * (pVRow[0]<<Shift)

         PKHBT   SrcDst12,SrcDst12,temp2,LSL #16               ;// Pack the next two product values
         SMULTB  temp2,SrcDst22,rowLuma23                    ;// pSrcDst[11] * (pVRow[2]<<Shift)
         SMULBB  SrcDst22,SrcDst22,rowLuma01                    ;// pSrcDst[10] * (pVRow[0]<<Shift)

         PKHBT   SrcDst20,SrcDst20,temp1,LSL #16             ;// Pack the next two product values


         ;// Row 4  (V columns 2,1,2,1)

         SMULTT  temp1,SrcDst30,rowLuma01                    ;// pSrcDst[13] * (pVRow[1]<<Shift)
         SMULBB  SrcDst30,SrcDst30,rowLuma23                    ;// pSrcDst[12] * (pVRow[2]<<Shift)

         SMULTT  temp3,SrcDst32,rowLuma01                    ;// pSrcDst[15] * (pVRow[1]<<Shift)
         SMULBB  SrcDst32,SrcDst32,rowLuma23                    ;// pSrcDst[14] * (pVRow[2]<<Shift)

         PKHBT   SrcDst22,SrcDst22,temp2,LSL #16             ;// Pack the remaining product values
         PKHBT   SrcDst30,SrcDst30,temp1,LSL #16
         PKHBT   SrcDst32,SrcDst32,temp3,LSL #16


         ;// Write all 16 scaled coefficients back over the input block
         STMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}


         ;// Set return value
         ;// (nothing is returned by this helper)


         ;// Write function tail
         M_END

     ENDIF                                                    ;//ARM1136JS


 ;// Static Function: armVCM4P10_DequantLumaAC4x4   (ARM1136JS_U variant)
 ;//
 ;// Purpose:
 ;//   Scales a 4x4 block of 16-bit coefficients in place:
 ;//       pSrcDst[i] = pSrcDst[i] * (pVRow[col(i)] << Shift)      i = 0..15
 ;//   pVRow  = armVCM4P10_VMatrixU16 + armVCM4P10_QPModuloTable[QP]  (byte offset)
 ;//   Shift  = armVCM4P10_QPDivTable[QP]
 ;//   col(i) = the fixed 0/1/2 pattern spelled out in the per-row comments below.
 ;//   Only the low 16 bits of each 32-bit product are kept (PKHBT packing);
 ;//   nothing is saturated.
 ;//
 ;// In:   r0 = pSrcDst  16 halfword coefficients, read/written as 8 words
 ;//                     with a single LDMIA / STMIA
 ;//       r1 = QP       byte index into both QP tables
 ;// Out:  the block at pSrcDst is overwritten; no return value is set.
 ;//
 ;// This variant fetches the V-matrix entries with two word loads (LDR)
 ;// instead of three halfword loads.
 ;// NOTE(review): the load offset is described as (QP%6)*6, so these word loads
 ;// are not always 4-byte aligned; the "_U" suffix presumably marks a build
 ;// with unaligned-access support - confirm.
 ;//
 ;// Guarding implementation by the processor name

     IF  ARM1136JS_U

 ;//Input Registers
 pSrcDst       RN  0
 QP            RN  1


 ;//Output Registers
 ;// (none - the result is written back through pSrcDst)


 ;//Local Scratch Registers
 ;// Several aliases below share a physical register with an earlier value.
 ;// Each one is first written only after the last read of the value it
 ;// replaces, so the instruction order in the body matters.
 pQPdiv          RN  4
 pQPmod          RN  5
 pVRow           RN  2
 QPmod           RN  6
 shift           RN  3
 rowLuma01       RN  1                                ;// reuses r1 (QP)
 rowLuma23       RN  4                                ;// reuses r4 (pQPdiv)

 SrcDst00        RN  5                                ;// reuses r5 (pQPmod)
 SrcDst02        RN  6                                ;// reuses r6 (QPmod)
 SrcDst10        RN  7
 SrcDst12        RN  8
 SrcDst20        RN  9
 SrcDst22        RN  10
 SrcDst30        RN  11
 SrcDst32        RN  12

 temp1           RN  2                                ;// reuses r2 (pVRow)
 temp2           RN  3                                ;// reuses r3 (shift)
 temp3           RN  14                               ;// r14 (lr) used as scratch


         ;// Allocate stack memory required by the function
         ;// (none - everything is kept in registers)

         ;// Write function header
         ;// NOTE(review): M_START/M_END are macros from armCOMM_s.h (not visible
         ;// here). The r11 argument presumably asks for r4-r11 to be saved, and
         ;// since r14 is clobbered below the macro pair must also preserve the
         ;// return address - confirm.
         M_START armVCM4P10_DequantLumaAC4x4,r11

         ;// Base addresses of the three imported lookup tables
         LDR    pQPmod,=armVCM4P10_QPModuloTable
         LDR    pQPdiv,=armVCM4P10_QPDivTable
         LDR    pVRow,=armVCM4P10_VMatrixU16

         LDRSB  QPmod,[pQPmod,QP]                    ;// (QP%6) * 6 : byte offset of the V-matrix row
         LDRSB  shift,[pQPdiv,QP]                    ;// Shift = QP / 6

         ;// Two word loads fetch four consecutive halfwords a,b,c,d; the first
         ;// load also advances pVRow by QPmod (pre-indexed with writeback).
         ;// 'd' is the halfword that follows the three entries actually used:
         ;// every multiply below reads only the bottom half of rowLuma23.
         LDR    rowLuma01,[pVRow,QPmod]!             ;// rowLuma01 = [0b|0a]
         LDR    rowLuma23,[pVRow,#4]                 ;// rowLuma23 = [0d|0c]

         ;// Load all the 16 'src' values (two halfwords per register)
         LDMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}


         ;//*********************************************************************************************
         ;//
         ;// 'Shift' ranges between [0,8]
         ;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
         ;// (the shifted bottom halfword must not spill into the top halfword)
         ;//
         ;//*********************************************************************************************

         LSL    rowLuma01,rowLuma01,shift
         LSL    rowLuma23,rowLuma23,shift


         ;//**********************************************************************************************
         ;//
         ;// The idea is to unroll the loop completely
         ;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
         ;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
         ;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
         ;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
         ;// We first calculate pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift, which fits into 16 bits (above)
         ;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
         ;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
         ;//
         ;// We then pack the two 16 bit multiplication results into a word and store in one go
         ;//
         ;// SMULxy picks the Top/Bottom halfword of each operand: x selects the half of the
         ;// coefficient register, y the half of the rowLuma register.
         ;// Each odd-position product goes to a temp first, because the SMULBB that follows
         ;// overwrites the coefficient register with the even-position product.
         ;//
         ;//**********************************************************************************************


         ;// Row 1  (V columns 0,2,0,2)


         SMULTB  temp1,SrcDst00,rowLuma23                    ;// pSrcDst[1] * (pVRow[2]<<Shift)
         SMULBB  SrcDst00,SrcDst00,rowLuma01                 ;// pSrcDst[0] * (pVRow[0]<<Shift)

         SMULTB  temp2,SrcDst02,rowLuma23                    ;// pSrcDst[3] * (pVRow[2]<<Shift)
         SMULBB  SrcDst02,SrcDst02,rowLuma01                 ;// pSrcDst[2] * (pVRow[0]<<Shift)

         PKHBT   SrcDst00,SrcDst00,temp1,LSL #16             ;// Pack the first two product values


         ;// Row 2  (V columns 2,1,2,1)
         SMULTT  temp1,SrcDst10,rowLuma01                    ;// pSrcDst[5] * (pVRow[1]<<Shift)
         SMULBB  SrcDst10,SrcDst10,rowLuma23                 ;// pSrcDst[4] * (pVRow[2]<<Shift)

         PKHBT   SrcDst02,SrcDst02,temp2,LSL #16             ;// Pack the next two product values
         SMULTT  temp2,SrcDst12,rowLuma01                    ;// pSrcDst[7] * (pVRow[1]<<Shift)
         SMULBB  SrcDst12,SrcDst12,rowLuma23                    ;// pSrcDst[6] * (pVRow[2]<<Shift)

         PKHBT   SrcDst10,SrcDst10,temp1,LSL #16             ;// Pack the next two product values


         ;// Row 3  (V columns 0,2,0,2)

         SMULTB  temp1,SrcDst20,rowLuma23                    ;// pSrcDst[9] * (pVRow[2]<<Shift)
         SMULBB  SrcDst20,SrcDst20,rowLuma01                    ;// pSrcDst[8] * (pVRow[0]<<Shift)

         PKHBT   SrcDst12,SrcDst12,temp2,LSL #16               ;// Pack the next two product values
         SMULTB  temp2,SrcDst22,rowLuma23                    ;// pSrcDst[11] * (pVRow[2]<<Shift)
         SMULBB  SrcDst22,SrcDst22,rowLuma01                    ;// pSrcDst[10] * (pVRow[0]<<Shift)

         PKHBT   SrcDst20,SrcDst20,temp1,LSL #16             ;// Pack the next two product values


         ;// Row 4  (V columns 2,1,2,1)

         SMULTT  temp1,SrcDst30,rowLuma01                    ;// pSrcDst[13] * (pVRow[1]<<Shift)
         SMULBB  SrcDst30,SrcDst30,rowLuma23                    ;// pSrcDst[12] * (pVRow[2]<<Shift)

         SMULTT  temp3,SrcDst32,rowLuma01                    ;// pSrcDst[15] * (pVRow[1]<<Shift)
         SMULBB  SrcDst32,SrcDst32,rowLuma23                    ;// pSrcDst[14] * (pVRow[2]<<Shift)

         PKHBT   SrcDst22,SrcDst22,temp2,LSL #16             ;// Pack the remaining product values
         PKHBT   SrcDst30,SrcDst30,temp1,LSL #16
         PKHBT   SrcDst32,SrcDst32,temp3,LSL #16


         ;// Write all 16 scaled coefficients back over the input block
         STMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}


         ;// Set return value
         ;// (nothing is returned by this helper)


         ;// Write function tail
         M_END

     ENDIF                                                    ;//ARM1136JS_U


 ;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
 ;//
 ;// Purpose:
 ;//   Reconstructs one 4x4 block. A 4x4 array of 16-bit residuals is built in a
 ;//   32-byte stack buffer (pDelta), added to the 4x4 prediction bytes at pPred,
 ;//   clamped to [0,255] and written as 4x4 bytes to pDst.
 ;//
 ;//   AC != 0 : armVCM4P10_UnpackBlock4x4(ppSrc, pDelta)
 ;//             armVCM4P10_DequantLumaAC4x4(pDelta, QP)
 ;//             if (pDC != NULL) pDelta[0] = *pDC
 ;//             armVCM4P10_TransformResidual4x4(pDelta, pDelta)
 ;//   AC == 0 : every pDelta[i] = (*pDC + 32) >> 6
 ;//             NOTE(review): pDC is dereferenced without a NULL check on this
 ;//             path - confirm callers never pass AC == 0 together with pDC == NULL.
 ;//
 ;// In:   r0 = ppSrc, r1 = pPred, r2 = pDC, r3 = pDst
 ;//       stack arguments, 4 bytes each, declared in this order:
 ;//       predStep, dstStep, QP, AC
 ;// Out:  r0 = OMX_Sts_NoErr (the only value this routine returns)
 ;//
 ;// NOTE(review): each row of pPred / pDst is accessed with one word LDR / STR;
 ;// presumably the rows must be 4-byte aligned - confirm.

 ;// Guarding implementation by the processor name

     IF  ARM1136JS

 ;//Input Registers
 ppSrc       RN  0
 pPred       RN  1
 pDC         RN  2
 pDst        RN  3


 ;//Output Registers
 result      RN  0

 ;//Local Scratch Registers
 ;// Registers are reused under several names. r4 and r7-r9 hold the values
 ;// that have to survive the three BL calls.
 pDelta      RN  4                   ;// address of the stack buffer
 pDeltaTmp   RN  6                   ;// not referenced in this function
 AC          RN  5                   ;//Load from stack
 pPredTemp   RN  7                   ;// copy of pPred kept across the calls
 pDCTemp     RN  8                   ;// copy of pDC kept across the calls
 pDstTemp    RN  9                   ;// copy of pDst kept across the calls
 pDeltaArg1  RN  1                   ;// r1 as call argument
 pDeltaArg0  RN  0                   ;// r0 as call argument
 QP          RN  1                   ;//Load from stack
 DCval       RN  10                  ;// r10/r11 form the register pair stored by STRD
 DCvalCopy   RN  11
 predstep    RN  1
 dstStep     RN  10                  ;// reuses r10 (DCval) once the DC stores are done
 ycounter    RN  0
 PredVal1    RN  3
 PredVal2    RN  5                   ;// reuses r5 (AC)
 DeltaVal1   RN  2
 DeltaVal2   RN  11
 PredVal     RN  8                   ;// reuses r8 (pDCTemp) after its last read
 tmpDeltaVal RN  6
 sum1        RN  12
 sum2        RN  14                  ;// r14 (lr) used as scratch


     ;// Allocate stack memory required by the function
     ;// 32 bytes = 16 halfword residuals.
     ;// NOTE(review): M_ALLOC8 presumably yields an 8-byte aligned buffer, which
     ;// the STRD stores in the DC-only path rely on - confirm in armCOMM_s.h.
         M_ALLOC8 pBuffer, 32


     ;// Write function header
         M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11

         ;// Define stack arguments
         M_ARG   predStepOnStack, 4
         M_ARG   dstStepOnStack,4
         M_ARG   QPOnStack, 4
         M_ARG   ACOnStack,4


         M_ADR   pDelta,pBuffer                              ;// pDelta = address of the stack buffer
         M_LDR   AC,ACOnStack


         ;// Save registers r1,r2,r3 before function call
         MOV     pPredTemp,pPred
         MOV     pDCTemp,pDC
         MOV     pDstTemp,pDst

         CMP     AC,#0
         BEQ     DCcase                                      ;// AC == 0 : DC-only shortcut
         MOV     pDeltaArg1,pDelta                           ;// Set up r1 for armVCM4P10_UnpackBlock4x4
                                                             ;// (r0 still holds ppSrc)

         BL      armVCM4P10_UnpackBlock4x4

         M_LDR   QP,QPOnStack                                ;// Set up r1 for DequantLumaAC4x4
         MOV     pDeltaArg0,pDelta                           ;// Set up r0 for DequantLumaAC4x4

         BL      armVCM4P10_DequantLumaAC4x4


         ;// If a DC pointer was supplied, *pDC replaces coefficient 0 before the
         ;// transform. The two MOVs do not touch the flags, so the NE condition
         ;// from the CMP still applies to the STRHNE.
         CMP     pDCTemp,#0
         LDRSHNE DCval,[pDCTemp]
         MOV     pDeltaArg0,pDelta                           ;// Set up r0 for armVCM4P10_TransformResidual4x4
         MOV     pDeltaArg1,pDelta                           ;// Set up r1 for armVCM4P10_TransformResidual4x4
         STRHNE  DCval,[pDelta]

         BL      armVCM4P10_TransformResidual4x4
         B       OutDCcase


 DCcase
         ;// DC-only block: all 16 residuals are the same value (*pDC + 32) >> 6
         LDRSH   DCval,[pDCTemp]
         ADD     DCval,DCval,#32
         ASR     DCval,DCval,#6
         PKHBT   DCval,DCval,DCval,LSL #16                  ;// Duplicating the Lower halfword
         MOV     DCvalCopy, DCval                           ;// Needed for STRD
         STRD    DCval, [pDelta, #0]                        ;// pDelta[0]  = pDelta[1]  = pDelta[2]  = pDelta[3] = DCval
         STRD    DCval, [pDelta, #8]                        ;// pDelta[4]  = pDelta[5]  = pDelta[6]  = pDelta[7] = DCval
         STRD    DCval, [pDelta, #16]                       ;// pDelta[8]  = pDelta[9]  = pDelta[10] = pDelta[11] = DCval
         STRD    DCval, [pDelta, #24]                       ;// pDelta[12] = pDelta[13] = pDelta[14] = pDelta[15] = DCval


 OutDCcase
         ;// Both paths join here with the 16 residuals in the stack buffer.
         M_LDR   predstep,predStepOnStack
         M_LDR   dstStep,dstStepOnStack

         LDMIA   pDelta!,{tmpDeltaVal,DeltaVal2}             ;// Pre load residual row 0: [B A], [D C]
         MOV     ycounter,#4                                 ;// Counter for the PredPlusDeltaLoop
         LDR     PredVal,[pPredTemp]                         ;// Pre load prediction row 0: [d c b a]

 PredPlusDeltaLoop
         ;// One pass per row. The flags set by SUBS are used by LDRGT, LDMGTIA
         ;// and BGT further down, so no instruction in between may be
         ;// flag-setting. The GT condition skips the pre-loads on the last
         ;// pass, so nothing is read beyond the fourth row.
         ;// A..D are the row's four residuals and a..d its four prediction
         ;// bytes, in increasing address order.


         SUBS    ycounter,ycounter,#1
         ADD     pPredTemp,pPredTemp,predstep                ;// Increment pPred ptr

         PKHBT   DeltaVal1,tmpDeltaVal,DeltaVal2,LSL #16     ;// Deltaval1 = [C A]
         PKHTB   DeltaVal2,DeltaVal2,tmpDeltaVal,ASR #16     ;// DeltaVal2 = [D B]

         UXTB16  PredVal1,PredVal                            ;// PredVal1 = [0c0a]
         UXTB16  PredVal2,PredVal,ROR #8                     ;// PredVal2 = [0d0b]

         LDRGT   PredVal,[pPredTemp]                         ;// Pre load next prediction row

         QADD16  sum2,DeltaVal2,PredVal2                     ;// Add and saturate to 16 bits
         QADD16  sum1,DeltaVal1,PredVal1

         USAT16  sum2,#8,sum2                                ;// armClip(0,255,sum2)
         USAT16  sum1,#8,sum1

         LDMGTIA   pDelta!,{tmpDeltaVal,DeltaVal2}           ;// Pre load next residual row

         ORR     sum1,sum1,sum2,LSL #8                       ;// sum1 = [dcba]
         STR     sum1,[pDstTemp]                             ;// store four reconstructed bytes

         ADD     pDstTemp,pDstTemp,dstStep                   ;// Increment pDst ptr
         BGT     PredPlusDeltaLoop


         ;// Set return value
         MOV     result,#OMX_Sts_NoErr

 End
         ;// Nothing in the visible source branches to this label.


         ;// Write function tail

         M_END

     ENDIF                                                    ;//ARM1136JS


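 ;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd

 ;// Guarding implementation by the processor name
 ;// NOTE(review): no guarded implementation follows these header comments before
 ;// the END directive. Confirm whether an ARM1136JS_U variant of this function
 ;// is intentionally absent.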


     END
	;//
	;// Copyright (C) 2007 ARM Limited
	;//
	;// Licensed under the Apache License, Version 2.0 (the "License");
	;// you may not use this file except in compliance with the License.
	;// You may obtain a copy of the License at
	;//
	;// http://www.apache.org/licenses/LICENSE-2.0
	;//
	;// Unless required by applicable law or agreed to in writing, software
	;// distributed under the License is distributed on an "AS IS" BASIS,
	;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	;// See the License for the specific language governing permissions and
	;// limitations under the License.
	;//
	;//
	;//
	;// Description:
	;// H.264 inverse quantize and transform module
	;//
	;//



	;// Include standard headers

	INCLUDE omxtypes_s.h
	INCLUDE armCOMM_s.h

	;// Import symbols required from other files
	;// (For example tables)

	IMPORT armVCM4P10_UnpackBlock4x4
	IMPORT armVCM4P10_TransformResidual4x4
	IMPORT armVCM4P10_QPDivTable
	IMPORT armVCM4P10_VMatrixU16
	IMPORT armVCM4P10_QPModuloTable

	M_VARIANTS ARM1136JS, ARM1136JS_U

	;// Set debugging level
	;//DEBUG_ON SETL {TRUE}


	;// Static Function: armVCM4P10_DequantLumaAC4x4   (ARM1136JS variant)
	;//
	;// NOTE(review): an equivalent definition of this function appears earlier
	;// in this file, ahead of an END directive. Confirm whether this copy is
	;// ever assembled.
	;//
	;// Purpose:
	;//   Scales a 4x4 block of 16-bit coefficients in place:
	;//       pSrcDst[i] = pSrcDst[i] * (pVRow[col(i)] << Shift)      i = 0..15
	;//   pVRow  = armVCM4P10_VMatrixU16 + armVCM4P10_QPModuloTable[QP]  (byte offset)
	;//   Shift  = armVCM4P10_QPDivTable[QP]
	;//   col(i) = the fixed 0/1/2 pattern spelled out in the per-row comments below.
	;//   Only the low 16 bits of each 32-bit product are kept (PKHBT packing);
	;//   nothing is saturated.
	;//
	;// In:   r0 = pSrcDst  16 halfword coefficients, read/written as 8 words
	;//                     with a single LDMIA / STMIA
	;//       r1 = QP       byte index into both QP tables
	;// Out:  the block at pSrcDst is overwritten; no return value is set.
	;//
	;// This variant fetches the three V-matrix entries with halfword loads (LDRH).
	;// NOTE(review): LDMIA/STMIA presumably need pSrcDst to be word aligned -
	;// confirm against the callers.

	;// Guarding implementation by the processor name

	IF ARM1136JS

	;//Input Registers
	pSrcDst RN 0
	QP RN 1


	;//Output Registers
	;// (none - the result is written back through pSrcDst)


	;//Local Scratch Registers
	;// Several aliases below share a physical register with an earlier value.
	;// Each one is first written only after the last read of the value it
	;// replaces, so the instruction order in the body matters.
	pQPdiv RN 4
	pQPmod RN 5
	pVRow RN 2
	QPmod RN 6
	shift RN 3
	rowLuma01 RN 1 ;// reuses r1 (QP)
	rowLuma23 RN 4 ;// reuses r4 (pQPdiv)

	SrcDst00 RN 5 ;// reuses r5 (pQPmod)
	SrcDst02 RN 6 ;// reuses r6 (QPmod)
	SrcDst10 RN 7
	SrcDst12 RN 8
	SrcDst20 RN 9
	SrcDst22 RN 10
	SrcDst30 RN 11
	SrcDst32 RN 12

	temp1 RN 2 ;// reuses r2 (pVRow)
	temp2 RN 3 ;// reuses r3 (shift)
	temp3 RN 14 ;// r14 (lr) used as scratch


	;// Allocate stack memory required by the function
	;// (none - everything is kept in registers)

	;// Write function header
	;// NOTE(review): M_START/M_END are macros from armCOMM_s.h (not visible
	;// here). The r11 argument presumably asks for r4-r11 to be saved, and
	;// since r14 is clobbered below the macro pair must also preserve the
	;// return address - confirm.
	M_START armVCM4P10_DequantLumaAC4x4,r11

	;// Base addresses of the three imported lookup tables
	LDR pQPmod,=armVCM4P10_QPModuloTable
	LDR pQPdiv,=armVCM4P10_QPDivTable
	LDR pVRow,=armVCM4P10_VMatrixU16

	LDRSB QPmod,[pQPmod,QP] ;// (QP%6) * 6 : byte offset of the V-matrix row
	LDRSB shift,[pQPdiv,QP] ;// Shift = QP / 6

	;// Fetch the three 16-bit row entries a,b,c; the first load also
	;// advances pVRow by QPmod (pre-indexed with writeback).
	LDRH rowLuma01,[pVRow,QPmod]! ;// rowLuma01 = [00|0a]
	LDRH temp3,[pVRow,#2] ;// temp3 = [00|0b]
	LDRH rowLuma23,[pVRow,#4] ;// rowLuma23 = [00|0c]
	ORR rowLuma01,rowLuma01,temp3,LSL #16 ;// rowLuma01 = [0b|0a]

	;// Load all the 16 'src' values (two halfwords per register)
	LDMIA pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}


	;//*********************************************************************************************
	;//
	;// 'Shift' ranges between [0,8]
	;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
	;// (the shifted bottom halfword must not spill into the top halfword)
	;//
	;//*********************************************************************************************

	LSL rowLuma01,rowLuma01,shift
	LSL rowLuma23,rowLuma23,shift


	;//**********************************************************************************************
	;//
	;// The idea is to unroll the loop completely
	;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
	;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
	;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
	;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
	;// We first calculate pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift, which fits into 16 bits (above)
	;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
	;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
	;//
	;// We then pack the two 16 bit multiplication results into a word and store in one go
	;//
	;// SMULxy picks the Top/Bottom halfword of each operand: x selects the half of the
	;// coefficient register, y the half of the rowLuma register.
	;// Each odd-position product goes to a temp first, because the SMULBB that follows
	;// overwrites the coefficient register with the even-position product.
	;//
	;//**********************************************************************************************


	;// Row 1  (V columns 0,2,0,2)


	SMULTB temp1,SrcDst00,rowLuma23 ;// pSrcDst[1] * (pVRow[2]<<Shift)
	SMULBB SrcDst00,SrcDst00,rowLuma01 ;// pSrcDst[0] * (pVRow[0]<<Shift)

	SMULTB temp2,SrcDst02,rowLuma23 ;// pSrcDst[3] * (pVRow[2]<<Shift)
	SMULBB SrcDst02,SrcDst02,rowLuma01 ;// pSrcDst[2] * (pVRow[0]<<Shift)

	PKHBT SrcDst00,SrcDst00,temp1,LSL #16 ;// Pack the first two product values


	;// Row 2  (V columns 2,1,2,1)
	SMULTT temp1,SrcDst10,rowLuma01 ;// pSrcDst[5] * (pVRow[1]<<Shift)
	SMULBB SrcDst10,SrcDst10,rowLuma23 ;// pSrcDst[4] * (pVRow[2]<<Shift)

	PKHBT SrcDst02,SrcDst02,temp2,LSL #16 ;// Pack the next two product values
	SMULTT temp2,SrcDst12,rowLuma01 ;// pSrcDst[7] * (pVRow[1]<<Shift)
	SMULBB SrcDst12,SrcDst12,rowLuma23 ;// pSrcDst[6] * (pVRow[2]<<Shift)

	PKHBT SrcDst10,SrcDst10,temp1,LSL #16 ;// Pack the next two product values


	;// Row 3  (V columns 0,2,0,2)

	SMULTB temp1,SrcDst20,rowLuma23 ;// pSrcDst[9] * (pVRow[2]<<Shift)
	SMULBB SrcDst20,SrcDst20,rowLuma01 ;// pSrcDst[8] * (pVRow[0]<<Shift)

	PKHBT SrcDst12,SrcDst12,temp2,LSL #16 ;// Pack the next two product values
	SMULTB temp2,SrcDst22,rowLuma23 ;// pSrcDst[11] * (pVRow[2]<<Shift)
	SMULBB SrcDst22,SrcDst22,rowLuma01 ;// pSrcDst[10] * (pVRow[0]<<Shift)

	PKHBT SrcDst20,SrcDst20,temp1,LSL #16 ;// Pack the next two product values



	;// Row 4  (V columns 2,1,2,1)

	SMULTT temp1,SrcDst30,rowLuma01 ;// pSrcDst[13] * (pVRow[1]<<Shift)
	SMULBB SrcDst30,SrcDst30,rowLuma23 ;// pSrcDst[12] * (pVRow[2]<<Shift)

	SMULTT temp3,SrcDst32,rowLuma01 ;// pSrcDst[15] * (pVRow[1]<<Shift)
	SMULBB SrcDst32,SrcDst32,rowLuma23 ;// pSrcDst[14] * (pVRow[2]<<Shift)

	PKHBT SrcDst22,SrcDst22,temp2,LSL #16 ;// Pack the remaining product values
	PKHBT SrcDst30,SrcDst30,temp1,LSL #16
	PKHBT SrcDst32,SrcDst32,temp3,LSL #16


	;// Write all 16 scaled coefficients back over the input block
	STMIA pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}


	;// Set return value
	;// (nothing is returned by this helper)



	;// Write function tail
	M_END

	ENDIF ;//ARM1136JS


	;// Static Function: armVCM4P10_DequantLumaAC4x4   (ARM1136JS_U variant)
	;//
	;// NOTE(review): an equivalent definition of this function appears earlier
	;// in this file, ahead of an END directive. Confirm whether this copy is
	;// ever assembled.
	;//
	;// Purpose:
	;//   Scales a 4x4 block of 16-bit coefficients in place:
	;//       pSrcDst[i] = pSrcDst[i] * (pVRow[col(i)] << Shift)      i = 0..15
	;//   pVRow  = armVCM4P10_VMatrixU16 + armVCM4P10_QPModuloTable[QP]  (byte offset)
	;//   Shift  = armVCM4P10_QPDivTable[QP]
	;//   col(i) = the fixed 0/1/2 pattern spelled out in the per-row comments below.
	;//   Only the low 16 bits of each 32-bit product are kept (PKHBT packing);
	;//   nothing is saturated.
	;//
	;// In:   r0 = pSrcDst  16 halfword coefficients, read/written as 8 words
	;//                     with a single LDMIA / STMIA
	;//       r1 = QP       byte index into both QP tables
	;// Out:  the block at pSrcDst is overwritten; no return value is set.
	;//
	;// This variant fetches the V-matrix entries with two word loads (LDR)
	;// instead of three halfword loads.
	;// NOTE(review): the load offset is described as (QP%6)*6, so these word loads
	;// are not always 4-byte aligned; the "_U" suffix presumably marks a build
	;// with unaligned-access support - confirm.
	;//
	;// Guarding implementation by the processor name

	IF ARM1136JS_U

	;//Input Registers
	pSrcDst RN 0
	QP RN 1


	;//Output Registers
	;// (none - the result is written back through pSrcDst)


	;//Local Scratch Registers
	;// Several aliases below share a physical register with an earlier value.
	;// Each one is first written only after the last read of the value it
	;// replaces, so the instruction order in the body matters.
	pQPdiv RN 4
	pQPmod RN 5
	pVRow RN 2
	QPmod RN 6
	shift RN 3
	rowLuma01 RN 1 ;// reuses r1 (QP)
	rowLuma23 RN 4 ;// reuses r4 (pQPdiv)

	SrcDst00 RN 5 ;// reuses r5 (pQPmod)
	SrcDst02 RN 6 ;// reuses r6 (QPmod)
	SrcDst10 RN 7
	SrcDst12 RN 8
	SrcDst20 RN 9
	SrcDst22 RN 10
	SrcDst30 RN 11
	SrcDst32 RN 12

	temp1 RN 2 ;// reuses r2 (pVRow)
	temp2 RN 3 ;// reuses r3 (shift)
	temp3 RN 14 ;// r14 (lr) used as scratch


	;// Allocate stack memory required by the function
	;// (none - everything is kept in registers)

	;// Write function header
	;// NOTE(review): M_START/M_END are macros from armCOMM_s.h (not visible
	;// here). The r11 argument presumably asks for r4-r11 to be saved, and
	;// since r14 is clobbered below the macro pair must also preserve the
	;// return address - confirm.
	M_START armVCM4P10_DequantLumaAC4x4,r11

	;// Base addresses of the three imported lookup tables
	LDR pQPmod,=armVCM4P10_QPModuloTable
	LDR pQPdiv,=armVCM4P10_QPDivTable
	LDR pVRow,=armVCM4P10_VMatrixU16

	LDRSB QPmod,[pQPmod,QP] ;// (QP%6) * 6 : byte offset of the V-matrix row
	LDRSB shift,[pQPdiv,QP] ;// Shift = QP / 6

	;// Two word loads fetch four consecutive halfwords a,b,c,d; the first
	;// load also advances pVRow by QPmod (pre-indexed with writeback).
	;// 'd' is the halfword that follows the three entries actually used:
	;// every multiply below reads only the bottom half of rowLuma23.
	LDR rowLuma01,[pVRow,QPmod]! ;// rowLuma01 = [0b|0a]
	LDR rowLuma23,[pVRow,#4] ;// rowLuma23 = [0d|0c]

	;// Load all the 16 'src' values (two halfwords per register)
	LDMIA pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}


	;//*********************************************************************************************
	;//
	;// 'Shift' ranges between [0,8]
	;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
	;// (the shifted bottom halfword must not spill into the top halfword)
	;//
	;//*********************************************************************************************

	LSL rowLuma01,rowLuma01,shift
	LSL rowLuma23,rowLuma23,shift


	;//**********************************************************************************************
	;//
	;// The idea is to unroll the loop completely
	;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
	;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
	;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
	;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
	;// We first calculate pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift, which fits into 16 bits (above)
	;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
	;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
	;//
	;// We then pack the two 16 bit multiplication results into a word and store in one go
	;//
	;// SMULxy picks the Top/Bottom halfword of each operand: x selects the half of the
	;// coefficient register, y the half of the rowLuma register.
	;// Each odd-position product goes to a temp first, because the SMULBB that follows
	;// overwrites the coefficient register with the even-position product.
	;//
	;//**********************************************************************************************


	;// Row 1  (V columns 0,2,0,2)


	SMULTB temp1,SrcDst00,rowLuma23 ;// pSrcDst[1] * (pVRow[2]<<Shift)
	SMULBB SrcDst00,SrcDst00,rowLuma01 ;// pSrcDst[0] * (pVRow[0]<<Shift)

	SMULTB temp2,SrcDst02,rowLuma23 ;// pSrcDst[3] * (pVRow[2]<<Shift)
	SMULBB SrcDst02,SrcDst02,rowLuma01 ;// pSrcDst[2] * (pVRow[0]<<Shift)

	PKHBT SrcDst00,SrcDst00,temp1,LSL #16 ;// Pack the first two product values


	;// Row 2  (V columns 2,1,2,1)
	SMULTT temp1,SrcDst10,rowLuma01 ;// pSrcDst[5] * (pVRow[1]<<Shift)
	SMULBB SrcDst10,SrcDst10,rowLuma23 ;// pSrcDst[4] * (pVRow[2]<<Shift)

	PKHBT SrcDst02,SrcDst02,temp2,LSL #16 ;// Pack the next two product values
	SMULTT temp2,SrcDst12,rowLuma01 ;// pSrcDst[7] * (pVRow[1]<<Shift)
	SMULBB SrcDst12,SrcDst12,rowLuma23 ;// pSrcDst[6] * (pVRow[2]<<Shift)

	PKHBT SrcDst10,SrcDst10,temp1,LSL #16 ;// Pack the next two product values


	;// Row 3  (V columns 0,2,0,2)

	SMULTB temp1,SrcDst20,rowLuma23 ;// pSrcDst[9] * (pVRow[2]<<Shift)
	SMULBB SrcDst20,SrcDst20,rowLuma01 ;// pSrcDst[8] * (pVRow[0]<<Shift)

	PKHBT SrcDst12,SrcDst12,temp2,LSL #16 ;// Pack the next two product values
	SMULTB temp2,SrcDst22,rowLuma23 ;// pSrcDst[11] * (pVRow[2]<<Shift)
	SMULBB SrcDst22,SrcDst22,rowLuma01 ;// pSrcDst[10] * (pVRow[0]<<Shift)

	PKHBT SrcDst20,SrcDst20,temp1,LSL #16 ;// Pack the next two product values



	;// Row 4  (V columns 2,1,2,1)

	SMULTT temp1,SrcDst30,rowLuma01 ;// pSrcDst[13] * (pVRow[1]<<Shift)
	SMULBB SrcDst30,SrcDst30,rowLuma23 ;// pSrcDst[12] * (pVRow[2]<<Shift)

	SMULTT temp3,SrcDst32,rowLuma01 ;// pSrcDst[15] * (pVRow[1]<<Shift)
	SMULBB SrcDst32,SrcDst32,rowLuma23 ;// pSrcDst[14] * (pVRow[2]<<Shift)

	PKHBT SrcDst22,SrcDst22,temp2,LSL #16 ;// Pack the remaining product values
	PKHBT SrcDst30,SrcDst30,temp1,LSL #16
	PKHBT SrcDst32,SrcDst32,temp3,LSL #16


	;// Write all 16 scaled coefficients back over the input block
	STMIA pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}


	;// Set return value
	;// (nothing is returned by this helper)



	;// Write function tail
	M_END

	ENDIF ;//ARM1136JS_U





	;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
	;//
	;// NOTE(review): an equivalent definition of this function appears earlier
	;// in this file, ahead of an END directive. Confirm whether this copy is
	;// ever assembled.
	;//
	;// Purpose:
	;//   Reconstructs one 4x4 block. A 4x4 array of 16-bit residuals is built in a
	;//   32-byte stack buffer (pDelta), added to the 4x4 prediction bytes at pPred,
	;//   clamped to [0,255] and written as 4x4 bytes to pDst.
	;//
	;//   AC != 0 : armVCM4P10_UnpackBlock4x4(ppSrc, pDelta)
	;//             armVCM4P10_DequantLumaAC4x4(pDelta, QP)
	;//             if (pDC != NULL) pDelta[0] = *pDC
	;//             armVCM4P10_TransformResidual4x4(pDelta, pDelta)
	;//   AC == 0 : every pDelta[i] = (*pDC + 32) >> 6
	;//             NOTE(review): pDC is dereferenced without a NULL check on this
	;//             path - confirm callers never pass AC == 0 together with pDC == NULL.
	;//
	;// In:   r0 = ppSrc, r1 = pPred, r2 = pDC, r3 = pDst
	;//       stack arguments, 4 bytes each, declared in this order:
	;//       predStep, dstStep, QP, AC
	;// Out:  r0 = OMX_Sts_NoErr (the only value this routine returns)
	;//
	;// NOTE(review): each row of pPred / pDst is accessed with one word LDR / STR;
	;// presumably the rows must be 4-byte aligned - confirm.

	;// Guarding implementation by the processor name

	IF ARM1136JS

	;//Input Registers
	ppSrc RN 0
	pPred RN 1
	pDC RN 2
	pDst RN 3


	;//Output Registers
	result RN 0

	;//Local Scratch Registers
	;// Registers are reused under several names. r4 and r7-r9 hold the values
	;// that have to survive the three BL calls.
	pDelta RN 4 ;// address of the stack buffer
	pDeltaTmp RN 6 ;// not referenced in this function
	AC RN 5 ;//Load from stack
	pPredTemp RN 7 ;// copy of pPred kept across the calls
	pDCTemp RN 8 ;// copy of pDC kept across the calls
	pDstTemp RN 9 ;// copy of pDst kept across the calls
	pDeltaArg1 RN 1 ;// r1 as call argument
	pDeltaArg0 RN 0 ;// r0 as call argument
	QP RN 1 ;//Load from stack
	DCval RN 10 ;// r10/r11 form the register pair stored by STRD
	DCvalCopy RN 11
	predstep RN 1
	dstStep RN 10 ;// reuses r10 (DCval) once the DC stores are done
	ycounter RN 0
	PredVal1 RN 3
	PredVal2 RN 5 ;// reuses r5 (AC)
	DeltaVal1 RN 2
	DeltaVal2 RN 11
	PredVal RN 8 ;// reuses r8 (pDCTemp) after its last read
	tmpDeltaVal RN 6
	sum1 RN 12
	sum2 RN 14 ;// r14 (lr) used as scratch



	;// Allocate stack memory required by the function
	;// 32 bytes = 16 halfword residuals.
	;// NOTE(review): M_ALLOC8 presumably yields an 8-byte aligned buffer, which
	;// the STRD stores in the DC-only path rely on - confirm in armCOMM_s.h.
	M_ALLOC8 pBuffer, 32


	;// Write function header
	M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11

	;// Define stack arguments
	M_ARG predStepOnStack, 4
	M_ARG dstStepOnStack,4
	M_ARG QPOnStack, 4
	M_ARG ACOnStack,4


	M_ADR pDelta,pBuffer ;// pDelta = address of the stack buffer
	M_LDR AC,ACOnStack


	;// Save registers r1,r2,r3 before function call
	MOV pPredTemp,pPred
	MOV pDCTemp,pDC
	MOV pDstTemp,pDst

	CMP AC,#0
	BEQ DCcase ;// AC == 0 : DC-only shortcut
	MOV pDeltaArg1,pDelta ;// Set up r1 for armVCM4P10_UnpackBlock4x4 (r0 still holds ppSrc)

	BL armVCM4P10_UnpackBlock4x4

	M_LDR QP,QPOnStack ;// Set up r1 for DequantLumaAC4x4
	MOV pDeltaArg0,pDelta ;// Set up r0 for DequantLumaAC4x4

	BL armVCM4P10_DequantLumaAC4x4


	;// If a DC pointer was supplied, *pDC replaces coefficient 0 before the
	;// transform. The two MOVs do not touch the flags, so the NE condition
	;// from the CMP still applies to the STRHNE.
	CMP pDCTemp,#0
	LDRSHNE DCval,[pDCTemp]
	MOV pDeltaArg0,pDelta ;// Set up r0 for armVCM4P10_TransformResidual4x4
	MOV pDeltaArg1,pDelta ;// Set up r1 for armVCM4P10_TransformResidual4x4
	STRHNE DCval,[pDelta]

	BL armVCM4P10_TransformResidual4x4
	B OutDCcase


	DCcase
	;// DC-only block: all 16 residuals are the same value (*pDC + 32) >> 6
	LDRSH DCval,[pDCTemp]
	ADD DCval,DCval,#32
	ASR DCval,DCval,#6
	PKHBT DCval,DCval,DCval,LSL #16 ;// Duplicating the Lower halfword
	MOV DCvalCopy, DCval ;// Needed for STRD
	STRD DCval, [pDelta, #0] ;// pDelta[0] = pDelta[1] = pDelta[2] = pDelta[3] = DCval
	STRD DCval, [pDelta, #8] ;// pDelta[4] = pDelta[5] = pDelta[6] = pDelta[7] = DCval
	STRD DCval, [pDelta, #16] ;// pDelta[8] = pDelta[9] = pDelta[10] = pDelta[11] = DCval
	STRD DCval, [pDelta, #24] ;// pDelta[12] = pDelta[13] = pDelta[14] = pDelta[15] = DCval


	OutDCcase
	;// Both paths join here with the 16 residuals in the stack buffer.
	M_LDR predstep,predStepOnStack
	M_LDR dstStep,dstStepOnStack

	LDMIA pDelta!,{tmpDeltaVal,DeltaVal2} ;// Pre load residual row 0: [B A], [D C]
	MOV ycounter,#4 ;// Counter for the PredPlusDeltaLoop
	LDR PredVal,[pPredTemp] ;// Pre load prediction row 0: [d c b a]

	PredPlusDeltaLoop
	;// One pass per row. The flags set by SUBS are used by LDRGT, LDMGTIA
	;// and BGT further down, so no instruction in between may be
	;// flag-setting. The GT condition skips the pre-loads on the last
	;// pass, so nothing is read beyond the fourth row.
	;// A..D are the row's four residuals and a..d its four prediction
	;// bytes, in increasing address order.


	SUBS ycounter,ycounter,#1
	ADD pPredTemp,pPredTemp,predstep ;// Increment pPred ptr

	PKHBT DeltaVal1,tmpDeltaVal,DeltaVal2,LSL #16 ;// Deltaval1 = [C A]
	PKHTB DeltaVal2,DeltaVal2,tmpDeltaVal,ASR #16 ;// DeltaVal2 = [D B]

	UXTB16 PredVal1,PredVal ;// PredVal1 = [0c0a]
	UXTB16 PredVal2,PredVal,ROR #8 ;// PredVal2 = [0d0b]

	LDRGT PredVal,[pPredTemp] ;// Pre load next prediction row

	QADD16 sum2,DeltaVal2,PredVal2 ;// Add and saturate to 16 bits
	QADD16 sum1,DeltaVal1,PredVal1

	USAT16 sum2,#8,sum2 ;// armClip(0,255,sum2)
	USAT16 sum1,#8,sum1

	LDMGTIA pDelta!,{tmpDeltaVal,DeltaVal2} ;// Pre load next residual row

	ORR sum1,sum1,sum2,LSL #8 ;// sum1 = [dcba]
	STR sum1,[pDstTemp] ;// store four reconstructed bytes

	ADD pDstTemp,pDstTemp,dstStep ;// Increment pDst ptr
	BGT PredPlusDeltaLoop


	;// Set return value
	MOV result,#OMX_Sts_NoErr

	End
	;// Nothing in the visible source branches to this label.


	;// Write function tail

	M_END

	ENDIF ;//ARM1136JS


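	;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd

	;// Guarding implementation by the processor name
	;// NOTE(review): no guarded implementation follows these header comments before
	;// the END directive. Confirm whether an ARM1136JS_U variant of this function
	;// is intentionally absent.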




	END