blob: 3187f2be2638d7c70586bcae1a04fe3cded4201a [file] [log] [blame]
;//
;// Copyright (C) 2007 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;// http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// Description:
;// H.264 inverse quantize and transform module
;//
;//
;// Include standard headers
INCLUDE omxtypes_s.h
INCLUDE armCOMM_s.h
;// Import symbols required from other files
;// (For example tables)
IMPORT armVCM4P10_UnpackBlock4x4
IMPORT armVCM4P10_TransformResidual4x4
IMPORT armVCM4P10_QPDivTable
IMPORT armVCM4P10_VMatrixU16
IMPORT armVCM4P10_QPModuloTable
M_VARIANTS ARM1136JS, ARM1136JS_U
;// Set debugging level
;//DEBUG_ON SETL {TRUE}
;// Static Function: armVCM4P10_DequantLumaAC4x4
;// Guarding implementation by the processor name
IF ARM1136JS
;//--------------------------------------------------------------------------
;// armVCM4P10_DequantLumaAC4x4 (ARM1136JS variant)
;//
;// Scales a 4x4 block of 16-bit coefficients in place.
;//
;// In:  r0 = pSrcDst : 16 halfword coefficients, read and written back as
;//                     8 words with LDMIA/STMIA
;//      r1 = QP      : used as a byte index into armVCM4P10_QPModuloTable
;//                     and armVCM4P10_QPDivTable (no range check here)
;// Out: the block at pSrcDst is overwritten; no return value is set.
;//
;// Per coefficient: pSrcDst[i] = low 16 bits of
;//                  pSrcDst[i] * (V[k] << shift), where V[0..2] are three
;// halfwords of armVCM4P10_VMatrixU16 starting at byte offset
;// QPModuloTable[QP], shift = QPDivTable[QP], and k follows the pattern
;//      rows 1 and 3 : 0 2 0 2
;//      rows 2 and 4 : 2 1 2 1
;//
;// This variant only uses halfword loads for the V entries (compare the
;// ARM1136JS_U variant, which uses word loads).
;//
;// NOTE: nearly every scratch alias below shares a physical register with
;// an earlier value, so the instruction order is significant.
;//--------------------------------------------------------------------------
;//Input Registers
pSrcDst RN 0
QP RN 1
;//Output Registers
;//Local Scratch Registers
pQPdiv RN 4
pQPmod RN 5
pVRow RN 2
QPmod RN 6
shift RN 3
rowLuma01 RN 1 ;// shares r1 with QP (QP is dead after both table lookups)
rowLuma23 RN 4 ;// shares r4 with pQPdiv
SrcDst00 RN 5 ;// shares r5 with pQPmod
SrcDst02 RN 6 ;// shares r6 with QPmod
SrcDst10 RN 7
SrcDst12 RN 8
SrcDst20 RN 9
SrcDst22 RN 10
SrcDst30 RN 11
SrcDst32 RN 12
temp1 RN 2 ;// shares r2 with pVRow
temp2 RN 3 ;// shares r3 with shift
temp3 RN 14
;// Allocate stack memory required by the function
;// Write function header
;// (M_START is defined in armCOMM_s.h; the ",r11" argument presumably makes
;// it preserve registers up to r11 - confirm against the macro definition)
M_START armVCM4P10_DequantLumaAC4x4,r11
LDR pQPmod,=armVCM4P10_QPModuloTable
LDR pQPdiv,=armVCM4P10_QPDivTable
LDR pVRow,=armVCM4P10_VMatrixU16
LDRSB QPmod,[pQPmod,QP] ;// (QP%6) * 6 : byte offset of the V row
LDRSB shift,[pQPdiv,QP] ;// Shift = QP / 6
;// Pre-indexed load with writeback: pVRow now points at the selected row.
;// This also overwrites r1, so QP must not be read after this point.
LDRH rowLuma01,[pVRow,QPmod]! ;// rowLuma01 = [00|0a]
LDRH temp3,[pVRow,#2] ;// temp3 = [00|0b]
LDRH rowLuma23,[pVRow,#4] ;// rowLuma23 = [00|0c]
ORR rowLuma01,rowLuma01,temp3,LSL #16 ;// rowLuma01 = [0b|0a]
;// Load all the 16 'src' values (two halfwords per register).
;// This overwrites r5/r6 (pQPmod/QPmod), which are no longer needed.
LDMIA pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
;//*********************************************************************************************
;//
;// 'Shift' ranges between [0,8]
;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
;// (the bottom halfword must not overflow into the top one; this relies on the
;// table contents, which are defined in another file)
;//
;//*********************************************************************************************
LSL rowLuma01,rowLuma01,shift
LSL rowLuma23,rowLuma23,shift
;//**********************************************************************************************
;//
;// The idea is to unroll the Loop completely
;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
;// We first calculate pVRow[armVCM4P10_PosToVCol4x4[i]]) << Shift which fits into 16 bits (above)
;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
;//
;// We then pack the two 16 bit multiplication result into a word and store at one go
;// (PKHBT keeps only the low halfword of each product)
;//
;// SMUL<x><y>: <x> selects the Top/Bottom halfword of the coefficient register,
;// <y> selects the Top/Bottom halfword of the rowLuma register.
;// From here on r2/r3 are used as temp1/temp2 (pVRow and shift are dead).
;//
;//**********************************************************************************************
;// Row 1
SMULTB temp1,SrcDst00,rowLuma23 ;// pSrcDst[1] * (pVRow[2]<<Shift)
SMULBB SrcDst00,SrcDst00,rowLuma01 ;// pSrcDst[0] * (pVRow[0]<<Shift)
SMULTB temp2,SrcDst02,rowLuma23 ;// pSrcDst[3] * (pVRow[2]<<Shift)
SMULBB SrcDst02,SrcDst02,rowLuma01 ;// pSrcDst[2] * (pVRow[0]<<Shift)
PKHBT SrcDst00,SrcDst00,temp1,LSL #16 ;// Pack the first two product values
;// Row 2
SMULTT temp1,SrcDst10,rowLuma01 ;// pSrcDst[5] * (pVRow[1]<<Shift)
SMULBB SrcDst10,SrcDst10,rowLuma23 ;// pSrcDst[4] * (pVRow[2]<<Shift)
PKHBT SrcDst02,SrcDst02,temp2,LSL #16 ;// Pack the next two product values
SMULTT temp2,SrcDst12,rowLuma01 ;// pSrcDst[7] * (pVRow[1]<<Shift)
SMULBB SrcDst12,SrcDst12,rowLuma23 ;// pSrcDst[6] * (pVRow[2]<<Shift)
PKHBT SrcDst10,SrcDst10,temp1,LSL #16 ;// Pack the next two product values
;// Row 3
SMULTB temp1,SrcDst20,rowLuma23 ;// pSrcDst[9] * (pVRow[2]<<Shift)
SMULBB SrcDst20,SrcDst20,rowLuma01 ;// pSrcDst[8] * (pVRow[0]<<Shift)
PKHBT SrcDst12,SrcDst12,temp2,LSL #16 ;// Pack the next two product values
SMULTB temp2,SrcDst22,rowLuma23 ;// pSrcDst[11] * (pVRow[2]<<Shift)
SMULBB SrcDst22,SrcDst22,rowLuma01 ;// pSrcDst[10] * (pVRow[0]<<Shift)
PKHBT SrcDst20,SrcDst20,temp1,LSL #16 ;// Pack the next two product values
;// Row 4
;// temp2 still holds the unpacked pSrcDst[11] product here, so the second
;// row-4 product goes into temp3 instead.
SMULTT temp1,SrcDst30,rowLuma01 ;// pSrcDst[13] * (pVRow[1]<<Shift)
SMULBB SrcDst30,SrcDst30,rowLuma23 ;// pSrcDst[12] * (pVRow[2]<<Shift)
SMULTT temp3,SrcDst32,rowLuma01 ;// pSrcDst[15] * (pVRow[1]<<Shift)
SMULBB SrcDst32,SrcDst32,rowLuma23 ;// pSrcDst[14] * (pVRow[2]<<Shift)
PKHBT SrcDst22,SrcDst22,temp2,LSL #16 ;// Pack the remaining product values
PKHBT SrcDst30,SrcDst30,temp1,LSL #16
PKHBT SrcDst32,SrcDst32,temp3,LSL #16
;// Write all 16 results back over the input block
STMIA pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
;// Set return value
;// (none: r0 still holds pSrcDst on exit)
;// Write function tail
M_END
ENDIF ;//ARM1136JS
;// Guarding implementation by the processor name
IF ARM1136JS_U
;//--------------------------------------------------------------------------
;// armVCM4P10_DequantLumaAC4x4 (ARM1136JS_U variant)
;//
;// Scales a 4x4 block of 16-bit coefficients in place.
;//
;// In:  r0 = pSrcDst : 16 halfword coefficients, read and written back as
;//                     8 words with LDMIA/STMIA
;//      r1 = QP      : used as a byte index into armVCM4P10_QPModuloTable
;//                     and armVCM4P10_QPDivTable (no range check here)
;// Out: the block at pSrcDst is overwritten; no return value is set.
;//
;// Per coefficient: pSrcDst[i] = low 16 bits of
;//                  pSrcDst[i] * (V[k] << shift), where V[0..2] are three
;// halfwords of armVCM4P10_VMatrixU16 starting at byte offset
;// QPModuloTable[QP], shift = QPDivTable[QP], and k follows the pattern
;//      rows 1 and 3 : 0 2 0 2
;//      rows 2 and 4 : 2 1 2 1
;//
;// This variant fetches V[0] and V[1] with a single word load from an
;// address that is only halfword-aligned in general (the row offset is a
;// multiple of 6 bytes per the "(QP%6) * 6" table), i.e. it presumably
;// relies on the unaligned-access support implied by the _U suffix.
;//
;// NOTE: nearly every scratch alias below shares a physical register with
;// an earlier value, so the instruction order is significant.
;//--------------------------------------------------------------------------
;//Input Registers
pSrcDst RN 0
QP RN 1
;//Output Registers
;//Local Scratch Registers
pQPdiv RN 4
pQPmod RN 5
pVRow RN 2
QPmod RN 6
shift RN 3
rowLuma01 RN 1 ;// shares r1 with QP (QP is dead after both table lookups)
rowLuma23 RN 4 ;// shares r4 with pQPdiv
SrcDst00 RN 5 ;// shares r5 with pQPmod
SrcDst02 RN 6 ;// shares r6 with QPmod
SrcDst10 RN 7
SrcDst12 RN 8
SrcDst20 RN 9
SrcDst22 RN 10
SrcDst30 RN 11
SrcDst32 RN 12
temp1 RN 2 ;// shares r2 with pVRow
temp2 RN 3 ;// shares r3 with shift
temp3 RN 14
;// Allocate stack memory required by the function
;// Write function header
;// (M_START is defined in armCOMM_s.h; the ",r11" argument presumably makes
;// it preserve registers up to r11 - confirm against the macro definition)
M_START armVCM4P10_DequantLumaAC4x4,r11
LDR pQPmod,=armVCM4P10_QPModuloTable
LDR pQPdiv,=armVCM4P10_QPDivTable
LDR pVRow,=armVCM4P10_VMatrixU16
LDRSB QPmod,[pQPmod,QP] ;// (QP%6) * 6 : byte offset of the V row
LDRSB shift,[pQPdiv,QP] ;// Shift = QP / 6
;// Pre-indexed load with writeback: pVRow now points at the selected row.
;// This also overwrites r1, so QP must not be read after this point.
LDR rowLuma01,[pVRow,QPmod]! ;// rowLuma01 = [0b|0a]
;// Halfword load on purpose: only pVRow[2] is consumed (every multiply below
;// reads the Bottom half of rowLuma23). A word load here would also fetch the
;// halfword following the three-entry row, which for the last row presumably
;// lies beyond the end of the table (table definition not visible here).
LDRH rowLuma23,[pVRow,#4] ;// rowLuma23 = [00|0c]
;// Load all the 16 'src' values (two halfwords per register).
;// This overwrites r5/r6 (pQPmod/QPmod), which are no longer needed.
LDMIA pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
;//*********************************************************************************************
;//
;// 'Shift' ranges between [0,8]
;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
;// (the bottom halfword must not overflow into the top one; this relies on the
;// table contents, which are defined in another file)
;//
;//*********************************************************************************************
LSL rowLuma01,rowLuma01,shift
LSL rowLuma23,rowLuma23,shift
;//**********************************************************************************************
;//
;// The idea is to unroll the Loop completely
;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
;// We first calculate pVRow[armVCM4P10_PosToVCol4x4[i]]) << Shift which fits into 16 bits (above)
;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
;//
;// We then pack the two 16 bit multiplication result into a word and store at one go
;// (PKHBT keeps only the low halfword of each product)
;//
;// SMUL<x><y>: <x> selects the Top/Bottom halfword of the coefficient register,
;// <y> selects the Top/Bottom halfword of the rowLuma register.
;// From here on r2/r3 are used as temp1/temp2 (pVRow and shift are dead).
;//
;//**********************************************************************************************
;// Row 1
SMULTB temp1,SrcDst00,rowLuma23 ;// pSrcDst[1] * (pVRow[2]<<Shift)
SMULBB SrcDst00,SrcDst00,rowLuma01 ;// pSrcDst[0] * (pVRow[0]<<Shift)
SMULTB temp2,SrcDst02,rowLuma23 ;// pSrcDst[3] * (pVRow[2]<<Shift)
SMULBB SrcDst02,SrcDst02,rowLuma01 ;// pSrcDst[2] * (pVRow[0]<<Shift)
PKHBT SrcDst00,SrcDst00,temp1,LSL #16 ;// Pack the first two product values
;// Row 2
SMULTT temp1,SrcDst10,rowLuma01 ;// pSrcDst[5] * (pVRow[1]<<Shift)
SMULBB SrcDst10,SrcDst10,rowLuma23 ;// pSrcDst[4] * (pVRow[2]<<Shift)
PKHBT SrcDst02,SrcDst02,temp2,LSL #16 ;// Pack the next two product values
SMULTT temp2,SrcDst12,rowLuma01 ;// pSrcDst[7] * (pVRow[1]<<Shift)
SMULBB SrcDst12,SrcDst12,rowLuma23 ;// pSrcDst[6] * (pVRow[2]<<Shift)
PKHBT SrcDst10,SrcDst10,temp1,LSL #16 ;// Pack the next two product values
;// Row 3
SMULTB temp1,SrcDst20,rowLuma23 ;// pSrcDst[9] * (pVRow[2]<<Shift)
SMULBB SrcDst20,SrcDst20,rowLuma01 ;// pSrcDst[8] * (pVRow[0]<<Shift)
PKHBT SrcDst12,SrcDst12,temp2,LSL #16 ;// Pack the next two product values
SMULTB temp2,SrcDst22,rowLuma23 ;// pSrcDst[11] * (pVRow[2]<<Shift)
SMULBB SrcDst22,SrcDst22,rowLuma01 ;// pSrcDst[10] * (pVRow[0]<<Shift)
PKHBT SrcDst20,SrcDst20,temp1,LSL #16 ;// Pack the next two product values
;// Row 4
;// temp2 still holds the unpacked pSrcDst[11] product here, so the second
;// row-4 product goes into temp3 instead.
SMULTT temp1,SrcDst30,rowLuma01 ;// pSrcDst[13] * (pVRow[1]<<Shift)
SMULBB SrcDst30,SrcDst30,rowLuma23 ;// pSrcDst[12] * (pVRow[2]<<Shift)
SMULTT temp3,SrcDst32,rowLuma01 ;// pSrcDst[15] * (pVRow[1]<<Shift)
SMULBB SrcDst32,SrcDst32,rowLuma23 ;// pSrcDst[14] * (pVRow[2]<<Shift)
PKHBT SrcDst22,SrcDst22,temp2,LSL #16 ;// Pack the remaining product values
PKHBT SrcDst30,SrcDst30,temp1,LSL #16
PKHBT SrcDst32,SrcDst32,temp3,LSL #16
;// Write all 16 results back over the input block
STMIA pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
;// Set return value
;// (none: r0 still holds pSrcDst on exit)
;// Write function tail
M_END
ENDIF ;//ARM1136JS_U
;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
;// Guarding implementation by the processor name
IF ARM1136JS
;//--------------------------------------------------------------------------
;// omxVCM4P10_DequantTransformResidualFromPairAndAdd (ARM1136JS variant)
;//
;// Builds a 4x4 block of 16-bit residuals in a stack buffer, adds it to a
;// 4x4 block of 8-bit prediction pixels, clips to [0,255] and stores the
;// result to pDst.
;//
;// In:  r0 = ppSrc   : passed through unchanged to armVCM4P10_UnpackBlock4x4
;//      r1 = pPred   : prediction pixels, read one word (4 bytes) per row
;//      r2 = pDC     : pointer to a signed 16-bit DC value (see below)
;//      r3 = pDst    : destination, written one word (4 bytes) per row
;//      stack args, in order: predStep, dstStep, QP, AC
;// Out: r0 = OMX_Sts_NoErr (the only value this routine returns; no
;//      argument checks are performed here)
;//
;// Residual generation:
;//   AC != 0 : UnpackBlock4x4(ppSrc, pDelta); DequantLumaAC4x4(pDelta, QP);
;//             if pDC != NULL, pDelta[0] is replaced by *pDC;
;//             TransformResidual4x4(pDelta, pDelta).
;//   AC == 0 : all 16 residuals are set to (*pDC + 32) >> 6.
;//             pDC is dereferenced unconditionally on this path.
;//
;// NOTE: many aliases below share physical registers (listed next to each
;// definition); the code depends on the lifetimes not overlapping.
;//--------------------------------------------------------------------------
;//Input Registers
ppSrc RN 0
pPred RN 1
pDC RN 2
pDst RN 3
;//Output Registers
result RN 0
;//Local Scratch Registers
pDelta RN 4
pDeltaTmp RN 6 ;// not referenced by any instruction in this block
AC RN 5 ;//Load from stack
pPredTemp RN 7
pDCTemp RN 8
pDstTemp RN 9
pDeltaArg1 RN 1
pDeltaArg0 RN 0
QP RN 1 ;//Load from stack
DCval RN 10
DCvalCopy RN 11 ;// must be DCval+1: the pair is stored with STRD
predstep RN 1
dstStep RN 10 ;// shares r10 with DCval
ycounter RN 0
PredVal1 RN 3
PredVal2 RN 5 ;// shares r5 with AC
DeltaVal1 RN 2
DeltaVal2 RN 11 ;// shares r11 with DCvalCopy
PredVal RN 8 ;// shares r8 with pDCTemp
tmpDeltaVal RN 6
sum1 RN 12
sum2 RN 14
;// Allocate stack memory required by the function
;// 32 bytes = 16 halfword residuals (M_ALLOC8 presumably gives 8-byte
;// alignment, which the STRD stores below would rely on - confirm in armCOMM_s.h)
M_ALLOC8 pBuffer, 32
;// Write function header
M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11
;// Define stack arguments
M_ARG predStepOnStack, 4
M_ARG dstStepOnStack,4
M_ARG QPOnStack, 4
M_ARG ACOnStack,4
M_ADR pDelta,pBuffer
M_LDR AC,ACOnStack
;// Save registers r1,r2,r3 before function call
;// (copies live in r7-r9; r1-r3 are reused as arguments/scratch below)
MOV pPredTemp,pPred
MOV pDCTemp,pDC
MOV pDstTemp,pDst
CMP AC,#0
BEQ DCcase
;// ---- AC != 0 : unpack, dequantise and inverse-transform ----
;// r0 still holds ppSrc from function entry (assuming the macros above
;// leave r0 untouched)
MOV pDeltaArg1,pDelta ;// Set up r1 for armVCM4P10_UnpackBlock4x4
BL armVCM4P10_UnpackBlock4x4
M_LDR QP,QPOnStack ;// Set up r1 for DequantLumaAC4x4
MOV pDeltaArg0,pDelta ;// Set up r0 for DequantLumaAC4x4
BL armVCM4P10_DequantLumaAC4x4
;// If a DC pointer was supplied, overwrite coefficient 0 with *pDC.
;// The flags from this CMP are consumed by LDRSHNE and STRHNE; the two MOVs
;// in between do not alter them.
CMP pDCTemp,#0
LDRSHNE DCval,[pDCTemp]
MOV pDeltaArg0,pDelta ;// Set up r0 for armVCM4P10_TransformResidual4x4
MOV pDeltaArg1,pDelta ;// Set up r1 for armVCM4P10_TransformResidual4x4
STRHNE DCval,[pDelta]
BL armVCM4P10_TransformResidual4x4
B OutDCcase
DCcase
;// ---- AC == 0 : every residual is the rounded DC value ----
LDRSH DCval,[pDCTemp] ;// No NULL check on pDC here
ADD DCval,DCval,#32 ;// Rounding offset for the >>6 below
ASR DCval,DCval,#6
PKHBT DCval,DCval,DCval,LSL #16 ;// Duplicating the Lower halfword
MOV DCvalCopy, DCval ;// Needed for STRD
STRD DCval, [pDelta, #0] ;// pDelta[0] = pDelta[1] = pDelta[2] = pDelta[3] = DCval
STRD DCval, [pDelta, #8] ;// pDelta[4] = pDelta[5] = pDelta[6] = pDelta[7] = DCval
STRD DCval, [pDelta, #16] ;// pDelta[8] = pDelta[9] = pDelta[10] = pDelta[11] = DCval
STRD DCval, [pDelta, #24] ;// pDelta[12] = pDelta[13] = pDelta[14] = pDelta[15] = DCval
OutDCcase
;// ---- Add residual to prediction, clip, store (4 rows of 4 pixels) ----
M_LDR predstep,predStepOnStack
M_LDR dstStep,dstStepOnStack ;// Overwrites r10 (DCval), no longer needed
LDMIA pDelta!,{tmpDeltaVal,DeltaVal2} ;// Pre load : row 0 residuals [B A],[D C]
MOV ycounter,#4 ;// Counter for the PredPlusDeltaLoop
LDR PredVal,[pPredTemp] ;// Pre load : row 0 prediction bytes [d c b a]
PredPlusDeltaLoop
;// The flags set by this SUBS are the only flags used in the loop body:
;// they control LDRGT, LDMGTIA and BGT, so no instruction in between may
;// update the condition flags. GT is false on the last row, which skips
;// the pre-loads for a non-existent next row.
SUBS ycounter,ycounter,#1
ADD pPredTemp,pPredTemp,predstep ;// Increment pPred ptr
PKHBT DeltaVal1,tmpDeltaVal,DeltaVal2,LSL #16 ;// Deltaval1 = [C A]
PKHTB DeltaVal2,DeltaVal2,tmpDeltaVal,ASR #16 ;// DeltaVal2 = [D B]
UXTB16 PredVal1,PredVal ;// PredVal1 = [0c0a]
UXTB16 PredVal2,PredVal,ROR #8 ;// PredVal2 = [0d0b]
LDRGT PredVal,[pPredTemp] ;// Pre load next row's prediction
QADD16 sum2,DeltaVal2,PredVal2 ;// Add and saturate to 16 bits
QADD16 sum1,DeltaVal1,PredVal1
USAT16 sum2,#8,sum2 ;// armClip(0,255,sum2)
USAT16 sum1,#8,sum1
LDMGTIA pDelta!,{tmpDeltaVal,DeltaVal2} ;// Pre load next row's residuals
ORR sum1,sum1,sum2,LSL #8 ;// sum1 = [dcba]
STR sum1,[pDstTemp]
ADD pDstTemp,pDstTemp,dstStep ;// Increment pDst ptr
BGT PredPlusDeltaLoop
;// Set return value
MOV result,#OMX_Sts_NoErr
End
;// Write function tail
M_END
ENDIF ;//ARM1136JS
;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
;// Guarding implementation by the processor name
END