blob: 4787982feb525782ae0bd1b397fe69987515be63 [file] [log] [blame]
;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;// http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name: omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
;// OpenMAX DL: v1.0.2
;// Revision: 12290
;// Date: Wednesday, April 9, 2008
;//
;//
;//
;//
;// Description:
;// H.264 inverse quantize and transform module
;//
;//
;// Include standard headers
INCLUDE omxtypes_s.h
INCLUDE armCOMM_s.h
;// Import symbols required from other files
;// (For example tables)
IMPORT armVCM4P10_UnpackBlock4x4
IMPORT armVCM4P10_TransformResidual4x4
IMPORT armVCM4P10_QPDivTable
IMPORT armVCM4P10_VMatrixU16
IMPORT armVCM4P10_QPModuloTable
M_VARIANTS CortexA8
;// Set debugging level
;//DEBUG_ON SETL {TRUE}
;// Static Function: armVCM4P10_DequantLumaAC4x4
;// Guarding implementation by the processor name
;// Guarding implementation by the processor name
;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
;// Guarding implementation by the processor name
;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
;// Guarding implementation by the processor name
IF CortexA8
;// ARM Registers
;//Input Registers
ppSrc RN 0
pPred RN 1
pDC RN 2
pDst RN 3
;//Output Registers
result RN 0
;//Local Scratch Registers
;//Registers used in armVCM4P10_DequantLumaAC4x4
pQPdiv RN 10
pQPmod RN 11
pVRow RN 2
QPmod RN 12
shift RN 14
index0 RN 1
index1 RN 10
;//Registers used in DequantTransformResidualFromPairAndAdd
pDelta RN 4
pDeltaTmp RN 6
AC RN 5 ;//Load from stack
pPredTemp RN 7
pDCTemp RN 8
pDstTemp RN 9
pDeltaArg1 RN 1
pDeltaArg0 RN 0
QP RN 1 ;//Load from stack
DCval RN 10
predstep RN 1
dstStep RN 10
PredVal1 RN 3
PredVal2 RN 5
;// Neon Registers
;// Registers used in armVCM4P10_DequantLumaAC4x4
dVmatrix DN D6.8
dindexRow0 DN D7.32
dindexRow1 DN D9.32
dByteIndexRow0 DN D7.8
dByteIndexRow1 DN D9.8
dVRow0 DN D8.8
dVRow1 DN D4.8
dVRow0U16 DN D8.U16
dVRow1U16 DN D4.U16
dVRow2U16 DN D8.U16
dVRow3U16 DN D4.U16
dShift DN D5.U16
dSrcRow0 DN D0.I16
dSrcRow1 DN D1.I16
dSrcRow2 DN D2.I16
dSrcRow3 DN D3.I16
dDqntRow0 DN D0.I16
dDqntRow1 DN D1.I16
dDqntRow2 DN D2.I16
dDqntRow3 DN D3.I16
;// Registers used in TransformResidual4x4
;// Packed Input pixels
dIn0 DN D0.S16
dIn1 DN D1.S16
dIn2 DN D2.S16
dIn3 DN D3.S16
qIn01 QN Q0.32
qIn23 QN Q1.32
;// Intermediate calculations
dZero DN D4.S16
de0 DN D5.S16
de1 DN D6.S16
de2 DN D7.S16
de3 DN D8.S16
dIn1RS DN D7.S16
dIn3RS DN D8.S16
df0 DN D0.S16
df1 DN D1.S16
df2 DN D2.S16
df3 DN D3.S16
qf01 QN Q0.32
qf23 QN Q1.32
dg0 DN D5.S16
dg1 DN D6.S16
dg2 DN D7.S16
dg3 DN D8.S16
df1RS DN D7.S16
df3RS DN D8.S16
;// Output pixels
dh0 DN D0.S16
dh1 DN D1.S16
dh2 DN D2.S16
dh3 DN D3.S16
;// Registers used in DequantTransformResidualFromPairAndAdd
dDeltaRow0 DN D0.S16
dDeltaRow1 DN D1.S16
dDeltaRow2 DN D2.S16
dDeltaRow3 DN D3.S16
qDeltaRow01 QN Q0.S16
qDeltaRow23 QN Q1.S16
dPredValRow01 DN D4.U8
dPredValRow23 DN D5.U8
qSumRow01 QN Q3.S16
qSumRow23 QN Q4.S16
dDstRow01 DN D0.U8
dDstRow23 DN D1.U8
dDstRow0 DN D0.32[0]
dDstRow1 DN D0.32[1]
dDstRow2 DN D1.32[0]
dDstRow3 DN D1.32[1]
;// Allocate stack memory required by the function
M_ALLOC8 pBuffer, 32
;// Write function header
M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11,d9
;// Define stack arguments
M_ARG predStepOnStack, 4
M_ARG dstStepOnStack,4
M_ARG QPOnStack, 4
M_ARG ACOnStack,4
M_ADR pDelta,pBuffer
M_LDR AC,ACOnStack
;// Save registers r1,r2,r3 before function call
MOV pPredTemp,pPred
MOV pDCTemp,pDC
MOV pDstTemp,pDst
CMP AC,#0
BEQ DCcase
MOV pDeltaArg1,pDelta ;// Set up r1 for armVCM4P10_UnpackBlock4x4
BL armVCM4P10_UnpackBlock4x4
;//--------------------------------------------------------
;// armVCM4P10_DequantLumaAC4x4 : static function inlined
;//--------------------------------------------------------
;//BL armVCM4P10_DequantLumaAC4x4
M_LDR QP,QPOnStack ;// Set up r1 for armVCM4P10_DequantLumaAC4x4
LDR pQPmod,=armVCM4P10_QPModuloTable
LDR pQPdiv,=armVCM4P10_QPDivTable
LDR pVRow,=armVCM4P10_VMatrixU16
LDRSB QPmod,[pQPmod,QP] ;// (QP%6) * 6
LDRSB shift,[pQPdiv,QP] ;// Shift = QP / 6
LDR index1,=0x03020504
LDR index0,=0x05040100 ;// Indexes into dVmatrix
ADD pVRow,pVRow,QPmod
VDUP dindexRow0,index0
VDUP dindexRow1,index1
VDUP dShift,shift
;// Load all 4x4 pVRow[] values
VLD1 dVmatrix,[pVRow] ;// dVmatrix = [0d|0c|0b|0a]
VTBL dVRow0,dVmatrix,dByteIndexRow0 ;// row0 = row2 = [pVRow[2] | pVRow[0] | pVRow[2] | pVRow[0]]
VTBL dVRow1,dVmatrix,dByteIndexRow1 ;// row1 = row3 = [pVRow[1] | pVRow[2] | pVRow[1] | pVRow[2]]
CMP pDCTemp,#0
;// Load all the 4x4 'src' values
VLD1 { dSrcRow0,dSrcRow1,dSrcRow2,dSrcRow3 },[pDelta]
VSHL dVRow0U16,dVRow0U16,dShift
VSHL dVRow1U16,dVRow1U16,dShift
LDRSHNE DCval,[pDCTemp]
;// Multiply src[] with pVRow[]
VMUL dDqntRow0,dSrcRow0,dVRow0U16
VMUL dDqntRow1,dSrcRow1,dVRow1U16
VMUL dDqntRow2,dSrcRow2,dVRow2U16
VMUL dDqntRow3,dSrcRow3,dVRow3U16
;//-------------------------------------------------------------
;// TransformResidual4x4 : Inlined to avoid Load/Stores
;//-------------------------------------------------------------
;//BL armVCM4P10_TransformResidual4x4
;//STRHNE DCval,[pDelta]
VMOVNE dIn0[0],DCval
;//*****************************************************************
;// Transpose the input pixels : perform Row ops as Col ops
;//*****************************************************************
VTRN dIn0,dIn1
VTRN dIn2,dIn3
VTRN qIn01,qIn23
VMOV dZero,#0 ;// Used to right shift by 1
;//****************************************
;// Row Operations (Performed on columns)
;//****************************************
VADD de0,dIn0,dIn2 ;// e0 = d0 + d2
VSUB de1,dIn0,dIn2 ;// e1 = d0 - d2
VHADD dIn1RS,dIn1,dZero ;// (f1>>1) constZero is a register holding 0
VHADD dIn3RS,dIn3,dZero
VSUB de2,dIn1RS,dIn3 ;// e2 = (d1>>1) - d3
VADD de3,dIn1,dIn3RS ;// e3 = d1 + (d3>>1)
VADD df0,de0,de3 ;// f0 = e0 + e3
VADD df1,de1,de2 ;// f1 = e1 + e2
VSUB df2,de1,de2 ;// f2 = e1 - e2
VSUB df3,de0,de3 ;// f3 = e0 - e3
;//*****************************************************************
;// Transpose the resultant matrix
;//*****************************************************************
VTRN df0,df1
VTRN df2,df3
VTRN qf01,qf23
;//*******************************
;// Coloumn Operations
;//*******************************
VADD dg0,df0,df2 ;// e0 = d0 + d2
VSUB dg1,df0,df2 ;// e1 = d0 - d2
VHADD df1RS,df1,dZero ;// (f1>>1) constZero is a register holding 0
VHADD df3RS,df3,dZero
VSUB dg2,df1RS,df3 ;// e2 = (d1>>1) - d3
VADD dg3,df1,df3RS ;// e3 = d1 + (d3>>1)
VADD dh0,dg0,dg3 ;// f0 = e0 + e3
VADD dh1,dg1,dg2 ;// f1 = e1 + e2
VSUB dh2,dg1,dg2 ;// f2 = e1 - e2
VSUB dh3,dg0,dg3 ;// f3 = e0 - e3
;//************************************************
;// Calculate final value (colOp[i][j] + 32)>>6
;//************************************************
VRSHR dh0,#6
VRSHR dh1,#6
VRSHR dh2,#6
VRSHR dh3,#6
B OutDCcase
DCcase
;// Calculate the Transformed DCvalue : (DCval+32)>>6
LDRSH DCval,[pDCTemp]
ADD DCval,DCval,#32
ASR DCval,DCval,#6
VDUP dDeltaRow0, DCval ;// pDelta[0] = pDelta[1] = pDelta[2] = pDelta[3] = DCval
VDUP dDeltaRow1, DCval ;// pDelta[4] = pDelta[5] = pDelta[6] = pDelta[7] = DCval
VDUP dDeltaRow2, DCval ;// pDelta[8] = pDelta[9] = pDelta[10] = pDelta[11] = DCval
VDUP dDeltaRow3, DCval
OutDCcase
M_LDR predstep,predStepOnStack
M_LDR dstStep,dstStepOnStack
LDR PredVal1,[pPredTemp],predstep
LDR PredVal2,[pPredTemp],predstep
VMOV dPredValRow01,PredVal1,PredVal2
LDR PredVal1,[pPredTemp],predstep
LDR PredVal2,[pPredTemp]
VMOV dPredValRow23,PredVal1,PredVal2
VADDW qSumRow01,qDeltaRow01,dPredValRow01
VADDW qSumRow23,qDeltaRow23,dPredValRow23
VQMOVUN dDstRow01,qSumRow01
VQMOVUN dDstRow23,qSumRow23
VST1 dDstRow0,[pDstTemp],dstStep
VST1 dDstRow1,[pDstTemp],dstStep
VST1 dDstRow2,[pDstTemp],dstStep
VST1 dDstRow3,[pDstTemp]
;// Set return value
MOV result,#OMX_Sts_NoErr
End
;// Write function tail
M_END
ENDIF ;//CORTEXA8
END