;// blob: d8c2431251a023b171f4d7262de79565cb754a84 [file] [log] [blame]
;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;// http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name: omxVCM4P10_TransformDequantLumaDCFromPair_s.s
;// OpenMAX DL: v1.0.2
;// Revision: 12290
;// Date: Wednesday, April 9, 2008
;//
;//
;//
;//
;// Description:
;// H.264 inverse quantize and transform module
;//
;//
;// Include standard headers
INCLUDE omxtypes_s.h
INCLUDE armCOMM_s.h
;// Import/Export symbols required from/to other files
;// (For example tables)
IMPORT armVCM4P10_UnpackBlock4x4
IMPORT armVCM4P10_QPDivTable
IMPORT armVCM4P10_VMatrixQPModTable
M_VARIANTS CortexA8
;// Set debugging level
;//DEBUG_ON SETL {TRUE}
;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
;//
;// In-place 4x4 inverse transform followed by dequantisation of a block
;// of sixteen signed 16-bit coefficients.
;//
;//   r0 (pData) : address of the 4x4 block; the result is stored back to
;//                the same address
;//   r1 (QP)    : byte index used to look up armVCM4P10_QPDivTable and
;//                armVCM4P10_VMatrixQPModTable
;//
;// Working registers: r2-r5 and d0-d13 (q3-q6 are the 32-bit accumulators).
;// Only assembled when the CortexA8 variant is selected.

        IF      CortexA8

;// ---- ARM register aliases -------------------------------------------
;// Arguments
pData           RN 0
QP              RN 1
;// Scratch
pDivLUT         RN 2                    ;// base of armVCM4P10_QPDivTable
pModLUT         RN 3                    ;// base of armVCM4P10_VMatrixQPModTable
ShiftAmt        RN 4                    ;// pDivLUT[QP]
ScaleFac        RN 5                    ;// pModLUT[QP] << ShiftAmt

;// ---- NEON register aliases ------------------------------------------
;// d0-d3 hold the block throughout; a different alias set is used for
;// each stage so the code reads as a pipeline.

;// Stage 0: block as loaded (already transposed by VLD4)
dSrc0           DN D0.S16
dSrc1           DN D1.S16
dSrc2           DN D2.S16
dSrc3           DN D3.S16

;// Butterfly temporaries, shared by the row pass and the column pass
dSum01          DN D4.S16
dSum23          DN D5.S16
dDif01          DN D6.S16
dDif23          DN D7.S16

;// Stage 1: after the row pass (Q views are used for the 32-bit
;// step of the transpose)
dRow0           DN D0.S16
dRow1           DN D1.S16
dRow2           DN D2.S16
dRow3           DN D3.S16
qRow01          QN Q0.32
qRow23          QN Q1.32

;// Stage 2: after the column pass
dCol0           DN D0.S16
dCol1           DN D1.S16
dCol2           DN D2.S16
dCol3           DN D3.S16

;// Dequantisation scratch.
;// dScaleVec reuses D5 (dSum23) and qAcc0 overlaps D6/D7 (dDif01/dDif23),
;// so none of these may be written before the column pass has finished.
dScaleVec       DN D5.S16
qAcc0           QN Q3.S32
qAcc1           QN Q4.S32
qAcc2           QN Q5.S32
qAcc3           QN Q6.S32

;// Stage 3: transformed and dequantised output
dRes0           DN D0.S16
dRes1           DN D1.S16
dRes2           DN D2.S16
dRes3           DN D3.S16

        ;// Function prologue. r5 and d13 are the highest ARM and NEON
        ;// registers touched below; presumably M_START uses them to decide
        ;// which callee-saved registers to stack (see armCOMM_s.h).
        M_START armVCM4P10_InvTransformDequantLumaDC4x4,r5,d13

        ;//------------------------------------------------------------------
        ;// Outline
        ;//   1. Load the 4x4 block into four D registers, transposed
        ;//   2. Row pass, executed on the columns with SIMD
        ;//   3. Transpose the 4x4 intermediate result
        ;//   4. Column pass
        ;//   5. Dequantise, narrow to 16 bits and store
        ;//------------------------------------------------------------------

        ;// VLD4 de-interleaves by four, so dSrcN receives elements
        ;// N, N+4, N+8 and N+12: the block arrives already transposed.
        VLD4    {dSrc0,dSrc1,dSrc2,dSrc3},[pData]

        LDR     pDivLUT, =armVCM4P10_QPDivTable         ;// table giving the shift for each QP
        LDR     pModLUT, =armVCM4P10_VMatrixQPModTable  ;// table giving the scale for each QP

        ;//------------------------------------------------------------------
        ;// Row pass (performed on columns)
        ;//
        ;// The scalar scale-factor computation is deliberately woven between
        ;// the NEON instructions so that the two streams can dual issue.
        ;//------------------------------------------------------------------
        VADD    dSum01,dSrc0,dSrc1
        VADD    dSum23,dSrc2,dSrc3
        VSUB    dDif01,dSrc0,dSrc1
        LDRSB   ShiftAmt, [pDivLUT, QP]                 ;// scalar: ShiftAmt = QPDivTable[QP]
        VSUB    dDif23,dSrc2,dSrc3
        LDRSB   ScaleFac, [pModLUT, QP]                 ;// scalar: ScaleFac = VMatrixQPModTable[QP]

        VADD    dRow0,dSum01,dSum23
        VSUB    dRow1,dSum01,dSum23
        VSUB    dRow2,dDif01,dDif23
        LSL     ScaleFac, ScaleFac, ShiftAmt            ;// scalar: ScaleFac <<= ShiftAmt
        VADD    dRow3,dDif01,dDif23

        ;//------------------------------------------------------------------
        ;// Transpose the intermediate 4x4 matrix: swap 16-bit lanes within
        ;// each D pair, then 32-bit lanes between the two Q registers.
        ;//------------------------------------------------------------------
        VTRN    dRow0,dRow1
        VTRN    dRow2,dRow3
        VTRN    qRow01,qRow23

        ;//------------------------------------------------------------------
        ;// Column pass (same butterfly as the row pass)
        ;//------------------------------------------------------------------
        VADD    dSum01,dRow0,dRow1
        VADD    dSum23,dRow2,dRow3
        VSUB    dDif01,dRow0,dRow1
        VSUB    dDif23,dRow2,dRow3

        VADD    dCol0,dSum01,dSum23
        VSUB    dCol1,dSum01,dSum23
        VSUB    dCol2,dDif01,dDif23
        VADD    dCol3,dDif01,dDif23

        ;//------------------------------------------------------------------
        ;// Dequantise
        ;//
        ;// The C reference subtracts 2 from the shift and therefore needs
        ;// separate paths for Shift >= 0 and Shift < 0. Here the scale was
        ;// pre-shifted once (ScaleFac << ShiftAmt) and every product is
        ;// shifted right by a constant 2 with a rounding term of 2, which
        ;// handles both cases with no branches and much less code.
        ;//------------------------------------------------------------------
        VDUP    dScaleVec, ScaleFac                     ;// broadcast the scalar scale to all lanes

        VMOV    qAcc0,#2                                ;// preload each accumulator with Round = 2
        VMOV    qAcc1,#2
        VMOV    qAcc2,#2
        VMOV    qAcc3,#2

        VMLAL   qAcc0,dCol0,dScaleVec                   ;// acc = coeff * scale + Round (32-bit)
        VMLAL   qAcc1,dCol1,dScaleVec
        VMLAL   qAcc2,dCol2,dScaleVec
        VMLAL   qAcc3,dCol3,dScaleVec

        VSHRN   dRes0,qAcc0,#2                          ;// >> 2 and narrow back to 16 bits
        VSHRN   dRes1,qAcc1,#2
        VSHRN   dRes2,qAcc2,#2
        VSHRN   dRes3,qAcc3,#2

        ;//------------------------------------------------------------------
        ;// Write the 4x4 result back over the input block
        ;//------------------------------------------------------------------
        VST1    {dRes0,dRes1,dRes2,dRes3}, [pData]

        ;// No return value is set here; function epilogue
        M_END

        ENDIF                                           ;// CortexA8
;// Function: omxVCM4P10_TransformDequantLumaDCFromPair
;//
;// Thin wrapper that chains two helpers:
;//   1. armVCM4P10_UnpackBlock4x4 is called with the incoming r0/r1
;//      (ppSrc, pDst) left untouched.
;//   2. armVCM4P10_InvTransformDequantLumaDC4x4 is called with
;//      r0 = pDst and r1 = QP.
;//
;//   r0 (ppSrc) : first argument, forwarded as-is to the unpack helper
;//   r1 (pDst)  : destination block pointer
;//   r2         : QP
;//
;// Always returns OMX_Sts_NoErr in r0; no argument checking is done here.

;// Incoming arguments
ppSrc           RN 0
pDst            RN 1
QPIn            RN 2

;// Return value
retCode         RN 0

;// Copies of pDst / QP kept in r4 / r5 across the first call
pDstSaved       RN 4
QPSaved         RN 5

;// Argument registers for the second call
pBlock          RN 0
QPCall          RN 1

        ;// Built only for the CortexA8 variant
        IF      CortexA8

        ;// Function prologue (r5 is the highest ARM register used below)
        M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5

        ;// Stash QP and pDst in r5/r4 before the first BL; they are needed
        ;// again afterwards and r1/r2 are presumably not preserved by it.
        MOV     QPSaved,QPIn
        MOV     pDstSaved,pDst
        BL      armVCM4P10_UnpackBlock4x4

        ;// Set up (pData, QP) for the transform/dequant routine
        MOV     QPCall,QPSaved
        MOV     pBlock,pDstSaved
        BL      armVCM4P10_InvTransformDequantLumaDC4x4

        ;// Report success
        MOV     retCode,#OMX_Sts_NoErr

        ;// Function epilogue
        M_END

        ENDIF                                           ;// CortexA8
END