blob: 3c1aec305fd2a4ff3d480be3048cee01576bbde2 [file] [log] [blame]
;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;// http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name: omxVCM4P2_MCReconBlock_s.s
;// OpenMAX DL: v1.0.2
;// Revision: 12290
;// Date: Wednesday, April 9, 2008
;//
;//
;//
;//
;// Description:
;//
;//
;// Include standard headers
INCLUDE omxtypes_s.h
INCLUDE armCOMM_s.h
;// Import symbols required from other files
M_VARIANTS CortexA8
;// ***************************************************************************
;// ARM1136JS implementation
;// ***************************************************************************
;// ***************************************************************************
;// CortexA8 implementation
;// ***************************************************************************
IF CortexA8
;// ***************************************************************************
;// MACRO DEFINITIONS
;// ***************************************************************************
;// Description:
;// Does interpolation for the case of "IntegerPixel" predictType. Both
;// rounding cases are handled. Just copies a block from pSrc to pDst
;//
;// Syntax:
;// M_MCRECONBLOCK_IntegerPixel
;//
;// Inputs: None
;// Outputs: None
MACRO
M_MCRECONBLOCK_IntegerPixel
;// Straight copy of eight rows, one D register (8 bytes) per row.
;// Both rounding variants share this body: the two case labels below are
;// adjacent, so either switch target lands on the first load.
CaseIntegerPixel_Rnd0
CaseIntegerPixel_Rnd1
;// Read all eight source rows first. Every load post-increments pSrc by
;// srcStep, so pSrc walks down the block one row at a time.
VLD1 dRow0, [pSrc], srcStep
VLD1 dRow1, [pSrc], srcStep
VLD1 dRow2, [pSrc], srcStep
VLD1 dRow3, [pSrc], srcStep
VLD1 dRow4, [pSrc], srcStep
VLD1 dRow5, [pSrc], srcStep
VLD1 dRow6, [pSrc], srcStep
VLD1 dRow7, [pSrc], srcStep
;// Write the rows back unmodified. The @64 qualifier marks the pDst address
;// as 64-bit aligned. Every store post-increments pDst by dstStep, so on
;// exit pDst points 8*dstStep past where it started.
VST1 dRow0, [pDst@64], dstStep
VST1 dRow1, [pDst@64], dstStep
VST1 dRow2, [pDst@64], dstStep
VST1 dRow3, [pDst@64], dstStep
VST1 dRow4, [pDst@64], dstStep
VST1 dRow5, [pDst@64], dstStep
VST1 dRow6, [pDst@64], dstStep
VST1 dRow7, [pDst@64], dstStep
;// Jump over the remaining case bodies to the common tail.
B SwitchPredictTypeEnd
MEND
;// ***************************************************************************
;// Description:
;// Does interpolation for the case of "HalfPixelX" predictType. The two
;// rounding cases are handled by the parameter "$rndVal". Averages between
;// a pixel and the pixel to its right, rounding based on $rndVal. The
;// rounding is implemented by using opCode switching between "VRHADD" and
;// "VHADD" instructions.
;//
;// Syntax:
;// M_MCRECONBLOCK_HalfPixelX $rndVal
;//
;// Inputs:
;// $rndVal: 0 for rounding and 1 for no rounding
;// Outputs: None
MACRO
M_MCRECONBLOCK_HalfPixelX $rndVal
;// Horizontal half-pixel interpolation of eight rows. The macro is expanded
;// once per rounding mode; the only difference between the two expansions is
;// the halving-add opcode chosen below and the name of the case label.
LCLS M_VHADDR
;// Assembly-time opcode selection: parameter value 0 picks the rounding
;// halving add (VRHADD), any other value picks the truncating one (VHADD).
IF $rndVal = 0
M_VHADDR SETS "VRHADD"
ELSE
M_VHADDR SETS "VHADD"
ENDIF
;// Case label; the macro parameter is appended to make it unique per expansion.
CaseHalfPixelX_Rnd$rndVal
;// For each row: load 16 bytes into the pair (dRowN, dRowNShft), then use
;// VEXT #1 on the pair to overwrite dRowNShft with the same row shifted by
;// one byte, i.e. each lane now holds the right-hand neighbour of the
;// corresponding lane in dRowN. pSrc advances by srcStep after every load.
VLD1 {dRow0, dRow0Shft}, [pSrc], srcStep
VEXT dRow0Shft, dRow0, dRow0Shft, #1
VLD1 {dRow1, dRow1Shft}, [pSrc], srcStep
VEXT dRow1Shft, dRow1, dRow1Shft, #1
VLD1 {dRow2, dRow2Shft}, [pSrc], srcStep
VEXT dRow2Shft, dRow2, dRow2Shft, #1
VLD1 {dRow3, dRow3Shft}, [pSrc], srcStep
VEXT dRow3Shft, dRow3, dRow3Shft, #1
VLD1 {dRow4, dRow4Shft}, [pSrc], srcStep
VEXT dRow4Shft, dRow4, dRow4Shft, #1
VLD1 {dRow5, dRow5Shft}, [pSrc], srcStep
VEXT dRow5Shft, dRow5, dRow5Shft, #1
VLD1 {dRow6, dRow6Shft}, [pSrc], srcStep
VEXT dRow6Shft, dRow6, dRow6Shft, #1
VLD1 {dRow7, dRow7Shft}, [pSrc], srcStep
VEXT dRow7Shft, dRow7, dRow7Shft, #1
;// Average every row with its shifted copy (result replaces dRowN) and store.
;// The store of row N is placed after the average of row N+1, so each
;// store is separated from the instruction that produced its data.
;// pDst advances by dstStep after every store (8*dstStep in total).
$M_VHADDR dRow0, dRow0, dRow0Shft
$M_VHADDR dRow1, dRow1, dRow1Shft
VST1 dRow0, [pDst@64], dstStep
$M_VHADDR dRow2, dRow2, dRow2Shft
VST1 dRow1, [pDst@64], dstStep
$M_VHADDR dRow3, dRow3, dRow3Shft
VST1 dRow2, [pDst@64], dstStep
$M_VHADDR dRow4, dRow4, dRow4Shft
VST1 dRow3, [pDst@64], dstStep
$M_VHADDR dRow5, dRow5, dRow5Shft
VST1 dRow4, [pDst@64], dstStep
$M_VHADDR dRow6, dRow6, dRow6Shft
VST1 dRow5, [pDst@64], dstStep
$M_VHADDR dRow7, dRow7, dRow7Shft
VST1 dRow6, [pDst@64], dstStep
VST1 dRow7, [pDst@64], dstStep
;// Jump over the remaining case bodies to the common tail.
B SwitchPredictTypeEnd
MEND
;// ***************************************************************************
;// Description:
;// Does interpolation for the case of "HalfPixelY" predictType. The two
;// rounding cases are handled by the parameter "$rndVal". Averages between
;// a pixel and the pixel below it, rounding based on $rndVal. The
;// rounding is implemented by using opCode switching between "VRHADD" and
;// "VHADD" instructions.
;//
;// Syntax:
;// M_MCRECONBLOCK_HalfPixelY $rndVal
;//
;// Inputs:
;// $rndVal: 0 for rounding and 1 for no rounding
;// Outputs: None
MACRO
M_MCRECONBLOCK_HalfPixelY $rndVal
;// Vertical half-pixel interpolation of eight rows. The macro is expanded
;// once per rounding mode; the only difference between the two expansions is
;// the halving-add opcode chosen below and the name of the case label.
LCLS M_VHADDR
;// Assembly-time opcode selection: parameter value 0 picks the rounding
;// halving add (VRHADD), any other value picks the truncating one (VHADD).
IF $rndVal = 0
M_VHADDR SETS "VRHADD"
ELSE
M_VHADDR SETS "VHADD"
ENDIF
;// Case label; the macro parameter is appended to make it unique per expansion.
CaseHalfPixelY_Rnd$rndVal
;// Load source rows 0..7, one D register (8 bytes) each. pSrc advances by
;// srcStep after every load.
VLD1 dRow0, [pSrc], srcStep
VLD1 dRow1, [pSrc], srcStep
VLD1 dRow2, [pSrc], srcStep
VLD1 dRow3, [pSrc], srcStep
VLD1 dRow4, [pSrc], srcStep
VLD1 dRow5, [pSrc], srcStep
VLD1 dRow6, [pSrc], srcStep
VLD1 dRow7, [pSrc], srcStep
;// Output row N = average of source rows N and N+1, written over dRowN.
;// Because dRowN is overwritten only after row N-1 has already consumed it,
;// the averages must stay in ascending row order.
$M_VHADDR dRow0, dRow0, dRow1
;// Ninth source row: needed only as the lower neighbour of row 7.
VLD1 dRow8, [pSrc], srcStep
$M_VHADDR dRow1, dRow1, dRow2
VST1 dRow0, [pDst@64], dstStep
$M_VHADDR dRow2, dRow2, dRow3
VST1 dRow1, [pDst@64], dstStep
$M_VHADDR dRow3, dRow3, dRow4
VST1 dRow2, [pDst@64], dstStep
$M_VHADDR dRow4, dRow4, dRow5
VST1 dRow3, [pDst@64], dstStep
$M_VHADDR dRow5, dRow5, dRow6
VST1 dRow4, [pDst@64], dstStep
$M_VHADDR dRow6, dRow6, dRow7
VST1 dRow5, [pDst@64], dstStep
$M_VHADDR dRow7, dRow7, dRow8
VST1 dRow6, [pDst@64], dstStep
VST1 dRow7, [pDst@64], dstStep
;// Jump over the remaining case bodies to the common tail. pDst has advanced
;// by 8*dstStep at this point.
B SwitchPredictTypeEnd
MEND
;// ***************************************************************************
;// Description:
;// Does interpolation for the case of "HalfPixelXY" predictType. Both
;// rounding cases are handled.
;// Typical computation for a row goes like this
;// 1. VLD1 {dRow0, dRow0Shft}, [pSrc], srcStep ;// Load the row and next 8 bytes
;// 2. VEXT dRow0Shft, dRow0, dRow0Shft, #1 ;// Generate the shifted row
;// 3. VADDL qSum0, dRow0, dRow0Shft ;// Generate the sum of row and shifted row
;// 5. VADD qSum0, qSum0, qSum1 ;// Add to the sum of next row (odd row sum has rounding value added to it)
;// 6. VSHRN dRow0, qSum0, #2 ;// Divide by 4
;// 7. VST1 dRow0, [pDst@64], dstStep ;// Store
;// Odd rows undergo following computation after step 3
;// 4. VADD qSum1, qSum1, qRound
;// This saves for adding rounding value to each final sum (overall saves 4
;// instructions).
;// There is reuse of registers for qSum6, qSum7 & qSum8. Overall scheduling takes
;// care of this and also minimizes stalls. Rounding value was modified in
;// ARM register rndVal (originally used for rounding flag) before the switch.
;// It is then populated into all lanes in this macro. No branching out to
;// label "SwitchPredictTypeEnd" is required in the end of the macro as these
;// are the last of switch cases.
;//
;// Syntax:
;// M_MCRECONBLOCK_HalfPixelXY
;//
;// Inputs: None
;// Outputs: None
MACRO
M_MCRECONBLOCK_HalfPixelXY
;// Diagonal half-pixel interpolation of eight rows:
;//   out[N] = (hsum[N] + hsum[N+1] + round) >> 2
;// where hsum[N] is the 16-bit lane-wise sum of source row N and the same
;// row shifted by one pixel. Nine source rows are read to produce eight
;// output rows. Both rounding variants share this body; the rounding
;// constant arrives in the core register rndVal.
;// The instruction order below is hand-interleaved and also protects the
;// register reuse described further down -- do not reorder casually.
CaseHalfPixelXY_Rnd0
CaseHalfPixelXY_Rnd1
VLD1 {dRow0, dRow0Shft}, [pSrc], srcStep
;// Broadcast the rounding constant to every 16-bit lane of qRound.
VDUP qRound, rndVal
;// Load rows 1..8 (16 bytes each) while generating the shifted copies (VEXT)
;// and the widened horizontal sums (VADDL) of the rows already loaded.
VLD1 {dRow1, dRow1Shft}, [pSrc], srcStep
VEXT dRow0Shft, dRow0, dRow0Shft, #1
VLD1 {dRow2, dRow2Shft}, [pSrc], srcStep
VEXT dRow1Shft, dRow1, dRow1Shft, #1
VLD1 {dRow3, dRow3Shft}, [pSrc], srcStep
VEXT dRow2Shft, dRow2, dRow2Shft, #1
VLD1 {dRow4, dRow4Shft}, [pSrc], srcStep
VADDL qSum0, dRow0, dRow0Shft
VLD1 {dRow5, dRow5Shft}, [pSrc], srcStep
VADDL qSum1, dRow1, dRow1Shft
VLD1 {dRow6, dRow6Shft}, [pSrc], srcStep
VEXT dRow3Shft, dRow3, dRow3Shft, #1
VLD1 {dRow7, dRow7Shft}, [pSrc], srcStep
VEXT dRow4Shft, dRow4, dRow4Shft, #1
VLD1 {dRow8, dRow8Shft}, [pSrc], srcStep
;// The rounding constant is folded into the odd-row sums only (1, 3, 5, 7).
;// Every output row adds exactly one odd-row sum, so each output receives
;// the constant exactly once.
VADD qSum1, qSum1, qRound
VADDL qSum2, dRow2, dRow2Shft
VEXT dRow5Shft, dRow5, dRow5Shft, #1
;// Output rows 0 and 1: add the neighbouring row sum, then narrow with >>2.
VADD qSum0, qSum0, qSum1
VADDL qSum3, dRow3, dRow3Shft
VEXT dRow6Shft, dRow6, dRow6Shft, #1
VADD qSum1, qSum1, qSum2
VSHRN dRow0, qSum0, #2
VADDL qSum4, dRow4, dRow4Shft
VSHRN dRow1, qSum1, #2
VADD qSum3, qSum3, qRound
VADDL qSum5, dRow5, dRow5Shft
VST1 dRow0, [pDst@64], dstStep
VEXT dRow7Shft, dRow7, dRow7Shft, #1
VST1 dRow1, [pDst@64], dstStep
VEXT dRow8Shft, dRow8, dRow8Shft, #1
VADD qSum5, qSum5, qRound
;// Output rows 2..4. These adds must stay in ascending order: qSum3 and
;// qSum4 are read as the "next row" operand before being overwritten with
;// their own output sum.
VADD qSum2, qSum2, qSum3
VADD qSum3, qSum3, qSum4
VADD qSum4, qSum4, qSum5
VSHRN dRow2, qSum2, #2
VSHRN dRow3, qSum3, #2
VSHRN dRow4, qSum4, #2
;// Register reuse: qSum6/qSum7/qSum8 are declared as Q0/Q1/Q2, which overlay
;// D0..D5 (dRow0..dRow2 and their Shft partners). qSum6 and qSum7 are written
;// only after output rows 0 and 1 have been stored; qSum8 is written only
;// after the store of output row 2 immediately below.
VADDL qSum6, dRow6, dRow6Shft
VADDL qSum7, dRow7, dRow7Shft
VST1 dRow2, [pDst@64], dstStep
VADDL qSum8, dRow8, dRow8Shft
VADD qSum7, qSum7, qRound
VST1 dRow3, [pDst@64], dstStep
VST1 dRow4, [pDst@64], dstStep
;// Output rows 5..7, same ascending-order constraint as above.
VADD qSum5, qSum5, qSum6
VADD qSum6, qSum6, qSum7
VADD qSum7, qSum7, qSum8
VSHRN dRow5, qSum5, #2
VSHRN dRow6, qSum6, #2
VSHRN dRow7, qSum7, #2
VST1 dRow5, [pDst@64], dstStep
VST1 dRow6, [pDst@64], dstStep
VST1 dRow7, [pDst@64], dstStep
;// No branch here: execution continues with whatever follows the macro
;// invocation. pDst has advanced by 8*dstStep.
MEND
;// ***************************************************************************
;// ---------------------------------------------------------------------------
;// Core register aliases
;// ---------------------------------------------------------------------------
;// Function parameters. dstStep, predictType and rndVal are filled in by the
;// function body from its stack arguments.
pSrc            RN 0
srcStep         RN 1
pSrcResidue     RN 2
pDst            RN 3
dstStep         RN 4
predictType     RN 5
rndVal          RN 6

;// Scratch names. Both share r0 with pSrc.
return          RN 0
pDstCopy        RN 0

;// ---------------------------------------------------------------------------
;// NEON register aliases -- interpolation stage
;// ---------------------------------------------------------------------------
;// One source row per even D register (8 unsigned bytes).
dRow0           DN D0.U8
dRow1           DN D2.U8
dRow2           DN D4.U8
dRow3           DN D6.U8
dRow4           DN D8.U8
dRow5           DN D10.U8
dRow6           DN D12.U8
dRow7           DN D14.U8
dRow8           DN D16.U8

;// Odd partner of each row register: second half of a 16-byte row load,
;// later replaced by the row shifted one pixel.
dRow0Shft       DN D1.U8
dRow1Shft       DN D3.U8
dRow2Shft       DN D5.U8
dRow3Shft       DN D7.U8
dRow4Shft       DN D9.U8
dRow5Shft       DN D11.U8
dRow6Shft       DN D13.U8
dRow7Shft       DN D15.U8
dRow8Shft       DN D17.U8

;// 16-bit row sums for the HalfPixelXY case, plus the rounding vector.
;// qSum6..qSum8 sit on Q0..Q2 and therefore overlay dRow0..dRow2 and their
;// Shft partners.
qSum0           QN Q9.U16
qSum1           QN Q10.U16
qSum2           QN Q11.U16
qSum3           QN Q12.U16
qSum4           QN Q13.U16
qSum5           QN Q14.U16
qSum6           QN Q0.U16
qSum7           QN Q1.U16
qSum8           QN Q2.U16
qRound          QN Q15.U16

;// ---------------------------------------------------------------------------
;// NEON register aliases -- residue-addition stage
;// ---------------------------------------------------------------------------
;// Destination rows re-read from pDst (8 unsigned bytes each).
dDst0           DN D0.U8
dDst1           DN D1.U8
dDst2           DN D2.U8
dDst3           DN D3.U8
dDst4           DN D4.U8
dDst5           DN D5.U8
dDst6           DN D6.U8
dDst7           DN D7.U8

;// Residue rows (8 signed 16-bit values each).
qRes0           QN Q4.S16
qRes1           QN Q5.S16
qRes2           QN Q6.S16
qRes3           QN Q7.S16
qRes4           QN Q8.S16
qRes5           QN Q9.S16
qRes6           QN Q10.S16
qRes7           QN Q11.S16
;// Function header
;// omxVCM4P2_MCReconBlock
;//   1. Interpolates eight rows of eight bytes from pSrc into pDst, using the
;//      code path selected by predictType and rndVal.
;//   2. If pSrcResidue is non-zero, adds the 16-bit residue rows to the
;//      interpolated rows in place, saturating to unsigned 8 bits.
;//   3. Returns OMX_Sts_NoErr in r0 on every path; no argument validation is
;//      performed here.
;// NOTE(review): the "r6, d15" operands of M_START presumably name the highest
;// core / NEON registers the prologue must preserve -- confirm against
;// armCOMM_s.h.
M_START omxVCM4P2_MCReconBlock, r6, d15
;// Define stack arguments
M_ARG Arg_dstStep, 4
M_ARG Arg_predictType, 4
M_ARG Arg_rndVal, 4
;// Load argument from the stack
M_LDR dstStep, Arg_dstStep
M_LDR predictType, Arg_predictType
M_LDR rndVal, Arg_rndVal
;// Build the switch index: predictType*2 + rndVal, matching the order of the
;// eight M_CASE entries below (two rounding variants per predict type).
ADD predictType, rndVal, predictType, LSL #1
;// Turn the rounding flag into the rounding constant 2 - rndVal. Only the
;// HalfPixelXY path reads rndVal after this point.
RSB rndVal, rndVal, #2 ;// preparing rndVal for HalfPixelXY
;// The following implements switching to different code segments based on
;// the predictType and rndVal flags. The corresponding labels
;// (e.g. CaseIntegerPixel_Rnd0) are embedded in the macros that follow
;// M_ENDSWITCH (e.g. M_MCRECONBLOCK_IntegerPixel).
;// "M_MCRECONBLOCK_IntegerPixel" and "M_MCRECONBLOCK_HalfPixelXY" each handle
;// both rounding cases in a single body, whereas "M_MCRECONBLOCK_HalfPixelX"
;// and "M_MCRECONBLOCK_HalfPixelY" are expanded once per rounding case.
;// All these together implement the interpolation functionality.
M_SWITCH predictType
M_CASE CaseIntegerPixel_Rnd0
M_CASE CaseIntegerPixel_Rnd1
M_CASE CaseHalfPixelX_Rnd0
M_CASE CaseHalfPixelX_Rnd1
M_CASE CaseHalfPixelY_Rnd0
M_CASE CaseHalfPixelY_Rnd1
M_CASE CaseHalfPixelXY_Rnd0
M_CASE CaseHalfPixelXY_Rnd1
M_ENDSWITCH
;// Case bodies. Every macro except the last ends with a branch to
;// SwitchPredictTypeEnd; HalfPixelXY is placed last so it can fall through.
M_MCRECONBLOCK_IntegerPixel
M_MCRECONBLOCK_HalfPixelX 0
M_MCRECONBLOCK_HalfPixelX 1
M_MCRECONBLOCK_HalfPixelY 0
M_MCRECONBLOCK_HalfPixelY 1
M_MCRECONBLOCK_HalfPixelXY
SwitchPredictTypeEnd
;// After interpolation is done, the residue needs to be added. This is done
;// only when the "pSrcResidue" parameter of the function is not NULL.
;// What follows is completely unrolled code: each interpolated row and its
;// corresponding residue row are loaded, added, saturated and stored back.
CMP pSrcResidue, #0
;// Every interpolation path advanced pDst by 8*dstStep; rewind it to the
;// first row and take a second copy of the address for the write-back.
;// pDstCopy is r0, which is free because pSrc is no longer needed.
SUBNE pDst, pDst, dstStep, LSL #3 ;// Restoring pDst
MOVNE pDstCopy, pDst
BEQ pSrcResidueConditionEnd
pSrcResidueNotNull
;// Software-pipelined sequence over eight rows:
;//   VLD1 dDstN   - re-read interpolated row N (8 unsigned bytes) via pDst
;//   VLD1 qResN   - read 8 signed 16-bit residues; "!" advances pSrcResidue
;//                  past them, @128 marks the address as 128-bit aligned
;//   VADDW        - widen the pixels and add them to the residues
;//   VQMOVUN      - narrow back to 8 bits with unsigned saturation
;//   VST1 dDstN   - write row N back via pDstCopy
;// The first write-back is issued only after the last row has been read.
VLD1 dDst0, [pDst@64], dstStep
VLD1 qRes0, [pSrcResidue@128]!
VLD1 dDst1, [pDst@64], dstStep
VLD1 qRes1, [pSrcResidue@128]!
VLD1 dDst2, [pDst@64], dstStep
VLD1 qRes2, [pSrcResidue@128]!
VADDW qRes0, qRes0, dDst0
VLD1 dDst3, [pDst@64], dstStep
VADDW qRes1, qRes1, dDst1
VLD1 qRes3, [pSrcResidue@128]!
VADDW qRes2, qRes2, dDst2
VLD1 dDst4, [pDst@64], dstStep
VQMOVUN dDst0, qRes0
VLD1 qRes4, [pSrcResidue@128]!
VADDW qRes3, qRes3, dDst3
VLD1 dDst5, [pDst@64], dstStep
VQMOVUN dDst1, qRes1
VLD1 qRes5, [pSrcResidue@128]!
VADDW qRes4, qRes4, dDst4
VLD1 dDst6, [pDst@64], dstStep
VQMOVUN dDst2, qRes2
VLD1 qRes6, [pSrcResidue@128]!
VADDW qRes5, qRes5, dDst5
VLD1 dDst7, [pDst@64], dstStep
VQMOVUN dDst3, qRes3
VLD1 qRes7, [pSrcResidue@128]!
VADDW qRes6, qRes6, dDst6
VST1 dDst0, [pDstCopy@64], dstStep
VQMOVUN dDst4, qRes4
VST1 dDst1, [pDstCopy@64], dstStep
VADDW qRes7, qRes7, dDst7
VST1 dDst2, [pDstCopy@64], dstStep
VQMOVUN dDst5, qRes5
VST1 dDst3, [pDstCopy@64], dstStep
VQMOVUN dDst6, qRes6
VST1 dDst4, [pDstCopy@64], dstStep
VQMOVUN dDst7, qRes7
VST1 dDst5, [pDstCopy@64], dstStep
VST1 dDst6, [pDstCopy@64], dstStep
VST1 dDst7, [pDstCopy@64], dstStep
pSrcResidueConditionEnd
;// Common exit for both the residue and no-residue paths.
MOV return, #OMX_Sts_NoErr
M_END
ENDIF ;// CortexA8
END
;// ***************************************************************************
;// omxVCM4P2_MCReconBlock ends
;// ***************************************************************************