;// blob: b5780efa7b21109e2dc4e02d9ed776757511d404 [file] [log] [blame]
;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;// http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name: omxVCM4P10_PredictIntra_4x4_s.s
;// OpenMAX DL: v1.0.2
;// Revision: 12290
;// Date: Wednesday, April 9, 2008
;//
;//
;//
;//
INCLUDE omxtypes_s.h
INCLUDE armCOMM_s.h
;// Define the processor variants supported by this file
M_VARIANTS CortexA8
;//-------------------------------------------------------
;// Jump table that implements a C-style switch in assembly
;// by the method of two levels of indexing: the dispatcher
;// loads the word at index predMode straight into pc, so
;// entry N must be the handler label for prediction mode N.
;//-------------------------------------------------------
        M_TABLE armVCM4P10_pSwitchTable4x4
        DCD     OMX_VC_4x4_VERT             ;// entry 0
        DCD     OMX_VC_4x4_HOR              ;// entry 1
        DCD     OMX_VC_4x4_DC               ;// entry 2
        DCD     OMX_VC_4x4_DIAG_DL          ;// entry 3
        DCD     OMX_VC_4x4_DIAG_DR          ;// entry 4
        DCD     OMX_VC_4x4_VR               ;// entry 5
        DCD     OMX_VC_4x4_HD               ;// entry 6
        DCD     OMX_VC_4x4_VL               ;// entry 7
        DCD     OMX_VC_4x4_HU               ;// entry 8
IF CortexA8
;//--------------------------------------------
;// Core register aliases
;//--------------------------------------------

;// Input registers. leftStep..availability are filled from the
;// stack arguments by the function's entry code.
pSrcLeft        RN 0    ;// input pointer
pSrcAbove       RN 1    ;// input pointer
pSrcAboveLeft   RN 2    ;// input pointer
pDst            RN 3    ;// output pointer
leftStep        RN 4    ;// input variable
dstStep         RN 5    ;// input variable
predMode        RN 6    ;// input variable
availability    RN 7    ;// input variable

;// Scratch registers
return          RN 0    ;// status code; shares r0 with pSrcLeft
pTable          RN 8    ;// base address of the switch table
pSrcTmp         RN 9    ;// second source row pointer
srcStep         RN 10   ;// 2 * leftStep
pDstTmp         RN 11   ;// second destination row pointer
dstep           RN 12   ;// 2 * dstStep
pc              RN 15

;// Extra destination row pointers (used by OMX_VC_4x4_DIAG_DR only).
;// They overlay input registers, so each may only be written once the
;// overlaid input is no longer needed.
pDst1           RN 1    ;// overlays pSrcAbove
pDst2           RN 4    ;// overlays leftStep
pDst3           RN 6    ;// overlays predMode
;//-------------------
;// Neon registers
;//-------------------
;// Many aliases below name the same physical D register with a different
;// element type. Groups are organised by the prediction mode that first
;// needs the alias; later modes reuse aliases from earlier groups.
;// Each alias is defined exactly once.

;// OMX_VC_4x4_VERT
dAboveU32           DN  D0.U32

;// OMX_VC_4x4_HOR
dLeftVal0           DN  D0.8
dLeftVal1           DN  D1.8
dLeftVal2           DN  D2.8
dLeftVal3           DN  D3.8
dLeftVal0U32        DN  D0.U32
dLeftVal1U32        DN  D1.U32
dLeftVal2U32        DN  D2.U32
dLeftVal3U32        DN  D3.U32

;// OMX_VC_4x4_DC
dLeftVal            DN  D0.U8
dLeftValU32         DN  D0.U32
dSumAboveLeftU16    DN  D1.U16
dSumAboveLeftU32    DN  D1.U32
dSumAboveLeftU64    DN  D1.U64
dSumAboveLeftU8     DN  D1.U8
dSum                DN  D0.U8
dSumLeftValU16      DN  D1.U16
dSumLeftValU32      DN  D1.U32
dSumLeftValU64      DN  D1.U64
dSumLeftValU8       DN  D1.U8
dAboveVal           DN  D0.U8
dSumAboveValU16     DN  D1.U16
dSumAboveValU32     DN  D1.U32
dSumAboveValU64     DN  D1.U64
dSumAboveValU8      DN  D1.U8
dConst128U8         DN  D0.U8

;// OMX_VC_4x4_DIAG_DL
dAbove              DN  D0.U8
dU7                 DN  D2.U8
dU3                 DN  D2.U8
dAbove0             DN  D3.U8
dAbove1             DN  D4.U8
dAbove2             DN  D5.U8
dTmp                DN  D6.U8
dTmp0               DN  D7.U8
dTmp1               DN  D8.U8
dTmp2               DN  D9.U8
dTmp3               DN  D10.U8
dTmpU32             DN  D6.U32

;// OMX_VC_4x4_DIAG_DR
dLeft               DN  D1.U8
dUL                 DN  D2.U8

;// OMX_VC_4x4_VR
dLeft0              DN  D1.U8
dLeft1              DN  D2.U8
dEven0              DN  D3.U8
dEven1              DN  D4.U8
dEven2              DN  D5.U8
dOdd0               DN  D6.U8
dOdd1               DN  D11.U8
dOdd2               DN  D12.U8
dTmp3U32            DN  D10.U32
dTmp2U32            DN  D9.U32

;// OMX_VC_4x4_HD (dTmpU32 is already defined in the DIAG_DL group)
dTmp1U64            DN  D8.U64
dTmp0U64            DN  D7.U64
dTmpU64             DN  D6.U64
dTmp1U32            DN  D8.U32

;// OMX_VC_4x4_HU
dL3                 DN  D2.U8
dLeftHU0            DN  D3.U8
dLeftHU1            DN  D4.U8
dLeftHU2            DN  D5.U8
dTmp0U32            DN  D7.U32
;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntra_4x4 starts
;//
;// Register inputs : pSrcLeft, pSrcAbove, pSrcAboveLeft, pDst
;// Stack inputs    : LeftStep, DstStep, PredMode, Availability (4 bytes each)
;// Result          : return = OMX_Sts_NoErr, written at ExitPredict4x4
;//
;// The entry code below fetches the stack arguments and jumps to the
;// handler whose address is stored at armVCM4P10_pSwitchTable4x4[predMode].
;// NOTE(review): predMode is used as a table index without a range check;
;// confirm that callers guarantee a valid mode.
;//-----------------------------------------------------------------------------------------------
        ;// Function header. NOTE(review): "r12,d12" presumably names the highest
        ;// core/NEON registers that M_START must preserve - confirm in armCOMM_s.h.
        M_START omxVCM4P10_PredictIntra_4x4, r12,d12

        ;// Stack arguments, declared in the order they are passed
        M_ARG   LeftStep, 4
        M_ARG   DstStep, 4
        M_ARG   PredMode, 4
        M_ARG   Availability, 4

        ;// Fetch the stack arguments into their registers, two at a time
        M_LDRD  leftStep,dstStep,LeftStep           ;// leftStep, dstStep
        M_LDRD  predMode,availability,PredMode      ;// predMode, availability

        ;// switch (predMode): load the handler address directly into pc
        LDR     pTable,=armVCM4P10_pSwitchTable4x4  ;// base of the jump table
        LDR     pc, [pTable, predMode, LSL #2]      ;// branch to the case selected by predMode
OMX_VC_4x4_HOR
        ;// Horizontal prediction: every output row is filled with the
        ;// left-edge pixel of that row.
        ;// Even rows are addressed through pSrcLeft/pDst, odd rows through
        ;// pSrcTmp/pDstTmp; both pointer pairs advance by two rows.
        ADD     srcStep, leftStep, leftStep             ;// 2 * leftStep
        ADD     pSrcTmp, pSrcLeft, leftStep             ;// &pSrcLeft[1*leftStep]
        ADD     dstep, dstStep, dstStep                 ;// 2 * dstStep
        ADD     pDstTmp, pDst, dstStep                  ;// &pDst[1*dstStep]

        ;// Load each left pixel replicated into all lanes of its register
        VLD1    {dLeftVal0[]},[pSrcLeft],srcStep        ;// pSrcLeft[0*leftStep]
        VLD1    {dLeftVal2[]},[pSrcLeft]                ;// pSrcLeft[2*leftStep]
        VLD1    {dLeftVal1[]},[pSrcTmp],srcStep         ;// pSrcLeft[1*leftStep]
        VLD1    {dLeftVal3[]},[pSrcTmp]                 ;// pSrcLeft[3*leftStep]

        ;// Write four bytes per row, rows 0..3 in order
        VST1    dLeftVal0U32[0],[pDst],dstep            ;// pDst[0*dstStep+x] : 0 <= x <= 3
        VST1    dLeftVal1U32[0],[pDstTmp],dstep         ;// pDst[1*dstStep+x] : 0 <= x <= 3
        VST1    dLeftVal2U32[0],[pDst]                  ;// pDst[2*dstStep+x] : 0 <= x <= 3
        VST1    dLeftVal3U32[0],[pDstTmp]               ;// pDst[3*dstStep+x] : 0 <= x <= 3

        B       ExitPredict4x4                          ;// Branch to exit code
OMX_VC_4x4_VERT
        ;// Vertical prediction: the four pixels above the block are copied
        ;// into every output row.
        ADD     dstep, dstStep, dstStep                 ;// 2 * dstStep
        ADD     pDstTmp, pDst, dstStep                  ;// &pDst[1*dstStep]
        VLD1    dAboveU32[0],[pSrcAbove]                ;// pSrcAbove[0 to 3]

;// Shared store, also entered from the DC paths.
;// On entry: lane 0 (32 bit) of dAboveU32 holds the row to replicate,
;//           pDstTmp = pDst + dstStep and dstep = 2 * dstStep.
DCPredict4x4VertStore
        VST1    dAboveU32[0],[pDst],dstep               ;// row 0
        VST1    dAboveU32[0],[pDstTmp],dstep            ;// row 1
        VST1    dAboveU32[0],[pDst]                     ;// row 2
        VST1    dAboveU32[0],[pDstTmp]                  ;// row 3

        B       ExitPredict4x4                          ;// Branch to exit code
OMX_VC_4x4_DC
        ;// DC prediction: fill the block with the rounded mean of whichever
        ;// neighbours are flagged in availability (left column, above row,
        ;// both, or the constant 0x80 when neither is flagged).
        ;//
        ;// Every path ends at DCPredict4x4VertStore, which needs
        ;// pDstTmp = pDst + dstStep and dstep = 2*dstStep, so both are computed
        ;// once here (ADD leaves the flags untouched). The status code is not
        ;// written in this block: all paths reach ExitPredict4x4, which sets it.
        ADD     pDstTmp, pDst, dstStep
        ADD     dstep, dstStep, dstStep

        TST     availability, #OMX_VC_LEFT
        BEQ     DCPredict4x4LeftNotAvailable

        ;// Left column available: gather L0..L3 into lanes 0..3 of dLeftVal
        ADD     pSrcTmp, pSrcLeft, leftStep
        ADD     srcStep, leftStep, leftStep
        VLD1    {dLeftVal[0]},[pSrcLeft],srcStep            ;// pSrcLeft[0*leftStep]
        VLD1    {dLeftVal[1]},[pSrcTmp],srcStep             ;// pSrcLeft[1*leftStep]
        VLD1    {dLeftVal[2]},[pSrcLeft]                    ;// pSrcLeft[2*leftStep]
        VLD1    {dLeftVal[3]},[pSrcTmp]                     ;// pSrcLeft[3*leftStep]

        TST     availability, #OMX_VC_UPPER
        BEQ     DCPredict4x4LeftOnlyAvailable

        ;// Left and above available: above row goes into lanes 4..7
        VLD1    dLeftValU32[1],[pSrcAbove]                  ;// pSrcAbove[0 to 3]
        VPADDL  dSumAboveLeftU16, dLeftVal                  ;// [pSrcAbove[2+3 | 0+1] | pSrcLeft[2+3 | 0+1]]
        VPADDL  dSumAboveLeftU32, dSumAboveLeftU16          ;// [pSrcAbove[2+3+0+1] | pSrcLeft[2+3+0+1]]
        VPADDL  dSumAboveLeftU64, dSumAboveLeftU32          ;// [pSrcAbove[2+3+0+1] + pSrcLeft[2+3+0+1]]
        VRSHR   dSumAboveLeftU64,dSumAboveLeftU64,#3        ;// Sum = (Sum + 4) >> 3
        VDUP    dSum,dSumAboveLeftU8[0]                     ;// replicate the mean into every byte
        B       DCPredict4x4VertStore

DCPredict4x4LeftOnlyAvailable
        ;// Only the left column: lanes 4..7 of dLeftVal are don't-care, the
        ;// result is taken from the low 32-bit sum only.
        VPADDL  dSumLeftValU16, dLeftVal                    ;// [ XX | pSrcLeft[2+3 | 0+1]]
        VPADDL  dSumLeftValU32, dSumLeftValU16              ;// [ XXXX | pSrcLeft[2+3+0+1]]
        VRSHR   dSumLeftValU32,dSumLeftValU32,#2            ;// Sum = (Sum + 2) >> 2
        VDUP    dSum,dSumLeftValU8[0]
        B       DCPredict4x4VertStore

DCPredict4x4LeftNotAvailable
        TST     availability, #OMX_VC_UPPER
        BEQ     DCPredict4x4NoneAvailable

        ;// Only the above row
        VLD1    dAboveU32[0],[pSrcAbove]                    ;// pSrcAbove[0 to 3]
        VPADDL  dSumAboveValU16, dAboveVal                  ;// [ XX | pSrcAbove[2+3 | 0+1]]
        VPADDL  dSumAboveValU32, dSumAboveValU16            ;// [ XXXX | pSrcAbove[2+3+0+1]]
        VRSHR   dSumAboveValU32,dSumAboveValU32,#2          ;// Sum = (Sum + 2) >> 2
        VDUP    dSum,dSumAboveValU8[0]
        B       DCPredict4x4VertStore

DCPredict4x4NoneAvailable
        ;// Neither neighbour flagged: predict the constant 128
        VMOV    dConst128U8,#0x80                           ;// 0x8080808080808080 if(count == 0)
        B       DCPredict4x4VertStore
OMX_VC_4x4_DIAG_DL
        ;// Diagonal down-left prediction from the above row.
        ;// Both set-up paths produce three views of the edge, each shifted by
        ;// one pixel and padded with the last available pixel:
        ;//   dAbove0 = edge[i], dAbove1 = edge[i+1], dAbove2 = edge[i+2]
        TST     availability, #OMX_VC_UPPER_RIGHT
        BEQ     DiagDLUpperRightNotAvailable

        ;// Upper-right flagged: use all eight pixels, pad with U7
        VLD1    dAbove0,[pSrcAbove]                     ;// [U7|U6|U5|U4|U3|U2|U1|U0]
        VDUP    dU7, dAbove0[7]                         ;// [U7|U7|U7|U7|U7|U7|U7|U7]
        VEXT    dAbove1, dAbove0, dU7, #1               ;// [U7|U7|U6|U5|U4|U3|U2|U1]
        VEXT    dAbove2, dAbove0, dU7, #2               ;// [U7|U7|U7|U6|U5|U4|U3|U2]
        B       DiagDLPredict4x4Store

DiagDLUpperRightNotAvailable
        ;// Upper-right not flagged: read only U0..U3 (into the high half of
        ;// dAbove) and pad with U3
        VLD1    dAboveU32[1],[pSrcAbove]                ;// [U3|U2|U1|U0|-|-|-|-]
        VDUP    dU3, dAbove[7]                          ;// [U3 U3 U3 U3 U3 U3 U3 U3]
        VEXT    dAbove0, dAbove, dU3, #4                ;// [U3 U3 U3 U3 U3 U2 U1 U0]
        VEXT    dAbove1, dAbove, dU3, #5                ;// [U3 U3 U3 U3 U3 U3 U2 U1]
        VEXT    dAbove2, dAbove, dU3, #6                ;// [U3 U3 U3 U3 U3 U3 U3 U2]

DiagDLPredict4x4Store
        ;// Three-tap filter on every lane: halving add of the outer taps,
        ;// then rounding halving add with the centre tap.
        VHADD   dTmp, dAbove0, dAbove2
        VRHADD  dTmp, dTmp, dAbove1                     ;// (a+2*b+c+2)>>2

        ;// Row y is the filtered edge starting at lane y; build the three
        ;// shifted copies up front, then store rows 0..3 in order.
        VEXT    dTmp1,dTmp,dTmp,#1                      ;// row 1 in lanes 0..3
        VEXT    dTmp2,dTmp,dTmp,#2                      ;// row 2 in lanes 0..3
        VEXT    dTmp3,dTmp,dTmp,#3                      ;// row 3 in lanes 0..3

        VST1    dTmpU32[0],[pDst],dstStep               ;// row 0
        VST1    dTmp1U32[0],[pDst],dstStep              ;// row 1
        VST1    dTmp2U32[0],[pDst],dstStep              ;// row 2
        VST1    dTmp3U32[0],[pDst]                      ;// row 3

        B       ExitPredict4x4                          ;// Branch to exit code
OMX_VC_4x4_DIAG_DR
;// Diagonal down-right prediction. The nine edge pixels L3..L0, UL, U0..U3
;// are assembled into one sequence, filtered with the 3-tap
;// (a+2*b+c+2)>>2 kernel, and each row is a 4-byte window of the result:
;// the bottom row starts at lane 0, each row above starts one lane later.
;//
;// Statement order matters here: pDst1/pDst2/pDst3 overlay pSrcAbove,
;// leftStep and predMode, so each is written only after the overlaid
;// input has been consumed.
;// Load U0,U1,U2,U3
VLD1 dAboveU32[0],[pSrcAbove] ;// [X|X|X|X|U3|U2|U1|U0]
;// Load UL,L0,L1,L2,L3 ;// dLeft = [UL|L0|L1|L2|L3|X|X|X]
VLD1 {dLeft[7]},[pSrcAboveLeft]
ADD pSrcTmp, pSrcLeft, leftStep
ADD srcStep, leftStep, leftStep
;// pDst1 overwrites pSrcAbove - safe, the above row is already loaded
ADD pDst1,pDst,dstStep
VLD1 {dLeft[6]},[pSrcLeft],srcStep ;// pSrcLeft[0*leftStep]
VLD1 {dLeft[5]},[pSrcTmp],srcStep ;// pSrcLeft[1*leftStep]
VLD1 {dLeft[4]},[pSrcLeft] ;// pSrcLeft[2*leftStep]
VLD1 {dLeft[3]},[pSrcTmp] ;// pSrcLeft[3*leftStep]
VEXT dAbove0,dLeft,dAbove,#3 ;// [U2|U1|U0|UL|L0|L1|L2|L3]
;// pDst2 overwrites leftStep - safe, pSrcTmp and srcStep are already derived
ADD pDst2,pDst1,dstStep
VEXT dAbove1,dLeft,dAbove,#4 ;// [U3|U2|U1|U0|UL|L0|L1|L2]
;// pDst3 overwrites predMode - safe, the dispatch has already used it
ADD pDst3,pDst2,dstStep
VEXT dAbove2,dLeft,dAbove,#5 ;// [ X|U3|U2|U1|U0|UL|L0|L1]
VHADD dTmp, dAbove0, dAbove2
VRHADD dTmp, dTmp, dAbove1 ;// (a+2*b+c+2)>>2
;// Store bottom row first, rotating the filtered vector by one byte per row
VST1 dTmpU32[0],[pDst3] ;// Store pTmp[0],[1],[2],[3] @ pDst3
VEXT dTmp,dTmp,dTmp,#1
VST1 dTmpU32[0],[pDst2] ;// Store pTmp[1],[2],[3],[4] @ pDst2
VEXT dTmp,dTmp,dTmp,#1
VST1 dTmpU32[0],[pDst1] ;// Store pTmp[2],[3],[4],[5] @ pDst1
VEXT dTmp,dTmp,dTmp,#1
VST1 dTmpU32[0],[pDst] ;// Store pTmp[3],[4],[5],[6] @ pDst
B ExitPredict4x4 ;// Branch to exit code
OMX_VC_4x4_VR
;// Vertical-right prediction from U0..U3, UL and L0..L2.
;// Two filtered vectors are built:
;//   dTmp1 ("odd")  supplies rows 0 and 2,
;//   dTmp0 ("even") supplies rows 1 and 3.
;// Rows 0 and 1 are the same vectors as rows 2 and 3 shifted by one lane.
;// Load UL,U0,U1,U2,U3
VLD1 dAboveU32[0],[pSrcAbove]
VLD1 dAbove[7],[pSrcAboveLeft] ;// [UL|X|X|X|U3|U2|U1|U0]
;// Load L0,L1,L2 ;// dLeft0 = [L0|L2|X|X|X|X|X|X]
;// dLeft1 = [L1| X|X|X|X|X|X|X]
VLD1 {dLeft0[7]},[pSrcLeft],leftStep ;// pSrcLeft[0*leftStep]
VLD1 {dLeft1[7]},[pSrcLeft],leftStep ;// pSrcLeft[1*leftStep]
VLD1 {dLeft0[6]},[pSrcLeft] ;// pSrcLeft[2*leftStep]
VEXT dOdd2,dAbove,dAbove,#7 ;// [ x x x U3 U2 U1 U0 UL ]
VEXT dEven0,dLeft0,dOdd2,#6 ;// [ x x x U1 U0 UL L0 L2 ]
VEXT dEven1,dLeft1,dOdd2,#7 ;// [ x x x U2 U1 U0 UL L1 ]
VEXT dEven2,dLeft0,dAbove,#7 ;// [ x x x U3 U2 U1 U0 L0 ]
VEXT dOdd0,dLeft1,dAbove,#7 ;// [ x x x U3 U2 U1 U0 L1 ]
VEXT dOdd1,dLeft0,dOdd2,#7 ;// [ x x x U2 U1 U0 UL L0 ]
;// In lanes 1..4 dOdd0 and dOdd2 hold identical bytes, so VHADD returns
;// them unchanged and VRHADD with dOdd1 yields the 2-tap (a+b+1)>>1
;// average; lane 0 gets the 3-tap (a+2*b+c+2)>>2 filter of L1,L0,UL.
VHADD dTmp1, dOdd0, dOdd2
VRHADD dTmp1, dTmp1, dOdd1 ;// Tmp[ x x x 9 7 5 3 1 ]
;// All lanes of the even vector are 3-tap filtered
VHADD dTmp0, dEven0, dEven2
VRHADD dTmp0, dTmp0, dEven1 ;// Tmp[ x x x 8 6 4 2 0 ]
VEXT dTmp3,dTmp1,dTmp1,#1 ;// Tmp[ x x x x 9 7 5 3 ]
;// Even rows go through pDst, odd rows through pDstTmp, both stepping 2 rows
ADD pDstTmp, pDst, dstStep
ADD dstep, dstStep, dstStep
VEXT dTmp2,dTmp0,dTmp0,#1 ;// Tmp[ x x x x 8 6 4 2 ]
VST1 dTmp3U32[0],[pDst],dstep ;// row 0: Tmp[9],[7],[5],[3]
VST1 dTmp2U32[0],[pDstTmp],dstep ;// row 1: Tmp[8],[6],[4],[2]
VST1 dTmp1U32[0],[pDst],dstep ;// row 2: Tmp[7],[5],[3],[1] (pDst is not used after this post-increment)
VST1 dTmp0U32[0],[pDstTmp] ;// row 3: Tmp[6],[4],[2],[0]
B ExitPredict4x4 ;// Branch to exit code
OMX_VC_4x4_HD
        ;// Horizontal-down prediction from L3..L0, UL and U0..U2.
        ;//
        ;// Only lanes 0..2 of dAbove are consumed (the VEXTs below pull in at
        ;// most three bytes of it), so just the four bytes U0..U3 are loaded,
        ;// the same way OMX_VC_4x4_DIAG_DR loads them. This avoids reading
        ;// pSrcAbove[4..7], which this mode never uses. Lanes 4..7 of dAbove
        ;// are don't-care.

        ;// Load U0,U1,U2,U3
        VLD1    dAboveU32[0],[pSrcAbove]                ;// [X|X|X|X|U3|U2|U1|U0]

        ;// Load UL,L0,L1,L2,L3                         ;// dLeft = [UL|L0|L1|L2|L3|X|X|X]
        VLD1    {dLeft[7]},[pSrcAboveLeft]
        ADD     pSrcTmp, pSrcLeft, leftStep
        ADD     srcStep, leftStep, leftStep

        VLD1    {dLeft[6]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep]
        VLD1    {dLeft[5]},[pSrcTmp],srcStep            ;// pSrcLeft[1*leftStep]
        VLD1    {dLeft[4]},[pSrcLeft]                   ;// pSrcLeft[2*leftStep]
        VLD1    {dLeft[3]},[pSrcTmp]                    ;// pSrcLeft[3*leftStep]

        ;// Three views of the edge sequence, each shifted by one pixel
        VEXT    dAbove0,dLeft,dAbove,#3                 ;// [ U2|U1|U0|UL|L0|L1|L2|L3 ]
        VEXT    dAbove1,dLeft,dAbove,#2                 ;// [ U1|U0|UL|L0|L1|L2|L3|X ]
        VEXT    dAbove2,dLeft,dAbove,#1                 ;// [ U0|UL|L0|L1|L2|L3|X|X ]

        ;// 3-tap filtered values: (a+2*b+c+2)>>2
        VHADD   dTmp0, dAbove0, dAbove2
        VRHADD  dTmp0, dTmp0, dAbove1                   ;// Tmp[ 0 | 1 | 2 | 4 | 6 | 8 | X | X ]

        ;// 2-tap averaged values, moved up to the top four lanes
        VRHADD  dTmp1, dAbove1, dAbove0                 ;// (a+b+1)>>1
        VSHL    dTmp1U64,dTmp1U64,#24                   ;// Tmp[ 3|5| 7 |9 | X | X | X | X ]

        ;// Interleave averaged and filtered values for the lower rows, then
        ;// append the two purely filtered values needed by the top row
        VSHL    dTmpU64,dTmp0U64,#16                    ;// Tmp[ 2|4|6|8| X | X | X | X ]
        VZIP    dTmp1,dTmp                              ;// dTmp = [ 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 ]
        VEXT    dTmp0,dTmp0,dTmp0,#6                    ;// Tmp[ X| X| X| X| X| X| 0 | 1 ]
        VEXT    dTmp1,dTmp,dTmp0,#2                     ;// Tmp[ 0 | 1 | 2 | 3 | 4 | 5 | 6 |7 ]

        ;// Even rows go through pDst, odd rows through pDstTmp
        ADD     pDstTmp, pDst, dstStep
        ADD     dstep, dstStep, dstStep

        VST1    dTmp1U32[1],[pDst],dstep                ;// row 0: Store pTmp[0|1|2|3]
        VST1    dTmpU32[1],[pDstTmp],dstep              ;// row 1: Store pTmp[2|3|4|5]
        VST1    dTmp1U32[0],[pDst]                      ;// row 2: Store pTmp[4|5|6|7]
        VST1    dTmpU32[0],[pDstTmp]                    ;// row 3: Store pTmp[6|7|8|9]

        B       ExitPredict4x4                          ;// Branch to exit code
OMX_VC_4x4_VL
;// Vertical-left prediction from the above row.
;// Both set-up paths leave three views of the edge in dAbove0/1/2, each
;// shifted by one pixel. The stores below only consume lanes 0..4 of the
;// two result vectors, so lanes marked X never reach the output.
TST availability, #OMX_VC_UPPER_RIGHT
BEQ DiagVLUpperRightNotAvailable
;// Upper-right flagged: read all eight pixels; the shifted views are plain
;// rotations, so the X lanes hold wrapped-around bytes
VLD1 dAbove0,[pSrcAbove] ;// [U7|U6|U5|U4|U3|U2|U1|U0]
VEXT dAbove1,dAbove0,dAbove0,#1 ;// [ X|U7|U6|U5|U4|U3|U2|U1]
VEXT dAbove2,dAbove1,dAbove1,#1 ;// [ X| X|U7|U6|U5|U4|U3|U2]
B DiagVLPredict4x4Store
DiagVLUpperRightNotAvailable
;// Upper-right not flagged: read only U0..U3 (into the high half of dAbove)
;// and pad the edge with U3
VLD1 dAboveU32[1],[pSrcAbove] ;// [U3|U2|U1|U0|-|-|-|-]
VDUP dU3, dAbove[7] ;// [U3 U3 U3 U3 U3 U3 U3 U3]
VEXT dAbove0, dAbove, dU3, #4 ;// [U3 U3 U3 U3 U3 U2 U1 U0]
VEXT dAbove1, dAbove, dU3, #5 ;// [U3 U3 U3 U3 U3 U3 U2 U1]
VEXT dAbove2, dAbove, dU3, #6 ;// [U3 U3 U3 U3 U3 U3 U3 U2]
DiagVLPredict4x4Store
;// dTmp0: 2-tap averages, used by rows 0 and 2
VRHADD dTmp0, dAbove1, dAbove0 ;// (a+b+1)>>1
;// Tmp[ X| X| X| 8| 6| 4| 2| 0 ]
;// dTmp3: 3-tap filtered values, used by rows 1 and 3
VHADD dTmp3, dAbove0, dAbove2
VRHADD dTmp3, dTmp3, dAbove1 ;// (a+2*b+c+2)>>2
;// Tmp[ X| X| X| 9| 7| 5| 3| 1 ]
VEXT dTmp1,dTmp0,dTmp0,#1 ;// Tmp[ X| X| X| X| 8| 6| 4| 2 ]
;// Rows 0 and 2 go through pDst, rows 1 and 3 through pDstTmp
ADD pDstTmp, pDst, dstStep
ADD dstep, dstStep, dstStep
;// The second source (dTmp1) only supplies lane 7 here, which is not stored
VEXT dTmp2,dTmp3,dTmp1,#1 ;// Tmp[ X| X| X| X| 9| 7| 5| 3 ]
VST1 dTmp0U32[0],[pDst],dstep ;// row 0: Tmp[6],[4],[2],[0]
VST1 dTmp3U32[0],[pDstTmp],dstep ;// row 1: Tmp[7],[5],[3],[1]
VST1 dTmp1U32[0],[pDst] ;// row 2: Tmp[8],[6],[4],[2]
VST1 dTmp2U32[0],[pDstTmp] ;// row 3: Tmp[9],[7],[5],[3]
B ExitPredict4x4 ;// Branch to exit code
OMX_VC_4x4_HU
;// Horizontal-up prediction from the left column only.
;// The column L0..L3 is padded with L3, then 2-tap averages and 3-tap
;// filtered values are interleaved; each row is a 4-byte window of the
;// interleaved sequence starting two bytes after the previous row, and the
;// last row is all L3.
;// This is the last case in the file: it falls through into ExitPredict4x4.
ADD pSrcTmp, pSrcLeft, leftStep
ADD srcStep, leftStep, leftStep
;// Load Left Edge ;// [L3|L2|L1|L0|X|X|X|X]
VLD1 {dLeft[4]},[pSrcLeft],srcStep ;// pSrcLeft[0*leftStep]
VLD1 {dLeft[5]},[pSrcTmp],srcStep ;// pSrcLeft[1*leftStep]
VLD1 {dLeft[6]},[pSrcLeft] ;// pSrcLeft[2*leftStep]
VLD1 {dLeft[7]},[pSrcTmp] ;// pSrcLeft[3*leftStep]
VDUP dL3,dLeft[7] ;// [L3|L3|L3|L3|L3|L3|L3|L3]
VEXT dLeftHU0,dLeft,dL3,#4 ;// [L3|L3|L3|L3|L3|L2|L1|L0]
VEXT dLeftHU1,dLeft,dL3,#5 ;// [L3|L3|L3|L3|L3|L3|L2|L1]
VEXT dLeftHU2,dLeft,dL3,#6 ;// [L3|L3|L3|L3|L3|L3|L3|L2]
;// 3-tap filtered values: (a+2*b+c+2)>>2
VHADD dTmp0, dLeftHU0, dLeftHU2
VRHADD dTmp0, dTmp0, dLeftHU1 ;// Tmp[ L3 | L3 | L3 | L3 | L3 | 5 | 3 | 1 ]
VRHADD dTmp1, dLeftHU1, dLeftHU0 ;// (a+b+1)>>1
;// Tmp[ L3 | L3 | L3 | L3 | L3 | 4 | 2 | 0 ]
;// Interleave averages (even positions) with filtered values (odd positions)
VZIP dTmp1,dTmp0 ;// dTmp1 = Tmp[7| 6| 5| 4| 3| 2| 1| 0]
;// dTmp0 = [L3|L3|L3|L3|L3|L3|L3|L3]
VST1 dTmp1U32[0],[pDst],dstStep ;// row 0: [3|2|1|0]
VEXT dTmp1,dTmp1,dTmp1,#2
VST1 dTmp1U32[0],[pDst],dstStep ;// row 1: [5|4|3|2]
VEXT dTmp1,dTmp1,dTmp1,#2
VST1 dTmp1U32[0],[pDst],dstStep ;// row 2: [7|6|5|4]
VST1 dTmp0U32[0],[pDst] ;// row 3: [9|8|7|6], all equal to L3
;// Common exit for every prediction mode: each case branches here (the
;// OMX_VC_4x4_HU case falls through) once its four rows are stored.
ExitPredict4x4
;// return shares r0 with pSrcLeft, so it is only written here, after all
;// reads through pSrcLeft are done. The status is always OMX_Sts_NoErr.
MOV return, #OMX_Sts_NoErr
;// Closes the function opened by M_START (NOTE(review): presumably emits the
;// register restore and return sequence - confirm in armCOMM_s.h).
M_END
ENDIF ;// CortexA8
END
;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntra_4x4 ends
;//-----------------------------------------------------------------------------------------------