av/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntraChroma_8x8_s.s - nest-cam/4320010/av - Git at Google

 ;//
 ;// Copyright (C) 2007-2008 ARM Limited
 ;//
 ;// Licensed under the Apache License, Version 2.0 (the "License");
 ;// you may not use this file except in compliance with the License.
 ;// You may obtain a copy of the License at
 ;//
 ;//      http://www.apache.org/licenses/LICENSE-2.0
 ;//
 ;// Unless required by applicable law or agreed to in writing, software
 ;// distributed under the License is distributed on an "AS IS" BASIS,
 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ;// See the License for the specific language governing permissions and
 ;// limitations under the License.
 ;//
 ;//
 ;//
 ;// File Name:  omxVCM4P10_PredictIntraChroma_8x8_s.s
 ;// OpenMAX DL: v1.0.2
 ;// Revision:   12290
 ;// Date:       Wednesday, April 9, 2008
 ;//
 ;//
 ;//
 ;//


         INCLUDE omxtypes_s.h
         INCLUDE armCOMM_s.h

         EXPORT armVCM4P10_pIndexTable8x8

 ;// Define the processor variants supported by this file

          M_VARIANTS CortexA8

      AREA table, DATA
 ;//-------------------------------------------------------
 ;// Jump table that implements a C-style switch in asm:
 ;// entry i holds the address of the handler label for
 ;// predMode == i.  The dispatcher loads pc directly from
 ;// armVCM4P10_pIndexTable8x8[predMode].
 ;//-------------------------------------------------------

     M_TABLE armVCM4P10_pIndexTable8x8
     DCD  OMX_VC_CHROMA_DC,     OMX_VC_CHROMA_HOR
     DCD  OMX_VC_CHROMA_VERT,   OMX_VC_CHROMA_PLANE

 ;// Halfword constants for the plane predictor.  The first four
 ;// values weight the edge differences that form H and V; the
 ;// remaining eight (-3..4) are the per-column / per-row offsets
 ;// that multiply the b and c gradients.
     M_TABLE armVCM4P10_MultiplierTableChroma8x8,1
     DCW   3, 2, 1,4
     DCW  -3,-2,-1,0
     DCW   1, 2, 3,4


     IF CortexA8

 ;//--------------------------------------------
 ;// Scratch variable
 ;//--------------------------------------------

 pc              RN 15   ;// written directly by the predMode dispatch load
 return          RN 0    ;// status result; shares r0 with pSrcLeft
 pTable          RN 8    ;// base address of armVCM4P10_pIndexTable8x8

 ;//--------------------------------------------
 ;// Input Arguments
 ;// r0-r3 are used as passed in; r4-r7 are filled from the
 ;// stack arguments by M_LDR in the function header.
 ;//--------------------------------------------
 pSrcLeft        RN 0    ;// input pointer
 pSrcAbove       RN 1    ;// input pointer
 pSrcAboveLeft   RN 2    ;// input pointer
 pDst            RN 3    ;// output pointer
 leftStep        RN 4    ;// input variable
 dstStep         RN 5    ;// input variable
 predMode        RN 6    ;// input variable
 availability    RN 7    ;// input variable
 pMultiplierTable    RN  2    ;// reuses r2 once pSrcAboveLeft has been read (plane mode)

 pTmp            RN 9    ;// second row pointer: base + one step
 step            RN 10   ;// twice the row stride, so two pointers walk alternate rows

 ;//---------------------
 ;// Neon Registers
 ;//---------------------

 ;// OMX_VC_CHROMA_HOR

 ;// One D register per output row; each is filled by an
 ;// all-lanes load and stored through the dSum0..dSum7 aliases.
 dLeftVal0       DN  D0.8
 dLeftVal1       DN  D1.8
 dLeftVal2       DN  D2.8
 dLeftVal3       DN  D3.8
 dLeftVal4       DN  D4.8
 dLeftVal5       DN  D5.8
 dLeftVal6       DN  D6.8
 dLeftVal7       DN  D7.8

 ;// OMX_VC_CHROMA_VERT

 dAboveVal       DN  D0.U8       ;// same register as dDstRow0, so it can be stored as-is

 ;// OMX_VC_CHROMA_DC
 ;// Note: several aliases below name the same physical register
 ;// (e.g. D2 is dSumAboveValU16, dSumLeftValU16 and dSumAboveLeft),
 ;// so the instruction order in the DC paths matters.

 dLeftVal        DN  D1.U8
 dSumAboveValU16 DN  D2.U16
 dSumAboveValU32 DN  D3.U32
 dSumAboveValU8  DN  D3.U8
 dSumLeftValU16  DN  D2.U16
 dSumLeftValU32  DN  D1.U32
 dSumLeftValU8   DN  D1.U8
 dSumAboveLeft   DN  D2.U32
 dSumAboveLeftU8 DN  D2.U8
 dIndexRow0U8    DN  D5.U8       ;// VTBL byte indices for rows 0-3
 dIndexRow0      DN  D5.U64
 dIndexRow4U8    DN  D6.U8       ;// VTBL byte indices for rows 4-7
 dIndexRow4      DN  D6.U64
 dDstRow0        DN  D0.U8       ;// pixel row written to rows 0-3
 dDstRow4        DN  D4.U8       ;// pixel row written to rows 4-7
 dConst128U8     DN  D0.U8       ;// aliases dDstRow0 so the constant is stored directly

 ;// OMX_VC_CHROMA_PLANE
 ;// Registers are heavily reused here: the qC0..qC7 / qSum0..qSum7
 ;// aliases overlap earlier temporaries (qC3 and qSum3 are the same
 ;// register as qMultiplier), and dSum0..dSum7 overlap the low Q
 ;// registers once those have been consumed.

 dRevAboveVal    DN  D3.U8
 dRevAboveValU64 DN  D3.U64
 dAboveLeftVal   DN  D2.U8
 qAbove7minus0   QN  Q3.S16
 qAboveDiff      QN  Q2.S16
 dIndex          DN  D8.U8       ;// not referenced by the code in this file
 dDiffAboveU8    DN  D9.U8
 dDiffAboveS16   DN  D9.S16
 dAboveDiff0U8   DN  D4.U8       ;// low half of qAboveDiff
 dAboveDiff0U64  DN  D4.U64
 dAbove7minus0U8 DN  D6.U8       ;// low half of qAbove7minus0
 dMultiplier     DN  D10.S16
 dHorPred        DN  D11.S16
 dRevLeftVal     DN  D3.U8
 dRevLeftValU64  DN  D3.U64
 qLeft7minus0    QN  Q7.S16
 qLeftDiff       QN  Q6.S16
 dDiffLeftU8     DN  D16.U8
 dDiffLeftS16    DN  D16.S16
 dLeftDiff0U8    DN  D12.U8      ;// low half of qLeftDiff
 dLeftDiff0U64   DN  D12.U64
 dLeft7minus0U8  DN  D14.U8      ;// low half of qLeft7minus0
 dVerPred        DN  D3.S16
 dHVValS16       DN  D3.S16
 dHVValS32       DN  D3.S32
 dHVTempS32      DN  D2.S32
 qA              QN  Q0.S16
 qB              QN  Q2.S16
 qC              QN  Q3.S16
 qMultiplier     QN  Q5.S16      ;// D10:D11, i.e. dMultiplier0:dMultiplier1
 dMultiplier0    DN  D10.S16
 dMultiplier1    DN  D11.S16
 qC0             QN  Q0.S16
 qC1             QN  Q1.S16
 qC2             QN  Q4.S16
 qC3             QN  Q5.S16
 qC4             QN  Q6.S16
 qC5             QN  Q7.S16
 qC6             QN  Q8.S16
 qC7             QN  Q9.S16
 qSum0           QN  Q0.S16
 qSum1           QN  Q1.S16
 qSum2           QN  Q4.S16
 qSum3           QN  Q5.S16
 qSum4           QN  Q6.S16
 qSum5           QN  Q7.S16
 qSum6           QN  Q8.S16
 qSum7           QN  Q9.S16
 dSum0           DN  D0.U8       ;// final 8-pixel output rows 0..7
 dSum1           DN  D1.U8
 dSum2           DN  D2.U8
 dSum3           DN  D3.U8
 dSum4           DN  D4.U8
 dSum5           DN  D5.U8
 dSum6           DN  D6.U8
 dSum7           DN  D7.U8

 ;//-----------------------------------------------------------------------------------------------
 ;// omxVCM4P10_PredictIntraChroma_8x8 starts
 ;//-----------------------------------------------------------------------------------------------

         ;// Write function header
         ;// Function entry: pSrcLeft, pSrcAbove, pSrcAboveLeft and pDst
         ;// arrive in r0-r3; the remaining four arguments are on the stack.
         ;// NOTE(review): the "r10, d15" operands presumably tell M_START
         ;// which callee-saved registers to preserve - confirm in armCOMM_s.h.
         M_START omxVCM4P10_PredictIntraChroma_8x8, r10, d15

         ;// Define stack arguments (4 bytes each, in call order)
         M_ARG    LeftStep,     4
         M_ARG    DstStep,      4
         M_ARG    PredMode,     4
         M_ARG    Availability, 4

         LDR      pTable,=armVCM4P10_pIndexTable8x8   ;// Load index table for switch case

         ;// Load argument from the stack
         M_LDR    predMode, PredMode                  ;// Arg predMode loaded from stack to reg
         M_LDR    leftStep, LeftStep                  ;// Arg leftStep loaded from stack to reg
         M_LDR    dstStep,  DstStep                   ;// Arg dstStep loaded from stack to reg
         M_LDR    availability, Availability          ;// Arg availability loaded from stack to reg


         ;// pc = pTable[predMode].  predMode is not range-checked here, so the
         ;// caller must pass a value that indexes one of the four table entries.
         LDR      pc, [pTable, predMode, LSL #2]      ;// Branch to the case based on predMode

 ;// DC prediction.  This entry handles "left available"; it branches away
 ;// when the left edge is missing, and to the left-only path when the
 ;// upper edge is missing.  With both edges present each 4x4 quadrant gets
 ;// its own rounded average (see the VTBL index construction below).
 OMX_VC_CHROMA_DC

         TST     availability, #OMX_VC_LEFT
         BEQ     DCChroma8x8LeftNotAvailable

         ;// Two pointers one row apart, both advancing by 2*leftStep,
         ;// so even rows come from pSrcLeft and odd rows from pTmp.
         ADD     pTmp, pSrcLeft, leftStep
         ADD     step, leftStep, leftStep

         ;// Load Left Edge
         VLD1    {dLeftVal[0]},[pSrcLeft],step               ;// pSrcLeft[0*leftStep]
         VLD1    {dLeftVal[1]},[pTmp],step                   ;// pSrcLeft[1*leftStep]
         VLD1    {dLeftVal[2]},[pSrcLeft],step               ;// pSrcLeft[2*leftStep]
         VLD1    {dLeftVal[3]},[pTmp],step                   ;// pSrcLeft[3*leftStep]
         VLD1    {dLeftVal[4]},[pSrcLeft],step               ;// pSrcLeft[4*leftStep]
         VLD1    {dLeftVal[5]},[pTmp],step                   ;// pSrcLeft[5*leftStep]
         VLD1    {dLeftVal[6]},[pSrcLeft],step               ;// pSrcLeft[6*leftStep]
         VLD1    {dLeftVal[7]},[pTmp]                        ;// pSrcLeft[7*leftStep]

         TST     availability, #OMX_VC_UPPER
         BEQ     DCChroma8x8LeftOnlyAvailable

         ;// Load Upper Edge also
         VLD1     dAboveVal,[pSrcAbove]                      ;// pSrcAbove[0 to 7]

         ;// r0 (pSrcLeft) is no longer needed, so it can take the status now
         MOV      return, #OMX_Sts_NoErr                     ;// returnNoError

         VPADDL   dSumAboveValU16, dAboveVal                 ;// pSrcAbove[ 6+7 | 4+5 | 2+3 | 0+1 ]
         VPADDL   dSumAboveValU32, dSumAboveValU16           ;// pSrcAbove[ 4+5+6+7 |  0+1+2+3 ]

         VPADDL   dSumLeftValU16, dLeftVal                   ;// pSrcLeft[ 6+7 | 4+5 | 2+3 | 0+1 ]
         VPADDL   dSumLeftValU32, dSumLeftValU16             ;// pSrcLeft[ 4+5+6+7 |  0+1+2+3 ]

         VADD     dSumAboveLeft,dSumAboveValU32,dSumLeftValU32
         VRSHR    dSumAboveLeft,dSumAboveLeft,#3             ;// Sum = (Sum + 4) >> 3
         VRSHR    dSumAboveValU32,dSumAboveValU32,#2         ;// Sum = (Sum + 2) >> 2
         VRSHR    dSumLeftValU32,dSumLeftValU32,#2           ;// Sum = (Sum + 2) >> 2

         ;// Build VTBL byte indices.  In a two-register table, index 0x00/0x04
         ;// selects the low byte of the first register's 32-bit lanes and 0x0c
         ;// the low byte of the second register's upper 32-bit lane.
         VMOV     dIndexRow0U8,#0x0c
         VMOV     dIndexRow4U8,#0x04
         VSHL     dIndexRow0,dIndexRow0,#32                  ;// index0 = 0x0c0c0c0c00000000
         VSHR     dIndexRow4,dIndexRow4,#32                  ;// index4 = 0x0000000004040404
         VADD     dIndexRow4U8,dIndexRow4U8,dIndexRow0U8     ;// index4 = 0x0c0c0c0c04040404
         ;// Rows 0-3: x=0..3 <- (above[0..3]+left[0..3]) average, x=4..7 <- above[4..7] average
         VTBL     dDstRow0,{dSumAboveLeftU8,dSumAboveValU8},dIndexRow0U8
         ;// Rows 4-7: x=0..3 <- left[4..7] average, x=4..7 <- (above[4..7]+left[4..7]) average
         VTBL     dDstRow4,{dSumLeftValU8,dSumAboveLeftU8},dIndexRow4U8

 ;// Shared store: rows 0-3 take dDstRow0, rows 4-7 take dDstRow4.
 ;// Also entered from DCChroma8x8LeftOnlyAvailable.
 DCChroma8x8LeftStore
         ADD     pTmp, pDst, dstStep
         ADD     step, dstStep, dstStep

         VST1     dDstRow0,[pDst],step                    ;// pDst[0*dstStep+x] :0<= x <= 7
         VST1     dDstRow0,[pTmp],step                    ;// pDst[1*dstStep+x] :0<= x <= 7
         VST1     dDstRow0,[pDst],step                    ;// pDst[2*dstStep+x] :0<= x <= 7
         VST1     dDstRow0,[pTmp],step                    ;// pDst[3*dstStep+x] :0<= x <= 7
         VST1     dDstRow4,[pDst],step                    ;// pDst[4*dstStep+x] :0<= x <= 7
         VST1     dDstRow4,[pTmp],step                    ;// pDst[5*dstStep+x] :0<= x <= 7
         VST1     dDstRow4,[pDst],step                    ;// pDst[6*dstStep+x] :0<= x <= 7
         VST1     dDstRow4,[pTmp]                         ;// pDst[7*dstStep+x] :0<= x <= 7

         M_EXIT


 ;// DC with only the left edge: dLeftVal has already been loaded by the
 ;// caller path.  Rows 0-3 get the average of left[0..3] and rows 4-7 the
 ;// average of left[4..7], each replicated across all eight columns.
 DCChroma8x8LeftOnlyAvailable

         MOV      return, #OMX_Sts_NoErr

         VPADDL   dSumLeftValU16, dLeftVal                   ;// pSrcLeft[ 6+7 | 4+5 | 2+3 | 0+1 ]
         VPADDL   dSumLeftValU32, dSumLeftValU16             ;// pSrcLeft[ 4+5+6+7 |  0+1+2+3 ]
         VRSHR    dSumLeftValU32,dSumLeftValU32,#2           ;// Sum = (Sum + 2) >> 2

         VDUP     dDstRow0,dSumLeftValU8[0]                  ;// low byte of lane 0: left[0..3] average
         VDUP     dDstRow4,dSumLeftValU8[4]                  ;// low byte of lane 1: left[4..7] average

         B        DCChroma8x8LeftStore


 ;// DC without the left edge.  If the upper edge is also missing, fall to
 ;// the constant-fill path; otherwise every row becomes
 ;// [ above[4..7] average x4 | above[0..3] average x4 ].
 DCChroma8x8LeftNotAvailable

         TST     availability, #OMX_VC_UPPER
         BEQ     DCChroma8x8NoneAvailable

         ;// Load Upper Edge
         VLD1     dAboveVal,[pSrcAbove]                      ;// pSrcAbove[0 to 7]
         MOV      return, #OMX_Sts_NoErr                     ;// returnNoError

         VPADDL   dSumAboveValU16, dAboveVal                 ;// pSrcAbove[ 6+7 | 4+5 | 2+3 | 0+1 ]
         VPADDL   dSumAboveValU32, dSumAboveValU16           ;// pSrcAbove[ 4+5+6+7 |  0+1+2+3 ]
         VRSHR    dSumAboveValU32,dSumAboveValU32,#2         ;// Sum = (Sum + 2) >> 2
         VMOV     dIndexRow0U8,#0x04
         VSHL     dIndexRow0,dIndexRow0,#32                  ;// index = 0x0404040400000000
         ;// Single-register table: byte 0 = low byte of lane 0, byte 4 = low byte of lane 1
         VTBL     dDstRow0,{dSumAboveValU8},dIndexRow0U8

         B        DCChroma8x8UpperStore


 ;// DC with neither edge available: fill the block with the constant 0x80.
 ;// dConst128U8 and dDstRow0 name the same register, so execution simply
 ;// falls through into the shared store below.
 DCChroma8x8NoneAvailable

         VMOV     dConst128U8,#0x80                          ;// 0x8080808080808080 if(count == 0)
         MOV      return, #OMX_Sts_NoErr                     ;// returnNoError

 ;// Shared store that writes the same 8-byte row (dDstRow0) to all eight
 ;// destination rows.  Also used by the upper-only DC path and by
 ;// OMX_VC_CHROMA_VERT.
 DCChroma8x8UpperStore

         ADD     pTmp, pDst, dstStep
         ADD     step, dstStep, dstStep

         VST1     dDstRow0,[pDst],step                    ;// pDst[0*dstStep+x] :0<= x <= 7
         VST1     dDstRow0,[pTmp],step                    ;// pDst[1*dstStep+x] :0<= x <= 7
         VST1     dDstRow0,[pDst],step                    ;// pDst[2*dstStep+x] :0<= x <= 7
         VST1     dDstRow0,[pTmp],step                    ;// pDst[3*dstStep+x] :0<= x <= 7
         VST1     dDstRow0,[pDst],step                    ;// pDst[4*dstStep+x] :0<= x <= 7
         VST1     dDstRow0,[pTmp],step                    ;// pDst[5*dstStep+x] :0<= x <= 7
         VST1     dDstRow0,[pDst],step                    ;// pDst[6*dstStep+x] :0<= x <= 7
         VST1     dDstRow0,[pTmp]                         ;// pDst[7*dstStep+x] :0<= x <= 7

         M_EXIT


 ;// Vertical prediction: the eight pixels above the block are copied into
 ;// every row.  dAboveVal and dDstRow0 are the same register, so the shared
 ;// store can be reused unchanged.  'availability' is not consulted here.
 OMX_VC_CHROMA_VERT

         VLD1     dAboveVal,[pSrcAbove]                      ;// pSrcAbove[x]      :0<= x <= 7
         MOV      return, #OMX_Sts_NoErr

         B        DCChroma8x8UpperStore


 ;// Horizontal prediction: each row is filled with its own left-edge pixel.
 ;// The all-lanes form of VLD1 replicates one byte across a D register;
 ;// dLeftVal0..7 are D0..D7, the same registers the plane store writes out
 ;// as dSum0..7.  The status code is set after the stores there, because
 ;// r0 is still in use as pSrcLeft here.
 OMX_VC_CHROMA_HOR

         ADD     pTmp, pSrcLeft, leftStep
         ADD     step, leftStep, leftStep

         VLD1    {dLeftVal0[]},[pSrcLeft],step           ;// pSrcLeft[0*leftStep]
         VLD1    {dLeftVal1[]},[pTmp],step               ;// pSrcLeft[1*leftStep]
         VLD1    {dLeftVal2[]},[pSrcLeft],step           ;// pSrcLeft[2*leftStep]
         VLD1    {dLeftVal3[]},[pTmp],step               ;// pSrcLeft[3*leftStep]
         VLD1    {dLeftVal4[]},[pSrcLeft],step           ;// pSrcLeft[4*leftStep]
         VLD1    {dLeftVal5[]},[pTmp],step               ;// pSrcLeft[5*leftStep]
         VLD1    {dLeftVal6[]},[pSrcLeft],step           ;// pSrcLeft[6*leftStep]
         VLD1    {dLeftVal7[]},[pTmp]                    ;// pSrcLeft[7*leftStep]

         B        DCChroma8x8PlaneStore


 ;// Plane prediction.  Outline of what the code below computes:
 ;//   H = 1*(A[4]-A[2]) + 2*(A[5]-A[1]) + 3*(A[6]-A[0]) + 4*(A[7]-AL)
 ;//   V = same expression using the left column L[] in place of A[]
 ;//   b = (17*H + 16) >> 5,  c = (17*V + 16) >> 5,  a = 16*(A[7] + L[7])
 ;//   pDst[y][x] = clip_u8((a + b*(x-3) + c*(y-3) + 16) >> 5)
 ;// where A = pSrcAbove, L = pSrcLeft, AL = *pSrcAboveLeft.
 ;// Lane lists in the comments are written highest lane first.
 OMX_VC_CHROMA_PLANE
         ADD     pTmp, pSrcLeft, leftStep
         ADD     step, leftStep, leftStep

         VLD1    dAboveVal,[pSrcAbove]                       ;// pSrcAbove[x]      :0<= x <= 7
         VLD1    dAboveLeftVal[0],[pSrcAboveLeft]            ;// only lane 0 is loaded and used

         VLD1    {dLeftVal[0]},[pSrcLeft],step               ;// pSrcLeft[0*leftStep]
         VLD1    {dLeftVal[1]},[pTmp],step                   ;// pSrcLeft[1*leftStep]
         VLD1    {dLeftVal[2]},[pSrcLeft],step               ;// pSrcLeft[2*leftStep]
         VLD1    {dLeftVal[3]},[pTmp],step                   ;// pSrcLeft[3*leftStep]
         VLD1    {dLeftVal[4]},[pSrcLeft],step               ;// pSrcLeft[4*leftStep]
         VLD1    {dLeftVal[5]},[pTmp],step                   ;// pSrcLeft[5*leftStep]
         VLD1    {dLeftVal[6]},[pSrcLeft],step               ;// pSrcLeft[6*leftStep]
         VLD1    {dLeftVal[7]},[pTmp]                        ;// pSrcLeft[7*leftStep]


         VREV64  dRevAboveVal,dAboveVal                      ;// Reverse order of bytes = pSrcAbove[0:1:2:3:4:5:6:7]
         VSUBL   qAbove7minus0,dRevAboveVal,dAboveLeftVal    ;// qAbove7minus0[0] = pSrcAbove[7] - pSrcAboveLeft[0]
         VSHR    dRevAboveValU64,dRevAboveValU64,#8          ;// pSrcAbove[X:0:1:2:3:4:5:6]
         VSUBL   qAboveDiff,dRevAboveVal,dAboveVal           ;// pSrcAbove[6] - pSrcAbove[0]
                                                             ;// pSrcAbove[5] - pSrcAbove[1]
                                                             ;// pSrcAbove[4] - pSrcAbove[2]

         VREV64  dRevLeftVal,dLeftVal                        ;// Reverse order of bytes = pSrcLeft[0:1:2:3:4:5:6:7]
         VSUBL   qLeft7minus0,dRevLeftVal,dAboveLeftVal      ;// qLeft7minus0[0] = pSrcLeft[7] - pSrcAboveLeft[0]
         VSHR    dRevLeftValU64,dRevLeftValU64,#8            ;// pSrcLeft[X:0:1:2:3:4:5:6]
         VSUBL   qLeftDiff,dRevLeftVal,dLeftVal              ;// pSrcLeft[6] - pSrcLeft[0]
                                                             ;// pSrcLeft[5] - pSrcLeft[1]
                                                             ;// pSrcLeft[4] - pSrcLeft[2]

         ;// pSrcAboveLeft (r2) has been consumed, so r2 is reused as the table pointer
         LDR     pMultiplierTable,=armVCM4P10_MultiplierTableChroma8x8   ;// Used to calculate Hval & Vval
         ;// Shift the three differences up one halfword, then VEXT by two bytes
         ;// drops the vacated lane and appends the (7 - aboveLeft) term on top.
         VSHL    dAboveDiff0U64,dAboveDiff0U64,#16
         VEXT    dDiffAboveU8,dAboveDiff0U8,dAbove7minus0U8,#2           ;// pSrcAbove[ 7-AL | 4-2 | 5-1 | 6-0 ]
         VLD1    dMultiplier,[pMultiplierTable]!                         ;// dMultiplier = [ 4 | 1 | 2 | 3 ], pointer advances
         VSHL    dLeftDiff0U64,dLeftDiff0U64,#16
         VEXT    dDiffLeftU8,dLeftDiff0U8,dLeft7minus0U8,#2              ;// pSrcLeft[ 7-AL | 4-2 | 5-1 | 6-0 ]


         VMUL    dHorPred,dDiffAboveS16,dMultiplier                      ;// pSrcAbove[ 4*(7-AL) | 1*(4-2) | 2*(5-1) | 3*(6-0) ]
         VMUL    dVerPred,dDiffLeftS16,dMultiplier                       ;// same weighting applied to the left-column differences
         VPADD   dHVValS16,dHorPred,dVerPred                             ;// pairwise sums: low two lanes from above, high two from left


         VPADDL  dHVValS32,dHVValS16                                     ;// [V|H] in 32 bits each
         VSHL    dHVTempS32,dHVValS32,#4                                 ;// 17*H = 16*H + H = (H<<4)+H
         VADD    dHVValS32,dHVValS32,dHVTempS32                          ;// [ 17*V  | 17*H ]in 32 bits each
         VLD1    {dMultiplier0,dMultiplier1},[pMultiplierTable]          ;// qMultiplier = [ 4|3|2|1|0|-1|-2|-3 ]
         VRSHR   dHVValS32,dHVValS32,#5                                  ;// [c|b]: b read from S16 lane 0, c from S16 lane 2
         VADDL   qA,dAboveVal,dLeftVal                                   ;// lane 7 = pSrcAbove[7] + pSrcLeft[7]
         VDUP    qA,qA[7]
         VSHL    qA,qA,#4                                                ;// [a|a|a|a|a|a|a|a]
         VDUP    qB,dHVValS16[0]                                         ;// [b|b|b|b|b|b|b|b]
         VDUP    qC,dHVValS16[2]                                         ;// [c|c|c|c|c|c|c|c]


         VMUL    qB,qB,qMultiplier                                       ;// lane x = b*(x-3)
         VMUL    qC,qC,qMultiplier                                       ;// lane y = c*(y-3)
         VADD    qB,qB,qA                                                ;// lane x = a + b*(x-3), shared by every row

         ;// Broadcast each row's c*(y-3) term.  qC3 is the same register as
         ;// qMultiplier, which is no longer needed at this point.
         VDUP    qC0,qC[0]
         VDUP    qC1,qC[1]
         VDUP    qC2,qC[2]
         VDUP    qC3,qC[3]
         VDUP    qC4,qC[4]
         VDUP    qC5,qC[5]
         VDUP    qC6,qC[6]
         VDUP    qC7,qC[7]

         VADD    qSum0,qB,qC0
         VADD    qSum1,qB,qC1
         VADD    qSum2,qB,qC2
         VADD    qSum3,qB,qC3
         VADD    qSum4,qB,qC4
         VADD    qSum5,qB,qC5
         VADD    qSum6,qB,qC6
         VADD    qSum7,qB,qC7

         ;// Narrow in ascending order: each dSumN overwrites only Q registers
         ;// whose sums have already been narrowed (or qB/qC, which are done).
         VQRSHRUN dSum0,qSum0,#5                         ;// (OMX_U8)armClip(0,255,(Sum+16)>>5)
         VQRSHRUN dSum1,qSum1,#5
         VQRSHRUN dSum2,qSum2,#5
         VQRSHRUN dSum3,qSum3,#5
         VQRSHRUN dSum4,qSum4,#5
         VQRSHRUN dSum5,qSum5,#5
         VQRSHRUN dSum6,qSum6,#5
         VQRSHRUN dSum7,qSum7,#5

 ;// Shared store of eight distinct rows held in D0..D7.
 ;// Also entered from OMX_VC_CHROMA_HOR.
 DCChroma8x8PlaneStore
         ADD     pTmp, pDst, dstStep
         ADD     step, dstStep, dstStep

         VST1    dSum0,[pDst],step                    ;// pDst[0*dstStep+x] :0<= x <= 7
         VST1    dSum1,[pTmp],step                    ;// pDst[1*dstStep+x] :0<= x <= 7
         VST1    dSum2,[pDst],step                    ;// pDst[2*dstStep+x] :0<= x <= 7
         VST1    dSum3,[pTmp],step                    ;// pDst[3*dstStep+x] :0<= x <= 7
         VST1    dSum4,[pDst],step                    ;// pDst[4*dstStep+x] :0<= x <= 7
         VST1    dSum5,[pTmp],step                    ;// pDst[5*dstStep+x] :0<= x <= 7
         VST1    dSum6,[pDst],step                    ;// pDst[6*dstStep+x] :0<= x <= 7
         VST1    dSum7,[pTmp]                         ;// pDst[7*dstStep+x] :0<= x <= 7

         MOV     return, #OMX_Sts_NoErr               ;// set last: r0 was pSrcLeft until the loads finished
         M_END

         ENDIF ;// CortexA8

         END
 ;//-----------------------------------------------------------------------------------------------
 ;// omxVCM4P10_PredictIntraChroma_8x8 ends
 ;//-----------------------------------------------------------------------------------------------
	;//
	;// Copyright (C) 2007-2008 ARM Limited
	;//
	;// Licensed under the Apache License, Version 2.0 (the "License");
	;// you may not use this file except in compliance with the License.
	;// You may obtain a copy of the License at
	;//
	;// http://www.apache.org/licenses/LICENSE-2.0
	;//
	;// Unless required by applicable law or agreed to in writing, software
	;// distributed under the License is distributed on an "AS IS" BASIS,
	;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	;// See the License for the specific language governing permissions and
	;// limitations under the License.
	;//
	;//
	;//
	;// File Name: omxVCM4P10_PredictIntraChroma_8x8_s.s
	;// OpenMAX DL: v1.0.2
	;// Revision: 12290
	;// Date: Wednesday, April 9, 2008
	;//
	;//
	;//
	;//


	INCLUDE omxtypes_s.h
	INCLUDE armCOMM_s.h

	EXPORT armVCM4P10_pIndexTable8x8

	;// Define the processor variants supported by this file

	M_VARIANTS CortexA8

	AREA table, DATA
	;//-------------------------------------------------------
	;// Jump table that implements a C-style switch in asm:
	;// entry i holds the address of the handler label for
	;// predMode == i.  The dispatcher loads pc directly from
	;// armVCM4P10_pIndexTable8x8[predMode].
	;//-------------------------------------------------------

	M_TABLE armVCM4P10_pIndexTable8x8
	DCD OMX_VC_CHROMA_DC, OMX_VC_CHROMA_HOR
	DCD OMX_VC_CHROMA_VERT, OMX_VC_CHROMA_PLANE

	;// Halfword constants for the plane predictor.  The first four
	;// values weight the edge differences that form H and V; the
	;// remaining eight (-3..4) are the per-column / per-row offsets
	;// that multiply the b and c gradients.
	M_TABLE armVCM4P10_MultiplierTableChroma8x8,1
	DCW 3, 2, 1,4
	DCW -3,-2,-1,0
	DCW 1, 2, 3,4



	IF CortexA8

	;//--------------------------------------------
	;// Scratch variable
	;//--------------------------------------------

	pc RN 15 ;// written directly by the predMode dispatch load
	return RN 0 ;// status result; shares r0 with pSrcLeft
	pTable RN 8 ;// base address of armVCM4P10_pIndexTable8x8

	;//--------------------------------------------
	;// Input Arguments
	;// r0-r3 are used as passed in; r4-r7 are filled from the
	;// stack arguments by M_LDR in the function header.
	;//--------------------------------------------
	pSrcLeft RN 0 ;// input pointer
	pSrcAbove RN 1 ;// input pointer
	pSrcAboveLeft RN 2 ;// input pointer
	pDst RN 3 ;// output pointer
	leftStep RN 4 ;// input variable
	dstStep RN 5 ;// input variable
	predMode RN 6 ;// input variable
	availability RN 7 ;// input variable
	pMultiplierTable RN 2 ;// reuses r2 once pSrcAboveLeft has been read (plane mode)

	pTmp RN 9 ;// second row pointer: base + one step
	step RN 10 ;// twice the row stride, so two pointers walk alternate rows

	;//---------------------
	;// Neon Registers
	;//---------------------

	;// OMX_VC_CHROMA_HOR

	;// One D register per output row; each is filled by an
	;// all-lanes load and stored through the dSum0..dSum7 aliases.
	dLeftVal0 DN D0.8
	dLeftVal1 DN D1.8
	dLeftVal2 DN D2.8
	dLeftVal3 DN D3.8
	dLeftVal4 DN D4.8
	dLeftVal5 DN D5.8
	dLeftVal6 DN D6.8
	dLeftVal7 DN D7.8

	;// OMX_VC_CHROMA_VERT

	dAboveVal DN D0.U8 ;// same register as dDstRow0, so it can be stored as-is

	;// OMX_VC_CHROMA_DC
	;// Note: several aliases below name the same physical register
	;// (e.g. D2 is dSumAboveValU16, dSumLeftValU16 and dSumAboveLeft),
	;// so the instruction order in the DC paths matters.

	dLeftVal DN D1.U8
	dSumAboveValU16 DN D2.U16
	dSumAboveValU32 DN D3.U32
	dSumAboveValU8 DN D3.U8
	dSumLeftValU16 DN D2.U16
	dSumLeftValU32 DN D1.U32
	dSumLeftValU8 DN D1.U8
	dSumAboveLeft DN D2.U32
	dSumAboveLeftU8 DN D2.U8
	dIndexRow0U8 DN D5.U8 ;// VTBL byte indices for rows 0-3
	dIndexRow0 DN D5.U64
	dIndexRow4U8 DN D6.U8 ;// VTBL byte indices for rows 4-7
	dIndexRow4 DN D6.U64
	dDstRow0 DN D0.U8 ;// pixel row written to rows 0-3
	dDstRow4 DN D4.U8 ;// pixel row written to rows 4-7
	dConst128U8 DN D0.U8 ;// aliases dDstRow0 so the constant is stored directly

	;// OMX_VC_CHROMA_PLANE
	;// Registers are heavily reused here: the qC0..qC7 / qSum0..qSum7
	;// aliases overlap earlier temporaries (qC3 and qSum3 are the same
	;// register as qMultiplier), and dSum0..dSum7 overlap the low Q
	;// registers once those have been consumed.

	dRevAboveVal DN D3.U8
	dRevAboveValU64 DN D3.U64
	dAboveLeftVal DN D2.U8
	qAbove7minus0 QN Q3.S16
	qAboveDiff QN Q2.S16
	dIndex DN D8.U8 ;// not referenced by the code in this file
	dDiffAboveU8 DN D9.U8
	dDiffAboveS16 DN D9.S16
	dAboveDiff0U8 DN D4.U8 ;// low half of qAboveDiff
	dAboveDiff0U64 DN D4.U64
	dAbove7minus0U8 DN D6.U8 ;// low half of qAbove7minus0
	dMultiplier DN D10.S16
	dHorPred DN D11.S16
	dRevLeftVal DN D3.U8
	dRevLeftValU64 DN D3.U64
	qLeft7minus0 QN Q7.S16
	qLeftDiff QN Q6.S16
	dDiffLeftU8 DN D16.U8
	dDiffLeftS16 DN D16.S16
	dLeftDiff0U8 DN D12.U8 ;// low half of qLeftDiff
	dLeftDiff0U64 DN D12.U64
	dLeft7minus0U8 DN D14.U8 ;// low half of qLeft7minus0
	dVerPred DN D3.S16
	dHVValS16 DN D3.S16
	dHVValS32 DN D3.S32
	dHVTempS32 DN D2.S32
	qA QN Q0.S16
	qB QN Q2.S16
	qC QN Q3.S16
	qMultiplier QN Q5.S16 ;// D10:D11, i.e. dMultiplier0:dMultiplier1
	dMultiplier0 DN D10.S16
	dMultiplier1 DN D11.S16
	qC0 QN Q0.S16
	qC1 QN Q1.S16
	qC2 QN Q4.S16
	qC3 QN Q5.S16
	qC4 QN Q6.S16
	qC5 QN Q7.S16
	qC6 QN Q8.S16
	qC7 QN Q9.S16
	qSum0 QN Q0.S16
	qSum1 QN Q1.S16
	qSum2 QN Q4.S16
	qSum3 QN Q5.S16
	qSum4 QN Q6.S16
	qSum5 QN Q7.S16
	qSum6 QN Q8.S16
	qSum7 QN Q9.S16
	dSum0 DN D0.U8 ;// final 8-pixel output rows 0..7
	dSum1 DN D1.U8
	dSum2 DN D2.U8
	dSum3 DN D3.U8
	dSum4 DN D4.U8
	dSum5 DN D5.U8
	dSum6 DN D6.U8
	dSum7 DN D7.U8

	;//-----------------------------------------------------------------------------------------------
	;// omxVCM4P10_PredictIntraChroma_8x8 starts
	;//-----------------------------------------------------------------------------------------------

	;// Write function header
	;// Function entry: pSrcLeft, pSrcAbove, pSrcAboveLeft and pDst
	;// arrive in r0-r3; the remaining four arguments are on the stack.
	;// NOTE(review): the "r10, d15" operands presumably tell M_START
	;// which callee-saved registers to preserve - confirm in armCOMM_s.h.
	M_START omxVCM4P10_PredictIntraChroma_8x8, r10, d15

	;// Define stack arguments (4 bytes each, in call order)
	M_ARG LeftStep, 4
	M_ARG DstStep, 4
	M_ARG PredMode, 4
	M_ARG Availability, 4

	LDR pTable,=armVCM4P10_pIndexTable8x8 ;// Load index table for switch case

	;// Load argument from the stack
	M_LDR predMode, PredMode ;// Arg predMode loaded from stack to reg
	M_LDR leftStep, LeftStep ;// Arg leftStep loaded from stack to reg
	M_LDR dstStep, DstStep ;// Arg dstStep loaded from stack to reg
	M_LDR availability, Availability ;// Arg availability loaded from stack to reg


	;// pc = pTable[predMode].  predMode is not range-checked here, so the
	;// caller must pass a value that indexes one of the four table entries.
	LDR pc, [pTable, predMode, LSL #2] ;// Branch to the case based on predMode

	;// DC prediction.  This entry handles "left available"; it branches away
	;// when the left edge is missing, and to the left-only path when the
	;// upper edge is missing.  With both edges present each 4x4 quadrant gets
	;// its own rounded average (see the VTBL index construction below).
	OMX_VC_CHROMA_DC

	TST availability, #OMX_VC_LEFT
	BEQ DCChroma8x8LeftNotAvailable

	;// Two pointers one row apart, both advancing by 2*leftStep,
	;// so even rows come from pSrcLeft and odd rows from pTmp.
	ADD pTmp, pSrcLeft, leftStep
	ADD step, leftStep, leftStep

	;// Load Left Edge
	VLD1 {dLeftVal[0]},[pSrcLeft],step ;// pSrcLeft[0*leftStep]
	VLD1 {dLeftVal[1]},[pTmp],step ;// pSrcLeft[1*leftStep]
	VLD1 {dLeftVal[2]},[pSrcLeft],step ;// pSrcLeft[2*leftStep]
	VLD1 {dLeftVal[3]},[pTmp],step ;// pSrcLeft[3*leftStep]
	VLD1 {dLeftVal[4]},[pSrcLeft],step ;// pSrcLeft[4*leftStep]
	VLD1 {dLeftVal[5]},[pTmp],step ;// pSrcLeft[5*leftStep]
	VLD1 {dLeftVal[6]},[pSrcLeft],step ;// pSrcLeft[6*leftStep]
	VLD1 {dLeftVal[7]},[pTmp] ;// pSrcLeft[7*leftStep]

	TST availability, #OMX_VC_UPPER
	BEQ DCChroma8x8LeftOnlyAvailable

	;// Load Upper Edge also
	VLD1 dAboveVal,[pSrcAbove] ;// pSrcAbove[0 to 7]

	;// r0 (pSrcLeft) is no longer needed, so it can take the status now
	MOV return, #OMX_Sts_NoErr ;// returnNoError

	VPADDL dSumAboveValU16, dAboveVal ;// pSrcAbove[ 6+7 | 4+5 | 2+3 | 0+1 ]
	VPADDL dSumAboveValU32, dSumAboveValU16 ;// pSrcAbove[ 4+5+6+7 | 0+1+2+3 ]

	VPADDL dSumLeftValU16, dLeftVal ;// pSrcLeft[ 6+7 | 4+5 | 2+3 | 0+1 ]
	VPADDL dSumLeftValU32, dSumLeftValU16 ;// pSrcLeft[ 4+5+6+7 | 0+1+2+3 ]

	VADD dSumAboveLeft,dSumAboveValU32,dSumLeftValU32
	VRSHR dSumAboveLeft,dSumAboveLeft,#3 ;// Sum = (Sum + 4) >> 3
	VRSHR dSumAboveValU32,dSumAboveValU32,#2 ;// Sum = (Sum + 2) >> 2
	VRSHR dSumLeftValU32,dSumLeftValU32,#2 ;// Sum = (Sum + 2) >> 2

	;// Build VTBL byte indices.  In a two-register table, index 0x00/0x04
	;// selects the low byte of the first register's 32-bit lanes and 0x0c
	;// the low byte of the second register's upper 32-bit lane.
	VMOV dIndexRow0U8,#0x0c
	VMOV dIndexRow4U8,#0x04
	VSHL dIndexRow0,dIndexRow0,#32 ;// index0 = 0x0c0c0c0c00000000
	VSHR dIndexRow4,dIndexRow4,#32 ;// index4 = 0x0000000004040404
	VADD dIndexRow4U8,dIndexRow4U8,dIndexRow0U8 ;// index4 = 0x0c0c0c0c04040404
	;// Rows 0-3: x=0..3 <- (above[0..3]+left[0..3]) average, x=4..7 <- above[4..7] average
	VTBL dDstRow0,{dSumAboveLeftU8,dSumAboveValU8},dIndexRow0U8
	;// Rows 4-7: x=0..3 <- left[4..7] average, x=4..7 <- (above[4..7]+left[4..7]) average
	VTBL dDstRow4,{dSumLeftValU8,dSumAboveLeftU8},dIndexRow4U8

	;// Shared store: rows 0-3 take dDstRow0, rows 4-7 take dDstRow4.
	;// Also entered from DCChroma8x8LeftOnlyAvailable.
	DCChroma8x8LeftStore
	ADD pTmp, pDst, dstStep
	ADD step, dstStep, dstStep

	VST1 dDstRow0,[pDst],step ;// pDst[0*dstStep+x] :0<= x <= 7
	VST1 dDstRow0,[pTmp],step ;// pDst[1*dstStep+x] :0<= x <= 7
	VST1 dDstRow0,[pDst],step ;// pDst[2*dstStep+x] :0<= x <= 7
	VST1 dDstRow0,[pTmp],step ;// pDst[3*dstStep+x] :0<= x <= 7
	VST1 dDstRow4,[pDst],step ;// pDst[4*dstStep+x] :0<= x <= 7
	VST1 dDstRow4,[pTmp],step ;// pDst[5*dstStep+x] :0<= x <= 7
	VST1 dDstRow4,[pDst],step ;// pDst[6*dstStep+x] :0<= x <= 7
	VST1 dDstRow4,[pTmp] ;// pDst[7*dstStep+x] :0<= x <= 7

	M_EXIT


	;// DC with only the left edge: dLeftVal has already been loaded by the
	;// caller path.  Rows 0-3 get the average of left[0..3] and rows 4-7 the
	;// average of left[4..7], each replicated across all eight columns.
	DCChroma8x8LeftOnlyAvailable

	MOV return, #OMX_Sts_NoErr

	VPADDL dSumLeftValU16, dLeftVal ;// pSrcLeft[ 6+7 | 4+5 | 2+3 | 0+1 ]
	VPADDL dSumLeftValU32, dSumLeftValU16 ;// pSrcLeft[ 4+5+6+7 | 0+1+2+3 ]
	VRSHR dSumLeftValU32,dSumLeftValU32,#2 ;// Sum = (Sum + 2) >> 2

	VDUP dDstRow0,dSumLeftValU8[0] ;// low byte of lane 0: left[0..3] average
	VDUP dDstRow4,dSumLeftValU8[4] ;// low byte of lane 1: left[4..7] average

	B DCChroma8x8LeftStore


	;// DC without the left edge.  If the upper edge is also missing, fall to
	;// the constant-fill path; otherwise every row becomes
	;// [ above[4..7] average x4 | above[0..3] average x4 ].
	DCChroma8x8LeftNotAvailable

	TST availability, #OMX_VC_UPPER
	BEQ DCChroma8x8NoneAvailable

	;// Load Upper Edge
	VLD1 dAboveVal,[pSrcAbove] ;// pSrcAbove[0 to 7]
	MOV return, #OMX_Sts_NoErr ;// returnNoError

	VPADDL dSumAboveValU16, dAboveVal ;// pSrcAbove[ 6+7 | 4+5 | 2+3 | 0+1 ]
	VPADDL dSumAboveValU32, dSumAboveValU16 ;// pSrcAbove[ 4+5+6+7 | 0+1+2+3 ]
	VRSHR dSumAboveValU32,dSumAboveValU32,#2 ;// Sum = (Sum + 2) >> 2
	VMOV dIndexRow0U8,#0x04
	VSHL dIndexRow0,dIndexRow0,#32 ;// index = 0x0404040400000000
	;// Single-register table: byte 0 = low byte of lane 0, byte 4 = low byte of lane 1
	VTBL dDstRow0,{dSumAboveValU8},dIndexRow0U8

	B DCChroma8x8UpperStore


	;// DC with neither edge available: fill the block with the constant 0x80.
	;// dConst128U8 and dDstRow0 name the same register, so execution simply
	;// falls through into the shared store below.
	DCChroma8x8NoneAvailable

	VMOV dConst128U8,#0x80 ;// 0x8080808080808080 if(count == 0)
	MOV return, #OMX_Sts_NoErr ;// returnNoError

	;// Shared store that writes the same 8-byte row (dDstRow0) to all eight
	;// destination rows.  Also used by the upper-only DC path and by
	;// OMX_VC_CHROMA_VERT.
	DCChroma8x8UpperStore

	ADD pTmp, pDst, dstStep
	ADD step, dstStep, dstStep

	VST1 dDstRow0,[pDst],step ;// pDst[0*dstStep+x] :0<= x <= 7
	VST1 dDstRow0,[pTmp],step ;// pDst[1*dstStep+x] :0<= x <= 7
	VST1 dDstRow0,[pDst],step ;// pDst[2*dstStep+x] :0<= x <= 7
	VST1 dDstRow0,[pTmp],step ;// pDst[3*dstStep+x] :0<= x <= 7
	VST1 dDstRow0,[pDst],step ;// pDst[4*dstStep+x] :0<= x <= 7
	VST1 dDstRow0,[pTmp],step ;// pDst[5*dstStep+x] :0<= x <= 7
	VST1 dDstRow0,[pDst],step ;// pDst[6*dstStep+x] :0<= x <= 7
	VST1 dDstRow0,[pTmp] ;// pDst[7*dstStep+x] :0<= x <= 7

	M_EXIT


	;// Vertical prediction: the eight pixels above the block are copied into
	;// every row.  dAboveVal and dDstRow0 are the same register, so the shared
	;// store can be reused unchanged.  'availability' is not consulted here.
	OMX_VC_CHROMA_VERT

	VLD1 dAboveVal,[pSrcAbove] ;// pSrcAbove[x] :0<= x <= 7
	MOV return, #OMX_Sts_NoErr

	B DCChroma8x8UpperStore


	;// Horizontal prediction: each row is filled with its own left-edge pixel.
	;// The all-lanes form of VLD1 replicates one byte across a D register;
	;// dLeftVal0..7 are D0..D7, the same registers the plane store writes out
	;// as dSum0..7.  The status code is set after the stores there, because
	;// r0 is still in use as pSrcLeft here.
	OMX_VC_CHROMA_HOR

	ADD pTmp, pSrcLeft, leftStep
	ADD step, leftStep, leftStep

	VLD1 {dLeftVal0[]},[pSrcLeft],step ;// pSrcLeft[0*leftStep]
	VLD1 {dLeftVal1[]},[pTmp],step ;// pSrcLeft[1*leftStep]
	VLD1 {dLeftVal2[]},[pSrcLeft],step ;// pSrcLeft[2*leftStep]
	VLD1 {dLeftVal3[]},[pTmp],step ;// pSrcLeft[3*leftStep]
	VLD1 {dLeftVal4[]},[pSrcLeft],step ;// pSrcLeft[4*leftStep]
	VLD1 {dLeftVal5[]},[pTmp],step ;// pSrcLeft[5*leftStep]
	VLD1 {dLeftVal6[]},[pSrcLeft],step ;// pSrcLeft[6*leftStep]
	VLD1 {dLeftVal7[]},[pTmp] ;// pSrcLeft[7*leftStep]

	B DCChroma8x8PlaneStore


	;// Plane prediction.  Outline of what the code below computes:
	;//   H = 1*(A[4]-A[2]) + 2*(A[5]-A[1]) + 3*(A[6]-A[0]) + 4*(A[7]-AL)
	;//   V = same expression using the left column L[] in place of A[]
	;//   b = (17*H + 16) >> 5,  c = (17*V + 16) >> 5,  a = 16*(A[7] + L[7])
	;//   pDst[y][x] = clip_u8((a + b*(x-3) + c*(y-3) + 16) >> 5)
	;// where A = pSrcAbove, L = pSrcLeft, AL = *pSrcAboveLeft.
	;// Lane lists in the comments are written highest lane first.
	OMX_VC_CHROMA_PLANE
	ADD pTmp, pSrcLeft, leftStep
	ADD step, leftStep, leftStep

	VLD1 dAboveVal,[pSrcAbove] ;// pSrcAbove[x] :0<= x <= 7
	VLD1 dAboveLeftVal[0],[pSrcAboveLeft] ;// only lane 0 is loaded and used

	VLD1 {dLeftVal[0]},[pSrcLeft],step ;// pSrcLeft[0*leftStep]
	VLD1 {dLeftVal[1]},[pTmp],step ;// pSrcLeft[1*leftStep]
	VLD1 {dLeftVal[2]},[pSrcLeft],step ;// pSrcLeft[2*leftStep]
	VLD1 {dLeftVal[3]},[pTmp],step ;// pSrcLeft[3*leftStep]
	VLD1 {dLeftVal[4]},[pSrcLeft],step ;// pSrcLeft[4*leftStep]
	VLD1 {dLeftVal[5]},[pTmp],step ;// pSrcLeft[5*leftStep]
	VLD1 {dLeftVal[6]},[pSrcLeft],step ;// pSrcLeft[6*leftStep]
	VLD1 {dLeftVal[7]},[pTmp] ;// pSrcLeft[7*leftStep]


	VREV64 dRevAboveVal,dAboveVal ;// Reverse order of bytes = pSrcAbove[0:1:2:3:4:5:6:7]
	VSUBL qAbove7minus0,dRevAboveVal,dAboveLeftVal ;// qAbove7minus0[0] = pSrcAbove[7] - pSrcAboveLeft[0]
	VSHR dRevAboveValU64,dRevAboveValU64,#8 ;// pSrcAbove[X:0:1:2:3:4:5:6]
	VSUBL qAboveDiff,dRevAboveVal,dAboveVal ;// pSrcAbove[6] - pSrcAbove[0]
	;// pSrcAbove[5] - pSrcAbove[1]
	;// pSrcAbove[4] - pSrcAbove[2]

	VREV64 dRevLeftVal,dLeftVal ;// Reverse order of bytes = pSrcLeft[0:1:2:3:4:5:6:7]
	VSUBL qLeft7minus0,dRevLeftVal,dAboveLeftVal ;// qLeft7minus0[0] = pSrcLeft[7] - pSrcAboveLeft[0]
	VSHR dRevLeftValU64,dRevLeftValU64,#8 ;// pSrcLeft[X:0:1:2:3:4:5:6]
	VSUBL qLeftDiff,dRevLeftVal,dLeftVal ;// pSrcLeft[6] - pSrcLeft[0]
	;// pSrcLeft[5] - pSrcLeft[1]
	;// pSrcLeft[4] - pSrcLeft[2]

	;// pSrcAboveLeft (r2) has been consumed, so r2 is reused as the table pointer
	LDR pMultiplierTable,=armVCM4P10_MultiplierTableChroma8x8 ;// Used to calculate Hval & Vval
	;// Shift the three differences up one halfword, then VEXT by two bytes
	;// drops the vacated lane and appends the (7 - aboveLeft) term on top.
	VSHL dAboveDiff0U64,dAboveDiff0U64,#16
	VEXT dDiffAboveU8,dAboveDiff0U8,dAbove7minus0U8,#2 ;// pSrcAbove[ 7-AL | 4-2 | 5-1 | 6-0 ]
	VLD1 dMultiplier,[pMultiplierTable]! ;// dMultiplier = [ 4 | 1 | 2 | 3 ], pointer advances
	VSHL dLeftDiff0U64,dLeftDiff0U64,#16
	VEXT dDiffLeftU8,dLeftDiff0U8,dLeft7minus0U8,#2 ;// pSrcLeft[ 7-AL | 4-2 | 5-1 | 6-0 ]


	VMUL dHorPred,dDiffAboveS16,dMultiplier ;// pSrcAbove[ 4*(7-AL) | 1*(4-2) | 2*(5-1) | 3*(6-0) ]
	VMUL dVerPred,dDiffLeftS16,dMultiplier ;// same weighting applied to the left-column differences
	VPADD dHVValS16,dHorPred,dVerPred ;// pairwise sums: low two lanes from above, high two from left


	VPADDL dHVValS32,dHVValS16 ;// [V|H] in 32 bits each
	VSHL dHVTempS32,dHVValS32,#4 ;// 17*H = 16*H + H = (H<<4)+H
	VADD dHVValS32,dHVValS32,dHVTempS32 ;// [ 17*V | 17*H ]in 32 bits each
	VLD1 {dMultiplier0,dMultiplier1},[pMultiplierTable] ;// qMultiplier = [ 4|3|2|1|0|-1|-2|-3 ]
	VRSHR dHVValS32,dHVValS32,#5 ;// [c|b]: b read from S16 lane 0, c from S16 lane 2
	VADDL qA,dAboveVal,dLeftVal ;// lane 7 = pSrcAbove[7] + pSrcLeft[7]
	VDUP qA,qA[7]
	VSHL qA,qA,#4 ;// [a|a|a|a|a|a|a|a]
	VDUP qB,dHVValS16[0] ;// [b|b|b|b|b|b|b|b]
	VDUP qC,dHVValS16[2] ;// [c|c|c|c|c|c|c|c]


	VMUL qB,qB,qMultiplier ;// lane x = b*(x-3)
	VMUL qC,qC,qMultiplier ;// lane y = c*(y-3)
	VADD qB,qB,qA ;// lane x = a + b*(x-3), shared by every row

	;// Broadcast each row's c*(y-3) term.  qC3 is the same register as
	;// qMultiplier, which is no longer needed at this point.
	VDUP qC0,qC[0]
	VDUP qC1,qC[1]
	VDUP qC2,qC[2]
	VDUP qC3,qC[3]
	VDUP qC4,qC[4]
	VDUP qC5,qC[5]
	VDUP qC6,qC[6]
	VDUP qC7,qC[7]

	VADD qSum0,qB,qC0
	VADD qSum1,qB,qC1
	VADD qSum2,qB,qC2
	VADD qSum3,qB,qC3
	VADD qSum4,qB,qC4
	VADD qSum5,qB,qC5
	VADD qSum6,qB,qC6
	VADD qSum7,qB,qC7

	;// Narrow in ascending order: each dSumN overwrites only Q registers
	;// whose sums have already been narrowed (or qB/qC, which are done).
	VQRSHRUN dSum0,qSum0,#5 ;// (OMX_U8)armClip(0,255,(Sum+16)>>5)
	VQRSHRUN dSum1,qSum1,#5
	VQRSHRUN dSum2,qSum2,#5
	VQRSHRUN dSum3,qSum3,#5
	VQRSHRUN dSum4,qSum4,#5
	VQRSHRUN dSum5,qSum5,#5
	VQRSHRUN dSum6,qSum6,#5
	VQRSHRUN dSum7,qSum7,#5

	;// Shared store of eight distinct rows held in D0..D7.
	;// Also entered from OMX_VC_CHROMA_HOR.
	DCChroma8x8PlaneStore
	ADD pTmp, pDst, dstStep
	ADD step, dstStep, dstStep

	VST1 dSum0,[pDst],step ;// pDst[0*dstStep+x] :0<= x <= 7
	VST1 dSum1,[pTmp],step ;// pDst[1*dstStep+x] :0<= x <= 7
	VST1 dSum2,[pDst],step ;// pDst[2*dstStep+x] :0<= x <= 7
	VST1 dSum3,[pTmp],step ;// pDst[3*dstStep+x] :0<= x <= 7
	VST1 dSum4,[pDst],step ;// pDst[4*dstStep+x] :0<= x <= 7
	VST1 dSum5,[pTmp],step ;// pDst[5*dstStep+x] :0<= x <= 7
	VST1 dSum6,[pDst],step ;// pDst[6*dstStep+x] :0<= x <= 7
	VST1 dSum7,[pTmp] ;// pDst[7*dstStep+x] :0<= x <= 7

	MOV return, #OMX_Sts_NoErr ;// set last: r0 was pSrcLeft until the loads finished
	M_END

	ENDIF ;// CortexA8

	END
	;//-----------------------------------------------------------------------------------------------
	;// omxVCM4P10_PredictIntraChroma_8x8 ends
	;//-----------------------------------------------------------------------------------------------