;// Source: av/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s - nest-cam/4320010/av - Git at Google

 ;//
 ;// Copyright (C) 2007-2008 ARM Limited
 ;//
 ;// Licensed under the Apache License, Version 2.0 (the "License");
 ;// you may not use this file except in compliance with the License.
 ;// You may obtain a copy of the License at
 ;//
 ;//      http://www.apache.org/licenses/LICENSE-2.0
 ;//
 ;// Unless required by applicable law or agreed to in writing, software
 ;// distributed under the License is distributed on an "AS IS" BASIS,
 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ;// See the License for the specific language governing permissions and
 ;// limitations under the License.
 ;//
 ;//
 ;//
 ;// File Name:  armVCM4P10_TransformResidual4x4_s.s
 ;// OpenMAX DL: v1.0.2
 ;// Revision:   12290
 ;// Date:       Wednesday, April 9, 2008
 ;//
 ;//
 ;//
 ;//
 ;// Description:
 ;// Transform Residual 4x4 Coefficients
 ;//
 ;//


 ;// Include standard headers

         INCLUDE omxtypes_s.h
         INCLUDE armCOMM_s.h

         M_VARIANTS CortexA8

 ;// Import symbols required from other files
 ;// (For example tables)


 ;// Set debugging level
 ;//DEBUG_ON    SETL {TRUE}


 ;// Guard the implementation by processor variant: the block below is
 ;// assembled only when CortexA8 is true

     IF  CortexA8

;//--------------------------------------------------------------------
;// Register aliases
;//
;// NOTE: symbol definitions (RN/DN/QN) and labels start in the first
;// column, as the armasm source-line syntax requires.
;//--------------------------------------------------------------------

;// ARM registers: function arguments
pDst                RN  0           ;// r0: where the 4x4 result is stored
pSrc                RN  1           ;// r1: where the 4x4 input is loaded from


;// NEON registers (every D alias is viewed as four signed 16-bit lanes)

;// Input as de-interleaved by VLD4: dInK receives element K of each
;// consecutive group of four 16-bit values
dIn0                DN  D0.S16
dIn1                DN  D1.S16
dIn2                DN  D2.S16
dIn3                DN  D3.S16

;// Intermediate values. D0-D3 and D5-D8 are reused from stage to stage,
;// so several aliases below name the same physical register.
dZero               DN  D4.S16      ;// all-zero operand for VHADD
de0                 DN  D5.S16      ;// first-pass butterfly terms e0..e3
de1                 DN  D6.S16
de2                 DN  D7.S16
de3                 DN  D8.S16
dIn1RS              DN  D7.S16      ;// dIn1 >> 1 (same register as de2)
dIn3RS              DN  D8.S16      ;// dIn3 >> 1 (same register as de3)
df0                 DN  D0.S16      ;// first-pass results f0..f3
df1                 DN  D1.S16      ;//   (overwrite dIn0..dIn3)
df2                 DN  D2.S16
df3                 DN  D3.S16
qf01                QN  Q0.32       ;// D0:D1 as 32-bit lanes, for the transpose
qf23                QN  Q1.32       ;// D2:D3 as 32-bit lanes, for the transpose
dg0                 DN  D5.S16      ;// second-pass butterfly terms g0..g3
dg1                 DN  D6.S16
dg2                 DN  D7.S16
dg3                 DN  D8.S16
df1RS               DN  D7.S16      ;// df1 >> 1 (same register as dg2)
df3RS               DN  D8.S16      ;// df3 >> 1 (same register as dg3)

;// Output values h0..h3 (overwrite df0..df3)
dh0                 DN  D0.S16
dh1                 DN  D1.S16
dh2                 DN  D2.S16
dh3                 DN  D3.S16


        ;// No stack locals are allocated by this function


        ;//------------------------------------------------------------
        ;// armVCM4P10_TransformResidual4x4
        ;//
        ;// In:    pSrc (r1) -> 16 signed 16-bit coefficients (32 bytes read)
        ;//        pDst (r0) -> 16 signed 16-bit results   (32 bytes written)
        ;// Uses:  D0-D8. d8 is named in the M_START register list,
        ;//        presumably so that M_START/M_END preserve it; those
        ;//        macros are defined outside this file - confirm there.
        ;//------------------------------------------------------------
        M_START armVCM4P10_TransformResidual4x4, ,d8

        ;//******************************************************************
        ;// Strategy:
        ;//  1. Load the 4x4 block into four D registers with VLD4; the
        ;//     de-interleaving load delivers it already transposed
        ;//  2. First pass: one butterfly, all four lanes in parallel
        ;//  3. Transpose the 4x4 intermediate matrix
        ;//  4. Second pass: the same butterfly on the transposed data
        ;//  5. Rounding shift right by 6
        ;//  6. Store the whole 4x4 block with a single VST1
        ;//******************************************************************

        ;// Load all 4x4 values in transposed form

        VLD4    {dIn0,dIn1,dIn2,dIn3},[pSrc]

        VMOV    dZero,#0                                    ;// zero operand: VHADD x,0 gives x >> 1


        ;//****************************************
        ;// First pass: row operations
        ;// (performed on columns of the loaded data)
        ;//****************************************


        VADD        de0,dIn0,dIn2                       ;//  e0 = d0 + d2
        VSUB        de1,dIn0,dIn2                       ;//  e1 = d0 - d2
        VHADD       dIn1RS,dIn1,dZero                   ;//  d1 >> 1 (halving add with zero)
        VHADD       dIn3RS,dIn3,dZero                   ;//  d3 >> 1 (halving add with zero)
        VSUB        de2,dIn1RS,dIn3                     ;//  e2 = (d1 >> 1) - d3
        VADD        de3,dIn1,dIn3RS                     ;//  e3 = d1 + (d3 >> 1)
        VADD        df0,de0,de3                         ;//  f0 = e0 + e3
        VADD        df1,de1,de2                         ;//  f1 = e1 + e2
        VSUB        df2,de1,de2                         ;//  f2 = e1 - e2
        VSUB        df3,de0,de3                         ;//  f3 = e0 - e3


        ;//*****************************************************************
        ;// Transpose the intermediate matrix: swap 16-bit lanes inside each
        ;// D-register pair, then 32-bit lanes across the two Q registers
        ;//*****************************************************************

        VTRN    df0,df1
        VTRN    df2,df3
        VTRN    qf01,qf23


        ;//*******************************
        ;// Second pass: column operations
        ;//*******************************


        VADD        dg0,df0,df2                         ;//  g0 = f0 + f2
        VSUB        dg1,df0,df2                         ;//  g1 = f0 - f2
        VHADD       df1RS,df1,dZero                     ;//  f1 >> 1 (halving add with zero)
        VHADD       df3RS,df3,dZero                     ;//  f3 >> 1 (halving add with zero)
        VSUB        dg2,df1RS,df3                       ;//  g2 = (f1 >> 1) - f3
        VADD        dg3,df1,df3RS                       ;//  g3 = f1 + (f3 >> 1)
        VADD        dh0,dg0,dg3                         ;//  h0 = g0 + g3
        VADD        dh1,dg1,dg2                         ;//  h1 = g1 + g2
        VSUB        dh2,dg1,dg2                         ;//  h2 = g1 - g2
        VSUB        dh3,dg0,dg3                         ;//  h3 = g0 - g3


        ;//************************************************
        ;// Final value: (h + 32) >> 6, done as a rounding
        ;// shift right by 6 on each result register
        ;//************************************************

        VRSHR       dh0,#6
        VRSHR       dh1,#6
        VRSHR       dh2,#6
        VRSHR       dh3,#6


        ;//***************************
        ;// Store all 4x4 results
        ;//***************************

        VST1   {dh0,dh1,dh2,dh3},[pDst]


        ;// No return value is set by this function

End


        ;// Write function tail
        M_END

     ENDIF                                                           ;// CortexA8

     END
	;//
	;// Copyright (C) 2007-2008 ARM Limited
	;//
	;// Licensed under the Apache License, Version 2.0 (the "License");
	;// you may not use this file except in compliance with the License.
	;// You may obtain a copy of the License at
	;//
	;// http://www.apache.org/licenses/LICENSE-2.0
	;//
	;// Unless required by applicable law or agreed to in writing, software
	;// distributed under the License is distributed on an "AS IS" BASIS,
	;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	;// See the License for the specific language governing permissions and
	;// limitations under the License.
	;//
	;//
	;//
	;// File Name: armVCM4P10_TransformResidual4x4_s.s
	;// OpenMAX DL: v1.0.2
	;// Revision: 12290
	;// Date: Wednesday, April 9, 2008
	;//
	;//
	;//
	;//
	;// Description:
	;// Transform Residual 4x4 Coefficients
	;//
	;//


	;// Include standard headers

	INCLUDE omxtypes_s.h
	INCLUDE armCOMM_s.h

	M_VARIANTS CortexA8

	;// Import symbols required from other files
	;// (For example tables)




	;// Set debugging level
	;//DEBUG_ON SETL {TRUE}



	;// Guard the implementation by processor variant: the block below is
	;// assembled only when CortexA8 is true

	IF CortexA8

;//--------------------------------------------------------------------
;// Register aliases
;//
;// NOTE: symbol definitions (RN/DN/QN) and labels start in the first
;// column, as the armasm source-line syntax requires.
;//--------------------------------------------------------------------

;// ARM registers: function arguments
pDst RN 0		;// r0: where the 4x4 result is stored
pSrc RN 1		;// r1: where the 4x4 input is loaded from


;// NEON registers (every D alias is viewed as four signed 16-bit lanes)

;// Input as de-interleaved by VLD4: dInK receives element K of each
;// consecutive group of four 16-bit values
dIn0 DN D0.S16
dIn1 DN D1.S16
dIn2 DN D2.S16
dIn3 DN D3.S16

;// Intermediate values. D0-D3 and D5-D8 are reused from stage to stage,
;// so several aliases below name the same physical register.
dZero DN D4.S16		;// all-zero operand for VHADD
de0 DN D5.S16		;// first-pass butterfly terms e0..e3
de1 DN D6.S16
de2 DN D7.S16
de3 DN D8.S16
dIn1RS DN D7.S16	;// dIn1 >> 1 (same register as de2)
dIn3RS DN D8.S16	;// dIn3 >> 1 (same register as de3)
df0 DN D0.S16		;// first-pass results f0..f3
df1 DN D1.S16		;//   (overwrite dIn0..dIn3)
df2 DN D2.S16
df3 DN D3.S16
qf01 QN Q0.32		;// D0:D1 as 32-bit lanes, for the transpose
qf23 QN Q1.32		;// D2:D3 as 32-bit lanes, for the transpose
dg0 DN D5.S16		;// second-pass butterfly terms g0..g3
dg1 DN D6.S16
dg2 DN D7.S16
dg3 DN D8.S16
df1RS DN D7.S16		;// df1 >> 1 (same register as dg2)
df3RS DN D8.S16		;// df3 >> 1 (same register as dg3)

;// Output values h0..h3 (overwrite df0..df3)
dh0 DN D0.S16
dh1 DN D1.S16
dh2 DN D2.S16
dh3 DN D3.S16


	;// No stack locals are allocated by this function


	;//------------------------------------------------------------
	;// armVCM4P10_TransformResidual4x4
	;//
	;// In:    pSrc (r1) -> 16 signed 16-bit coefficients (32 bytes read)
	;//        pDst (r0) -> 16 signed 16-bit results   (32 bytes written)
	;// Uses:  D0-D8. d8 is named in the M_START register list,
	;//        presumably so that M_START/M_END preserve it; those
	;//        macros are defined outside this file - confirm there.
	;//------------------------------------------------------------
	M_START armVCM4P10_TransformResidual4x4, ,d8

	;//******************************************************************
	;// Strategy:
	;//  1. Load the 4x4 block into four D registers with VLD4; the
	;//     de-interleaving load delivers it already transposed
	;//  2. First pass: one butterfly, all four lanes in parallel
	;//  3. Transpose the 4x4 intermediate matrix
	;//  4. Second pass: the same butterfly on the transposed data
	;//  5. Rounding shift right by 6
	;//  6. Store the whole 4x4 block with a single VST1
	;//******************************************************************

	;// Load all 4x4 values in transposed form

	VLD4 {dIn0,dIn1,dIn2,dIn3},[pSrc]

	VMOV dZero,#0 ;// zero operand: VHADD x,0 gives x >> 1


	;//****************************************
	;// First pass: row operations
	;// (performed on columns of the loaded data)
	;//****************************************


	VADD de0,dIn0,dIn2 ;// e0 = d0 + d2
	VSUB de1,dIn0,dIn2 ;// e1 = d0 - d2
	VHADD dIn1RS,dIn1,dZero ;// d1 >> 1 (halving add with zero)
	VHADD dIn3RS,dIn3,dZero ;// d3 >> 1 (halving add with zero)
	VSUB de2,dIn1RS,dIn3 ;// e2 = (d1 >> 1) - d3
	VADD de3,dIn1,dIn3RS ;// e3 = d1 + (d3 >> 1)
	VADD df0,de0,de3 ;// f0 = e0 + e3
	VADD df1,de1,de2 ;// f1 = e1 + e2
	VSUB df2,de1,de2 ;// f2 = e1 - e2
	VSUB df3,de0,de3 ;// f3 = e0 - e3



	;//*****************************************************************
	;// Transpose the intermediate matrix: swap 16-bit lanes inside each
	;// D-register pair, then 32-bit lanes across the two Q registers
	;//*****************************************************************

	VTRN df0,df1
	VTRN df2,df3
	VTRN qf01,qf23


	;//*******************************
	;// Second pass: column operations
	;//*******************************


	VADD dg0,df0,df2 ;// g0 = f0 + f2
	VSUB dg1,df0,df2 ;// g1 = f0 - f2
	VHADD df1RS,df1,dZero ;// f1 >> 1 (halving add with zero)
	VHADD df3RS,df3,dZero ;// f3 >> 1 (halving add with zero)
	VSUB dg2,df1RS,df3 ;// g2 = (f1 >> 1) - f3
	VADD dg3,df1,df3RS ;// g3 = f1 + (f3 >> 1)
	VADD dh0,dg0,dg3 ;// h0 = g0 + g3
	VADD dh1,dg1,dg2 ;// h1 = g1 + g2
	VSUB dh2,dg1,dg2 ;// h2 = g1 - g2
	VSUB dh3,dg0,dg3 ;// h3 = g0 - g3


	;//************************************************
	;// Final value: (h + 32) >> 6, done as a rounding
	;// shift right by 6 on each result register
	;//************************************************

	VRSHR dh0,#6
	VRSHR dh1,#6
	VRSHR dh2,#6
	VRSHR dh3,#6


	;//***************************
	;// Store all 4x4 results
	;//***************************

	VST1 {dh0,dh1,dh2,dh3},[pDst]


	;// No return value is set by this function

End


	;// Write function tail
	M_END

	ENDIF ;// CortexA8

	END