 ;// Origin: av/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s - nest-cam/4320010/av - Git at Google

 ;//
 ;// Copyright (C) 2007-2008 ARM Limited
 ;//
 ;// Licensed under the Apache License, Version 2.0 (the "License");
 ;// you may not use this file except in compliance with the License.
 ;// You may obtain a copy of the License at
 ;//
 ;//      http://www.apache.org/licenses/LICENSE-2.0
 ;//
 ;// Unless required by applicable law or agreed to in writing, software
 ;// distributed under the License is distributed on an "AS IS" BASIS,
 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ;// See the License for the specific language governing permissions and
 ;// limitations under the License.
 ;//
 ;//
 ;//
 ;// File Name:  armVCM4P10_TransformResidual4x4_s.s
 ;// OpenMAX DL: v1.0.2
 ;// Revision:   9641
 ;// Date:       Thursday, February 7, 2008
 ;//
 ;//
 ;//
 ;//
 ;// Description:
 ;// Transform Residual 4x4 Coefficients
 ;//
 ;//


 ;// Include standard headers

         INCLUDE omxtypes_s.h
         INCLUDE armCOMM_s.h

         M_VARIANTS ARM1136JS

 ;// Import symbols required from other files
 ;// (For example tables)


 ;// Set debugging level
 ;//DEBUG_ON    SETL {TRUE}


 ;// Guarding implementation by the processor name

     IF  ARM1136JS

 ;// Register aliases (RN) for armVCM4P10_TransformResidual4x4.
 ;// Many aliases below name the same physical register: each stage of the
 ;// routine reuses registers whose previous contents have already been
 ;// consumed, so an alias is only meaningful at the point in the function
 ;// body where that stage runs.
 ;// NOTE(review): r14 (the link register) is used as scratch here; this
 ;// relies on M_START/M_END (armCOMM_s.h) saving and restoring it - confirm.

 ;//Input Registers
 pDst                RN  0                   ;// Address the 8 result words are stored to (STMIA)
 pSrc                RN  1                   ;// Address the 8 input words are loaded from (LDMIA)

 ;//Output Registers
 ;// (none declared)


 ;//Local Scratch Registers

 ;// Packed input values: two 16-bit values per register
 in00                RN  2                   ;// Src[0] & Src[1]
 in02                RN  3                   ;// Src[2] & Src[3]
 in10                RN  4                   ;// Src[4] & Src[5]
 in12                RN  5                   ;// Src[6] & Src[7]
 in20                RN  6                   ;// Src[8] & Src[9]
 in22                RN  7                   ;// Src[10] & Src[11]
 in30                RN  8                   ;// Src[12] & Src[13]
 in32                RN  9                   ;// Src[14] & Src[15]

 ;// Transpose for Row operations (Rows to cols)
 ;// trRow00/02/12/22 overwrite in00/in02/in12/in22 (r2, r3, r5, r7);
 ;// the other four use r10, r11, r12 and r14.
 trRow00             RN  2
 trRow10             RN  10
 trRow02             RN  3
 trRow12             RN  5
 trRow20             RN  11
 trRow30             RN  12
 trRow32             RN  14
 trRow22             RN  7

 ;// Intermediate calculations of the row stage (reuse r4, r6, r8, r9)
 e0                  RN  4
 e1                  RN  6
 e2                  RN  8
 e3                  RN  9
 constZero           RN  1                   ;// Shares r1 with pSrc; loaded with 0 after the input block is read

 ;// Row operated pixels (same registers as the trRow aliases)
 rowOp00             RN  2
 rowOp10             RN  10
 rowOp20             RN  11
 rowOp30             RN  12
 rowOp02             RN  3
 rowOp12             RN  5
 rowOp22             RN  7
 rowOp32             RN  14

 ;// Transpose for column operations
 trCol00             RN  2
 trCol02             RN  3
 trCol10             RN  4
 trCol12             RN  5
 trCol20             RN  6
 trCol22             RN  7
 trCol30             RN  8
 trCol32             RN  9

 ;// Intermediate calculations of the column stage
 g0                  RN  10
 g1                  RN  11
 g2                  RN  12
 g3                  RN  14

 ;// Column operated pixels (same registers as the trCol aliases)
 colOp00             RN  2
 colOp02             RN  3
 colOp10             RN  4
 colOp12             RN  5
 colOp20             RN  6
 colOp22             RN  7
 colOp30             RN  8
 colOp32             RN  9


 temp1               RN  10                  ;// Temporary scratch variable; not referenced by the function body in this file
 const1              RN  11                  ;// Rounding/bias constant of the final stage (shares r11 with g1)
 const2              RN  12                  ;// Bias-removal constant of the final stage (shares r12 with g2)
 mask                RN  14                  ;// Bit mask of the final stage (shares r14 with g3)

 ;// Output pixels (same registers as the colOp aliases, stored in this order)
 out00               RN  2
 out02               RN  3
 out10               RN  4
 out12               RN  5
 out20               RN  6
 out22               RN  7
 out30               RN  8
 out32               RN  9


     ;// Allocate stack memory required by the function


     ;// Write function header
         M_START armVCM4P10_TransformResidual4x4,r11

         ;******************************************************************
         ;// armVCM4P10_TransformResidual4x4
         ;//
         ;// Transforms a 4x4 block of 16-bit residual coefficients and
         ;// rounds every result with (x + 32) >> 6.
         ;// NOTE(review): presumably the H.264 (MPEG-4 Part 10) inverse 4x4
         ;// integer transform, going by the M4P10 file name - confirm.
         ;//
         ;// In:    pDst (r0) - address the 8 result words are stored to
         ;//        pSrc (r1) - address the 8 input words are loaded from
         ;// Out:   8 words (16 halfwords) written at pDst; r0 is not
         ;//        modified by the instructions below.
         ;// Uses:  r1-r12 and r14 are overwritten. Saving/restoring of
         ;//        registers is left to M_START/M_END (armCOMM_s.h);
         ;//        NOTE(review): confirm that ",r11" covers r4-r11 and lr.
         ;//
         ;// Comments below assume the lower-indexed element of each pair
         ;// sits in the bottom halfword of its register, as the
         ;// "Src[0] & Src[1]" alias comments suggest.
         ;//
         ;// The strategy used in implementing the transform is as follows:
         ;//   1. Load the 4x4 block into 8 registers
         ;//   2. Transpose the 4x4 matrix
         ;//   3. Perform the row operations (on columns) using SIMD
         ;//   4. Transpose the 4x4 result matrix
         ;//   5. Perform the column operations
         ;//   6. Round, then store the 4x4 block in one go
         ;******************************************************************

         ;// Load all 16 values of the 4x4 block (two per register)

         LDMIA   pSrc,{in00,in02,in10,in12,in20,in22,in30,in32}

         MOV       constZero,#0                                     ;// r1 (pSrc) is free now; SHADD16 x,constZero computes x>>1 per halfword
         ;LDR       constZero,=0x00000000

         ;*****************************************************************
         ;//
         ;// Transpose the matrix in order to perform row ops as column ops
         ;// Input:   in[][] = original matrix
         ;// Output:  trRow[][]= transposed matrix
         ;// Step1: Obtain the LL part of the transposed matrix
         ;// Step2: Obtain the HL part
         ;// Step3: Obtain the LH part
         ;// Step4: Obtain the HH part
         ;//
         ;// After this stage trRowK0 holds element K of source rows 0
         ;// (bottom halfword) and 1 (top halfword); trRowK2 holds element
         ;// K of source rows 2 (bottom) and 3 (top).
         ;// Each pair is ordered so that an input register is read before
         ;// the alias sharing its physical register is written.
         ;//
         ;*****************************************************************

         ;// LL 2x2 transposed matrix
         ;//   d0 d1 - -
         ;//   d4 d5 - -
         ;//   -  -  - -
         ;//   -  -  - -

         PKHTB   trRow10,in10,in00,ASR #16               ;// trRow10 = [Src[5] : Src[1]]  (top : bottom halfword)
         PKHBT   trRow00,in00,in10,LSL #16               ;// trRow00 = [Src[4] : Src[0]]  (overwrites in00, read above)

         ;// HL 2x2 transposed matrix
         ;//    -   -   - -
         ;//    -   -   - -
         ;//    d8  d9  - -
         ;//   d12 d13  - -


          PKHTB   trRow30,in12,in02,ASR #16              ;// trRow30 = [Src[7] : Src[3]]
          PKHBT   trRow20,in02,in12,LSL #16              ;// trRow20 = [Src[6] : Src[2]]

         ;// LH 2x2 transposed matrix
         ;//   - - d2 d3
         ;//   - - d6 d7
         ;//   - - -  -
         ;//   - - -  -

         PKHBT   trRow02,in20,in30,LSL #16               ;// trRow02 = [Src[12] : Src[8]]  (r3: in02 already consumed)
         PKHTB   trRow12,in30,in20,ASR #16               ;// trRow12 = [Src[13] : Src[9]]  (r5: in12 already consumed)


         ;// HH 2x2 transposed matrix
         ;//    - -   -   -
         ;//    - -   -   -
         ;//    - -  d10 d11
         ;//    - -  d14 d15

         PKHTB   trRow32,in32,in22,ASR #16               ;// trRow32 = [Src[15] : Src[11]]
         PKHBT   trRow22,in22,in32,LSL #16               ;// trRow22 = [Src[14] : Src[10]]  (overwrites in22, read above)


         ;****************************************
         ;// Row Operations (Performed on columns)
         ;//
         ;// Per source row, with d0..d3 = elements 0..3 of that row:
         ;//   e0 = d0 + d2          f0 = e0 + e3
         ;//   e1 = d0 - d2          f1 = e1 + e2
         ;//   e2 = (d1>>1) - d3     f2 = e1 - e2
         ;//   e3 = d1 + (d3>>1)     f3 = e0 - e3
         ;// Every instruction handles two source rows at once, one per
         ;// halfword.
         ;****************************************


         ;// SIMD operations on first two columns(two rows of the original matrix)
         ;// d0..d3 = trRow00, trRow10, trRow20, trRow30 (source rows 0 and 1)


         SADD16      e0, trRow00,trRow20                   ;//  e0 = d0 + d2
         SSUB16    e1, trRow00,trRow20                   ;//  e1 = d0 - d2
         SHADD16   e2, trRow10,constZero                 ;//  e2 = d1>>1 (halving add with the zero register)
         SHADD16   e3, trRow30,constZero                 ;//  e3 = d3>>1; original note: ordered to avoid pipeline stalls for e2 and e3
         SSUB16    e2, e2, trRow30                       ;//  e2 = (d1>>1) - d3
         SADD16    e3, e3, trRow10                       ;//  e3 = d1 + (d3>>1)
         SADD16    rowOp00, e0, e3                       ;//  f0 = e0 + e3
         SADD16    rowOp10, e1, e2                       ;//  f1 = e1 + e2
         SSUB16    rowOp20, e1, e2                       ;//  f2 = e1 - e2
         SSUB16    rowOp30, e0, e3                       ;//  f3 = e0 - e3

         ;// SIMD operations on next two columns(next two rows of the original matrix)
         ;// d0..d3 = trRow02, trRow12, trRow22, trRow32 (source rows 2 and 3)

         SADD16      e0, trRow02,trRow22                   ;//  e0 = d0 + d2
         SSUB16    e1, trRow02,trRow22                   ;//  e1 = d0 - d2
         SHADD16   e2, trRow12,constZero                 ;//  e2 = d1>>1 (constZero is a register holding 0)
         SHADD16   e3, trRow32,constZero                 ;//  e3 = d3>>1
         SSUB16    e2, e2, trRow32                       ;//  e2 = (d1>>1) - d3
         SADD16    e3, e3, trRow12                       ;//  e3 = d1 + (d3>>1)
         SADD16    rowOp02, e0, e3                       ;//  f0 = e0 + e3
         SADD16    rowOp12, e1, e2                       ;//  f1 = e1 + e2
         SSUB16    rowOp22, e1, e2                       ;//  f2 = e1 - e2
         SSUB16    rowOp32, e0, e3                       ;//  f3 = e0 - e3


         ;*****************************************************************
         ;// Transpose the resultant matrix
         ;// Input:  rowOp[][]
         ;// Output: trCol[][]
         ;//
         ;// rowOpK0 holds result element K of rows 0 (bottom) and 1 (top);
         ;// rowOpK2 holds result element K of rows 2 (bottom) and 3 (top).
         ;// After this stage trColR0 holds elements 0 (bottom) and 1 (top)
         ;// of row R and trColR2 holds elements 2 and 3 of row R, i.e. the
         ;// block is back in row order.
         ;*****************************************************************

         ;// LL 2x2 transposed matrix
         ;//   d0 d1 - -
         ;//   d4 d5 - -
         ;//   -  -  - -
         ;//   -  -  - -

         PKHTB   trCol10,rowOp10,rowOp00,ASR #16           ;// trCol10 = row 1, elements [1 : 0]
         PKHBT   trCol00,rowOp00,rowOp10,LSL #16           ;// trCol00 = row 0, elements [1 : 0]

         ;// HL 2x2 transposed matrix
         ;//    -   -   - -
         ;//    -   -   - -
         ;//    d8  d9  - -
         ;//   d12 d13  - -


          PKHTB   trCol30,rowOp12,rowOp02,ASR #16          ;// trCol30 = row 3, elements [1 : 0]
          PKHBT   trCol20,rowOp02,rowOp12,LSL #16          ;// trCol20 = row 2, elements [1 : 0]

         ;// LH 2x2 transposed matrix
         ;//   - - d2 d3
         ;//   - - d6 d7
         ;//   - - -  -
         ;//   - - -  -

         PKHBT   trCol02,rowOp20,rowOp30,LSL #16           ;// trCol02 = row 0, elements [3 : 2]  (r3: rowOp02 already consumed)
         PKHTB   trCol12,rowOp30,rowOp20,ASR #16           ;// trCol12 = row 1, elements [3 : 2]  (r5: rowOp12 already consumed)


         ;// HH 2x2 transposed matrix
         ;//    - -   -   -
         ;//    - -   -   -
         ;//    - -  d10 d11
         ;//    - -  d14 d15

         PKHTB   trCol32,rowOp32,rowOp22,ASR #16            ;// trCol32 = row 3, elements [3 : 2]
         PKHBT   trCol22,rowOp22,rowOp32,LSL #16            ;// trCol22 = row 2, elements [3 : 2]  (overwrites rowOp22, read above)


         ;*******************************
         ;// Column Operations
         ;//
         ;// Same butterfly as the row stage, now combining rows 0..3:
         ;//   g0 = r0 + r2          out row 0 = g0 + g3
         ;//   g1 = r0 - r2          out row 1 = g1 + g2
         ;//   g2 = (r1>>1) - r3     out row 2 = g1 - g2
         ;//   g3 = r1 + (r3>>1)     out row 3 = g0 - g3
         ;*******************************


         ;// SIMD operations on first two columns
         ;// r0..r3 = trCol00, trCol10, trCol20, trCol30


         SADD16      g0, trCol00,trCol20                     ;// g0 = r0 + r2
         SSUB16    g1, trCol00,trCol20                     ;// g1 = r0 - r2
         SHADD16   g2, trCol10,constZero                     ;// g2 = r1>>1 (constZero is a register holding 0)
         SHADD16   g3, trCol30,constZero                     ;// g3 = r3>>1
         SSUB16    g2, g2, trCol30                         ;// g2 = (r1>>1) - r3
         SADD16    g3, g3, trCol10                         ;// g3 = r1 + (r3>>1)
         SADD16    colOp00, g0, g3                         ;// row 0 = g0 + g3
         SADD16    colOp10, g1, g2                         ;// row 1 = g1 + g2
         SSUB16    colOp20, g1, g2                         ;// row 2 = g1 - g2
         SSUB16    colOp30, g0, g3                         ;// row 3 = g0 - g3

         ;// SIMD operations on next two columns
         ;// r0..r3 = trCol02, trCol12, trCol22, trCol32

         SADD16      g0, trCol02,trCol22                     ;// g0 = r0 + r2
         SSUB16    g1, trCol02,trCol22                     ;// g1 = r0 - r2
         SHADD16   g2, trCol12,constZero                     ;// g2 = r1>>1 (constZero is a register holding 0)
         SHADD16   g3, trCol32,constZero                     ;// g3 = r3>>1
         SSUB16    g2, g2, trCol32                         ;// g2 = (r1>>1) - r3
         SADD16    g3, g3, trCol12                         ;// g3 = r1 + (r3>>1)
         SADD16    colOp02, g0, g3                         ;// row 0 = g0 + g3
         SADD16    colOp12, g1, g2                         ;// row 1 = g1 + g2
         SSUB16    colOp22, g1, g2                         ;// row 2 = g1 - g2
         SSUB16    colOp32, g0, g3                         ;// row 3 = g0 - g3


         ;************************************************
         ;// Calculate final value (colOp[i][j] + 32)>>6
         ;//
         ;// Both halfwords are shifted with a single 32-bit ASR #6. That
         ;// sign-extends the top halfword correctly, but the bottom
         ;// halfword receives the low 6 bits of the top halfword in its
         ;// upper 6 bits and gets no sign extension of its own. Hence:
         ;//   - the bottom halfword is biased by 32768 before the shift,
         ;//   - its upper 6 bits are cleared by the mask after the shift,
         ;//   - the shifted bias (32768 >> 6 = 512) is subtracted again.
         ;// g1, g2 and g3 are dead at this point, so r11, r12 and r14 are
         ;// reused for the constants.
         ;************************************************

         ;// const1: Serves dual purpose
         ;// (1) Add #32 to both the lower and higher 16bits of the SIMD result
         ;// (2) Convert the lower 16 bit value to an unsigned number (Add 32768)

         LDR     const1, =0x00208020                     ;// top halfword 0x0020, bottom halfword 0x8020

         LDR     mask, =0xffff03ff                       ;// Keeps the top halfword, clears bits 15..10 of the bottom halfword

         ;// const2(#512): used to convert the lower 16bit number back to signed value

         MOV     const2,#0x200                           ;// const2 = 2^9; the top halfword is 0, so only the bottom halfword changes

         ;// First Row

         SADD16    colOp00, colOp00, const1              ;// add rounding 32 (plus bias in the bottom halfword)
         SADD16    colOp02, colOp02, const1
         AND     colOp00, mask, colOp00, ASR #6          ;// shift both halfwords, drop bits leaked into the bottom one
         AND     colOp02, mask, colOp02, ASR #6
         SSUB16  out00,colOp00,const2                    ;// remove the shifted bias from the bottom halfword
         SSUB16  out02,colOp02,const2


         ;// Second Row

         SADD16    colOp10, colOp10, const1
         SADD16    colOp12, colOp12, const1
         AND     colOp10, mask, colOp10, ASR #6
         AND     colOp12, mask, colOp12, ASR #6
         SSUB16  out10,colOp10,const2
         SSUB16  out12,colOp12,const2


         ;// Third Row

         SADD16    colOp20, colOp20, const1
         SADD16    colOp22, colOp22, const1
         AND     colOp20, mask, colOp20, ASR #6
         AND     colOp22, mask, colOp22, ASR #6
         SSUB16  out20,colOp20,const2
         SSUB16  out22,colOp22,const2


         ;// Fourth Row

         SADD16    colOp30, colOp30, const1
         SADD16    colOp32, colOp32, const1
         AND     colOp30, mask, colOp30, ASR #6
         AND     colOp32, mask, colOp32, ASR #6
         SSUB16  out30,colOp30,const2
         SSUB16  out32,colOp32,const2


         ;***************************
         ;// Store all the 4x4 pixels
         ;// (row by row, two values per word)
         ;***************************

         STMIA   pDst,{out00,out02,out10,out12,out20,out22,out30,out32}


         ;// Set return value
         ;// (nothing is set here; r0 still holds pDst)

 End


         ;// Write function tail
         M_END

     ENDIF                                                           ;//ARM1136JS


 ;// Guarding implementation by the processor name


     END
	;//
	;// Copyright (C) 2007-2008 ARM Limited
	;//
	;// Licensed under the Apache License, Version 2.0 (the "License");
	;// you may not use this file except in compliance with the License.
	;// You may obtain a copy of the License at
	;//
	;// http://www.apache.org/licenses/LICENSE-2.0
	;//
	;// Unless required by applicable law or agreed to in writing, software
	;// distributed under the License is distributed on an "AS IS" BASIS,
	;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	;// See the License for the specific language governing permissions and
	;// limitations under the License.
	;//
	;//
	;//
	;// File Name: armVCM4P10_TransformResidual4x4_s.s
	;// OpenMAX DL: v1.0.2
	;// Revision: 9641
	;// Date: Thursday, February 7, 2008
	;//
	;//
	;//
	;//
	;// Description:
	;// Transform Residual 4x4 Coefficients
	;//
	;//


	;// Include standard headers

	INCLUDE omxtypes_s.h
	INCLUDE armCOMM_s.h

	M_VARIANTS ARM1136JS

	;// Import symbols required from other files
	;// (For example tables)




	;// Set debugging level
	;//DEBUG_ON SETL {TRUE}



	;// Guarding implementation by the processor name

	IF ARM1136JS

	;// Register aliases (RN) for armVCM4P10_TransformResidual4x4.
	;// Many aliases below name the same physical register: each stage of the
	;// routine reuses registers whose previous contents have already been
	;// consumed, so an alias is only meaningful at the point in the function
	;// body where that stage runs.
	;// NOTE(review): these definitions repeat an identical set earlier in the
	;// file and come after an END directive - confirm whether this second
	;// copy is an extraction artifact.
	;// NOTE(review): r14 (the link register) is used as scratch here; this
	;// relies on M_START/M_END (armCOMM_s.h) saving and restoring it - confirm.

	;//Input Registers
	pDst RN 0 ;// Address the 8 result words are stored to (STMIA)
	pSrc RN 1 ;// Address the 8 input words are loaded from (LDMIA)

	;//Output Registers
	;// (none declared)


	;//Local Scratch Registers

	;// Packed input values: two 16-bit values per register
	in00 RN 2 ;// Src[0] & Src[1]
	in02 RN 3 ;// Src[2] & Src[3]
	in10 RN 4 ;// Src[4] & Src[5]
	in12 RN 5 ;// Src[6] & Src[7]
	in20 RN 6 ;// Src[8] & Src[9]
	in22 RN 7 ;// Src[10] & Src[11]
	in30 RN 8 ;// Src[12] & Src[13]
	in32 RN 9 ;// Src[14] & Src[15]

	;// Transpose for Row operations (Rows to cols)
	;// trRow00/02/12/22 overwrite in00/in02/in12/in22 (r2, r3, r5, r7);
	;// the other four use r10, r11, r12 and r14.
	trRow00 RN 2
	trRow10 RN 10
	trRow02 RN 3
	trRow12 RN 5
	trRow20 RN 11
	trRow30 RN 12
	trRow32 RN 14
	trRow22 RN 7

	;// Intermediate calculations of the row stage (reuse r4, r6, r8, r9)
	e0 RN 4
	e1 RN 6
	e2 RN 8
	e3 RN 9
	constZero RN 1 ;// Shares r1 with pSrc; loaded with 0 after the input block is read

	;// Row operated pixels (same registers as the trRow aliases)
	rowOp00 RN 2
	rowOp10 RN 10
	rowOp20 RN 11
	rowOp30 RN 12
	rowOp02 RN 3
	rowOp12 RN 5
	rowOp22 RN 7
	rowOp32 RN 14

	;// Transpose for column operations
	trCol00 RN 2
	trCol02 RN 3
	trCol10 RN 4
	trCol12 RN 5
	trCol20 RN 6
	trCol22 RN 7
	trCol30 RN 8
	trCol32 RN 9

	;// Intermediate calculations of the column stage
	g0 RN 10
	g1 RN 11
	g2 RN 12
	g3 RN 14

	;// Column operated pixels (same registers as the trCol aliases)
	colOp00 RN 2
	colOp02 RN 3
	colOp10 RN 4
	colOp12 RN 5
	colOp20 RN 6
	colOp22 RN 7
	colOp30 RN 8
	colOp32 RN 9


	temp1 RN 10 ;// Temporary scratch variable; not referenced by the function body in this file
	const1 RN 11 ;// Rounding/bias constant of the final stage (shares r11 with g1)
	const2 RN 12 ;// Bias-removal constant of the final stage (shares r12 with g2)
	mask RN 14 ;// Bit mask of the final stage (shares r14 with g3)

	;// Output pixels (same registers as the colOp aliases, stored in this order)
	out00 RN 2
	out02 RN 3
	out10 RN 4
	out12 RN 5
	out20 RN 6
	out22 RN 7
	out30 RN 8
	out32 RN 9



	;// Allocate stack memory required by the function


	;// Write function header
	M_START armVCM4P10_TransformResidual4x4,r11

	;******************************************************************
	;// armVCM4P10_TransformResidual4x4
	;//
	;// Transforms a 4x4 block of 16-bit residual coefficients and
	;// rounds every result with (x + 32) >> 6.
	;// NOTE(review): presumably the H.264 (MPEG-4 Part 10) inverse 4x4
	;// integer transform, going by the M4P10 file name - confirm.
	;// NOTE(review): this function body repeats an identical one earlier
	;// in the file and comes after an END directive - confirm whether
	;// this second copy is an extraction artifact.
	;//
	;// In:    pDst (r0) - address the 8 result words are stored to
	;//        pSrc (r1) - address the 8 input words are loaded from
	;// Out:   8 words (16 halfwords) written at pDst; r0 is not
	;//        modified by the instructions below.
	;// Uses:  r1-r12 and r14 are overwritten. Saving/restoring of
	;//        registers is left to M_START/M_END (armCOMM_s.h);
	;//        NOTE(review): confirm that ",r11" covers r4-r11 and lr.
	;//
	;// Comments below assume the lower-indexed element of each pair
	;// sits in the bottom halfword of its register, as the
	;// "Src[0] & Src[1]" alias comments suggest.
	;//
	;// The strategy used in implementing the transform is as follows:
	;//   1. Load the 4x4 block into 8 registers
	;//   2. Transpose the 4x4 matrix
	;//   3. Perform the row operations (on columns) using SIMD
	;//   4. Transpose the 4x4 result matrix
	;//   5. Perform the column operations
	;//   6. Round, then store the 4x4 block in one go
	;******************************************************************

	;// Load all 16 values of the 4x4 block (two per register)

	LDMIA pSrc,{in00,in02,in10,in12,in20,in22,in30,in32}

	MOV constZero,#0 ;// r1 (pSrc) is free now; SHADD16 x,constZero computes x>>1 per halfword
	;LDR constZero,=0x00000000

	;*****************************************************************
	;//
	;// Transpose the matrix in order to perform row ops as column ops
	;// Input: in[][] = original matrix
	;// Output: trRow[][]= transposed matrix
	;// Step1: Obtain the LL part of the transposed matrix
	;// Step2: Obtain the HL part
	;// Step3: Obtain the LH part
	;// Step4: Obtain the HH part
	;//
	;// After this stage trRowK0 holds element K of source rows 0
	;// (bottom halfword) and 1 (top halfword); trRowK2 holds element
	;// K of source rows 2 (bottom) and 3 (top).
	;// Each pair is ordered so that an input register is read before
	;// the alias sharing its physical register is written.
	;//
	;*****************************************************************

	;// LL 2x2 transposed matrix
	;// d0 d1 - -
	;// d4 d5 - -
	;// - - - -
	;// - - - -

	PKHTB trRow10,in10,in00,ASR #16 ;// trRow10 = [Src[5] : Src[1]]  (top : bottom halfword)
	PKHBT trRow00,in00,in10,LSL #16 ;// trRow00 = [Src[4] : Src[0]]  (overwrites in00, read above)

	;// HL 2x2 transposed matrix
	;// - - - -
	;// - - - -
	;// d8 d9 - -
	;// d12 d13 - -


	PKHTB trRow30,in12,in02,ASR #16 ;// trRow30 = [Src[7] : Src[3]]
	PKHBT trRow20,in02,in12,LSL #16 ;// trRow20 = [Src[6] : Src[2]]

	;// LH 2x2 transposed matrix
	;// - - d2 d3
	;// - - d6 d7
	;// - - - -
	;// - - - -

	PKHBT trRow02,in20,in30,LSL #16 ;// trRow02 = [Src[12] : Src[8]]  (r3: in02 already consumed)
	PKHTB trRow12,in30,in20,ASR #16 ;// trRow12 = [Src[13] : Src[9]]  (r5: in12 already consumed)




	;// HH 2x2 transposed matrix
	;// - - - -
	;// - - - -
	;// - - d10 d11
	;// - - d14 d15

	PKHTB trRow32,in32,in22,ASR #16 ;// trRow32 = [Src[15] : Src[11]]
	PKHBT trRow22,in22,in32,LSL #16 ;// trRow22 = [Src[14] : Src[10]]  (overwrites in22, read above)


	;****************************************
	;// Row Operations (Performed on columns)
	;//
	;// Per source row, with d0..d3 = elements 0..3 of that row:
	;//   e0 = d0 + d2          f0 = e0 + e3
	;//   e1 = d0 - d2          f1 = e1 + e2
	;//   e2 = (d1>>1) - d3     f2 = e1 - e2
	;//   e3 = d1 + (d3>>1)     f3 = e0 - e3
	;// Every instruction handles two source rows at once, one per
	;// halfword.
	;****************************************


	;// SIMD operations on first two columns(two rows of the original matrix)
	;// d0..d3 = trRow00, trRow10, trRow20, trRow30 (source rows 0 and 1)


	SADD16 e0, trRow00,trRow20 ;// e0 = d0 + d2
	SSUB16 e1, trRow00,trRow20 ;// e1 = d0 - d2
	SHADD16 e2, trRow10,constZero ;// e2 = d1>>1 (halving add with the zero register)
	SHADD16 e3, trRow30,constZero ;// e3 = d3>>1; original note: ordered to avoid pipeline stalls for e2 and e3
	SSUB16 e2, e2, trRow30 ;// e2 = (d1>>1) - d3
	SADD16 e3, e3, trRow10 ;// e3 = d1 + (d3>>1)
	SADD16 rowOp00, e0, e3 ;// f0 = e0 + e3
	SADD16 rowOp10, e1, e2 ;// f1 = e1 + e2
	SSUB16 rowOp20, e1, e2 ;// f2 = e1 - e2
	SSUB16 rowOp30, e0, e3 ;// f3 = e0 - e3

	;// SIMD operations on next two columns(next two rows of the original matrix)
	;// d0..d3 = trRow02, trRow12, trRow22, trRow32 (source rows 2 and 3)

	SADD16 e0, trRow02,trRow22 ;// e0 = d0 + d2
	SSUB16 e1, trRow02,trRow22 ;// e1 = d0 - d2
	SHADD16 e2, trRow12,constZero ;// e2 = d1>>1 (constZero is a register holding 0)
	SHADD16 e3, trRow32,constZero ;// e3 = d3>>1
	SSUB16 e2, e2, trRow32 ;// e2 = (d1>>1) - d3
	SADD16 e3, e3, trRow12 ;// e3 = d1 + (d3>>1)
	SADD16 rowOp02, e0, e3 ;// f0 = e0 + e3
	SADD16 rowOp12, e1, e2 ;// f1 = e1 + e2
	SSUB16 rowOp22, e1, e2 ;// f2 = e1 - e2
	SSUB16 rowOp32, e0, e3 ;// f3 = e0 - e3


	;*****************************************************************
	;// Transpose the resultant matrix
	;// Input: rowOp[][]
	;// Output: trCol[][]
	;//
	;// rowOpK0 holds result element K of rows 0 (bottom) and 1 (top);
	;// rowOpK2 holds result element K of rows 2 (bottom) and 3 (top).
	;// After this stage trColR0 holds elements 0 (bottom) and 1 (top)
	;// of row R and trColR2 holds elements 2 and 3 of row R, i.e. the
	;// block is back in row order.
	;*****************************************************************

	;// LL 2x2 transposed matrix
	;// d0 d1 - -
	;// d4 d5 - -
	;// - - - -
	;// - - - -

	PKHTB trCol10,rowOp10,rowOp00,ASR #16 ;// trCol10 = row 1, elements [1 : 0]
	PKHBT trCol00,rowOp00,rowOp10,LSL #16 ;// trCol00 = row 0, elements [1 : 0]

	;// HL 2x2 transposed matrix
	;// - - - -
	;// - - - -
	;// d8 d9 - -
	;// d12 d13 - -


	PKHTB trCol30,rowOp12,rowOp02,ASR #16 ;// trCol30 = row 3, elements [1 : 0]
	PKHBT trCol20,rowOp02,rowOp12,LSL #16 ;// trCol20 = row 2, elements [1 : 0]

	;// LH 2x2 transposed matrix
	;// - - d2 d3
	;// - - d6 d7
	;// - - - -
	;// - - - -

	PKHBT trCol02,rowOp20,rowOp30,LSL #16 ;// trCol02 = row 0, elements [3 : 2]  (r3: rowOp02 already consumed)
	PKHTB trCol12,rowOp30,rowOp20,ASR #16 ;// trCol12 = row 1, elements [3 : 2]  (r5: rowOp12 already consumed)




	;// HH 2x2 transposed matrix
	;// - - - -
	;// - - - -
	;// - - d10 d11
	;// - - d14 d15

	PKHTB trCol32,rowOp32,rowOp22,ASR #16 ;// trCol32 = row 3, elements [3 : 2]
	PKHBT trCol22,rowOp22,rowOp32,LSL #16 ;// trCol22 = row 2, elements [3 : 2]  (overwrites rowOp22, read above)


	;*******************************
	;// Column Operations
	;//
	;// Same butterfly as the row stage, now combining rows 0..3:
	;//   g0 = r0 + r2          out row 0 = g0 + g3
	;//   g1 = r0 - r2          out row 1 = g1 + g2
	;//   g2 = (r1>>1) - r3     out row 2 = g1 - g2
	;//   g3 = r1 + (r3>>1)     out row 3 = g0 - g3
	;*******************************


	;// SIMD operations on first two columns
	;// r0..r3 = trCol00, trCol10, trCol20, trCol30


	SADD16 g0, trCol00,trCol20 ;// g0 = r0 + r2
	SSUB16 g1, trCol00,trCol20 ;// g1 = r0 - r2
	SHADD16 g2, trCol10,constZero ;// g2 = r1>>1 (constZero is a register holding 0)
	SHADD16 g3, trCol30,constZero ;// g3 = r3>>1
	SSUB16 g2, g2, trCol30 ;// g2 = (r1>>1) - r3
	SADD16 g3, g3, trCol10 ;// g3 = r1 + (r3>>1)
	SADD16 colOp00, g0, g3 ;// row 0 = g0 + g3
	SADD16 colOp10, g1, g2 ;// row 1 = g1 + g2
	SSUB16 colOp20, g1, g2 ;// row 2 = g1 - g2
	SSUB16 colOp30, g0, g3 ;// row 3 = g0 - g3

	;// SIMD operations on next two columns
	;// r0..r3 = trCol02, trCol12, trCol22, trCol32

	SADD16 g0, trCol02,trCol22 ;// g0 = r0 + r2
	SSUB16 g1, trCol02,trCol22 ;// g1 = r0 - r2
	SHADD16 g2, trCol12,constZero ;// g2 = r1>>1 (constZero is a register holding 0)
	SHADD16 g3, trCol32,constZero ;// g3 = r3>>1
	SSUB16 g2, g2, trCol32 ;// g2 = (r1>>1) - r3
	SADD16 g3, g3, trCol12 ;// g3 = r1 + (r3>>1)
	SADD16 colOp02, g0, g3 ;// row 0 = g0 + g3
	SADD16 colOp12, g1, g2 ;// row 1 = g1 + g2
	SSUB16 colOp22, g1, g2 ;// row 2 = g1 - g2
	SSUB16 colOp32, g0, g3 ;// row 3 = g0 - g3





	;************************************************
	;// Calculate final value (colOp[i][j] + 32)>>6
	;//
	;// Both halfwords are shifted with a single 32-bit ASR #6. That
	;// sign-extends the top halfword correctly, but the bottom
	;// halfword receives the low 6 bits of the top halfword in its
	;// upper 6 bits and gets no sign extension of its own. Hence:
	;//   - the bottom halfword is biased by 32768 before the shift,
	;//   - its upper 6 bits are cleared by the mask after the shift,
	;//   - the shifted bias (32768 >> 6 = 512) is subtracted again.
	;// g1, g2 and g3 are dead at this point, so r11, r12 and r14 are
	;// reused for the constants.
	;************************************************

	;// const1: Serves dual purpose
	;// (1) Add #32 to both the lower and higher 16bits of the SIMD result
	;// (2) Convert the lower 16 bit value to an unsigned number (Add 32768)

	LDR const1, =0x00208020 ;// top halfword 0x0020, bottom halfword 0x8020

	LDR mask, =0xffff03ff ;// Keeps the top halfword, clears bits 15..10 of the bottom halfword

	;// const2(#512): used to convert the lower 16bit number back to signed value

	MOV const2,#0x200 ;// const2 = 2^9; the top halfword is 0, so only the bottom halfword changes

	;// First Row

	SADD16 colOp00, colOp00, const1 ;// add rounding 32 (plus bias in the bottom halfword)
	SADD16 colOp02, colOp02, const1
	AND colOp00, mask, colOp00, ASR #6 ;// shift both halfwords, drop bits leaked into the bottom one
	AND colOp02, mask, colOp02, ASR #6
	SSUB16 out00,colOp00,const2 ;// remove the shifted bias from the bottom halfword
	SSUB16 out02,colOp02,const2


	;// Second Row

	SADD16 colOp10, colOp10, const1
	SADD16 colOp12, colOp12, const1
	AND colOp10, mask, colOp10, ASR #6
	AND colOp12, mask, colOp12, ASR #6
	SSUB16 out10,colOp10,const2
	SSUB16 out12,colOp12,const2


	;// Third Row

	SADD16 colOp20, colOp20, const1
	SADD16 colOp22, colOp22, const1
	AND colOp20, mask, colOp20, ASR #6
	AND colOp22, mask, colOp22, ASR #6
	SSUB16 out20,colOp20,const2
	SSUB16 out22,colOp22,const2


	;// Fourth Row

	SADD16 colOp30, colOp30, const1
	SADD16 colOp32, colOp32, const1
	AND colOp30, mask, colOp30, ASR #6
	AND colOp32, mask, colOp32, ASR #6
	SSUB16 out30,colOp30,const2
	SSUB16 out32,colOp32,const2




	;***************************
	;// Store all the 4x4 pixels
	;// (row by row, two values per word)
	;***************************

	STMIA pDst,{out00,out02,out10,out12,out20,out22,out30,out32}



	;// Set return value
	;// (nothing is set here; r0 still holds pDst)

	End


	;// Write function tail
	M_END

	ENDIF ;//ARM1136JS







	;// Guarding implementation by the processor name


	END