;// blob: e5372e1e9df681e8fd77915595fc12f9a15aa613 [file] [log] [blame]
;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;// http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name: armVCM4P10_TransformResidual4x4_s.s
;// OpenMAX DL: v1.0.2
;// Revision: 9641
;// Date: Thursday, February 7, 2008
;//
;//
;//
;//
;// Description:
;// Transform Residual 4x4 Coefficients
;//
;//
;// Include standard headers
INCLUDE omxtypes_s.h
INCLUDE armCOMM_s.h
M_VARIANTS ARM1136JS
;// Import symbols required from other files
;// (For example tables)
;// Set debugging level
;//DEBUG_ON SETL {TRUE}
;// Guarding implementation by the processor name
IF ARM1136JS
;// ------------------------------------------------------------------
;// Register aliases
;//
;// Every name below is an RN alias for one physical register. The same
;// register is given a different name for each processing stage, so the
;// names used in the code follow the data from stage to stage.
;// Naming: in the packed 4x4 groups the two digits are <row><column>
;// of the first of the two 16-bit elements held in the register.
;// ------------------------------------------------------------------

;// Input Registers (function arguments)
pDst            RN 0
pSrc            RN 1

;// Output Registers
;// (none defined)

;// Local Scratch Registers

;// Stage 1: packed input pixels as loaded from pSrc
in00            RN 2            ;// Src[0]  & Src[1]
in02            RN 3            ;// Src[2]  & Src[3]
in10            RN 4            ;// Src[4]  & Src[5]
in12            RN 5            ;// Src[6]  & Src[7]
in20            RN 6            ;// Src[8]  & Src[9]
in22            RN 7            ;// Src[10] & Src[11]
in30            RN 8            ;// Src[12] & Src[13]
in32            RN 9            ;// Src[14] & Src[15]

;// Stage 2: transposed input, so that row operations can be done
;// as column operations (rows to columns)
trRow00         RN 2
trRow02         RN 3
trRow10         RN 10
trRow12         RN 5
trRow20         RN 11
trRow22         RN 7
trRow30         RN 12
trRow32         RN 14

;// Stage 3: intermediate values of the row pass
constZero       RN 1            ;// same register as pSrc
e0              RN 4
e1              RN 6
e2              RN 8
e3              RN 9

;// Stage 4: results of the row pass
rowOp00         RN 2
rowOp02         RN 3
rowOp10         RN 10
rowOp12         RN 5
rowOp20         RN 11
rowOp22         RN 7
rowOp30         RN 12
rowOp32         RN 14

;// Stage 5: row-pass results transposed for the column operations
trCol00         RN 2
trCol02         RN 3
trCol10         RN 4
trCol12         RN 5
trCol20         RN 6
trCol22         RN 7
trCol30         RN 8
trCol32         RN 9

;// Stage 6: intermediate values of the column pass
g0              RN 10
g1              RN 11
g2              RN 12
g3              RN 14

;// Stage 7: results of the column pass
colOp00         RN 2
colOp02         RN 3
colOp10         RN 4
colOp12         RN 5
colOp20         RN 6
colOp22         RN 7
colOp30         RN 8
colOp32         RN 9

;// Stage 8: temporaries and constants for the final rounding step
temp1           RN 10           ;// Temporary scratch variable
const1          RN 11
const2          RN 12
mask            RN 14

;// Stage 9: output pixels, in the order they are stored to pDst
out00           RN 2
out02           RN 3
out10           RN 4
out12           RN 5
out20           RN 6
out22           RN 7
out30           RN 8
out32           RN 9
;// Allocate stack memory required by the function
;// (nothing is allocated here: all working values are kept in registers)
;// Write function header
;//
;// armVCM4P10_TransformResidual4x4
;//
;// Purpose:
;//   Loads a 4x4 block of packed signed 16-bit values (eight words) from
;//   pSrc, applies a 4-point butterfly along every row and then along
;//   every column, rounds each result as (x + 32) >> 6 and stores the
;//   eight packed result words at pDst.
;//
;// In:
;//   r0 = pDst   destination block (eight words are written)
;//   r1 = pSrc   source block (eight words are read)
;// Out:
;//   the 4x4 result block at pDst; this body sets up no return value
;//
;// Register usage:
;//   r1 is overwritten with zero after the load, so pSrc is not preserved.
;//   r2-r12 and r14 are all written by this routine.
;//   NOTE(review): r14 (lr) is used as scratch, so the M_START/M_END
;//   macros (defined outside this file) presumably save and restore
;//   r4-r11 and lr because of the ",r11" argument - confirm.
;//
;// Assumptions:
;//   LDMIA/STMIA transfer whole words; presumably both pointers must be
;//   4-byte aligned - verify against callers.
;//   The element numbering in the comments (even-indexed element in the
;//   low halfword of each word) assumes little-endian data - TODO confirm.
;//
;// Notation used in the comments below:
;//   [a : b]   packed register, a = high halfword, b = low halfword
;//   Src[n]    n-th 16-bit element of the input block (row-major)
;//   R[r][c]   result of the row pass for row r, output position c
M_START armVCM4P10_TransformResidual4x4,r11
;******************************************************************
;// The strategy used in implementing the transform is as follows:*
;// Load the 4x4 block into 8 registers *
;// Transpose the 4x4 matrix *
;// Perform the row operations (on columns) using SIMD *
;// Transpose the 4x4 result matrix *
;// Perform the column operations *
;// Store the 4x4 block at one go *
;******************************************************************
;// Load all the 4x4 pixels (8 words = 16 halfwords) in a single transfer
LDMIA pSrc,{in00,in02,in10,in12,in20,in22,in30,in32}
;// pSrc is no longer needed after the load: its register becomes constZero
MOV constZero,#0 ;// zero addend: SHADD16 x,constZero yields (x >> 1) per halfword
;LDR constZero,=0x00000000
;*****************************************************************
;//
;// Transpose the matrix in order to perform row ops as column ops
;// Input: in[][] = original matrix
;// Output: trRow[][]= transposed matrix
;// Step1: Obtain the LL part of the transposed matrix
;// Step2: Obtain the HL part
;// Step3: Obtain the LH part
;// Step4: Obtain the HH part
;//
;// PKHBT Rd,Rn,Rm,LSL #16 : Rd = [low half of Rm  : low half of Rn]
;// PKHTB Rd,Rn,Rm,ASR #16 : Rd = [high half of Rn : high half of Rm]
;//
;// Within each pair, the instruction that overwrites an input register
;// (trRow00 aliases in00, trRow22 aliases in22) comes second, after the
;// other instruction has read it. Do not reorder.
;//
;*****************************************************************
;// LL 2x2 transposed matrix
;// d0 d1 - -
;// d4 d5 - -
;// - - - -
;// - - - -
PKHTB trRow10,in10,in00,ASR #16 ;// [d5 : d4] = [Src5 : Src1]
PKHBT trRow00,in00,in10,LSL #16 ;// [d1 : d0] = [Src4 : Src0]
;// HL 2x2 transposed matrix
;// - - - -
;// - - - -
;// d8 d9 - -
;// d12 d13 - -
PKHTB trRow30,in12,in02,ASR #16 ;// [d13 : d12] = [Src7 : Src3]
PKHBT trRow20,in02,in12,LSL #16 ;// [d9 : d8] = [Src6 : Src2]
;// LH 2x2 transposed matrix
;// - - d2 d3
;// - - d6 d7
;// - - - -
;// - - - -
;// in02/in12 were consumed by the HL step above, so their registers can
;// now be overwritten as trRow02/trRow12.
PKHBT trRow02,in20,in30,LSL #16 ;// [d3 : d2] = [Src12 : Src8]
PKHTB trRow12,in30,in20,ASR #16 ;// [d7 : d6] = [Src13 : Src9]
;// HH 2x2 transposed matrix
;// - - - -
;// - - - -
;// - - d10 d11
;// - - d14 d15
PKHTB trRow32,in32,in22,ASR #16 ;// [d15 : d14] = [Src15 : Src11]
PKHBT trRow22,in22,in32,LSL #16 ;// [d11 : d10] = [Src14 : Src10]
;****************************************
;// Row Operations (Performed on columns)
;//
;// After the transpose, trRow<c>0 holds element c of source rows 0 and 1
;// and trRow<c>2 holds element c of source rows 2 and 3, so each SIMD
;// instruction processes two rows at once. In the per-line comments
;// d0..d3 are the four elements of one source row and f0..f3 are the
;// four row-pass results for that row.
;****************************************
;// SIMD operations on first two columns(two rows of the original matrix)
SADD16 e0, trRow00,trRow20 ;// e0 = d0 + d2
SSUB16 e1, trRow00,trRow20 ;// e1 = d0 - d2
SHADD16 e2, trRow10,constZero ;// e2 = (d1 + 0) >> 1 = d1 >> 1 (constZero holds 0)
SHADD16 e3, trRow30,constZero ;// e3 = d3 >> 1; issued before e2 is consumed (original note: avoids pipeline stalls for e2 and e3)
SSUB16 e2, e2, trRow30 ;// e2 = (d1>>1) - d3
SADD16 e3, e3, trRow10 ;// e3 = d1 + (d3>>1)
SADD16 rowOp00, e0, e3 ;// f0 = e0 + e3 -> [R[1][0] : R[0][0]]
SADD16 rowOp10, e1, e2 ;// f1 = e1 + e2 -> [R[1][1] : R[0][1]]
SSUB16 rowOp20, e1, e2 ;// f2 = e1 - e2 -> [R[1][2] : R[0][2]]
SSUB16 rowOp30, e0, e3 ;// f3 = e0 - e3 -> [R[1][3] : R[0][3]]
;// SIMD operations on next two columns(next two rows of the original matrix)
SADD16 e0, trRow02,trRow22 ;// e0 = d0 + d2
SSUB16 e1, trRow02,trRow22 ;// e1 = d0 - d2
SHADD16 e2, trRow12,constZero ;// e2 = d1 >> 1 (constZero holds 0)
SHADD16 e3, trRow32,constZero ;// e3 = d3 >> 1
SSUB16 e2, e2, trRow32 ;// e2 = (d1>>1) - d3
SADD16 e3, e3, trRow12 ;// e3 = d1 + (d3>>1)
SADD16 rowOp02, e0, e3 ;// f0 = e0 + e3 -> [R[3][0] : R[2][0]]
SADD16 rowOp12, e1, e2 ;// f1 = e1 + e2 -> [R[3][1] : R[2][1]]
SSUB16 rowOp22, e1, e2 ;// f2 = e1 - e2 -> [R[3][2] : R[2][2]]
SSUB16 rowOp32, e0, e3 ;// f3 = e0 - e3 -> [R[3][3] : R[2][3]]
;*****************************************************************
;// Transpose the resultant matrix
;// Input: rowOp[][]
;// Output: trCol[][]
;//
;// Same packing pattern as the first transpose. Afterwards trCol<r>0
;// holds columns 0,1 of row r and trCol<r>2 holds columns 2,3 of row r.
;// rowOp02/rowOp12 are read by the HL step before their registers are
;// overwritten as trCol02/trCol12 in the LH step. Do not reorder.
;*****************************************************************
;// LL 2x2 transposed matrix
;// d0 d1 - -
;// d4 d5 - -
;// - - - -
;// - - - -
PKHTB trCol10,rowOp10,rowOp00,ASR #16 ;// trCol10 = [R[1][1] : R[1][0]]
PKHBT trCol00,rowOp00,rowOp10,LSL #16 ;// trCol00 = [R[0][1] : R[0][0]]
;// HL 2x2 transposed matrix
;// - - - -
;// - - - -
;// d8 d9 - -
;// d12 d13 - -
PKHTB trCol30,rowOp12,rowOp02,ASR #16 ;// trCol30 = [R[3][1] : R[3][0]]
PKHBT trCol20,rowOp02,rowOp12,LSL #16 ;// trCol20 = [R[2][1] : R[2][0]]
;// LH 2x2 transposed matrix
;// - - d2 d3
;// - - d6 d7
;// - - - -
;// - - - -
PKHBT trCol02,rowOp20,rowOp30,LSL #16 ;// trCol02 = [R[0][3] : R[0][2]]
PKHTB trCol12,rowOp30,rowOp20,ASR #16 ;// trCol12 = [R[1][3] : R[1][2]]
;// HH 2x2 transposed matrix
;// - - - -
;// - - - -
;// - - d10 d11
;// - - d14 d15
PKHTB trCol32,rowOp32,rowOp22,ASR #16 ;// trCol32 = [R[3][3] : R[3][2]]
PKHBT trCol22,rowOp22,rowOp32,LSL #16 ;// trCol22 = [R[2][3] : R[2][2]]
;*******************************
;// Column Operations
;//
;// Same butterfly as the row pass, now combining the four rows of each
;// column; every SIMD instruction processes two adjacent columns.
;// In the comments row0..row3 are the trCol values of one column.
;*******************************
;// SIMD operations on first two columns
SADD16 g0, trCol00,trCol20 ;// g0 = row0 + row2
SSUB16 g1, trCol00,trCol20 ;// g1 = row0 - row2
SHADD16 g2, trCol10,constZero ;// g2 = row1 >> 1 (constZero holds 0)
SHADD16 g3, trCol30,constZero ;// g3 = row3 >> 1
SSUB16 g2, g2, trCol30 ;// g2 = (row1>>1) - row3
SADD16 g3, g3, trCol10 ;// g3 = row1 + (row3>>1)
SADD16 colOp00, g0, g3 ;// result row 0, columns 0,1 = g0 + g3
SADD16 colOp10, g1, g2 ;// result row 1, columns 0,1 = g1 + g2
SSUB16 colOp20, g1, g2 ;// result row 2, columns 0,1 = g1 - g2
SSUB16 colOp30, g0, g3 ;// result row 3, columns 0,1 = g0 - g3
;// SIMD operations on next two columns
SADD16 g0, trCol02,trCol22 ;// g0 = row0 + row2
SSUB16 g1, trCol02,trCol22 ;// g1 = row0 - row2
SHADD16 g2, trCol12,constZero ;// g2 = row1 >> 1 (constZero holds 0)
SHADD16 g3, trCol32,constZero ;// g3 = row3 >> 1
SSUB16 g2, g2, trCol32 ;// g2 = (row1>>1) - row3
SADD16 g3, g3, trCol12 ;// g3 = row1 + (row3>>1)
SADD16 colOp02, g0, g3 ;// result row 0, columns 2,3 = g0 + g3
SADD16 colOp12, g1, g2 ;// result row 1, columns 2,3 = g1 + g2
SSUB16 colOp22, g1, g2 ;// result row 2, columns 2,3 = g1 - g2
SSUB16 colOp32, g0, g3 ;// result row 3, columns 2,3 = g0 - g3
;************************************************
;// Calculate final value (colOp[i][j] + 32)>>6
;//
;// Both halfwords are shifted with one 32-bit ASR #6:
;//  - the high halfword is shifted arithmetically, which is already the
;//    signed result;
;//  - the low halfword receives six bits shifted in from the high
;//    halfword. It is therefore biased by 32768 first (so it can be
;//    treated as unsigned), the six stray bits are cleared with the
;//    mask, and the bias (32768 >> 6 = 512) is subtracted afterwards.
;// g0..g3 are dead at this point, so their registers are reused for
;// const1, const2 and mask.
;************************************************
;// const1: Serves dual purpose
;// (1) Add #32 to both the lower and higher 16bits of the SIMD result
;// (2) Convert the lower 16 bit value to an unsigned number (Add 32768)
LDR const1, =0x00208020 ;// high half: +0x0020, low half: +0x8020
LDR mask, =0xffff03ff ;// keep the high halfword, keep only the low 10 bits of the low halfword
;// const2(#512): used to convert the lower 16bit number back to signed value
MOV const2,#0x200 ;// const2 = 2^9; high halfword is 0 so SSUB16 changes only the low half
;// First Row
SADD16 colOp00, colOp00, const1 ;// add rounding (both halves) and bias (low half)
SADD16 colOp02, colOp02, const1
AND colOp00, mask, colOp00, ASR #6 ;// >> 6, then clear the bits that leaked into the low half
AND colOp02, mask, colOp02, ASR #6
SSUB16 out00,colOp00,const2 ;// remove the bias from the low half
SSUB16 out02,colOp02,const2
;// Second Row
SADD16 colOp10, colOp10, const1
SADD16 colOp12, colOp12, const1
AND colOp10, mask, colOp10, ASR #6
AND colOp12, mask, colOp12, ASR #6
SSUB16 out10,colOp10,const2
SSUB16 out12,colOp12,const2
;// Third Row
SADD16 colOp20, colOp20, const1
SADD16 colOp22, colOp22, const1
AND colOp20, mask, colOp20, ASR #6
AND colOp22, mask, colOp22, ASR #6
SSUB16 out20,colOp20,const2
SSUB16 out22,colOp22,const2
;// Fourth Row
SADD16 colOp30, colOp30, const1
SADD16 colOp32, colOp32, const1
AND colOp30, mask, colOp30, ASR #6
AND colOp32, mask, colOp32, ASR #6
SSUB16 out30,colOp30,const2
SSUB16 out32,colOp32,const2
;***************************
;// Store all the 4x4 pixels
;// (out00..out32 are r2..r9 in ascending order, i.e. row-major output)
;***************************
STMIA pDst,{out00,out02,out10,out12,out20,out22,out30,out32}
;// Set return value
;// (no instruction here sets one)
End
;// Write function tail
M_END
ENDIF ;//ARM1136JS
;// Guarding implementation by the processor name
END