| ;// |
| ;// Copyright (C) 2007-2008 ARM Limited |
| ;// |
| ;// Licensed under the Apache License, Version 2.0 (the "License"); |
| ;// you may not use this file except in compliance with the License. |
| ;// You may obtain a copy of the License at |
| ;// |
| ;// http://www.apache.org/licenses/LICENSE-2.0 |
| ;// |
| ;// Unless required by applicable law or agreed to in writing, software |
| ;// distributed under the License is distributed on an "AS IS" BASIS, |
| ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| ;// See the License for the specific language governing permissions and |
| ;// limitations under the License. |
| ;// |
| ;// |
| ;// |
| ;// File Name: armVCM4P10_TransformResidual4x4_s.s |
| ;// OpenMAX DL: v1.0.2 |
| ;// Revision: 9641 |
| ;// Date: Thursday, February 7, 2008 |
| ;// |
| ;// |
| ;// |
| ;// |
| ;// Description: |
| ;// Transform Residual 4x4 Coefficients |
| ;// |
| ;// |
| |
| |
| ;// Include standard headers |
| |
| INCLUDE omxtypes_s.h |
| INCLUDE armCOMM_s.h |
| |
| M_VARIANTS ARM1136JS |
| |
| ;// Import symbols required from other files |
| ;// (For example tables) |
| |
| |
| |
| |
| ;// Set debugging level |
| ;//DEBUG_ON SETL {TRUE} |
| |
| |
| |
| ;// Guarding implementation by the processor name |
| |
| IF ARM1136JS |
| |
| ;// Register aliases. Names in different groups below share physical registers, so each name is only meaningful during its own stage of the function. |
| |
| ;// Argument registers |
| pDst RN 0 ;// base address used by the final STMIA |
| pSrc RN 1 ;// base address used by the initial LDMIA |
| |
| ;// Output registers: none declared |
| |
| ;// Stage 1: packed input, two Src elements per register |
| in00 RN 2 ;// Src[0] & Src[1] |
| in02 RN 3 ;// Src[2] & Src[3] |
| in10 RN 4 ;// Src[4] & Src[5] |
| in12 RN 5 ;// Src[6] & Src[7] |
| in20 RN 6 ;// Src[8] & Src[9] |
| in22 RN 7 ;// Src[10] & Src[11] |
| in30 RN 8 ;// Src[12] & Src[13] |
| in32 RN 9 ;// Src[14] & Src[15] |
| |
| ;// Stage 2: input transposed so that the row operations can be done as column operations |
| trRow00 RN 2 |
| trRow02 RN 3 |
| trRow10 RN 10 |
| trRow12 RN 5 |
| trRow20 RN 11 |
| trRow22 RN 7 |
| trRow30 RN 12 |
| trRow32 RN 14 ;// r14 is the link register, used here as scratch |
| |
| ;// Stage 3: intermediates of the row pass |
| constZero RN 1 ;// same register as pSrc; set to #0 after the initial load |
| e0 RN 4 |
| e1 RN 6 |
| e2 RN 8 |
| e3 RN 9 |
| |
| ;// Stage 3 results: row-operated values (each uses the same register as the trRow name with the same suffix) |
| rowOp00 RN 2 |
| rowOp02 RN 3 |
| rowOp10 RN 10 |
| rowOp12 RN 5 |
| rowOp20 RN 11 |
| rowOp22 RN 7 |
| rowOp30 RN 12 |
| rowOp32 RN 14 |
| |
| ;// Stage 4: row-pass results transposed for the column operations |
| trCol00 RN 2 |
| trCol02 RN 3 |
| trCol10 RN 4 |
| trCol12 RN 5 |
| trCol20 RN 6 |
| trCol22 RN 7 |
| trCol30 RN 8 |
| trCol32 RN 9 |
| |
| ;// Stage 5: intermediates of the column pass |
| g0 RN 10 |
| g1 RN 11 |
| g2 RN 12 |
| g3 RN 14 |
| |
| ;// Stage 5 results: column-operated values (same registers as the trCol names) |
| colOp00 RN 2 |
| colOp02 RN 3 |
| colOp10 RN 4 |
| colOp12 RN 5 |
| colOp20 RN 6 |
| colOp22 RN 7 |
| colOp30 RN 8 |
| colOp32 RN 9 |
| |
| ;// Stage 6: scratch registers for the final rounding |
| temp1 RN 10 ;// not referenced by the code visible in this file |
| const1 RN 11 ;// loaded with 0x00208020 |
| const2 RN 12 ;// set to #0x200 |
| mask RN 14 ;// loaded with 0xffff03ff |
| |
| ;// Output values written by the final STMIA (same registers as the colOp names) |
| out00 RN 2 |
| out02 RN 3 |
| out10 RN 4 |
| out12 RN 5 |
| out20 RN 6 |
| out22 RN 7 |
| out30 RN 8 |
| out32 RN 9 |
| |
| |
| |
| ;// Allocate stack memory required by the function (no stack allocation is declared here) |
| |
| |
| ;// Write function header. NOTE(review): the second M_START argument (r11) presumably names the highest callee-saved register to preserve; r14 (lr) is used as scratch below, so this relies on M_START/M_END saving and restoring lr - confirm against armCOMM_s.h |
| M_START armVCM4P10_TransformResidual4x4,r11 |
| |
| ;****************************************************************** |
| ;// armVCM4P10_TransformResidual4x4: reads a 4x4 block of 16-bit values at pSrc (r1) and writes the transformed 4x4 block to pDst (r0). |
| ;// Strategy: load the 4x4 block into 8 registers (two 16-bit values per register), |
| ;// transpose the 4x4 matrix, |
| ;// perform the row operations (on columns of the transposed data) using SIMD, |
| ;// transpose the 4x4 result matrix, |
| ;// perform the column operations, round each value with (x + 32) >> 6, |
| ;// and store the 4x4 block at one go. |
| ;****************************************************************** |
| |
| ;// Load all 16 input values (8 words) starting at pSrc; pSrc is not referenced again after this load |
| |
| LDMIA pSrc,{in00,in02,in10,in12,in20,in22,in30,in32} |
| |
| MOV constZero,#0 ;// Zero operand for SHADD16: (x + 0) >> 1 halves each halfword. constZero reuses r1 (pSrc) |
| ;LDR constZero,=0x00000000 |
| |
| ;***************************************************************** |
| ;// |
| ;// Transpose the matrix in order to perform row ops as column ops |
| ;// Input: in[][] = original matrix |
| ;// Output: trRow[][]= transposed matrix |
| ;// Step1: Obtain the LL part of the transposed matrix |
| ;// Step2: Obtain the HL part |
| ;// Step3: Obtain the LH part |
| ;// Step4: Obtain the HH part |
| ;// Notation below: [a b] = [fx:fy] means transposed elements a (upper halfword) and b (lower halfword) are taken from source elements x and y |
| ;***************************************************************** |
| |
| ;// LL 2x2 transposed matrix |
| ;// d0 d1 - - |
| ;// d4 d5 - - |
| ;// - - - - |
| ;// - - - - |
| |
| PKHTB trRow10,in10,in00,ASR #16 ;// [5 4] = [f5:f1] (upper halfwords of in10 and in00) |
| PKHBT trRow00,in00,in10,LSL #16 ;// [1 0] = [f4:f0] (lower halfwords of in10 and in00) |
| |
| ;// HL 2x2 transposed matrix |
| ;// - - - - |
| ;// - - - - |
| ;// d8 d9 - - |
| ;// d12 d13 - - |
| |
| |
| PKHTB trRow30,in12,in02,ASR #16 ;// [13 12] = [f7:f3] (upper halfwords of in12 and in02) |
| PKHBT trRow20,in02,in12,LSL #16 ;// [9 8] = [f6:f2] (lower halfwords of in12 and in02) |
| |
| ;// LH 2x2 transposed matrix |
| ;// - - d2 d3 |
| ;// - - d6 d7 |
| ;// - - - - |
| ;// - - - - |
| |
| PKHBT trRow02,in20,in30,LSL #16 ;// [3 2] = [f12:f8] (lower halfwords of in30 and in20) |
| PKHTB trRow12,in30,in20,ASR #16 ;// [7 6] = [f13:f9] (upper halfwords of in30 and in20) |
| |
| |
| |
| |
| ;// HH 2x2 transposed matrix |
| ;// - - - - |
| ;// - - - - |
| ;// - - d10 d11 |
| ;// - - d14 d15 |
| |
| PKHTB trRow32,in32,in22,ASR #16 ;// [15 14] = [f15:f11] (upper halfwords of in32 and in22) |
| PKHBT trRow22,in22,in32,LSL #16 ;// [11 10] = [f14:f10] (lower halfwords of in32 and in22) |
| |
| |
| ;**************************************** |
| ;// Row Operations (Performed on columns) |
| ;**************************************** |
| |
| |
| ;// SIMD operations on first two columns (two rows of the original matrix): each halfword lane holds one original row, and d0..d3 are that row's four elements |
| |
| |
| SADD16 e0, trRow00,trRow20 ;// e0 = d0 + d2 |
| SSUB16 e1, trRow00,trRow20 ;// e1 = d0 - d2 |
| SHADD16 e2, trRow10,constZero ;// (d1>>1) constZero is a register holding 0 |
| SHADD16 e3, trRow30,constZero ;// (d3>>1) avoid pipeline stalls for e2 and e3 |
| SSUB16 e2, e2, trRow30 ;// e2 = (d1>>1) - d3 |
| SADD16 e3, e3, trRow10 ;// e3 = d1 + (d3>>1) |
| SADD16 rowOp00, e0, e3 ;// f0 = e0 + e3 |
| SADD16 rowOp10, e1, e2 ;// f1 = e1 + e2 |
| SSUB16 rowOp20, e1, e2 ;// f2 = e1 - e2 |
| SSUB16 rowOp30, e0, e3 ;// f3 = e0 - e3 |
| |
| ;// SIMD operations on next two columns (next two rows of the original matrix): same e0..e3 / f0..f3 sequence as above, applied to trRow02..trRow32 |
| |
| SADD16 e0, trRow02,trRow22 |
| SSUB16 e1, trRow02,trRow22 |
| SHADD16 e2, trRow12,constZero ;// (d1>>1) constZero is a register holding 0 |
| SHADD16 e3, trRow32,constZero |
| SSUB16 e2, e2, trRow32 |
| SADD16 e3, e3, trRow12 |
| SADD16 rowOp02, e0, e3 |
| SADD16 rowOp12, e1, e2 |
| SSUB16 rowOp22, e1, e2 |
| SSUB16 rowOp32, e0, e3 |
| |
| |
| ;***************************************************************** |
| ;// Transpose the resultant matrix (same packing pattern as the first transpose, applied to the rowOp registers) |
| ;// Input: rowOp[][] |
| ;// Output: trCol[][] |
| ;***************************************************************** |
| |
| ;// LL 2x2 transposed matrix |
| ;// d0 d1 - - |
| ;// d4 d5 - - |
| ;// - - - - |
| ;// - - - - |
| |
| PKHTB trCol10,rowOp10,rowOp00,ASR #16 ;// [5 4] = [f5:f1] (upper halfwords of rowOp10 and rowOp00) |
| PKHBT trCol00,rowOp00,rowOp10,LSL #16 ;// [1 0] = [f4:f0] (lower halfwords of rowOp10 and rowOp00) |
| |
| ;// HL 2x2 transposed matrix |
| ;// - - - - |
| ;// - - - - |
| ;// d8 d9 - - |
| ;// d12 d13 - - |
| |
| |
| PKHTB trCol30,rowOp12,rowOp02,ASR #16 ;// [13 12] = [f7:f3] (upper halfwords of rowOp12 and rowOp02) |
| PKHBT trCol20,rowOp02,rowOp12,LSL #16 ;// [9 8] = [f6:f2] (lower halfwords of rowOp12 and rowOp02) |
| |
| ;// LH 2x2 transposed matrix |
| ;// - - d2 d3 |
| ;// - - d6 d7 |
| ;// - - - - |
| ;// - - - - |
| |
| PKHBT trCol02,rowOp20,rowOp30,LSL #16 ;// [3 2] = [f12:f8] (lower halfwords of rowOp30 and rowOp20) |
| PKHTB trCol12,rowOp30,rowOp20,ASR #16 ;// [7 6] = [f13:f9] (upper halfwords of rowOp30 and rowOp20) |
| |
| |
| |
| |
| ;// HH 2x2 transposed matrix |
| ;// - - - - |
| ;// - - - - |
| ;// - - d10 d11 |
| ;// - - d14 d15 |
| |
| PKHTB trCol32,rowOp32,rowOp22,ASR #16 ;// [15 14] = [f15:f11] (upper halfwords of rowOp32 and rowOp22) |
| PKHBT trCol22,rowOp22,rowOp32,LSL #16 ;// [11 10] = [f14:f10] (lower halfwords of rowOp32 and rowOp22) |
| |
| |
| ;******************************* |
| ;// Column Operations |
| ;******************************* |
| |
| |
| ;// SIMD operations on first two columns: same add/subtract/halve sequence as the row pass, with g0..g3 as intermediates and trCol00..trCol30 as inputs |
| |
| |
| SADD16 g0, trCol00,trCol20 |
| SSUB16 g1, trCol00,trCol20 |
| SHADD16 g2, trCol10,constZero ;// (f1>>1) constZero is a register holding 0 |
| SHADD16 g3, trCol30,constZero |
| SSUB16 g2, g2, trCol30 |
| SADD16 g3, g3, trCol10 |
| SADD16 colOp00, g0, g3 |
| SADD16 colOp10, g1, g2 |
| SSUB16 colOp20, g1, g2 |
| SSUB16 colOp30, g0, g3 |
| |
| ;// SIMD operations on next two columns: same sequence with trCol02..trCol32 as inputs |
| |
| SADD16 g0, trCol02,trCol22 |
| SSUB16 g1, trCol02,trCol22 |
| SHADD16 g2, trCol12,constZero ;// (f1>>1) constZero is a register holding 0 |
| SHADD16 g3, trCol32,constZero |
| SSUB16 g2, g2, trCol32 |
| SADD16 g3, g3, trCol12 |
| SADD16 colOp02, g0, g3 |
| SADD16 colOp12, g1, g2 |
| SSUB16 colOp22, g1, g2 |
| SSUB16 colOp32, g0, g3 |
| |
| |
| |
| |
| |
| ;************************************************ |
| ;// Calculate final value (colOp[i][j] + 32)>>6 |
| ;************************************************ |
| |
| ;// const1 (0x00208020): serves a dual purpose |
| ;// (1) Adds #32 to both the lower and upper 16 bits of the SIMD result (upper halfword 0x0020, lower halfword 0x8020 = 32 + 32768) |
| ;// (2) Converts the lower 16 bit value to an unsigned number (adds 32768) so the 32-bit shift below can be corrected afterwards |
| |
| LDR const1, =0x00208020 |
| |
| LDR mask, =0xffff03ff ;// Clears bits 15:10 of the lower halfword, which receive the upper halfword's low 6 bits from the 32-bit ASR #6 |
| |
| ;// const2 (#512 = 32768 >> 6): subtracted from the lower halfword to convert it back to a signed value; the upper halfword has 0 subtracted |
| |
| MOV const2,#0x200 ;// const2 = 2^9 |
| |
| ;// First Row (colOp00/colOp02): add rounding and bias, shift right by 6 and mask, then remove the shifted bias |
| |
| SADD16 colOp00, colOp00, const1 |
| SADD16 colOp02, colOp02, const1 |
| AND colOp00, mask, colOp00, ASR #6 |
| AND colOp02, mask, colOp02, ASR #6 |
| SSUB16 out00,colOp00,const2 |
| SSUB16 out02,colOp02,const2 |
| |
| |
| ;// Second Row (colOp10/colOp12): same three steps |
| |
| SADD16 colOp10, colOp10, const1 |
| SADD16 colOp12, colOp12, const1 |
| AND colOp10, mask, colOp10, ASR #6 |
| AND colOp12, mask, colOp12, ASR #6 |
| SSUB16 out10,colOp10,const2 |
| SSUB16 out12,colOp12,const2 |
| |
| |
| ;// Third Row (colOp20/colOp22): same three steps |
| |
| SADD16 colOp20, colOp20, const1 |
| SADD16 colOp22, colOp22, const1 |
| AND colOp20, mask, colOp20, ASR #6 |
| AND colOp22, mask, colOp22, ASR #6 |
| SSUB16 out20,colOp20,const2 |
| SSUB16 out22,colOp22,const2 |
| |
| |
| ;// Fourth Row (colOp30/colOp32): same three steps |
| |
| SADD16 colOp30, colOp30, const1 |
| SADD16 colOp32, colOp32, const1 |
| AND colOp30, mask, colOp30, ASR #6 |
| AND colOp32, mask, colOp32, ASR #6 |
| SSUB16 out30,colOp30,const2 |
| SSUB16 out32,colOp32,const2 |
| |
| |
| |
| |
| ;*************************** |
| ;// Store all 16 output values (8 words) at pDst |
| ;*************************** |
| |
| STMIA pDst,{out00,out02,out10,out12,out20,out22,out30,out32} |
| |
| |
| |
| ;// Set return value (no instruction sets one here; 'End' below appears to be a label placed just before the function tail) |
| |
| End |
| |
| |
| ;// Write function tail |
| M_END |
| |
| ENDIF ;//ARM1136JS |
| |
| |
| |
| |
| |
| |
| |
| ;// End of the processor-guarded implementation (the ARM1136JS guard is closed by the ENDIF above) |
| |
| |
| END |