| ;// |
| ;// Copyright (C) 2007-2008 ARM Limited |
| ;// |
| ;// Licensed under the Apache License, Version 2.0 (the "License"); |
| ;// you may not use this file except in compliance with the License. |
| ;// You may obtain a copy of the License at |
| ;// |
| ;// http://www.apache.org/licenses/LICENSE-2.0 |
| ;// |
| ;// Unless required by applicable law or agreed to in writing, software |
| ;// distributed under the License is distributed on an "AS IS" BASIS, |
| ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| ;// See the License for the specific language governing permissions and |
| ;// limitations under the License. |
| ;// |
| ;// |
| ;// |
| ;// File Name: armVCM4P10_TransformResidual4x4_s.s |
| ;// OpenMAX DL: v1.0.2 |
| ;// Revision: 12290 |
| ;// Date: Wednesday, April 9, 2008 |
| ;// |
| ;// |
| ;// |
| ;// |
| ;// Description: |
| ;// Transform Residual 4x4 Coefficients |
| ;// |
| ;// |
| |
| |
| ;// Include standard headers |
| |
| INCLUDE omxtypes_s.h |
| INCLUDE armCOMM_s.h |
| |
| M_VARIANTS CortexA8 |
| |
| ;// Import symbols required from other files |
| ;// (For example tables) |
| |
| |
| |
| |
| ;// Set debugging level |
| ;//DEBUG_ON SETL {TRUE} |
| |
| |
| |
| ;// Guarding implementation by the processor name |
| |
| |
| |
| |
| |
| |
| |
| |
| ;// Guarding implementation by the processor name |
| |
| IF CortexA8 |
| |
| ;// ARM core register aliases |
| |
| ;// Arguments: pDst aliases r0 and pSrc aliases r1; the function body uses them as the store and load addresses |
| pDst RN 0 |
| pSrc RN 1 |
| |
| |
| ;// Neon register aliases. D aliases use S16 lanes; several aliases deliberately share one physical register |
| |
| ;// Input values: D0-D3, written by the VLD4 load from pSrc |
| dIn0 DN D0.S16 |
| dIn1 DN D1.S16 |
| dIn2 DN D2.S16 |
| dIn3 DN D3.S16 |
| |
| ;// Intermediates: de*/dg* use D5-D8; the *RS (right-shifted) temporaries overlay D7/D8; df* overlay D0-D3; qf01/qf23 are 32-bit-lane Q views of df0:df1 and df2:df3 |
| dZero DN D4.S16 |
| de0 DN D5.S16 |
| de1 DN D6.S16 |
| de2 DN D7.S16 |
| de3 DN D8.S16 |
| dIn1RS DN D7.S16 |
| dIn3RS DN D8.S16 |
| df0 DN D0.S16 |
| df1 DN D1.S16 |
| df2 DN D2.S16 |
| df3 DN D3.S16 |
| qf01 QN Q0.32 |
| qf23 QN Q1.32 |
| dg0 DN D5.S16 |
| dg1 DN D6.S16 |
| dg2 DN D7.S16 |
| dg3 DN D8.S16 |
| df1RS DN D7.S16 |
| df3RS DN D8.S16 |
| |
| ;// Output values: dh0-dh3 overlay D0-D3 and are the registers stored to pDst |
| dh0 DN D0.S16 |
| dh1 DN D1.S16 |
| dh2 DN D2.S16 |
| dh3 DN D3.S16 |
| |
| |
| ;// Stack: no local stack storage is declared for this function |
| |
| |
| ;// Function header. d8 is passed to M_START, presumably so the macro saves/restores D8 (callee-saved under AAPCS) - confirm in armCOMM_s.h |
| M_START armVCM4P10_TransformResidual4x4, ,d8 |
| |
| ;****************************************************************** |
| ;// The strategy used in implementing the transform is as follows:* |
| ;// Load the 4x4 block of 16-bit values into D0-D3; the VLD4 * |
| ;// de-interleave delivers the block already transposed * |
| ;// Perform the row butterflies, four lanes at a time * |
| ;// Transpose the 4x4 result matrix with VTRN * |
| ;// Perform the column butterflies, then round: (x + 32) >> 6 * |
| ;// Store the 4x4 block (32 bytes) to pDst with one VST1 * |
| ;****************************************************************** |
| |
| ;// De-interleaving load: dInN receives elements N, N+4, N+8, N+12 (i.e. column N if the block at pSrc is stored row-major) |
| |
| VLD4 {dIn0,dIn1,dIn2,dIn3},[pSrc] |
| |
| VMOV dZero,#0 ;// Zero operand: VHADD x,dZero = (x + 0) >> 1, i.e. a signed shift right by 1 |
| |
| |
| ;**************************************** |
| ;// Row operations: the register index selects the coefficient, each lane is an independent 1-D transform |
| ;**************************************** |
| |
| |
| VADD de0,dIn0,dIn2 ;// e0 = d0 + d2 |
| VSUB de1,dIn0,dIn2 ;// e1 = d0 - d2 |
| VHADD dIn1RS,dIn1,dZero ;// d1>>1 via halving add with dZero; the next line does the same for d3 |
| VHADD dIn3RS,dIn3,dZero |
| VSUB de2,dIn1RS,dIn3 ;// e2 = (d1>>1) - d3; result replaces dIn1RS (same register, D7) |
| VADD de3,dIn1,dIn3RS ;// e3 = d1 + (d3>>1); result replaces dIn3RS (same register, D8) |
| VADD df0,de0,de3 ;// f0 = e0 + e3 (df0-df3 overwrite the inputs in D0-D3) |
| VADD df1,de1,de2 ;// f1 = e1 + e2 |
| VSUB df2,de1,de2 ;// f2 = e1 - e2 |
| VSUB df3,de0,de3 ;// f3 = e0 - e3 |
| |
| |
| |
| ;***************************************************************** |
| ;// Transpose the 4x4 result: swap 16-bit lanes within the D0/D1 and D2/D3 pairs, then 32-bit lanes between Q0 and Q1 |
| ;***************************************************************** |
| |
| VTRN df0,df1 |
| VTRN df2,df3 |
| VTRN qf01,qf23 |
| |
| |
| ;******************************* |
| ;// Column operations: the same butterfly as the row pass, applied to the transposed data |
| ;******************************* |
| |
| |
| VADD dg0,df0,df2 ;// g0 = f0 + f2 |
| VSUB dg1,df0,df2 ;// g1 = f0 - f2 |
| VHADD df1RS,df1,dZero ;// f1>>1 via halving add with dZero; the next line does the same for f3 |
| VHADD df3RS,df3,dZero |
| VSUB dg2,df1RS,df3 ;// g2 = (f1>>1) - f3; result replaces df1RS (same register, D7) |
| VADD dg3,df1,df3RS ;// g3 = f1 + (f3>>1); result replaces df3RS (same register, D8) |
| VADD dh0,dg0,dg3 ;// h0 = g0 + g3 (dh0-dh3 overwrite df0-df3 in D0-D3) |
| VADD dh1,dg1,dg2 ;// h1 = g1 + g2 |
| VSUB dh2,dg1,dg2 ;// h2 = g1 - g2 |
| VSUB dh3,dg0,dg3 ;// h3 = g0 - g3 |
| |
| |
| ;************************************************ |
| ;// Calculate final value (colOp[i][j] + 32)>>6 in place with the rounding shift VRSHR |
| ;************************************************ |
| |
| VRSHR dh0,#6 |
| VRSHR dh1,#6 |
| VRSHR dh2,#6 |
| VRSHR dh3,#6 |
| |
| |
| ;*************************** |
| ;// Store all 16 results (D0-D3, 32 bytes) contiguously at pDst |
| ;*************************** |
| |
| VST1 {dh0,dh1,dh2,dh3},[pDst] |
| |
| |
| ;// No instruction here writes a return value; End is only a label placed ahead of the function tail |
| |
| End |
| |
| |
| ;// Function tail: M_END closes the function opened by M_START |
| M_END |
| |
| ENDIF ;//CortexA8 |
| |
| END |