| ;// |
| ;// Copyright (C) 2004 ARM Limited |
| ;// |
| ;// Licensed under the Apache License, Version 2.0 (the "License"); |
| ;// you may not use this file except in compliance with the License. |
| ;// You may obtain a copy of the License at |
| ;// |
| ;// http://www.apache.org/licenses/LICENSE-2.0 |
| ;// |
| ;// Unless required by applicable law or agreed to in writing, software |
| ;// distributed under the License is distributed on an "AS IS" BASIS, |
| ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| ;// See the License for the specific language governing permissions and |
| ;// limitations under the License. |
| ;// |
| ;// |
| ;// |
| ;// IDCT_s.s |
| ;// |
| ;// Inverse DCT module |
| ;// |
| ;// |
| ;// ALGORITHM DESCRIPTION |
| ;// |
| ;// The 8x8 2D IDCT is performed by calculating a 1D IDCT for each |
| ;// column and then a 1D IDCT for each row. |
| ;// |
| ;// The 8-point 1D IDCT is defined by |
| ;// f(x) = (C(0)*T(0)*c(0,x) + ... + C(7)*T(7)*c(7,x))/2 |
| ;// |
| ;// C(u) = 1/sqrt(2) if u=0 or 1 if u!=0 |
| ;// c(u,x) = cos( (2x+1)*u*pi/16 ) |
| ;// |
| ;// We compute the 8-point 1D IDCT using the reverse of |
| ;// the Arai-Agui-Nakajima flow graph which we split into |
| ;// 5 stages named in reverse order to identify with the |
| ;// forward DCT. Direct inversion of the forward formulae |
| ;// in file FDCT_s.s gives: |
| ;// |
| ;// IStage 5: j(u) = T(u)*A(u) [ A(u)=4*C(u)*c(u,0) ] |
| ;// [ A(0) = 2*sqrt(2) |
| ;// A(u) = 4*cos(u*pi/16) for (u!=0) ] |
| ;// |
| ;// IStage 4: i0 = j0 i1 = j4 |
| ;// i3 = (j2+j6)/2 i2 = (j2-j6)/2 |
| ;// i7 = (j5+j3)/2 i4 = (j5-j3)/2 |
| ;// i5 = (j1+j7)/2 i6 = (j1-j7)/2 |
| ;// |
| ;// IStage 3: h0 = (i0+i1)/2 h1 = (i0-i1)/2 |
| ;// h2 = (i2*sqrt2)-i3 h3 = i3 |
| ;// h4 = cos(pi/8)*i4 + sin(pi/8)*i6 |
| ;// h6 = -sin(pi/8)*i4 + cos(pi/8)*i6 |
| ;// [ The above two lines rotate by -(pi/8) ] |
| ;// h5 = (i5-i7)/sqrt2 h7 = (i5+i7)/2 |
| ;// |
| ;// IStage 2: g0 = (h0+h3)/2 g3 = (h0-h3)/2 |
| ;// g1 = (h1+h2)/2 g2 = (h1-h2)/2 |
| ;// g7 = h7 g6 = h6 - h7 |
| ;// g5 = h5 - g6 g4 = h4 - g5 |
| ;// |
| ;// IStage 1: f0 = (g0+g7)/2 f7 = (g0-g7)/2 |
| ;// f1 = (g1+g6)/2 f6 = (g1-g6)/2 |
| ;// f2 = (g2+g5)/2 f5 = (g2-g5)/2 |
| ;// f3 = (g3+g4)/2 f4 = (g3-g4)/2 |
| ;// |
| ;// Note that most coefficients are halved 3 times during the |
| ;// above calculation. We can rescale the algorithm dividing |
| ;// the input by 8 to remove the halvings. |
| ;// |
| ;// IStage 5: j(u) = T(u)*A(u)/8 |
| ;// |
| ;// IStage 4: i0 = j0 i1 = j4 |
| ;// i3 = j2 + j6 i2 = j2 - j6 |
| ;// i7 = j5 + j3 i4 = j5 - j3 |
| ;// i5 = j1 + j7 i6 = j1 - j7 |
| ;// |
| ;// IStage 3: h0 = i0 + i1 h1 = i0 - i1 |
| ;// h2 = (i2*sqrt2)-i3 h3 = i3 |
| ;// h4 = 2*( cos(pi/8)*i4 + sin(pi/8)*i6) |
| ;// h6 = 2*(-sin(pi/8)*i4 + cos(pi/8)*i6) |
| ;// h5 = (i5-i7)*sqrt2 h7 = i5 + i7 |
| ;// |
| ;// IStage 2: g0 = h0 + h3 g3 = h0 - h3 |
| ;// g1 = h1 + h2 g2 = h1 - h2 |
| ;// g7 = h7 g6 = h6 - h7 |
| ;// g5 = h5 - g6 g4 = h4 - g5 |
| ;// |
| ;// IStage 1: f0 = g0 + g7 f7 = g0 - g7 |
| ;// f1 = g1 + g6 f6 = g1 - g6 |
| ;// f2 = g2 + g5 f5 = g2 - g5 |
| ;// f3 = g3 + g4 f4 = g3 - g4 |
| ;// |
| ;// Note: |
| ;// 1. The scaling by A(u)/8 can often be combined with inverse |
| ;// quantization. The column and row scalings can be combined. |
| ;// 2. The flowgraph in the AAN paper has h4,g6 negated compared |
| ;// to the above code but is otherwise identical. |
| ;// 3. The rotation by -pi/8 can be performed using three multiplies |
| ;// Eg c*i4+s*i6 = (i6-i4)*s + (c+s)*i4 |
| ;// -s*i4+c*i6 = (i6-i4)*s + (c-s)*i6 |
| ;// 4. If |T(u)|<=1 then from the IDCT definition, |
| ;// |f(x)| <= ((1/sqrt2) + |c(1,x)| + .. + |c(7,x)|)/2 |
| ;// = ((1/sqrt2) + cos(pi/16) + ... + cos(7*pi/16))/2 |
| ;// = ((1/sqrt2) + (cot(pi/32)-1)/2)/2 |
| ;// = (1 + cos(pi/16) + cos(2pi/16) + cos(3pi/16))/sqrt(2) |
| ;// = (approx)2.64 |
| ;// So the max gain of the 2D IDCT is ~x7.0 = 3 bits. |
| ;// The table below shows input patterns generating the maximum |
| ;// value of |f(x)| for input in the range |T(u)|<=1. M=-1, P=+1 |
| ;// InputPattern Max |f(x)| |
| ;// PPPPPPPP |f0| = 2.64 |
| ;// PPPMMMMM |f1| = 2.64 |
| ;// PPMMMPPP |f2| = 2.64 |
| ;// PPMMPPMM |f3| = 2.64 |
| ;// PMMPPMMP |f4| = 2.64 |
| ;// PMMPMMPM |f5| = 2.64 |
| ;// PMPPMPMP |f6| = 2.64 |
| ;// PMPMPMPM |f7| = 2.64 |
| ;// Note that this input pattern is the transpose of the |
| ;// corresponding max input pattern for the FDCT. |
| |
| ;// Arguments |
| ;// Symbolic names for the registers r0-r3 through which the M_IDCT |
| ;// macro receives its inputs. |
| |
| pSrc RN 0 ;// source data buffer |
| Stride RN 1 ;// destination stride in bytes (only read when $stride is "s") |
| pDest RN 2 ;// destination data buffer |
| pScale RN 3 ;// pointer to scaling table (layout selected by $inscale) |
| |
| |
| ;// DCT Inverse Macro |
| ;// The DCT code should be parametrized according |
| ;// to the following inputs: |
| ;// $outsize = "u8" : 8-bit unsigned data saturated (0 to +255) |
| ;// "s9" : 16-bit signed data saturated to 9-bit (-256 to +255) |
| ;// "s16" : 16-bit signed data not saturated (max size ~+/-14273) |
| ;// $inscale = "s16" : signed 16-bit aan-scale table, Q15 format, with 4 byte alignment |
| ;// "s32" : signed 32-bit aan-scale table, Q23 format, with 4 byte alignment |
| ;// $stride = "s" : output row stride is taken from the Stride register (r1) |
| ;// otherwise : the ARM1136JS path adds #($stride) to pDest after each |
| ;// output row; the CortexA8 path stores the rows back to back |
| ;// |
| ;// Inputs: |
| ;// pSrc = r0 = Pointer to input data |
| ;// Range is -256 to +255 (9-bit) |
| ;// Stride = r1 = Stride between output lines (only read when $stride is "s") |
| ;// pDest = r2 = Pointer to output data |
| ;// pScale = r3 = Pointer to aan-scale table in the format defined by $inscale |
| |
| |
| |
| MACRO |
| M_IDCT $outsize, $inscale, $stride |
| LCLA SHIFT ;// assembly-time variable: right shift applied after the AAN scaling multiply |
| |
| |
| IF ARM1136JS |
| |
| ;// REGISTER ALLOCATION |
| ;// This is hard since we have 8 values, 9 free registers and each |
| ;// butterfly requires a temporary register. We also want to |
| ;// maintain register order so we can use LDM/STM. The table below |
| ;// summarises the register allocation that meets all these criteria. |
| ;// a=1stcol, b=2ndcol, f,g,h,i are dataflow points described above. |
| ;// |
| ;// r1 a01 g0 h0 |
| ;// r4 b01 f0 g1 h1 i0 |
| ;// r5 a23 f1 g2 i1 |
| ;// r6 b23 f2 g3 h2 i2 |
| ;// r7 a45 f3 h3 i3 |
| ;// r8 b45 f4 g4 h4 i4 |
| ;// r9 a67 f5 g5 h5 i5 |
| ;// r10 b67 f6 g6 h6 i6 |
| ;// r11 f7 g7 h7 i7 |
| ;// |
| ;// Note that ra01 is r1, i.e. the same register as the Stride argument, |
| ;// so Stride is saved below before the register is reused. |
| ra01 RN 1 |
| rb01 RN 4 |
| ra23 RN 5 |
| rb23 RN 6 |
| ra45 RN 7 |
| rb45 RN 8 |
| ra67 RN 9 |
| rb67 RN 10 |
| rtmp RN 11 |
| csPiBy8 RN 12 ;// [ (Sin(pi/8)@Q15), (Cos(pi/8)@Q15) ] |
| LoopRR2 RN 14 ;// [ LoopNumber<<13 , (1/Sqrt(2))@Q15 ] |
| ;// Transpose allocation |
| xft RN ra01 |
| xf0 RN rb01 |
| xf1 RN ra23 |
| xf2 RN rb23 |
| xf3 RN ra45 |
| xf4 RN rb45 |
| xf5 RN ra67 |
| xf6 RN rb67 |
| xf7 RN rtmp |
| ;// IStage 1 allocation |
| xg0 RN xft |
| xg1 RN xf0 |
| xg2 RN xf1 |
| xg3 RN xf2 |
| xgt RN xf3 |
| xg4 RN xf4 |
| xg5 RN xf5 |
| xg6 RN xf6 |
| xg7 RN xf7 |
| ;// IStage 2 allocation |
| xh0 RN xg0 |
| xh1 RN xg1 |
| xht RN xg2 |
| xh2 RN xg3 |
| xh3 RN xgt |
| xh4 RN xg4 |
| xh5 RN xg5 |
| xh6 RN xg6 |
| xh7 RN xg7 |
| ;// IStage 3,4 allocation |
| xit RN xh0 |
| xi0 RN xh1 |
| xi1 RN xht |
| xi2 RN xh2 |
| xi3 RN xh3 |
| xi4 RN xh4 |
| xi5 RN xh5 |
| xi6 RN xh6 |
| xi7 RN xh7 |
| |
| ;// M_STR / M_ADR / M_LDR and the names ppDest, pStride and pBlk are |
| ;// defined outside this chunk (presumably local-variable helpers of the |
| ;// enclosing function - confirm against their definitions). |
| M_STR pDest, ppDest ;// save the caller's pDest; reloaded before the row pass |
| IF "$stride"="s" |
| M_STR Stride, pStride ;// save Stride: r1 is reused as ra01/xit below |
| ENDIF |
| M_ADR pDest, pBlk ;// the column pass writes its results to pBlk |
| LDR csPiBy8, =0x30fc7642 ;// top halfword 0x30fc, bottom halfword 0x7642 |
| LDR LoopRR2, =0x00005a82 ;// bottom halfword 0x5a82; upper bits start at 0 and count iterations |
| ;// Column pass. Every 32-bit load fetches two adjacent 16-bit values, so |
| ;// each iteration handles two columns at once (pSrc advances by 4 bytes, |
| ;// input rows are 16 bytes apart). The values are scaled by the pScale |
| ;// table, run through the 1D IDCT, transposed and written to pDest |
| ;// (32 bytes per iteration). The loop ends when the ADDS on LoopRR2 |
| ;// produces a carry, i.e. after four iterations. |
| v6_idct_col$_F |
| ;// Load even values |
| LDR xi4, [pSrc], #4 ;// j0 |
| LDR xi5, [pSrc, #4*16-4] ;// j4 |
| LDR xi6, [pSrc, #2*16-4] ;// j2 |
| LDR xi7, [pSrc, #6*16-4] ;// j6 |
| |
| ;// Scale Even Values |
| IF "$inscale"="s16" ;// 16x16 mul |
| SHIFT SETA 12 |
| LDR xi0, [pScale], #4 |
| LDR xi1, [pScale, #4*16-4] |
| LDR xi2, [pScale, #2*16-4] |
| MOV xit, #1<<(SHIFT-1) ;// rounding term added by the SMLAxx accumulates |
| SMLABB xi3, xi0, xi4, xit |
| SMLATT xi4, xi0, xi4, xit |
| SMLABB xi0, xi1, xi5, xit |
| SMLATT xi5, xi1, xi5, xit |
| MOV xi3, xi3, ASR #SHIFT |
| PKHBT xi4, xi3, xi4, LSL #(16-SHIFT) ;// repack both scaled halves into one word |
| LDR xi3, [pScale, #6*16-4] |
| SMLABB xi1, xi2, xi6, xit |
| SMLATT xi6, xi2, xi6, xit |
| MOV xi0, xi0, ASR #SHIFT |
| PKHBT xi5, xi0, xi5, LSL #(16-SHIFT) |
| SMLABB xi2, xi3, xi7, xit |
| SMLATT xi7, xi3, xi7, xit |
| MOV xi1, xi1, ASR #SHIFT |
| PKHBT xi6, xi1, xi6, LSL #(16-SHIFT) |
| MOV xi2, xi2, ASR #SHIFT |
| PKHBT xi7, xi2, xi7, LSL #(16-SHIFT) |
| ENDIF |
| IF "$inscale"="s32" ;// 32x16 mul |
| SHIFT SETA (12+8-16) |
| MOV xit, #1<<(SHIFT-1) ;// rounding term added by the SMLAWx accumulates |
| LDR xi0, [pScale], #8 |
| LDR xi1, [pScale, #0*32+4-8] |
| LDR xi2, [pScale, #4*32-8] |
| LDR xi3, [pScale, #4*32+4-8] |
| SMLAWB xi0, xi0, xi4, xit |
| SMLAWT xi1, xi1, xi4, xit |
| SMLAWB xi2, xi2, xi5, xit |
| SMLAWT xi3, xi3, xi5, xit |
| MOV xi0, xi0, ASR #SHIFT |
| PKHBT xi4, xi0, xi1, LSL #(16-SHIFT) |
| MOV xi2, xi2, ASR #SHIFT |
| PKHBT xi5, xi2, xi3, LSL #(16-SHIFT) |
| LDR xi0, [pScale, #2*32-8] |
| LDR xi1, [pScale, #2*32+4-8] |
| LDR xi2, [pScale, #6*32-8] |
| LDR xi3, [pScale, #6*32+4-8] |
| SMLAWB xi0, xi0, xi6, xit |
| SMLAWT xi1, xi1, xi6, xit |
| SMLAWB xi2, xi2, xi7, xit |
| SMLAWT xi3, xi3, xi7, xit |
| MOV xi0, xi0, ASR #SHIFT |
| PKHBT xi6, xi0, xi1, LSL #(16-SHIFT) |
| MOV xi2, xi2, ASR #SHIFT |
| PKHBT xi7, xi2, xi3, LSL #(16-SHIFT) |
| ENDIF |
| |
| ;// Load odd values |
| LDR xi0, [pSrc, #1*16-4] ;// j1 |
| LDR xi1, [pSrc, #7*16-4] ;// j7 |
| LDR xi2, [pSrc, #5*16-4] ;// j5 |
| LDR xi3, [pSrc, #3*16-4] ;// j3 |
| |
| IF {TRUE} |
| ;// shortcut if odd values 0 |
| ;// (the TEQEQ chain leaves Z set only if all four words are zero) |
| TEQ xi0, #0 |
| TEQEQ xi1, #0 |
| TEQEQ xi2, #0 |
| TEQEQ xi3, #0 |
| BEQ v6OddZero$_F |
| ENDIF |
| |
| ;// Store scaled even values |
| ;// (pDest is used as scratch so xi4-xi7 can be reused; the values are |
| ;// read back by the two LDRD instructions further down) |
| STMIA pDest, {xi4, xi5, xi6, xi7} |
| |
| ;// Scale odd values |
| IF "$inscale"="s16" |
| ;// Perform AAN Scale |
| LDR xi4, [pScale, #1*16-4] |
| LDR xi5, [pScale, #7*16-4] |
| LDR xi6, [pScale, #5*16-4] |
| SMLABB xi7, xi0, xi4, xit |
| SMLATT xi0, xi0, xi4, xit |
| SMLABB xi4, xi1, xi5, xit |
| SMLATT xi1, xi1, xi5, xit |
| MOV xi7, xi7, ASR #SHIFT |
| PKHBT xi0, xi7, xi0, LSL #(16-SHIFT) |
| LDR xi7, [pScale, #3*16-4] |
| SMLABB xi5, xi2, xi6, xit |
| SMLATT xi2, xi2, xi6, xit |
| MOV xi4, xi4, ASR #SHIFT |
| PKHBT xi1, xi4, xi1, LSL #(16-SHIFT) |
| SMLABB xi6, xi3, xi7, xit |
| SMLATT xi3, xi3, xi7, xit |
| MOV xi5, xi5, ASR #SHIFT |
| PKHBT xi2, xi5, xi2, LSL #(16-SHIFT) |
| MOV xi6, xi6, ASR #SHIFT |
| PKHBT xi3, xi6, xi3, LSL #(16-SHIFT) |
| ENDIF |
| IF "$inscale"="s32" ;// 32x16 mul |
| LDR xi4, [pScale, #1*32-8] |
| LDR xi5, [pScale, #1*32+4-8] |
| LDR xi6, [pScale, #7*32-8] |
| LDR xi7, [pScale, #7*32+4-8] |
| SMLAWB xi4, xi4, xi0, xit |
| SMLAWT xi5, xi5, xi0, xit |
| SMLAWB xi6, xi6, xi1, xit |
| SMLAWT xi7, xi7, xi1, xit |
| MOV xi4, xi4, ASR #SHIFT |
| PKHBT xi0, xi4, xi5, LSL #(16-SHIFT) |
| MOV xi6, xi6, ASR #SHIFT |
| PKHBT xi1, xi6, xi7, LSL #(16-SHIFT) |
| LDR xi4, [pScale, #5*32-8] |
| LDR xi5, [pScale, #5*32+4-8] |
| LDR xi6, [pScale, #3*32-8] |
| LDR xi7, [pScale, #3*32+4-8] |
| SMLAWB xi4, xi4, xi2, xit |
| SMLAWT xi5, xi5, xi2, xit |
| SMLAWB xi6, xi6, xi3, xit |
| SMLAWT xi7, xi7, xi3, xit |
| MOV xi4, xi4, ASR #SHIFT |
| PKHBT xi2, xi4, xi5, LSL #(16-SHIFT) |
| MOV xi6, xi6, ASR #SHIFT |
| PKHBT xi3, xi6, xi7, LSL #(16-SHIFT) |
| ENDIF |
| |
| SHADD16 xi5, xi0, xi1 ;// (j1+j7)/2 |
| SSUB16 xi6, xi0, xi1 ;// j1-j7 |
| SHADD16 xi7, xi2, xi3 ;// (j5+j3)/2 |
| SSUB16 xi4, xi2, xi3 ;// j5-j3 |
| |
| SSUB16 xi3, xi5, xi7 ;// (i5-i7)/2 |
| |
| ;// Regroup so that each word holds the i4/i6 pair of one column, |
| ;// ready for the dual 16-bit multiplies against csPiBy8. |
| PKHBT xi0, xi6, xi4, LSL#16 ;// [i4,i6] row a |
| PKHTB xi1, xi4, xi6, ASR#16 ;// [i4,i6] row b |
| |
| SMUADX xi2, xi0, csPiBy8 ;// rowa by [c,s] |
| SMUADX xi4, xi1, csPiBy8 ;// rowb by [c,s] |
| SMUSD xi0, xi0, csPiBy8 ;// rowa by [-s,c] |
| SMUSD xi6, xi1, csPiBy8 ;// rowb by [-s,c] |
| |
| ;// Multiply both halves of (i5-i7)/2 by the bottom halfword of LoopRR2 |
| SMULBB xi1, xi3, LoopRR2 |
| SMULTB xi3, xi3, LoopRR2 |
| |
| PKHTB xh4, xi4, xi2, ASR#16 ;// h4/4 |
| PKHTB xh6, xi6, xi0, ASR#16 ;// h6/4 |
| SHADD16 xh7, xi5, xi7 ;// (i5+i7)/4 |
| |
| ;// xi0,xi1,xi2,xi3 now free |
| ;// IStage 4,3, rows 2to3 x1/2 |
| |
| MOV xi3, xi3, LSL #1 |
| PKHTB xh5, xi3, xi1, ASR#15 ;// h5/4 |
| LDRD xi0, [pDest, #8] ;// j2,j6 scaled |
| |
| ;// IStage 2, rows4to7 |
| SSUB16 xg6, xh6, xh7 |
| SSUB16 xg5, xh5, xg6 |
| SSUB16 xg4, xh4, xg5 |
| |
| SSUB16 xi2, xi0, xi1 ;// (j2-j6) |
| SHADD16 xi3, xi0, xi1 ;// (j2+j6)/2 |
| |
| SMULBB xi0, xi2, LoopRR2 |
| SMULTB xi2, xi2, LoopRR2 |
| |
| MOV xi2, xi2, LSL #1 |
| PKHTB xh2, xi2, xi0, ASR#15 ;// i2*sqrt(2)/4 |
| |
| ;// xi0, xi1 now free |
| ;// IStage 4,3 rows 0to1 x 1/2 |
| LDRD xi0, [pDest] ;// j0, j4 scaled |
| SSUB16 xh2, xh2, xi3 |
| ;// The carry produced here is consumed by the BCC at the end of the |
| ;// loop; no instruction in between uses an S suffix. |
| ADDS LoopRR2, LoopRR2, #2<<29 ;// done two rows |
| |
| SHADD16 xh0, xi0, xi1 |
| SHSUB16 xh1, xi0, xi1 |
| |
| ;// IStage 2 rows 0to3 x 1/2 |
| SHSUB16 xg2, xh1, xh2 |
| SHADD16 xg1, xh1, xh2 |
| SHSUB16 xg3, xh0, xh3 |
| SHADD16 xg0, xh0, xh3 |
| |
| ;// IStage 1 all rows |
| SADD16 xf3, xg3, xg4 |
| SSUB16 xf4, xg3, xg4 |
| SADD16 xf2, xg2, xg5 |
| SSUB16 xf5, xg2, xg5 |
| SADD16 xf1, xg1, xg6 |
| SSUB16 xf6, xg1, xg6 |
| SADD16 xf0, xg0, xg7 |
| SSUB16 xf7, xg0, xg7 |
| |
| ;// Transpose, store and loop |
| ;// (raNN collects the bottom halfwords, rbNN the top halfwords) |
| PKHBT ra01, xf0, xf1, LSL #16 |
| PKHTB rb01, xf1, xf0, ASR #16 |
| |
| PKHBT ra23, xf2, xf3, LSL #16 |
| PKHTB rb23, xf3, xf2, ASR #16 |
| |
| PKHBT ra45, xf4, xf5, LSL #16 |
| PKHTB rb45, xf5, xf4, ASR #16 |
| |
| PKHBT ra67, xf6, xf7, LSL #16 |
| STMIA pDest!, {ra01, ra23, ra45, ra67} |
| PKHTB rb67, xf7, xf6, ASR #16 |
| STMIA pDest!, {rb01, rb23, rb45, rb67} |
| BCC v6_idct_col$_F |
| |
| ;// Column pass finished: pDest has advanced by 4 x 32 bytes, so step |
| ;// back 128 bytes to make the intermediate block the row-pass source. |
| SUB pSrc, pDest, #(64*2) |
| M_LDR pDest, ppDest ;// restore the caller's output pointer |
| IF "$stride"="s" |
| M_LDR pScale, pStride ;// pScale is reused to hold the output stride |
| ENDIF |
| B v6_idct_row$_F |
| |
| ;// Column-pass shortcut, entered from v6_idct_col when the four odd |
| ;// input words (j1, j3, j5, j7) are all zero. Only the even half of the |
| ;// flow graph is evaluated, from the scaled even values still held in |
| ;// xi4-xi7, and each result is duplicated into its mirrored output |
| ;// (f3/f4, f2/f5, f1/f6, f0/f7). |
| v6OddZero$_F |
| SSUB16 xi2, xi6, xi7 ;// (j2-j6) |
| SHADD16 xi3, xi6, xi7 ;// (j2+j6)/2 |
| |
| SMULBB xi0, xi2, LoopRR2 |
| SMULTB xi2, xi2, LoopRR2 |
| |
| MOV xi2, xi2, LSL #1 |
| PKHTB xh2, xi2, xi0, ASR#15 ;// i2*sqrt(2)/4 |
| SSUB16 xh2, xh2, xi3 |
| |
| ;// xi0, xi1 now free |
| ;// IStage 4,3 rows 0to1 x 1/2 |
| |
| SHADD16 xh0, xi4, xi5 |
| SHSUB16 xh1, xi4, xi5 |
| |
| ;// IStage 2 rows 0to3 x 1/2 |
| SHSUB16 xg2, xh1, xh2 |
| SHADD16 xg1, xh1, xh2 |
| SHSUB16 xg3, xh0, xh3 |
| SHADD16 xg0, xh0, xh3 |
| |
| ;// IStage 1 all rows |
| ;// (plain copies replace the add/subtract butterflies of the main path) |
| MOV xf3, xg3 |
| MOV xf4, xg3 |
| MOV xf2, xg2 |
| MOV xf5, xg2 |
| MOV xf1, xg1 |
| MOV xf6, xg1 |
| MOV xf0, xg0 |
| MOV xf7, xg0 |
| |
| ;// Transpose |
| PKHBT ra01, xf0, xf1, LSL #16 |
| PKHTB rb01, xf1, xf0, ASR #16 |
| |
| PKHBT ra23, xf2, xf3, LSL #16 |
| PKHTB rb23, xf3, xf2, ASR #16 |
| |
| PKHBT ra45, xf4, xf5, LSL #16 |
| PKHTB rb45, xf5, xf4, ASR #16 |
| |
| PKHBT ra67, xf6, xf7, LSL #16 |
| PKHTB rb67, xf7, xf6, ASR #16 |
| |
| STMIA pDest!, {ra01, ra23, ra45, ra67} |
| ;// Same loop counter update as in the main column path; the carry is |
| ;// tested by the BCC below. |
| ADDS LoopRR2, LoopRR2, #2<<29 ;// done two rows |
| STMIA pDest!, {rb01, rb23, rb45, rb67} |
| |
| BCC v6_idct_col$_F |
| ;// Column pass finished: same hand-over to the row pass as at the end |
| ;// of v6_idct_col, except that execution falls through to v6_idct_row. |
| SUB pSrc, pDest, #(64*2) |
| M_LDR pDest, ppDest |
| IF "$stride"="s" |
| M_LDR pScale, pStride |
| ENDIF |
| |
| |
| ;// Row pass. Reads the transposed column-pass results through pSrc, |
| ;// two 16-bit values per 32-bit load (pSrc advances by 4 bytes per |
| ;// iteration), applies the 1D IDCT again, optionally saturates according |
| ;// to $outsize and writes two output rows per iteration to pDest. |
| ;// Runs until the ADDS on LoopRR2 produces a carry (four iterations). |
| v6_idct_row$_F |
| ;// IStage 4,3, rows4to7 x1/4 |
| LDR xit, =0x00010001 ;// rounding constant |
| LDR xi0, [pSrc, #1*16] ;// j1 |
| LDR xi1, [pSrc, #7*16] ;// 4*j7 |
| LDR xi2, [pSrc, #5*16] ;// j5 |
| LDR xi3, [pSrc, #3*16] ;// j3 |
| |
| ;// Two halving adds with the constant 1 turn 4*j7 into j7 |
| SHADD16 xi1, xi1, xit ;// 2*j7 |
| SHADD16 xi1, xi1, xit ;// j7 |
| |
| SHADD16 xi5, xi0, xi1 ;// (j1+j7)/2 |
| SSUB16 xi6, xi0, xi1 ;// j1-j7 |
| SHADD16 xi7, xi2, xi3 ;// (j5+j3)/2 |
| SSUB16 xi4, xi2, xi3 ;// j5-j3 |
| |
| SSUB16 xi3, xi5, xi7 ;// (i5-i7)/2 |
| |
| PKHBT xi0, xi6, xi4, LSL#16 ;// [i4,i6] row a |
| PKHTB xi1, xi4, xi6, ASR#16 ;// [i4,i6] row b |
| |
| SMUADX xi2, xi0, csPiBy8 ;// rowa by [c,s] |
| SMUADX xi4, xi1, csPiBy8 ;// rowb by [c,s] |
| SMUSD xi0, xi0, csPiBy8 ;// rowa by [-s,c] |
| SMUSD xi6, xi1, csPiBy8 ;// rowb by [-s,c] |
| |
| SMULBB xi1, xi3, LoopRR2 |
| SMULTB xi3, xi3, LoopRR2 |
| |
| PKHTB xh4, xi4, xi2, ASR#16 ;// h4/4 |
| PKHTB xh6, xi6, xi0, ASR#16 ;// h6/4 |
| SHADD16 xh7, xi5, xi7 ;// (i5+i7)/4 |
| |
| MOV xi3, xi3, LSL #1 |
| PKHTB xh5, xi3, xi1, ASR#15 ;// h5/4 |
| |
| ;// xi0,xi1,xi2,xi3 now free |
| ;// IStage 4,3, rows 2to3 x1/2 |
| |
| LDR xi0, [pSrc, #2*16] ;// j2 |
| LDR xi1, [pSrc, #6*16] ;// 2*j6 |
| |
| ;// IStage 2, rows4to7 |
| SSUB16 xg6, xh6, xh7 |
| SSUB16 xg5, xh5, xg6 |
| SSUB16 xg4, xh4, xg5 |
| |
| SHADD16 xi1, xi1, xit ;// j6 |
| SSUB16 xi2, xi0, xi1 ;// (j2-j6) |
| SHADD16 xi3, xi0, xi1 ;// (j2+j6)/2 |
| |
| SMULBB xi0, xi2, LoopRR2 |
| SMULTB xi2, xi2, LoopRR2 |
| |
| MOV xi2, xi2, LSL #1 |
| |
| PKHTB xh2, xi2, xi0, ASR#15 ;// i2*sqrt(2)/4 |
| |
| ;// xi0, xi1 now free |
| ;// IStage 4,3 rows 0to1 x 1/2 |
| LDR xi1, [pSrc, #4*16] ;// j4 |
| LDR xi0, [pSrc], #4 ;// j0 |
| |
| SSUB16 xh2, xh2, xi3 |
| ;// The carry produced here is consumed by the BCC at the end of the |
| ;// loop; no instruction in between uses an S suffix. |
| ADDS LoopRR2, LoopRR2, #2<<29 ;// done two rows |
| |
| ;// xit holds 0x00010001, so this adds 4 to both halfwords of j0 |
| ADD xi0, xi0, xit, LSL #2 ;// ensure correct round |
| SHADD16 xh0, xi0, xi1 ;// of DC result |
| SHSUB16 xh1, xi0, xi1 |
| |
| ;// IStage 2 rows 0to3 x 1/2 |
| SHSUB16 xg2, xh1, xh2 |
| SHADD16 xg1, xh1, xh2 |
| SHSUB16 xg3, xh0, xh3 |
| SHADD16 xg0, xh0, xh3 |
| |
| ;// IStage 1 all rows |
| SHADD16 xf3, xg3, xg4 |
| SHSUB16 xf4, xg3, xg4 |
| SHADD16 xf2, xg2, xg5 |
| SHSUB16 xf5, xg2, xg5 |
| SHADD16 xf1, xg1, xg6 |
| SHSUB16 xf6, xg1, xg6 |
| SHADD16 xf0, xg0, xg7 |
| SHSUB16 xf7, xg0, xg7 |
| |
| ;// Saturate |
| ;// ("s16" output is left unsaturated: neither block below is assembled) |
| IF ("$outsize"="u8") |
| USAT16 xf0, #8, xf0 |
| USAT16 xf1, #8, xf1 |
| USAT16 xf2, #8, xf2 |
| USAT16 xf3, #8, xf3 |
| USAT16 xf4, #8, xf4 |
| USAT16 xf5, #8, xf5 |
| USAT16 xf6, #8, xf6 |
| USAT16 xf7, #8, xf7 |
| ENDIF |
| IF ("$outsize"="s9") |
| SSAT16 xf0, #9, xf0 |
| SSAT16 xf1, #9, xf1 |
| SSAT16 xf2, #9, xf2 |
| SSAT16 xf3, #9, xf3 |
| SSAT16 xf4, #9, xf4 |
| SSAT16 xf5, #9, xf5 |
| SSAT16 xf6, #9, xf6 |
| SSAT16 xf7, #9, xf7 |
| ENDIF |
| |
| ;// Transpose to Row, Pack and store |
| IF ("$outsize"="u8") |
| ;// Merge byte pairs first, then split into row a (bottom halfwords) |
| ;// and row b (top halfwords); each output row is 8 bytes. |
| ORR xf0, xf0, xf1, LSL #8 ;// [ b1 b0 a1 a0 ] |
| ORR xf2, xf2, xf3, LSL #8 ;// [ b3 b2 a3 a2 ] |
| ORR xf4, xf4, xf5, LSL #8 ;// [ b5 b4 a5 a4 ] |
| ORR xf6, xf6, xf7, LSL #8 ;// [ b7 b6 a7 a6 ] |
| PKHBT ra01, xf0, xf2, LSL #16 |
| PKHTB rb01, xf2, xf0, ASR #16 |
| PKHBT ra23, xf4, xf6, LSL #16 |
| PKHTB rb23, xf6, xf4, ASR #16 |
| STMIA pDest, {ra01, ra23} |
| IF "$stride"="s" |
| ;// pScale holds the stride value reloaded from pStride |
| ADD pDest, pDest, pScale |
| STMIA pDest, {rb01, rb23} |
| ADD pDest, pDest, pScale |
| ELSE |
| ADD pDest, pDest, #($stride) |
| STMIA pDest, {rb01, rb23} |
| ADD pDest, pDest, #($stride) |
| ENDIF |
| ENDIF |
| IF ("$outsize"="s9"):LOR:("$outsize"="s16") |
| ;// 16-bit output: each output row is 16 bytes |
| PKHBT ra01, xf0, xf1, LSL #16 |
| PKHTB rb01, xf1, xf0, ASR #16 |
| |
| PKHBT ra23, xf2, xf3, LSL #16 |
| PKHTB rb23, xf3, xf2, ASR #16 |
| |
| PKHBT ra45, xf4, xf5, LSL #16 |
| PKHTB rb45, xf5, xf4, ASR #16 |
| |
| PKHBT ra67, xf6, xf7, LSL #16 |
| PKHTB rb67, xf7, xf6, ASR #16 |
| |
| STMIA pDest, {ra01, ra23, ra45, ra67} |
| IF "$stride"="s" |
| ;// pScale holds the stride value reloaded from pStride |
| ADD pDest, pDest, pScale |
| STMIA pDest, {rb01, rb23, rb45, rb67} |
| ADD pDest, pDest, pScale |
| ELSE |
| ADD pDest, pDest, #($stride) |
| STMIA pDest, {rb01, rb23, rb45, rb67} |
| ADD pDest, pDest, #($stride) |
| ENDIF |
| ENDIF |
| |
| BCC v6_idct_row$_F |
| ENDIF ;// ARM1136JS |
| |
| |
| IF CortexA8 |
| |
| ;// NEON register allocation for the first (X) pass. |
| ;// SrcN / Tmp are Q register numbers: the eight input rows are held in |
| ;// Q7-Q14 and Q15 is the spare register. The D register views use |
| ;// numbers 2*N (low half) and 2*N+1 (high half). |
| Src0 EQU 7 |
| Src1 EQU 8 |
| Src2 EQU 9 |
| Src3 EQU 10 |
| Src4 EQU 11 |
| Src5 EQU 12 |
| Src6 EQU 13 |
| Src7 EQU 14 |
| Tmp EQU 15 |
| |
| qXj0 QN Src0.S16 |
| qXj1 QN Src1.S16 |
| qXj2 QN Src2.S16 |
| qXj3 QN Src3.S16 |
| qXj4 QN Src4.S16 |
| qXj5 QN Src5.S16 |
| qXj6 QN Src6.S16 |
| qXj7 QN Src7.S16 |
| qXjt QN Tmp.S16 |
| |
| dXj0lo DN (Src0*2).S16 |
| dXj0hi DN (Src0*2+1).S16 |
| dXj1lo DN (Src1*2).S16 |
| dXj1hi DN (Src1*2+1).S16 |
| dXj2lo DN (Src2*2).S16 |
| dXj2hi DN (Src2*2+1).S16 |
| dXj3lo DN (Src3*2).S16 |
| dXj3hi DN (Src3*2+1).S16 |
| dXj4lo DN (Src4*2).S16 |
| dXj4hi DN (Src4*2+1).S16 |
| dXj5lo DN (Src5*2).S16 |
| dXj5hi DN (Src5*2+1).S16 |
| dXj6lo DN (Src6*2).S16 |
| dXj6hi DN (Src6*2+1).S16 |
| dXj7lo DN (Src7*2).S16 |
| dXj7hi DN (Src7*2+1).S16 |
| dXjtlo DN (Tmp*2).S16 |
| dXjthi DN (Tmp*2+1).S16 |
| |
| ;// Each later stage (i, h, g, f) renames the registers of the stage |
| ;// before it; the *t name of a stage is the register left spare there. |
| qXi0 QN qXj0 |
| qXi1 QN qXj4 |
| qXi2 QN qXj2 |
| qXi3 QN qXj7 |
| qXi4 QN qXj5 |
| qXi5 QN qXjt |
| qXi6 QN qXj1 |
| qXi7 QN qXj6 |
| qXit QN qXj3 |
| |
| dXi0lo DN dXj0lo |
| dXi0hi DN dXj0hi |
| dXi1lo DN dXj4lo |
| dXi1hi DN dXj4hi |
| dXi2lo DN dXj2lo |
| dXi2hi DN dXj2hi |
| dXi3lo DN dXj7lo |
| dXi3hi DN dXj7hi |
| dXi4lo DN dXj5lo |
| dXi4hi DN dXj5hi |
| dXi5lo DN dXjtlo |
| dXi5hi DN dXjthi |
| dXi6lo DN dXj1lo |
| dXi6hi DN dXj1hi |
| dXi7lo DN dXj6lo |
| dXi7hi DN dXj6hi |
| dXitlo DN dXj3lo |
| dXithi DN dXj3hi |
| |
| qXh0 QN qXit |
| qXh1 QN qXi0 |
| qXh2 QN qXi2 |
| qXh3 QN qXi3 |
| qXh4 QN qXi7 |
| qXh5 QN qXi5 |
| qXh6 QN qXi4 |
| qXh7 QN qXi1 |
| qXht QN qXi6 |
| |
| dXh0lo DN dXitlo |
| dXh0hi DN dXithi |
| dXh1lo DN dXi0lo |
| dXh1hi DN dXi0hi |
| dXh2lo DN dXi2lo |
| dXh2hi DN dXi2hi |
| dXh3lo DN dXi3lo |
| dXh3hi DN dXi3hi |
| dXh4lo DN dXi7lo |
| dXh4hi DN dXi7hi |
| dXh5lo DN dXi5lo |
| dXh5hi DN dXi5hi |
| dXh6lo DN dXi4lo |
| dXh6hi DN dXi4hi |
| dXh7lo DN dXi1lo |
| dXh7hi DN dXi1hi |
| dXhtlo DN dXi6lo |
| dXhthi DN dXi6hi |
| |
| qXg0 QN qXh2 |
| qXg1 QN qXht |
| qXg2 QN qXh1 |
| qXg3 QN qXh0 |
| qXg4 QN qXh4 |
| qXg5 QN qXh5 |
| qXg6 QN qXh6 |
| qXg7 QN qXh7 |
| qXgt QN qXh3 |
| |
| qXf0 QN qXg6 |
| qXf1 QN qXg5 |
| qXf2 QN qXg4 |
| qXf3 QN qXgt |
| qXf4 QN qXg3 |
| qXf5 QN qXg2 |
| qXf6 QN qXg1 |
| qXf7 QN qXg0 |
| qXft QN qXg7 |
| |
| |
| ;// 32-bit scratch registers. Several names share one register: |
| ;// qXt0/qT0lo are Q1, qXt1/qT0hi are Q2, qScalelo/qTemp0 are Q5 and |
| ;// qScalehi/qTemp1 are Q6. |
| qXt0 QN 1.S32 |
| qXt1 QN 2.S32 |
| qT0lo QN 1.S32 |
| qT0hi QN 2.S32 |
| qT1lo QN 3.S32 |
| qT1hi QN 4.S32 |
| qScalelo QN 5.S32 ;// used to read post scale values |
| qScalehi QN 6.S32 |
| qTemp0 QN 5.S32 |
| qTemp1 QN 6.S32 |
| |
| |
| ;// 16-bit scale registers: Scale1 is Q6 (shared with qScalehi/qTemp1) |
| ;// and Scale2 is Q15 (the same register number as Tmp). |
| Scale1 EQU 6 |
| Scale2 EQU 15 |
| qScale1 QN Scale1.S16 |
| qScale2 QN Scale2.S16 |
| dScale1lo DN (Scale1*2).S16 |
| dScale1hi DN (Scale1*2+1).S16 |
| dScale2lo DN (Scale2*2).S16 |
| dScale2hi DN (Scale2*2+1).S16 |
| |
| dCoefs DN 0.S16 ;// Scale coefficients in format {[0] [C] [S] [InvSqrt2]} |
| InvSqrt2 DN dCoefs[0] ;// 1/sqrt(2) in Q15 |
| S DN dCoefs[1] ;// Sin(PI/8) in Q15 |
| C DN dCoefs[2] ;// Cos(PI/8) in Q15 |
| |
| pTemp RN 12 ;// core scratch register (r12) |
| |
| |
| ;// Constant table loaded into dCoefs; defined in another file |
| IMPORT armCOMM_IDCTCoef |
| |
| ;// First (X) pass: load all eight Q registers (the whole 8x8 block of |
| ;// 16-bit inputs), prescale, run the 1D IDCT on all eight lanes in |
| ;// parallel and transpose the result in registers. |
| VLD1 {qXj0,qXj1}, [pSrc @64]! |
| VLD1 {qXj2,qXj3}, [pSrc @64]! |
| VLD1 {qXj4,qXj5}, [pSrc @64]! |
| VLD1 {qXj6,qXj7}, [pSrc @64]! |
| |
| ;// Load PreScale and multiply with Src |
| ;// IStage 4 |
| ;// (the prescale macros are defined elsewhere in the file; dCoefs is |
| ;// not loaded in this macro body, so it is presumably loaded by them - |
| ;// confirm for M_IDCT_PRESCALE32) |
| |
| IF "$inscale"="s16" ;// 16X16 Mul |
| M_IDCT_PRESCALE16 |
| ENDIF |
| |
| IF "$inscale"="s32" ;// 32X32 mul |
| M_IDCT_PRESCALE32 |
| ENDIF |
| |
| ;// IStage 3 |
| VQRDMULH qXi2, qXi2, InvSqrt2 ;// i2/sqrt(2) |
| VHADD qXh0, qXi0, qXi1 ;// (i0+i1)/2 |
| VHSUB qXh1, qXi0, qXi1 ;// (i0-i1)/2 |
| VHADD qXh7, qXi5, qXi7 ;// (i5+i7)/4 |
| VSUB qXh5, qXi5, qXi7 ;// (i5-i7)/2 |
| VQRDMULH qXh5, qXh5, InvSqrt2 ;// h5/sqrt(2) |
| VSUB qXh2, qXi2, qXi3 ;// h2, h3 |
| |
| ;// Rotation of (i4,i6): widen to 32 bits, accumulate, then narrow |
| ;// back to 16 bits by shifting right 16. |
| VMULL qXt0, dXi4lo, C ;// c*i4 |
| VMLAL qXt0, dXi6lo, S ;// c*i4+s*i6 |
| VMULL qXt1, dXi4hi, C |
| VMLAL qXt1, dXi6hi, S |
| VSHRN dXh4lo, qXt0, #16 ;// h4 |
| VSHRN dXh4hi, qXt1, #16 |
| |
| VMULL qXt0, dXi6lo, C ;// c*i6 |
| VMLSL qXt0, dXi4lo, S ;// -s*i4 + c*i6 |
| VMULL qXt1, dXi6hi, C |
| VMLSL qXt1, dXi4hi, S |
| VSHRN dXh6lo, qXt0, #16 ;// h6 |
| VSHRN dXh6hi, qXt1, #16 |
| |
| ;// IStage 2 |
| VSUB qXg6, qXh6, qXh7 |
| VSUB qXg5, qXh5, qXg6 |
| VSUB qXg4, qXh4, qXg5 |
| VHADD qXg1, qXh1, qXh2 ;// (h1+h2)/2 |
| VHSUB qXg2, qXh1, qXh2 ;// (h1-h2)/2 |
| VHADD qXg0, qXh0, qXh3 ;// (h0+h3)/2 |
| VHSUB qXg3, qXh0, qXh3 ;// (h0-h3)/2 |
| |
| ;// IStage 1 all rows |
| VADD qXf3, qXg3, qXg4 |
| VSUB qXf4, qXg3, qXg4 |
| VADD qXf2, qXg2, qXg5 |
| VSUB qXf5, qXg2, qXg5 |
| VADD qXf1, qXg1, qXg6 |
| VSUB qXf6, qXg1, qXg6 |
| VADD qXf0, qXg0, qXg7 |
| VSUB qXf7, qXg0, qXg7 |
| |
| ;// Transpose (this path has no store or loop here: the block stays |
| ;// in registers for the second pass) |
| ;// XTRn is the Q register number that holds qXfn after IStage 1 |
| ;// (follow the qXf -> qXg -> qXh -> qXi -> qXj alias chain). |
| XTR0 EQU Src5 |
| XTR1 EQU Tmp |
| XTR2 EQU Src6 |
| XTR3 EQU Src7 |
| XTR4 EQU Src3 |
| XTR5 EQU Src0 |
| XTR6 EQU Src1 |
| XTR7 EQU Src2 |
| XTRt EQU Src4 |
| |
| qA0 QN XTR0.S32 ;// for XTRpose |
| qA1 QN XTR1.S32 |
| qA2 QN XTR2.S32 |
| qA3 QN XTR3.S32 |
| qA4 QN XTR4.S32 |
| qA5 QN XTR5.S32 |
| qA6 QN XTR6.S32 |
| qA7 QN XTR7.S32 |
| |
| ;// dB0-dB3 are the high halves of rows 0-3, dB4-dB7 the low halves |
| ;// of rows 4-7 |
| dB0 DN XTR0*2+1 ;// for using VSWP |
| dB1 DN XTR1*2+1 |
| dB2 DN XTR2*2+1 |
| dB3 DN XTR3*2+1 |
| dB4 DN XTR4*2 |
| dB5 DN XTR5*2 |
| dB6 DN XTR6*2 |
| dB7 DN XTR7*2 |
| |
| |
| ;// Three steps: VTRN on 16-bit elements, VTRN on 32-bit elements, |
| ;// then swap 64-bit halves between rows n and n+4. |
| VTRN qXf0, qXf1 |
| VTRN qXf2, qXf3 |
| VTRN qXf4, qXf5 |
| VTRN qXf6, qXf7 |
| VTRN qA0, qA2 |
| VTRN qA1, qA3 |
| VTRN qA4, qA6 |
| VTRN qA5, qA7 |
| VSWP dB0, dB4 |
| VSWP dB1, dB5 |
| VSWP dB2, dB6 |
| VSWP dB3, dB7 |
| |
| ;// Register allocation for the second (Y) pass. The Y-pass inputs jn |
| ;// are the transposed X-pass outputs fn, so qYjn names the same register |
| ;// as qXfn; the i/h/g/f chains below follow the same renaming pattern as |
| ;// the X pass. |
| qYj0 QN qXf0 |
| qYj1 QN qXf1 |
| qYj2 QN qXf2 |
| qYj3 QN qXf3 |
| qYj4 QN qXf4 |
| qYj5 QN qXf5 |
| qYj6 QN qXf6 |
| qYj7 QN qXf7 |
| qYjt QN qXft |
| |
| ;// D register views: XTRn is the Q register number of qXfn / qYjn |
| dYj0lo DN (XTR0*2).S16 |
| dYj0hi DN (XTR0*2+1).S16 |
| dYj1lo DN (XTR1*2).S16 |
| dYj1hi DN (XTR1*2+1).S16 |
| dYj2lo DN (XTR2*2).S16 |
| dYj2hi DN (XTR2*2+1).S16 |
| dYj3lo DN (XTR3*2).S16 |
| dYj3hi DN (XTR3*2+1).S16 |
| dYj4lo DN (XTR4*2).S16 |
| dYj4hi DN (XTR4*2+1).S16 |
| dYj5lo DN (XTR5*2).S16 |
| dYj5hi DN (XTR5*2+1).S16 |
| dYj6lo DN (XTR6*2).S16 |
| dYj6hi DN (XTR6*2+1).S16 |
| dYj7lo DN (XTR7*2).S16 |
| dYj7hi DN (XTR7*2+1).S16 |
| dYjtlo DN (XTRt*2).S16 |
| dYjthi DN (XTRt*2+1).S16 |
| |
| qYi0 QN qYj0 |
| qYi1 QN qYj4 |
| qYi2 QN qYj2 |
| qYi3 QN qYj7 |
| qYi4 QN qYj5 |
| qYi5 QN qYjt |
| qYi6 QN qYj1 |
| qYi7 QN qYj6 |
| qYit QN qYj3 |
| |
| dYi0lo DN dYj0lo |
| dYi0hi DN dYj0hi |
| dYi1lo DN dYj4lo |
| dYi1hi DN dYj4hi |
| dYi2lo DN dYj2lo |
| dYi2hi DN dYj2hi |
| dYi3lo DN dYj7lo |
| dYi3hi DN dYj7hi |
| dYi4lo DN dYj5lo |
| dYi4hi DN dYj5hi |
| dYi5lo DN dYjtlo |
| dYi5hi DN dYjthi |
| dYi6lo DN dYj1lo |
| dYi6hi DN dYj1hi |
| dYi7lo DN dYj6lo |
| dYi7hi DN dYj6hi |
| dYitlo DN dYj3lo |
| dYithi DN dYj3hi |
| |
| qYh0 QN qYit |
| qYh1 QN qYi0 |
| qYh2 QN qYi2 |
| qYh3 QN qYi3 |
| qYh4 QN qYi7 |
| qYh5 QN qYi5 |
| qYh6 QN qYi4 |
| qYh7 QN qYi1 |
| qYht QN qYi6 |
| |
| dYh0lo DN dYitlo |
| dYh0hi DN dYithi |
| dYh1lo DN dYi0lo |
| dYh1hi DN dYi0hi |
| dYh2lo DN dYi2lo |
| dYh2hi DN dYi2hi |
| dYh3lo DN dYi3lo |
| dYh3hi DN dYi3hi |
| dYh4lo DN dYi7lo |
| dYh4hi DN dYi7hi |
| dYh5lo DN dYi5lo |
| dYh5hi DN dYi5hi |
| dYh6lo DN dYi4lo |
| dYh6hi DN dYi4hi |
| dYh7lo DN dYi1lo |
| dYh7hi DN dYi1hi |
| dYhtlo DN dYi6lo |
| dYhthi DN dYi6hi |
| |
| qYg0 QN qYh2 |
| qYg1 QN qYht |
| qYg2 QN qYh1 |
| qYg3 QN qYh0 |
| qYg4 QN qYh4 |
| qYg5 QN qYh5 |
| qYg6 QN qYh6 |
| qYg7 QN qYh7 |
| qYgt QN qYh3 |
| |
| qYf0 QN qYg6 |
| qYf1 QN qYg5 |
| qYf2 QN qYg4 |
| qYf3 QN qYgt |
| qYf4 QN qYg3 |
| qYf5 QN qYg2 |
| qYf6 QN qYg1 |
| qYf7 QN qYg0 |
| qYft QN qYg7 |
| |
| ;// Second (Y) pass: 1D IDCT over the transposed first-pass results, |
| ;// followed by a transpose back in registers. |
| ;// j7 and j6 are first brought back to unit scale with rounding shifts |
| ;// (the ARM1136JS row pass labels the same inputs 4*j7 and 2*j6). |
| VRSHR qYj7, qYj7, #2 |
| VRSHR qYj6, qYj6, #1 |
| |
| VHADD qYi5, qYj1, qYj7 ;// i5 = (j1+j7)/2 |
| VSUB qYi6, qYj1, qYj7 ;// i6 = j1-j7 |
| VHADD qYi3, qYj2, qYj6 ;// i3 = (j2+j6)/2 |
| VSUB qYi2, qYj2, qYj6 ;// i2 = j2-j6 |
| VHADD qYi7, qYj5, qYj3 ;// i7 = (j5+j3)/2 |
| VSUB qYi4, qYj5, qYj3 ;// i4 = j5-j3 |
| |
| VQRDMULH qYi2, qYi2, InvSqrt2 ;// i2/sqrt(2) |
| ;// IStage 4,3 rows 0to1 x 1/2 |
| |
| ;// Adds 4 to every lane of i0; qScale1 is only a scratch register here |
| MOV pTemp, #0x4 ;// ensure correct round |
| VDUP qScale1, pTemp ;// of DC result |
| VADD qYi0, qYi0, qScale1 |
| |
| VHADD qYh0, qYi0, qYi1 ;// (i0+i1)/2 |
| VHSUB qYh1, qYi0, qYi1 ;// (i0-i1)/2 |
| |
| VHADD qYh7, qYi5, qYi7 ;// (i5+i7)/4 |
| VSUB qYh5, qYi5, qYi7 ;// (i5-i7)/2 |
| VSUB qYh2, qYi2, qYi3 ;// h2, h3 |
| VQRDMULH qYh5, qYh5, InvSqrt2 ;// h5/sqrt(2) |
| |
| ;// Rotation of (i4,i6): widen to 32 bits, accumulate, then narrow |
| ;// back to 16 bits by shifting right 16. |
| VMULL qXt0, dYi4lo, C ;// c*i4 |
| VMLAL qXt0, dYi6lo, S ;// c*i4+s*i6 |
| VMULL qXt1, dYi4hi, C |
| VMLAL qXt1, dYi6hi, S |
| VSHRN dYh4lo, qXt0, #16 ;// h4 |
| VSHRN dYh4hi, qXt1, #16 |
| |
| VMULL qXt0, dYi6lo, C ;// c*i6 |
| VMLSL qXt0, dYi4lo, S ;// -s*i4 + c*i6 |
| VMULL qXt1, dYi6hi, C |
| VMLSL qXt1, dYi4hi, S |
| VSHRN dYh6lo, qXt0, #16 ;// h6 |
| VSHRN dYh6hi, qXt1, #16 |
| |
| VSUB qYg6, qYh6, qYh7 |
| VSUB qYg5, qYh5, qYg6 |
| VSUB qYg4, qYh4, qYg5 |
| |
| ;// IStage 2 rows 0to3 x 1/2 |
| VHADD qYg1, qYh1, qYh2 ;// (h1+h2)/2 |
| VHSUB qYg2, qYh1, qYh2 ;// (h1-h2)/2 |
| VHADD qYg0, qYh0, qYh3 ;// (h0+h3)/2 |
| VHSUB qYg3, qYh0, qYh3 ;// (h0-h3)/2 |
| |
| |
| ;// IStage 1 all rows |
| ;// (halving add/subtract here, where the first pass uses VADD/VSUB) |
| VHADD qYf3, qYg3, qYg4 |
| VHSUB qYf4, qYg3, qYg4 |
| VHADD qYf2, qYg2, qYg5 |
| VHSUB qYf5, qYg2, qYg5 |
| VHADD qYf1, qYg1, qYg6 |
| VHSUB qYf6, qYg1, qYg6 |
| VHADD qYf0, qYg0, qYg7 |
| VHSUB qYf7, qYg0, qYg7 |
| |
| ;// YTRn is the Q register number that holds qYfn after IStage 1 |
| YTR0 EQU Src0 |
| YTR1 EQU Src4 |
| YTR2 EQU Src1 |
| YTR3 EQU Src2 |
| YTR4 EQU Src7 |
| YTR5 EQU Src5 |
| YTR6 EQU Tmp |
| YTR7 EQU Src6 |
| YTRt EQU Src3 |
| |
| qC0 QN YTR0.S32 ;// for YTRpose |
| qC1 QN YTR1.S32 |
| qC2 QN YTR2.S32 |
| qC3 QN YTR3.S32 |
| qC4 QN YTR4.S32 |
| qC5 QN YTR5.S32 |
| qC6 QN YTR6.S32 |
| qC7 QN YTR7.S32 |
| |
| ;// dD0-dD3 are the high halves of rows 0-3, dD4-dD7 the low halves |
| ;// of rows 4-7 |
| dD0 DN YTR0*2+1 ;// for using VSWP |
| dD1 DN YTR1*2+1 |
| dD2 DN YTR2*2+1 |
| dD3 DN YTR3*2+1 |
| dD4 DN YTR4*2 |
| dD5 DN YTR5*2 |
| dD6 DN YTR6*2 |
| dD7 DN YTR7*2 |
| |
| ;// Same three-step transpose as after the first pass |
| VTRN qYf0, qYf1 |
| VTRN qYf2, qYf3 |
| VTRN qYf4, qYf5 |
| VTRN qYf6, qYf7 |
| VTRN qC0, qC2 |
| VTRN qC1, qC3 |
| VTRN qC4, qC6 |
| VTRN qC5, qC7 |
| VSWP dD0, dD4 |
| VSWP dD1, dD5 |
| VSWP dD2, dD6 |
| VSWP dD3, dD7 |
| |
| |
| ;// 8-bit views used as destinations when narrowing for "u8" output. |
| ;// NOTE(review): unlike the other DN definitions in this macro, the |
| ;// register expression is not parenthesised before the .U8 qualifier - |
| ;// confirm the assembler reads this as (YTRn*2).U8. |
| dYf0U8 DN YTR0*2.U8 |
| dYf1U8 DN YTR1*2.U8 |
| dYf2U8 DN YTR2*2.U8 |
| dYf3U8 DN YTR3*2.U8 |
| dYf4U8 DN YTR4*2.U8 |
| dYf5U8 DN YTR5*2.U8 |
| dYf6U8 DN YTR6*2.U8 |
| dYf7U8 DN YTR7*2.U8 |
| |
| ;// |
| ;// Do saturation if outsize is other than S16 |
| ;// |
| |
| IF ("$outsize"="u8") |
| ;// Output range [0-255] |
| ;// NOTE(review): the source operands are typed S16 and the |
| ;// destinations U8; confirm that the assembler emits a |
| ;// signed-to-unsigned saturating narrow for this combination. |
| VQMOVN dYf0U8, qYf0 |
| VQMOVN dYf1U8, qYf1 |
| VQMOVN dYf2U8, qYf2 |
| VQMOVN dYf3U8, qYf3 |
| VQMOVN dYf4U8, qYf4 |
| VQMOVN dYf5U8, qYf5 |
| VQMOVN dYf6U8, qYf6 |
| VQMOVN dYf7U8, qYf7 |
| ENDIF |
| |
| IF ("$outsize"="s9") |
| ;// Output range [-256 to +255] |
| ;// Saturating shift left by 7 followed by a shift right by 7 |
| VQSHL qYf0, qYf0, #16-9 |
| VQSHL qYf1, qYf1, #16-9 |
| VQSHL qYf2, qYf2, #16-9 |
| VQSHL qYf3, qYf3, #16-9 |
| VQSHL qYf4, qYf4, #16-9 |
| VQSHL qYf5, qYf5, #16-9 |
| VQSHL qYf6, qYf6, #16-9 |
| VQSHL qYf7, qYf7, #16-9 |
| |
| VSHR qYf0, qYf0, #16-9 |
| VSHR qYf1, qYf1, #16-9 |
| VSHR qYf2, qYf2, #16-9 |
| VSHR qYf3, qYf3, #16-9 |
| VSHR qYf4, qYf4, #16-9 |
| VSHR qYf5, qYf5, #16-9 |
| VSHR qYf6, qYf6, #16-9 |
| VSHR qYf7, qYf7, #16-9 |
| ENDIF |
| |
| ;// Store output depending on the Stride size |
| IF "$stride"="s" |
| ;// NOTE(review): this branch stores full Q registers (16 bytes per |
| ;// row) for every $outsize, including "u8" where only the low D |
| ;// register holds narrowed data - confirm that "u8" is never |
| ;// combined with $stride = "s" on this path. |
| VST1 qYf0, [pDest @64], Stride |
| VST1 qYf1, [pDest @64], Stride |
| VST1 qYf2, [pDest @64], Stride |
| VST1 qYf3, [pDest @64], Stride |
| VST1 qYf4, [pDest @64], Stride |
| VST1 qYf5, [pDest @64], Stride |
| VST1 qYf6, [pDest @64], Stride |
| VST1 qYf7, [pDest @64] |
| ELSE |
| ;// Fixed layout: rows are written back to back (8 bytes per row for |
| ;// "u8", 16 bytes per row otherwise); the value of $stride itself |
| ;// is not used here. |
| IF ("$outsize"="u8") |
| VST1 dYf0U8, [pDest @64], #8 |
| VST1 dYf1U8, [pDest @64], #8 |
| VST1 dYf2U8, [pDest @64], #8 |
| VST1 dYf3U8, [pDest @64], #8 |
| VST1 dYf4U8, [pDest @64], #8 |
| VST1 dYf5U8, [pDest @64], #8 |
| VST1 dYf6U8, [pDest @64], #8 |
| VST1 dYf7U8, [pDest @64] |
| ELSE |
| ;// ("$outsize"="s9") or ("$outsize"="s16") |
| VST1 qYf0, [pDest @64], #16 |
| VST1 qYf1, [pDest @64], #16 |
| VST1 qYf2, [pDest @64], #16 |
| VST1 qYf3, [pDest @64], #16 |
| VST1 qYf4, [pDest @64], #16 |
| VST1 qYf5, [pDest @64], #16 |
| VST1 qYf6, [pDest @64], #16 |
| VST1 qYf7, [pDest @64] |
| ENDIF |
| |
| ENDIF |
| |
| |
| |
| ENDIF ;// CortexA8 |
| |
| |
| |
| MEND |
| |
| ;// M_IDCT_SCALE16 - scale a PAIR of input rows by 16-bit scale factors |
| ;// |
| ;// Helper for M_IDCT_PRESCALE16. Multiplies two consecutive rows (n and |
| ;// n+1, eight S16 values each) element-wise by the scale rows currently |
| ;// held in qScale1 / qScale2, then narrows each 32-bit product back to |
| ;// S16 with a rounding, saturating right shift by 12. Unless $Final is |
| ;// set, the scale values for the following row pair are fetched from |
| ;// pScale between the multiplies and the narrowing shifts. |
| ;// |
| ;// Macro arguments (positional): |
| ;// |
| ;// $aLo - D register, first four S16 values of row n (in/out) |
| ;// $aHi - D register, last four S16 values of row n (in/out) |
| ;// $bLo - D register, first four S16 values of row n+1 (in/out) |
| ;// $bHi - D register, last four S16 values of row n+1 (in/out) |
| ;// $Final - "0" : also load the next two scale rows from pScale |
| ;// else : last row pair, pScale and qScale1/2 are left alone |
| ;// |
| ;// Registers read: |
| ;// |
| ;// dScale1lo/hi - scale values for row n |
| ;// dScale2lo/hi - scale values for row n+1 |
| ;// pScale - address of the next scale row (only when $Final = "0") |
| ;// |
| ;// Registers written: |
| ;// |
| ;// $aLo,$aHi - scaled row n |
| ;// $bLo,$bHi - scaled row n+1 |
| ;// qT0lo,qT0hi - scratch (32-bit products of row n) |
| ;// qT1lo,qT1hi - scratch (32-bit products of row n+1) |
| ;// qScale1 - scale values for row n+2 (only when $Final = "0") |
| ;// qScale2 - scale values for row n+3 (only when $Final = "0") |
| ;// pScale - advanced by 32 bytes (only when $Final = "0") |
| ;// |
| MACRO |
| M_IDCT_SCALE16 $aLo, $aHi, $bLo, $bHi, $Final |
| VMULL qT0lo, $aLo, dScale1lo ;// row n, low half -> 32-bit products |
| VMULL qT0hi, $aHi, dScale1hi ;// row n, high half |
| VMULL qT1lo, $bLo, dScale2lo ;// row n+1, low half |
| VMULL qT1hi, $bHi, dScale2hi ;// row n+1, high half |
| IF "$Final"="0" |
| VLD1 qScale1, [pScale], #16 ;// Scale values for row n+2 |
| VLD1 qScale2, [pScale], #16 ;// Scale values for row n+3 |
| ENDIF |
| VQRSHRN $aLo, qT0lo, #12 ;// round, shift and saturate back to S16 |
| VQRSHRN $aHi, qT0hi, #12 |
| VQRSHRN $bLo, qT1lo, #12 |
| VQRSHRN $bHi, qT1hi, #12 |
| MEND |
| |
| ;// Scale 8x8 block input values with 16 bit scale values |
| ;// |
| ;// This macro pre-scales an 8x8 block of input, two rows per M_IDCT_SCALE16 call (rows 0&1, 2&3, 4&5, 6&7); the last call passes 1 as $LastRow so nothing is loaded beyond the eighth scale row. |
| ;// It also does the 1st stage transformations of the IDCT (IStage 4 in the file header) for i2..i7, then points pSrc at armCOMM_IDCTCoef and loads dCoefs from there. |
| ;// NOTE(review): i6, i2 and i4 are plain VSUB differences, not halved as in the IStage 4 formulae of the file header - presumably compensated elsewhere; confirm. |
| ;// Input Registers: |
| ;// |
| ;// dXjnlo - n th input D register with first four S16 values (overwritten with the scaled values) |
| ;// dXjnhi - n th input D register with next four S16 values (overwritten with the scaled values) |
| ;// qXjn - n th input Q register with eight S16 values |
| ;// pScale - Pointer to scale values (eight 16-byte rows are read; pScale ends just past them) |
| ;// |
| ;// Output Registers: |
| ;// |
| ;// qXin - n th output Q register with eight S16 output values of 1st stage (qXi2..qXi7 are written here; qXi0 and qXi1 are not - presumably they alias the scaled qXj0/qXj4, TODO confirm) |
| ;// pSrc and dCoefs are overwritten as described above; qScale1, qScale2 and the qT0lo/qT0hi/qT1lo/qT1hi scratch registers are clobbered. |
| MACRO |
| M_IDCT_PRESCALE16 |
| VLD1 qScale1, [pScale], #16 ;// Load Pre scale for row 0 |
| VLD1 qScale2, [pScale], #16 ;// Load Pre scale for row 1 |
| M_IDCT_SCALE16 dXj0lo, dXj0hi, dXj1lo, dXj1hi, 0 ;// Pre scale row 0 & 1 (and fetch the scale rows for the next call) |
| M_IDCT_SCALE16 dXj2lo, dXj2hi, dXj3lo, dXj3hi, 0 |
| M_IDCT_SCALE16 dXj4lo, dXj4hi, dXj5lo, dXj5hi, 0 |
| M_IDCT_SCALE16 dXj6lo, dXj6hi, dXj7lo, dXj7hi, 1 |
| VHADD qXi5, qXj1, qXj7 ;// i5 = (j1+j7)/2 |
| VSUB qXi6, qXj1, qXj7 ;// i6 = j1-j7 (not halved) |
| LDR pSrc, =armCOMM_IDCTCoef ;// Address of DCT inverse AAN constants |
| VHADD qXi3, qXj2, qXj6 ;// i3 = (j2+j6)/2 |
| VSUB qXi2, qXj2, qXj6 ;// i2 = j2-j6 (not halved) |
| VLDR dCoefs, [pSrc] ;// Load DCT inverse AAN constants |
| VHADD qXi7, qXj5, qXj3 ;// i7 = (j5+j3)/2 |
| VSUB qXi4, qXj5, qXj3 ;// i4 = j5-j3 (not halved) |
| MEND |
| |
| |
| ;// Scale 8x8 block input values with 32 bit scale values |
| ;// |
| ;// Pre-scales an 8x8 block of S16 input with one 32-bit scale value per coefficient and does the 1st stage transformations of the IDCT (IStage 4 in the file header), writing i0..i7. |
| ;// Per element: VSHLL #(12-1) widens to 32 bits, VQRDMULH multiplies by the scale (net x*scale*2^-20, rounded, saturating), VMOVN keeps the low 16 bits without saturating. |
| ;// Rows are processed as 0, 1&7, 2&6, 3&5, 4. All q* temporaries are QN aliases sharing Q0-Q6 and Q<Src4>, and loads/shifts for the next rows are interleaved with the current ones, so the statement order must not be changed. |
| ;// Input Registers: |
| ;// |
| ;// dXjnlo - n th input D register with first four S16 values |
| ;// dXjnhi - n th input D register with next four S16 values |
| ;// qXjn - n th input Q register with eight S16 values |
| ;// pScale - Pointer to 32bit scale values in Q23 format (eight rows of eight words; rows 0..4 are read through pScale, rows 7..5 through pTemp) |
| ;// pSrc - NOTE(review): row 4 is re-read from pSrc-64 into qXj4 before it is scaled, presumably because qSrc1hi (Q<Src4>, Src4 defined outside this macro) reuses that register and pSrc was left at the end of the input block; confirm with the caller. |
| ;// Output Registers: |
| ;// |
| ;// dXinlo - n th output D register with first four S16 output values of 1st stage |
| ;// dXinhi - n th output D register with next four S16 output values of 1st stage |
| ;// On exit pScale is left at scale row 4, pTemp at scale row 6, pSrc at armCOMM_IDCTCoef with dCoefs loaded from it; Q0-Q6 and Q<Src4> are clobbered. |
| MACRO |
| M_IDCT_PRESCALE32 |
| qScale0lo QN 0.S32 |
| qScale0hi QN 1.S32 |
| qScale1lo QN 2.S32 |
| qScale1hi QN 3.S32 |
| qScale2lo QN qScale1lo |
| qScale2hi QN qScale1hi |
| qScale3lo QN qScale1lo |
| qScale3hi QN qScale1hi |
| qScale4lo QN qScale1lo |
| qScale4hi QN qScale1hi |
| qScale5lo QN qScale0lo |
| qScale5hi QN qScale0hi |
| qScale6lo QN qScale0lo |
| qScale6hi QN qScale0hi |
| qScale7lo QN qScale0lo |
| qScale7hi QN qScale0hi |
| |
| qSrc0lo QN 4.S32 |
| qSrc0hi QN 5.S32 |
| qSrc1lo QN 6.S32 |
| qSrc1hi QN Src4.S32 |
| qSrc2lo QN qSrc0lo |
| qSrc2hi QN qSrc0hi |
| qSrc3lo QN qSrc0lo |
| qSrc3hi QN qSrc0hi |
| qSrc4lo QN qSrc0lo |
| qSrc4hi QN qSrc0hi |
| qSrc5lo QN qSrc1lo |
| qSrc5hi QN qSrc1hi |
| qSrc6lo QN qSrc1lo |
| qSrc6hi QN qSrc1hi |
| qSrc7lo QN qSrc0lo |
| qSrc7hi QN qSrc0hi |
| |
| qRes17lo QN qScale0lo |
| qRes17hi QN qScale0hi |
| qRes26lo QN qScale0lo |
| qRes26hi QN qScale0hi |
| qRes53lo QN qScale0lo |
| qRes53hi QN qScale0hi |
| |
| ADD pTemp, pScale, #4*8*7 ;// pTemp = scale row 7 (rows are 8 words x 4 bytes); each later SUB of 64 undoes the 32-byte writeback and steps back one more row |
| |
| ;// Row 0 (loading and widening for rows 1 & 7 is interleaved here) |
| VLD1 {qScale0lo, qScale0hi}, [pScale]! |
| VSHLL qSrc0lo, dXj0lo, #(12-1) |
| VSHLL qSrc0hi, dXj0hi, #(12-1) |
| VLD1 {qScale1lo, qScale1hi}, [pScale]! |
| VQRDMULH qSrc0lo, qScale0lo, qSrc0lo |
| VQRDMULH qSrc0hi, qScale0hi, qSrc0hi |
| VLD1 {qScale7lo, qScale7hi}, [pTemp]! |
| VSHLL qSrc1lo, dXj1lo, #(12-1) |
| VSHLL qSrc1hi, dXj1hi, #(12-1) |
| VMOVN dXi0lo, qSrc0lo ;// Output i0 = scaled j0 (must precede the qSrc7 writes, which reuse the same registers) |
| VMOVN dXi0hi, qSrc0hi |
| VSHLL qSrc7lo, dXj7lo, #(12-1) |
| VSHLL qSrc7hi, dXj7hi, #(12-1) |
| SUB pTemp, pTemp, #((16*2)+(4*8*1)) |
| VQRDMULH qSrc1lo, qScale1lo, qSrc1lo |
| VQRDMULH qSrc1hi, qScale1hi, qSrc1hi |
| VQRDMULH qSrc7lo, qScale7lo, qSrc7lo |
| VQRDMULH qSrc7hi, qScale7hi, qSrc7hi |
| VLD1 {qScale2lo, qScale2hi}, [pScale]! |
| |
| ;// Row 1 & 7 butterfly (scaling of rows 2 & 6 starts before this section ends) |
| VHADD qRes17lo, qSrc1lo, qSrc7lo ;// (j1+j7)/2 |
| VHADD qRes17hi, qSrc1hi, qSrc7hi ;// (j1+j7)/2 |
| VMOVN dXi5lo, qRes17lo ;// Output i5 |
| VMOVN dXi5hi, qRes17hi |
| VSUB qRes17lo, qSrc1lo, qSrc7lo ;// j1-j7 |
| VSUB qRes17hi, qSrc1hi, qSrc7hi ;// j1-j7 |
| VMOVN dXi6lo, qRes17lo ;// Output i6 |
| VMOVN dXi6hi, qRes17hi |
| VSHLL qSrc2lo, dXj2lo, #(12-1) |
| VSHLL qSrc2hi, dXj2hi, #(12-1) |
| VLD1 {qScale6lo, qScale6hi}, [pTemp]! |
| VSHLL qSrc6lo, dXj6lo, #(12-1) |
| VSHLL qSrc6hi, dXj6hi, #(12-1) |
| SUB pTemp, pTemp, #((16*2)+(4*8*1)) |
| VQRDMULH qSrc2lo, qScale2lo, qSrc2lo |
| VQRDMULH qSrc2hi, qScale2hi, qSrc2hi |
| VQRDMULH qSrc6lo, qScale6lo, qSrc6lo |
| VQRDMULH qSrc6hi, qScale6hi, qSrc6hi |
| VLD1 {qScale3lo, qScale3hi}, [pScale]! |
| |
| ;// Row 2 & 6 butterfly (scaling of rows 3 & 5 starts before this section ends) |
| VHADD qRes26lo, qSrc2lo, qSrc6lo ;// (j2+j6)/2 |
| VHADD qRes26hi, qSrc2hi, qSrc6hi ;// (j2+j6)/2 |
| VMOVN dXi3lo, qRes26lo ;// Output i3 |
| VMOVN dXi3hi, qRes26hi |
| VSUB qRes26lo, qSrc2lo, qSrc6lo ;// j2-j6 |
| VSUB qRes26hi, qSrc2hi, qSrc6hi ;// j2-j6 |
| VMOVN dXi2lo, qRes26lo ;// Output i2 |
| VMOVN dXi2hi, qRes26hi |
| VSHLL qSrc3lo, dXj3lo, #(12-1) |
| VSHLL qSrc3hi, dXj3hi, #(12-1) |
| VLD1 {qScale5lo, qScale5hi}, [pTemp]! |
| VSHLL qSrc5lo, dXj5lo, #(12-1) |
| VSHLL qSrc5hi, dXj5hi, #(12-1) |
| VQRDMULH qSrc3lo, qScale3lo, qSrc3lo |
| VQRDMULH qSrc3hi, qScale3hi, qSrc3hi |
| VQRDMULH qSrc5lo, qScale5lo, qSrc5lo |
| VQRDMULH qSrc5hi, qScale5hi, qSrc5hi |
| |
| ;// Row 3 & 5 butterfly (row 4 is re-read through pSrc and scaled before this section ends) |
| VHADD qRes53lo, qSrc5lo, qSrc3lo ;// (j5+j3)/2 |
| VHADD qRes53hi, qSrc5hi, qSrc3hi ;// (j5+j3)/2 |
| SUB pSrc, pSrc, #16*2*2 |
| VMOVN dXi7lo, qRes53lo ;// Output i7 |
| VMOVN dXi7hi, qRes53hi |
| VSUB qRes53lo, qSrc5lo, qSrc3lo ;// j5-j3 |
| VSUB qRes53hi, qSrc5hi, qSrc3hi ;// j5-j3 |
| VLD1 qXj4, [pSrc @64] |
| VMOVN dXi4lo, qRes53lo ;// Output i4 |
| VMOVN dXi4hi, qRes53hi |
| VSHLL qSrc4lo, dXj4lo, #(12-1) |
| VSHLL qSrc4hi, dXj4hi, #(12-1) |
| VLD1 {qScale4lo, qScale4hi}, [pScale] |
| LDR pSrc, =armCOMM_IDCTCoef ;// Address of DCT inverse AAN constants (pSrc no longer points into the input block) |
| VQRDMULH qSrc4lo, qScale4lo, qSrc4lo |
| VQRDMULH qSrc4hi, qScale4hi, qSrc4hi |
| VLDR dCoefs, [pSrc] ;// Load DCT inverse AAN constants |
| ;// Row 4 |
| VMOVN dXi1lo, qSrc4lo ;// Output i1 = scaled j4 |
| VMOVN dXi1hi, qSrc4hi |
| |
| MEND |
| |
| END |