blob: 9130223492c99ef97973351499a36f47c9a4524b [file] [log] [blame]
;//
;// Copyright (C) 2004 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;// http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// IDCT_s.s
;//
;// Inverse DCT module
;//
;//
;// ALGORITHM DESCRIPTION
;//
;// The 8x8 2D IDCT is performed by calculating a 1D IDCT for each
;// column and then a 1D IDCT for each row.
;//
;// The 8-point 1D IDCT is defined by
;// f(x) = (C(0)*T(0)*c(0,x) + ... + C(7)*T(7)*c(7,x))/2
;//
;// C(u) = 1/sqrt(2) if u=0 or 1 if u!=0
;// c(u,x) = cos( (2x+1)*u*pi/16 )
;//
;// We compute the 8-point 1D IDCT using the reverse of
;// the Arai-Agui-Nakajima flow graph which we split into
;// 5 stages named in reverse order to identify with the
;// forward DCT. Direct inversion of the forward formulae
;// in file FDCT_s.s gives:
;//
;// IStage 5: j(u) = T(u)*A(u) [ A(u)=4*C(u)*c(u,0) ]
;// [ A(0) = 2*sqrt(2)
;// A(u) = 4*cos(u*pi/16) for (u!=0) ]
;//
;// IStage 4: i0 = j0 i1 = j4
;// i3 = (j2+j6)/2 i2 = (j2-j6)/2
;// i7 = (j5+j3)/2 i4 = (j5-j3)/2
;// i5 = (j1+j7)/2 i6 = (j1-j7)/2
;//
;// IStage 3: h0 = (i0+i1)/2 h1 = (i0-i1)/2
;// h2 = (i2*sqrt2)-i3 h3 = i3
;// h4 = cos(pi/8)*i4 + sin(pi/8)*i6
;// h6 = -sin(pi/8)*i4 + cos(pi/8)*i6
;// [ The above two lines rotate by -(pi/8) ]
;// h5 = (i5-i7)/sqrt2 h7 = (i5+i7)/2
;//
;// IStage 2: g0 = (h0+h3)/2 g3 = (h0-h3)/2
;// g1 = (h1+h2)/2 g2 = (h1-h2)/2
;// g7 = h7 g6 = h6 - h7
;// g5 = h5 - g6 g4 = h4 - g5
;//
;// IStage 1: f0 = (g0+g7)/2 f7 = (g0-g7)/2
;// f1 = (g1+g6)/2 f6 = (g1-g6)/2
;// f2 = (g2+g5)/2 f5 = (g2-g5)/2
;// f3 = (g3+g4)/2 f4 = (g3-g4)/2
;//
;// Note that most coefficients are halved 3 times during the
;// above calculation. We can rescale the algorithm dividing
;// the input by 8 to remove the halvings.
;//
;// IStage 5: j(u) = T(u)*A(u)/8
;//
;// IStage 4: i0 = j0 i1 = j4
;// i3 = j2 + j6 i2 = j2 - j6
;// i7 = j5 + j3 i4 = j5 - j3
;// i5 = j1 + j7 i6 = j1 - j7
;//
;// IStage 3: h0 = i0 + i1 h1 = i0 - i1
;// h2 = (i2*sqrt2)-i3 h3 = i3
;// h4 = 2*( cos(pi/8)*i4 + sin(pi/8)*i6)
;// h6 = 2*(-sin(pi/8)*i4 + cos(pi/8)*i6)
;// h5 = (i5-i7)*sqrt2 h7 = i5 + i7
;//
;// IStage 2: g0 = h0 + h3 g3 = h0 - h3
;// g1 = h1 + h2 g2 = h1 - h2
;// g7 = h7 g6 = h6 - h7
;// g5 = h5 - g6 g4 = h4 - g5
;//
;// IStage 1: f0 = g0 + g7 f7 = g0 - g7
;// f1 = g1 + g6 f6 = g1 - g6
;// f2 = g2 + g5 f5 = g2 - g5
;// f3 = g3 + g4 f4 = g3 - g4
;//
;// Note:
;// 1. The scaling by A(u)/8 can often be combined with inverse
;// quantization. The column and row scalings can be combined.
;// 2. The flowgraph in the AAN paper has h4,g6 negated compared
;// to the above code but is otherwise identical.
;// 3. The rotation by -pi/8 can be performed using three multiplies
;// Eg c*i4+s*i6 = (i6-i4)*s + (c+s)*i4
;// -s*i4+c*i6 = (i6-i4)*s + (c-s)*i6
;// 4. If |T(u)|<=1 then from the IDCT definition,
;// |f(x)| <= ((1/sqrt2) + |c(1,x)| + .. + |c(7,x)|)/2
;// = ((1/sqrt2) + cos(pi/16) + ... + cos(7*pi/16))/2
;// = ((1/sqrt2) + (cot(pi/32)-1)/2)/2
;// = (1 + cos(pi/16) + cos(2pi/16) + cos(3pi/16))/sqrt(2)
;// = (approx)2.64
;// So the max gain of the 2D IDCT is ~x7.0 = 3 bits.
;// The table below shows input patterns generating the maximum
;// value of |f(x)| for input in the range |T(u)|<=1. M=-1, P=+1
;// InputPattern Max |f(x)|
;// PPPPPPPP |f0| = 2.64
;// PPPMMMMM |f1| = 2.64
;// PPMMMPPP |f2| = 2.64
;// PPMMPPMM |f3| = 2.64
;// PMMPPMMP |f4| = 2.64
;// PMMPMMPM |f5| = 2.64
;// PMPPMPMP |f6| = 2.64
;// PMPMPMPM |f7| = 2.64
;// Note that this input pattern is the transpose of the
;// corresponding max input pattern for the FDCT.
;// Arguments
;// Symbolic names for the four argument registers r0-r3 used by the
;// macros in this file.
pSrc RN 0 ;// source data buffer
Stride RN 1 ;// destination stride in bytes
pDest RN 2 ;// destination data buffer
pScale RN 3 ;// pointer to scaling table
;// DCT Inverse Macro
;// The DCT code should be parametrized according
;// to the following inputs:
;// $outsize = "u8" : 8-bit unsigned data saturated (0 to +255)
;// "s9" : 16-bit signed data saturated to 9-bit (-256 to +255)
;// "s16" : 16-bit signed data not saturated (max size ~+/-14273)
;// $inscale = "s16" : signed 16-bit aan-scale table, Q15 format, with 4 byte alignment
;// "s32" : signed 32-bit aan-scale table, Q23 format, with 4 byte alignment
;//
;// Inputs:
;// pSrc = r0 = Pointer to input data
;// Range is -256 to +255 (9-bit)
;// Stride = r1 = Stride between output lines, in bytes
;// pDest = r2 = Pointer to output data
;// pScale = r3 = Pointer to aan-scale table in the format defined by $inscale
;// M_IDCT - 8x8 2D inverse DCT: a column pass followed by a row pass.
;//
;// Macro parameters:
;// $outsize - "u8", "s9" or "s16": selects output saturation and packing
;// $inscale - "s16" or "s32": element size of the AAN scale table at pScale
;// $stride - "s" means the output stride is taken from register Stride;
;// in the ARM1136JS path any other value is used as an
;// immediate byte offset (#($stride)). In the CortexA8 path a
;// non-"s" value is not used as a number: the stores advance
;// by a fixed 8 bytes (u8) or 16 bytes (s9/s16) per row.
;//
;// Exactly one of the two implementations is assembled, selected by the
;// build variables ARM1136JS and CortexA8.
;//
;// The macro relies on M_STR, M_LDR, M_ADR, the symbols ppDest, pStride
;// and pBlk, and the $_F label suffix. None of these are defined in this
;// file; presumably they come from the including function's framework
;// macros - verify against the caller.
MACRO
M_IDCT $outsize, $inscale, $stride
LCLA SHIFT
IF ARM1136JS
;// REGISTER ALLOCATION
;// This is hard since we have 8 values, 9 free registers and each
;// butterfly requires a temporary register. We also want to
;// maintain register order so we can use LDM/STM. The table below
;// summarises the register allocation that meets all these criteria.
;// a=1stcol, b=2ndcol, f,g,h,i are dataflow points described above.
;//
;// r1 a01 g0 h0
;// r4 b01 f0 g1 h1 i0
;// r5 a23 f1 g2 i1
;// r6 b23 f2 g3 h2 i2
;// r7 a45 f3 h3 i3
;// r8 b45 f4 g4 h4 i4
;// r9 a67 f5 g5 h5 i5
;// r10 b67 f6 g6 h6 i6
;// r11 f7 g7 h7 i7
;//
ra01 RN 1
rb01 RN 4
ra23 RN 5
rb23 RN 6
ra45 RN 7
rb45 RN 8
ra67 RN 9
rb67 RN 10
rtmp RN 11
csPiBy8 RN 12 ;// [ (Sin(pi/8)@Q15), (Cos(pi/8)@Q15) ]
LoopRR2 RN 14 ;// [ LoopNumber<<13 , (1/Sqrt(2))@Q15 ]
;// Transpose allocation
xft RN ra01
xf0 RN rb01
xf1 RN ra23
xf2 RN rb23
xf3 RN ra45
xf4 RN rb45
xf5 RN ra67
xf6 RN rb67
xf7 RN rtmp
;// IStage 1 allocation
xg0 RN xft
xg1 RN xf0
xg2 RN xf1
xg3 RN xf2
xgt RN xf3
xg4 RN xf4
xg5 RN xf5
xg6 RN xf6
xg7 RN xf7
;// IStage 2 allocation
xh0 RN xg0
xh1 RN xg1
xht RN xg2
xh2 RN xg3
xh3 RN xgt
xh4 RN xg4
xh5 RN xg5
xh6 RN xg6
xh7 RN xg7
;// IStage 3,4 allocation
xit RN xh0
xi0 RN xh1
xi1 RN xht
xi2 RN xh2
xi3 RN xh3
xi4 RN xh4
xi5 RN xh5
xi6 RN xh6
xi7 RN xh7
;// Save the real destination pointer (and the stride register when
;// $stride = "s"); r1 and r2 are reused below. pDest is then pointed at
;// pBlk, which receives the transposed output of the column pass.
M_STR pDest, ppDest
IF "$stride"="s"
M_STR Stride, pStride
ENDIF
M_ADR pDest, pBlk
;// 0x30fc = sin(pi/8) in Q15 (top half), 0x7642 = cos(pi/8) in Q15 (bottom half)
LDR csPiBy8, =0x30fc7642
;// 0x5a82 = 1/sqrt(2) in Q15 in the bottom half; the top half starts at 0
;// and serves as the loop counter (see the ADDS on LoopRR2 below)
LDR LoopRR2, =0x00005a82
;// Column pass. Each 32-bit load fetches two adjacent 16-bit coefficients,
;// so one iteration processes two columns. Source rows are 16 bytes apart
;// and pSrc advances by 4 bytes per iteration.
v6_idct_col$_F
;// Load even values
LDR xi4, [pSrc], #4 ;// j0
LDR xi5, [pSrc, #4*16-4] ;// j4
LDR xi6, [pSrc, #2*16-4] ;// j2
LDR xi7, [pSrc, #6*16-4] ;// j6
;// Scale Even Values
IF "$inscale"="s16" ;// 16x16 mul
SHIFT SETA 12
LDR xi0, [pScale], #4
LDR xi1, [pScale, #4*16-4]
LDR xi2, [pScale, #2*16-4]
MOV xit, #1<<(SHIFT-1)
SMLABB xi3, xi0, xi4, xit
SMLATT xi4, xi0, xi4, xit
SMLABB xi0, xi1, xi5, xit
SMLATT xi5, xi1, xi5, xit
MOV xi3, xi3, ASR #SHIFT
PKHBT xi4, xi3, xi4, LSL #(16-SHIFT)
LDR xi3, [pScale, #6*16-4]
SMLABB xi1, xi2, xi6, xit
SMLATT xi6, xi2, xi6, xit
MOV xi0, xi0, ASR #SHIFT
PKHBT xi5, xi0, xi5, LSL #(16-SHIFT)
SMLABB xi2, xi3, xi7, xit
SMLATT xi7, xi3, xi7, xit
MOV xi1, xi1, ASR #SHIFT
PKHBT xi6, xi1, xi6, LSL #(16-SHIFT)
MOV xi2, xi2, ASR #SHIFT
PKHBT xi7, xi2, xi7, LSL #(16-SHIFT)
ENDIF
IF "$inscale"="s32" ;// 32x16 mul
SHIFT SETA (12+8-16)
MOV xit, #1<<(SHIFT-1)
LDR xi0, [pScale], #8
LDR xi1, [pScale, #0*32+4-8]
LDR xi2, [pScale, #4*32-8]
LDR xi3, [pScale, #4*32+4-8]
SMLAWB xi0, xi0, xi4, xit
SMLAWT xi1, xi1, xi4, xit
SMLAWB xi2, xi2, xi5, xit
SMLAWT xi3, xi3, xi5, xit
MOV xi0, xi0, ASR #SHIFT
PKHBT xi4, xi0, xi1, LSL #(16-SHIFT)
MOV xi2, xi2, ASR #SHIFT
PKHBT xi5, xi2, xi3, LSL #(16-SHIFT)
LDR xi0, [pScale, #2*32-8]
LDR xi1, [pScale, #2*32+4-8]
LDR xi2, [pScale, #6*32-8]
LDR xi3, [pScale, #6*32+4-8]
SMLAWB xi0, xi0, xi6, xit
SMLAWT xi1, xi1, xi6, xit
SMLAWB xi2, xi2, xi7, xit
SMLAWT xi3, xi3, xi7, xit
MOV xi0, xi0, ASR #SHIFT
PKHBT xi6, xi0, xi1, LSL #(16-SHIFT)
MOV xi2, xi2, ASR #SHIFT
PKHBT xi7, xi2, xi3, LSL #(16-SHIFT)
ENDIF
;// Load odd values
LDR xi0, [pSrc, #1*16-4] ;// j1
LDR xi1, [pSrc, #7*16-4] ;// j7
LDR xi2, [pSrc, #5*16-4] ;// j5
LDR xi3, [pSrc, #3*16-4] ;// j3
IF {TRUE}
;// shortcut if odd values 0
;// (Z stays set only if all four packed odd words are zero)
TEQ xi0, #0
TEQEQ xi1, #0
TEQEQ xi2, #0
TEQEQ xi3, #0
BEQ v6OddZero$_F
ENDIF
;// Store scaled even values
;// (spilled to the output slot at pDest to free xi4..xi7; reloaded by the
;// LDRDs below before this slot is overwritten with the results)
STMIA pDest, {xi4, xi5, xi6, xi7}
;// Scale odd values
IF "$inscale"="s16"
;// Perform AAN Scale
LDR xi4, [pScale, #1*16-4]
LDR xi5, [pScale, #7*16-4]
LDR xi6, [pScale, #5*16-4]
SMLABB xi7, xi0, xi4, xit
SMLATT xi0, xi0, xi4, xit
SMLABB xi4, xi1, xi5, xit
SMLATT xi1, xi1, xi5, xit
MOV xi7, xi7, ASR #SHIFT
PKHBT xi0, xi7, xi0, LSL #(16-SHIFT)
LDR xi7, [pScale, #3*16-4]
SMLABB xi5, xi2, xi6, xit
SMLATT xi2, xi2, xi6, xit
MOV xi4, xi4, ASR #SHIFT
PKHBT xi1, xi4, xi1, LSL #(16-SHIFT)
SMLABB xi6, xi3, xi7, xit
SMLATT xi3, xi3, xi7, xit
MOV xi5, xi5, ASR #SHIFT
PKHBT xi2, xi5, xi2, LSL #(16-SHIFT)
MOV xi6, xi6, ASR #SHIFT
PKHBT xi3, xi6, xi3, LSL #(16-SHIFT)
ENDIF
IF "$inscale"="s32" ;// 32x16 mul
LDR xi4, [pScale, #1*32-8]
LDR xi5, [pScale, #1*32+4-8]
LDR xi6, [pScale, #7*32-8]
LDR xi7, [pScale, #7*32+4-8]
SMLAWB xi4, xi4, xi0, xit
SMLAWT xi5, xi5, xi0, xit
SMLAWB xi6, xi6, xi1, xit
SMLAWT xi7, xi7, xi1, xit
MOV xi4, xi4, ASR #SHIFT
PKHBT xi0, xi4, xi5, LSL #(16-SHIFT)
MOV xi6, xi6, ASR #SHIFT
PKHBT xi1, xi6, xi7, LSL #(16-SHIFT)
LDR xi4, [pScale, #5*32-8]
LDR xi5, [pScale, #5*32+4-8]
LDR xi6, [pScale, #3*32-8]
LDR xi7, [pScale, #3*32+4-8]
SMLAWB xi4, xi4, xi2, xit
SMLAWT xi5, xi5, xi2, xit
SMLAWB xi6, xi6, xi3, xit
SMLAWT xi7, xi7, xi3, xit
MOV xi4, xi4, ASR #SHIFT
PKHBT xi2, xi4, xi5, LSL #(16-SHIFT)
MOV xi6, xi6, ASR #SHIFT
PKHBT xi3, xi6, xi7, LSL #(16-SHIFT)
ENDIF
LDR xit, =0x00010001 ;// rounding constant
SADD16 xi5, xi0, xi1 ;// (j1+j7)/2
SHADD16 xi5, xi5, xit
SSUB16 xi6, xi0, xi1 ;// j1-j7
SADD16 xi7, xi2, xi3 ;// (j5+j3)/2
SHADD16 xi7, xi7, xit
SSUB16 xi4, xi2, xi3 ;// j5-j3
SSUB16 xi3, xi5, xi7 ;// (i5-i7)/2
PKHBT xi0, xi6, xi4, LSL#16 ;// [i4,i6] row a
PKHTB xi1, xi4, xi6, ASR#16 ;// [i4,i6] row b
SMUADX xi2, xi0, csPiBy8 ;// rowa by [c,s]
SMUADX xi4, xi1, csPiBy8 ;// rowb by [c,s]
SMUSD xi0, xi0, csPiBy8 ;// rowa by [-s,c]
SMUSD xi6, xi1, csPiBy8 ;// rowb by [-s,c]
SMULBB xi1, xi3, LoopRR2
SMULTB xi3, xi3, LoopRR2
PKHTB xh4, xi4, xi2, ASR#16 ;// h4/4
PKHTB xh6, xi6, xi0, ASR#16 ;// h6/4
SHADD16 xh7, xi5, xi7 ;// (i5+i7)/4
;// xi0,xi1,xi2,xi3 now free
;// IStage 4,3, rows 2to3 x1/2
MOV xi3, xi3, LSL #1
PKHTB xh5, xi3, xi1, ASR#15 ;// h5/4
LDRD xi0, [pDest, #8] ;// j2,j6 scaled
;// IStage 2, rows4to7
SSUB16 xg6, xh6, xh7
SSUB16 xg5, xh5, xg6
SSUB16 xg4, xh4, xg5
SSUB16 xi2, xi0, xi1 ;// (j2-j6)
SHADD16 xi3, xi0, xi1 ;// (j2+j6)/2
SMULBB xi0, xi2, LoopRR2
SMULTB xi2, xi2, LoopRR2
MOV xi2, xi2, LSL #1
PKHTB xh2, xi2, xi0, ASR#15 ;// i2*sqrt(2)/4
;// xi0, xi1 now free
;// IStage 4,3 rows 0to1 x 1/2
LDRD xi0, [pDest] ;// j0, j4 scaled
SSUB16 xh2, xh2, xi3
;// Adds 0x40000000 to the counter in the top bits of LoopRR2: the carry
;// flag is set on the fourth iteration, which ends the BCC loop below
;// and leaves the counter bits at zero again.
ADDS LoopRR2, LoopRR2, #2<<29 ;// done two rows
SHADD16 xh0, xi0, xi1
SHSUB16 xh1, xi0, xi1
;// IStage 2 rows 0to3 x 1/2
SHSUB16 xg2, xh1, xh2
SHADD16 xg1, xh1, xh2
SHSUB16 xg3, xh0, xh3
SHADD16 xg0, xh0, xh3
;// IStage 1 all rows
SADD16 xf3, xg3, xg4
SSUB16 xf4, xg3, xg4
SADD16 xf2, xg2, xg5
SSUB16 xf5, xg2, xg5
SADD16 xf1, xg1, xg6
SSUB16 xf6, xg1, xg6
SADD16 xf0, xg0, xg7
SSUB16 xf7, xg0, xg7
;// Transpose, store and loop
PKHBT ra01, xf0, xf1, LSL #16
PKHTB rb01, xf1, xf0, ASR #16
PKHBT ra23, xf2, xf3, LSL #16
PKHTB rb23, xf3, xf2, ASR #16
PKHBT ra45, xf4, xf5, LSL #16
PKHTB rb45, xf5, xf4, ASR #16
PKHBT ra67, xf6, xf7, LSL #16
STMIA pDest!, {ra01, ra23, ra45, ra67}
PKHTB rb67, xf7, xf6, ASR #16
STMIA pDest!, {rb01, rb23, rb45, rb67}
BCC v6_idct_col$_F
;// Column pass done: pDest has advanced 4 x 32 bytes, so rewind by 128 to
;// make the intermediate block the source of the row pass, then restore
;// the real destination pointer. With $stride = "s" the saved stride is
;// reloaded into pScale, which is used as the stride register from here on.
SUB pSrc, pDest, #(64*2)
M_LDR pDest, ppDest
IF "$stride"="s"
M_LDR pScale, pStride
ENDIF
B v6_idct_row$_F
;// Column-pass shortcut taken when j1, j3, j5 and j7 are all zero for this
;// pair of columns: only the even half of the flow graph is evaluated and
;// each g0..g3 value is copied to both of its mirrored outputs.
v6OddZero$_F
SSUB16 xi2, xi6, xi7 ;// (j2-j6)
SHADD16 xi3, xi6, xi7 ;// (j2+j6)/2
SMULBB xi0, xi2, LoopRR2
SMULTB xi2, xi2, LoopRR2
MOV xi2, xi2, LSL #1
PKHTB xh2, xi2, xi0, ASR#15 ;// i2*sqrt(2)/4
SSUB16 xh2, xh2, xi3
;// xi0, xi1 now free
;// IStage 4,3 rows 0to1 x 1/2
SHADD16 xh0, xi4, xi5
SHSUB16 xh1, xi4, xi5
;// IStage 2 rows 0to3 x 1/2
SHSUB16 xg2, xh1, xh2
SHADD16 xg1, xh1, xh2
SHSUB16 xg3, xh0, xh3
SHADD16 xg0, xh0, xh3
;// IStage 1 all rows
MOV xf3, xg3
MOV xf4, xg3
MOV xf2, xg2
MOV xf5, xg2
MOV xf1, xg1
MOV xf6, xg1
MOV xf0, xg0
MOV xf7, xg0
;// Transpose
PKHBT ra01, xf0, xf1, LSL #16
PKHTB rb01, xf1, xf0, ASR #16
PKHBT ra23, xf2, xf3, LSL #16
PKHTB rb23, xf3, xf2, ASR #16
PKHBT ra45, xf4, xf5, LSL #16
PKHTB rb45, xf5, xf4, ASR #16
PKHBT ra67, xf6, xf7, LSL #16
PKHTB rb67, xf7, xf6, ASR #16
STMIA pDest!, {ra01, ra23, ra45, ra67}
ADDS LoopRR2, LoopRR2, #2<<29 ;// done two rows
STMIA pDest!, {rb01, rb23, rb45, rb67}
BCC v6_idct_col$_F
SUB pSrc, pDest, #(64*2)
M_LDR pDest, ppDest
IF "$stride"="s"
M_LDR pScale, pStride
ENDIF
;// Row pass over the transposed intermediate block at pSrc. There is no
;// scaling here; two output rows are produced per iteration and the loop
;// again ends when the ADDS on LoopRR2 sets the carry flag.
v6_idct_row$_F
;// IStage 4,3, rows4to7 x1/4
LDR xit, =0x00010001 ;// rounding constant
LDR xi0, [pSrc, #1*16] ;// j1
LDR xi1, [pSrc, #7*16] ;// 4*j7
LDR xi2, [pSrc, #5*16] ;// j5
LDR xi3, [pSrc, #3*16] ;// j3
SHADD16 xi1, xi1, xit ;// 2*j7
SHADD16 xi1, xi1, xit ;// j7
SHADD16 xi5, xi0, xi1 ;// (j1+j7)/2
SSUB16 xi6, xi0, xi1 ;// j1-j7
SHADD16 xi7, xi2, xi3 ;// (j5+j3)/2
SSUB16 xi4, xi2, xi3 ;// j5-j3
SSUB16 xi3, xi5, xi7 ;// (i5-i7)/2
PKHBT xi0, xi6, xi4, LSL#16 ;// [i4,i6] row a
PKHTB xi1, xi4, xi6, ASR#16 ;// [i4,i6] row b
SMUADX xi2, xi0, csPiBy8 ;// rowa by [c,s]
SMUADX xi4, xi1, csPiBy8 ;// rowb by [c,s]
SMUSD xi0, xi0, csPiBy8 ;// rowa by [-s,c]
SMUSD xi6, xi1, csPiBy8 ;// rowb by [-s,c]
SMULBB xi1, xi3, LoopRR2
SMULTB xi3, xi3, LoopRR2
PKHTB xh4, xi4, xi2, ASR#16 ;// h4/4
PKHTB xh6, xi6, xi0, ASR#16 ;// h6/4
SHADD16 xh7, xi5, xi7 ;// (i5+i7)/4
MOV xi3, xi3, LSL #1
PKHTB xh5, xi3, xi1, ASR#15 ;// h5/4
;// xi0,xi1,xi2,xi3 now free
;// IStage 4,3, rows 2to3 x1/2
LDR xi0, [pSrc, #2*16] ;// j2
LDR xi1, [pSrc, #6*16] ;// 2*j6
;// IStage 2, rows4to7
SSUB16 xg6, xh6, xh7
SSUB16 xg5, xh5, xg6
SSUB16 xg4, xh4, xg5
SHADD16 xi1, xi1, xit ;// j6
SSUB16 xi2, xi0, xi1 ;// (j2-j6)
SHADD16 xi3, xi0, xi1 ;// (j2+j6)/2
SMULBB xi0, xi2, LoopRR2
SMULTB xi2, xi2, LoopRR2
MOV xi2, xi2, LSL #1
PKHTB xh2, xi2, xi0, ASR#15 ;// i2*sqrt(2)/4
;// xi0, xi1 now free
;// IStage 4,3 rows 0to1 x 1/2
LDR xi1, [pSrc, #4*16] ;// j4
LDR xi0, [pSrc], #4 ;// j0
SSUB16 xh2, xh2, xi3
ADDS LoopRR2, LoopRR2, #2<<29 ;// done two rows
ADD xi0, xi0, xit, LSL #2 ;// ensure correct round
SHADD16 xh0, xi0, xi1 ;// of DC result
SHSUB16 xh1, xi0, xi1
;// IStage 2 rows 0to3 x 1/2
SHSUB16 xg2, xh1, xh2
SHADD16 xg1, xh1, xh2
SHSUB16 xg3, xh0, xh3
SHADD16 xg0, xh0, xh3
;// IStage 1 all rows
SHADD16 xf3, xg3, xg4
SHSUB16 xf4, xg3, xg4
SHADD16 xf2, xg2, xg5
SHSUB16 xf5, xg2, xg5
SHADD16 xf1, xg1, xg6
SHSUB16 xf6, xg1, xg6
SHADD16 xf0, xg0, xg7
SHSUB16 xf7, xg0, xg7
;// Saturate
;// (none of the instructions from here to the BCC at the end of the row
;// loop sets flags, so the carry from the ADDS above is still valid there)
IF ("$outsize"="u8")
USAT16 xf0, #8, xf0
USAT16 xf1, #8, xf1
USAT16 xf2, #8, xf2
USAT16 xf3, #8, xf3
USAT16 xf4, #8, xf4
USAT16 xf5, #8, xf5
USAT16 xf6, #8, xf6
USAT16 xf7, #8, xf7
ENDIF
IF ("$outsize"="s9")
SSAT16 xf0, #9, xf0
SSAT16 xf1, #9, xf1
SSAT16 xf2, #9, xf2
SSAT16 xf3, #9, xf3
SSAT16 xf4, #9, xf4
SSAT16 xf5, #9, xf5
SSAT16 xf6, #9, xf6
SSAT16 xf7, #9, xf7
ENDIF
;// Transpose to Row, Pack and store
IF ("$outsize"="u8")
ORR xf0, xf0, xf1, LSL #8 ;// [ b1 b0 a1 a0 ]
ORR xf2, xf2, xf3, LSL #8 ;// [ b3 b2 a3 a2 ]
ORR xf4, xf4, xf5, LSL #8 ;// [ b5 b4 a5 a4 ]
ORR xf6, xf6, xf7, LSL #8 ;// [ b7 b6 a7 a6 ]
PKHBT ra01, xf0, xf2, LSL #16
PKHTB rb01, xf2, xf0, ASR #16
PKHBT ra23, xf4, xf6, LSL #16
PKHTB rb23, xf6, xf4, ASR #16
STMIA pDest, {ra01, ra23}
IF "$stride"="s"
;// pScale holds the output stride here (reloaded from pStride above)
ADD pDest, pDest, pScale
STMIA pDest, {rb01, rb23}
ADD pDest, pDest, pScale
ELSE
ADD pDest, pDest, #($stride)
STMIA pDest, {rb01, rb23}
ADD pDest, pDest, #($stride)
ENDIF
ENDIF
IF ("$outsize"="s9"):LOR:("$outsize"="s16")
PKHBT ra01, xf0, xf1, LSL #16
PKHTB rb01, xf1, xf0, ASR #16
PKHBT ra23, xf2, xf3, LSL #16
PKHTB rb23, xf3, xf2, ASR #16
PKHBT ra45, xf4, xf5, LSL #16
PKHTB rb45, xf5, xf4, ASR #16
PKHBT ra67, xf6, xf7, LSL #16
PKHTB rb67, xf7, xf6, ASR #16
STMIA pDest, {ra01, ra23, ra45, ra67}
IF "$stride"="s"
;// pScale holds the output stride here (reloaded from pStride above)
ADD pDest, pDest, pScale
STMIA pDest, {rb01, rb23, rb45, rb67}
ADD pDest, pDest, pScale
ELSE
ADD pDest, pDest, #($stride)
STMIA pDest, {rb01, rb23, rb45, rb67}
ADD pDest, pDest, #($stride)
ENDIF
ENDIF
BCC v6_idct_row$_F
ENDIF ;// ARM1136JS
IF CortexA8
;// NEON implementation: the whole 8x8 block is held in Q7-Q15, one Q
;// register (eight S16 values) per row, with no loops. The alias chains
;// below (j -> i -> h -> g -> f) name the Q register that holds each
;// dataflow point of the flow graph at each stage.
Src0 EQU 7
Src1 EQU 8
Src2 EQU 9
Src3 EQU 10
Src4 EQU 11
Src5 EQU 12
Src6 EQU 13
Src7 EQU 14
Tmp EQU 15
qXj0 QN Src0.S16
qXj1 QN Src1.S16
qXj2 QN Src2.S16
qXj3 QN Src3.S16
qXj4 QN Src4.S16
qXj5 QN Src5.S16
qXj6 QN Src6.S16
qXj7 QN Src7.S16
qXjt QN Tmp.S16
dXj0lo DN (Src0*2).S16
dXj0hi DN (Src0*2+1).S16
dXj1lo DN (Src1*2).S16
dXj1hi DN (Src1*2+1).S16
dXj2lo DN (Src2*2).S16
dXj2hi DN (Src2*2+1).S16
dXj3lo DN (Src3*2).S16
dXj3hi DN (Src3*2+1).S16
dXj4lo DN (Src4*2).S16
dXj4hi DN (Src4*2+1).S16
dXj5lo DN (Src5*2).S16
dXj5hi DN (Src5*2+1).S16
dXj6lo DN (Src6*2).S16
dXj6hi DN (Src6*2+1).S16
dXj7lo DN (Src7*2).S16
dXj7hi DN (Src7*2+1).S16
dXjtlo DN (Tmp*2).S16
dXjthi DN (Tmp*2+1).S16
qXi0 QN qXj0
qXi1 QN qXj4
qXi2 QN qXj2
qXi3 QN qXj7
qXi4 QN qXj5
qXi5 QN qXjt
qXi6 QN qXj1
qXi7 QN qXj6
qXit QN qXj3
dXi0lo DN dXj0lo
dXi0hi DN dXj0hi
dXi1lo DN dXj4lo
dXi1hi DN dXj4hi
dXi2lo DN dXj2lo
dXi2hi DN dXj2hi
dXi3lo DN dXj7lo
dXi3hi DN dXj7hi
dXi4lo DN dXj5lo
dXi4hi DN dXj5hi
dXi5lo DN dXjtlo
dXi5hi DN dXjthi
dXi6lo DN dXj1lo
dXi6hi DN dXj1hi
dXi7lo DN dXj6lo
dXi7hi DN dXj6hi
dXitlo DN dXj3lo
dXithi DN dXj3hi
qXh0 QN qXit
qXh1 QN qXi0
qXh2 QN qXi2
qXh3 QN qXi3
qXh4 QN qXi7
qXh5 QN qXi5
qXh6 QN qXi4
qXh7 QN qXi1
qXht QN qXi6
dXh0lo DN dXitlo
dXh0hi DN dXithi
dXh1lo DN dXi0lo
dXh1hi DN dXi0hi
dXh2lo DN dXi2lo
dXh2hi DN dXi2hi
dXh3lo DN dXi3lo
dXh3hi DN dXi3hi
dXh4lo DN dXi7lo
dXh4hi DN dXi7hi
dXh5lo DN dXi5lo
dXh5hi DN dXi5hi
dXh6lo DN dXi4lo
dXh6hi DN dXi4hi
dXh7lo DN dXi1lo
dXh7hi DN dXi1hi
dXhtlo DN dXi6lo
dXhthi DN dXi6hi
qXg0 QN qXh2
qXg1 QN qXht
qXg2 QN qXh1
qXg3 QN qXh0
qXg4 QN qXh4
qXg5 QN qXh5
qXg6 QN qXh6
qXg7 QN qXh7
qXgt QN qXh3
qXf0 QN qXg6
qXf1 QN qXg5
qXf2 QN qXg4
qXf3 QN qXgt
qXf4 QN qXg3
qXf5 QN qXg2
qXf6 QN qXg1
qXf7 QN qXg0
qXft QN qXg7
qXt0 QN 1.S32
qXt1 QN 2.S32
qT0lo QN 1.S32
qT0hi QN 2.S32
qT1lo QN 3.S32
qT1hi QN 4.S32
qScalelo QN 5.S32 ;// used to read post scale values
qScalehi QN 6.S32
qTemp0 QN 5.S32
qTemp1 QN 6.S32
Scale1 EQU 6
Scale2 EQU 15
qScale1 QN Scale1.S16
qScale2 QN Scale2.S16
dScale1lo DN (Scale1*2).S16
dScale1hi DN (Scale1*2+1).S16
dScale2lo DN (Scale2*2).S16
dScale2hi DN (Scale2*2+1).S16
dCoefs DN 0.S16 ;// Scale coefficients in format {[0] [C] [S] [InvSqrt2]}
InvSqrt2 DN dCoefs[0] ;// 1/sqrt(2) in Q15
S DN dCoefs[1] ;// Sin(PI/8) in Q15
C DN dCoefs[2] ;// Cos(PI/8) in Q15
pTemp RN 12
IMPORT armCOMM_IDCTCoef
;// Load all eight source rows (two rows per VLD1); pSrc ends up 128 bytes
;// past the start of the block.
VLD1 {qXj0,qXj1}, [pSrc @64]!
VLD1 {qXj2,qXj3}, [pSrc @64]!
VLD1 {qXj4,qXj5}, [pSrc @64]!
VLD1 {qXj6,qXj7}, [pSrc @64]!
;// Load PreScale and multiply with Src
;// IStage 4
;// (both prescale macros also load dCoefs and leave pSrc pointing at
;// armCOMM_IDCTCoef)
IF "$inscale"="s16" ;// 16X16 Mul
M_IDCT_PRESCALE16
ENDIF
IF "$inscale"="s32" ;// 32X32 mul
M_IDCT_PRESCALE32
ENDIF
;// IStage 3
VQDMULH qXi2, qXi2, InvSqrt2 ;// i2/sqrt(2)
VHADD qXh0, qXi0, qXi1 ;// (i0+i1)/2
VHSUB qXh1, qXi0, qXi1 ;// (i0-i1)/2
VHADD qXh7, qXi5, qXi7 ;// (i5+i7)/4
VSUB qXh5, qXi5, qXi7 ;// (i5-i7)/2
VQDMULH qXh5, qXh5, InvSqrt2 ;// h5/sqrt(2)
VSUB qXh2, qXi2, qXi3 ;// h2, h3
VMULL qXt0, dXi4lo, C ;// c*i4
VMLAL qXt0, dXi6lo, S ;// c*i4+s*i6
VMULL qXt1, dXi4hi, C
VMLAL qXt1, dXi6hi, S
VSHRN dXh4lo, qXt0, #16 ;// h4
VSHRN dXh4hi, qXt1, #16
VMULL qXt0, dXi6lo, C ;// c*i6
VMLSL qXt0, dXi4lo, S ;// -s*i4 + c*i6
VMULL qXt1, dXi6hi, C
VMLSL qXt1, dXi4hi, S
VSHRN dXh6lo, qXt0, #16 ;// h6
VSHRN dXh6hi, qXt1, #16
;// IStage 2
VSUB qXg6, qXh6, qXh7
VSUB qXg5, qXh5, qXg6
VSUB qXg4, qXh4, qXg5
VHADD qXg1, qXh1, qXh2 ;// (h1+h2)/2
VHSUB qXg2, qXh1, qXh2 ;// (h1-h2)/2
VHADD qXg0, qXh0, qXh3 ;// (h0+h3)/2
VHSUB qXg3, qXh0, qXh3 ;// (h0-h3)/2
;// IStage 1 all rows
VADD qXf3, qXg3, qXg4
VSUB qXf4, qXg3, qXg4
VADD qXf2, qXg2, qXg5
VSUB qXf5, qXg2, qXg5
VADD qXf1, qXg1, qXg6
VSUB qXf6, qXg1, qXg6
VADD qXf0, qXg0, qXg7
VSUB qXf7, qXg0, qXg7
;// Transpose, store and loop
;// XTRn is the Q register number that holds qXfn after IStage 1 (follow
;// the alias chain above). The 8x8 S16 transpose is done in three steps:
;// VTRN on S16 pairs, VTRN on S32 pairs, then VSWP of D halves.
XTR0 EQU Src5
XTR1 EQU Tmp
XTR2 EQU Src6
XTR3 EQU Src7
XTR4 EQU Src3
XTR5 EQU Src0
XTR6 EQU Src1
XTR7 EQU Src2
XTRt EQU Src4
qA0 QN XTR0.S32 ;// for XTRpose
qA1 QN XTR1.S32
qA2 QN XTR2.S32
qA3 QN XTR3.S32
qA4 QN XTR4.S32
qA5 QN XTR5.S32
qA6 QN XTR6.S32
qA7 QN XTR7.S32
dB0 DN XTR0*2+1 ;// for using VSWP
dB1 DN XTR1*2+1
dB2 DN XTR2*2+1
dB3 DN XTR3*2+1
dB4 DN XTR4*2
dB5 DN XTR5*2
dB6 DN XTR6*2
dB7 DN XTR7*2
VTRN qXf0, qXf1
VTRN qXf2, qXf3
VTRN qXf4, qXf5
VTRN qXf6, qXf7
VTRN qA0, qA2
VTRN qA1, qA3
VTRN qA4, qA6
VTRN qA5, qA7
VSWP dB0, dB4
VSWP dB1, dB5
VSWP dB2, dB6
VSWP dB3, dB7
;// Second (Y) pass: the transposed first-pass outputs become the inputs
;// j0..j7, and the same alias scheme is rebuilt on top of them.
qYj0 QN qXf0
qYj1 QN qXf1
qYj2 QN qXf2
qYj3 QN qXf3
qYj4 QN qXf4
qYj5 QN qXf5
qYj6 QN qXf6
qYj7 QN qXf7
qYjt QN qXft
dYj0lo DN (XTR0*2).S16
dYj0hi DN (XTR0*2+1).S16
dYj1lo DN (XTR1*2).S16
dYj1hi DN (XTR1*2+1).S16
dYj2lo DN (XTR2*2).S16
dYj2hi DN (XTR2*2+1).S16
dYj3lo DN (XTR3*2).S16
dYj3hi DN (XTR3*2+1).S16
dYj4lo DN (XTR4*2).S16
dYj4hi DN (XTR4*2+1).S16
dYj5lo DN (XTR5*2).S16
dYj5hi DN (XTR5*2+1).S16
dYj6lo DN (XTR6*2).S16
dYj6hi DN (XTR6*2+1).S16
dYj7lo DN (XTR7*2).S16
dYj7hi DN (XTR7*2+1).S16
dYjtlo DN (XTRt*2).S16
dYjthi DN (XTRt*2+1).S16
qYi0 QN qYj0
qYi1 QN qYj4
qYi2 QN qYj2
qYi3 QN qYj7
qYi4 QN qYj5
qYi5 QN qYjt
qYi6 QN qYj1
qYi7 QN qYj6
qYit QN qYj3
dYi0lo DN dYj0lo
dYi0hi DN dYj0hi
dYi1lo DN dYj4lo
dYi1hi DN dYj4hi
dYi2lo DN dYj2lo
dYi2hi DN dYj2hi
dYi3lo DN dYj7lo
dYi3hi DN dYj7hi
dYi4lo DN dYj5lo
dYi4hi DN dYj5hi
dYi5lo DN dYjtlo
dYi5hi DN dYjthi
dYi6lo DN dYj1lo
dYi6hi DN dYj1hi
dYi7lo DN dYj6lo
dYi7hi DN dYj6hi
dYitlo DN dYj3lo
dYithi DN dYj3hi
qYh0 QN qYit
qYh1 QN qYi0
qYh2 QN qYi2
qYh3 QN qYi3
qYh4 QN qYi7
qYh5 QN qYi5
qYh6 QN qYi4
qYh7 QN qYi1
qYht QN qYi6
dYh0lo DN dYitlo
dYh0hi DN dYithi
dYh1lo DN dYi0lo
dYh1hi DN dYi0hi
dYh2lo DN dYi2lo
dYh2hi DN dYi2hi
dYh3lo DN dYi3lo
dYh3hi DN dYi3hi
dYh4lo DN dYi7lo
dYh4hi DN dYi7hi
dYh5lo DN dYi5lo
dYh5hi DN dYi5hi
dYh6lo DN dYi4lo
dYh6hi DN dYi4hi
dYh7lo DN dYi1lo
dYh7hi DN dYi1hi
dYhtlo DN dYi6lo
dYhthi DN dYi6hi
qYg0 QN qYh2
qYg1 QN qYht
qYg2 QN qYh1
qYg3 QN qYh0
qYg4 QN qYh4
qYg5 QN qYh5
qYg6 QN qYh6
qYg7 QN qYh7
qYgt QN qYh3
qYf0 QN qYg6
qYf1 QN qYg5
qYf2 QN qYg4
qYf3 QN qYgt
qYf4 QN qYg3
qYf5 QN qYg2
qYf6 QN qYg1
qYf7 QN qYg0
qYft QN qYg7
;// Rounding right shifts of j7 (by 2) and j6 (by 1); the ARM1136JS row
;// pass labels the same inputs as 4*j7 and 2*j6.
VRSHR qYj7, qYj7, #2
VRSHR qYj6, qYj6, #1
VHADD qYi5, qYj1, qYj7 ;// i5 = (j1+j7)/2
VSUB qYi6, qYj1, qYj7 ;// i6 = j1-j7
VHADD qYi3, qYj2, qYj6 ;// i3 = (j2+j6)/2
VSUB qYi2, qYj2, qYj6 ;// i2 = j2-j6
VHADD qYi7, qYj5, qYj3 ;// i7 = (j5+j3)/2
VSUB qYi4, qYj5, qYj3 ;// i4 = j5-j3
VQDMULH qYi2, qYi2, InvSqrt2 ;// i2/sqrt(2)
;// IStage 4,3 rows 0to1 x 1/2
MOV pTemp, #0x4 ;// ensure correct round
VDUP qScale1, pTemp ;// of DC result
VADD qYi0, qYi0, qScale1
VHADD qYh0, qYi0, qYi1 ;// (i0+i1)/2
VHSUB qYh1, qYi0, qYi1 ;// (i0-i1)/2
VHADD qYh7, qYi5, qYi7 ;// (i5+i7)/4
VSUB qYh5, qYi5, qYi7 ;// (i5-i7)/2
VSUB qYh2, qYi2, qYi3 ;// h2, h3
VQDMULH qYh5, qYh5, InvSqrt2 ;// h5/sqrt(2)
VMULL qXt0, dYi4lo, C ;// c*i4
VMLAL qXt0, dYi6lo, S ;// c*i4+s*i6
VMULL qXt1, dYi4hi, C
VMLAL qXt1, dYi6hi, S
VSHRN dYh4lo, qXt0, #16 ;// h4
VSHRN dYh4hi, qXt1, #16
VMULL qXt0, dYi6lo, C ;// c*i6
VMLSL qXt0, dYi4lo, S ;// -s*i4 + c*i6
VMULL qXt1, dYi6hi, C
VMLSL qXt1, dYi4hi, S
VSHRN dYh6lo, qXt0, #16 ;// h6
VSHRN dYh6hi, qXt1, #16
VSUB qYg6, qYh6, qYh7
VSUB qYg5, qYh5, qYg6
VSUB qYg4, qYh4, qYg5
;// IStage 2 rows 0to3 x 1/2
VHADD qYg1, qYh1, qYh2 ;// (h1+h2)/2
VHSUB qYg2, qYh1, qYh2 ;// (h1-h2)/2
VHADD qYg0, qYh0, qYh3 ;// (h0+h3)/2
VHSUB qYg3, qYh0, qYh3 ;// (h0-h3)/2
;// IStage 1 all rows
VHADD qYf3, qYg3, qYg4
VHSUB qYf4, qYg3, qYg4
VHADD qYf2, qYg2, qYg5
VHSUB qYf5, qYg2, qYg5
VHADD qYf1, qYg1, qYg6
VHSUB qYf6, qYg1, qYg6
VHADD qYf0, qYg0, qYg7
VHSUB qYf7, qYg0, qYg7
;// YTRn is the Q register number that holds qYfn; transpose back to row
;// order using the same three-step scheme as the first pass.
YTR0 EQU Src0
YTR1 EQU Src4
YTR2 EQU Src1
YTR3 EQU Src2
YTR4 EQU Src7
YTR5 EQU Src5
YTR6 EQU Tmp
YTR7 EQU Src6
YTRt EQU Src3
qC0 QN YTR0.S32 ;// for YTRpose
qC1 QN YTR1.S32
qC2 QN YTR2.S32
qC3 QN YTR3.S32
qC4 QN YTR4.S32
qC5 QN YTR5.S32
qC6 QN YTR6.S32
qC7 QN YTR7.S32
dD0 DN YTR0*2+1 ;// for using VSWP
dD1 DN YTR1*2+1
dD2 DN YTR2*2+1
dD3 DN YTR3*2+1
dD4 DN YTR4*2
dD5 DN YTR5*2
dD6 DN YTR6*2
dD7 DN YTR7*2
VTRN qYf0, qYf1
VTRN qYf2, qYf3
VTRN qYf4, qYf5
VTRN qYf6, qYf7
VTRN qC0, qC2
VTRN qC1, qC3
VTRN qC4, qC6
VTRN qC5, qC7
VSWP dD0, dD4
VSWP dD1, dD5
VSWP dD2, dD6
VSWP dD3, dD7
dYf0U8 DN YTR0*2.U8
dYf1U8 DN YTR1*2.U8
dYf2U8 DN YTR2*2.U8
dYf3U8 DN YTR3*2.U8
dYf4U8 DN YTR4*2.U8
dYf5U8 DN YTR5*2.U8
dYf6U8 DN YTR6*2.U8
dYf7U8 DN YTR7*2.U8
;//
;// Do saturation if outsize is other than S16
;//
IF ("$outsize"="u8")
;// Output range [0-255]
;// NOTE(review): the source operands are typed S16 and the destinations
;// U8 - confirm that the assembler emits an unsigned-saturating narrow
;// for this operand combination.
VQMOVN dYf0U8, qYf0
VQMOVN dYf1U8, qYf1
VQMOVN dYf2U8, qYf2
VQMOVN dYf3U8, qYf3
VQMOVN dYf4U8, qYf4
VQMOVN dYf5U8, qYf5
VQMOVN dYf6U8, qYf6
VQMOVN dYf7U8, qYf7
ENDIF
IF ("$outsize"="s9")
;// Output range [-256 to +255]
;// Saturating left shift by 7 followed by a right shift by 7 clamps each
;// S16 lane to the signed 9-bit range.
VQSHL qYf0, qYf0, #16-9
VQSHL qYf1, qYf1, #16-9
VQSHL qYf2, qYf2, #16-9
VQSHL qYf3, qYf3, #16-9
VQSHL qYf4, qYf4, #16-9
VQSHL qYf5, qYf5, #16-9
VQSHL qYf6, qYf6, #16-9
VQSHL qYf7, qYf7, #16-9
VSHR qYf0, qYf0, #16-9
VSHR qYf1, qYf1, #16-9
VSHR qYf2, qYf2, #16-9
VSHR qYf3, qYf3, #16-9
VSHR qYf4, qYf4, #16-9
VSHR qYf5, qYf5, #16-9
VSHR qYf6, qYf6, #16-9
VSHR qYf7, qYf7, #16-9
ENDIF
;// Store output depending on the Stride size
IF "$stride"="s"
;// NOTE(review): this branch stores full Q registers (16 bytes per row)
;// regardless of $outsize. With $outsize = "u8" only the low D half of
;// each register holds narrowed data - confirm whether "u8" is ever
;// combined with $stride = "s".
VST1 qYf0, [pDest @64], Stride
VST1 qYf1, [pDest @64], Stride
VST1 qYf2, [pDest @64], Stride
VST1 qYf3, [pDest @64], Stride
VST1 qYf4, [pDest @64], Stride
VST1 qYf5, [pDest @64], Stride
VST1 qYf6, [pDest @64], Stride
VST1 qYf7, [pDest @64]
ELSE
IF ("$outsize"="u8")
VST1 dYf0U8, [pDest @64], #8
VST1 dYf1U8, [pDest @64], #8
VST1 dYf2U8, [pDest @64], #8
VST1 dYf3U8, [pDest @64], #8
VST1 dYf4U8, [pDest @64], #8
VST1 dYf5U8, [pDest @64], #8
VST1 dYf6U8, [pDest @64], #8
VST1 dYf7U8, [pDest @64]
ELSE
;// ("$outsize"="s9") or ("$outsize"="s16")
VST1 qYf0, [pDest @64], #16
VST1 qYf1, [pDest @64], #16
VST1 qYf2, [pDest @64], #16
VST1 qYf3, [pDest @64], #16
VST1 qYf4, [pDest @64], #16
VST1 qYf5, [pDest @64], #16
VST1 qYf6, [pDest @64], #16
VST1 qYf7, [pDest @64]
ENDIF
ENDIF
ENDIF ;// CortexA8
MEND
;// Scale TWO input rows with TWO rows of 16 bit scale values
;//
;// This macro is used by M_IDCT_PRESCALE16 to pre-scale two rows of
;// input (eight S16 values each) with two rows of scale values. It also
;// loads the next two rows of scale values from pScale when the
;// $LastRow flag is "0".
;//
;// Input Registers:
;//
;// $dAlo - Input D register with first four S16 values of row n
;// $dAhi - Input D register with next four S16 values of row n
;// $dBlo - Input D register with first four S16 values of row n+1
;// $dBhi - Input D register with next four S16 values of row n+1
;// pScale - Pointer to next row of scale values
;// qT0lo - Temporary scratch register
;// qT0hi - Temporary scratch register
;// qT1lo - Temporary scratch register
;// qT1hi - Temporary scratch register
;// dScale1lo - Scale value of row n
;// dScale1hi - Scale value of row n
;// dScale2lo - Scale value of row n+1
;// dScale2hi - Scale value of row n+1
;//
;// Input Flag
;//
;// $LastRow - Flag to indicate whether current row is last row
;// ("0" = more rows follow, so the next scale rows are loaded)
;//
;// Output Registers:
;//
;// $dAlo - Scaled output values (first four S16 of row n)
;// $dAhi - Scaled output values (next four S16 of row n)
;// $dBlo - Scaled output values (first four S16 of row n+1)
;// $dBhi - Scaled output values (next four S16 of row n+1)
;// qScale1 - Scale values for row n+2 (reloaded only when $LastRow is "0")
;// qScale2 - Scale values for row n+3 (reloaded only when $LastRow is "0")
;// pScale - Pointer to next row of scale values
;// (advanced by 32 bytes when $LastRow is "0")
;//
MACRO
M_IDCT_SCALE16 $dAlo, $dAhi, $dBlo, $dBhi, $LastRow
;// Widening S16 x S16 -> S32 products of input and scale
VMULL qT0lo, $dAlo, dScale1lo
VMULL qT0hi, $dAhi, dScale1hi
VMULL qT1lo, $dBlo, dScale2lo
VMULL qT1hi, $dBhi, dScale2hi
;// The current scale rows have been consumed by the multiplies above, so
;// qScale1/qScale2 can be overwritten with the next two rows here.
IF "$LastRow"="0"
VLD1 qScale1, [pScale], #16 ;// Load scale for row n+2
VLD1 qScale2, [pScale], #16 ;// Load scale for row n+3
ENDIF
;// Rounding right shift by 12 with saturating narrow back to S16
VQRSHRN $dAlo, qT0lo, #12
VQRSHRN $dAhi, qT0hi, #12
VQRSHRN $dBlo, qT1lo, #12
VQRSHRN $dBhi, qT1hi, #12
MEND
;// Scale 8x8 block input values with 16 bit scale values
;//
;// This macro pre-scales a block of 8x8 input values. It also performs
;// the IStage 4 butterflies of the IDCT (the j -> i step) and loads the
;// AAN constants into dCoefs.
;//
;// Input Registers:
;//
;// dXjnlo - n th input D register with first four S16 values
;// dXjnhi - n th input D register with next four S16 values
;// qXjn - n th input Q register with eight S16 values
;// pScale - Pointer to scale values
;//
;// Output Registers:
;//
;// qXin - n th output Q register with eight S16 output values of 1st stage
;// dCoefs - AAN constants loaded from armCOMM_IDCTCoef
;//
;// Side effects:
;//
;// pScale - advanced past the eight rows of scale values (8 x 16 bytes)
;// pSrc - overwritten with the address of armCOMM_IDCTCoef
;//
MACRO
M_IDCT_PRESCALE16
VLD1 qScale1, [pScale], #16 ;// Load Pre scale for row 0
VLD1 qScale2, [pScale], #16 ;// Load Pre scale for row 1
M_IDCT_SCALE16 dXj0lo, dXj0hi, dXj1lo, dXj1hi, 0 ;// Pre scale row 0 & 1
M_IDCT_SCALE16 dXj2lo, dXj2hi, dXj3lo, dXj3hi, 0 ;// Pre scale row 2 & 3
M_IDCT_SCALE16 dXj4lo, dXj4hi, dXj5lo, dXj5hi, 0 ;// Pre scale row 4 & 5
M_IDCT_SCALE16 dXj6lo, dXj6hi, dXj7lo, dXj7hi, 1 ;// Pre scale row 6 & 7, no further scale load
;// IStage 4. i0 and i1 need no instruction: qXi0 and qXi1 are register
;// aliases of qXj0 and qXj4.
VHADD qXi5, qXj1, qXj7 ;// (j1+j7)/2
VSUB qXi6, qXj1, qXj7 ;// j1-j7
LDR pSrc, =armCOMM_IDCTCoef ;// Address of DCT inverse AAN constants
VHADD qXi3, qXj2, qXj6 ;// (j2+j6)/2
VSUB qXi2, qXj2, qXj6 ;// j2-j6
VLDR dCoefs, [pSrc] ;// Load DCT inverse AAN constants
VHADD qXi7, qXj5, qXj3 ;// (j5+j3)/2
VSUB qXi4, qXj5, qXj3 ;// j5-j3
MEND
;// Scale 8x8 block input values with 32 bit scale values
;//
;// This macro pre-scales a block of 8x8 input values. It also performs
;// the IStage 4 butterflies of the IDCT (the j -> i step) and loads the
;// AAN constants into dCoefs.
;//
;// Each S16 input is widened to S32 with a left shift of 11 and then
;// multiplied by its scale value with VQRDMULH (rounded, doubled, high
;// half), i.e. (input * scale) >> 20 with rounding. The butterflies are
;// done in 32 bits and the results are narrowed back to S16 with VMOVN.
;//
;// Input Registers:
;//
;// dXjnlo - n th input D register with first four S16 values
;// dXjnhi - n th input D register with next four S16 values
;// qXjn - n th input Q register with eight S16 values
;// pScale - Pointer to 32bit scale values in Q23 format
;// pSrc - Must point 128 bytes past the start of the source block
;// (as left by the four VLD1 loads in M_IDCT); row 4 is
;// re-read from pSrc - 64
;//
;// Output Registers:
;//
;// dXinlo - n th output D register with first four S16 output values of 1st stage
;// dXinhi - n th output D register with next four S16 output values of 1st stage
;// dCoefs - AAN constants loaded from armCOMM_IDCTCoef
;//
;// Side effects:
;//
;// Q0-Q6 - used as scale, source and result scratch registers
;// pTemp - used to walk scale rows 7, 6, 5 backwards
;// pScale - left pointing at scale row 4
;// pSrc - overwritten with the address of armCOMM_IDCTCoef
;//
MACRO
M_IDCT_PRESCALE32
;// Only Q0-Q3 are available for scale values, so each row's pair of
;// registers is an alias of one of two physical pairs.
qScale0lo QN 0.S32
qScale0hi QN 1.S32
qScale1lo QN 2.S32
qScale1hi QN 3.S32
qScale2lo QN qScale1lo
qScale2hi QN qScale1hi
qScale3lo QN qScale1lo
qScale3hi QN qScale1hi
qScale4lo QN qScale1lo
qScale4hi QN qScale1hi
qScale5lo QN qScale0lo
qScale5hi QN qScale0hi
qScale6lo QN qScale0lo
qScale6hi QN qScale0hi
qScale7lo QN qScale0lo
qScale7hi QN qScale0hi
;// Widened source rows. qSrc1hi is Q11, the register that holds input
;// row j4, so j4 is destroyed below and re-read from memory before row 4
;// is scaled.
qSrc0lo QN 4.S32
qSrc0hi QN 5.S32
qSrc1lo QN 6.S32
qSrc1hi QN Src4.S32
qSrc2lo QN qSrc0lo
qSrc2hi QN qSrc0hi
qSrc3lo QN qSrc0lo
qSrc3hi QN qSrc0hi
qSrc4lo QN qSrc0lo
qSrc4hi QN qSrc0hi
qSrc5lo QN qSrc1lo
qSrc5hi QN qSrc1hi
qSrc6lo QN qSrc1lo
qSrc6hi QN qSrc1hi
qSrc7lo QN qSrc0lo
qSrc7hi QN qSrc0hi
;// Butterfly results reuse Q0/Q1 once the scale values held there have
;// been consumed.
qRes17lo QN qScale0lo
qRes17hi QN qScale0hi
qRes26lo QN qScale0lo
qRes26hi QN qScale0hi
qRes53lo QN qScale0lo
qRes53hi QN qScale0hi
;// A scale row is 8 x 4 = 32 bytes
ADD pTemp, pScale, #4*8*7 ;// Address of pScale[7]
;// Row 0
VLD1 {qScale0lo, qScale0hi}, [pScale]!
VSHLL qSrc0lo, dXj0lo, #(12-1)
VSHLL qSrc0hi, dXj0hi, #(12-1)
VLD1 {qScale1lo, qScale1hi}, [pScale]!
VQRDMULH qSrc0lo, qScale0lo, qSrc0lo
VQRDMULH qSrc0hi, qScale0hi, qSrc0hi
VLD1 {qScale7lo, qScale7hi}, [pTemp]!
VSHLL qSrc1lo, dXj1lo, #(12-1)
VSHLL qSrc1hi, dXj1hi, #(12-1)
VMOVN dXi0lo, qSrc0lo ;// Output i0
VMOVN dXi0hi, qSrc0hi
VSHLL qSrc7lo, dXj7lo, #(12-1)
VSHLL qSrc7hi, dXj7hi, #(12-1)
;// Undo the 32-byte post-increment and step back one more row: pTemp now
;// addresses scale row 6
SUB pTemp, pTemp, #((16*2)+(4*8*1))
VQRDMULH qSrc1lo, qScale1lo, qSrc1lo
VQRDMULH qSrc1hi, qScale1hi, qSrc1hi
VQRDMULH qSrc7lo, qScale7lo, qSrc7lo
VQRDMULH qSrc7hi, qScale7hi, qSrc7hi
VLD1 {qScale2lo, qScale2hi}, [pScale]!
;// Row 1 & 7
VHADD qRes17lo, qSrc1lo, qSrc7lo ;// (j1+j7)/2
VHADD qRes17hi, qSrc1hi, qSrc7hi ;// (j1+j7)/2
VMOVN dXi5lo, qRes17lo ;// Output i5
VMOVN dXi5hi, qRes17hi
VSUB qRes17lo, qSrc1lo, qSrc7lo ;// j1-j7
VSUB qRes17hi, qSrc1hi, qSrc7hi ;// j1-j7
VMOVN dXi6lo, qRes17lo ;// Output i6
VMOVN dXi6hi, qRes17hi
VSHLL qSrc2lo, dXj2lo, #(12-1)
VSHLL qSrc2hi, dXj2hi, #(12-1)
VLD1 {qScale6lo, qScale6hi}, [pTemp]!
VSHLL qSrc6lo, dXj6lo, #(12-1)
VSHLL qSrc6hi, dXj6hi, #(12-1)
;// Step back again: pTemp now addresses scale row 5
SUB pTemp, pTemp, #((16*2)+(4*8*1))
VQRDMULH qSrc2lo, qScale2lo, qSrc2lo
VQRDMULH qSrc2hi, qScale2hi, qSrc2hi
VQRDMULH qSrc6lo, qScale6lo, qSrc6lo
VQRDMULH qSrc6hi, qScale6hi, qSrc6hi
VLD1 {qScale3lo, qScale3hi}, [pScale]!
;// Row 2 & 6
VHADD qRes26lo, qSrc2lo, qSrc6lo ;// (j2+j6)/2
VHADD qRes26hi, qSrc2hi, qSrc6hi ;// (j2+j6)/2
VMOVN dXi3lo, qRes26lo ;// Output i3
VMOVN dXi3hi, qRes26hi
VSUB qRes26lo, qSrc2lo, qSrc6lo ;// j2-j6
VSUB qRes26hi, qSrc2hi, qSrc6hi ;// j2-j6
VMOVN dXi2lo, qRes26lo ;// Output i2
VMOVN dXi2hi, qRes26hi
VSHLL qSrc3lo, dXj3lo, #(12-1)
VSHLL qSrc3hi, dXj3hi, #(12-1)
VLD1 {qScale5lo, qScale5hi}, [pTemp]!
VSHLL qSrc5lo, dXj5lo, #(12-1)
VSHLL qSrc5hi, dXj5hi, #(12-1)
VQRDMULH qSrc3lo, qScale3lo, qSrc3lo
VQRDMULH qSrc3hi, qScale3hi, qSrc3hi
VQRDMULH qSrc5lo, qScale5lo, qSrc5lo
VQRDMULH qSrc5hi, qScale5hi, qSrc5hi
;// Row 3 & 5
VHADD qRes53lo, qSrc5lo, qSrc3lo ;// (j5+j3)/2
VHADD qRes53hi, qSrc5hi, qSrc3hi ;// (j5+j3)/2
;// Rewind pSrc by 64 bytes so that it addresses source row 4
SUB pSrc, pSrc, #16*2*2
VMOVN dXi7lo, qRes53lo ;// Output i7
VMOVN dXi7hi, qRes53hi
VSUB qRes53lo, qSrc5lo, qSrc3lo ;// j5-j3
VSUB qRes53hi, qSrc5hi, qSrc3hi ;// j5-j3
;// Re-read row 4: its register (Q11) was used as qSrc1hi/5hi/6hi scratch
VLD1 qXj4, [pSrc @64]
VMOVN dXi4lo, qRes53lo ;// Output i4
VMOVN dXi4hi, qRes53hi
VSHLL qSrc4lo, dXj4lo, #(12-1)
VSHLL qSrc4hi, dXj4hi, #(12-1)
VLD1 {qScale4lo, qScale4hi}, [pScale]
LDR pSrc, =armCOMM_IDCTCoef ;// Address of DCT inverse AAN constants
VQRDMULH qSrc4lo, qScale4lo, qSrc4lo
VQRDMULH qSrc4hi, qScale4hi, qSrc4hi
;// dCoefs is D0, the low half of Q0; Q0 is no longer needed as a
;// scale/result register at this point
VLDR dCoefs, [pSrc] ;// Load DCT inverse AAN constants
;// Row 4
VMOVN dXi1lo, qSrc4lo ;// Output i1
VMOVN dXi1hi, qSrc4hi
MEND
END