;// blob: d2a134eb8244e76980fb705eb0b3debea8e8ca26 [file] [log] [blame]
;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;// http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name: omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s
;// OpenMAX DL: v1.0.2
;// Revision: 12290
;// Date: Wednesday, April 9, 2008
;//
;//
;//
;//
INCLUDE omxtypes_s.h
INCLUDE armCOMM_s.h
M_VARIANTS CortexA8
IMPORT armVCM4P10_DeblockingLumabSLT4_unsafe
IMPORT armVCM4P10_DeblockingLumabSGE4_unsafe
IF CortexA8
;// Packed two-level loop counter. The function loads it into XY and doubles
;// it (ADDS XY, XY, XY) once per edge. Bit 28 is shifted out on the 4th
;// doubling and bit 24 on the 8th: each carry-out ends one LoopX pass, and
;// the second one also leaves XY == 0, which ends LoopY.
;// Net effect: 2 outer passes x 4 edges each.
LOOP_COUNT EQU 0x11000000
;// Function arguments
;// r0-r3 hold the first four arguments. pThresholds and pBS are loaded by
;// the function from its stack arguments (ppThresholds / ppBS) with M_LDR.
pSrcDst RN 0
srcdstStep RN 1
pAlpha RN 2
pBeta RN 3
pThresholds RN 5
pBS RN 4
;// Halfword read from pBS: two bS bytes covering the 8 rows of one pass
bS10 RN 12
;// pAlpha_0/pBeta_0 are the incoming pointers (same registers as
;// pAlpha/pBeta); pAlpha_1/pBeta_1 are set to those pointers + 1.
pAlpha_0 RN 2
pBeta_0 RN 3
pAlpha_1 RN 7
pBeta_1 RN 8
;// pTmp walks the odd rows (pSrcDst + srcdstStep); pTmpStep = 2 * srcdstStep
pTmp RN 10
pTmpStep RN 11
;// Loop
XY RN 9
;// Rows input
;// The row registers are the same D registers as the dP_x / dQ_x pixel
;// aliases below, chosen so that the in-register 8x8 transpose leaves each
;// pixel column in its dP_x / dQ_x register.
dRow0 DN D7.U8
dRow1 DN D8.U8
dRow2 DN D5.U8
dRow3 DN D10.U8
dRow4 DN D6.U8
dRow5 DN D9.U8
dRow6 DN D4.U8
dRow7 DN D11.U8
;// dRow0 - dP_3, dRow1 - dQ_0, dRow2 - dP_1, dRow3 - dQ_2
;// dRow4 - dP_2, dRow5 - dQ_1, dRow6 - dP_0, dRow7 - dQ_3
;// Rows output
;// dRownX: rows stored by the bS < 4 path (p2 and q2 keep their input
;// registers D6 / D10).
dRown0 DN D7.U8
dRown1 DN D24.U8
dRown2 DN D30.U8
dRown3 DN D10.U8
dRown4 DN D6.U8
dRown5 DN D25.U8
dRown6 DN D29.U8
dRown7 DN D11.U8
;// dP_0n DN D29.U8
;// dP_1n DN D30.U8
;// dP_2n DN D31.U8
;//
;// dQ_0n DN D24.U8 ;!!;Temp2
;// dQ_1n DN D25.U8 ;!!;Temp2
;// dQ_2n DN D28.U8 ;!!;dQ_0t
;//
;// dRown0 - dP_3, dRown1 - dQ_0n
;// dRown2 - dP_1n, dRown3 - dQ_2
;// dRown4 - dP_2, dRown5 - dQ_1n
;// dRown6 - dP_0n, dRown7 - dQ_3
;// dRowXn: rows stored by the bS >= 4 path (p2 and q2 come from the
;// dP_2n / dQ_2n registers D31 / D28).
dRow0n DN D7.U8
dRow1n DN D24.U8
dRow2n DN D30.U8
dRow3n DN D28.U8
dRow4n DN D31.U8
dRow5n DN D25.U8
dRow6n DN D29.U8
dRow7n DN D11.U8
;// dRow0n - dP_3, dRow1n - dQ_0n, dRow2n - dP_1n, dRow3n - dQ_2n
;// dRow4n - dP_2n, dRow5n - dQ_1n, dRow6n - dP_0n, dRow7n - dQ_3
;// Pixels
;// One D register per pixel column; lane i holds that column's pixel of row i.
dP_0 DN D4.U8
dP_1 DN D5.U8
dP_2 DN D6.U8
dP_3 DN D7.U8
dQ_0 DN D8.U8
dQ_1 DN D9.U8
dQ_2 DN D10.U8
dQ_3 DN D11.U8
;// Filtering Decision
;// Several names share a register: dAqflg reuses D12 (dAp1p0) and dApflg
;// reuses D17 (dAq2q0), so each flag overwrites the difference it shares
;// a register with.
dAlpha DN D0.U8
dBeta DN D2.U8
dFilt DN D16.U8
dAqflg DN D12.U8
dApflg DN D17.U8
dAp0q0 DN D13.U8
dAp1p0 DN D12.U8
dAq1q0 DN D18.U8
dAp2p0 DN D19.U8
dAq2q0 DN D17.U8
;// bSLT4
;// Apart from dMask_0, dMask_1 and Mask_0, the names from here down to the
;// end of the alias list are not referenced by the function body in this
;// file except for the dP_xn / dQ_xn outputs; presumably they mirror the
;// register usage of the imported armVCM4P10_DeblockingLuma* helpers
;// (TODO confirm against those files).
dTC0 DN D18.U8
dTC1 DN D19.U8
dTC01 DN D18.U8
dTCs DN D31.S8
dTC DN D31.U8
dMask_0 DN D14.U8
dMask_1 DN D15.U8
Mask_0 RN 6
dTemp DN D19.U8
;// Computing P0,Q0
qDq0p0 QN Q10.S16
qDp1q1 QN Q11.S16
qDelta QN Q10.S16 ; reuse qDq0p0
dDelta DN D20.S8
;// Computing P1,Q1
dRp0q0 DN D24.U8
dMaxP DN D23.U8
dMinP DN D22.U8
dMaxQ DN D19.U8
dMinQ DN D21.U8
dDeltaP DN D26.U8
dDeltaQ DN D27.U8
qP_0n QN Q14.S16
qQ_0n QN Q12.S16
;// Filtered outputs read back by the transposes in the function body
dQ_0n DN D24.U8
dQ_1n DN D25.U8
dP_0n DN D29.U8
dP_1n DN D30.U8
;// bSGE4
qSp0q0 QN Q10.U16
qSp2q1 QN Q11.U16
qSp0q0p1 QN Q12.U16
qSp3p2 QN Q13.U16
dHSp0q1 DN D28.U8
qSq2p1 QN Q11.U16
qSp0q0q1 QN Q12.U16
qSq3q2 QN Q13.U16 ;!!
dHSq0p1 DN D28.U8 ;!!
qTemp1 QN Q11.U16 ;!!;qSp2q1
qTemp2 QN Q12.U16 ;!!;qSp0q0p1
dP_0t DN D28.U8 ;!!;dHSp0q1
dQ_0t DN D22.U8 ;!!;Temp1
;// Output aliases repeated for the bS >= 4 path (same registers as above,
;// plus dP_2n / dQ_2n).
dP_0n DN D29.U8
dP_1n DN D30.U8
dP_2n DN D31.U8
dQ_0n DN D24.U8 ;!!;Temp2
dQ_1n DN D25.U8 ;!!;Temp2
dQ_2n DN D28.U8 ;!!;dQ_0t
;// Function header
;//
;// omxVCM4P10_FilterDeblockingLuma_VerEdge_I
;//
;// In-place deblocking of the vertical edges of a luma block. The block is
;// processed as 2 passes (LoopY) of 8 rows, each pass visiting 4 edges
;// (LoopX) spaced 4 pixels apart.
;//
;// In:   r0        pSrcDst      pixel at the first edge, first row
;//       r1        srcdstStep   byte distance between rows
;//       r2        pAlpha       pAlpha[0] is used for the first edge of each
;//                              pass, pAlpha[1] for the remaining edges
;//       r3        pBeta        same indexing as pAlpha
;//       [stack]   ppThresholds pointer stepped 4 bytes per edge
;//       [stack]   ppBS         boundary strengths, stepped 4 bytes per edge;
;//                              a halfword (2 bS values) is read per edge
;// Out:  r0 = OMX_Sts_NoErr
;//
;// Register save/restore is delegated to M_START/M_END (armCOMM_s.h);
;// presumably "r11, d15" requests preservation of the callee-saved
;// registers up to r11 / d15 -- confirm against the macro definition.
;//
;// The actual filter arithmetic lives in the imported *_unsafe helpers. This
;// routine only loads/transposes the pixels, builds the filter masks
;// (dFilt, dApflg, dAqflg) and transposes/stores what the helpers leave in
;// the dP_xn / dQ_xn registers. The helpers presumably also consume
;// pThresholds, dMask_0 and dMask_1 -- confirm against their sources.
        M_START omxVCM4P10_FilterDeblockingLuma_VerEdge_I, r11, d15

        ;//Arguments on the stack
        M_ARG       ppThresholds, 4
        M_ARG       ppBS, 4

        ;// d0-dAlpha_0
        ;// d2-dBeta_0
        ADD         pAlpha_1, pAlpha_0, #1
        ADD         pBeta_1, pBeta_0, #1

        ;// Broadcast alpha[0] / beta[0] to all lanes for the first edge
        VLD1        {dAlpha[]}, [pAlpha_0]
        ;// Back up 4 pixels so each 8-byte row load is p3 p2 p1 p0 | q0 q1 q2 q3
        SUB         pSrcDst, pSrcDst, #4
        VLD1        {dBeta[]}, [pBeta_0]

        M_LDR       pBS, ppBS
        M_LDR       pThresholds, ppThresholds

        ;// Zero used to clear dFilt lanes whose bS is 0
        MOV         Mask_0,#0

        ;// dMask_0 is d14, dMask_1 is d15
        VMOV        dMask_0, #0
        VMOV        dMask_1, #1

        LDR         XY,=LOOP_COUNT

        ;// Even rows are addressed through pSrcDst and odd rows through pTmp,
        ;// both stepping two rows at a time
        ADD         pTmpStep, srcdstStep, srcdstStep

        ;// p0-p3 - d4-d7
        ;// q0-q3 - d8-d11
LoopY
LoopX
        ;// Two bS bytes for this edge: low byte -> rows 0-3, high byte -> rows 4-7
        LDRH        bS10, [pBS], #4
        CMP         bS10, #0
        BEQ         NoFilterBS0

        ;// Load 8 rows of data
        ;// (loads are interleaved with the first transpose steps)
        ADD         pTmp, pSrcDst, srcdstStep
        VLD1        dRow0, [pSrcDst], pTmpStep
        VLD1        dRow1, [pTmp], pTmpStep
        VLD1        dRow2, [pSrcDst], pTmpStep
        VZIP.8      dRow0, dRow1
        VLD1        dRow3, [pTmp], pTmpStep
        VLD1        dRow4, [pSrcDst], pTmpStep
        VZIP.8      dRow2, dRow3
        VLD1        dRow5, [pTmp], pTmpStep
        VLD1        dRow6, [pSrcDst], pTmpStep
        VLD1        dRow7, [pTmp], pTmpStep
        VZIP.8      dRow4, dRow5
        VZIP.16     dRow1, dRow3

        ;// As loaded (before any VZIP):
        ;// dRow0 = [q3r0 q2r0 q1r0 q0r0 p0r0 p1r0 p2r0 p3r0]
        ;// dRow1 = [q3r1 q2r1 q1r1 q0r1 p0r1 p1r1 p2r1 p3r1]
        ;// dRow2 = [q3r2 q2r2 q1r2 q0r2 p0r2 p1r2 p2r2 p3r2]
        ;// dRow3 = [q3r3 q2r3 q1r3 q0r3 p0r3 p1r3 p2r3 p3r3]
        ;// dRow4 = [q3r4 q2r4 q1r4 q0r4 p0r4 p1r4 p2r4 p3r4]
        ;// dRow5 = [q3r5 q2r5 q1r5 q0r5 p0r5 p1r5 p2r5 p3r5]
        ;// dRow6 = [q3r6 q2r6 q1r6 q0r6 p0r6 p1r6 p2r6 p3r6]
        ;// dRow7 = [q3r7 q2r7 q1r7 q0r7 p0r7 p1r7 p2r7 p3r7]

        ;// 8x8 Transpose
        VZIP.8      dRow6, dRow7
        ;// The four post-incremented loads moved pSrcDst down 8 rows; undo it
        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
        VZIP.16     dRow0, dRow2
        VZIP.16     dRow5, dRow7
        VZIP.16     dRow4, dRow6
        VZIP.32     dRow1, dRow5
        VZIP.32     dRow2, dRow6
        VZIP.32     dRow3, dRow7
        VZIP.32     dRow0, dRow4

        ;// dRow0 - dP_3, dRow1 - dQ_0, dRow2 - dP_1, dRow3 - dQ_2
        ;// dRow4 - dP_2, dRow5 - dQ_1, dRow6 - dP_0, dRow7 - dQ_3
        ;// dQ_0 = [q0r7 q0r6 q0r5 q0r4 q0r3 q0r2 q0r1 q0r0]
        ;// dQ_1 = [q1r7 q1r6 q1r5 q1r4 q1r3 q1r2 q1r1 q1r0]
        ;// dQ_2 = [q2r7 q2r6 q2r5 q2r4 q2r3 q2r2 q2r1 q2r0]
        ;// dQ_3 = [q3r7 q3r6 q3r5 q3r4 q3r3 q3r2 q3r1 q3r0]
        ;// dP_0 = [p0r7 p0r6 p0r5 p0r4 p0r3 p0r2 p0r1 p0r0]
        ;// dP_1 = [p1r7 p1r6 p1r5 p1r4 p1r3 p1r2 p1r1 p1r0]
        ;// dP_2 = [p2r7 p2r6 p2r5 p2r4 p2r3 p2r2 p2r1 p2r0]
        ;// dP_3 = [p3r7 p3r6 p3r5 p3r4 p3r3 p3r2 p3r1 p3r0]

        ;// Filtering decision, per row (lane):
        ;//   dFilt  = (alpha > |p0-q0|) & (beta > max(|p1-p0|, |q1-q0|)),
        ;//            forced to 0 for the 4-row half whose bS byte is 0
        ;//   dApflg = dFilt & (beta > |p2-p0|)
        ;//   dAqflg = dFilt & (beta > |q2-q0|)
        VABD        dAp0q0, dP_0, dQ_0
        VABD        dAp1p0, dP_1, dP_0
        VABD        dAq1q0, dQ_1, dQ_0
        VABD        dAp2p0, dP_2, dP_0
        TST         bS10, #0xff
        VCGT        dFilt, dAlpha, dAp0q0
        VMAX        dAp1p0, dAq1q0, dAp1p0
        VABD        dAq2q0, dQ_2, dQ_0
        ;// bS for rows 0-3 is 0 -> clear lanes 0-3 of dFilt
        VMOVEQ.U32  dFilt[0], Mask_0
        TST         bS10, #0xff00
        VCGT        dAp2p0, dBeta, dAp2p0
        VCGT        dAp1p0, dBeta, dAp1p0
        ;// bS for rows 4-7 is 0 -> clear lanes 4-7 of dFilt
        VMOVEQ.U32  dFilt[1], Mask_0
        VCGT        dAq2q0, dBeta, dAq2q0
        VAND        dFilt, dFilt, dAp1p0
        ;// Bit 2 of the low bS byte selects the bS >= 4 filter for all 8 rows
        TST         bS10, #4
        VAND        dAqflg, dFilt, dAq2q0
        VAND        dApflg, dFilt, dAp2p0
        BNE         bSGE4

bSLT4
        ;// bS < 4 Filtering
        BL          armVCM4P10_DeblockingLumabSLT4_unsafe

        ;// Transpose
        ;// p3, p2, q2, q3 are stored back from their input registers; p1, p0,
        ;// q0, q1 come from the dP_1n / dP_0n / dQ_0n / dQ_1n output registers
        VZIP.8      dP_3, dP_2
        VZIP.8      dP_1n, dP_0n
        VZIP.8      dQ_0n, dQ_1n
        VZIP.8      dQ_2, dQ_3
        VZIP.16     dP_3, dP_1n
        ADD         pTmp, pSrcDst, srcdstStep
        VZIP.16     dQ_0n, dQ_2
        VZIP.16     dQ_1n, dQ_3
        VZIP.16     dP_2, dP_0n
        VZIP.32     dP_3, dQ_0n
        VZIP.32     dP_1n, dQ_2
        VZIP.32     dP_2, dQ_1n
        VZIP.32     dP_0n, dQ_3

        ;// dRown0 - dP_3, dRown1 - dQ_0n
        ;// dRown2 - dP_1n, dRown3 - dQ_2
        ;// dRown4 - dP_2, dRown5 - dQ_1n
        ;// dRown6 - dP_0n, dRown7 - dQ_3
        VST1        dRown0, [pSrcDst], pTmpStep
        VST1        dRown1, [pTmp], pTmpStep
        VST1        dRown2, [pSrcDst], pTmpStep
        VST1        dRown3, [pTmp], pTmpStep
        VST1        dRown4, [pSrcDst], pTmpStep
        VST1        dRown5, [pTmp], pTmpStep
        ;// Advance the loop counter. C (and later Z) must survive until the
        ;// BCC / BNE below, so nothing after this may set flags.
        ADDS        XY, XY, XY
        VST1        dRown6, [pSrcDst], pTmpStep
        ;// Every path must advance pThresholds by 4 per edge, in step with
        ;// pBS: ExitLoopY rewinds both by 14 = 4 edges * 4 - 2.
        ADD         pThresholds, pThresholds, #4
        ;// Last store; pTmp is not used again before being recomputed
        VST1        dRown7, [pTmp], srcdstStep
        ;// Back to row 0, then 4 pixels right to the next edge
        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
        VLD1        {dAlpha[]}, [pAlpha_1]
        ADD         pSrcDst, pSrcDst, #4
        VLD1        {dBeta[]}, [pBeta_1]
        BCC         LoopX
        B           ExitLoopY

NoFilterBS0
        ;// Both bS values are 0: skip the edge but keep all pointers and the
        ;// loop counter in step with the filtering paths
        ADD         pSrcDst, pSrcDst, #4
        ADDS        XY, XY, XY
        VLD1        {dAlpha[]}, [pAlpha_1]
        ADD         pThresholds, pThresholds, #4
        VLD1        {dBeta[]}, [pBeta_1]
        BCC         LoopX
        B           ExitLoopY

bSGE4
        ;// bS >= 4 Filtering
        BL          armVCM4P10_DeblockingLumabSGE4_unsafe

        ;// Transpose
        ;// Here p2 and q2 are also taken from output registers (dP_2n, dQ_2n)
        VZIP.8      dP_3, dP_2n
        VZIP.8      dP_1n, dP_0n
        VZIP.8      dQ_0n, dQ_1n
        VZIP.8      dQ_2n, dQ_3
        VZIP.16     dP_3, dP_1n
        ADD         pTmp, pSrcDst, srcdstStep
        VZIP.16     dQ_0n, dQ_2n
        VZIP.16     dQ_1n, dQ_3
        VZIP.16     dP_2n, dP_0n
        VZIP.32     dP_3, dQ_0n
        VZIP.32     dP_1n, dQ_2n
        VZIP.32     dP_2n, dQ_1n
        VZIP.32     dP_0n, dQ_3

        ;// dRow0n - dP_3, dRow1n - dQ_0n, dRow2n - dP_1n, dRow3n - dQ_2n
        ;// dRow4n - dP_2n, dRow5n - dQ_1n, dRow6n - dP_0n, dRow7n - dQ_3
        VST1        dRow0n, [pSrcDst], pTmpStep
        VST1        dRow1n, [pTmp], pTmpStep
        VST1        dRow2n, [pSrcDst], pTmpStep
        VST1        dRow3n, [pTmp], pTmpStep
        VST1        dRow4n, [pSrcDst], pTmpStep
        VST1        dRow5n, [pTmp], pTmpStep
        ;// Flags from this ADDS feed the BCC below and the BNE at ExitLoopY
        ADDS        XY,XY,XY
        VST1        dRow6n, [pSrcDst], pTmpStep
        ADD         pThresholds, pThresholds, #4
        VST1        dRow7n, [pTmp], pTmpStep
        ;// Back to row 0, then 4 pixels right to the next edge
        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
        VLD1        {dAlpha[]}, [pAlpha_1]
        ADD         pSrcDst, pSrcDst, #4
        VLD1        {dBeta[]}, [pBeta_1]
        BCC         LoopX

ExitLoopY
        ;// Four edges moved pBS and pThresholds forward by 16; rewinding 14
        ;// leaves each 2 bytes past where this pass started, for the next pass
        SUB         pBS, pBS, #14
        SUB         pThresholds, pThresholds, #14
        ;// Back to the first edge, then down 8 rows
        SUB         pSrcDst, pSrcDst, #16
        VLD1        {dAlpha[]}, [pAlpha_0]
        ADD         pSrcDst, pSrcDst, srcdstStep, LSL #3
        VLD1        {dBeta[]}, [pBeta_0]
        ;// Z still comes from the last ADDS XY: set only once both counter
        ;// bits have been shifted out, i.e. after the second pass
        BNE         LoopY

        MOV         r0, #OMX_Sts_NoErr
        M_END
ENDIF
END