;// blob: bf2152c7a9e5f1d8b18c26817dc8704697bb5330 [file] [log] [blame]
;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;// http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name: omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s
;// OpenMAX DL: v1.0.2
;// Revision: 12290
;// Date: Wednesday, April 9, 2008
;//
;//
;//
;//
INCLUDE omxtypes_s.h
INCLUDE armCOMM_s.h
M_VARIANTS CortexA8
IF CortexA8
IMPORT armVCM4P10_DeblockingChromabSGE4_unsafe
IMPORT armVCM4P10_DeblockingChromabSLT4_unsafe
;// ---------------------------------------------------------------------
;// Constants
;// ---------------------------------------------------------------------
;// XY is initialised to LOOP_COUNT and doubled (ADDS XY, XY, XY) once per
;// edge: 0x40000000 -> 0x80000000 (non-zero, loop again) -> 0 (Z set,
;// stop). The edge loop body therefore runs exactly twice.
LOOP_COUNT EQU 0x40000000
;// Byte-replicated masks that are TSTed against the four packed bS bytes
;// held in bS3210:
;//   MASK_3 - non-zero result when any bS byte has bit 0 or bit 1 set
;//   MASK_4 - non-zero result when any bS byte has bit 2 set
MASK_3 EQU 0x03030303
MASK_4 EQU 0x04040404
;// ---------------------------------------------------------------------
;// Function arguments
;// r0-r3 arrive in registers; pThresholds and pBS are fetched from the
;// stack with M_LDR at function entry.
;// ---------------------------------------------------------------------
pSrcDst RN 0
srcdstStep RN 1
pAlpha RN 2
pBeta RN 3
pThresholds RN 5
pBS RN 4
;// Four bS bytes of the current edge, loaded from pBS as one 32-bit word
bS3210 RN 6
;// Scratch row pointers. r10 and r12 each carry two names:
;// pSrcDst_P / pTmp share r10, pSrcDst_Q / pTmp2 share r12.
pSrcDst_P RN 10
pSrcDst_Q RN 12
pTmp RN 10
pTmp2 RN 12
;// step = 2 * srcdstStep, used to walk even and odd rows with two
;// pointers. r14 is the link register, so a taken BL overwrites it; the
;// function recomputes step before the result stores.
step RN 14
;// Loop
;// Edge counter, see LOOP_COUNT
XY RN 7
;// Rows input
;// The eight 8-byte rows are loaded into these registers. The register
;// numbers are chosen so that the in-place VZIP transpose in the function
;// body leaves p0..p3 in d4..d7 and q0..q3 in d8..d11.
dRow0 DN D7.U8
dRow1 DN D8.U8
dRow2 DN D5.U8
dRow3 DN D10.U8
dRow4 DN D6.U8
dRow5 DN D9.U8
dRow6 DN D4.U8
dRow7 DN D11.U8
;// Pixels
;// Column vectors after the transpose; lane k holds the pixel of row k.
;// These names alias dRow6, dRow2, dRow4, dRow1, dRow5 and dRow3.
dP_0 DN D4.U8
dP_1 DN D5.U8
dP_2 DN D6.U8
dQ_0 DN D8.U8
dQ_1 DN D9.U8
dQ_2 DN D10.U8
;// Filtering Decision
;// dAlpha / dBeta hold one threshold byte replicated to all eight lanes.
dAlpha DN D0.U8
dBeta DN D2.U8
;// Per-pixel "filter this pixel" mask
dFilt DN D16.U8
;// dAqflg shares D12 with dAp1p0 and dApflg shares D17 with dAq2q0; each
;// flag is written only after the value it overlays has been consumed.
dAqflg DN D12.U8
dApflg DN D17.U8
;// Absolute differences |p0-q0|, |p1-p0|, |q1-q0|, |p2-p0|, |q2-q0|
dAp0q0 DN D13.U8
dAp1p0 DN D12.U8
dAq1q0 DN D18.U8
dAp2p0 DN D19.U8
dAq2q0 DN D17.U8
;// bS bytes widened to 16 bits. D26 and D27 are the two halves of Q13:
;// D26 is the bS input before widening, then the low half of the widened
;// result, and finally (as dFilt_bs) the "bS has bit 2 set" mask.
;// D27 is overwritten with the "bS > 0" mask.
qBS3210 QN Q13.U16
dBS3210 DN D26
dMask_bs DN D27
dFilt_bs DN D26.U16
;// bSLT4
;// dMask_0 and dMask_1 are initialised by the function but never read in
;// this file; dTemp (same register as dAp2p0) is not referenced in this
;// file at all. NOTE(review): presumably all three are consumed by the
;// imported bSLT4/bSGE4 helpers - confirm against their source.
dMask_0 DN D14.U8
dMask_1 DN D15.U8
;// 16-bit lanes, each set to 4; VTSTed against the widened bS values
dMask_4 DN D1.U16
;// Core registers holding MASK_4 / MASK_3
Mask_4 RN 8
Mask_3 RN 9
dTemp DN D19.U8
;// Result
;// dP_0t / dQ_0t are merged where dFilt_bs is set, dP_0n / dQ_0n are the
;// vectors finally stored. None of them is computed in this file before
;// being read. NOTE(review): presumably dP_0t/dQ_0t come from the bSGE4
;// helper and dP_0n/dQ_0n from the bSLT4 helper - confirm.
;// dP_0t shares D13 with dAp0q0.
dP_0t DN D13.U8
dQ_0t DN D31.U8
dP_0n DN D29.U8
dQ_0n DN D24.U8
;// ---------------------------------------------------------------------
;// omxVCM4P10_FilterDeblockingChroma_VerEdge_I
;//
;// In-place deblocking of two vertical edges, 8 rows each.
;// Per edge (one pass of LoopY) the code:
;//   1. loads 8 rows x 8 bytes starting 4 bytes left of the edge,
;//   2. transposes them so every pixel column becomes one D register,
;//   3. builds the per-pixel filter mask from alpha, beta and bS,
;//   4. calls the bS<4 and/or bS==4 helper depending on the bS bytes,
;//   5. writes back only the p0 and q0 columns, one byte per row.
;// After each pass pSrcDst moves 4 bytes right and pBS moves 8 bytes on.
;//
;// Arguments
;//   r0    pSrcDst     - first byte right of the first edge, row 0
;//   r1    srcdstStep  - distance in bytes between consecutive rows
;//   r2    pAlpha      - alpha threshold bytes ([0] is used for pass 1)
;//   r3    pBeta       - beta threshold bytes  ([0] is used for pass 1)
;//   stack pThresholds - passed on in r5 for the bS<4 helper
;//   stack pBS         - 4 bS bytes are read per edge
;// Returns
;//   r0 = OMX_Sts_NoErr on both exit paths.
;//
;// NOTE(review): "r12, d15" presumably tells M_START which callee-saved
;// core/NEON registers to preserve - see armCOMM_s.h.
;// ---------------------------------------------------------------------
M_START omxVCM4P10_FilterDeblockingChroma_VerEdge_I, r12, d15
;//Arguments on the stack
M_ARG ppThresholds, 4
M_ARG ppBS, 4
;// d0-dAlpha_0
;// d2-dBeta_0
;// Load the first alpha/beta byte into every lane. The "!" writeback
;// advances pAlpha/pBeta by one byte, so they now address the second
;// entry, which is what NoFilterBS0 later reloads from.
;// (original note: load alpha1,beta1 somewhere to avoid more loads)
VLD1 {dAlpha[]}, [pAlpha]!
;// Back up 4 bytes so each row load covers p3..p0 q0..q3
SUB pSrcDst, pSrcDst, #4
VLD1 {dBeta[]}, [pBeta]!
M_LDR pBS, ppBS
M_LDR pThresholds, ppThresholds
LDR Mask_4, =MASK_4
LDR Mask_3, =MASK_3
;// dMask_0 - d14
;// dMask_1 - d15
;// dMask_4 - d1
VMOV dMask_0, #0
VMOV dMask_1, #1
VMOV dMask_4, #4
;// Two passes: XY is doubled per pass and becomes zero after the second
LDR XY, =LOOP_COUNT
;// p0-p3 - d4-d7
;// q0-q3 - d8-d11
LoopY
;// Fetch the four bS bytes of this edge; pBS skips 8 bytes per edge
LDR bS3210, [pBS], #8
;// pSrcDst walks rows 0,2,4,6 and pTmp rows 1,3,5,7, both with a stride
;// of two rows
ADD pTmp, pSrcDst, srcdstStep
ADD step, srcdstStep, srcdstStep
;1
VLD1 dRow0, [pSrcDst], step
;1
VLD1 dRow1, [pTmp], step
VLD1 dRow2, [pSrcDst], step
VLD1 dRow3, [pTmp], step
VLD1 dRow4, [pSrcDst], step
VLD1 dRow5, [pTmp], step
VLD1 dRow6, [pSrcDst], step
VLD1 dRow7, [pTmp], step
;// pSrcDst is now 8 rows below where it started.
;// Row layout, written from lane 7 (highest address) down to lane 0:
;// dRow0 = [q3r0 q2r0 q1r0 q0r0 p0r0 p1r0 p2r0 p3r0]
;// dRow1 = [q3r1 q2r1 q1r1 q0r1 p0r1 p1r1 p2r1 p3r1]
;// dRow2 = [q3r2 q2r2 q1r2 q0r2 p0r2 p1r2 p2r2 p3r2]
;// dRow3 = [q3r3 q2r3 q1r3 q0r3 p0r3 p1r3 p2r3 p3r3]
;// dRow4 = [q3r4 q2r4 q1r4 q0r4 p0r4 p1r4 p2r4 p3r4]
;// dRow5 = [q3r5 q2r5 q1r5 q0r5 p0r5 p1r5 p2r5 p3r5]
;// dRow6 = [q3r6 q2r6 q1r6 q0r6 p0r6 p1r6 p2r6 p3r6]
;// dRow7 = [q3r7 q2r7 q1r7 q0r7 p0r7 p1r7 p2r7 p3r7]
;// 8x8 Transpose
;// Three in-place interleave stages (8, 16, 32 bit). Afterwards
;// d4=p0 d5=p1 d6=p2 d7=p3 and d8=q0 d9=q1 d10=q2 d11=q3, with lane k
;// holding row k.
VZIP.8 dRow0, dRow1
VZIP.8 dRow2, dRow3
VZIP.8 dRow4, dRow5
VZIP.8 dRow6, dRow7
VZIP.16 dRow0, dRow2
VZIP.16 dRow1, dRow3
VZIP.16 dRow4, dRow6
VZIP.16 dRow5, dRow7
VZIP.32 dRow0, dRow4
VZIP.32 dRow2, dRow6
VZIP.32 dRow3, dRow7
VZIP.32 dRow1, dRow5
;// Skip the whole edge when all four bS bytes are zero. The two VABDs
;// sit between CMP and BEQ; they do not touch the condition flags.
CMP bS3210, #0
VABD dAp2p0, dP_2, dP_0
VABD dAp0q0, dP_0, dQ_0
BEQ NoFilterBS0
;// Filter mask:
;//   dFilt = (alpha > |p0-q0|) AND (bS > 0)
;//           AND (beta > max(|p1-p0|, |q1-q0|))
VABD dAp1p0, dP_1, dP_0
VABD dAq1q0, dQ_1, dQ_0
VMOV.U32 dBS3210[0], bS3210
VCGT dFilt, dAlpha, dAp0q0
VMAX dAp1p0, dAq1q0, dAp1p0
;// Widen each bS byte to 16 bits so that one bS value lines up with two
;// 8-bit pixel lanes (two rows)
VMOVL qBS3210, dBS3210.U8
VABD dAq2q0, dQ_2, dQ_0
VCGT dMask_bs.S16, dBS3210.S16, #0
VCGT dAp1p0, dBeta, dAp1p0
VCGT dAp2p0, dBeta, dAp2p0
VAND dFilt, dMask_bs.U8
;// Flags for the BLNE below: NE when any bS byte has bit 0 or 1 set.
;// The NEON instructions in between leave the flags untouched.
TST bS3210, Mask_3
VCGT dAq2q0, dBeta, dAq2q0
VAND dFilt, dFilt, dAp1p0
;// dAqflg = dFilt AND (beta > |q2-q0|), dApflg = dFilt AND (beta > |p2-p0|)
;// Neither is read again in this file.
;// NOTE(review): presumably inputs to the helpers - confirm.
VAND dAqflg, dFilt, dAq2q0
VAND dApflg, dFilt, dAp2p0
;// bS < 4 Filtering
BLNE armVCM4P10_DeblockingChromabSLT4_unsafe
;// Flags are set afresh here, so the second call does not depend on what
;// the first helper did to them
TST bS3210, Mask_4
;// Rewind pSrcDst by the 8 rows consumed by the loads above
SUB pSrcDst, pSrcDst, srcdstStep, LSL #3
;// dFilt_bs lanes become all-ones where the widened bS has bit 2 set
VTST dFilt_bs, dFilt_bs, dMask_4
;// bS == 4 Filtering
BLNE armVCM4P10_DeblockingChromabSGE4_unsafe
;// Where dFilt_bs is set, take dP_0t/dQ_0t instead of dP_0n/dQ_0n
VBIT dP_0n, dP_0t, dFilt_bs
VBIT dQ_0n, dQ_0t, dFilt_bs
;// Result Storage
;// Only the p0 column (byte offset 3) and the q0 column (byte offset 4)
;// are written back, one byte per row.
ADD pSrcDst_P, pSrcDst, #3
;// Where dFilt is clear, restore the original unfiltered pixel
VBIF dP_0n, dP_0, dFilt
ADD pTmp2, pSrcDst_P, srcdstStep
;// step (r14) is rebuilt here because a taken BL above overwrote it
ADD step, srcdstStep, srcdstStep
VBIF dQ_0n, dQ_0, dFilt
;// Advance the edge counter. Z is consumed by the BNE after the stores;
;// the VST1s and plain ADDs in between do not change the flags.
ADDS XY, XY, XY
VST1 {dP_0n[0]}, [pSrcDst_P], step
VST1 {dP_0n[1]}, [pTmp2], step
VST1 {dP_0n[2]}, [pSrcDst_P], step
VST1 {dP_0n[3]}, [pTmp2], step
VST1 {dP_0n[4]}, [pSrcDst_P], step
VST1 {dP_0n[5]}, [pTmp2], step
VST1 {dP_0n[6]}, [pSrcDst_P], step
VST1 {dP_0n[7]}, [pTmp2], step
ADD pSrcDst_Q, pSrcDst, #4
ADD pTmp, pSrcDst_Q, srcdstStep
VST1 {dQ_0n[0]}, [pSrcDst_Q], step
VST1 {dQ_0n[1]}, [pTmp], step
VST1 {dQ_0n[2]}, [pSrcDst_Q], step
VST1 {dQ_0n[3]}, [pTmp], step
VST1 {dQ_0n[4]}, [pSrcDst_Q], step
VST1 {dQ_0n[5]}, [pTmp], step
VST1 {dQ_0n[6]}, [pSrcDst_Q], step
VST1 {dQ_0n[7]}, [pTmp], step
;// Move 4 bytes right to the next edge
;// NOTE(review): unlike NoFilterBS0, this path neither reloads
;// dAlpha/dBeta from the advanced pAlpha/pBeta nor advances pThresholds
;// in this file. Presumably the _unsafe helpers take care of both -
;// confirm against their source; otherwise the second edge would be
;// filtered with the first edge's alpha/beta.
ADD pSrcDst, pSrcDst, #4
BNE LoopY
MOV r0, #OMX_Sts_NoErr
M_EXIT
;// Edge with all bS == 0: nothing is written. Set up the next pass.
NoFilterBS0
;// Reload alpha/beta from the pointers that were post-incremented at
;// function entry (second entry of each array)
VLD1 {dAlpha[]}, [pAlpha]
;// Net effect of the next two instructions: back to row 0, 4 bytes right
ADD pSrcDst, pSrcDst, #4
SUB pSrcDst, pSrcDst, srcdstStep, LSL #3
ADDS XY, XY, XY
VLD1 {dBeta[]}, [pBeta]
;// Step pThresholds 4 bytes forward past this edge
ADD pThresholds, pThresholds, #4
BNE LoopY
MOV r0, #OMX_Sts_NoErr
M_END
ENDIF
END