;// blob: 3bc9534011f6383526fc8b5a9586bdc03dda01b4 [file] [log] [blame]
;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;// http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name: armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision: 12290
;// Date: Wednesday, April 9, 2008
;//
;//
;//
;//
INCLUDE omxtypes_s.h
INCLUDE armCOMM_s.h
EXPORT armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
M_VARIANTS CortexA8
IF CortexA8
;//-----------------------------------------------------------------------
;// armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
;//
;// Purpose:
;//   Applies a 6-tap filter with coefficients (1, -5, 20, 20, -5, 1)
;//   vertically over nine source rows, then the same 6-tap filter
;//   horizontally over the 16-bit vertical sums, and finally rounds,
;//   shifts right by 10 and saturates to 8 bits. Four rows of four
;//   results are produced.
;//
;// Inputs:
;//   pSrc    (r0) - address of the first of nine rows that are loaded;
;//                  16 bytes are loaded from each row.
;//   srcStep (r1) - byte offset added to move from one row to the next.
;//   pDst    (r2) - declared below but never referenced in this routine.
;//   dstStep (r3) - declared below but never referenced in this routine.
;//
;// Outputs:
;//   dAcc0, dAcc1, dAcc2, dAcc3 (d0, d2, d4, d6) - one output row each;
;//   the four results of a row are in U8 lanes 0..3. Lanes 4..7 are
;//   narrowed from the odd D register of the same Q register, which the
;//   final VQRSHRUN does not write, so they are not filter results.
;//   Nothing is stored to memory by this routine.
;//
;// Registers written:
;//   pSrc (advanced by 5*srcStep), r12 (advanced to row 9),
;//   d0-d29 and d30/d31 (coefficients).
;//   NOTE(review): d8-d15 are written here. Whether M_START/M_END
;//   preserve them, and what the "r11" argument means, is defined in
;//   armCOMM_s.h which is not visible from this file - confirm with the
;//   callers' expectations.
;//-----------------------------------------------------------------------
M_START armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe, r11
;// Declare input registers
pSrc RN 0
srcStep RN 1
pDst RN 2
dstStep RN 3
;// Declare Neon registers
;// d30/d31 hold the filter coefficients. They are first used as U8
;// vectors for the vertical pass (dTCoeff*) and are later re-initialised
;// as S16 vectors for the horizontal pass (dCoeff*).
dTCoeff5 DN 30.U8
dTCoeff20 DN 31.U8
dCoeff5 DN 30.S16
dCoeff20 DN 31.S16
;// Source rows A..I: one Q register (16 bytes) per row, Q0..Q8.
qSrcA01 QN 0.U8
qSrcB23 QN 1.U8
qSrcC45 QN 2.U8
qSrcD67 QN 3.U8
qSrcE89 QN 4.U8
qSrcF1011 QN 5.U8
qSrcG1213 QN 6.U8
qSrcH1415 QN 7.U8
qSrcI1617 QN 8.U8
;// Low halves of the source rows (columns 0..7).
dSrcA0 DN 0.U8
dSrcB2 DN 2.U8
dSrcC4 DN 4.U8
dSrcD6 DN 6.U8
dSrcE8 DN 8.U8
dSrcF10 DN 10.U8
dSrcG12 DN 12.U8
dSrcH14 DN 14.U8
dSrcI16 DN 16.U8
;// High halves of the source rows (columns 8..15).
dSrcA1 DN 1.U8
dSrcB3 DN 3.U8
dSrcC5 DN 5.U8
dSrcD7 DN 7.U8
dSrcE9 DN 9.U8
dSrcF11 DN 11.U8
dSrcG13 DN 13.U8
dSrcH15 DN 15.U8
dSrcI17 DN 17.U8
;// 16-bit vertical filter sums for output rows P, Q, R, S.
;// The *01 registers (Q9..Q12) hold columns 0..7.
qTempP01 QN 9.S16
qTempQ01 QN 10.S16
qTempR01 QN 11.S16
qTempS01 QN 12.S16
;// The *23 registers hold columns 8..15 and alias Q0..Q3, i.e. the same
;// physical registers as source rows A..D. Each one is first written only
;// after the last read of the source row it replaces, so the instruction
;// order in the body must be kept.
qTempP23 QN 0.S16
qTempQ23 QN 1.S16
qTempR23 QN 2.S16
qTempS23 QN 3.S16
;// D-register views of the vertical sums: x0 = columns 0..3,
;// x1 = columns 4..7, x2 = columns 8..11.
dTempP0 DN 18.S16
dTempP1 DN 19.S16
dTempP2 DN 0.S16
dTempQ0 DN 20.S16
dTempQ1 DN 21.S16
dTempQ2 DN 2.S16
dTempR0 DN 22.S16
dTempR1 DN 23.S16
dTempR2 DN 4.S16
dTempS0 DN 24.S16
dTempS1 DN 25.S16
dTempS2 DN 6.S16
;// Scratch registers for the shifted windows used by the horizontal pass.
dTempB0 DN 26.S16
dTempC0 DN 27.S16
dTempD0 DN 28.S16
dTempF0 DN 29.S16
;// Results after the rounding shift (U16) and after the final narrow (U8).
dTempAcc0 DN 0.U16
dTempAcc1 DN 2.U16
dTempAcc2 DN 4.U16
dTempAcc3 DN 6.U16
dAcc0 DN 0.U8
dAcc1 DN 2.U8
dAcc2 DN 4.U8
dAcc3 DN 6.U8
;// 32-bit horizontal accumulators; these alias Q0..Q3 again.
qAcc0 QN 0.S32
qAcc1 QN 1.S32
qAcc2 QN 2.S32
qAcc3 QN 3.S32
;// U16 views of Q0..Q3 used as the source of the final VQMOVN.
qTAcc0 QN 0.U16
qTAcc1 QN 1.U16
qTAcc2 QN 2.U16
qTAcc3 QN 3.U16
;// qTmp is declared but not referenced in the body.
qTmp QN 4.S16
;// dTmp is d8, the same register as dSrcE8; it is first written after the
;// last read of dSrcE8.
dTmp DN 8.S16
;//-----------------------------------------------------------------------
;// Vertical pass, interleaved with the row loads.
;// With rows named A..I, the four 16-bit sums are:
;//   P = A + F + 20*(C + D) - 5*(B + E)
;//   Q = B + G + 20*(D + E) - 5*(C + F)
;//   R = C + H + 20*(E + F) - 5*(D + G)
;//   S = D + I + 20*(F + G) - 5*(E + H)
;// Each is computed separately for columns 0..7 (*01) and 8..15 (*23).
;// Rows A..E are loaded through pSrc; rows F..I through r12.
;//-----------------------------------------------------------------------
VLD1 qSrcA01, [pSrc], srcStep ;// [a0 a1 a2 a3 .. a15]
;// pSrc already points at row B here, so r12 = address of row F.
ADD r12, pSrc, srcStep, LSL #2
VMOV dTCoeff5, #5
VMOV dTCoeff20, #20
VLD1 qSrcF1011, [r12], srcStep
VLD1 qSrcB23, [pSrc], srcStep ;// [b0 b1 b2 b3 .. b15]
VLD1 qSrcG1213, [r12], srcStep
;// P01 = A + F (widened from 8 to 16 bits)
VADDL qTempP01, dSrcA0, dSrcF10
VLD1 qSrcC45, [pSrc], srcStep ;// [c0 c1 c2 c3 .. c15]
;// P23 = A + F for the high columns; this overwrites row A (Q0).
VADDL qTempP23, dSrcA1, dSrcF11
VLD1 qSrcD67, [pSrc], srcStep
;// Q01 = B + G
VADDL qTempQ01, dSrcB2, dSrcG12
VLD1 qSrcE89, [pSrc], srcStep
;//t0
VMLAL qTempP01, dSrcC4, dTCoeff20
VLD1 qSrcH1415, [r12], srcStep
VMLAL qTempP23, dSrcC5, dTCoeff20
VLD1 qSrcI1617, [r12], srcStep ;// [i0 i1 i2 i3 .. ]
VMLAL qTempP01, dSrcD6, dTCoeff20
VMLAL qTempQ01, dSrcD6, dTCoeff20
VMLSL qTempP23, dSrcB3, dTCoeff5
;// R01 = C + H
VADDL qTempR01, dSrcC4, dSrcH14
VMLSL qTempP01, dSrcB2, dTCoeff5
;// Q23 = B + G for the high columns; this overwrites row B (Q1), whose
;// last reads are the two VMLSL instructions above.
VADDL qTempQ23, dSrcB3, dSrcG13
VMLAL qTempP23, dSrcD7, dTCoeff20
VMLAL qTempQ01, dSrcE8, dTCoeff20
VMLSL qTempP01, dSrcE8, dTCoeff5
VMLAL qTempQ23, dSrcD7, dTCoeff20
VMLSL qTempP23, dSrcE9, dTCoeff5
;//t1
VMLAL qTempR01, dSrcE8, dTCoeff20
VMLSL qTempQ01, dSrcC4, dTCoeff5
VMLSL qTempQ23, dSrcC5, dTCoeff5
;// R23 = C + H for the high columns; this overwrites row C (Q2), whose
;// last reads are the two VMLSL instructions above.
VADDL qTempR23, dSrcC5, dSrcH15
VMLAL qTempR01, dSrcF10, dTCoeff20
VMLSL qTempQ01, dSrcF10, dTCoeff5
VMLAL qTempQ23, dSrcE9, dTCoeff20
VMLAL qTempR23, dSrcE9, dTCoeff20
;// S01 = D + I
VADDL qTempS01, dSrcD6, dSrcI16
VMLSL qTempR01, dSrcD6, dTCoeff5
VMLSL qTempQ23, dSrcF11, dTCoeff5
VMLSL qTempR23, dSrcD7, dTCoeff5
;//t2
;// S23 = D + I for the high columns; this overwrites row D (Q3), which
;// is not read again after this point.
VADDL qTempS23, dSrcD7, dSrcI17
VMLAL qTempS01, dSrcF10, dTCoeff20
VMLSL qTempR01, dSrcG12, dTCoeff5
VMLSL qTempR23, dSrcG13, dTCoeff5
VMLAL qTempS23, dSrcF11, dTCoeff20
VMLAL qTempS01, dSrcG12, dTCoeff20
;// Row P is complete: start building its horizontal windows.
;// dTempB0 = P columns 1..4
VEXT dTempB0, dTempP0, dTempP1, #1
VMLAL qTempR23, dSrcF11, dTCoeff20
;//t3
VMLAL qTempS23, dSrcG13, dTCoeff20
VMLSL qTempS01, dSrcE8, dTCoeff5
;// dTempC0 = P columns 2..5
VEXT dTempC0, dTempP0, dTempP1, #2
;// d31 becomes the S16 coefficient 20; the U8 view (dTCoeff20) is not
;// read again after this point.
VMOV dCoeff20, #20
VMLSL qTempS23, dSrcE9, dTCoeff5
VMLSL qTempS01, dSrcH14, dTCoeff5
;// dTempF0 = P columns 5..8, dTempD0 = P columns 3..6
VEXT dTempF0, dTempP1, dTempP2, #1
VEXT dTempD0, dTempP0, dTempP1, #3
VMLSL qTempS23, dSrcH15, dTCoeff5
;//-----------------------------------------------------------------------
;// Horizontal pass. For a row of vertical sums t[], each of the four
;// outputs i = 0..3 is accumulated in 32 bits as
;//   acc[i] = t[i] + t[i+5] + 20*(t[i+2] + t[i+3]) - 5*(t[i+1] + t[i+4])
;// The (t[i+2] + t[i+3]) and (t[i+1] + t[i+4]) pair sums are formed with
;// 16-bit VADD before the widening multiply.
;//-----------------------------------------------------------------------
;// acc0 = t[i] + t[i+5]; this overwrites Q0 (qTempP23), whose d0 half was
;// consumed by the VEXT into dTempF0 above.
VADDL qAcc0, dTempP0, dTempF0
VADD dTempC0, dTempC0, dTempD0
;//h
;// d30 becomes the S16 coefficient 5; the U8 view (dTCoeff5) is not read
;// again after this point.
VMOV dCoeff5, #5
;// res0
VADD dTempB0, dTempB0, dTempP1
VMLAL qAcc0, dTempC0, dCoeff20
;// Windows of row Q for the next result are prepared while res0 finishes.
VEXT dTempC0, dTempQ0, dTempQ1, #2
VEXT dTempD0, dTempQ0, dTempQ1, #3
VEXT dTempF0, dTempQ1, dTempQ2, #1
VMLSL qAcc0, dTempB0, dCoeff5
;// res1
VEXT dTempB0, dTempQ0, dTempQ1, #1
;// Overwrites Q1 (qTempQ23); dTempQ2 was consumed by the VEXT above.
VADDL qAcc1, dTempQ0, dTempF0
VADD dTempC0, dTempC0, dTempD0
VADD dTempB0, dTempB0, dTempQ1
;// Windows of row R.
VEXT dTempD0, dTempR0, dTempR1, #3
VMLAL qAcc1, dTempC0, dCoeff20
VEXT dTempF0, dTempR1, dTempR2, #1
VEXT dTempC0, dTempR0, dTempR1, #2
;// The one-column shift of row R goes to dTmp rather than dTempB0,
;// because dTempB0 is still needed by the VMLSL into qAcc1 below.
VEXT dTmp, dTempR0, dTempR1, #1
;// Overwrites Q2 (qTempR23); dTempR2 was consumed by the VEXT above.
VADDL qAcc2, dTempR0, dTempF0
VMLSL qAcc1, dTempB0, dCoeff5
; VEXT dTempB0, dTempR0, dTempR1, #1
VADD dTempC0, dTempC0, dTempD0
;// res2
VADD dTempB0, dTmp, dTempR1
VEXT dTempD0, dTempS0, dTempS1, #3
VMLAL qAcc2, dTempC0, dCoeff20
; VADD dTempB0, dTempB0, dTempR1
;// res3
VEXT dTempC0, dTempS0, dTempS1, #2
VEXT dTempF0, dTempS1, dTempS2, #1
VADD dTempC0, dTempC0, dTempD0
VEXT dTmp, dTempS0, dTempS1, #1
;// Overwrites Q3 (qTempS23); dTempS2 was consumed by the VEXT above.
VADDL qAcc3, dTempS0, dTempF0
VMLSL qAcc2, dTempB0, dCoeff5
VMLAL qAcc3, dTempC0, dCoeff20
VADD dTmp, dTmp, dTempS1
VMLSL qAcc3, dTmp, dCoeff5
;// Rounding shift right by 10 with unsigned saturation: four S32
;// accumulators become four U16 values in the even D register.
VQRSHRUN dTempAcc0, qAcc0, #10
VQRSHRUN dTempAcc1, qAcc1, #10
VQRSHRUN dTempAcc2, qAcc2, #10
VQRSHRUN dTempAcc3, qAcc3, #10
;// Saturating narrow from U16 to U8. Lanes 0..3 of each destination hold
;// the four results of the row; lanes 4..7 come from the odd D register,
;// which still holds the upper half of the 32-bit accumulator.
VQMOVN dAcc0, qTAcc0
VQMOVN dAcc1, qTAcc1
VQMOVN dAcc2, qTAcc2
VQMOVN dAcc3, qTAcc3
M_END
ENDIF
END