;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;// http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name: armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision: 12290
;// Date: Wednesday, April 9, 2008
;//
;//
;//
;//
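;// Description (summary of what the code below computes):
;// Half-pel "diagonal" (horizontal + vertical) luma interpolation for a
;// 4x4 block. A horizontal pass applies the 6-tap filter (1,-5,20,20,-5,1)
;// to 9 source rows, keeping four 16-bit intermediates per row
;// (dRes0-dRes8); a vertical pass then applies the same filter down those
;// rows in 32-bit accumulators. The result is rounded, shifted right by 10
;// and saturated to 8 bits. The four output rows are left in d0, d2, d4
;// and d6 for the caller; pDst/dstStep are declared but no store is
;// performed in this routine.
;//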
INCLUDE omxtypes_s.h
INCLUDE armCOMM_s.h
EXPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
M_VARIANTS CortexA8
IF CortexA8
M_START armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe, r11
;// Declare input registers
pSrc RN 0
srcStep RN 1
pDst RN 2
dstStep RN 3
;// Declare Neon registers
dCoeff5 DN 30.S16
dCoeff20 DN 31.S16
qCoeff5 QN 14.S32
qCoeff20 QN 15.S32
qSrc01 QN 0.U8
dSrc0 DN 0.U8
dSrc1 DN 1.U8
dSrcb DN 4.U8
dSrcc DN 2.U8
dSrcd DN 3.U8
dSrce DN 5.U8
dSrcf DN 1.U8
qSrcb QN 2.S16
qSrcc QN 1.S16
dSrcB DN 4.S16
dSrcC DN 2.S16
qRes0 QN 5.S16
qRes1 QN 6.S16
qRes2 QN 7.S16
qRes3 QN 8.S16
qRes4 QN 9.S16
qRes5 QN 10.S16
qRes6 QN 11.S16
qRes7 QN 12.S16
qRes8 QN 13.S16
dRes0 DN 10.S16
dRes1 DN 12.S16
dRes2 DN 14.S16
dRes3 DN 16.S16
dRes4 DN 18.S16
dRes5 DN 20.S16
dRes6 DN 22.S16
dRes7 DN 24.S16
dRes8 DN 26.S16
qAcc01 QN 5.S32
qAcc23 QN 6.S32
qAcc45 QN 2.S32
qAcc67 QN 3.S32
qSumBE QN 0.S32
qSumCD QN 1.S32
dTempAcc0 DN 0.U16
dTempAcc1 DN 2.U16
dTempAcc2 DN 4.U16
dTempAcc3 DN 6.U16
qTAcc0 QN 0.U16
qTAcc1 QN 1.U16
qTAcc2 QN 2.U16
qTAcc3 QN 3.U16
dAcc0 DN 0.U8
dAcc1 DN 2.U8
dAcc2 DN 4.U8
dAcc3 DN 6.U8
dTmp0 DN 8.S16
dTmp1 DN 9.S16
qTmp0 QN 4.S32
VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..]
VMOV dCoeff20, #20
VMOV dCoeff5, #5
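;// Horizontal pass: each of the 9 rows is filtered as
;// (a+f) + 20*(c+d) - 5*(b+e). The 5*(b+e) term is formed with a separate
;// VMUL into dTmp0 and subtracted one step later (the commented-out VMLS
;// shows the direct form); presumably the split avoids a multiply-accumulate
;// pipeline stall on Cortex-A8 by separating the multiply from the
;// dependent subtraction.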
;// Row0
VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..]
VEXT dSrcc, dSrc0, dSrc1, #2
VEXT dSrcd, dSrc0, dSrc1, #3
VEXT dSrce, dSrc0, dSrc1, #4
VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..]
VADDL qSrcc, dSrcc, dSrcd ;// c+d
VADDL qSrcb, dSrcb, dSrce ;// b+e
VADDL qRes0, dSrc0, dSrcf ;// Acc=a+f
VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..]
VMLA dRes0, dSrcC, dCoeff20 ;// Acc += 20*(c+d)
; VMLS dRes0, dSrcB, dCoeff5 ;// Acc -= 5*(b+e)
VMUL dTmp0, dSrcB, dCoeff5 ;// Tmp = 5*(b+e), subtracted below
;// Row1
VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..]
VEXT dSrcc, dSrc0, dSrc1, #2
VEXT dSrcd, dSrc0, dSrc1, #3
VEXT dSrce, dSrc0, dSrc1, #4
VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..]
VADDL qSrcc, dSrcc, dSrcd ;// c+d
VADDL qSrcb, dSrcb, dSrce ;// b+e
VADDL qRes1, dSrc0, dSrcf ;// Acc=a+f
VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..]
VSUB dRes0, dRes0, dTmp0 ;// Acc -= 5*(b+e) (deferred from Row0)
VMLA dRes1, dSrcC, dCoeff20 ;// Acc += 20*(c+d)
; VMLS dRes1, dSrcB, dCoeff5 ;// Acc -= 5*(b+e)
VMUL dTmp0, dSrcB, dCoeff5 ;// Tmp = 5*(b+e), subtracted below
;// Row2
VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..]
VEXT dSrcc, dSrc0, dSrc1, #2
VEXT dSrcd, dSrc0, dSrc1, #3
VEXT dSrce, dSrc0, dSrc1, #4
VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..]
VADDL qSrcc, dSrcc, dSrcd ;// c+d
VADDL qSrcb, dSrcb, dSrce ;// b+e
VADDL qRes2, dSrc0, dSrcf ;// Acc=a+f
VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..]
VSUB dRes1, dRes1, dTmp0 ;// Acc -= 5*(b+e) (deferred from Row1)
VMLA dRes2, dSrcC, dCoeff20 ;// Acc += 20*(c+d)
; VMLS dRes2, dSrcB, dCoeff5 ;// Acc -= 5*(b+e)
VMUL dTmp0, dSrcB, dCoeff5 ;// Tmp = 5*(b+e), subtracted below
;// Row3
VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..]
VEXT dSrcc, dSrc0, dSrc1, #2
VEXT dSrcd, dSrc0, dSrc1, #3
VEXT dSrce, dSrc0, dSrc1, #4
VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..]
VADDL qSrcc, dSrcc, dSrcd ;// c+d
VADDL qSrcb, dSrcb, dSrce ;// b+e
VADDL qRes3, dSrc0, dSrcf ;// Acc=a+f
VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..]
VSUB dRes2, dRes2, dTmp0 ;// Acc -= 5*(b+e) (deferred from Row2)
VMLA dRes3, dSrcC, dCoeff20 ;// Acc += 20*(c+d)
; VMLS dRes3, dSrcB, dCoeff5 ;// Acc -= 5*(b+e)
VMUL dTmp0, dSrcB, dCoeff5 ;// Tmp = 5*(b+e), subtracted below
;// Row4
VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..]
VEXT dSrcc, dSrc0, dSrc1, #2
VEXT dSrcd, dSrc0, dSrc1, #3
VEXT dSrce, dSrc0, dSrc1, #4
VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..]
VADDL qSrcc, dSrcc, dSrcd ;// c+d
VADDL qSrcb, dSrcb, dSrce ;// b+e
VADDL qRes4, dSrc0, dSrcf ;// Acc=a+f
VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..]
VSUB dRes3, dRes3, dTmp0 ;// Acc -= 5*(b+e) (deferred from Row3)
VMLA dRes4, dSrcC, dCoeff20 ;// Acc += 20*(c+d)
; VMLS dRes4, dSrcB, dCoeff5 ;// Acc -= 5*(b+e)
VMUL dTmp0, dSrcB, dCoeff5 ;// Tmp = 5*(b+e), subtracted below
;// Row5
VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..]
VEXT dSrcc, dSrc0, dSrc1, #2
VEXT dSrcd, dSrc0, dSrc1, #3
VEXT dSrce, dSrc0, dSrc1, #4
VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..]
VADDL qSrcc, dSrcc, dSrcd ;// c+d
VADDL qSrcb, dSrcb, dSrce ;// b+e
VADDL qRes5, dSrc0, dSrcf ;// Acc=a+f
VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..]
VSUB dRes4, dRes4, dTmp0 ;// Acc -= 5*(b+e) (deferred from Row4)
VMLA dRes5, dSrcC, dCoeff20 ;// Acc += 20*(c+d)
; VMLS dRes5, dSrcB, dCoeff5 ;// Acc -= 5*(b+e)
VMUL dTmp0, dSrcB, dCoeff5 ;// Tmp = 5*(b+e), subtracted below
;// Row6
VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..]
VEXT dSrcc, dSrc0, dSrc1, #2
VEXT dSrcd, dSrc0, dSrc1, #3
VEXT dSrce, dSrc0, dSrc1, #4
VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..]
VADDL qSrcc, dSrcc, dSrcd ;// c+d
VADDL qSrcb, dSrcb, dSrce ;// b+e
VADDL qRes6, dSrc0, dSrcf ;// Acc=a+f
VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..]
VSUB dRes5, dRes5, dTmp0 ;// Acc -= 5*(b+e) (deferred from Row5)
VMLA dRes6, dSrcC, dCoeff20 ;// Acc += 20*(c+d)
; VMLS dRes6, dSrcB, dCoeff5 ;// Acc -= 5*(b+e)
VMUL dTmp0, dSrcB, dCoeff5 ;// Tmp = 5*(b+e), subtracted below
;// Row7
VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..]
VEXT dSrcc, dSrc0, dSrc1, #2
VEXT dSrcd, dSrc0, dSrc1, #3
VEXT dSrce, dSrc0, dSrc1, #4
VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..]
VADDL qSrcc, dSrcc, dSrcd ;// c+d
VADDL qSrcb, dSrcb, dSrce ;// b+e
VADDL qRes7, dSrc0, dSrcf ;// Acc=a+f
VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..]
VSUB dRes6, dRes6, dTmp0 ;// Acc -= 5*(b+e) (deferred from Row6)
VMLA dRes7, dSrcC, dCoeff20 ;// Acc += 20*(c+d)
; VMLS dRes7, dSrcB, dCoeff5 ;// Acc -= 5*(b+e)
VMUL dTmp0, dSrcB, dCoeff5 ;// Tmp = 5*(b+e), subtracted below
;// Row8
VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..]
VEXT dSrcc, dSrc0, dSrc1, #2
VEXT dSrcd, dSrc0, dSrc1, #3
VEXT dSrce, dSrc0, dSrc1, #4
VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..]
VADDL qSrcc, dSrcc, dSrcd ;// c+d
VADDL qSrcb, dSrcb, dSrce ;// b+e
VADDL qRes8, dSrc0, dSrcf ;// Acc=a+f
VSUB dRes7, dRes7, dTmp0 ;// Acc -= 5*(b+e) (deferred from Row7)
VMLA dRes8, dSrcC, dCoeff20 ;// Acc += 20*(c+d)
; VMLS dRes8, dSrcB, dCoeff5 ;// Acc -= 5*(b+e)
VMUL dTmp0, dSrcB, dCoeff5 ;// Tmp = 5*(b+e), subtracted below
VMOV qCoeff20, #20
VMOV qCoeff5, #5
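;// Vertical pass: the same 6-tap filter is applied to the 16-bit row
;// results dRes0-dRes8, widening into 32-bit accumulators. Each of the
;// four blocks below combines six consecutive rows of intermediates into
;// four 32-bit results (one per column of the output); the 5*(b+e) term
;// is again multiplied separately and subtracted a step later.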
;// Col0
VADDL qAcc01, dRes0, dRes5 ;// Acc = a+f
VADDL qSumCD, dRes2, dRes3 ;// c+d
VADDL qSumBE, dRes1, dRes4 ;// b+e
VSUB dRes8, dRes8, dTmp0 ;// Acc -= 5*(b+e) (deferred from Row8)
VMLA qAcc01, qSumCD, qCoeff20 ;// Acc += 20*(c+d)
; VMLS qAcc01, qSumBE, qCoeff5 ;// Acc -= 5*(b+e)
VMUL qTmp0, qSumBE, qCoeff5 ;// Tmp = 5*(b+e), subtracted below
;// Col1
VADDL qAcc23, dRes1, dRes6 ;// Acc = a+f
VADDL qSumCD, dRes3, dRes4 ;// c+d
VADDL qSumBE, dRes2, dRes5 ;// b+e
VMLA qAcc23, qSumCD, qCoeff20 ;// Acc += 20*(c+d)
VSUB qAcc01, qAcc01, qTmp0 ;// Acc -= 5*(b+e) (deferred from Col0)
; VMLS qAcc23, qSumBE, qCoeff5 ;// Acc -= 5*(b+e)
VMUL qTmp0, qSumBE, qCoeff5 ;// Tmp = 5*(b+e), subtracted below
;// Col2
VADDL qAcc45, dRes2, dRes7 ;// Acc = a+f
VADDL qSumCD, dRes4, dRes5 ;// c+d
VADDL qSumBE, dRes3, dRes6 ;// b+e
VMLA qAcc45, qSumCD, qCoeff20 ;// Acc += 20*(c+d)
VSUB qAcc23, qAcc23, qTmp0 ;// Acc -= 5*(b+e) (deferred from Col1)
; VMLS qAcc45, qSumBE, qCoeff5 ;// Acc -= 5*(b+e)
VMUL qTmp0, qSumBE, qCoeff5 ;// Tmp = 5*(b+e), subtracted below
;// Col3
VADDL qAcc67, dRes3, dRes8 ;// Acc = a+f
VADDL qSumCD, dRes5, dRes6 ;// c+d
VADDL qSumBE, dRes4, dRes7 ;// b+e
VMLA qAcc67, qSumCD, qCoeff20 ;// Acc += 20*(c+d)
VSUB qAcc45, qAcc45, qTmp0 ;// Acc -= 5*(b+e) (deferred from Col2)
VMLS qAcc67, qSumBE, qCoeff5 ;// Acc -= 5*(b+e)
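;// The tap sum of the filter is 1-5+20+20-5+1 = 32, so the two passes
;// scale the result by 32*32 = 1024; VQRSHRUN therefore rounds and shifts
;// right by 10 with unsigned saturation to 16 bits, and VQMOVN narrows to
;// 8 bits with saturation.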
VQRSHRUN dTempAcc0, qAcc01, #10 ;// (Acc + 512) >> 10, saturate to U16
VQRSHRUN dTempAcc1, qAcc23, #10
VQRSHRUN dTempAcc2, qAcc45, #10
VQRSHRUN dTempAcc3, qAcc67, #10
VQMOVN dAcc0, qTAcc0 ;// Saturate and narrow to U8
VQMOVN dAcc1, qTAcc1
VQMOVN dAcc2, qTAcc2
VQMOVN dAcc3, qTAcc3
M_END
ENDIF
END