blob: 8ac6ff9bb3ada2ca427779e1c503e178b70b2e75 [file] [log] [blame]
;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;// http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name: omxVCM4P2_MCReconBlock_s.s
;// OpenMAX DL: v1.0.2
;// Revision: 9641
;// Date: Thursday, February 7, 2008
;//
;//
;//
;//
;// Description:
;//
;//
;// Include standard headers
INCLUDE omxtypes_s.h
INCLUDE armCOMM_s.h
;// Import symbols required from other files
M_VARIANTS ARM1136JS
;// ***************************************************************************
;// ARM1136JS implementation
;// ***************************************************************************
IF ARM1136JS
;// ***************************************************************************
;// MACRO DEFINITIONS
;// ***************************************************************************
;// Description:
;//
;// dest[j] = (x[j] + y[j] + round) >> 1,   j=0..3
;//
;// Similar to UHADD8 instruction, but with a rounding value of 1 added to
;// each sum before dividing by two, if round is 1
;//
;// Syntax:
;// M_UHADD8R   $dest, $x, $y, $round, $mask
;//
;// Inputs:
;// $x      four packed bytes,   x[3] : x[2] : x[1] : x[0]
;// $y      four packed bytes,   y[3] : y[2] : y[1] : y[0]
;// $round  0 if no rounding to be added, 1 if rounding to be done
;//         (assembly-time constant; selects which code is emitted)
;// $mask   some register set to 0x80808080
;//
;// Outputs:
;// $dest   four packed bytes,   z[3] : z[2] : z[1] : z[0]
;//
;// Implementation note: ARM1136 has UHADD8 (truncating halving add) but no
;// rounding variant, so the rounding case is synthesized per byte lane via
;//     (x + y + 1) >> 1  =  ((y - (255 - x)) >> 1) ^ 0x80
;// since y - (255 - x) = x + y + 1 - 256, and the EOR with 0x80 adds back
;// the halved bias of 256 (i.e. +128, mod 256) in each lane.
        MACRO
        M_UHADD8R   $dest, $x, $y, $round, $mask
    IF $round = 1
        IF $dest /= $y
            MVN         $dest, $x               ;// dest = ~x = 255 - x (per byte)
            UHSUB8      $dest, $y, $dest        ;// dest = (y - (255 - x)) >> 1
            EOR         $dest, $dest, $mask     ;// flip bit 7 of each byte: +128
        ELSE
            ;// $dest aliases $y: swap operand roles so $y is still live
            ;// when UHSUB8 reads it
            MVN         $dest, $y
            UHSUB8      $dest, $x, $dest
            EOR         $dest, $dest, $mask
        ENDIF
    ELSE
        UHADD8      $dest, $x, $y               ;// plain truncating average
    ENDIF
        MEND
;// ***************************************************************************
;// Description:
;// Load 8 bytes from $pSrc (aligned or unaligned locations)
;//
;// Syntax:
;// M_LOAD_X    $pSrc, $srcStep, $out0, $out1, $scratch, $offset
;//
;// Inputs:
;// $pSrc       4 byte aligned source pointer to an address just less than
;//             or equal to the data location
;// $srcStep    The stride on source
;// $scratch    A scratch register, used internally for temp calculations
;// $offset     Difference of source data location to the source pointer
;//             Use when $offset != 0 (unaligned load)
;//
;// Outputs:
;// $pSrc       In case the macro accepts stride, it increments the pSrc by
;//             that value, else unchanged
;// $out0       four packed bytes,  z[3] : z[2] : z[1] : z[0]
;// $out1       four packed bytes,  z[7] : z[6] : z[5] : z[4]
;//
;// Note: {$out0, $out1, $scratch} should be registers with ascending
;// register numbering. In case offset is 0, $scratch is not modified.
        MACRO
        M_LOAD_X    $pSrc, $srcStep, $out0, $out1, $scratch, $offset
    IF $offset = 0
        LDM         $pSrc, {$out0, $out1}       ;// aligned: direct 8-byte load
        ADD         $pSrc, $pSrc, $srcStep      ;// advance to next row
    ELSE
        ;// Unaligned: load the 12 aligned bytes covering the 8 wanted ones,
        ;// then funnel-shift right by $offset bytes (little-endian layout)
        LDM         $pSrc, {$out0, $out1, $scratch}
        ADD         $pSrc, $pSrc, $srcStep
        MOV         $out0, $out0, LSR #8 * $offset
        ORR         $out0, $out0, $out1, LSL #(32 - 8 * ($offset))
        MOV         $out1, $out1, LSR #8 * $offset
        ORR         $out1, $out1, $scratch, LSL #(32 - 8 * ($offset))
    ENDIF
        MEND
;// ***************************************************************************
;// Description:
;// Loads three words for X interpolation, update pointer to next row. For
;// X interpolation, given a truncated-4byteAligned source pointer,
;// invariably three continuous words are required from there to get the
;// nine bytes from the source pointer for filtering.
;//
;// Syntax:
;// M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3
;//
;// Inputs:
;// $pSrc       4 byte aligned source pointer to an address just less than
;//             or equal to the data location
;//
;// $srcStep    The stride on source
;//
;// $offset     Difference of source data location to the source pointer
;//             Use when $offset != 0 (unaligned load)
;//
;// Outputs:
;// $pSrc       Incremented by $srcStep
;//
;// $word0, $word1, $word2, $word3
;//             Three of these are outputs based on the $offset parameter.
;//             The outputs are specifically generated to be processed by
;//             the M_EXT_XINT macro. Following is the illustration to show
;//             how the nine bytes are spanned for different offsets from
;//             notTruncatedForAlignmentSourcePointer.
;//
;//              ------------------------------------------------------
;//             | Offset | Aligned Ptr | word0 | word1 | word2 | word3 |
;//             |------------------------------------------------------|
;//             |   0    |      0      | 0123  | 4567  | 8xxx  |       |
;//             |   1    |     -1      | x012  | 3456  | 78xx  |       |
;//             |   2    |     -2      | xx01  | 2345  | 678x  |       |
;//             |   3    |     -3      | xxx0  |       | 1234  | 5678  |
;//              ------------------------------------------------------
;//
;//             where the numbering (0-8) is to designate the 9 bytes from
;//             start of a particular row. The illustration doesn't take in
;//             account the positioning of bytes with in the word and the
;//             macro combination with M_EXT_XINT will work only in little
;//             endian environs
;//
;// Note: {$word0, $word1, $word2, $word3} should be registers with ascending
;// register numbering
        MACRO
        M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3
    IF $offset /= 3
        ;// Offsets 0-2: the nine bytes live in the first three words
        LDM         $pSrc, {$word0, $word1, $word2}
    ELSE
        ;// Offset 3: skip $word1 so that bytes 1-8 land directly in
        ;// $word2/$word3 where M_EXT_XINT expects them (see table above);
        ;// LDM register lists are ascending, hence the dedicated 4th reg
        LDM         $pSrc, {$word0, $word2, $word3}
    ENDIF
        ADD         $pSrc, $pSrc, $srcStep      ;// advance to next row
        MEND
;// ***************************************************************************
;// Description:
;// Extract four registers of four pixels for X interpolation
;//
;// Syntax:
;// M_EXT_XINT  $offset, $word0, $word1, $word2, $word3
;//
;// Inputs:
;// $offset     Difference of source data location to the source pointer
;//             Use when $offset != 0 (unaligned load)
;//
;// $word0, $word1, $word2, $word3
;//             Three of these are inputs based on the $offset parameter.
;//             The inputs are specifically selected to be processed by
;//             the M_EXT_XINT macro.
;//
;//              ------------------------------------------------------
;//             | Offset | Aligned Ptr | word0 | word1 | word2 | word3 |
;//             |------------------------------------------------------|
;//             |   0    |      0      | 0123  | 4567  | 8xxx  | yyyy  |
;//             |   1    |     -1      | x012  | 3456  | 78xx  | yyyy  |
;//             |   2    |     -2      | xx01  | 2345  | 678x  | yyyy  |
;//             |   3    |     -3      | xxx0  | yyyy  | 1234  | 5678  |
;//              ------------------------------------------------------
;//
;// Outputs:
;// $word0, $word1, $word2, $word3
;//             Bytes from the original source pointer (not truncated for
;//             4 byte alignment) as shown in the table.
;//              -------------------------------
;//             | word0 | word1 | word2 | word3 |
;//             |-------------------------------|
;//             | 0123  | 4567  | 1234  | 5678  |
;//              -------------------------------
;//
;// Note: {$word0, $word1, $word2, $word3} should be registers with ascending
;// register numbering
        MACRO
        M_EXT_XINT  $offset, $word0, $word1, $word2, $word3
    IF $offset = 0
        ; $word0 and $word1 are ok
        ; $word2, $word3 are just 8 shifted versions
        MOV         $word3, $word1, LSR #8          ;// word3 = bytes 5..7
        ORR         $word3, $word3, $word2, LSL #24 ;// ... plus byte 8 on top
        MOV         $word2, $word0, LSR #8          ;// word2 = bytes 1..3
        ORR         $word2, $word2, $word1, LSL #24 ;// ... plus byte 4 on top
    ELIF $offset = 3
        ; $word2 and $word3 are ok (taken care while loading itself)
        ; set $word0 & $word1
        MOV         $word0, $word0, LSR #24         ;// byte 0 from top of word0
        ORR         $word0, $word0, $word2, LSL #8  ;// bytes 1..3 on top
        MOV         $word1, $word2, LSR #24         ;// byte 4
        ORR         $word1, $word1, $word3, LSL #8  ;// bytes 5..7 on top
    ELSE
        ;// Offsets 1, 2: first funnel-shift bytes 0..7 into word0/word1
        MOV         $word0, $word0, LSR #8 * $offset
        ORR         $word0, $word0, $word1, LSL #(32 - 8 * ($offset))
        MOV         $word1, $word1, LSR #8 * $offset
        ORR         $word1, $word1, $word2, LSL #(32 - 8 * ($offset))
        ;// ... then build the 1-byte-shifted copies; $word2 still holds the
        ;// raw third load word, which supplies byte 8 (its overlapping lanes
        ;// carry the same byte values, so the ORR is harmless)
        MOV         $word3, $word1, LSR #8
        ORR         $word3, $word3, $word2, LSL #(32 - 8 * (($offset)+1))
        MOV         $word2, $word0, LSR #8
        ORR         $word2, $word2, $word1, LSL #24
    ENDIF
        MEND
;// ***************************************************************************
;// Description:
;// Computes the truncating half-sum and the bitwise xor of two packed-byte
;// inputs and returns them in the two input registers, in that order
;//
;// Syntax:
;// M_HSUM_XOR  $v0, $v1, $tmp
;//
;// Inputs:
;// $v0     a, first input
;// $v1     b, second input
;// $tmp    scratch register
;//
;// Outputs:
;// $v0     (a + b) / 2    per-byte truncating average
;// $v1     a ^ b          carries the discarded LSBs for M_AVG4
        MACRO
        M_HSUM_XOR  $v0, $v1, $tmp
        EOR         $tmp, $v0, $v1              ;// tmp = a ^ b
        UHADD8      $v0, $v0, $v1               ;// v0  = (a + b) / 2 per byte
        MOV         $v1, $tmp                   ;// v1  = a ^ b
        MEND
;// ***************************************************************************
;// Description:
;// Calculates average of 4 values (a,b,c,d) for HalfPixelXY predict type in
;// mcReconBlock module. Very specific to the implementation of
;// M_MCRECONBLOCK_HalfPixelXY done here. Uses "tmp" as scratch register and
;// "yMask" for mask variable "0x1010101x" set in it. In yMask 4 lsbs are
;// not significant and are used by the callee for row counter (y)
;//
;// Some points to note are:
;// 1. Input is pair of pair-averages and Xors
;// 2. $sum1 and $lsb1 are not modified and hence can be reused in another
;//    running average
;// 3. Output is in the first argument
;//
;// Syntax:
;// M_AVG4      $sum0, $lsb0, $sum1, $lsb1, $rndVal
;//
;// Inputs:
;// $sum0       (a + b) >> 1, where a and b are 1st and 2nd inputs to be averaged
;// $lsb0       (a ^ b)
;// $sum1       (c + d) >> 1. Not modified
;// $lsb1       (c ^ d) Not modified
;// $rndVal     Assembler Variable. 0 for rounding, 1 for no rounding
;//
;// Outputs:
;// $sum0       (a + b + c + d + 1) / 4 : If no rounding
;//             (a + b + c + d + 2) / 4 : If rounding
;//
;// Math note (per byte lane): with s0 = (a+b)>>1, s1 = (c+d)>>1 and the
;// xors carrying the lost LSBs (a+b = 2*s0 + (l0&1), etc.):
;//     a+b+c+d = 4*((s0+s1)>>1) + 2*((s0^s1)&1) + (l0&1) + (l1&1)
;// so the 4-way average is UHADD8(s0,s1) plus a 1-bit carry:
;//     rounding    (+2): carry = ((s0^s1) | (l0 & l1)) & 1  -> OP1=AND, OP2=ORR
;//     no rounding (+1): carry = ((s0^s1) & (l0 | l1)) & 1  -> OP1=ORR, OP2=AND
;// which is exactly what the OP1/OP2 selection below computes.
        MACRO
        M_AVG4      $sum0, $lsb0, $sum1, $lsb1, $rndVal
        LCLS OP1
        LCLS OP2
    IF $rndVal = 0                              ;// rounding case
OP1     SETS "AND"
OP2     SETS "ORR"
    ELSE                                        ;// Not rounding case
OP1     SETS "ORR"
OP2     SETS "AND"
    ENDIF
        ;// Symbolic aliases: $lsb0 is dead after OP1 and is recycled as the
        ;// half-sum register; the result overwrites $sum0
        LCLS lsb2
        LCLS sum2
        LCLS dest
lsb2    SETS "tmp"
sum2    SETS "$lsb0"
dest    SETS "$sum0"
        $OP1        $lsb0, $lsb0, $lsb1         ;// e0 = l0 <OP1> l1
        EOR         $lsb2, $sum0, $sum1         ;// e2 = s0 ^ s1
        $OP2        $lsb2, $lsb2, $lsb0         ;// e2 = e2 <OP2> e0
        AND         $lsb2, $lsb2, yMask, LSR # 4 ;// keep bit 0 per lane; yMask>>4 = 0x01010101 (low-nibble counter <= 8 shifts out)
        UHADD8      $sum2, $sum0, $sum1         ;// s2 = (s0 + s1)/2
        UADD8       $dest, $sum2, $lsb2         ;// dest = s2 + carry (cannot overflow a lane)
        MEND
;// ***************************************************************************
;// Motion compensation handler macros
;// ***************************************************************************
;// Description:
;// Implement motion compensation routines using the named registers in
;// callee function. Each of the following 4 implement the 4 predict type
;// Each handles 8 cases each ie all the combinations of 4 types of source
;// alignment offsets and 2 types of rounding flag
;//
;// Syntax:
;// M_MCRECONBLOCK_IntegerPixel $rndVal, $offset
;// M_MCRECONBLOCK_HalfPixelX   $rndVal, $offset
;// M_MCRECONBLOCK_HalfPixelY   $rndVal, $offset
;// M_MCRECONBLOCK_HalfPixelXY  $rndVal, $offset
;//
;// Inputs:
;// $rndVal     Assembler Variable. 0 for rounding, 1 for no rounding
;// $offset     $pSrc MOD 4 value. Offset from 4 byte aligned location.
;//
;// Outputs:
;// Outputs come in the named registers of the callee functions
;// The macro loads the data from the source pointer, processes it and
;// stores in the destination pointer. Does the whole prediction cycle
;// of Motion Compensation routine for a particular predictType
;// After this only residue addition to the predicted values remain
        MACRO
        M_MCRECONBLOCK_IntegerPixel $rndVal, $offset
;// Algorithmic Description:
;// This handles motion compensation for IntegerPixel predictType. Both
;// rounding cases are handled by the same code base. It is just a copy
;// from source to destination. Two lines are done per loop to reduce
;// stalls. Loop has been software pipelined as well for that purpose.
;//
;// M_LOAD_X loads a whole row in two registers and then they are stored
;//
;// NOTE(review): due to the software pipelining, the loads at the bottom of
;// the loop also execute on the final iteration, reading up to two rows
;// past the 8 rows actually copied - assumes that memory is readable
;// (typical for bordered reference frames); confirm against callers.
CaseIntegerPixelRnd0Offset$offset
CaseIntegerPixelRnd1Offset$offset               ;// rounding is irrelevant for a plain copy
        M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp3, $offset
        M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset
YloopIntegerPixelOffset$offset
        SUBS        y, y, #2                    ;// two rows per iteration
        STRD        tmp1, tmp2, [pDst], dstStep
        STRD        tmp3, tmp4, [pDst], dstStep
        M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp3, $offset
        M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset
        BGT         YloopIntegerPixelOffset$offset
        B           SwitchPredictTypeEnd
        MEND
;// ***************************************************************************
        MACRO
        M_MCRECONBLOCK_HalfPixelX $rndVal, $offset
;// Algorithmic Description:
;// This handles motion compensation for HalfPixelX predictType. The two
;// rounding cases are handled by the different code base and spanned by
;// different macro calls. Loop has been software pipelined to reduce
;// stalls.
;//
;// Filtering involves averaging a pixel with the next horizontal pixel.
;// M_LOAD_XINT and M_EXT_XINT combination generate 4 registers, 2 with
;// all pixels in a row with 4 pixel in each register and another 2
;// registers with pixels corresponding to one horizontally shifted pixel
;// corresponding to the initial row pixels. These are set of packed
;// registers appropriate to do 4 lane SIMD.
;// After that M_UHADD8R macro does the averaging taking care of the
;// rounding as required
CaseHalfPixelXRnd$rndVal.Offset$offset
    IF $rndVal = 0
        LDR         mask, =0x80808080           ;// only the rounding path of M_UHADD8R needs the mask
    ENDIF
        M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4
YloopHalfPixelXRnd$rndVal.Offset$offset
        SUBS        y, y, #1                    ;// one row per iteration
        M_EXT_XINT  $offset, tmp1, tmp2, tmp3, tmp4 ;// row (tmp1:tmp2) and its 1-right copy (tmp3:tmp4)
        M_UHADD8R   tmp5, tmp1, tmp3, (1-$rndVal), mask ;// average left 4 pixels
        M_UHADD8R   tmp6, tmp2, tmp4, (1-$rndVal), mask ;// average right 4 pixels
        STRD        tmp5, tmp6, [pDst], dstStep
        ;// pipelined pre-load; also runs on the last pass (reads 1 row past the block)
        M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4
        BGT         YloopHalfPixelXRnd$rndVal.Offset$offset
        B           SwitchPredictTypeEnd
        MEND
;// ***************************************************************************
        MACRO
        M_MCRECONBLOCK_HalfPixelY $rndVal, $offset
;// Algorithmic Description:
;// This handles motion compensation for HalfPixelY predictType. The two
;// rounding cases are handled by the different code base and spanned by
;// different macro calls. PreLoading is used to avoid reload of same data.
;//
;// Filtering involves averaging a pixel with the next vertical pixel.
;// M_LOAD_X generates 2 registers with all pixels in a row with 4 pixel in
;// each register. These are set of packed registers appropriate to do
;// 4 lane SIMD. After that M_UHADD8R macro does the averaging taking care
;// of the rounding as required. 9 rows are loaded in total to produce the
;// 8 output rows; each loaded row is used in two vertical averages.
CaseHalfPixelYRnd$rndVal.Offset$offset
    IF $rndVal = 0
        LDR         mask, =0x80808080           ;// only the rounding path of M_UHADD8R needs the mask
    ENDIF
        M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp5, $offset ;// Pre-load row 0
YloopHalfPixelYRnd$rndVal.Offset$offset
        SUBS        y, y, #2                    ;// two rows per iteration
        ;// Processing one line
        M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset
        M_UHADD8R   tmp1, tmp1, tmp3, (1-$rndVal), mask
        M_UHADD8R   tmp2, tmp2, tmp4, (1-$rndVal), mask
        STRD        tmp1, tmp2, [pDst], dstStep
        ;// Processing another line (tmp3:tmp4 reused as the upper row)
        M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp5, $offset
        M_UHADD8R   tmp3, tmp3, tmp1, (1-$rndVal), mask
        M_UHADD8R   tmp4, tmp4, tmp2, (1-$rndVal), mask
        STRD        tmp3, tmp4, [pDst], dstStep
        BGT         YloopHalfPixelYRnd$rndVal.Offset$offset
        B           SwitchPredictTypeEnd
        MEND
;// ***************************************************************************
        MACRO
        M_MCRECONBLOCK_HalfPixelXY $rndVal, $offset
;// Algorithmic Description:
;// This handles motion compensation for HalfPixelXY predictType. The two
;// rounding cases are handled by the different code base and spanned by
;// different macro calls. PreLoading is used to avoid reload of same data.
;//
;// Filtering involves averaging a pixel with the next vertical, horizontal
;// and right-down diagonal pixels. Just as in HalfPixelX case, M_LOAD_XINT
;// and M_EXT_XINT combination generates 4 registers with a row and its
;// 1 pixel right shifted version, with 4 pixels in one register. Another
;// call of that macro-combination gets another row. Then M_HSUM_XOR is
;// called to get mutual half-sum and xor combinations of a row with its
;// shifted version as they are inputs to the M_AVG4 macro which computes
;// the 4 element average with rounding. Note that it is the half-sum/xor
;// values that are preserved for next row as they can be re-used in the
;// next call to the M_AVG4 and saves recomputation.
;// Due to lack of register, the row counter and a masking value required
;// in M_AVG4 are packed into a single register yMask where the last nibble
;// holds the row counter values and rest holds the masking variable left
;// shifted by 4
CaseHalfPixelXYRnd$rndVal.Offset$offset
        LDR         yMask, =((0x01010101 << 4) + 8) ;// mask in bits 4+, row counter (8) in low nibble
        M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b'
        M_EXT_XINT  $offset, t00, t01, t10, t11
        M_HSUM_XOR  t00, t10, tmp               ;// s0, l0
        M_HSUM_XOR  t01, t11, tmp               ;// s0', l0'
YloopHalfPixelXYRnd$rndVal.Offset$offset
        ;// Processing one line
        ;// t00, t01, t10, t11 required from previous loop
        M_LOAD_XINT pSrc, srcStep, $offset, t20, t21, t30, t31 ;// Load c, c', d, d'
        SUB         yMask, yMask, #2            ;// row counter -= 2 (flags deliberately NOT set here)
        M_EXT_XINT  $offset, t20, t21, t30, t31
        M_HSUM_XOR  t20, t30, tmp               ;// s1, l1
        M_HSUM_XOR  t21, t31, tmp               ;// s1', l1'
        M_AVG4      t00, t10, t20, t30, $rndVal ;// s0, l0, s1, l1
        M_AVG4      t01, t11, t21, t31, $rndVal ;// s0', l0', s1', l1'
        STRD        t00, t01, [pDst], dstStep   ;// store the average
        ;// Processing another line
        ;// t20, t21, t30, t31 required from above
        M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b'
        TST         yMask, #7                   ;// counter runs 6,4,2,0; Z=1 (loop exit) at 0
        M_EXT_XINT  $offset, t00, t01, t10, t11
        M_HSUM_XOR  t00, t10, tmp
        M_HSUM_XOR  t01, t11, tmp
        M_AVG4      t20, t30, t00, t10, $rndVal
        M_AVG4      t21, t31, t01, t11, $rndVal
        STRD        t20, t21, [pDst], dstStep
        ;// NOTE(review): BGT requires N==V in addition to Z==0; TST clears N
        ;// but leaves V from earlier code - relies on V being clear; confirm
        BGT         YloopHalfPixelXYRnd$rndVal.Offset$offset
    IF $offset/=3 :LOR: $rndVal/=1
        ;// The (offset=3, rndVal=1) expansion is emitted last, directly
        ;// before SwitchPredictTypeEnd, so its branch is omitted and it
        ;// simply falls through
        B           SwitchPredictTypeEnd
    ENDIF
        MEND
;// ***************************************************************************
;// Motion compensation handler macros end here
;// ***************************************************************************
;// Description:
;// Populates all 4 kinds of offsets "cases" for each predictType and rndVal
;// combination in the "switch" to prediction processing code segment
;//
;// Syntax:
;// M_CASE_OFFSET   $rnd, $predictType
;//
;// Inputs:
;// $rnd            0 for rounding, 1 for no rounding
;// $predictType    The prediction mode
;//
;// Outputs:
;// Populated list of "M_CASE"s for the "M_SWITCH" macro; the labels match
;// those emitted by the M_MCRECONBLOCK_<predictType> handler macros
        MACRO
        M_CASE_OFFSET $rnd, $predictType
        M_CASE      Case$predictType.Rnd$rnd.Offset0
        M_CASE      Case$predictType.Rnd$rnd.Offset1
        M_CASE      Case$predictType.Rnd$rnd.Offset2
        M_CASE      Case$predictType.Rnd$rnd.Offset3
        MEND
;// ***************************************************************************
;// Description:
;// Populates all 2 kinds of rounding "cases" for each predictType in the
;// "switch" to prediction processing code segment
;//
;// Syntax:
;// M_CASE_MCRECONBLOCK $predictType
;//
;// Inputs:
;// $predictType    The prediction mode
;//
;// Outputs:
;// Populated list of "M_CASE_OFFSET" macros
        MACRO
        M_CASE_MCRECONBLOCK $predictType
        M_CASE_OFFSET 0, $predictType           ;// 0 for rounding
        M_CASE_OFFSET 1, $predictType           ;// 1 for no rounding
        MEND
;// ***************************************************************************
;// Description:
;// Populates all 8 kinds of rounding and offset combinations handling macros
;// for the specified predictType. In case of "IntegerPixel" predictType,
;// rounding is not required so same code segment handles both cases
;//
;// Syntax:
;// M_MCRECONBLOCK $predictType
;//
;// Inputs:
;// $predictType    The prediction mode
;//
;// Outputs:
;// Populated list of "M_MCRECONBLOCK_<predictType>" macros for specified
;// predictType. Each
;//     M_MCRECONBLOCK_<predictType> $rnd, $offset
;// is an code segment (starting with a label indicating the predictType,
;// rounding and offset combination)
;// Four calls of this macro with the 4 prediction modes populate all the 32
;// handlers
        MACRO
        M_MCRECONBLOCK $predictType
        M_MCRECONBLOCK_$predictType 0, 0
        M_MCRECONBLOCK_$predictType 0, 1
        M_MCRECONBLOCK_$predictType 0, 2
        M_MCRECONBLOCK_$predictType 0, 3
    IF "$predictType" /= "IntegerPixel" ;// If not IntegerPixel then rounding makes a difference
        M_MCRECONBLOCK_$predictType 1, 0
        M_MCRECONBLOCK_$predictType 1, 1
        M_MCRECONBLOCK_$predictType 1, 2
        M_MCRECONBLOCK_$predictType 1, 3
    ENDIF
        MEND
;// ***************************************************************************
;// Input/Output Registers
pSrc            RN 0    ;// source pointer argument
srcStep         RN 1    ;// source stride argument
arg_pSrcResidue RN 2    ;// pSrcResidue as it arrives in r2 (saved to stack immediately)
pSrcResidue     RN 12   ;// pSrcResidue reloaded for the residue-add pass
pDst            RN 3    ;// destination pointer argument (saved to stack)
dstStep         RN 2    ;// destination stride (stack argument; reuses r2 after the save)
predictType     RN 10
rndVal          RN 11
mask            RN 11   ;// 0x80808080 for M_UHADD8R (r11 free once rndVal is folded into predictType)
;// Local Scratch Registers
zero            RN 12   ;// NOTE(review): not referenced anywhere in this implementation
y               RN 14   ;// row counter
tmp1            RN 4
tmp2            RN 5
tmp3            RN 6
tmp4            RN 7
tmp5            RN 8
tmp6            RN 9
tmp7            RN 10
tmp8            RN 11
tmp9            RN 12
t00             RN 4    ;// t<pair><shift>: packed pixel words for HalfPixelXY
t01             RN 5
t10             RN 6
t11             RN 7
t20             RN 8
t21             RN 9
t30             RN 10
t31             RN 11
tmp             RN 12
yMask           RN 14   ;// 0x01010101 << 4 with the row counter packed in the low nibble
dst             RN 1    ;// srcStep's register recycled during the residue-add pass
return          RN 0
;// Allocate memory on stack
        M_ALLOC4    Stk_pDst, 4
        M_ALLOC4    Stk_pSrcResidue, 4
;// Function header
        M_START     omxVCM4P2_MCReconBlock, r11
;// Define stack arguments
        M_ARG       Arg_dstStep, 4
        M_ARG       Arg_predictType, 4
        M_ARG       Arg_rndVal, 4
;// Save on stack
        M_STR       pDst, Stk_pDst              ;// r3 is recycled as scratch below
        M_STR       arg_pSrcResidue, Stk_pSrcResidue ;// must precede the dstStep load that overwrites r2
;// Load argument from the stack
        M_LDR       dstStep, Arg_dstStep
        M_LDR       predictType, Arg_predictType
        M_LDR       rndVal, Arg_rndVal
        MOV         y, #8                       ;// 8 rows per block
        AND         tmp1, pSrc, #3              ;// byte offset of pSrc from 4-byte alignment
        ;// Build switch index: (predictType << 3) | (rndVal << 2) | (pSrc & 3)
        ORR         predictType, tmp1, predictType, LSL #3
        ORR         predictType, predictType, rndVal, LSL #2
;// Truncating source pointer to align to 4 byte location
        BIC         pSrc, pSrc, #3
;// Implementation takes care of all combinations of different
;// predictTypes, rounding cases and source pointer offsets to alignment
;// of 4 bytes in different code bases unless one of these parameters
;// wasn't making any difference to the implementation. Below
;// M_CASE_MCRECONBLOCK macros branch into 8 M_CASE macros for all
;// combinations of the 2 rounding cases and 4 offsets of the pSrc pointer
;// to the 4 byte alignment.
        M_SWITCH    predictType
        M_CASE_MCRECONBLOCK IntegerPixel
        M_CASE_MCRECONBLOCK HalfPixelX
        M_CASE_MCRECONBLOCK HalfPixelY
        M_CASE_MCRECONBLOCK HalfPixelXY
        M_ENDSWITCH
;// The M_MCRECONBLOCK macros populate the code bases by calling all 8
;// particular macros (4 in case of IntegerPixel as rounding makes no
;// difference there) to generate the code for all cases of rounding and
;// offsets. LTORG is used to segment the code as code size bloated beyond
;// 4KB.
        M_MCRECONBLOCK IntegerPixel
        M_MCRECONBLOCK HalfPixelX
        LTORG                                   ;// dump literal pool so LDR =const stays within reach
        M_MCRECONBLOCK HalfPixelY
        M_MCRECONBLOCK HalfPixelXY
SwitchPredictTypeEnd
;// Residue Addition
;// This is done in 2 lane SIMD though loads are further optimized and
;// 4 bytes are loaded in case of destination buffer. Algorithmic
;// details are in inlined comments
        M_LDR       pSrcResidue, Stk_pSrcResidue
        CMP         pSrcResidue, #0             ;// NULL residue pointer => prediction only
        BEQ         pSrcResidueConditionEnd
pSrcResidueNotNull
        M_LDR       pDst, Stk_pDst
        MOV         y, #8
        SUB         dstStep, dstStep, #4        ;// first STR of each row post-increments pDst by 4
Yloop_pSrcResidueNotNull
        SUBS        y, y, #1
        ;// First 4 pixels of the row
        LDR         dst, [pDst]                 ;// dst = [dcba] (4 packed u8 pixels)
        LDMIA       pSrcResidue!, {tmp1, tmp2}  ;// 4 x S16 residues: tmp1=[BA] tmp2=[DC]
        PKHBT       tmp3, tmp1, tmp2, LSL #16   ;// DeltaVal1 = [C A]
        PKHTB       tmp4, tmp2, tmp1, ASR #16   ;// DeltaVal2 = [D B]
        UXTB16      tmp1, dst                   ;// tmp1 = [0c0a]
        UXTB16      tmp2, dst, ROR #8           ;// tmp2 = [0d0b]
        QADD16      tmp1, tmp1, tmp3            ;// Add and saturate to 16 bits
        QADD16      tmp2, tmp2, tmp4
        USAT16      tmp1, #8, tmp1              ;// clip each halfword to [0, 255]
        USAT16      tmp2, #8, tmp2              ;// armClip(0, 255, tmp2)
        ORR         tmp1, tmp1, tmp2, LSL #8    ;// tmp1 = [dcba] repacked
        STR         tmp1, [pDst], #4
        ;// Second 4 pixels of the row (same scheme)
        LDR         dst, [pDst]
        LDMIA       pSrcResidue!, {tmp1, tmp2}
        PKHBT       tmp3, tmp1, tmp2, LSL #16
        PKHTB       tmp4, tmp2, tmp1, ASR #16
        UXTB16      tmp1, dst
        UXTB16      tmp2, dst, ROR #8
        QADD16      tmp1, tmp1, tmp3
        QADD16      tmp2, tmp2, tmp4
        USAT16      tmp1, #8, tmp1
        USAT16      tmp2, #8, tmp2
        ORR         tmp1, tmp1, tmp2, LSL #8
        STR         tmp1, [pDst], dstStep
        BGT         Yloop_pSrcResidueNotNull
pSrcResidueConditionEnd
        MOV         return, #OMX_Sts_NoErr
        M_END
ENDIF ;// ARM1136JS
;// ***************************************************************************
;// CortexA8 implementation
;// ***************************************************************************
END
;// ***************************************************************************
;// omxVCM4P2_MCReconBlock ends
;// ***************************************************************************