/* ------------------------------------------------------------------
* Copyright (C) 1998-2009 PacketVideo
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied.
* See the License for the specific language governing permissions
* and limitations under the License.
* -------------------------------------------------------------------
*/
/* Intentionally not using the gcc asm version, since it is
* slightly slower than the plain C version on modern GCC versions. */
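
/* This header implements the unaligned-offset cases of the 16x16 macroblock
 * SAD (sum of absolute differences) used by motion estimation: `ref` points
 * NUMBER bytes past a 32-bit word boundary, so each row is spliced together
 * from aligned word loads and shifts instead of unaligned byte loads.  dmin
 * is the best SAD found so far; each function drops out of its row loop as
 * soon as the running SAD exceeds it.  The including file is expected to
 * define NUMBER (1, 2 or 3) and SHIFT (== 8*NUMBER) before each inclusion;
 * a sketch of that presumed include pattern (header name assumed), with the
 * macro pairs taken from the branches below:
 *
 *     #define NUMBER 3
 *     #define SHIFT  24
 *     #include "sad_mb_offset.h"   // defines sad_mb_offset3
 *     #undef  NUMBER
 *     #undef  SHIFT
 *     #define NUMBER 2
 *     #define SHIFT  16
 *     #include "sad_mb_offset.h"   // defines sad_mb_offset2
 *     // ... and likewise for NUMBER == 1 / SHIFT == 8
 */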
#if !defined(__CC_ARM) /* Generic C version */
#if (NUMBER==3)
__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin)
#endif
{
    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

    x4 = x5 = 0;            /* packed SAD accumulators, see below */
    x6 = 0xFFFF00FF;        /* byte-lane mask; (x6 << 8) == 0xFF00FF00 */
    x9 = 0x80808080;        /* constant consumed by sad_4pixel */

    ref -= NUMBER;          /* back up to the word boundary (ref is NUMBER
                               bytes past it); equivalent to bic ref, ref, #3 */
    ref -= lx;
    blk -= 16;
    x8 = 16;                /* 16 rows per macroblock */
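
    /* Accumulation scheme: each sad_4pixel() result packs four byte-wise
     * absolute differences into one word.  x5 sums those packed words as
     * plain integers, while x4 separately sums the two high byte lanes
     * (bits 8-15 and 24-31) shifted down by 8.  The final reduction below
     * uses x4 to cancel the high lanes' contribution out of x5 exactly,
     * leaving clean per-lane totals.
     */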
#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
    /****** process 8 pixels ******/
    x10 = *((uint32*)(ref += lx));          /* D C B A */
    x11 = *((uint32*)(ref + 4));            /* H G F E */
    x12 = *((uint32*)(ref + 8));            /* L K J I */

    x10 = ((uint32)x10 >> SHIFT);           /* 0 0 0 D */
    x10 = x10 | (x11 << (32 - SHIFT));      /* G F E D */
    x11 = ((uint32)x11 >> SHIFT);           /* 0 0 0 H */
    x11 = x11 | (x12 << (32 - SHIFT));      /* K J I H */
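
    /* The shift/OR pair splices two aligned words into the four bytes that
     * start NUMBER bytes past the aligned address, i.e. a portable unaligned
     * 32-bit load.  The lane comments show the SHIFT == 24 (NUMBER == 3)
     * case; for SHIFT == 8 the first word already supplies three of the
     * four bytes.
     */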
    x12 = *((uint32*)(blk += 16));
    x14 = *((uint32*)(blk + 4));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10;                  /* accumulate low bytes */
    x10 = x10 & (x6 << 8);          /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);   /* accumulate high bytes */
    x5 = x5 + x11;                  /* accumulate low bytes */
    x11 = x11 & (x6 << 8);          /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);   /* accumulate high bytes */
    /****** process 8 pixels ******/
    x10 = *((uint32*)(ref + 8));            /* D C B A */
    x11 = *((uint32*)(ref + 12));           /* H G F E */
    x12 = *((uint32*)(ref + 16));           /* L K J I */

    x10 = ((uint32)x10 >> SHIFT);           /* 0 0 0 D */
    x10 = x10 | (x11 << (32 - SHIFT));      /* G F E D */
    x11 = ((uint32)x11 >> SHIFT);           /* 0 0 0 H */
    x11 = x11 | (x12 << (32 - SHIFT));      /* K J I H */
    x12 = *((uint32*)(blk + 8));
    x14 = *((uint32*)(blk + 12));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10;                  /* accumulate low bytes */
    x10 = x10 & (x6 << 8);          /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);   /* accumulate high bytes */
    x5 = x5 + x11;                  /* accumulate low bytes */
    x11 = x11 & (x6 << 8);          /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);   /* accumulate high bytes */
    /****************/
    x10 = x5 - (x4 << 8);           /* extract low bytes */
    x10 = x10 + x4;                 /* add with high bytes */
    x10 = x10 + (x10 << 16);        /* add with lower half word */
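
    /* Worked through: writing Si for the total of byte lane i across all
     * accumulated words, x5 == (S3<<24) + (S2<<16) + (S1<<8) + S0 and
     * x4 == (S3<<16) + S1, so:
     *     x5 - (x4 << 8)   == (S2<<16) + S0            high lanes cancelled
     *     ...  + x4        == ((S2+S3)<<16) + (S0+S1)
     *     ... + (x10<<16)  == top halfword holds S0+S1+S2+S3, the SAD
     * A full 16x16 SAD is at most 256*255 == 65280, so the 16-bit field
     * cannot overflow.
     */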
    if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin */
    {
        if (--x8)
        {
#if (NUMBER==3)
            goto LOOP_SAD3;
#elif (NUMBER==2)
            goto LOOP_SAD2;
#elif (NUMBER==1)
            goto LOOP_SAD1;
#endif
        }
    }

    return ((uint32)x10 >> 16);
}
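
/* For cross-checking the packed arithmetic, a straightforward scalar
 * equivalent (a hypothetical helper, not part of this file's API; it
 * ignores the dmin early-out and simply reads ref at its original,
 * possibly unaligned address):
 *
 *     static int32 sad_mb_scalar(uint8 *ref, uint8 *blk, int lx)
 *     {
 *         int32 sad = 0;
 *         int row, col;
 *         for (row = 0; row < 16; row++)
 *         {
 *             for (col = 0; col < 16; col++)
 *                 sad += (ref[col] >= blk[col]) ? ref[col] - blk[col]
 *                                               : blk[col] - ref[col];
 *             ref += lx;     // reference frame stride
 *             blk += 16;     // block rows are packed contiguously
 *         }
 *         return sad;
 *     }
 */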
#elif defined(__CC_ARM) /* only works with ARM v5 */
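
/* Unlike the generic C branch above, this variant receives the loop counter
 * x8 as an extra parameter and steps it with INC_X8 (presumably defined by
 * the including file), leaning on armcc inline assembly for the word
 * splicing and the loop branch. */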
#if (NUMBER==3)
__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
#endif
{
    int32 x4, x5, x6, x9, x10, x11, x12, x14;

    x9 = 0x80808080; /* const. */
    x4 = x5 = 0;

    __asm{
        MVN   x6, #0xff0000;            /* x6 = ~0x00FF0000 = 0xFF00FFFF */
#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
        BIC   ref, ref, #3;             /* word-align ref */
    }
    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 12));
    x12 = *((int32*)(ref + 16));
    x10 = *((int32*)(ref + 8));
    x14 = *((int32*)(blk + 12));

    __asm{
        MVN   x10, x10, lsr #SHIFT;
        BIC   x10, x10, x11, lsl #(32-SHIFT);
        MVN   x11, x11, lsr #SHIFT;
        BIC   x11, x11, x12, lsl #(32-SHIFT);

        LDR   x12, [blk, #8];
    }
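
    /* MVN+BIC is the same word splice as the generic C version, rewritten
     * via De Morgan: ~(a >> SHIFT) & ~(b << (32-SHIFT)) ==
     * ~((a >> SHIFT) | (b << (32-SHIFT))).  The spliced row bytes therefore
     * arrive bit-inverted, which is presumably what the sad_4pixelN()
     * variant (as opposed to sad_4pixel() above) is written to expect.
     */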
    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;
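
    /* sum_accumulate is presumably a macro (defined alongside sad_4pixelN)
     * performing the same x5/x4 dual accumulation written out explicitly
     * in the generic C branch above. */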
    __asm{
        /****** process 8 pixels ******/
        LDR   x11, [ref, #4];
        LDR   x12, [ref, #8];
        LDR   x10, [ref], lx;
        LDR   x14, [blk, #4];

        MVN   x10, x10, lsr #SHIFT;
        BIC   x10, x10, x11, lsl #(32-SHIFT);
        MVN   x11, x11, lsr #SHIFT;
        BIC   x11, x11, x12, lsl #(32-SHIFT);

        LDR   x12, [blk], #16;
    }
    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    /****************/
    x10 = x5 - (x4 << 8);           /* extract low bytes */
    x10 = x10 + x4;                 /* add with high bytes */
    x10 = x10 + (x10 << 16);        /* add with lower half word */
    __asm{
        RSBS  x11, dmin, x10, lsr #16;  /* compare (x10 >> 16) with dmin */
        ADDLSS x8, x8, #INC_X8;
#if (NUMBER==3)
        BLS   LOOP_SAD3;
#elif (NUMBER==2)
        BLS   LOOP_SAD2;
#elif (NUMBER==1)
        BLS   LOOP_SAD1;
#endif
    }

    return ((uint32)x10 >> 16);
}
#elif defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER; unreachable
    while the generic C branch above takes every !__CC_ARM build, kept for
    reference */
#if (NUMBER==3)
__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin)
#endif
{
    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

    x9 = 0x80808080; /* const. */
    x4 = x5 = 0;
    x8 = 16;         /* 16 rows per macroblock */

    __asm__ volatile("MVN %0, #0xFF0000": "=r"(x6)); /* x6 = 0xFF00FFFF */
#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
__asm__ volatile("BIC %0, %0, #3": "+r"(ref));
/****** process 8 pixels ******/
x11 = *((int32*)(ref + 12));
x12 = *((int32*)(ref + 16));
x10 = *((int32*)(ref + 8));
x14 = *((int32*)(blk + 12));
#if (SHIFT==8)
    __asm__ volatile(
        "MVN  %0, %0, lsr #8\n\t"
        "BIC  %0, %0, %1, lsl #24\n\t"
        "MVN  %1, %1, lsr #8\n\t"
        "BIC  %1, %1, %2, lsl #24"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#elif (SHIFT==16)
    __asm__ volatile(
        "MVN  %0, %0, lsr #16\n\t"
        "BIC  %0, %0, %1, lsl #16\n\t"
        "MVN  %1, %1, lsr #16\n\t"
        "BIC  %1, %1, %2, lsl #16"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#elif (SHIFT==24)
    __asm__ volatile(
        "MVN  %0, %0, lsr #24\n\t"
        "BIC  %0, %0, %1, lsl #8\n\t"
        "MVN  %1, %1, lsr #24\n\t"
        "BIC  %1, %1, %2, lsl #8"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#endif
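
    /* Three copies of the same MVN/BIC splice, one per SHIFT value,
     * presumably because the shift immediates have to be spelled as
     * literal constants inside the asm strings. */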
    x12 = *((int32*)(blk + 8));

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 4));
    x12 = *((int32*)(ref + 8));
    x10 = *((int32*)ref);
    ref += lx;
    x14 = *((int32*)(blk + 4));
#if (SHIFT==8)
    __asm__ volatile(
        "MVN  %0, %0, lsr #8\n\t"
        "BIC  %0, %0, %1, lsl #24\n\t"
        "MVN  %1, %1, lsr #8\n\t"
        "BIC  %1, %1, %2, lsl #24"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#elif (SHIFT==16)
    __asm__ volatile(
        "MVN  %0, %0, lsr #16\n\t"
        "BIC  %0, %0, %1, lsl #16\n\t"
        "MVN  %1, %1, lsr #16\n\t"
        "BIC  %1, %1, %2, lsl #16"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#elif (SHIFT==24)
    __asm__ volatile(
        "MVN  %0, %0, lsr #24\n\t"
        "BIC  %0, %0, %1, lsl #8\n\t"
        "MVN  %1, %1, lsr #24\n\t"
        "BIC  %1, %1, %2, lsl #8"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#endif
__asm__ volatile("LDR %0, [%1], #16": "=&r"(x12), "+r"(blk));
/* process x11 & x14 */
x11 = sad_4pixelN(x11, x14, x9);
/* process x12 & x10 */
x10 = sad_4pixelN(x10, x12, x9);
sum_accumulate;
/****************/
x10 = x5 - (x4 << 8); /* extract low bytes */
x10 = x10 + x4; /* add with high bytes */
x10 = x10 + (x10 << 16); /* add with lower half word */
    if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
    {
        if (--x8)
        {
#if (NUMBER==3)
            goto LOOP_SAD3;
#elif (NUMBER==2)
            goto LOOP_SAD2;
#elif (NUMBER==1)
            goto LOOP_SAD1;
#endif
        }
    }

    return ((uint32)x10 >> 16);
}
#endif