blob: 941ae5a074e03ef536eb1af49715ae8c828dfe9e [file] [log] [blame]
/* ------------------------------------------------------------------
* Copyright (C) 1998-2009 PacketVideo
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied.
* See the License for the specific language governing permissions
* and limitations under the License.
* -------------------------------------------------------------------
*/
#include "avcenc_lib.h"
/* 3/29/01 fast half-pel search based on neighboring guess */
/* value ranging from 0 to 4, high complexity (more accurate) to
low complexity (less accurate) */
#define HP_DISTANCE_TH 5 // 2 /* half-pel distance threshold */
#define PREF_16_VEC 129 /* 1MV bias versus 4MVs*/
/*
 * CLIP_RESULT(x): clamp the integer lvalue x in place to [0, 0xFF].
 * The unsigned compare is true both for x < 0 and for x > 0xFF.  Inside,
 * (x)>>31 is expected to be all ones for a negative x (assumes a 32-bit int
 * and an arithmetic right shift, which C leaves implementation-defined), so
 * the mask produces 0 for a negative x and 0xFF for a too-large one.
 * The macro expands to a bare if-statement; the call sites in this file do
 * not put a semicolon after it, so it cannot be wrapped in do { } while (0)
 * without touching every caller, and it must not be used as the unbraced
 * body of an if/else.
 */
#define CLIP_RESULT(x) if((uint)(x) > 0xFF){ \
(x) = 0xFF & (~((x)>>31));}
/*
 * CLIP_UPPER16(x): treat bits 16 and up of x as a filter sum, divide that
 * sum by 32 (>>5), clamp it to [0, 0xFF] and leave the result in bits 16..23
 * of x with every other bit cleared.
 * 0x20000000 is (0x100 * 32) << 16, the smallest sum whose >>5 result no
 * longer fits in 8 bits; such values become 0xFF0000.  Negative x also fails
 * the unsigned compare and becomes 0 through the same sign mask as in
 * CLIP_RESULT.  Like CLIP_RESULT this is a bare if/else statement.
 */
#define CLIP_UPPER16(x) if((uint)(x) >= 0x20000000){ \
(x) = 0xFF0000 & (~((x)>>31));} \
else { \
(x) = ((x)>>5)&0xFF0000; \
}
/*=====================================================================
    Function:   AVCFindHalfPelMB
    Date:       10/31/2007
    Purpose:    Refine a full-pel motion vector: first try the eight
                half-pel positions around it, then the eight quarter-pel
                positions around whichever half-pel candidate won.

    encvid      encoder state. Supplies subpel_pred, hpel_cand, bilin_base,
                qpel_cand, lambda_motion, mvbits and currYMB; receives
                best_hpel_pos and best_qpel_pos (-1 when no quarter-pel
                candidate beat the half-pel winner).
    cur         value passed in is ignored; replaced by encvid->currYMB.
    mot         in: full-pel MV. out: x/y moved by the winning offsets and
                sad set to the winning cost (SATD_MB result + MV cost).
    ncand       full-pel candidate handed to GenerateHalfPelPred.
    xpos, ypos, hp_guess    unused.
    cmvx, cmvy  forwarded to MV_COST_S together with each trial MV.

    Returns the SATD_MB() result of the winning position, i.e. the winning
    cost without its MV-cost term.
=====================================================================*/
int AVCFindHalfPelMB(AVCEncObject *encvid, uint8 *cur, AVCMV *mot, uint8 *ncand,
                     int xpos, int ypos, int hp_guess, int cmvx, int cmvy)
{
    /* MV offsets of the nine half-pel candidates; entry 0 is the full-pel centre */
    int xh[9] = {0, 0, 2, 2, 2, 0, -2, -2, -2};
    int yh[9] = {0, -2, -2, 0, 2, 2, 2, 0, -2};
    /* MV offsets of the eight quarter-pel candidates around the half-pel winner */
    int xq[8] = {0, 1, 1, 1, 0, -1, -1, -1};
    int yq[8] = { -1, -1, 0, 1, 1, 1, 0, -1};

    int lambda_motion = encvid->lambda_motion;
    /* not referenced by name below; presumably picked up by the MV_COST_S
       macro, so the identifier must stay "mvbits" -- confirm against the header */
    uint8 *mvbits = encvid->mvbits;
    uint8 *subpel_pred = (uint8*) encvid->subpel_pred;  /* sub-pel prediction planes */
    uint8 **hpel_cand = (uint8**) encvid->hpel_cand;    /* one pointer per half-pel candidate */
    int pitch = encvid->common->currPic->pitch;

    int best_cost;      /* lowest SATD_MB + MV cost seen so far */
    int best_satd;      /* SATD_MB part of best_cost */
    int best_h = 0;     /* index of the winning half-pel candidate */
    int best_q = -1;    /* index of the winning quarter-pel candidate, -1 = none */
    int trial, rate, k;

    OSCL_UNUSED_ARG(xpos);
    OSCL_UNUSED_ARG(ypos);
    OSCL_UNUSED_ARG(hp_guess);

    GenerateHalfPelPred(subpel_pred, ncand, pitch);

    cur = encvid->currYMB; // pre-load current original MB

    /* baseline: the full-pel position itself, with no early-exit threshold */
    best_satd = SATD_MB(hpel_cand[0], cur, 65535);
    rate = MV_COST_S(lambda_motion, mot->x, mot->y, cmvx, cmvy);
    best_cost = best_satd + rate;

    /* half-pel candidates; the running best cost is handed to SATD_MB each time */
    for (k = 1; k < 9; k++)
    {
        trial = SATD_MB(hpel_cand[k], cur, best_cost);
        rate = MV_COST_S(lambda_motion, mot->x + xh[k], mot->y + yh[k], cmvx, cmvy);
        if (trial + rate >= best_cost)
        {
            continue;
        }
        best_cost = trial + rate;
        best_satd = trial;
        best_h = k;
    }

    mot->sad = best_cost;
    mot->x += xh[best_h];
    mot->y += yh[best_h];
    encvid->best_hpel_pos = best_h;

    /*** search for quarter-pel ****/
    GenerateQuartPelPred(encvid->bilin_base[best_h], &(encvid->qpel_cand[0][0]), best_h);

    encvid->best_qpel_pos = -1;

    for (k = 0; k < 8; k++)
    {
        trial = SATD_MB(encvid->qpel_cand[k], cur, best_cost);
        rate = MV_COST_S(lambda_motion, mot->x + xq[k], mot->y + yq[k], cmvx, cmvy);
        if (trial + rate >= best_cost)
        {
            continue;
        }
        best_cost = trial + rate;
        best_satd = trial;
        best_q = k;
    }

    /* only touch the MV again when a quarter-pel candidate actually won */
    if (best_q >= 0)
    {
        mot->sad = best_cost;
        mot->x += xq[best_q];
        mot->y += yq[best_q];
        encvid->best_qpel_pos = best_q;
    }

    return best_satd;
}
/**
 * Generate the half-pel prediction planes around the full-pel candidate.
 *
 * Every plane written here uses a row pitch of 24 bytes.
 *
 *  - plane 0 (subpel_pred itself): a 24-wide, 22-tall copy of the reference
 *    picture whose top-left sample is ncand moved back by (-3,-3).
 *  - plane V0Q_H2Q: horizontally filtered samples, 17 per row, 18 rows.
 *  - plane V2Q_H2Q: samples filtered in both directions, 17 x 17.
 *  - plane V2Q_H0Q: vertically filtered samples, 18 per row, 17 rows.
 *
 * The plane indices and SUBPEL_PRED_BLK_SIZE are defined outside this file.
 * All filtering uses the 6-tap kernel (1, -5, 20, 20, -5, 1).  One-pass
 * results are rounded with (+16) >> 5, two-pass results with (+512) >> 10,
 * and both are clamped to [0, 255].
 *
 * @param subpel_pred  base of the sub-pel prediction planes (written).
 *                     Accessed through uint32 / uint16 pointers, so it is
 *                     presumably expected to be 4-byte aligned -- confirm.
 * @param ncand        full-pel candidate inside the reference picture; rows
 *                     -3..18 and columns -3..20 relative to it are read.
 * @param lx           pitch of the reference picture in bytes.
 */
void GenerateHalfPelPred(uint8* subpel_pred, uint8 *ncand, int lx)
{
    uint8 *ref;
    uint8 *dst;
    uint32 word;    /* four bytes assembled or packed with unsigned shifts */
    int32 tmp32;
    int16 tmp_horz[18*22], *dst_16, *src_16;
    int a = 0, b = 0, c = 0, d = 0, e = 0, f = 0; // temp
    int i, j;

    /* ---- 1. copy the full-pel window into plane 0 ---- */
    /* to be optimized later based on byte-offset load */
    ref = ncand - 3 - lx - (lx << 1); /* move back (-3,-3) */
    dst = subpel_pred;
    dst -= 4; /* offset: the store below pre-increments dst by 4 */

    for (j = 0; j < 22; j++) /* 24x22 */
    {
        i = 6;
        while (i > 0)
        {
            /* byte 0 lands in the low 8 bits.  The shifts are done on
               uint32: shifting a promoted (signed) byte >= 0x80 left by 24
               would overflow int, which is undefined behaviour. */
            word = (uint32)ref[0]
                   | ((uint32)ref[1] << 8)
                   | ((uint32)ref[2] << 16)
                   | ((uint32)ref[3] << 24);
            ref += 4;
            *((uint32*)(dst += 4)) = word;
            i--;
        }
        ref += (lx - 24);
    }

    /* ---- 2. horizontal filter, unrounded, for the rows that are only
               needed as vertical-filter support (rows 0, 1, 20, 21 of
               plane 0); results go to tmp_horz only ---- */
    ref = subpel_pred + 2;
    dst_16 = tmp_horz; /* 17 x 22 */

    for (j = 4; j > 0; j--)
    {
        for (i = 16; i > 0; i -= 4)
        {
            /* a..f slide along the row: three new taps are loaded per four outputs */
            a = ref[-2];
            b = ref[-1];
            c = ref[0];
            d = ref[1];
            e = ref[2];
            f = ref[3];
            *dst_16++ = a + f - 5 * (b + e) + 20 * (c + d);
            a = ref[4];
            *dst_16++ = b + a - 5 * (c + f) + 20 * (d + e);
            b = ref[5];
            *dst_16++ = c + b - 5 * (d + a) + 20 * (e + f);
            c = ref[6];
            *dst_16++ = d + c - 5 * (e + b) + 20 * (f + a);

            ref += 4;
        }
        /* do the 17th column here */
        d = ref[3];
        *dst_16 = e + d - 5 * (f + c) + 20 * (a + b);
        dst_16 += 2; /* stride for tmp_horz is 18 */
        ref += 8; /* stride for ref is 24 */
        if (j == 3) // after the second row, move 18 lines down
        {
            dst_16 += 324;//18*18;
            ref += 432;//18*24;
        }
    }

    /* ---- 3. horizontal filter for the 18 rows in between: unrounded value
               to tmp_horz, rounded and clamped value to plane V0Q_H2Q ---- */
    ref -= 480;//20*24;     back to row 2 of plane 0
    dst_16 -= 360;//20*18;  back to row 2 of tmp_horz
    dst = subpel_pred + V0Q_H2Q * SUBPEL_PRED_BLK_SIZE; /* 17 wide, 18 tall */

    for (j = 18; j > 0; j--)
    {
        for (i = 16; i > 0; i -= 4)
        {
            a = ref[-2];
            b = ref[-1];
            c = ref[0];
            d = ref[1];
            e = ref[2];
            f = ref[3];
            tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            a = ref[4];
            tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            b = ref[5];
            tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            c = ref[6];
            tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            ref += 4;
        }
        /* do the 17th column here */
        d = ref[3];
        tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
        *dst_16 = tmp32;
        tmp32 = (tmp32 + 16) >> 5;
        CLIP_RESULT(tmp32)
        *dst = tmp32;

        dst += 8; /* stride for dst is 24 */
        dst_16 += 2; /* stride for tmp_horz is 18 */
        ref += 8; /* stride for ref is 24 */
    }

    /* ---- 4. middle point: vertical filter over the unrounded horizontal
               results, walked column by column ---- */
    src_16 = tmp_horz; /* 17 x 22 */
    dst = subpel_pred + V2Q_H2Q * SUBPEL_PRED_BLK_SIZE; /* 17 x 17 */
    dst -= 24; // offset: every store pre-increments dst by one row

    for (i = 0; i < 17; i++)
    {
        for (j = 16; j > 0; j -= 4)
        {
            a = *src_16;
            b = *(src_16 += 18);
            c = *(src_16 += 18);
            d = *(src_16 += 18);
            e = *(src_16 += 18);
            f = *(src_16 += 18);

            tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
            tmp32 = (tmp32 + 512) >> 10;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            a = *(src_16 += 18);
            tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
            tmp32 = (tmp32 + 512) >> 10;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            b = *(src_16 += 18);
            tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
            tmp32 = (tmp32 + 512) >> 10;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            c = *(src_16 += 18);
            tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
            tmp32 = (tmp32 + 512) >> 10;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            src_16 -= (18 << 2); /* net advance of four rows per pass */
        }

        /* 17th row of this column */
        d = src_16[90]; // 18*5
        tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
        tmp32 = (tmp32 + 512) >> 10;
        CLIP_RESULT(tmp32)
        dst[24] = tmp32;

        /* back up 16 rows and step one column to the right */
        src_16 -= ((18 << 4) - 1);
        dst -= ((24 << 4) - 1);
    }

    /* ---- 5. vertical filter on plane 0 ---- */
    ref = subpel_pred + 2;
    dst = subpel_pred + V2Q_H0Q * SUBPEL_PRED_BLK_SIZE; /* 18 wide, 17 tall */
    dst -= 24; // offset: every store pre-increments dst by one row

    /* first two columns one sample at a time; afterwards ref has advanced
       to subpel_pred + 4, so the 4-byte loads of the packed loop below are
       aligned whenever subpel_pred is */
    for (i = 2; i > 0; i--)
    {
        for (j = 16; j > 0; j -= 4)
        {
            a = *ref;
            b = *(ref += 24);
            c = *(ref += 24);
            d = *(ref += 24);
            e = *(ref += 24);
            f = *(ref += 24);

            tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            a = *(ref += 24);
            tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            b = *(ref += 24);
            tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            c = *(ref += 24);
            tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            ref -= (24 << 2); /* net advance of four rows per pass */
        }

        /* 17th row of this column */
        d = ref[120]; // 24*5
        tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
        tmp32 = (tmp32 + 16) >> 5;
        CLIP_RESULT(tmp32)
        dst[24] = tmp32;

        /* back up 16 rows and step one column to the right */
        dst -= ((24 << 4) - 1);
        ref -= ((24 << 4) - 1);
    }

    /* remaining 16 columns, four columns per pass: bytes 0 and 2 of each
       4-byte load are summed as two 16-bit lanes in a/c/e, bytes 1 and 3
       in b/d/f. */
    // note that using SIMD here doesn't help much, the cycle almost stays the same
    // one can just use the above code and change the for(i=2 to for(i=18
    /* NOTE(review): the two lanes share one int, so a negative low-lane sum
       borrows 1 from the high lane before CLIP_UPPER16 shifts it; this can
       differ from the scalar filter by one in rare cases.  Per-lane clamping
       is done by CLIP_UPPER16, so no scalar re-run (VertInterpWClip) is made. */
    for (i = 16; i > 0; i -= 4)
    {
        for (j = 17; j > 0; j--)
        {
            a = *((uint32*)ref); /* load 4 bytes, row r */
            b = (a >> 8) & 0xFF00FF; /* second and fourth byte */
            a &= 0xFF00FF;

            c = *((uint32*)(ref + 120)); /* row r+5 */
            d = (c >> 8) & 0xFF00FF;
            c &= 0xFF00FF;

            a += c;
            b += d;

            e = *((uint32*)(ref + 72)); /* row r+3 */
            f = (e >> 8) & 0xFF00FF;
            e &= 0xFF00FF;

            c = *((uint32*)(ref + 48)); /* row r+2 */
            d = (c >> 8) & 0xFF00FF;
            c &= 0xFF00FF;

            c += e;
            d += f;

            a += 20 * c;
            b += 20 * d;
            a += 0x100010; /* +16 rounding term in both lanes */
            b += 0x100010;

            e = *((uint32*)(ref += 24)); /* row r+1; ref moves to the next row */
            f = (e >> 8) & 0xFF00FF;
            e &= 0xFF00FF;

            c = *((uint32*)(ref + 72)); /* row r+4 */
            d = (c >> 8) & 0xFF00FF;
            c &= 0xFF00FF;

            c += e;
            d += f;

            a -= 5 * c;
            b -= 5 * d;

            /* move the low lane into the upper half so CLIP_UPPER16 can
               handle it; shifted as unsigned because the high lane is
               pushed out of the word, which would overflow a signed int */
            c = (int)((uint32)a << 16);
            d = (int)((uint32)b << 16);
            CLIP_UPPER16(a)
            CLIP_UPPER16(c)
            CLIP_UPPER16(b)
            CLIP_UPPER16(d)

            a |= (c >> 16);
            b |= (d >> 16);

            /* pack it back; unsigned so that b's top byte may reach bit 31 */
            word = (uint32)a | ((uint32)b << 8);
            *((uint16*)(dst += 24)) = (uint16)(word & 0xFFFF); //dst is not word-aligned.
            *((uint16*)(dst + 2)) = (uint16)(word >> 16);
        }
        /* back up 17 rows and step four columns to the right */
        dst -= 404; // 24*17-4
        ref -= 404;
    }

    return ;
}
/**
 * Scalar vertical 6-tap filter (1, -5, 20, 20, -5, 1) with rounding
 * (+16) >> 5 and clamping to [0, 255], applied to four adjacent columns
 * of 17 output rows each.  Both buffers use a row pitch of 24 bytes.
 *
 * @param dst  points four columns to the right of, and one row above, the
 *             first sample to be written.
 * @param ref  points four columns to the right of the first (topmost) tap
 *             of the first output sample; rows 0..21 below it are read.
 *
 * Each output is computed directly from its six taps, so the result only
 * matches a cached-tap implementation if dst and ref do not overlap.
 */
void VertInterpWClip(uint8 *dst, uint8 *ref)
{
    int col, row;

    for (col = -4; col < 0; col++)
    {
        const uint8 *tap = ref + col;       /* top tap of the current output row */
        uint8 *out = dst + col + 24;        /* first output row of this column */

        for (row = 0; row < 17; row++)
        {
            int32 sum = tap[0] + tap[120]
                        - 5 * (tap[24] + tap[96])
                        + 20 * (tap[48] + tap[72]);

            sum = (sum + 16) >> 5;
            CLIP_RESULT(sum)
            *out = sum;

            tap += 24;
            out += 24;
        }
    }

    return ;
}
/**
 * Build the eight quarter-pel candidate blocks by averaging pairs of
 * samples (rounded: (p + q + 1) >> 1) taken from four base planes.
 *
 * @param bilin_base  four base-plane pointers, read as [0]=tl, [1]=tr,
 *                    [2]=bl, [3]=br; each is walked with a row pitch of 24.
 * @param qpel_cand   output: eight 16x16 blocks with a row pitch of 24,
 *                    spaced 384 bytes apart.
 * @param hpel_pos    selects the averaging pattern: an even value uses the
 *                    "diamond" pairs, an odd value the "star" pairs.
 */
void GenerateQuartPelPred(uint8 **bilin_base, uint8 *qpel_cand, int hpel_pos)
{
    const uint8 *tl = bilin_base[0];
    const uint8 *tr = bilin_base[1];
    const uint8 *bl = bilin_base[2];
    const uint8 *br = bilin_base[3];
    int row, col;

    if ((hpel_pos & 1) == 0) // diamond pattern
    {
        for (row = 0; row < 16; row++)
        {
            for (col = 0; col < 16; col++)
            {
                int pos = row * 24 + col;       /* pitch is 24 */
                uint8 *out = qpel_cand + pos;
                int tr0 = tr[pos];
                int tr_dn = tr[pos + 24];       /* same column, next row */
                int bl0 = bl[pos];
                int bl_rt = bl[pos + 1];        /* same row, next column */
                int br0 = br[pos];

                out[0 * 384] = (br0 + tr0 + 1) >> 1;        /* c1 */
                out[1 * 384] = (bl_rt + tr0 + 1) >> 1;      /* c2 */
                out[2 * 384] = (bl_rt + br0 + 1) >> 1;      /* c3 */
                out[3 * 384] = (bl_rt + tr_dn + 1) >> 1;    /* c4 */
                out[4 * 384] = (br0 + tr_dn + 1) >> 1;      /* c5 */
                out[5 * 384] = (bl0 + tr_dn + 1) >> 1;      /* c6 */
                out[6 * 384] = (bl0 + br0 + 1) >> 1;        /* c7 */
                out[7 * 384] = (bl0 + tr0 + 1) >> 1;        /* c8 */
            }
        }
    }
    else // star pattern: every output pairs br with one neighbouring sample
    {
        for (row = 0; row < 16; row++)
        {
            for (col = 0; col < 16; col++)
            {
                int pos = row * 24 + col;       /* pitch is 24 */
                uint8 *out = qpel_cand + pos;
                int centre = br[pos];

                out[0 * 384] = (centre + tr[pos] + 1) >> 1;         /* c1 */
                out[1 * 384] = (centre + tl[pos + 1] + 1) >> 1;     /* c2 */
                out[2 * 384] = (centre + bl[pos + 1] + 1) >> 1;     /* c3 */
                out[3 * 384] = (centre + tl[pos + 25] + 1) >> 1;    /* c4 */
                out[4 * 384] = (centre + tr[pos + 24] + 1) >> 1;    /* c5 */
                out[5 * 384] = (centre + tl[pos + 24] + 1) >> 1;    /* c6 */
                out[6 * 384] = (centre + bl[pos] + 1) >> 1;         /* c7 */
                out[7 * 384] = (centre + tl[pos] + 1) >> 1;         /* c8 */
            }
        }
    }

    return ;
}
/**
 * Cost of candidate block cand against the current block cur, as computed
 * by AVCSAD_Macroblock_C().
 *
 * The callee takes the threshold and the candidate pitch in a single int:
 * dmin goes into the upper 16 bits and the pitch (always 24 here) into the
 * lower 16 bits.
 *
 * @param cand  candidate samples; assumed to always have a pitch of 24.
 * @param cur   current (original) samples.
 * @param dmin  best cost so far, forwarded to the callee (presumably as an
 *              early-exit threshold -- confirm against AVCSAD_Macroblock_C).
 *              NOTE(review): only the low 16 bits survive the packing, so a
 *              value above 65535 is silently truncated.
 * @return      whatever AVCSAD_Macroblock_C() returns.
 */
int SATD_MB(uint8 *cand, uint8 *cur, int dmin)
{
    /* Shift as unsigned: AVCFindHalfPelMB passes dmin == 65535, and shifting
       that left by 16 as a signed int overflows, which is undefined
       behaviour.  The resulting bit pattern is unchanged. */
    uint32 packed = ((uint32)dmin << 16) | 24;

    return AVCSAD_Macroblock_C(cand, cur, (int)packed, NULL);
}