00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
00021 POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
00022 POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
00023 signed int ABCD[4] __attribute__((aligned(16)));
00024 register int i;
00025 ABCD[0] = ((8 - x) * (8 - y));
00026 ABCD[1] = ((x) * (8 - y));
00027 ABCD[2] = ((8 - x) * (y));
00028 ABCD[3] = ((x) * (y));
00029 const vector signed int vABCD = vec_ld(0, ABCD);
00030 const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
00031 const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
00032 const vector signed short vC = vec_splat((vector signed short)vABCD, 5);
00033 const vector signed short vD = vec_splat((vector signed short)vABCD, 7);
00034 const vector signed int vzero = vec_splat_s32(0);
00035 const vector signed short v32ss = (const vector signed short)AVV(32);
00036 const vector unsigned short v6us = vec_splat_u16(6);
00037
00038 vector unsigned char fperm;
00039
00040 if (((unsigned long)dst) % 16 == 0) {
00041 fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
00042 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
00043 } else {
00044 fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
00045 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
00046 }
00047
00048 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
00049 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
00050
00051 vector unsigned char vsrcAuc;
00052 vector unsigned char vsrcBuc;
00053 vector unsigned char vsrcperm0;
00054 vector unsigned char vsrcperm1;
00055 vsrcAuc = vec_ld(0, src);
00056 if (loadSecond)
00057 vsrcBuc = vec_ld(16, src);
00058 vsrcperm0 = vec_lvsl(0, src);
00059 vsrcperm1 = vec_lvsl(1, src);
00060
00061 vector unsigned char vsrc0uc;
00062 vector unsigned char vsrc1uc;
00063 vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
00064 if (reallyBadAlign)
00065 vsrc1uc = vsrcBuc;
00066 else
00067 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
00068
00069 vector signed short vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc0uc);
00070 vector signed short vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc1uc);
00071
00072 if (!loadSecond) {
00073 for (i = 0 ; i < h ; i++) {
00074 vector unsigned char vsrcCuc;
00075 vsrcCuc = vec_ld(stride + 0, src);
00076
00077 vector unsigned char vsrc2uc;
00078 vector unsigned char vsrc3uc;
00079 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
00080 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
00081
00082 vector signed short vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc2uc);
00083 vector signed short vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc3uc);
00084
00085 vector signed short psum;
00086
00087 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
00088 psum = vec_mladd(vB, vsrc1ssH, psum);
00089 psum = vec_mladd(vC, vsrc2ssH, psum);
00090 psum = vec_mladd(vD, vsrc3ssH, psum);
00091 psum = vec_add(v32ss, psum);
00092 psum = vec_sra(psum, v6us);
00093
00094 vector unsigned char vdst = vec_ld(0, dst);
00095 vector unsigned char ppsum = (vector unsigned char)vec_packsu(psum, psum);
00096
00097 vector unsigned char vfdst = vec_perm(vdst, ppsum, fperm);
00098 vector unsigned char fsum;
00099
00100 OP_U8_ALTIVEC(fsum, vfdst, vdst);
00101
00102 vec_st(fsum, 0, dst);
00103
00104 vsrc0ssH = vsrc2ssH;
00105 vsrc1ssH = vsrc3ssH;
00106
00107 dst += stride;
00108 src += stride;
00109 }
00110 } else {
00111 for (i = 0 ; i < h ; i++) {
00112 vector unsigned char vsrcCuc;
00113 vector unsigned char vsrcDuc;
00114 vsrcCuc = vec_ld(stride + 0, src);
00115 vsrcDuc = vec_ld(stride + 16, src);
00116
00117 vector unsigned char vsrc2uc;
00118 vector unsigned char vsrc3uc;
00119 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
00120 if (reallyBadAlign)
00121 vsrc3uc = vsrcDuc;
00122 else
00123 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
00124
00125 vector signed short vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc2uc);
00126 vector signed short vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc3uc);
00127
00128 vector signed short psum;
00129
00130 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
00131 psum = vec_mladd(vB, vsrc1ssH, psum);
00132 psum = vec_mladd(vC, vsrc2ssH, psum);
00133 psum = vec_mladd(vD, vsrc3ssH, psum);
00134 psum = vec_add(v32ss, psum);
00135 psum = vec_sr(psum, v6us);
00136
00137 vector unsigned char vdst = vec_ld(0, dst);
00138 vector unsigned char ppsum = (vector unsigned char)vec_pack(psum, psum);
00139
00140 vector unsigned char vfdst = vec_perm(vdst, ppsum, fperm);
00141 vector unsigned char fsum;
00142
00143 OP_U8_ALTIVEC(fsum, vfdst, vdst);
00144
00145 vec_st(fsum, 0, dst);
00146
00147 vsrc0ssH = vsrc2ssH;
00148 vsrc1ssH = vsrc3ssH;
00149
00150 dst += stride;
00151 src += stride;
00152 }
00153 }
00154 POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
00155 }
00156
00157
00158 static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
00159 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
00160 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
00161 register int i;
00162
00163 const vector signed int vzero = vec_splat_s32(0);
00164 const vector unsigned char permM2 = vec_lvsl(-2, src);
00165 const vector unsigned char permM1 = vec_lvsl(-1, src);
00166 const vector unsigned char permP0 = vec_lvsl(+0, src);
00167 const vector unsigned char permP1 = vec_lvsl(+1, src);
00168 const vector unsigned char permP2 = vec_lvsl(+2, src);
00169 const vector unsigned char permP3 = vec_lvsl(+3, src);
00170 const vector signed short v20ss = (const vector signed short)AVV(20);
00171 const vector unsigned short v5us = vec_splat_u16(5);
00172 const vector signed short v5ss = vec_splat_s16(5);
00173 const vector signed short v16ss = (const vector signed short)AVV(16);
00174 const vector unsigned char dstperm = vec_lvsr(0, dst);
00175 const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
00176 const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
00177
00178 register int align = ((((unsigned long)src) - 2) % 16);
00179
00180 for (i = 0 ; i < 16 ; i ++) {
00181 vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
00182 vector unsigned char srcR1 = vec_ld(-2, src);
00183 vector unsigned char srcR2 = vec_ld(14, src);
00184
00185 switch (align) {
00186 default: {
00187 srcM2 = vec_perm(srcR1, srcR2, permM2);
00188 srcM1 = vec_perm(srcR1, srcR2, permM1);
00189 srcP0 = vec_perm(srcR1, srcR2, permP0);
00190 srcP1 = vec_perm(srcR1, srcR2, permP1);
00191 srcP2 = vec_perm(srcR1, srcR2, permP2);
00192 srcP3 = vec_perm(srcR1, srcR2, permP3);
00193 } break;
00194 case 11: {
00195 srcM2 = vec_perm(srcR1, srcR2, permM2);
00196 srcM1 = vec_perm(srcR1, srcR2, permM1);
00197 srcP0 = vec_perm(srcR1, srcR2, permP0);
00198 srcP1 = vec_perm(srcR1, srcR2, permP1);
00199 srcP2 = vec_perm(srcR1, srcR2, permP2);
00200 srcP3 = srcR2;
00201 } break;
00202 case 12: {
00203 vector unsigned char srcR3 = vec_ld(30, src);
00204 srcM2 = vec_perm(srcR1, srcR2, permM2);
00205 srcM1 = vec_perm(srcR1, srcR2, permM1);
00206 srcP0 = vec_perm(srcR1, srcR2, permP0);
00207 srcP1 = vec_perm(srcR1, srcR2, permP1);
00208 srcP2 = srcR2;
00209 srcP3 = vec_perm(srcR2, srcR3, permP3);
00210 } break;
00211 case 13: {
00212 vector unsigned char srcR3 = vec_ld(30, src);
00213 srcM2 = vec_perm(srcR1, srcR2, permM2);
00214 srcM1 = vec_perm(srcR1, srcR2, permM1);
00215 srcP0 = vec_perm(srcR1, srcR2, permP0);
00216 srcP1 = srcR2;
00217 srcP2 = vec_perm(srcR2, srcR3, permP2);
00218 srcP3 = vec_perm(srcR2, srcR3, permP3);
00219 } break;
00220 case 14: {
00221 vector unsigned char srcR3 = vec_ld(30, src);
00222 srcM2 = vec_perm(srcR1, srcR2, permM2);
00223 srcM1 = vec_perm(srcR1, srcR2, permM1);
00224 srcP0 = srcR2;
00225 srcP1 = vec_perm(srcR2, srcR3, permP1);
00226 srcP2 = vec_perm(srcR2, srcR3, permP2);
00227 srcP3 = vec_perm(srcR2, srcR3, permP3);
00228 } break;
00229 case 15: {
00230 vector unsigned char srcR3 = vec_ld(30, src);
00231 srcM2 = vec_perm(srcR1, srcR2, permM2);
00232 srcM1 = srcR2;
00233 srcP0 = vec_perm(srcR2, srcR3, permP0);
00234 srcP1 = vec_perm(srcR2, srcR3, permP1);
00235 srcP2 = vec_perm(srcR2, srcR3, permP2);
00236 srcP3 = vec_perm(srcR2, srcR3, permP3);
00237 } break;
00238 }
00239
00240 const vector signed short srcP0A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
00241 const vector signed short srcP0B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
00242 const vector signed short srcP1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
00243 const vector signed short srcP1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);
00244
00245 const vector signed short srcP2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
00246 const vector signed short srcP2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);
00247 const vector signed short srcP3A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
00248 const vector signed short srcP3B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);
00249
00250 const vector signed short srcM1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
00251 const vector signed short srcM1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
00252 const vector signed short srcM2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
00253 const vector signed short srcM2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);
00254
00255 const vector signed short sum1A = vec_adds(srcP0A, srcP1A);
00256 const vector signed short sum1B = vec_adds(srcP0B, srcP1B);
00257 const vector signed short sum2A = vec_adds(srcM1A, srcP2A);
00258 const vector signed short sum2B = vec_adds(srcM1B, srcP2B);
00259 const vector signed short sum3A = vec_adds(srcM2A, srcP3A);
00260 const vector signed short sum3B = vec_adds(srcM2B, srcP3B);
00261
00262 const vector signed short pp1A = vec_mladd(sum1A, v20ss, v16ss);
00263 const vector signed short pp1B = vec_mladd(sum1B, v20ss, v16ss);
00264
00265 const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
00266 const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
00267
00268 const vector signed short pp3A = vec_add(sum3A, pp1A);
00269 const vector signed short pp3B = vec_add(sum3B, pp1B);
00270
00271 const vector signed short psumA = vec_sub(pp3A, pp2A);
00272 const vector signed short psumB = vec_sub(pp3B, pp2B);
00273
00274 const vector signed short sumA = vec_sra(psumA, v5us);
00275 const vector signed short sumB = vec_sra(psumB, v5us);
00276
00277 const vector unsigned char sum = vec_packsu(sumA, sumB);
00278
00279 const vector unsigned char dst1 = vec_ld(0, dst);
00280 const vector unsigned char dst2 = vec_ld(16, dst);
00281 const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
00282
00283 vector unsigned char fsum;
00284 OP_U8_ALTIVEC(fsum, sum, vdst);
00285
00286 const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm);
00287 const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask);
00288 const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask);
00289
00290 vec_st(fdst1, 0, dst);
00291 vec_st(fdst2, 16, dst);
00292
00293 src += srcStride;
00294 dst += dstStride;
00295 }
00296 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
00297 }
00298
00299
00300 static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
00301 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);
00302 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
00303
00304 register int i;
00305
00306 const vector signed int vzero = vec_splat_s32(0);
00307 const vector unsigned char perm = vec_lvsl(0, src);
00308 const vector signed short v20ss = (const vector signed short)AVV(20);
00309 const vector unsigned short v5us = vec_splat_u16(5);
00310 const vector signed short v5ss = vec_splat_s16(5);
00311 const vector signed short v16ss = (const vector signed short)AVV(16);
00312 const vector unsigned char dstperm = vec_lvsr(0, dst);
00313 const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
00314 const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
00315
00316 uint8_t *srcbis = src - (srcStride * 2);
00317
00318 const vector unsigned char srcM2a = vec_ld(0, srcbis);
00319 const vector unsigned char srcM2b = vec_ld(16, srcbis);
00320 const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm);
00321 srcbis += srcStride;
00322 const vector unsigned char srcM1a = vec_ld(0, srcbis);
00323 const vector unsigned char srcM1b = vec_ld(16, srcbis);
00324 const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm);
00325 srcbis += srcStride;
00326 const vector unsigned char srcP0a = vec_ld(0, srcbis);
00327 const vector unsigned char srcP0b = vec_ld(16, srcbis);
00328 const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm);
00329 srcbis += srcStride;
00330 const vector unsigned char srcP1a = vec_ld(0, srcbis);
00331 const vector unsigned char srcP1b = vec_ld(16, srcbis);
00332 const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm);
00333 srcbis += srcStride;
00334 const vector unsigned char srcP2a = vec_ld(0, srcbis);
00335 const vector unsigned char srcP2b = vec_ld(16, srcbis);
00336 const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm);
00337 srcbis += srcStride;
00338
00339 vector signed short srcM2ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
00340 vector signed short srcM2ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);
00341 vector signed short srcM1ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
00342 vector signed short srcM1ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
00343 vector signed short srcP0ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
00344 vector signed short srcP0ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
00345 vector signed short srcP1ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
00346 vector signed short srcP1ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);
00347 vector signed short srcP2ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
00348 vector signed short srcP2ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);
00349
00350 for (i = 0 ; i < 16 ; i++) {
00351 const vector unsigned char srcP3a = vec_ld(0, srcbis);
00352 const vector unsigned char srcP3b = vec_ld(16, srcbis);
00353 const vector unsigned char srcP3 = vec_perm(srcP3a, srcP3b, perm);
00354 const vector signed short srcP3ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
00355 const vector signed short srcP3ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);
00356 srcbis += srcStride;
00357
00358 const vector signed short sum1A = vec_adds(srcP0ssA, srcP1ssA);
00359 const vector signed short sum1B = vec_adds(srcP0ssB, srcP1ssB);
00360 const vector signed short sum2A = vec_adds(srcM1ssA, srcP2ssA);
00361 const vector signed short sum2B = vec_adds(srcM1ssB, srcP2ssB);
00362 const vector signed short sum3A = vec_adds(srcM2ssA, srcP3ssA);
00363 const vector signed short sum3B = vec_adds(srcM2ssB, srcP3ssB);
00364
00365 srcM2ssA = srcM1ssA;
00366 srcM2ssB = srcM1ssB;
00367 srcM1ssA = srcP0ssA;
00368 srcM1ssB = srcP0ssB;
00369 srcP0ssA = srcP1ssA;
00370 srcP0ssB = srcP1ssB;
00371 srcP1ssA = srcP2ssA;
00372 srcP1ssB = srcP2ssB;
00373 srcP2ssA = srcP3ssA;
00374 srcP2ssB = srcP3ssB;
00375
00376 const vector signed short pp1A = vec_mladd(sum1A, v20ss, v16ss);
00377 const vector signed short pp1B = vec_mladd(sum1B, v20ss, v16ss);
00378
00379 const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
00380 const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
00381
00382 const vector signed short pp3A = vec_add(sum3A, pp1A);
00383 const vector signed short pp3B = vec_add(sum3B, pp1B);
00384
00385 const vector signed short psumA = vec_sub(pp3A, pp2A);
00386 const vector signed short psumB = vec_sub(pp3B, pp2B);
00387
00388 const vector signed short sumA = vec_sra(psumA, v5us);
00389 const vector signed short sumB = vec_sra(psumB, v5us);
00390
00391 const vector unsigned char sum = vec_packsu(sumA, sumB);
00392
00393 const vector unsigned char dst1 = vec_ld(0, dst);
00394 const vector unsigned char dst2 = vec_ld(16, dst);
00395 const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
00396
00397 vector unsigned char fsum;
00398 OP_U8_ALTIVEC(fsum, sum, vdst);
00399
00400 const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm);
00401 const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask);
00402 const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask);
00403
00404 vec_st(fdst1, 0, dst);
00405 vec_st(fdst2, 16, dst);
00406
00407 dst += dstStride;
00408 }
00409 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
00410 }
00411
00412
00413 static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
00414 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
00415 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
00416 register int i;
00417 const vector signed int vzero = vec_splat_s32(0);
00418 const vector unsigned char permM2 = vec_lvsl(-2, src);
00419 const vector unsigned char permM1 = vec_lvsl(-1, src);
00420 const vector unsigned char permP0 = vec_lvsl(+0, src);
00421 const vector unsigned char permP1 = vec_lvsl(+1, src);
00422 const vector unsigned char permP2 = vec_lvsl(+2, src);
00423 const vector unsigned char permP3 = vec_lvsl(+3, src);
00424 const vector signed short v20ss = (const vector signed short)AVV(20);
00425 const vector unsigned int v10ui = vec_splat_u32(10);
00426 const vector signed short v5ss = vec_splat_s16(5);
00427 const vector signed short v1ss = vec_splat_s16(1);
00428 const vector signed int v512si = (const vector signed int)AVV(512);
00429 const vector unsigned int v16ui = (const vector unsigned int)AVV(16);
00430
00431 register int align = ((((unsigned long)src) - 2) % 16);
00432
00433 src -= (2 * srcStride);
00434
00435 for (i = 0 ; i < 21 ; i ++) {
00436 vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
00437 vector unsigned char srcR1 = vec_ld(-2, src);
00438 vector unsigned char srcR2 = vec_ld(14, src);
00439
00440 switch (align) {
00441 default: {
00442 srcM2 = vec_perm(srcR1, srcR2, permM2);
00443 srcM1 = vec_perm(srcR1, srcR2, permM1);
00444 srcP0 = vec_perm(srcR1, srcR2, permP0);
00445 srcP1 = vec_perm(srcR1, srcR2, permP1);
00446 srcP2 = vec_perm(srcR1, srcR2, permP2);
00447 srcP3 = vec_perm(srcR1, srcR2, permP3);
00448 } break;
00449 case 11: {
00450 srcM2 = vec_perm(srcR1, srcR2, permM2);
00451 srcM1 = vec_perm(srcR1, srcR2, permM1);
00452 srcP0 = vec_perm(srcR1, srcR2, permP0);
00453 srcP1 = vec_perm(srcR1, srcR2, permP1);
00454 srcP2 = vec_perm(srcR1, srcR2, permP2);
00455 srcP3 = srcR2;
00456 } break;
00457 case 12: {
00458 vector unsigned char srcR3 = vec_ld(30, src);
00459 srcM2 = vec_perm(srcR1, srcR2, permM2);
00460 srcM1 = vec_perm(srcR1, srcR2, permM1);
00461 srcP0 = vec_perm(srcR1, srcR2, permP0);
00462 srcP1 = vec_perm(srcR1, srcR2, permP1);
00463 srcP2 = srcR2;
00464 srcP3 = vec_perm(srcR2, srcR3, permP3);
00465 } break;
00466 case 13: {
00467 vector unsigned char srcR3 = vec_ld(30, src);
00468 srcM2 = vec_perm(srcR1, srcR2, permM2);
00469 srcM1 = vec_perm(srcR1, srcR2, permM1);
00470 srcP0 = vec_perm(srcR1, srcR2, permP0);
00471 srcP1 = srcR2;
00472 srcP2 = vec_perm(srcR2, srcR3, permP2);
00473 srcP3 = vec_perm(srcR2, srcR3, permP3);
00474 } break;
00475 case 14: {
00476 vector unsigned char srcR3 = vec_ld(30, src);
00477 srcM2 = vec_perm(srcR1, srcR2, permM2);
00478 srcM1 = vec_perm(srcR1, srcR2, permM1);
00479 srcP0 = srcR2;
00480 srcP1 = vec_perm(srcR2, srcR3, permP1);
00481 srcP2 = vec_perm(srcR2, srcR3, permP2);
00482 srcP3 = vec_perm(srcR2, srcR3, permP3);
00483 } break;
00484 case 15: {
00485 vector unsigned char srcR3 = vec_ld(30, src);
00486 srcM2 = vec_perm(srcR1, srcR2, permM2);
00487 srcM1 = srcR2;
00488 srcP0 = vec_perm(srcR2, srcR3, permP0);
00489 srcP1 = vec_perm(srcR2, srcR3, permP1);
00490 srcP2 = vec_perm(srcR2, srcR3, permP2);
00491 srcP3 = vec_perm(srcR2, srcR3, permP3);
00492 } break;
00493 }
00494
00495 const vector signed short srcP0A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
00496 const vector signed short srcP0B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
00497 const vector signed short srcP1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
00498 const vector signed short srcP1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);
00499
00500 const vector signed short srcP2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
00501 const vector signed short srcP2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);
00502 const vector signed short srcP3A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
00503 const vector signed short srcP3B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);
00504
00505 const vector signed short srcM1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
00506 const vector signed short srcM1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
00507 const vector signed short srcM2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
00508 const vector signed short srcM2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);
00509
00510 const vector signed short sum1A = vec_adds(srcP0A, srcP1A);
00511 const vector signed short sum1B = vec_adds(srcP0B, srcP1B);
00512 const vector signed short sum2A = vec_adds(srcM1A, srcP2A);
00513 const vector signed short sum2B = vec_adds(srcM1B, srcP2B);
00514 const vector signed short sum3A = vec_adds(srcM2A, srcP3A);
00515 const vector signed short sum3B = vec_adds(srcM2B, srcP3B);
00516
00517 const vector signed short pp1A = vec_mladd(sum1A, v20ss, sum3A);
00518 const vector signed short pp1B = vec_mladd(sum1B, v20ss, sum3B);
00519
00520 const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
00521 const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
00522
00523 const vector signed short psumA = vec_sub(pp1A, pp2A);
00524 const vector signed short psumB = vec_sub(pp1B, pp2B);
00525
00526 vec_st(psumA, 0, tmp);
00527 vec_st(psumB, 16, tmp);
00528
00529 src += srcStride;
00530 tmp += tmpStride;
00531 }
00532
00533 const vector unsigned char dstperm = vec_lvsr(0, dst);
00534 const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
00535 const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
00536 const vector unsigned char mperm = (const vector unsigned char)
00537 AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
00538 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
00539
00540 int16_t *tmpbis = tmp - (tmpStride * 21);
00541
00542 vector signed short tmpM2ssA = vec_ld(0, tmpbis);
00543 vector signed short tmpM2ssB = vec_ld(16, tmpbis);
00544 tmpbis += tmpStride;
00545 vector signed short tmpM1ssA = vec_ld(0, tmpbis);
00546 vector signed short tmpM1ssB = vec_ld(16, tmpbis);
00547 tmpbis += tmpStride;
00548 vector signed short tmpP0ssA = vec_ld(0, tmpbis);
00549 vector signed short tmpP0ssB = vec_ld(16, tmpbis);
00550 tmpbis += tmpStride;
00551 vector signed short tmpP1ssA = vec_ld(0, tmpbis);
00552 vector signed short tmpP1ssB = vec_ld(16, tmpbis);
00553 tmpbis += tmpStride;
00554 vector signed short tmpP2ssA = vec_ld(0, tmpbis);
00555 vector signed short tmpP2ssB = vec_ld(16, tmpbis);
00556 tmpbis += tmpStride;
00557
00558 for (i = 0 ; i < 16 ; i++) {
00559 const vector signed short tmpP3ssA = vec_ld(0, tmpbis);
00560 const vector signed short tmpP3ssB = vec_ld(16, tmpbis);
00561 tmpbis += tmpStride;
00562
00563 const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
00564 const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
00565 const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
00566 const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
00567 const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
00568 const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
00569
00570 tmpM2ssA = tmpM1ssA;
00571 tmpM2ssB = tmpM1ssB;
00572 tmpM1ssA = tmpP0ssA;
00573 tmpM1ssB = tmpP0ssB;
00574 tmpP0ssA = tmpP1ssA;
00575 tmpP0ssB = tmpP1ssB;
00576 tmpP1ssA = tmpP2ssA;
00577 tmpP1ssB = tmpP2ssB;
00578 tmpP2ssA = tmpP3ssA;
00579 tmpP2ssB = tmpP3ssB;
00580
00581 const vector signed int pp1Ae = vec_mule(sum1A, v20ss);
00582 const vector signed int pp1Ao = vec_mulo(sum1A, v20ss);
00583 const vector signed int pp1Be = vec_mule(sum1B, v20ss);
00584 const vector signed int pp1Bo = vec_mulo(sum1B, v20ss);
00585
00586 const vector signed int pp2Ae = vec_mule(sum2A, v5ss);
00587 const vector signed int pp2Ao = vec_mulo(sum2A, v5ss);
00588 const vector signed int pp2Be = vec_mule(sum2B, v5ss);
00589 const vector signed int pp2Bo = vec_mulo(sum2B, v5ss);
00590
00591 const vector signed int pp3Ae = vec_sra((vector signed int)sum3A, v16ui);
00592 const vector signed int pp3Ao = vec_mulo(sum3A, v1ss);
00593 const vector signed int pp3Be = vec_sra((vector signed int)sum3B, v16ui);
00594 const vector signed int pp3Bo = vec_mulo(sum3B, v1ss);
00595
00596 const vector signed int pp1cAe = vec_add(pp1Ae, v512si);
00597 const vector signed int pp1cAo = vec_add(pp1Ao, v512si);
00598 const vector signed int pp1cBe = vec_add(pp1Be, v512si);
00599 const vector signed int pp1cBo = vec_add(pp1Bo, v512si);
00600
00601 const vector signed int pp32Ae = vec_sub(pp3Ae, pp2Ae);
00602 const vector signed int pp32Ao = vec_sub(pp3Ao, pp2Ao);
00603 const vector signed int pp32Be = vec_sub(pp3Be, pp2Be);
00604 const vector signed int pp32Bo = vec_sub(pp3Bo, pp2Bo);
00605
00606 const vector signed int sumAe = vec_add(pp1cAe, pp32Ae);
00607 const vector signed int sumAo = vec_add(pp1cAo, pp32Ao);
00608 const vector signed int sumBe = vec_add(pp1cBe, pp32Be);
00609 const vector signed int sumBo = vec_add(pp1cBo, pp32Bo);
00610
00611 const vector signed int ssumAe = vec_sra(sumAe, v10ui);
00612 const vector signed int ssumAo = vec_sra(sumAo, v10ui);
00613 const vector signed int ssumBe = vec_sra(sumBe, v10ui);
00614 const vector signed int ssumBo = vec_sra(sumBo, v10ui);
00615
00616 const vector signed short ssume = vec_packs(ssumAe, ssumBe);
00617 const vector signed short ssumo = vec_packs(ssumAo, ssumBo);
00618
00619 const vector unsigned char sumv = vec_packsu(ssume, ssumo);
00620 const vector unsigned char sum = vec_perm(sumv, sumv, mperm);
00621
00622 const vector unsigned char dst1 = vec_ld(0, dst);
00623 const vector unsigned char dst2 = vec_ld(16, dst);
00624 const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
00625
00626 vector unsigned char fsum;
00627 OP_U8_ALTIVEC(fsum, sum, vdst);
00628
00629 const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm);
00630 const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask);
00631 const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask);
00632
00633 vec_st(fdst1, 0, dst);
00634 vec_st(fdst2, 16, dst);
00635
00636 dst += dstStride;
00637 }
00638 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
00639 }