00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include "../dsputil.h"
00022
00023 #include "gcc_fixes.h"
00024
00025 #include "dsputil_altivec.h"
00026
00027 #ifdef CONFIG_DARWIN
00028 #include <sys/sysctl.h>
00029 #else
00030 #ifdef __AMIGAOS4__
00031 #include <exec/exec.h>
00032 #include <interfaces/exec.h>
00033 #include <proto/exec.h>
00034 #else
00035 #include <signal.h>
00036 #include <setjmp.h>
00037
00038 static sigjmp_buf jmpbuf;
00039 static volatile sig_atomic_t canjump = 0;
00040
00041 static void sigill_handler (int sig)
00042 {
00043 if (!canjump) {
00044 signal (sig, SIG_DFL);
00045 raise (sig);
00046 }
00047
00048 canjump = 0;
00049 siglongjmp (jmpbuf, 1);
00050 }
00051 #endif
00052 #endif
00053
00054 int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
00055 {
00056 int i;
00057 int s __attribute__((aligned(16)));
00058 const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
00059 vector unsigned char *tv;
00060 vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
00061 vector unsigned int sad;
00062 vector signed int sumdiffs;
00063
00064 s = 0;
00065 sad = (vector unsigned int)vec_splat_u32(0);
00066 for(i=0;i<h;i++) {
00067
00068
00069
00070
00071
00072 tv = (vector unsigned char *) pix1;
00073 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
00074
00075 tv = (vector unsigned char *) &pix2[0];
00076 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
00077
00078 tv = (vector unsigned char *) &pix2[1];
00079 pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
00080
00081
00082 avgv = vec_avg(pix2v, pix2iv);
00083
00084
00085 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
00086
00087
00088 sad = vec_sum4s(t5, sad);
00089
00090 pix1 += line_size;
00091 pix2 += line_size;
00092 }
00093
00094 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
00095 sumdiffs = vec_splat(sumdiffs, 3);
00096 vec_ste(sumdiffs, 0, &s);
00097
00098 return s;
00099 }
00100
00101 int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
00102 {
00103 int i;
00104 int s __attribute__((aligned(16)));
00105 const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
00106 vector unsigned char *tv;
00107 vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
00108 vector unsigned int sad;
00109 vector signed int sumdiffs;
00110 uint8_t *pix3 = pix2 + line_size;
00111
00112 s = 0;
00113 sad = (vector unsigned int)vec_splat_u32(0);
00114
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124 tv = (vector unsigned char *) &pix2[0];
00125 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
00126
00127 for(i=0;i<h;i++) {
00128
00129
00130
00131
00132
00133 tv = (vector unsigned char *) pix1;
00134 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
00135
00136 tv = (vector unsigned char *) &pix3[0];
00137 pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
00138
00139
00140 avgv = vec_avg(pix2v, pix3v);
00141
00142
00143 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
00144
00145
00146 sad = vec_sum4s(t5, sad);
00147
00148 pix1 += line_size;
00149 pix2v = pix3v;
00150 pix3 += line_size;
00151
00152 }
00153
00154
00155 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
00156 sumdiffs = vec_splat(sumdiffs, 3);
00157 vec_ste(sumdiffs, 0, &s);
00158 return s;
00159 }
00160
00161 int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
00162 {
00163 int i;
00164 int s __attribute__((aligned(16)));
00165 uint8_t *pix3 = pix2 + line_size;
00166 const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
00167 const_vector unsigned short two = (const_vector unsigned short)vec_splat_u16(2);
00168 vector unsigned char *tv, avgv, t5;
00169 vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
00170 vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
00171 vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
00172 vector unsigned short avghv, avglv;
00173 vector unsigned short t1, t2, t3, t4;
00174 vector unsigned int sad;
00175 vector signed int sumdiffs;
00176
00177 sad = (vector unsigned int)vec_splat_u32(0);
00178
00179 s = 0;
00180
00181
00182
00183
00184
00185
00186
00187
00188
00189
00190 tv = (vector unsigned char *) &pix2[0];
00191 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
00192
00193 tv = (vector unsigned char *) &pix2[1];
00194 pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
00195
00196 pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
00197 pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
00198 pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
00199 pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
00200 t1 = vec_add(pix2hv, pix2ihv);
00201 t2 = vec_add(pix2lv, pix2ilv);
00202
00203 for(i=0;i<h;i++) {
00204
00205
00206
00207
00208
00209 tv = (vector unsigned char *) pix1;
00210 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
00211
00212 tv = (vector unsigned char *) &pix3[0];
00213 pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
00214
00215 tv = (vector unsigned char *) &pix3[1];
00216 pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));
00217
00218
00219
00220
00221
00222
00223
00224
00225
00226
00227 pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
00228 pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
00229 pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
00230 pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);
00231
00232
00233 t3 = vec_add(pix3hv, pix3ihv);
00234 t4 = vec_add(pix3lv, pix3ilv);
00235
00236 avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
00237 avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
00238
00239
00240 avgv = vec_pack(avghv, avglv);
00241
00242
00243 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
00244
00245
00246 sad = vec_sum4s(t5, sad);
00247
00248 pix1 += line_size;
00249 pix3 += line_size;
00250
00251 t1 = t3;
00252 t2 = t4;
00253 }
00254
00255 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
00256 sumdiffs = vec_splat(sumdiffs, 3);
00257 vec_ste(sumdiffs, 0, &s);
00258
00259 return s;
00260 }
00261
00262 int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
00263 {
00264 int i;
00265 int s __attribute__((aligned(16)));
00266 const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
00267 vector unsigned char perm1, perm2, *pix1v, *pix2v;
00268 vector unsigned char t1, t2, t3,t4, t5;
00269 vector unsigned int sad;
00270 vector signed int sumdiffs;
00271
00272 sad = (vector unsigned int)vec_splat_u32(0);
00273
00274
00275 for(i=0;i<h;i++) {
00276
00277 perm1 = vec_lvsl(0, pix1);
00278 pix1v = (vector unsigned char *) pix1;
00279 perm2 = vec_lvsl(0, pix2);
00280 pix2v = (vector unsigned char *) pix2;
00281 t1 = vec_perm(pix1v[0], pix1v[1], perm1);
00282 t2 = vec_perm(pix2v[0], pix2v[1], perm2);
00283
00284
00285 t3 = vec_max(t1, t2);
00286 t4 = vec_min(t1, t2);
00287 t5 = vec_sub(t3, t4);
00288
00289
00290 sad = vec_sum4s(t5, sad);
00291
00292 pix1 += line_size;
00293 pix2 += line_size;
00294 }
00295
00296
00297 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
00298 sumdiffs = vec_splat(sumdiffs, 3);
00299 vec_ste(sumdiffs, 0, &s);
00300
00301 return s;
00302 }
00303
00304 int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
00305 {
00306 int i;
00307 int s __attribute__((aligned(16)));
00308 const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
00309 vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
00310 vector unsigned char t1, t2, t3,t4, t5;
00311 vector unsigned int sad;
00312 vector signed int sumdiffs;
00313
00314 sad = (vector unsigned int)vec_splat_u32(0);
00315
00316 permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
00317
00318 for(i=0;i<h;i++) {
00319
00320
00321
00322 perm1 = vec_lvsl(0, pix1);
00323 pix1v = (vector unsigned char *) pix1;
00324 perm2 = vec_lvsl(0, pix2);
00325 pix2v = (vector unsigned char *) pix2;
00326 t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
00327 t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
00328
00329
00330 t3 = vec_max(t1, t2);
00331 t4 = vec_min(t1, t2);
00332 t5 = vec_sub(t3, t4);
00333
00334
00335 sad = vec_sum4s(t5, sad);
00336
00337 pix1 += line_size;
00338 pix2 += line_size;
00339 }
00340
00341
00342 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
00343 sumdiffs = vec_splat(sumdiffs, 3);
00344 vec_ste(sumdiffs, 0, &s);
00345
00346 return s;
00347 }
00348
00349 int pix_norm1_altivec(uint8_t *pix, int line_size)
00350 {
00351 int i;
00352 int s __attribute__((aligned(16)));
00353 const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
00354 vector unsigned char *tv;
00355 vector unsigned char pixv;
00356 vector unsigned int sv;
00357 vector signed int sum;
00358
00359 sv = (vector unsigned int)vec_splat_u32(0);
00360
00361 s = 0;
00362 for (i = 0; i < 16; i++) {
00363
00364 tv = (vector unsigned char *) pix;
00365 pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));
00366
00367
00368 sv = vec_msum(pixv, pixv, sv);
00369
00370 pix += line_size;
00371 }
00372
00373 sum = vec_sums((vector signed int) sv, (vector signed int) zero);
00374 sum = vec_splat(sum, 3);
00375 vec_ste(sum, 0, &s);
00376
00377 return s;
00378 }
00379
00385 int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
00386 {
00387 int i;
00388 int s __attribute__((aligned(16)));
00389 const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
00390 vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
00391 vector unsigned char t1, t2, t3,t4, t5;
00392 vector unsigned int sum;
00393 vector signed int sumsqr;
00394
00395 sum = (vector unsigned int)vec_splat_u32(0);
00396
00397 permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
00398
00399
00400 for(i=0;i<h;i++) {
00401
00402
00403
00404 perm1 = vec_lvsl(0, pix1);
00405 pix1v = (vector unsigned char *) pix1;
00406 perm2 = vec_lvsl(0, pix2);
00407 pix2v = (vector unsigned char *) pix2;
00408 t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
00409 t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
00410
00411
00412
00413
00414
00415
00416
00417 t3 = vec_max(t1, t2);
00418 t4 = vec_min(t1, t2);
00419 t5 = vec_sub(t3, t4);
00420
00421
00422 sum = vec_msum(t5, t5, sum);
00423
00424 pix1 += line_size;
00425 pix2 += line_size;
00426 }
00427
00428
00429 sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
00430 sumsqr = vec_splat(sumsqr, 3);
00431 vec_ste(sumsqr, 0, &s);
00432
00433 return s;
00434 }
00435
00441 int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
00442 {
00443 int i;
00444 int s __attribute__((aligned(16)));
00445 const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
00446 vector unsigned char perm1, perm2, *pix1v, *pix2v;
00447 vector unsigned char t1, t2, t3,t4, t5;
00448 vector unsigned int sum;
00449 vector signed int sumsqr;
00450
00451 sum = (vector unsigned int)vec_splat_u32(0);
00452
00453 for(i=0;i<h;i++) {
00454
00455 perm1 = vec_lvsl(0, pix1);
00456 pix1v = (vector unsigned char *) pix1;
00457 perm2 = vec_lvsl(0, pix2);
00458 pix2v = (vector unsigned char *) pix2;
00459 t1 = vec_perm(pix1v[0], pix1v[1], perm1);
00460 t2 = vec_perm(pix2v[0], pix2v[1], perm2);
00461
00462
00463
00464
00465
00466
00467
00468 t3 = vec_max(t1, t2);
00469 t4 = vec_min(t1, t2);
00470 t5 = vec_sub(t3, t4);
00471
00472
00473 sum = vec_msum(t5, t5, sum);
00474
00475 pix1 += line_size;
00476 pix2 += line_size;
00477 }
00478
00479
00480 sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
00481 sumsqr = vec_splat(sumsqr, 3);
00482 vec_ste(sumsqr, 0, &s);
00483
00484 return s;
00485 }
00486
00487 int pix_sum_altivec(uint8_t * pix, int line_size)
00488 {
00489 const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
00490 vector unsigned char perm, *pixv;
00491 vector unsigned char t1;
00492 vector unsigned int sad;
00493 vector signed int sumdiffs;
00494
00495 int i;
00496 int s __attribute__((aligned(16)));
00497
00498 sad = (vector unsigned int)vec_splat_u32(0);
00499
00500 for (i = 0; i < 16; i++) {
00501
00502 perm = vec_lvsl(0, pix);
00503 pixv = (vector unsigned char *) pix;
00504 t1 = vec_perm(pixv[0], pixv[1], perm);
00505
00506
00507 sad = vec_sum4s(t1, sad);
00508
00509 pix += line_size;
00510 }
00511
00512
00513 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
00514 sumdiffs = vec_splat(sumdiffs, 3);
00515 vec_ste(sumdiffs, 0, &s);
00516
00517 return s;
00518 }
00519
00520 void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
00521 {
00522 int i;
00523 vector unsigned char perm, bytes, *pixv;
00524 const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
00525 vector signed short shorts;
00526
00527 for(i=0;i<8;i++)
00528 {
00529
00530
00531
00532 perm = vec_lvsl(0, pixels);
00533 pixv = (vector unsigned char *) pixels;
00534 bytes = vec_perm(pixv[0], pixv[1], perm);
00535
00536
00537 shorts = (vector signed short)vec_mergeh(zero, bytes);
00538
00539
00540 vec_st(shorts, i*16, (vector signed short*)block);
00541
00542 pixels += line_size;
00543 }
00544 }
00545
00546 void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
00547 const uint8_t *s2, int stride)
00548 {
00549 int i;
00550 vector unsigned char perm, bytes, *pixv;
00551 const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
00552 vector signed short shorts1, shorts2;
00553
00554 for(i=0;i<4;i++)
00555 {
00556
00557
00558
00559 perm = vec_lvsl(0, s1);
00560 pixv = (vector unsigned char *) s1;
00561 bytes = vec_perm(pixv[0], pixv[1], perm);
00562
00563
00564 shorts1 = (vector signed short)vec_mergeh(zero, bytes);
00565
00566
00567 perm = vec_lvsl(0, s2);
00568 pixv = (vector unsigned char *) s2;
00569 bytes = vec_perm(pixv[0], pixv[1], perm);
00570
00571
00572 shorts2 = (vector signed short)vec_mergeh(zero, bytes);
00573
00574
00575 shorts1 = vec_sub(shorts1, shorts2);
00576
00577
00578 vec_st(shorts1, 0, (vector signed short*)block);
00579
00580 s1 += stride;
00581 s2 += stride;
00582 block += 8;
00583
00584
00585
00586
00587
00588
00589
00590
00591 perm = vec_lvsl(0, s1);
00592 pixv = (vector unsigned char *) s1;
00593 bytes = vec_perm(pixv[0], pixv[1], perm);
00594
00595
00596 shorts1 = (vector signed short)vec_mergeh(zero, bytes);
00597
00598
00599 perm = vec_lvsl(0, s2);
00600 pixv = (vector unsigned char *) s2;
00601 bytes = vec_perm(pixv[0], pixv[1], perm);
00602
00603
00604 shorts2 = (vector signed short)vec_mergeh(zero, bytes);
00605
00606
00607 shorts1 = vec_sub(shorts1, shorts2);
00608
00609
00610 vec_st(shorts1, 0, (vector signed short*)block);
00611
00612 s1 += stride;
00613 s2 += stride;
00614 block += 8;
00615 }
00616 }
00617
/**
 * Add src to dst byte by byte: dst[i] += src[i] for 0 <= i < w, with each
 * byte wrapping modulo 256.
 *
 * The bulk of the buffer is processed in wide steps (8 bytes in the
 * reference C build, one 16-byte vector otherwise); whatever is left is
 * handled one byte at a time.
 *
 * @param dst destination and first operand
 * @param src second operand
 * @param w   number of bytes to process
 *
 * NOTE(review): the vector path uses vec_ld/vec_st, which access
 * 16-byte-aligned addresses, so it assumes dst and src are 16-byte
 * aligned; confirm against the callers.
 */
void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
    register int i;
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    /* Eight bytes per pass; i is a byte index and must advance by 8. */
    for (i = 0; i + 7 < w; i += 8) {
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
#else
    register vector unsigned char vdst, vsrc;

    /* One vector per pass; i is a byte index used directly as the
     * load/store offset and must advance by 16. */
    for (i = 0; (i + 15) < w; i += 16) {
        vdst = vec_ld(i, (unsigned char*)dst);
        vsrc = vec_ld(i, (unsigned char*)src);
        vdst = vec_add(vsrc, vdst);
        vec_st(vdst, i, (unsigned char*)dst);
    }
#endif
    /* Remaining bytes when w is not a multiple of the wide step. */
    for (; i < w; i++) {
        dst[i] += src[i];
    }
}
00652
00653
00654 void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00655 {
00656 POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
00657 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
00658 int i;
00659
00660 POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
00661
00662 for(i=0; i<h; i++) {
00663 *((uint32_t*)(block)) = LD32(pixels);
00664 *((uint32_t*)(block+4)) = LD32(pixels+4);
00665 *((uint32_t*)(block+8)) = LD32(pixels+8);
00666 *((uint32_t*)(block+12)) = LD32(pixels+12);
00667 pixels+=line_size;
00668 block +=line_size;
00669 }
00670
00671 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
00672
00673 #else
00674 register vector unsigned char pixelsv1, pixelsv2;
00675 register vector unsigned char pixelsv1B, pixelsv2B;
00676 register vector unsigned char pixelsv1C, pixelsv2C;
00677 register vector unsigned char pixelsv1D, pixelsv2D;
00678
00679 register vector unsigned char perm = vec_lvsl(0, pixels);
00680 int i;
00681 register int line_size_2 = line_size << 1;
00682 register int line_size_3 = line_size + line_size_2;
00683 register int line_size_4 = line_size << 2;
00684
00685 POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
00686
00687
00688
00689
00690
00691 #if 0
00692 for(i=0; i<h; i++) {
00693 pixelsv1 = vec_ld(0, (unsigned char*)pixels);
00694 pixelsv2 = vec_ld(16, (unsigned char*)pixels);
00695 vec_st(vec_perm(pixelsv1, pixelsv2, perm),
00696 0, (unsigned char*)block);
00697 pixels+=line_size;
00698 block +=line_size;
00699 }
00700 #else
00701 for(i=0; i<h; i+=4) {
00702 pixelsv1 = vec_ld(0, (unsigned char*)pixels);
00703 pixelsv2 = vec_ld(16, (unsigned char*)pixels);
00704 pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
00705 pixelsv2B = vec_ld(16 + line_size, (unsigned char*)pixels);
00706 pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
00707 pixelsv2C = vec_ld(16 + line_size_2, (unsigned char*)pixels);
00708 pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
00709 pixelsv2D = vec_ld(16 + line_size_3, (unsigned char*)pixels);
00710 vec_st(vec_perm(pixelsv1, pixelsv2, perm),
00711 0, (unsigned char*)block);
00712 vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
00713 line_size, (unsigned char*)block);
00714 vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
00715 line_size_2, (unsigned char*)block);
00716 vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
00717 line_size_3, (unsigned char*)block);
00718 pixels+=line_size_4;
00719 block +=line_size_4;
00720 }
00721 #endif
00722 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
00723
00724 #endif
00725 }
00726
00727
00728 #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
00729 void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00730 {
00731 POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
00732 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
00733 int i;
00734
00735 POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
00736
00737 for(i=0; i<h; i++) {
00738 op_avg(*((uint32_t*)(block)),LD32(pixels));
00739 op_avg(*((uint32_t*)(block+4)),LD32(pixels+4));
00740 op_avg(*((uint32_t*)(block+8)),LD32(pixels+8));
00741 op_avg(*((uint32_t*)(block+12)),LD32(pixels+12));
00742 pixels+=line_size;
00743 block +=line_size;
00744 }
00745
00746 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
00747
00748 #else
00749 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
00750 register vector unsigned char perm = vec_lvsl(0, pixels);
00751 int i;
00752
00753 POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
00754
00755 for(i=0; i<h; i++) {
00756 pixelsv1 = vec_ld(0, (unsigned char*)pixels);
00757 pixelsv2 = vec_ld(16, (unsigned char*)pixels);
00758 blockv = vec_ld(0, block);
00759 pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
00760 blockv = vec_avg(blockv,pixelsv);
00761 vec_st(blockv, 0, (unsigned char*)block);
00762 pixels+=line_size;
00763 block +=line_size;
00764 }
00765
00766 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
00767
00768 #endif
00769 }
00770
00771
00772 void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
00773 {
00774 POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
00775 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
00776 int i;
00777 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
00778 for (i = 0; i < h; i++) {
00779 *((uint32_t *) (block)) =
00780 (((*((uint32_t *) (block))) |
00781 ((((const struct unaligned_32 *) (pixels))->l))) -
00782 ((((*((uint32_t *) (block))) ^
00783 ((((const struct unaligned_32 *) (pixels))->
00784 l))) & 0xFEFEFEFEUL) >> 1));
00785 *((uint32_t *) (block + 4)) =
00786 (((*((uint32_t *) (block + 4))) |
00787 ((((const struct unaligned_32 *) (pixels + 4))->l))) -
00788 ((((*((uint32_t *) (block + 4))) ^
00789 ((((const struct unaligned_32 *) (pixels +
00790 4))->
00791 l))) & 0xFEFEFEFEUL) >> 1));
00792 pixels += line_size;
00793 block += line_size;
00794 }
00795 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
00796
00797 #else
00798 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
00799 int i;
00800
00801 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
00802
00803 for (i = 0; i < h; i++) {
00804
00805
00806
00807
00808 int rightside = ((unsigned long)block & 0x0000000F);
00809
00810 blockv = vec_ld(0, block);
00811 pixelsv1 = vec_ld(0, (unsigned char*)pixels);
00812 pixelsv2 = vec_ld(16, (unsigned char*)pixels);
00813 pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
00814
00815 if (rightside)
00816 {
00817 pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
00818 }
00819 else
00820 {
00821 pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
00822 }
00823
00824 blockv = vec_avg(blockv, pixelsv);
00825
00826 vec_st(blockv, 0, block);
00827
00828 pixels += line_size;
00829 block += line_size;
00830 }
00831
00832 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
00833
00834 #endif
00835 }
00836
00837
00838 void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00839 {
00840 POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
00841 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
00842 int j;
00843 POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
00844 for (j = 0; j < 2; j++) {
00845 int i;
00846 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
00847 const uint32_t b =
00848 (((const struct unaligned_32 *) (pixels + 1))->l);
00849 uint32_t l0 =
00850 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
00851 uint32_t h0 =
00852 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
00853 uint32_t l1, h1;
00854 pixels += line_size;
00855 for (i = 0; i < h; i += 2) {
00856 uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
00857 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
00858 l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
00859 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
00860 *((uint32_t *) block) =
00861 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
00862 pixels += line_size;
00863 block += line_size;
00864 a = (((const struct unaligned_32 *) (pixels))->l);
00865 b = (((const struct unaligned_32 *) (pixels + 1))->l);
00866 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
00867 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
00868 *((uint32_t *) block) =
00869 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
00870 pixels += line_size;
00871 block += line_size;
00872 } pixels += 4 - line_size * (h + 1);
00873 block += 4 - line_size * h;
00874 }
00875
00876 POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
00877
00878 #else
00879 register int i;
00880 register vector unsigned char
00881 pixelsv1, pixelsv2,
00882 pixelsavg;
00883 register vector unsigned char
00884 blockv, temp1, temp2;
00885 register vector unsigned short
00886 pixelssum1, pixelssum2, temp3;
00887 register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
00888 register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
00889
00890 temp1 = vec_ld(0, pixels);
00891 temp2 = vec_ld(16, pixels);
00892 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
00893 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
00894 {
00895 pixelsv2 = temp2;
00896 }
00897 else
00898 {
00899 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
00900 }
00901 pixelsv1 = vec_mergeh(vczero, pixelsv1);
00902 pixelsv2 = vec_mergeh(vczero, pixelsv2);
00903 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
00904 (vector unsigned short)pixelsv2);
00905 pixelssum1 = vec_add(pixelssum1, vctwo);
00906
00907 POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
00908 for (i = 0; i < h ; i++) {
00909 int rightside = ((unsigned long)block & 0x0000000F);
00910 blockv = vec_ld(0, block);
00911
00912 temp1 = vec_ld(line_size, pixels);
00913 temp2 = vec_ld(line_size + 16, pixels);
00914 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
00915 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
00916 {
00917 pixelsv2 = temp2;
00918 }
00919 else
00920 {
00921 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
00922 }
00923
00924 pixelsv1 = vec_mergeh(vczero, pixelsv1);
00925 pixelsv2 = vec_mergeh(vczero, pixelsv2);
00926 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
00927 (vector unsigned short)pixelsv2);
00928 temp3 = vec_add(pixelssum1, pixelssum2);
00929 temp3 = vec_sra(temp3, vctwo);
00930 pixelssum1 = vec_add(pixelssum2, vctwo);
00931 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
00932
00933 if (rightside)
00934 {
00935 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
00936 }
00937 else
00938 {
00939 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
00940 }
00941
00942 vec_st(blockv, 0, block);
00943
00944 block += line_size;
00945 pixels += line_size;
00946 }
00947
00948 POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
00949 #endif
00950 }
00951
00952
00953 void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00954 {
00955 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
00956 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
00957 int j;
00958 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
00959 for (j = 0; j < 2; j++) {
00960 int i;
00961 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
00962 const uint32_t b =
00963 (((const struct unaligned_32 *) (pixels + 1))->l);
00964 uint32_t l0 =
00965 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
00966 uint32_t h0 =
00967 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
00968 uint32_t l1, h1;
00969 pixels += line_size;
00970 for (i = 0; i < h; i += 2) {
00971 uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
00972 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
00973 l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
00974 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
00975 *((uint32_t *) block) =
00976 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
00977 pixels += line_size;
00978 block += line_size;
00979 a = (((const struct unaligned_32 *) (pixels))->l);
00980 b = (((const struct unaligned_32 *) (pixels + 1))->l);
00981 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
00982 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
00983 *((uint32_t *) block) =
00984 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
00985 pixels += line_size;
00986 block += line_size;
00987 } pixels += 4 - line_size * (h + 1);
00988 block += 4 - line_size * h;
00989 }
00990
00991 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
00992
00993 #else
00994 register int i;
00995 register vector unsigned char
00996 pixelsv1, pixelsv2,
00997 pixelsavg;
00998 register vector unsigned char
00999 blockv, temp1, temp2;
01000 register vector unsigned short
01001 pixelssum1, pixelssum2, temp3;
01002 register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
01003 register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
01004 register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
01005
01006 temp1 = vec_ld(0, pixels);
01007 temp2 = vec_ld(16, pixels);
01008 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
01009 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
01010 {
01011 pixelsv2 = temp2;
01012 }
01013 else
01014 {
01015 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
01016 }
01017 pixelsv1 = vec_mergeh(vczero, pixelsv1);
01018 pixelsv2 = vec_mergeh(vczero, pixelsv2);
01019 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
01020 (vector unsigned short)pixelsv2);
01021 pixelssum1 = vec_add(pixelssum1, vcone);
01022
01023 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
01024 for (i = 0; i < h ; i++) {
01025 int rightside = ((unsigned long)block & 0x0000000F);
01026 blockv = vec_ld(0, block);
01027
01028 temp1 = vec_ld(line_size, pixels);
01029 temp2 = vec_ld(line_size + 16, pixels);
01030 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
01031 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
01032 {
01033 pixelsv2 = temp2;
01034 }
01035 else
01036 {
01037 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
01038 }
01039
01040 pixelsv1 = vec_mergeh(vczero, pixelsv1);
01041 pixelsv2 = vec_mergeh(vczero, pixelsv2);
01042 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
01043 (vector unsigned short)pixelsv2);
01044 temp3 = vec_add(pixelssum1, pixelssum2);
01045 temp3 = vec_sra(temp3, vctwo);
01046 pixelssum1 = vec_add(pixelssum2, vcone);
01047 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
01048
01049 if (rightside)
01050 {
01051 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
01052 }
01053 else
01054 {
01055 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
01056 }
01057
01058 vec_st(blockv, 0, block);
01059
01060 block += line_size;
01061 pixels += line_size;
01062 }
01063
01064 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
01065 #endif
01066 }
01067
01068
01069 void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
01070 {
01071 POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
01072 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
01073 int j;
01074 POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
01075 for (j = 0; j < 4; j++) {
01076 int i;
01077 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
01078 const uint32_t b =
01079 (((const struct unaligned_32 *) (pixels + 1))->l);
01080 uint32_t l0 =
01081 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
01082 uint32_t h0 =
01083 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
01084 uint32_t l1, h1;
01085 pixels += line_size;
01086 for (i = 0; i < h; i += 2) {
01087 uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
01088 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
01089 l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
01090 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
01091 *((uint32_t *) block) =
01092 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
01093 pixels += line_size;
01094 block += line_size;
01095 a = (((const struct unaligned_32 *) (pixels))->l);
01096 b = (((const struct unaligned_32 *) (pixels + 1))->l);
01097 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
01098 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
01099 *((uint32_t *) block) =
01100 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
01101 pixels += line_size;
01102 block += line_size;
01103 } pixels += 4 - line_size * (h + 1);
01104 block += 4 - line_size * h;
01105 }
01106
01107 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
01108
01109 #else
01110 register int i;
01111 register vector unsigned char
01112 pixelsv1, pixelsv2, pixelsv3, pixelsv4;
01113 register vector unsigned char
01114 blockv, temp1, temp2;
01115 register vector unsigned short
01116 pixelssum1, pixelssum2, temp3,
01117 pixelssum3, pixelssum4, temp4;
01118 register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
01119 register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
01120
01121 POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
01122
01123 temp1 = vec_ld(0, pixels);
01124 temp2 = vec_ld(16, pixels);
01125 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
01126 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
01127 {
01128 pixelsv2 = temp2;
01129 }
01130 else
01131 {
01132 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
01133 }
01134 pixelsv3 = vec_mergel(vczero, pixelsv1);
01135 pixelsv4 = vec_mergel(vczero, pixelsv2);
01136 pixelsv1 = vec_mergeh(vczero, pixelsv1);
01137 pixelsv2 = vec_mergeh(vczero, pixelsv2);
01138 pixelssum3 = vec_add((vector unsigned short)pixelsv3,
01139 (vector unsigned short)pixelsv4);
01140 pixelssum3 = vec_add(pixelssum3, vctwo);
01141 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
01142 (vector unsigned short)pixelsv2);
01143 pixelssum1 = vec_add(pixelssum1, vctwo);
01144
01145 for (i = 0; i < h ; i++) {
01146 blockv = vec_ld(0, block);
01147
01148 temp1 = vec_ld(line_size, pixels);
01149 temp2 = vec_ld(line_size + 16, pixels);
01150 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
01151 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
01152 {
01153 pixelsv2 = temp2;
01154 }
01155 else
01156 {
01157 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
01158 }
01159
01160 pixelsv3 = vec_mergel(vczero, pixelsv1);
01161 pixelsv4 = vec_mergel(vczero, pixelsv2);
01162 pixelsv1 = vec_mergeh(vczero, pixelsv1);
01163 pixelsv2 = vec_mergeh(vczero, pixelsv2);
01164
01165 pixelssum4 = vec_add((vector unsigned short)pixelsv3,
01166 (vector unsigned short)pixelsv4);
01167 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
01168 (vector unsigned short)pixelsv2);
01169 temp4 = vec_add(pixelssum3, pixelssum4);
01170 temp4 = vec_sra(temp4, vctwo);
01171 temp3 = vec_add(pixelssum1, pixelssum2);
01172 temp3 = vec_sra(temp3, vctwo);
01173
01174 pixelssum3 = vec_add(pixelssum4, vctwo);
01175 pixelssum1 = vec_add(pixelssum2, vctwo);
01176
01177 blockv = vec_packsu(temp3, temp4);
01178
01179 vec_st(blockv, 0, block);
01180
01181 block += line_size;
01182 pixels += line_size;
01183 }
01184
01185 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
01186 #endif
01187 }
01188
01189
01190 void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
01191 {
01192 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
01193 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
01194 int j;
01195 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
01196 for (j = 0; j < 4; j++) {
01197 int i;
01198 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
01199 const uint32_t b =
01200 (((const struct unaligned_32 *) (pixels + 1))->l);
01201 uint32_t l0 =
01202 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
01203 uint32_t h0 =
01204 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
01205 uint32_t l1, h1;
01206 pixels += line_size;
01207 for (i = 0; i < h; i += 2) {
01208 uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
01209 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
01210 l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
01211 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
01212 *((uint32_t *) block) =
01213 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
01214 pixels += line_size;
01215 block += line_size;
01216 a = (((const struct unaligned_32 *) (pixels))->l);
01217 b = (((const struct unaligned_32 *) (pixels + 1))->l);
01218 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
01219 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
01220 *((uint32_t *) block) =
01221 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
01222 pixels += line_size;
01223 block += line_size;
01224 } pixels += 4 - line_size * (h + 1);
01225 block += 4 - line_size * h;
01226 }
01227
01228 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
01229
01230 #else
01231 register int i;
01232 register vector unsigned char
01233 pixelsv1, pixelsv2, pixelsv3, pixelsv4;
01234 register vector unsigned char
01235 blockv, temp1, temp2;
01236 register vector unsigned short
01237 pixelssum1, pixelssum2, temp3,
01238 pixelssum3, pixelssum4, temp4;
01239 register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
01240 register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
01241 register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
01242
01243 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
01244
01245 temp1 = vec_ld(0, pixels);
01246 temp2 = vec_ld(16, pixels);
01247 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
01248 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
01249 {
01250 pixelsv2 = temp2;
01251 }
01252 else
01253 {
01254 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
01255 }
01256 pixelsv3 = vec_mergel(vczero, pixelsv1);
01257 pixelsv4 = vec_mergel(vczero, pixelsv2);
01258 pixelsv1 = vec_mergeh(vczero, pixelsv1);
01259 pixelsv2 = vec_mergeh(vczero, pixelsv2);
01260 pixelssum3 = vec_add((vector unsigned short)pixelsv3,
01261 (vector unsigned short)pixelsv4);
01262 pixelssum3 = vec_add(pixelssum3, vcone);
01263 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
01264 (vector unsigned short)pixelsv2);
01265 pixelssum1 = vec_add(pixelssum1, vcone);
01266
01267 for (i = 0; i < h ; i++) {
01268 blockv = vec_ld(0, block);
01269
01270 temp1 = vec_ld(line_size, pixels);
01271 temp2 = vec_ld(line_size + 16, pixels);
01272 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
01273 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
01274 {
01275 pixelsv2 = temp2;
01276 }
01277 else
01278 {
01279 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
01280 }
01281
01282 pixelsv3 = vec_mergel(vczero, pixelsv1);
01283 pixelsv4 = vec_mergel(vczero, pixelsv2);
01284 pixelsv1 = vec_mergeh(vczero, pixelsv1);
01285 pixelsv2 = vec_mergeh(vczero, pixelsv2);
01286
01287 pixelssum4 = vec_add((vector unsigned short)pixelsv3,
01288 (vector unsigned short)pixelsv4);
01289 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
01290 (vector unsigned short)pixelsv2);
01291 temp4 = vec_add(pixelssum3, pixelssum4);
01292 temp4 = vec_sra(temp4, vctwo);
01293 temp3 = vec_add(pixelssum1, pixelssum2);
01294 temp3 = vec_sra(temp3, vctwo);
01295
01296 pixelssum3 = vec_add(pixelssum4, vcone);
01297 pixelssum1 = vec_add(pixelssum2, vcone);
01298
01299 blockv = vec_packsu(temp3, temp4);
01300
01301 vec_st(blockv, 0, block);
01302
01303 block += line_size;
01304 pixels += line_size;
01305 }
01306
01307 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
01308 #endif
01309 }
01310
01311 #ifdef CONFIG_DARWIN
01312 int hadamard8_diff8x8_altivec( void *s, uint8_t *dst, uint8_t *src, int stride, int h){
01313 POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
01314 int sum;
01315 POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
01316 register const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0);
01317 register vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
01318 {
01319 register const_vector signed short vprod1 = (const_vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1);
01320 register const_vector signed short vprod2 = (const_vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1);
01321 register const_vector signed short vprod3 = (const_vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1);
01322 register const_vector unsigned char perm1 = (const_vector unsigned char)
01323 AVV(0x02, 0x03, 0x00, 0x01,
01324 0x06, 0x07, 0x04, 0x05,
01325 0x0A, 0x0B, 0x08, 0x09,
01326 0x0E, 0x0F, 0x0C, 0x0D);
01327 register const_vector unsigned char perm2 = (const_vector unsigned char)
01328 AVV(0x04, 0x05, 0x06, 0x07,
01329 0x00, 0x01, 0x02, 0x03,
01330 0x0C, 0x0D, 0x0E, 0x0F,
01331 0x08, 0x09, 0x0A, 0x0B);
01332 register const_vector unsigned char perm3 = (const_vector unsigned char)
01333 AVV(0x08, 0x09, 0x0A, 0x0B,
01334 0x0C, 0x0D, 0x0E, 0x0F,
01335 0x00, 0x01, 0x02, 0x03,
01336 0x04, 0x05, 0x06, 0x07);
01337
01338 #define ONEITERBUTTERFLY(i, res) \
01339 { \
01340 register vector unsigned char src1, src2, srcO; \
01341 register vector unsigned char dst1, dst2, dstO; \
01342 src1 = vec_ld(stride * i, src); \
01343 if ((((stride * i) + (unsigned long)src) & 0x0000000F) > 8) \
01344 src2 = vec_ld((stride * i) + 16, src); \
01345 srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
01346 dst1 = vec_ld(stride * i, dst); \
01347 if ((((stride * i) + (unsigned long)dst) & 0x0000000F) > 8) \
01348 dst2 = vec_ld((stride * i) + 16, dst); \
01349 dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
01350 \
01351 \
01352 register vector signed short srcV = \
01353 (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
01354 register vector signed short dstV = \
01355 (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
01356 \
01357 register vector signed short but0 = vec_sub(srcV, dstV); \
01358 register vector signed short op1 = vec_perm(but0, but0, perm1); \
01359 register vector signed short but1 = vec_mladd(but0, vprod1, op1); \
01360 register vector signed short op2 = vec_perm(but1, but1, perm2); \
01361 register vector signed short but2 = vec_mladd(but1, vprod2, op2); \
01362 register vector signed short op3 = vec_perm(but2, but2, perm3); \
01363 res = vec_mladd(but2, vprod3, op3); \
01364 }
01365 ONEITERBUTTERFLY(0, temp0);
01366 ONEITERBUTTERFLY(1, temp1);
01367 ONEITERBUTTERFLY(2, temp2);
01368 ONEITERBUTTERFLY(3, temp3);
01369 ONEITERBUTTERFLY(4, temp4);
01370 ONEITERBUTTERFLY(5, temp5);
01371 ONEITERBUTTERFLY(6, temp6);
01372 ONEITERBUTTERFLY(7, temp7);
01373 }
01374 #undef ONEITERBUTTERFLY
01375 {
01376 register vector signed int vsum;
01377 register vector signed short line0 = vec_add(temp0, temp1);
01378 register vector signed short line1 = vec_sub(temp0, temp1);
01379 register vector signed short line2 = vec_add(temp2, temp3);
01380 register vector signed short line3 = vec_sub(temp2, temp3);
01381 register vector signed short line4 = vec_add(temp4, temp5);
01382 register vector signed short line5 = vec_sub(temp4, temp5);
01383 register vector signed short line6 = vec_add(temp6, temp7);
01384 register vector signed short line7 = vec_sub(temp6, temp7);
01385
01386 register vector signed short line0B = vec_add(line0, line2);
01387 register vector signed short line2B = vec_sub(line0, line2);
01388 register vector signed short line1B = vec_add(line1, line3);
01389 register vector signed short line3B = vec_sub(line1, line3);
01390 register vector signed short line4B = vec_add(line4, line6);
01391 register vector signed short line6B = vec_sub(line4, line6);
01392 register vector signed short line5B = vec_add(line5, line7);
01393 register vector signed short line7B = vec_sub(line5, line7);
01394
01395 register vector signed short line0C = vec_add(line0B, line4B);
01396 register vector signed short line4C = vec_sub(line0B, line4B);
01397 register vector signed short line1C = vec_add(line1B, line5B);
01398 register vector signed short line5C = vec_sub(line1B, line5B);
01399 register vector signed short line2C = vec_add(line2B, line6B);
01400 register vector signed short line6C = vec_sub(line2B, line6B);
01401 register vector signed short line3C = vec_add(line3B, line7B);
01402 register vector signed short line7C = vec_sub(line3B, line7B);
01403
01404 vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
01405 vsum = vec_sum4s(vec_abs(line1C), vsum);
01406 vsum = vec_sum4s(vec_abs(line2C), vsum);
01407 vsum = vec_sum4s(vec_abs(line3C), vsum);
01408 vsum = vec_sum4s(vec_abs(line4C), vsum);
01409 vsum = vec_sum4s(vec_abs(line5C), vsum);
01410 vsum = vec_sum4s(vec_abs(line6C), vsum);
01411 vsum = vec_sum4s(vec_abs(line7C), vsum);
01412 vsum = vec_sums(vsum, (vector signed int)vzero);
01413 vsum = vec_splat(vsum, 3);
01414 vec_ste(vsum, 0, &sum);
01415 }
01416 POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
01417 return sum;
01418 }
01419
01420
01421
01422
01423
01424
01425
01426
01427
01428
01429
01430
01431
01432
01433
01434
01435
01436
01437
01438
01439
01440
01441
/*
 * Sum of the absolute values of the 8x8 Hadamard transforms of (src - dst)
 * for a 16-pixel-wide, 8-row area, i.e. two 8x8 blocks side by side.
 *
 * s and h are not used by the body; eight rows are always processed.
 * Returns the accumulated sum as an int.
 *
 * Every vector variable is bound to a fixed register with asm("vN"), and
 * the macro temporaries deliberately reuse registers (for instance src1,
 * srcO and but1 all live in v22).  Statement order therefore matters; do
 * not reorder or rename without re-checking the register lifetimes.
 */
01442 static int hadamard8_diff16x8_altivec( void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
01443 int sum;
/* Row results for the first 8 pixels of each row, pinned to v0..v7. */
01444 register vector signed short
01445 temp0 asm ("v0"),
01446 temp1 asm ("v1"),
01447 temp2 asm ("v2"),
01448 temp3 asm ("v3"),
01449 temp4 asm ("v4"),
01450 temp5 asm ("v5"),
01451 temp6 asm ("v6"),
01452 temp7 asm ("v7");
/* Row results for the last 8 pixels of each row, pinned to v8..v15. */
01453 register vector signed short
01454 temp0S asm ("v8"),
01455 temp1S asm ("v9"),
01456 temp2S asm ("v10"),
01457 temp3S asm ("v11"),
01458 temp4S asm ("v12"),
01459 temp5S asm ("v13"),
01460 temp6S asm ("v14"),
01461 temp7S asm ("v15");
01462 register const_vector unsigned char vzero asm ("v31")= (const_vector unsigned char)vec_splat_u8(0);
01463 {
/* Sign patterns for the three horizontal butterfly stages. */
01464 register const_vector signed short vprod1 asm ("v16")= (const_vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1);
01465 register const_vector signed short vprod2 asm ("v17")= (const_vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1);
01466 register const_vector signed short vprod3 asm ("v18")= (const_vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1);
/* perm1 swaps neighbouring 16-bit elements. */
01467 register const_vector unsigned char perm1 asm ("v19")= (const_vector unsigned char)
01468 AVV(0x02, 0x03, 0x00, 0x01,
01469 0x06, 0x07, 0x04, 0x05,
01470 0x0A, 0x0B, 0x08, 0x09,
01471 0x0E, 0x0F, 0x0C, 0x0D);
/* perm2 swaps neighbouring 32-bit pairs. */
01472 register const_vector unsigned char perm2 asm ("v20")= (const_vector unsigned char)
01473 AVV(0x04, 0x05, 0x06, 0x07,
01474 0x00, 0x01, 0x02, 0x03,
01475 0x0C, 0x0D, 0x0E, 0x0F,
01476 0x08, 0x09, 0x0A, 0x0B);
/* perm3 swaps the two 64-bit halves. */
01477 register const_vector unsigned char perm3 asm ("v21")= (const_vector unsigned char)
01478 AVV(0x08, 0x09, 0x0A, 0x0B,
01479 0x0C, 0x0D, 0x0E, 0x0F,
01480 0x00, 0x01, 0x02, 0x03,
01481 0x04, 0x05, 0x06, 0x07);
01482
/*
 * Horizontal 8-point transform of row i of (src - dst): res1 receives the
 * result for the first 8 pixels, res2 for the last 8.  Both aligned vectors
 * covering the row are always loaded here.
 */
01483 #define ONEITERBUTTERFLY(i, res1, res2) \
01484 { \
/* load row i of src and dst and realign it with vec_lvsl/vec_perm */ \
01485 register vector unsigned char src1 asm ("v22"), src2 asm ("v23"); \
01486 register vector unsigned char dst1 asm ("v24"), dst2 asm ("v25"); \
01487 src1 = vec_ld(stride * i, src); \
01488 src2 = vec_ld((stride * i) + 16, src); \
01489 register vector unsigned char srcO asm ("v22") = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
01490 dst1 = vec_ld(stride * i, dst); \
01491 dst2 = vec_ld((stride * i) + 16, dst); \
01492 register vector unsigned char dstO asm ("v23") = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
01493 \
/* zero-extend to 16 bits: V = first 8 bytes, W = last 8 bytes */ \
01494 register vector signed short srcV asm ("v24") = \
01495 (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
01496 register vector signed short dstV asm ("v25") = \
01497 (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
01498 register vector signed short srcW asm ("v26") = \
01499 (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)srcO); \
01500 register vector signed short dstW asm ("v27") = \
01501 (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)dstO); \
01502 \
/* three stages, each computing x * (+1/-1) + swapped partner */ \
01503 register vector signed short but0 asm ("v28") = vec_sub(srcV, dstV); \
01504 register vector signed short but0S asm ("v29") = vec_sub(srcW, dstW); \
01505 register vector signed short op1 asm ("v30") = vec_perm(but0, but0, perm1); \
01506 register vector signed short but1 asm ("v22") = vec_mladd(but0, vprod1, op1); \
01507 register vector signed short op1S asm ("v23") = vec_perm(but0S, but0S, perm1); \
01508 register vector signed short but1S asm ("v24") = vec_mladd(but0S, vprod1, op1S); \
01509 register vector signed short op2 asm ("v25") = vec_perm(but1, but1, perm2); \
01510 register vector signed short but2 asm ("v26") = vec_mladd(but1, vprod2, op2); \
01511 register vector signed short op2S asm ("v27") = vec_perm(but1S, but1S, perm2); \
01512 register vector signed short but2S asm ("v28") = vec_mladd(but1S, vprod2, op2S); \
01513 register vector signed short op3 asm ("v29") = vec_perm(but2, but2, perm3); \
01514 res1 = vec_mladd(but2, vprod3, op3); \
01515 register vector signed short op3S asm ("v30") = vec_perm(but2S, but2S, perm3); \
01516 res2 = vec_mladd(but2S, vprod3, op3S); \
01517 }
01518 ONEITERBUTTERFLY(0, temp0, temp0S);
01519 ONEITERBUTTERFLY(1, temp1, temp1S);
01520 ONEITERBUTTERFLY(2, temp2, temp2S);
01521 ONEITERBUTTERFLY(3, temp3, temp3S);
01522 ONEITERBUTTERFLY(4, temp4, temp4S);
01523 ONEITERBUTTERFLY(5, temp5, temp5S);
01524 ONEITERBUTTERFLY(6, temp6, temp6S);
01525 ONEITERBUTTERFLY(7, temp7, temp7S);
01526 }
01527 #undef ONEITERBUTTERFLY
01528 {
01529 register vector signed int vsum;
/* Vertical transform of the first 8 columns: three add/sub stages. */
01530 register vector signed short line0 = vec_add(temp0, temp1);
01531 register vector signed short line1 = vec_sub(temp0, temp1);
01532 register vector signed short line2 = vec_add(temp2, temp3);
01533 register vector signed short line3 = vec_sub(temp2, temp3);
01534 register vector signed short line4 = vec_add(temp4, temp5);
01535 register vector signed short line5 = vec_sub(temp4, temp5);
01536 register vector signed short line6 = vec_add(temp6, temp7);
01537 register vector signed short line7 = vec_sub(temp6, temp7);
01538
01539 register vector signed short line0B = vec_add(line0, line2);
01540 register vector signed short line2B = vec_sub(line0, line2);
01541 register vector signed short line1B = vec_add(line1, line3);
01542 register vector signed short line3B = vec_sub(line1, line3);
01543 register vector signed short line4B = vec_add(line4, line6);
01544 register vector signed short line6B = vec_sub(line4, line6);
01545 register vector signed short line5B = vec_add(line5, line7);
01546 register vector signed short line7B = vec_sub(line5, line7);
01547
01548 register vector signed short line0C = vec_add(line0B, line4B);
01549 register vector signed short line4C = vec_sub(line0B, line4B);
01550 register vector signed short line1C = vec_add(line1B, line5B);
01551 register vector signed short line5C = vec_sub(line1B, line5B);
01552 register vector signed short line2C = vec_add(line2B, line6B);
01553 register vector signed short line6C = vec_sub(line2B, line6B);
01554 register vector signed short line3C = vec_add(line3B, line7B);
01555 register vector signed short line7C = vec_sub(line3B, line7B);
01556
/* Accumulate |coefficient| of the first block into 32-bit partial sums. */
01557 vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
01558 vsum = vec_sum4s(vec_abs(line1C), vsum);
01559 vsum = vec_sum4s(vec_abs(line2C), vsum);
01560 vsum = vec_sum4s(vec_abs(line3C), vsum);
01561 vsum = vec_sum4s(vec_abs(line4C), vsum);
01562 vsum = vec_sum4s(vec_abs(line5C), vsum);
01563 vsum = vec_sum4s(vec_abs(line6C), vsum);
01564 vsum = vec_sum4s(vec_abs(line7C), vsum);
01565
/* Same vertical transform for the last 8 columns (the *S values). */
01566 register vector signed short line0S = vec_add(temp0S, temp1S);
01567 register vector signed short line1S = vec_sub(temp0S, temp1S);
01568 register vector signed short line2S = vec_add(temp2S, temp3S);
01569 register vector signed short line3S = vec_sub(temp2S, temp3S);
01570 register vector signed short line4S = vec_add(temp4S, temp5S);
01571 register vector signed short line5S = vec_sub(temp4S, temp5S);
01572 register vector signed short line6S = vec_add(temp6S, temp7S);
01573 register vector signed short line7S = vec_sub(temp6S, temp7S);
01574
01575 register vector signed short line0BS = vec_add(line0S, line2S);
01576 register vector signed short line2BS = vec_sub(line0S, line2S);
01577 register vector signed short line1BS = vec_add(line1S, line3S);
01578 register vector signed short line3BS = vec_sub(line1S, line3S);
01579 register vector signed short line4BS = vec_add(line4S, line6S);
01580 register vector signed short line6BS = vec_sub(line4S, line6S);
01581 register vector signed short line5BS = vec_add(line5S, line7S);
01582 register vector signed short line7BS = vec_sub(line5S, line7S);
01583
01584 register vector signed short line0CS = vec_add(line0BS, line4BS);
01585 register vector signed short line4CS = vec_sub(line0BS, line4BS);
01586 register vector signed short line1CS = vec_add(line1BS, line5BS);
01587 register vector signed short line5CS = vec_sub(line1BS, line5BS);
01588 register vector signed short line2CS = vec_add(line2BS, line6BS);
01589 register vector signed short line6CS = vec_sub(line2BS, line6BS);
01590 register vector signed short line3CS = vec_add(line3BS, line7BS);
01591 register vector signed short line7CS = vec_sub(line3BS, line7BS);
01592
/* Keep accumulating into the same partial sums for the second block. */
01593 vsum = vec_sum4s(vec_abs(line0CS), vsum);
01594 vsum = vec_sum4s(vec_abs(line1CS), vsum);
01595 vsum = vec_sum4s(vec_abs(line2CS), vsum);
01596 vsum = vec_sum4s(vec_abs(line3CS), vsum);
01597 vsum = vec_sum4s(vec_abs(line4CS), vsum);
01598 vsum = vec_sum4s(vec_abs(line5CS), vsum);
01599 vsum = vec_sum4s(vec_abs(line6CS), vsum);
01600 vsum = vec_sum4s(vec_abs(line7CS), vsum);
/* Fold the partial sums into element 3, broadcast it, store one element. */
01601 vsum = vec_sums(vsum, (vector signed int)vzero);
01602 vsum = vec_splat(vsum, 3);
01603 vec_ste(vsum, 0, &sum);
01604 }
01605 return sum;
01606 }
01607
01608 int hadamard8_diff16_altivec( void *s, uint8_t *dst, uint8_t *src, int stride, int h){
01609 POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1);
01610 int score;
01611 POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1);
01612 score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
01613 if (h==16) {
01614 dst += 8*stride;
01615 src += 8*stride;
01616 score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
01617 }
01618 POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
01619 return score;
01620 }
01621 #endif //CONFIG_DARWIN
01622
01623 int has_altivec(void)
01624 {
01625 #ifdef __AMIGAOS4__
01626 ULONG result = 0;
01627 extern struct ExecIFace *IExec;
01628
01629 IExec->GetCPUInfoTags(GCIT_VectorUnit, &result, TAG_DONE);
01630 if (result == VECTORTYPE_ALTIVEC) return 1;
01631 return 0;
01632 #else
01633
01634 #ifdef CONFIG_DARWIN
01635 int sels[2] = {CTL_HW, HW_VECTORUNIT};
01636 int has_vu = 0;
01637 size_t len = sizeof(has_vu);
01638 int err;
01639
01640 err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
01641
01642 if (err == 0) return (has_vu != 0);
01643 #else
01644
01645
01646 {
01647 signal (SIGILL, sigill_handler);
01648 if (sigsetjmp (jmpbuf, 1)) {
01649 signal (SIGILL, SIG_DFL);
01650 } else {
01651 canjump = 1;
01652
01653 asm volatile ("mtspr 256, %0\n\t"
01654 "vand %%v0, %%v0, %%v0"
01655 :
01656 : "r" (-1));
01657
01658 signal (SIGILL, SIG_DFL);
01659 return 1;
01660 }
01661 }
01662 #endif
01663 return 0;
01664 #endif
01665 }
01666
01667
01668 void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
01669 {
01670 POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1);
01671 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
01672
01673 int j;
01674 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
01675 for (j = 0; j < 2; j++) {
01676 int i;
01677 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
01678 const uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
01679 uint32_t l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
01680 uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
01681 uint32_t l1, h1;
01682 pixels += line_size;
01683 for (i = 0; i < h; i += 2) {
01684 uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
01685 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
01686 l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
01687 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
01688 *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
01689 pixels += line_size;
01690 block += line_size;
01691 a = (((const struct unaligned_32 *) (pixels))->l);
01692 b = (((const struct unaligned_32 *) (pixels + 1))->l);
01693 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
01694 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
01695 *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
01696 pixels += line_size;
01697 block += line_size;
01698 } pixels += 4 - line_size * (h + 1);
01699 block += 4 - line_size * h;
01700 }
01701 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
01702 #else
01703 register int i;
01704 register vector unsigned char
01705 pixelsv1, pixelsv2,
01706 pixelsavg;
01707 register vector unsigned char
01708 blockv, temp1, temp2, blocktemp;
01709 register vector unsigned short
01710 pixelssum1, pixelssum2, temp3;
01711 register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
01712 register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
01713
01714 temp1 = vec_ld(0, pixels);
01715 temp2 = vec_ld(16, pixels);
01716 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
01717 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
01718 {
01719 pixelsv2 = temp2;
01720 }
01721 else
01722 {
01723 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
01724 }
01725 pixelsv1 = vec_mergeh(vczero, pixelsv1);
01726 pixelsv2 = vec_mergeh(vczero, pixelsv2);
01727 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
01728 (vector unsigned short)pixelsv2);
01729 pixelssum1 = vec_add(pixelssum1, vctwo);
01730
01731 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
01732 for (i = 0; i < h ; i++) {
01733 int rightside = ((unsigned long)block & 0x0000000F);
01734 blockv = vec_ld(0, block);
01735
01736 temp1 = vec_ld(line_size, pixels);
01737 temp2 = vec_ld(line_size + 16, pixels);
01738 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
01739 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
01740 {
01741 pixelsv2 = temp2;
01742 }
01743 else
01744 {
01745 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
01746 }
01747
01748 pixelsv1 = vec_mergeh(vczero, pixelsv1);
01749 pixelsv2 = vec_mergeh(vczero, pixelsv2);
01750 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
01751 (vector unsigned short)pixelsv2);
01752 temp3 = vec_add(pixelssum1, pixelssum2);
01753 temp3 = vec_sra(temp3, vctwo);
01754 pixelssum1 = vec_add(pixelssum2, vctwo);
01755 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
01756
01757 if (rightside)
01758 {
01759 blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
01760 }
01761 else
01762 {
01763 blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
01764 }
01765
01766 blockv = vec_avg(blocktemp, blockv);
01767 vec_st(blockv, 0, block);
01768
01769 block += line_size;
01770 pixels += line_size;
01771 }
01772
01773 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
01774 #endif
01775 }