00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #include "../common.h"
00026 #include "../dsputil.h"
00027
00028 #include "mmx.h"
00029
/* Wrapper around the GCC `aligned` attribute; applied to the static
 * coefficient and rounder tables in this file. */
00030 #define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))
00031
/* Arithmetic right-shift counts: ROW_SHIFT is used with psrad on the 32-bit
 * row-pass results, COL_SHIFT with psraw on the 16-bit column-pass results. */
00032 #define ROW_SHIFT 11
00033 #define COL_SHIFT 6
00034
/* round(bias):   (bias + 0.5) scaled by 2^ROW_SHIFT and truncated to int.
 * rounder(bias): brace initialiser holding that value twice, i.e. the
 *                contents of one 8-byte int32_t[2] rounder array.
 * NOTE(review): `round` has the same name as the C99 <math.h> function; this
 * is only safe while <math.h> is not pulled in ahead of this point - confirm
 * against common.h / dsputil.h. */
00035 #define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))
00036 #define rounder(bias) {round (bias), round (bias)}
00037
/*
 * Disabled (#if 0) plain-C row transform; it is never compiled.
 * Presumably kept as a readable description of what the MMX row helpers
 * compute - that equivalence is not checked anywhere in this file.
 *
 *   row     - base pointer; the eight values at row[offset .. offset+7]
 *             are transformed in place
 *   offset  - index of the first element of the row to process
 *   table   - coefficients, read as table[1] .. table[7] (table[0] unused).
 *             Note this is a plain C1..C7 layout, not the interleaved layout
 *             produced by mmx_table()/mmxext_table().
 *   rounder - pointer to the bias added to each even-part sum
 */
00038 #if 0
00039
00040 static inline void idct_row (int16_t * row, int offset,
00041 int16_t * table, int32_t * rounder)
00042 {
00043 int C1, C2, C3, C4, C5, C6, C7;
00044 int a0, a1, a2, a3, b0, b1, b2, b3;
00045
00046 row += offset;
00047
00048 C1 = table[1];
00049 C2 = table[2];
00050 C3 = table[3];
00051 C4 = table[4];
00052 C5 = table[5];
00053 C6 = table[6];
00054 C7 = table[7];
00055
/* Even part: built from row[0], row[2], row[4], row[6] plus the bias. */
00056 a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder;
00057 a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder;
00058 a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder;
00059 a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder;
00060
/* Odd part: built from row[1], row[3], row[5], row[7]; no bias here. */
00061 b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
00062 b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
00063 b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
00064 b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
00065
/* Butterfly: outputs k and 7-k are (a_k + b_k) and (a_k - b_k), scaled down. */
00066 row[0] = (a0 + b0) >> ROW_SHIFT;
00067 row[1] = (a1 + b1) >> ROW_SHIFT;
00068 row[2] = (a2 + b2) >> ROW_SHIFT;
00069 row[3] = (a3 + b3) >> ROW_SHIFT;
00070 row[4] = (a3 - b3) >> ROW_SHIFT;
00071 row[5] = (a2 - b2) >> ROW_SHIFT;
00072 row[6] = (a1 - b1) >> ROW_SHIFT;
00073 row[7] = (a0 - b0) >> ROW_SHIFT;
00074 }
00075 #endif
00076
00077
00078
00079
/*
 * Coefficient table layout for the MMXEXT row helpers.
 * Expands to a 32-entry initialiser (eight groups of four int16 values) built
 * from the seven coefficients c1..c7.  mmxext_row_head()/mmxext_row() read
 * the groups as quadwords at table+0, +4, +8, ... +28 and feed them to
 * pmaddwd, so the order of entries here must match those routines.
 */
00080 #define mmxext_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, -c4, -c2, \
00081 c4, c6, c4, c6, \
00082 c1, c3, -c1, -c5, \
00083 c5, c7, c3, -c7, \
00084 c4, -c6, c4, -c6, \
00085 -c4, c2, c4, -c2, \
00086 c5, -c1, c3, -c1, \
00087 c7, c3, c7, -c5 }
00088
/*
 * MMXEXT row pass, first step: load the eight int16 values at row+offset and
 * start the multiply-accumulate against `table`.
 *
 * Nothing is returned in C variables; the results stay in MMX registers for
 * mmxext_row(), which must be the next helper executed:
 *   mm0       first four values (plain copy)
 *   mm2       first four values with the two 32-bit halves exchanged
 *   mm5, mm6  last four values
 *   mm3       pmaddwd of the first four values with table[0..3]
 *   mm4       table[4..7]
 *
 * The *_m2r / *_r2r macros take (source, destination) and are expected to
 * come from mmx.h, which is not part of this listing.
 */
00089 static inline void mmxext_row_head (int16_t * row, int offset, int16_t * table)
00090 {
00091 movq_m2r (*(row+offset), mm2); /* mm2 = first four values of the row */
00092
00093 movq_m2r (*(row+offset+4), mm5); /* mm5 = last four values of the row */
00094 movq_r2r (mm2, mm0); /* mm0 = copy of the first four values */
00095
00096 movq_m2r (*table, mm3); /* mm3 = table[0..3] */
00097 movq_r2r (mm5, mm6); /* mm6 = copy of the last four values */
00098
00099 movq_m2r (*(table+4), mm4); /* mm4 = table[4..7] */
00100 pmaddwd_r2r (mm0, mm3); /* mm3 = table[0..3] (*) mm0, two 32-bit sums */
00101
00102 pshufw_r2r (mm2, mm2, 0x4e); /* exchange the two 32-bit halves of mm2 */
00103 }
00104
/*
 * MMXEXT row pass, main step.  Consumes the register state left by
 * mmxext_row_head() or mmxext_row_mid() (mm0, mm2, mm3, mm4, mm5, mm6) and
 * accumulates the remaining products from table[8..31].
 *
 *   table   - same coefficient table that was passed to the preceding
 *             head/mid helper
 *   rounder - 8-byte bias (two int32 values) added to both partial-sum
 *             registers before the final add/subtract
 *
 * On exit:
 *   mm1, mm3  one sum/difference pair, already shifted right by ROW_SHIFT
 *   mm0, mm4  the other sum/difference pair, NOT yet shifted
 * mmxext_row_tail() or mmxext_row_mid() finishes the shift, packs and stores.
 * The instruction order is hand-interleaved; do not reorder.
 */
00105 static inline void mmxext_row (int16_t * table, int32_t * rounder)
00106 {
00107 movq_m2r (*(table+8), mm1); /* mm1 = table[8..11] */
00108 pmaddwd_r2r (mm2, mm4); /* mm4 = table[4..7] (*) swapped first half */
00109
00110 pmaddwd_m2r (*(table+16), mm0); /* mm0 = first half (*) table[16..19] */
00111 pshufw_r2r (mm6, mm6, 0x4e); /* exchange the 32-bit halves of mm6 */
00112
00113 movq_m2r (*(table+12), mm7); /* mm7 = table[12..15] */
00114 pmaddwd_r2r (mm5, mm1); /* mm1 = table[8..11] (*) second half */
00115
00116 paddd_m2r (*rounder, mm3); /* add the bias to the first partial sums */
00117 pmaddwd_r2r (mm6, mm7); /* mm7 = table[12..15] (*) swapped second half */
00118
00119 pmaddwd_m2r (*(table+20), mm2); /* mm2 = swapped first half (*) table[20..23] */
00120 paddd_r2r (mm4, mm3); /* mm3 = biased sum built from the first half */
00121
00122 pmaddwd_m2r (*(table+24), mm5); /* mm5 = second half (*) table[24..27] */
00123 movq_r2r (mm3, mm4); /* keep a copy of mm3 for the add below */
00124
00125 pmaddwd_m2r (*(table+28), mm6); /* mm6 = swapped second half (*) table[28..31] */
00126 paddd_r2r (mm7, mm1); /* mm1 = sum built from the second half */
00127
00128 paddd_m2r (*rounder, mm0); /* add the bias to the other partial sums */
00129 psubd_r2r (mm1, mm3); /* mm3 = difference of the two sums */
00130
00131 psrad_i2r (ROW_SHIFT, mm3); /* scale the difference */
00132 paddd_r2r (mm4, mm1); /* mm1 = sum of the two sums */
00133
00134 paddd_r2r (mm2, mm0); /* mm0 = second biased sum from the first half */
00135 psrad_i2r (ROW_SHIFT, mm1); /* scale the sum */
00136
00137 paddd_r2r (mm6, mm5); /* mm5 = second sum from the second half */
00138 movq_r2r (mm0, mm4); /* copy for the subtract below */
00139
00140 paddd_r2r (mm5, mm0); /* mm0 = sum, still unshifted */
00141 psubd_r2r (mm5, mm4); /* mm4 = difference, still unshifted */
00142 }
00143
/*
 * MMXEXT row pass, last step: finish the row whose results mmxext_row() left
 * in mm0, mm1, mm3 and mm4, and write the eight int16 outputs to
 * row[store .. store+7].
 *
 * mm0 and mm4 still need their ROW_SHIFT (mm1 and mm3 were shifted in
 * mmxext_row()).  The 32-bit results are packed to int16 with signed
 * saturation; the second group of four has its 16-bit words exchanged
 * pairwise before being stored.
 */
00144 static inline void mmxext_row_tail (int16_t * row, int store)
00145 {
00146 psrad_i2r (ROW_SHIFT, mm0); /* scale the pending sum */
00147
00148 psrad_i2r (ROW_SHIFT, mm4); /* scale the pending difference */
00149
00150 packssdw_r2r (mm0, mm1); /* mm1 = {mm1, mm0} packed to four int16 */
00151
00152 packssdw_r2r (mm3, mm4); /* mm4 = {mm4, mm3} packed to four int16 */
00153
00154 movq_r2m (mm1, *(row+store)); /* store the first four outputs */
00155 pshufw_r2r (mm4, mm4, 0xb1); /* swap the two words inside each 32-bit half */
00156
00157
00158
00159 movq_r2m (mm4, *(row+store+4)); /* store the last four outputs */
00160 }
00161
/*
 * MMXEXT row pass, combined step: performs the work of mmxext_row_tail() for
 * the row just computed and of mmxext_row_head() for the next row, with the
 * two instruction streams interleaved.
 *
 *   row    - base pointer of the coefficient block
 *   store  - index where the finished row (held in mm0/mm1/mm3/mm4) is written
 *   offset - index of the next row to load
 *   table  - coefficient table for the next row
 *
 * The next row is loaded before the finished one is stored; the calls in
 * this file always pass different values for store and offset.
 * Leaves the same register state as mmxext_row_head() for mmxext_row().
 */
00162 static inline void mmxext_row_mid (int16_t * row, int store,
00163 int offset, int16_t * table)
00164 {
00165 movq_m2r (*(row+offset), mm2); /* next row: first four values */
00166 psrad_i2r (ROW_SHIFT, mm0); /* previous row: scale the pending sum */
00167
00168 movq_m2r (*(row+offset+4), mm5); /* next row: last four values */
00169 psrad_i2r (ROW_SHIFT, mm4); /* previous row: scale the pending difference */
00170
00171 packssdw_r2r (mm0, mm1); /* previous row: pack first four outputs */
00172 movq_r2r (mm5, mm6); /* next row: copy of the last four values */
00173
00174 packssdw_r2r (mm3, mm4); /* previous row: pack last four outputs */
00175 movq_r2r (mm2, mm0); /* next row: copy of the first four values */
00176
00177 movq_r2m (mm1, *(row+store)); /* previous row: store first four outputs */
00178 pshufw_r2r (mm4, mm4, 0xb1); /* previous row: swap words within each half */
00179
00180 movq_m2r (*table, mm3); /* next row: mm3 = table[0..3] */
00181 movq_r2m (mm4, *(row+store+4)); /* previous row: store last four outputs */
00182
00183 pmaddwd_r2r (mm0, mm3); /* next row: first multiply-accumulate */
00184
00185 movq_m2r (*(table+4), mm4); /* next row: mm4 = table[4..7] */
00186 pshufw_r2r (mm2, mm2, 0x4e); /* next row: exchange the 32-bit halves of mm2 */
00187 }
00188
00189
00190
00191
/*
 * Coefficient table layout for the plain-MMX row helpers.
 * Same idea as mmxext_table(): a 32-entry int16 initialiser built from
 * c1..c7 and read as quadwords at table+0 .. table+28, but with the entries
 * ordered for mmx_row_head()/mmx_row(), which duplicate 32-bit halves with
 * punpckldq/punpckhdq instead of using pshufw.
 */
00192 #define mmx_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, c4, c6, \
00193 c4, c6, -c4, -c2, \
00194 c1, c3, c3, -c7, \
00195 c5, c7, -c1, -c5, \
00196 c4, -c6, c4, -c2, \
00197 -c4, c2, c4, -c6, \
00198 c5, -c1, c7, -c5, \
00199 c7, c3, c3, -c1 }
00200
/*
 * Plain-MMX row pass, first step: load the eight int16 values at row+offset
 * and start the multiply-accumulate against `table`.
 *
 * Register state handed to mmx_row(), which must run next:
 *   mm0       low 32 bits of the first four values, duplicated in both halves
 *   mm2       high 32 bits of the first four values, duplicated
 *   mm5, mm6  last four values (duplication is done later, in mmx_row())
 *   mm3       pmaddwd of mm0 with table[0..3]
 *   mm4       table[4..7]
 *   mm1       table[8..11]
 */
00201 static inline void mmx_row_head (int16_t * row, int offset, int16_t * table)
00202 {
00203 movq_m2r (*(row+offset), mm2); /* mm2 = first four values of the row */
00204
00205 movq_m2r (*(row+offset+4), mm5); /* mm5 = last four values of the row */
00206 movq_r2r (mm2, mm0); /* mm0 = copy of the first four values */
00207
00208 movq_m2r (*table, mm3); /* mm3 = table[0..3] */
00209 movq_r2r (mm5, mm6); /* mm6 = copy of the last four values */
00210
00211 punpckldq_r2r (mm0, mm0); /* duplicate the low 32 bits of mm0 */
00212
00213 movq_m2r (*(table+4), mm4); /* mm4 = table[4..7] */
00214 pmaddwd_r2r (mm0, mm3); /* mm3 = table[0..3] (*) mm0 */
00215
00216 movq_m2r (*(table+8), mm1); /* mm1 = table[8..11] */
00217 punpckhdq_r2r (mm2, mm2); /* duplicate the high 32 bits of mm2 */
00218 }
00219
/*
 * Plain-MMX row pass, main step.  Consumes the register state left by
 * mmx_row_head() or mmx_row_mid() (mm0..mm6) and accumulates the remaining
 * products from table[12..31].
 *
 *   table   - same coefficient table that was passed to the preceding
 *             head/mid helper
 *   rounder - 8-byte bias (two int32 values) added to both partial-sum
 *             registers before the final add/subtract
 *
 * On exit:
 *   mm1, mm3  one sum/difference pair, already shifted right by ROW_SHIFT
 *   mm0, mm7  the other sum/difference pair, NOT yet shifted
 * (the MMXEXT variant uses mm4 where this one uses mm7).
 * The instruction order is hand-interleaved; do not reorder.
 */
00220 static inline void mmx_row (int16_t * table, int32_t * rounder)
00221 {
00222 pmaddwd_r2r (mm2, mm4); /* mm4 = table[4..7] (*) mm2 */
00223 punpckldq_r2r (mm5, mm5); /* duplicate the low 32 bits of the second half */
00224
00225 pmaddwd_m2r (*(table+16), mm0); /* mm0 = mm0 (*) table[16..19] */
00226 punpckhdq_r2r (mm6, mm6); /* duplicate the high 32 bits of the second half */
00227
00228 movq_m2r (*(table+12), mm7); /* mm7 = table[12..15] */
00229 pmaddwd_r2r (mm5, mm1); /* mm1 = table[8..11] (*) mm5 */
00230
00231 paddd_m2r (*rounder, mm3); /* add the bias to the first partial sums */
00232 pmaddwd_r2r (mm6, mm7); /* mm7 = table[12..15] (*) mm6 */
00233
00234 pmaddwd_m2r (*(table+20), mm2); /* mm2 = mm2 (*) table[20..23] */
00235 paddd_r2r (mm4, mm3); /* mm3 = biased sum built from the first half */
00236
00237 pmaddwd_m2r (*(table+24), mm5); /* mm5 = mm5 (*) table[24..27] */
00238 movq_r2r (mm3, mm4); /* keep a copy of mm3 for the add below */
00239
00240 pmaddwd_m2r (*(table+28), mm6); /* mm6 = mm6 (*) table[28..31] */
00241 paddd_r2r (mm7, mm1); /* mm1 = sum built from the second half */
00242
00243 paddd_m2r (*rounder, mm0); /* add the bias to the other partial sums */
00244 psubd_r2r (mm1, mm3); /* mm3 = difference of the two sums */
00245
00246 psrad_i2r (ROW_SHIFT, mm3); /* scale the difference */
00247 paddd_r2r (mm4, mm1); /* mm1 = sum of the two sums */
00248
00249 paddd_r2r (mm2, mm0); /* mm0 = second biased sum from the first half */
00250 psrad_i2r (ROW_SHIFT, mm1); /* scale the sum */
00251
00252 paddd_r2r (mm6, mm5); /* mm5 = second sum from the second half */
00253 movq_r2r (mm0, mm7); /* copy for the subtract below */
00254
00255 paddd_r2r (mm5, mm0); /* mm0 = sum, still unshifted */
00256 psubd_r2r (mm5, mm7); /* mm7 = difference, still unshifted */
00257 }
00258
/*
 * Plain-MMX row pass, last step: finish the row whose results mmx_row() left
 * in mm0, mm1, mm3 and mm7, and write the eight int16 outputs to
 * row[store .. store+7].
 *
 * Does the same job as mmxext_row_tail(); the pairwise word exchange of the
 * second group is done with shift-left / shift-right / or on 32-bit lanes
 * instead of pshufw.  mm4 is used as scratch.
 */
00259 static inline void mmx_row_tail (int16_t * row, int store)
00260 {
00261 psrad_i2r (ROW_SHIFT, mm0); /* scale the pending sum */
00262
00263 psrad_i2r (ROW_SHIFT, mm7); /* scale the pending difference */
00264
00265 packssdw_r2r (mm0, mm1); /* mm1 = {mm1, mm0} packed to four int16 */
00266
00267 packssdw_r2r (mm3, mm7); /* mm7 = {mm7, mm3} packed to four int16 */
00268
00269 movq_r2m (mm1, *(row+store)); /* store the first four outputs */
00270 movq_r2r (mm7, mm4); /* copy for the word exchange */
00271
00272 pslld_i2r (16, mm7); /* low word of each 32-bit lane -> high word */
00273
00274 psrld_i2r (16, mm4); /* high word of each 32-bit lane -> low word */
00275
00276 por_r2r (mm4, mm7); /* mm7 = words exchanged within each lane */
00277
00278
00279
00280 movq_r2m (mm7, *(row+store+4)); /* store the last four outputs */
00281 }
00282
/*
 * Plain-MMX row pass, combined step: performs the work of mmx_row_tail() for
 * the row just computed and of mmx_row_head() for the next row, with the two
 * instruction streams interleaved.
 *
 *   row    - base pointer of the coefficient block
 *   store  - index where the finished row (held in mm0/mm1/mm3/mm7) is written
 *   offset - index of the next row to load
 *   table  - coefficient table for the next row
 *
 * mm1 is reused as scratch for the word exchange after its contents have
 * been stored, then reloaded with table[8..11].
 * Leaves the same register state as mmx_row_head() for mmx_row().
 */
00283 static inline void mmx_row_mid (int16_t * row, int store,
00284 int offset, int16_t * table)
00285 {
00286 movq_m2r (*(row+offset), mm2); /* next row: first four values */
00287 psrad_i2r (ROW_SHIFT, mm0); /* previous row: scale the pending sum */
00288
00289 movq_m2r (*(row+offset+4), mm5); /* next row: last four values */
00290 psrad_i2r (ROW_SHIFT, mm7); /* previous row: scale the pending difference */
00291
00292 packssdw_r2r (mm0, mm1); /* previous row: pack first four outputs */
00293 movq_r2r (mm5, mm6); /* next row: copy of the last four values */
00294
00295 packssdw_r2r (mm3, mm7); /* previous row: pack last four outputs */
00296 movq_r2r (mm2, mm0); /* next row: copy of the first four values */
00297
00298 movq_r2m (mm1, *(row+store)); /* previous row: store first four outputs */
00299 movq_r2r (mm7, mm1); /* previous row: copy for the word exchange */
00300
00301 punpckldq_r2r (mm0, mm0); /* next row: duplicate the low 32 bits of mm0 */
00302 psrld_i2r (16, mm7); /* previous row: high word -> low word */
00303
00304 movq_m2r (*table, mm3); /* next row: mm3 = table[0..3] */
00305 pslld_i2r (16, mm1); /* previous row: low word -> high word */
00306
00307 movq_m2r (*(table+4), mm4); /* next row: mm4 = table[4..7] */
00308 por_r2r (mm1, mm7); /* previous row: words exchanged within each lane */
00309
00310 movq_m2r (*(table+8), mm1); /* next row: mm1 = table[8..11] */
00311 punpckhdq_r2r (mm2, mm2); /* next row: duplicate the high 32 bits of mm2 */
00312
00313 movq_r2m (mm7, *(row+store+4)); /* previous row: store last four outputs */
00314 pmaddwd_r2r (mm0, mm3); /* next row: first multiply-accumulate */
00315 }
00316
00317
/*
 * Disabled (#if 0) plain-C column transform; it is never compiled.
 * It works on one column (stride 8) at a time, using 16-bit intermediates
 * with explicit saturation.  Presumably kept as a readable model of the MMX
 * idct_col() defined after it.
 *
 * NOTE(review): as written this could not simply be enabled - it has the
 * same name as the live idct_col(), and it uses T1, T2, T3 and C4, which are
 * only #defined inside that live function further down.
 */
00318 #if 0
00319
00320 static inline void idct_col (int16_t * col, int offset)
00321 {
00322
/* F(c,x): upper part of the product c*x, i.e. x scaled by c / 65536. */
00323 #define F(c,x) (((c) * (x)) >> 16)
00324
00325
/* S(x): clamp x to the int16_t range [-32768, 32767]. */
00326 #define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x))
00327
00328 int16_t x0, x1, x2, x3, x4, x5, x6, x7;
00329 int16_t y0, y1, y2, y3, y4, y5, y6, y7;
00330 int16_t a0, a1, a2, a3, b0, b1, b2, b3;
00331 int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12;
00332
00333 col += offset;
00334
/* Gather the eight inputs of this column (one element per row). */
00335 x0 = col[0*8];
00336 x1 = col[1*8];
00337 x2 = col[2*8];
00338 x3 = col[3*8];
00339 x4 = col[4*8];
00340 x5 = col[5*8];
00341 x6 = col[6*8];
00342 x7 = col[7*8];
00343
/* Even part: inputs 0, 4, 2, 6. */
00344 u04 = S (x0 + x4);
00345 v04 = S (x0 - x4);
00346 u26 = S (F (T2, x6) + x2);
00347 v26 = S (F (T2, x2) - x6);
00348
00349 a0 = S (u04 + u26);
00350 a1 = S (v04 + v26);
00351 a2 = S (v04 - v26);
00352 a3 = S (u04 - u26);
00353
/* Odd part: inputs 1, 7, 3, 5. */
00354 u17 = S (F (T1, x7) + x1);
00355 v17 = S (F (T1, x1) - x7);
00356 u35 = S (F (T3, x5) + x3);
00357 v35 = S (F (T3, x3) - x5);
00358
00359 b0 = S (u17 + u35);
00360 b3 = S (v17 - v35);
00361 u12 = S (u17 - u35);
00362 v12 = S (v17 + v35);
00363 u12 = S (2 * F (C4, u12));
00364 v12 = S (2 * F (C4, v12));
00365 b1 = S (u12 + v12);
00366 b2 = S (u12 - v12);
00367
/* Final butterfly: outputs k and 7-k are a_k + b_k and a_k - b_k. */
00368 y0 = S (a0 + b0) >> COL_SHIFT;
00369 y1 = S (a1 + b1) >> COL_SHIFT;
00370 y2 = S (a2 + b2) >> COL_SHIFT;
00371 y3 = S (a3 + b3) >> COL_SHIFT;
00372
00373 y4 = S (a3 - b3) >> COL_SHIFT;
00374 y5 = S (a2 - b2) >> COL_SHIFT;
00375 y6 = S (a1 - b1) >> COL_SHIFT;
00376 y7 = S (a0 - b0) >> COL_SHIFT;
00377
/* Write the results back in place. */
00378 col[0*8] = y0;
00379 col[1*8] = y1;
00380 col[2*8] = y2;
00381 col[3*8] = y3;
00382 col[4*8] = y4;
00383 col[5*8] = y5;
00384 col[6*8] = y6;
00385 col[7*8] = y7;
00386 }
00387 #endif
00388
00389
00390
/*
 * MMX column pass.  Transforms, in place, the four adjacent columns that
 * start at col+offset: every movq below moves four int16 values of one row
 * (row k lives at col+offset+k*8), so one call covers columns offset..offset+3.
 *
 * All adds and subtracts are the saturating 16-bit forms (paddsw/psubsw) and
 * every output is shifted right by COL_SHIFT before being stored.
 *
 * Notation in the line comments: xk = input row k, yk = output row k,
 * F(c,v) = result of pmulhw (upper 16 bits of the signed product c*v);
 * u../v../a./b. are intermediate sums named after the operands they combine.
 *
 * Rows 3 and 5 of the block are used as spill slots for b3 and b0 once their
 * inputs are in registers, and are overwritten with real outputs at the end.
 * The instruction order is hand-scheduled; do not reorder.
 *
 * NOTE(review): _T1.._C4 use identifiers reserved for the implementation
 * (underscore + capital), and the tables are not const although the code
 * below only reads them.
 */
00391 static inline void idct_col (int16_t * col, int offset)
00392 {
/* Fixed-point constants: T1, T2, T3 equal tan(pi/16), tan(2*pi/16) and
 * tan(3*pi/16) times 65536 (rounded); C4 equals cos(pi/4) times 32768. */
00393 #define T1 13036
00394 #define T2 27146
00395 #define T3 43790
00396 #define C4 23170
00397
/* Each constant replicated four times so one quadword covers four columns.
 * T3 does not fit in a signed short; on the usual two's-complement targets it
 * is stored as 43790 - 65536, and the code makes up for that by adding the
 * multiplied operand back after the pmulhw (see the T3 lines below). */
00398 static short _T1[] ATTR_ALIGN(8) = {T1,T1,T1,T1};
00399 static short _T2[] ATTR_ALIGN(8) = {T2,T2,T2,T2};
00400 static short _T3[] ATTR_ALIGN(8) = {T3,T3,T3,T3};
00401 static short _C4[] ATTR_ALIGN(8) = {C4,C4,C4,C4};
00402
00403
00404
00405
00406 movq_m2r (*_T1, mm0); /* mm0 = T1 */
00407
00408 movq_m2r (*(col+offset+1*8), mm1); /* mm1 = x1 */
00409 movq_r2r (mm0, mm2); /* mm2 = T1 */
00410
00411 movq_m2r (*(col+offset+7*8), mm4); /* mm4 = x7 */
00412 pmulhw_r2r (mm1, mm0); /* mm0 = F(T1,x1) */
00413
00414 movq_m2r (*_T3, mm5); /* mm5 = T3 (as stored) */
00415 pmulhw_r2r (mm4, mm2); /* mm2 = F(T1,x7) */
00416
00417 movq_m2r (*(col+offset+5*8), mm6); /* mm6 = x5 */
00418 movq_r2r (mm5, mm7); /* mm7 = T3 (as stored) */
00419
00420 movq_m2r (*(col+offset+3*8), mm3); /* mm3 = x3 */
00421 psubsw_r2r (mm4, mm0); /* mm0 = F(T1,x1) - x7 = v17 */
00422
00423 movq_m2r (*_T2, mm4); /* mm4 = T2 */
00424 pmulhw_r2r (mm3, mm5); /* mm5 = F(T3,x3) */
00425
00426 paddsw_r2r (mm2, mm1); /* mm1 = x1 + F(T1,x7) = u17 */
00427 pmulhw_r2r (mm6, mm7); /* mm7 = F(T3,x5) */
00428
00429
00430
00431 movq_r2r (mm4, mm2); /* mm2 = T2 */
00432 paddsw_r2r (mm3, mm5); /* mm5 += x3 (completes the T3 multiply) */
00433
00434 pmulhw_m2r (*(col+offset+2*8), mm4); /* mm4 = F(T2,x2) */
00435 paddsw_r2r (mm6, mm7); /* mm7 += x5 (completes the T3 multiply) */
00436
00437 psubsw_r2r (mm6, mm5); /* mm5 = T3*x3 - x5 = v35 */
00438 paddsw_r2r (mm3, mm7); /* mm7 = T3*x5 + x3 = u35 */
00439
00440 movq_m2r (*(col+offset+6*8), mm3); /* mm3 = x6 */
00441 movq_r2r (mm0, mm6); /* mm6 = v17 */
00442
00443 pmulhw_r2r (mm3, mm2); /* mm2 = F(T2,x6) */
00444 psubsw_r2r (mm5, mm0); /* mm0 = v17 - v35 = b3 */
00445
00446 psubsw_r2r (mm3, mm4); /* mm4 = F(T2,x2) - x6 = v26 */
00447 paddsw_r2r (mm6, mm5); /* mm5 = v17 + v35 = v12 */
00448
00449 movq_r2m (mm0, *(col+offset+3*8)); /* spill b3 into row 3 */
00450 movq_r2r (mm1, mm6); /* mm6 = u17 */
00451
00452 paddsw_m2r (*(col+offset+2*8), mm2); /* mm2 = F(T2,x6) + x2 = u26 */
00453 paddsw_r2r (mm7, mm6); /* mm6 = u17 + u35 = b0 */
00454
00455 psubsw_r2r (mm7, mm1); /* mm1 = u17 - u35 = u12 */
00456 movq_r2r (mm1, mm7); /* mm7 = u12 */
00457
00458 movq_m2r (*(col+offset+0*8), mm3); /* mm3 = x0 */
00459 paddsw_r2r (mm5, mm1); /* mm1 = u12 + v12 */
00460
00461 movq_m2r (*_C4, mm0); /* mm0 = C4 */
00462 psubsw_r2r (mm5, mm7); /* mm7 = u12 - v12 */
00463
00464 movq_r2m (mm6, *(col+offset+5*8)); /* spill b0 into row 5 */
00465 pmulhw_r2r (mm0, mm1); /* mm1 = F(C4, u12+v12) */
00466
00467 movq_r2r (mm4, mm6); /* mm6 = v26 */
00468 pmulhw_r2r (mm0, mm7); /* mm7 = F(C4, u12-v12) */
00469
00470 movq_m2r (*(col+offset+4*8), mm5); /* mm5 = x4 */
00471 movq_r2r (mm3, mm0); /* mm0 = x0 */
00472
00473 psubsw_r2r (mm5, mm3); /* mm3 = x0 - x4 = v04 */
00474 paddsw_r2r (mm5, mm0); /* mm0 = x0 + x4 = u04 */
00475
00476 paddsw_r2r (mm3, mm4); /* mm4 = v04 + v26 = a1 */
00477 movq_r2r (mm0, mm5); /* mm5 = u04 */
00478
00479 psubsw_r2r (mm6, mm3); /* mm3 = v04 - v26 = a2 */
00480 paddsw_r2r (mm2, mm5); /* mm5 = u04 + u26 = a0 */
00481
00482 paddsw_r2r (mm1, mm1); /* mm1 doubled = b1 */
00483 psubsw_r2r (mm2, mm0); /* mm0 = u04 - u26 = a3 */
00484
00485 paddsw_r2r (mm7, mm7); /* mm7 doubled = b2 */
00486 movq_r2r (mm3, mm2); /* mm2 = a2 */
00487
00488 movq_r2r (mm4, mm6); /* mm6 = a1 */
00489 paddsw_r2r (mm7, mm3); /* mm3 = a2 + b2 */
00490
00491 psraw_i2r (COL_SHIFT, mm3); /* mm3 = y2 */
00492 paddsw_r2r (mm1, mm4); /* mm4 = a1 + b1 */
00493
00494 psraw_i2r (COL_SHIFT, mm4); /* mm4 = y1 */
00495 psubsw_r2r (mm1, mm6); /* mm6 = a1 - b1 */
00496
00497 movq_m2r (*(col+offset+5*8), mm1); /* mm1 = b0 (reload spill) */
00498 psubsw_r2r (mm7, mm2); /* mm2 = a2 - b2 */
00499
00500 psraw_i2r (COL_SHIFT, mm6); /* mm6 = y6 */
00501 movq_r2r (mm5, mm7); /* mm7 = a0 */
00502
00503 movq_r2m (mm4, *(col+offset+1*8)); /* store y1 */
00504 psraw_i2r (COL_SHIFT, mm2); /* mm2 = y5 */
00505
00506 movq_r2m (mm3, *(col+offset+2*8)); /* store y2 */
00507 paddsw_r2r (mm1, mm5); /* mm5 = a0 + b0 */
00508
00509 movq_m2r (*(col+offset+3*8), mm4); /* mm4 = b3 (reload spill) */
00510 psubsw_r2r (mm1, mm7); /* mm7 = a0 - b0 */
00511
00512 psraw_i2r (COL_SHIFT, mm5); /* mm5 = y0 */
00513 movq_r2r (mm0, mm3); /* mm3 = a3 */
00514
00515 movq_r2m (mm2, *(col+offset+5*8)); /* store y5 */
00516 psubsw_r2r (mm4, mm3); /* mm3 = a3 - b3 */
00517
00518 psraw_i2r (COL_SHIFT, mm7); /* mm7 = y7 */
00519 paddsw_r2r (mm0, mm4); /* mm4 = a3 + b3 */
00520
00521 movq_r2m (mm5, *(col+offset+0*8)); /* store y0 */
00522 psraw_i2r (COL_SHIFT, mm3); /* mm3 = y4 */
00523
00524 movq_r2m (mm6, *(col+offset+6*8)); /* store y6 */
00525 psraw_i2r (COL_SHIFT, mm4); /* mm4 = y3 */
00526
00527 movq_r2m (mm7, *(col+offset+7*8)); /* store y7 */
00528
00529 movq_r2m (mm3, *(col+offset+4*8)); /* store y4 */
00530
00531 movq_r2m (mm4, *(col+offset+3*8)); /* store y3 */
00532
00533 #undef T1
00534 #undef T2
00535 #undef T3
00536 #undef C4
00537 }
00538
/*
 * Per-row bias values for the row pass, one 8-byte table per input row
 * (rounderN is paired with row N in declare_idct).  Each table holds the same
 * int32 value twice, because mmx_row()/mmxext_row() add it to a register
 * carrying two 32-bit sums.  The stored value is (bias + 0.5) * 2^ROW_SHIFT.
 *
 * With Ck = cos(k*pi/16), the literal biases below are numerically equal to:
 *   rounder1: C1*( C1/C4 + C1 + C7)/2     rounder7: C1*( C7/C4 + C7 - C1)/2
 *   rounder2: C2*( C6 + C2)/2             rounder6: C2*( C6 - C2)/2
 *   rounder3: C3*(-C3/C4 + C3 + C5)/2     rounder5: C3*(-C5/C4 + C5 - C3)/2
 * NOTE(review): presumably these offset the truncation done in the column
 * pass; the derivation is not visible in this file - confirm before changing.
 */
00539 static int32_t rounder0[] ATTR_ALIGN(8) =
00540 rounder ((1 << (COL_SHIFT - 1)) - 0.5); /* evaluates to 65536 per entry */
00541 static int32_t rounder4[] ATTR_ALIGN(8) = rounder (0); /* 1024 per entry */
00542 static int32_t rounder1[] ATTR_ALIGN(8) =
00543 rounder (1.25683487303);
00544 static int32_t rounder7[] ATTR_ALIGN(8) =
00545 rounder (-0.25);
00546 static int32_t rounder2[] ATTR_ALIGN(8) =
00547 rounder (0.60355339059);
00548 static int32_t rounder6[] ATTR_ALIGN(8) =
00549 rounder (-0.25);
00550 static int32_t rounder3[] ATTR_ALIGN(8) =
00551 rounder (0.087788325588);
00552 static int32_t rounder5[] ATTR_ALIGN(8) =
00553 rounder (-0.441341716183);
00554
/* The shift macros are dropped here; every use above this point has already
 * been written out, and nothing after this point refers to them. */
00555 #undef COL_SHIFT
00556 #undef ROW_SHIFT
00557
/*
 * declare_idct(idct, table, idct_row_head, idct_row, idct_row_tail,
 *              idct_row_mid)
 *
 * Expands to the definition of `void idct (int16_t * block)`, a complete
 * in-place transform of the 64 int16 values at block[0..63]:
 *
 *  1. Row pass.  Rows are processed in the order 0, 4, 1, 7, 2, 6, 3, 5.
 *     Rows are paired per coefficient table (0/4 -> table04, 1/7 -> table17,
 *     2/6 -> table26, 3/5 -> table35) and each row uses its own rounderN.
 *     idct_row_head starts the first row, each idct_row_mid stores the row
 *     just finished while loading the next one, and idct_row_tail stores the
 *     last row.  The helpers hand data to each other in MMX registers, so
 *     this call sequence must not be reordered or interrupted by other MMX
 *     or x87 code.
 *  2. Column pass.  idct_col is called twice, for columns 0-3 and 4-7.
 *
 * Parameters:
 *   idct           name of the function to define
 *   table          macro that lays out the seven coefficients for the chosen
 *                  row helpers (mmx_table or mmxext_table)
 *   idct_row_*     the matching family of row helpers
 *
 * NOTE(review): no emms is issued by the generated function; callers
 * presumably take care of that - confirm.
 */
00558 #define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \
00559 void idct (int16_t * block) \
00560 { \
00561 static int16_t table04[] ATTR_ALIGN(16) = \
00562 table (22725, 21407, 19266, 16384, 12873, 8867, 4520); \
00563 static int16_t table17[] ATTR_ALIGN(16) = \
00564 table (31521, 29692, 26722, 22725, 17855, 12299, 6270); \
00565 static int16_t table26[] ATTR_ALIGN(16) = \
00566 table (29692, 27969, 25172, 21407, 16819, 11585, 5906); \
00567 static int16_t table35[] ATTR_ALIGN(16) = \
00568 table (26722, 25172, 22654, 19266, 15137, 10426, 5315); \
00569 \
00570 idct_row_head (block, 0*8, table04); \
00571 idct_row (table04, rounder0); \
00572 idct_row_mid (block, 0*8, 4*8, table04); \
00573 idct_row (table04, rounder4); \
00574 idct_row_mid (block, 4*8, 1*8, table17); \
00575 idct_row (table17, rounder1); \
00576 idct_row_mid (block, 1*8, 7*8, table17); \
00577 idct_row (table17, rounder7); \
00578 idct_row_mid (block, 7*8, 2*8, table26); \
00579 idct_row (table26, rounder2); \
00580 idct_row_mid (block, 2*8, 6*8, table26); \
00581 idct_row (table26, rounder6); \
00582 idct_row_mid (block, 6*8, 3*8, table35); \
00583 idct_row (table35, rounder3); \
00584 idct_row_mid (block, 3*8, 5*8, table35); \
00585 idct_row (table35, rounder5); \
00586 idct_row_tail (block, 5*8); \
00587 \
00588 idct_col (block, 0); \
00589 idct_col (block, 4); \
00590 }
00591
/*
 * Public entry points.  Both transform the 64-coefficient block in place;
 * they differ only in the row helpers used (pshufw-based MMXEXT code versus
 * plain MMX code).
 *
 * DCTELEM is declared outside this file (presumably in dsputil.h); it has to
 * be compatible with int16_t, because the definitions generated by
 * declare_idct below take `int16_t * block`.
 */
00592 void ff_mmx_idct(DCTELEM *block);
00593 void ff_mmxext_idct(DCTELEM *block);
00594
/* Instantiate the MMXEXT version. */
00595 declare_idct (ff_mmxext_idct, mmxext_table,
00596 mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid)
00597
/* Instantiate the plain-MMX version. */
00598 declare_idct (ff_mmx_idct, mmx_table,
00599 mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid)
00600