00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00024 #include "../dsputil.h"
00025 #include "mmx.h"
00026
/*
 * Lane-select masks: seven rows of eight 16-bit lanes, every lane either
 * 0 or 65535 (0xFFFF).
 *
 * In the visible source the table is reached only through the `ecx`
 * pointer used by SSE2_Dequantize(), which loads rows at byte offsets
 * 0, 16, 32, 48, 64, 80 and 96 and combines them with pand/pxor to pull
 * individual words out of a register.
 */
00027 static const unsigned short __align16 SSE2_dequant_const[] =
00028 {
00029 0,65535,65535,0,0,0,0,0,
00030 0,0,0,0,65535,65535,0,0,
00031 65535,65535,65535,0,0,0,0,0,
00032 0,0,0,65535,0,0,0,0,
00033 0,0,0,65535,65535,0,0,0,
00034 65535,0,0,0,0,65535,0,0,
00035 0,0,65535,65535, 0,0,0,0
00036 };
00037
/*
 * Four 32-bit words of 0x00080008, i.e. the value 8 in each of the eight
 * 16-bit lanes of one 128-bit row.
 *
 * SSE2_Column_IDCT() reads it through the `Eight` pointer and adds it
 * (paddsw) to each result just before the arithmetic shift right by 4,
 * so it acts as the rounding bias for that shift.
 */
00038 static const unsigned int __align16 eight_data[] =
00039 {
00040 0x00080008,
00041 0x00080008,
00042 0x00080008,
00043 0x00080008
00044 };
00045
/*
 * Multiplier table for the IDCT macros: seven rows, each holding one
 * 16-bit constant replicated across all eight lanes.  Row k-1 is what the
 * C(k) accessor (defined in ff_vp3_idct_sse2) points at, for k = 1..7.
 *
 * The seven values equal round(65536 * cos(k * pi / 16)) for k = 1..7.
 * Rows 1..5 are above 0x7FFF; the IDCT macros multiply with pmulhw and,
 * for exactly those rows, add the multiplicand back afterwards.
 */
00046 static const unsigned short __align16 SSE2_idct_data[7 * 8] =
00047 {
00048 64277,64277,64277,64277,64277,64277,64277,64277,
00049 60547,60547,60547,60547,60547,60547,60547,60547,
00050 54491,54491,54491,54491,54491,54491,54491,54491,
00051 46341,46341,46341,46341,46341,46341,46341,46341,
00052 36410,36410,36410,36410,36410,36410,36410,36410,
00053 25080,25080,25080,25080,25080,25080,25080,25080,
00054 12785,12785,12785,12785,12785,12785,12785,12785
00055 };
00056
00057
/*
 * SSE2_Column_IDCT(): second (final) IDCT pass as used by
 * ff_vp3_idct_sse2().
 *
 * Names that must be defined where the macro is expanded:
 *   I(i)   address of 16-byte input row i  (i = 0..7)
 *   O(i)   address of 16-byte output row i (i = 0..7)
 *   C(k)   address of the 16-byte constant row k (k = 1..7)
 *   Eight  address of a 16-byte row added before the final shift
 *
 * Each xmm register carries eight 16-bit lanes, so all eight lanes of a
 * row are processed in parallel.  Registers xmm0..xmm7 are all clobbered.
 * I(1) and I(2) are overwritten with intermediates and read back later,
 * so the input rows are not preserved.  Every result has *Eight added and
 * is shifted right arithmetically by 4 before being stored to O().
 *
 * The statement order is hand-interleaved; do not reorder without
 * re-deriving the register lifetimes.
 * NOTE(review): operand order of the mmx.h macros is assumed to be
 * (source, destination) -- confirm against mmx.h.
 */
00058 #define SSE2_Column_IDCT() { \
00059 /* odd rows first: I(3)/I(5) against C(3)/C(5), then I(1)/I(7) against C(1)/C(7) */ \
00060 movdqu_m2r(*I(3), xmm2); \
00061 movdqu_m2r(*C(3), xmm6); \
00062 \
00063 movdqu_r2r(xmm2, xmm4); \
00064 movdqu_m2r(*I(5), xmm7); \
00065 \
00066 pmulhw_r2r(xmm6, xmm4); \
00067 movdqu_m2r(*C(5), xmm1); \
00068 \
00069 pmulhw_r2r(xmm7, xmm6); \
00070 movdqu_r2r(xmm1, xmm5); \
00071 \
00072 pmulhw_r2r(xmm2, xmm1); \
00073 movdqu_m2r(*I(1), xmm3); \
00074 \
00075 pmulhw_r2r(xmm7, xmm5); \
00076 movdqu_m2r(*C(1), xmm0); \
00077 /* The paddw's below add the multiplicand back onto each pmulhw product. */ \
00078 /* Presumably this compensates for constants above 0x7FFF being treated */ \
00079 /* as negative by the signed pmulhw -- confirm against the ISA reference. */ \
00080 paddw_r2r(xmm2, xmm4); \
00081 paddw_r2r(xmm7, xmm6); \
00082 \
00083 paddw_r2r(xmm1, xmm2); \
00084 movdqu_m2r(*I(7), xmm1); \
00085 \
00086 paddw_r2r(xmm5, xmm7); \
00087 movdqu_r2r(xmm0, xmm5); \
00088 \
00089 pmulhw_r2r(xmm3, xmm0); \
00090 paddsw_r2r(xmm7, xmm4); \
00091 \
00092 pmulhw_r2r(xmm1, xmm5); \
00093 movdqu_m2r(*C(7), xmm7); \
00094 \
00095 psubsw_r2r(xmm2, xmm6); \
00096 paddw_r2r(xmm3, xmm0); \
00097 \
00098 pmulhw_r2r(xmm7, xmm3); \
00099 movdqu_m2r(*I(2), xmm2); \
00100 \
00101 pmulhw_r2r(xmm1, xmm7); \
00102 paddw_r2r(xmm1, xmm5); \
00103 \
00104 movdqu_r2r(xmm2, xmm1); \
00105 pmulhw_m2r(*C(2), xmm2); \
00106 \
00107 psubsw_r2r(xmm5, xmm3); \
00108 movdqu_m2r(*I(6), xmm5); \
00109 \
00110 paddsw_r2r(xmm7, xmm0); \
00111 movdqu_r2r(xmm5, xmm7); \
00112 \
00113 psubsw_r2r(xmm4, xmm0); \
00114 pmulhw_m2r(*C(2), xmm5); \
00115 \
00116 paddw_r2r(xmm1, xmm2); \
00117 pmulhw_m2r(*C(6), xmm1); \
00118 \
00119 paddsw_r2r(xmm4, xmm4); \
00120 paddsw_r2r(xmm0, xmm4); \
00121 \
00122 psubsw_r2r(xmm6, xmm3); \
00123 paddw_r2r(xmm7, xmm5); \
00124 \
00125 paddsw_r2r(xmm6, xmm6); \
00126 pmulhw_m2r(*C(6), xmm7); \
00127 /* I(1) and I(2) were already read above; they now serve as spill slots for xmm4 and xmm6 */ \
00128 paddsw_r2r(xmm3, xmm6); \
00129 movdqu_r2m(xmm4, *I(1)); \
00130 \
00131 psubsw_r2r(xmm5, xmm1); \
00132 movdqu_m2r(*C(4), xmm4); \
00133 \
00134 movdqu_r2r(xmm3, xmm5); \
00135 pmulhw_r2r(xmm4, xmm3); \
00136 \
00137 paddsw_r2r(xmm2, xmm7); \
00138 movdqu_r2m(xmm6, *I(2)); \
00139 \
00140 movdqu_r2r(xmm0, xmm2); \
00141 movdqu_m2r(*I(0), xmm6); \
00142 \
00143 pmulhw_r2r(xmm4, xmm0); \
00144 paddw_r2r(xmm3, xmm5); \
00145 \
00146 movdqu_m2r(*I(4), xmm3); \
00147 psubsw_r2r(xmm1, xmm5); \
00148 \
00149 paddw_r2r(xmm0, xmm2); \
00150 psubsw_r2r(xmm3, xmm6); \
00151 \
00152 movdqu_r2r(xmm6, xmm0); \
00153 pmulhw_r2r(xmm4, xmm6); \
00154 \
00155 paddsw_r2r(xmm3, xmm3); \
00156 paddsw_r2r(xmm1, xmm1); \
00157 \
00158 paddsw_r2r(xmm0, xmm3); \
00159 paddsw_r2r(xmm5, xmm1); \
00160 \
00161 pmulhw_r2r(xmm3, xmm4); \
00162 paddw_r2r(xmm0, xmm6); \
00163 \
00164 psubsw_r2r(xmm2, xmm6); \
00165 paddsw_r2r(xmm2, xmm2); \
00166 \
00167 movdqu_m2r(*I(1), xmm0); \
00168 paddsw_r2r(xmm6, xmm2); \
00169 \
00170 paddw_r2r(xmm3, xmm4); \
00171 psubsw_r2r(xmm1, xmm2); \
00172 /* from here on: each result gets *Eight added, then psraw by 4, then is stored to O() */ \
00173 paddsw_m2r(*Eight, xmm2); \
00174 paddsw_r2r(xmm1, xmm1); \
00175 \
00176 paddsw_r2r(xmm2, xmm1); \
00177 psraw_i2r(4, xmm2); \
00178 \
00179 psubsw_r2r(xmm7, xmm4); \
00180 psraw_i2r(4, xmm1); \
00181 \
00182 movdqu_m2r(*I(2), xmm3); \
00183 paddsw_r2r(xmm7, xmm7); \
00184 \
00185 movdqu_r2m(xmm2, *O(2)); \
00186 paddsw_r2r(xmm4, xmm7); \
00187 \
00188 movdqu_r2m(xmm1, *O(1)); \
00189 psubsw_r2r(xmm3, xmm4); \
00190 \
00191 paddsw_m2r(*Eight, xmm4); \
00192 paddsw_r2r(xmm3, xmm3); \
00193 \
00194 paddsw_r2r(xmm4, xmm3); \
00195 psraw_i2r(4, xmm4); \
00196 \
00197 psubsw_r2r(xmm5, xmm6); \
00198 psraw_i2r(4, xmm3); \
00199 \
00200 paddsw_m2r(*Eight, xmm6); \
00201 paddsw_r2r(xmm5, xmm5); \
00202 \
00203 paddsw_r2r(xmm6, xmm5); \
00204 psraw_i2r(4, xmm6); \
00205 \
00206 movdqu_r2m(xmm4, *O(4)); \
00207 psraw_i2r(4, xmm5); \
00208 \
00209 movdqu_r2m(xmm3, *O(3)); \
00210 psubsw_r2r(xmm0, xmm7); \
00211 \
00212 paddsw_m2r(*Eight, xmm7); \
00213 paddsw_r2r(xmm0, xmm0); \
00214 \
00215 paddsw_r2r(xmm7, xmm0); \
00216 psraw_i2r(4, xmm7); \
00217 \
00218 movdqu_r2m(xmm6, *O(6)); \
00219 psraw_i2r(4, xmm0); \
00220 \
00221 movdqu_r2m(xmm5, *O(5)); \
00222 movdqu_r2m(xmm7, *O(7)); \
00223 \
00224 movdqu_r2m(xmm0, *O(0)); \
00225 \
00226 }
00227
00228
/*
 * SSE2_Row_IDCT(): first IDCT pass as used by ff_vp3_idct_sse2().
 *
 * Names that must be defined where the macro is expanded:
 *   I(i)   address of 16-byte row i (i = 0..7), used for input AND output
 *   C(k)   address of the 16-byte constant row k (k = 1..7)
 *
 * The arithmetic mirrors SSE2_Column_IDCT() up to the last butterfly
 * stage, but this pass adds no rounding constant, performs no right
 * shift, and writes all eight results back to I(0..7) instead of O().
 * Registers xmm0..xmm7 are all clobbered.  I(1) and I(2) are used as
 * spill slots part-way through, before receiving their final values.
 *
 * The statement order is hand-interleaved; do not reorder without
 * re-deriving the register lifetimes.
 * NOTE(review): operand order of the mmx.h macros is assumed to be
 * (source, destination) -- confirm against mmx.h.
 */
00229 #define SSE2_Row_IDCT() { \
00230 /* odd rows first: I(3)/I(5) against C(3)/C(5), then I(1)/I(7) against C(1)/C(7) */ \
00231 movdqu_m2r(*I(3), xmm2); \
00232 movdqu_m2r(*C(3), xmm6); \
00233 \
00234 movdqu_r2r(xmm2, xmm4); \
00235 movdqu_m2r(*I(5), xmm7); \
00236 \
00237 pmulhw_r2r(xmm6, xmm4); \
00238 movdqu_m2r(*C(5), xmm1); \
00239 \
00240 pmulhw_r2r(xmm7, xmm6); \
00241 movdqu_r2r(xmm1, xmm5); \
00242 \
00243 pmulhw_r2r(xmm2, xmm1); \
00244 movdqu_m2r(*I(1), xmm3); \
00245 \
00246 pmulhw_r2r(xmm7, xmm5); \
00247 movdqu_m2r(*C(1), xmm0); \
00248 /* The paddw's below add the multiplicand back onto each pmulhw product. */ \
00249 /* Presumably this compensates for constants above 0x7FFF being treated */ \
00250 /* as negative by the signed pmulhw -- confirm against the ISA reference. */ \
00251 paddw_r2r(xmm2, xmm4); \
00252 paddw_r2r(xmm7, xmm6); \
00253 \
00254 paddw_r2r(xmm1, xmm2); \
00255 movdqu_m2r(*I(7), xmm1); \
00256 \
00257 paddw_r2r(xmm5, xmm7); \
00258 movdqu_r2r(xmm0, xmm5); \
00259 \
00260 pmulhw_r2r(xmm3, xmm0); \
00261 paddsw_r2r(xmm7, xmm4); \
00262 \
00263 pmulhw_r2r(xmm1, xmm5); \
00264 movdqu_m2r(*C(7), xmm7); \
00265 \
00266 psubsw_r2r(xmm2, xmm6); \
00267 paddw_r2r(xmm3, xmm0); \
00268 \
00269 pmulhw_r2r(xmm7, xmm3); \
00270 movdqu_m2r(*I(2), xmm2); \
00271 \
00272 pmulhw_r2r(xmm1, xmm7); \
00273 paddw_r2r(xmm1, xmm5); \
00274 \
00275 movdqu_r2r(xmm2, xmm1); \
00276 pmulhw_m2r(*C(2), xmm2); \
00277 \
00278 psubsw_r2r(xmm5, xmm3); \
00279 movdqu_m2r(*I(6), xmm5); \
00280 \
00281 paddsw_r2r(xmm7, xmm0); \
00282 movdqu_r2r(xmm5, xmm7); \
00283 \
00284 psubsw_r2r(xmm4, xmm0); \
00285 pmulhw_m2r(*C(2), xmm5); \
00286 \
00287 paddw_r2r(xmm1, xmm2); \
00288 pmulhw_m2r(*C(6), xmm1); \
00289 \
00290 paddsw_r2r(xmm4, xmm4); \
00291 paddsw_r2r(xmm0, xmm4); \
00292 \
00293 psubsw_r2r(xmm6, xmm3); \
00294 paddw_r2r(xmm7, xmm5); \
00295 \
00296 paddsw_r2r(xmm6, xmm6); \
00297 pmulhw_m2r(*C(6), xmm7); \
00298 /* I(1) and I(2) were already read above; they now serve as spill slots for xmm4 and xmm6 */ \
00299 paddsw_r2r(xmm3, xmm6); \
00300 movdqu_r2m(xmm4, *I(1)); \
00301 \
00302 psubsw_r2r(xmm5, xmm1); \
00303 movdqu_m2r(*C(4), xmm4); \
00304 \
00305 movdqu_r2r(xmm3, xmm5); \
00306 pmulhw_r2r(xmm4, xmm3); \
00307 \
00308 paddsw_r2r(xmm2, xmm7); \
00309 movdqu_r2m(xmm6, *I(2)); \
00310 \
00311 movdqu_r2r(xmm0, xmm2); \
00312 movdqu_m2r(*I(0), xmm6); \
00313 \
00314 pmulhw_r2r(xmm4, xmm0); \
00315 paddw_r2r(xmm3, xmm5); \
00316 \
00317 movdqu_m2r(*I(4), xmm3); \
00318 psubsw_r2r(xmm1, xmm5); \
00319 \
00320 paddw_r2r(xmm0, xmm2); \
00321 psubsw_r2r(xmm3, xmm6); \
00322 \
00323 movdqu_r2r(xmm6, xmm0); \
00324 pmulhw_r2r(xmm4, xmm6); \
00325 \
00326 paddsw_r2r(xmm3, xmm3); \
00327 paddsw_r2r(xmm1, xmm1); \
00328 \
00329 paddsw_r2r(xmm0, xmm3); \
00330 paddsw_r2r(xmm5, xmm1); \
00331 \
00332 pmulhw_r2r(xmm3, xmm4); \
00333 paddw_r2r(xmm0, xmm6); \
00334 \
00335 psubsw_r2r(xmm2, xmm6); \
00336 paddsw_r2r(xmm2, xmm2); \
00337 \
00338 movdqu_m2r(*I(1), xmm0); \
00339 paddsw_r2r(xmm6, xmm2); \
00340 \
00341 paddw_r2r(xmm3, xmm4); \
00342 psubsw_r2r(xmm1, xmm2); \
00343 /* final butterflies: no rounding constant and no shift in this pass; results go back to I() */ \
00344 paddsw_r2r(xmm1, xmm1); \
00345 paddsw_r2r(xmm2, xmm1); \
00346 \
00347 psubsw_r2r(xmm7, xmm4); \
00348 \
00349 movdqu_m2r(*I(2), xmm3); \
00350 paddsw_r2r(xmm7, xmm7); \
00351 \
00352 movdqu_r2m(xmm2, *I(2)); \
00353 paddsw_r2r(xmm4, xmm7); \
00354 \
00355 movdqu_r2m(xmm1, *I(1)); \
00356 psubsw_r2r(xmm3, xmm4); \
00357 \
00358 paddsw_r2r(xmm3, xmm3); \
00359 \
00360 paddsw_r2r(xmm4, xmm3); \
00361 \
00362 psubsw_r2r(xmm5, xmm6); \
00363 \
00364 paddsw_r2r(xmm5, xmm5); \
00365 \
00366 paddsw_r2r(xmm6, xmm5); \
00367 \
00368 movdqu_r2m(xmm4, *I(4)); \
00369 \
00370 movdqu_r2m(xmm3, *I(3)); \
00371 psubsw_r2r(xmm0, xmm7); \
00372 \
00373 paddsw_r2r(xmm0, xmm0); \
00374 \
00375 paddsw_r2r(xmm7, xmm0); \
00376 \
00377 movdqu_r2m(xmm6, *I(6)); \
00378 \
00379 movdqu_r2m(xmm5, *I(5)); \
00380 movdqu_r2m(xmm7, *I(7)); \
00381 \
00382 movdqu_r2m(xmm0, *I(0)); \
00383 \
00384 }
00385
00386
/*
 * SSE2_Transpose(): in-place rearrangement of the eight 16-byte rows
 * I(0)..I(7), treated as an 8x8 matrix of 16-bit words.
 *
 * It interleaves words (punpck[lh]wd), then dwords (punpck[lh]dq), then
 * qwords (punpck[lh]qdq), first for rows 4..7 and then for rows 0..3.
 * Assuming the mmx.h macros take (source, destination) operands, the net
 * effect is a matrix transpose: output row j holds word j of every input
 * row -- TODO confirm the operand order against mmx.h.
 *
 * Requires I(i) to be defined where the macro is expanded.  Registers
 * xmm0..xmm7 are all clobbered, and I(6) is used as a temporary before it
 * receives its final value.
 */
00387 #define SSE2_Transpose() { \
00388 /* rows 4..7: interleave 16-bit words, then 32-bit pairs */ \
00389 movdqu_m2r(*I(4), xmm4); \
00390 movdqu_m2r(*I(5), xmm0); \
00391 \
00392 movdqu_r2r(xmm4, xmm5); \
00393 punpcklwd_r2r(xmm0, xmm4); \
00394 \
00395 punpckhwd_r2r(xmm0, xmm5); \
00396 movdqu_m2r(*I(6), xmm6); \
00397 \
00398 movdqu_m2r(*I(7), xmm0); \
00399 movdqu_r2r(xmm6, xmm7); \
00400 \
00401 punpcklwd_r2r(xmm0, xmm6); \
00402 punpckhwd_r2r(xmm0, xmm7); \
00403 \
00404 movdqu_r2r(xmm4, xmm3); \
00405 punpckldq_r2r(xmm6, xmm4); \
00406 /* park xmm3 in I(6); rows 6 and 7 have already been loaded into registers */ \
00407 punpckhdq_r2r(xmm6, xmm3); \
00408 movdqu_r2m(xmm3, *I(6)); \
00409 \
00410 movdqu_r2r(xmm5, xmm6); \
00411 punpckldq_r2r(xmm7, xmm5); \
00412 \
00413 punpckhdq_r2r(xmm7, xmm6); \
00414 movdqu_m2r(*I(0), xmm0); \
00415 /* rows 0..3 go through the same word / dword interleave */ \
00416 movdqu_m2r(*I(1), xmm1); \
00417 movdqu_r2r(xmm0, xmm7); \
00418 \
00419 punpcklwd_r2r(xmm1, xmm0); \
00420 punpckhwd_r2r(xmm1, xmm7); \
00421 \
00422 movdqu_m2r(*I(2), xmm2); \
00423 movdqu_m2r(*I(3), xmm3); \
00424 \
00425 movdqu_r2r(xmm2, xmm1); \
00426 punpcklwd_r2r(xmm3, xmm2); \
00427 \
00428 punpckhwd_r2r(xmm3, xmm1); \
00429 movdqu_r2r(xmm0, xmm3); \
00430 \
00431 punpckldq_r2r(xmm2, xmm0); \
00432 punpckhdq_r2r(xmm2, xmm3); \
00433 \
00434 movdqu_r2r(xmm7, xmm2); \
00435 punpckldq_r2r(xmm1, xmm2); \
00436 \
00437 punpckhdq_r2r(xmm1, xmm7); \
00438 movdqu_r2r(xmm0, xmm1); \
00439 /* join the 64-bit halves from the two row groups and store the final rows */ \
00440 punpcklqdq_r2r(xmm4, xmm0); \
00441 punpckhqdq_r2r(xmm4, xmm1); \
00442 \
00443 movdqu_r2m(xmm0, *I(0)); \
00444 movdqu_r2m(xmm1, *I(1)); \
00445 /* fetch the value parked in I(6) earlier */ \
00446 movdqu_m2r(*I(6), xmm0); \
00447 movdqu_r2r(xmm3, xmm1); \
00448 \
00449 punpcklqdq_r2r(xmm0, xmm1); \
00450 punpckhqdq_r2r(xmm0, xmm3); \
00451 \
00452 movdqu_r2r(xmm2, xmm4); \
00453 punpcklqdq_r2r(xmm5, xmm4); \
00454 \
00455 punpckhqdq_r2r(xmm5, xmm2); \
00456 movdqu_r2m(xmm1, *I(2)); \
00457 \
00458 movdqu_r2m(xmm3, *I(3)); \
00459 movdqu_r2m(xmm4, *I(4)); \
00460 \
00461 movdqu_r2m(xmm2, *I(5)); \
00462 movdqu_r2r(xmm7, xmm5); \
00463 \
00464 punpcklqdq_r2r(xmm6, xmm5); \
00465 punpckhqdq_r2r(xmm6, xmm7); \
00466 \
00467 movdqu_r2m(xmm5, *I(6)); \
00468 movdqu_r2m(xmm7, *I(7)); \
00469 \
00470 }
00471
00472
/*
 * SSE2_Dequantize(): multiply-and-reorder pass over eight 16-byte rows.
 *
 * Names that must be defined where the macro is expanded:
 *   eax   byte pointer to the eight data rows (offsets 0..112), which are
 *         both read and overwritten
 *   ebx   byte pointer to eight rows of multipliers at the same offsets
 *   ecx   byte pointer to the mask rows (offsets 0..96)
 *
 * Each data row is loaded from eax and multiplied lane-wise by the row at
 * the same offset in ebx (pmullw).  Individual 16-bit words are then
 * picked out with pand against masks from ecx, removed from their source
 * register with pxor, moved with byte shifts (pslldq/psrldq) and word
 * shuffles (pshuflw/pshufhw/pshufd), and merged with por into eight
 * result rows that are stored back to eax.  Every row is loaded from eax
 * before the store to the same offset, so the in-place update is safe.
 *
 * NOTE(review): judging by the name this is coefficient dequantization
 * combined with a scan-order permutation -- the exact permutation has not
 * been verified here.  The macro is not expanded anywhere in the visible
 * source.  Registers xmm0..xmm7 are all clobbered; statement order is
 * significant.
 */
00473 #define SSE2_Dequantize() { \
00474 movdqu_m2r(*(eax), xmm0); \
00475 /* rows at offsets 0, 16, 32 and 64: load from eax, multiply by the matching ebx row */ \
00476 pmullw_m2r(*(ebx), xmm0); \
00477 movdqu_m2r(*(eax + 16), xmm1); \
00478 \
00479 pmullw_m2r(*(ebx + 16), xmm1); \
00480 pshuflw_r2r(xmm0, xmm3, 0x078); \
00481 \
00482 movdqu_r2r(xmm1, xmm2); \
00483 movdqu_m2r(*(ecx), xmm7); \
00484 \
00485 movdqu_m2r(*(eax + 32), xmm4); \
00486 movdqu_m2r(*(eax + 64), xmm5); \
00487 \
00488 pmullw_m2r(*(ebx + 32), xmm4); \
00489 pmullw_m2r(*(ebx + 64), xmm5); \
00490 \
00491 movdqu_m2r(*(ecx + 16), xmm6); \
00492 pand_r2r(xmm2, xmm7); \
00493 \
00494 pand_r2r(xmm4, xmm6); \
00495 pxor_r2r(xmm7, xmm2); \
00496 \
00497 pxor_r2r(xmm6, xmm4); \
00498 pslldq_i2r(4, xmm7); \
00499 \
00500 pslldq_i2r(2, xmm6); \
00501 por_r2r(xmm6, xmm7); \
00502 \
00503 movdqu_m2r(*(ecx + 32), xmm0); \
00504 movdqu_m2r(*(ecx + 48), xmm6); \
00505 \
00506 pand_r2r(xmm3, xmm0); \
00507 pand_r2r(xmm5, xmm6); \
00508 \
00509 pxor_r2r(xmm0, xmm3); \
00510 pxor_r2r(xmm6, xmm5); \
00511 \
00512 por_r2r(xmm7, xmm0); \
00513 pslldq_i2r(8, xmm6); \
00514 \
00515 por_r2r(xmm6, xmm0); \
00516 \
00517 \
00518 movdqu_m2r(*(ecx + 64 ), xmm1); \
00519 pshuflw_r2r(xmm5, xmm5, 0x0B4); \
00520 \
00521 movdqu_r2r(xmm1, xmm7); \
00522 movdqu_r2r(xmm1, xmm6); \
00523 /* first result row is complete: store to offset 0 */ \
00524 movdqu_r2m(xmm0, *(eax)); \
00525 pshufhw_r2r(xmm4, xmm4, 0x0C2); \
00526 \
00527 pand_r2r(xmm4, xmm7); \
00528 pand_r2r(xmm5, xmm1); \
00529 \
00530 pxor_r2r(xmm7, xmm4); \
00531 pxor_r2r(xmm1, xmm5); \
00532 \
00533 pshuflw_r2r(xmm2, xmm2, 0x0C6); \
00534 movdqu_r2r(xmm6, xmm0); \
00535 \
00536 pslldq_i2r(2, xmm7); \
00537 pslldq_i2r(6, xmm1); \
00538 \
00539 psrldq_i2r(2, xmm0); \
00540 pand_r2r(xmm3, xmm6); \
00541 \
00542 pand_r2r(xmm2, xmm0); \
00543 pxor_r2r(xmm6, xmm3); \
00544 \
00545 pxor_r2r(xmm0, xmm2); \
00546 psrldq_i2r(6, xmm6); \
00547 \
00548 por_r2r(xmm7, xmm1); \
00549 por_r2r(xmm6, xmm0); \
00550 \
00551 por_r2r(xmm0, xmm1); \
00552 pshuflw_r2r(xmm4, xmm4, 0x093); \
00553 \
00554 pshufhw_r2r(xmm4, xmm4, 0x093); \
00555 movdqu_r2m(xmm1, *(eax + 16)); \
00556 \
00557 pshufhw_r2r(xmm3, xmm3, 0x0D2); \
00558 movdqu_m2r(*(ecx + 64), xmm0); \
00559 \
00560 pand_r2r(xmm3, xmm0); \
00561 psrldq_i2r(12, xmm3); \
00562 \
00563 psrldq_i2r(8, xmm0); \
00564 \
00565 movdqu_m2r(*(ecx + 64), xmm6); \
00566 movdqu_m2r(*(ecx + 96), xmm7); \
00567 \
00568 pand_r2r(xmm4, xmm6); \
00569 pxor_r2r(xmm6, xmm4); \
00570 \
00571 por_r2r(xmm6, xmm3); \
00572 pand_r2r(xmm4, xmm7); \
00573 \
00574 por_r2r(xmm7, xmm0); \
00575 pxor_r2r(xmm7, xmm4); \
00576 \
00577 movdqu_m2r(*(ecx + 16 ), xmm6); \
00578 movdqu_m2r(*(ecx + 64 ), xmm1); \
00579 \
00580 pand_r2r(xmm2, xmm6); \
00581 pand_r2r(xmm6, xmm1); \
00582 \
00583 pxor_r2r(xmm6, xmm2); \
00584 pxor_r2r(xmm1, xmm6); \
00585 \
00586 psrldq_i2r(4, xmm1); \
00587 \
00588 psrldq_i2r(8, xmm6); \
00589 por_r2r(xmm1, xmm3); \
00590 \
00591 por_r2r(xmm6, xmm0); \
00592 pshufhw_r2r(xmm5, xmm5, 0x0E1); \
00593 \
00594 movdqu_m2r(*(ecx + 64), xmm1); \
00595 pshuflw_r2r(xmm5, xmm5, 0x072); \
00596 \
00597 movdqu_r2r(xmm1, xmm6); \
00598 pand_r2r(xmm5, xmm1); \
00599 \
00600 pxor_r2r(xmm1, xmm5); \
00601 pslldq_i2r(4, xmm1); \
00602 \
00603 pshufd_r2r(xmm5, xmm5, 0x09C); \
00604 por_r2r(xmm1, xmm3); \
00605 /* row at offset 96: loaded and multiplied only at this point */ \
00606 movdqu_m2r(*(eax + 96), xmm1); \
00607 pmullw_m2r(*(ebx + 96), xmm1); \
00608 \
00609 movdqu_m2r(*(ecx), xmm7); \
00610 \
00611 psrldq_i2r(8, xmm6); \
00612 pand_r2r(xmm5, xmm7); \
00613 \
00614 pand_r2r(xmm1, xmm6); \
00615 pxor_r2r(xmm7, xmm5); \
00616 \
00617 pxor_r2r(xmm6, xmm1); \
00618 pslldq_i2r(2, xmm5); \
00619 \
00620 pslldq_i2r(14, xmm6); \
00621 por_r2r(xmm5, xmm4); \
00622 \
00623 por_r2r(xmm6, xmm3); \
00624 pslldq_i2r(6, xmm7); \
00625 \
00626 movdqu_r2m(xmm3, *(eax+32)); \
00627 por_r2r(xmm7, xmm0); \
00628 /* rows at offsets 48 and 80: load and multiply */ \
00629 movdqu_m2r(*(eax + 48), xmm3); \
00630 movdqu_m2r(*(eax + 80), xmm5); \
00631 \
00632 pmullw_m2r(*(ebx + 48), xmm3); \
00633 pmullw_m2r(*(ebx + 80), xmm5); \
00634 \
00635 movdqu_m2r(*(ecx + 64), xmm6); \
00636 movdqu_m2r(*(ecx + 64), xmm7); \
00637 \
00638 psrldq_i2r(8, xmm6); \
00639 pslldq_i2r(8, xmm7); \
00640 \
00641 pand_r2r(xmm3, xmm6); \
00642 pand_r2r(xmm5, xmm7); \
00643 \
00644 pxor_r2r(xmm6, xmm3); \
00645 pxor_r2r(xmm7, xmm5); \
00646 \
00647 pslldq_i2r(6, xmm6); \
00648 psrldq_i2r(2, xmm7); \
00649 \
00650 por_r2r(xmm7, xmm6); \
00651 movdqu_m2r(*(ecx), xmm7); \
00652 \
00653 por_r2r(xmm6, xmm0); \
00654 psrldq_i2r(2, xmm7); \
00655 \
00656 movdqu_r2r(xmm2, xmm6); \
00657 pand_r2r(xmm1, xmm7); \
00658 \
00659 pslldq_i2r(2, xmm6); \
00660 psrldq_i2r(14, xmm2); \
00661 \
00662 pxor_r2r(xmm7, xmm1); \
00663 pslldq_i2r(12, xmm7); \
00664 \
00665 psrldq_i2r(14, xmm6); \
00666 por_r2r(xmm6, xmm4); \
00667 \
00668 por_r2r(xmm7, xmm0); \
00669 movdqu_m2r(*(ecx), xmm6); \
00670 \
00671 psrldq_i2r(2, xmm6); \
00672 movdqu_r2m(xmm0, *(eax+48)); \
00673 \
00674 movdqu_m2r(*(ecx), xmm0); \
00675 pand_r2r(xmm3, xmm6); \
00676 \
00677 movdqu_r2r(xmm3, xmm7); \
00678 pxor_r2r(xmm6, xmm3); \
00679 \
00680 pslldq_i2r(2, xmm3); \
00681 pand_r2r(xmm1, xmm0); \
00682 \
00683 psrldq_i2r(14, xmm7); \
00684 pxor_r2r(xmm0, xmm1); \
00685 \
00686 por_r2r(xmm7, xmm6); \
00687 movdqu_m2r(*(ecx + 64), xmm7); \
00688 \
00689 pshuflw_r2r(xmm6, xmm6, 0x01E); \
00690 pslldq_i2r(6, xmm7); \
00691 \
00692 por_r2r(xmm6, xmm4); \
00693 pand_r2r(xmm5, xmm7); \
00694 \
00695 pslldq_i2r(8, xmm0); \
00696 pxor_r2r(xmm7, xmm5); \
00697 \
00698 psrldq_i2r(2, xmm7); \
00699 \
00700 pshufhw_r2r(xmm3, xmm3, 0x087); \
00701 por_r2r(xmm7, xmm0); \
00702 /* last row (offset 112): load and multiply */ \
00703 movdqu_m2r(*(eax + 112), xmm7); \
00704 pmullw_m2r(*(ebx + 112), xmm7); \
00705 \
00706 movdqu_m2r(*(ecx + 64), xmm6); \
00707 por_r2r(xmm0, xmm4); \
00708 \
00709 pshuflw_r2r(xmm7, xmm7, 0x0E1); \
00710 psrldq_i2r(8, xmm6); \
00711 \
00712 movdqu_m2r(*(ecx + 64), xmm0); \
00713 pand_r2r(xmm7, xmm6); \
00714 \
00715 pand_r2r(xmm3, xmm0); \
00716 pxor_r2r(xmm6, xmm7); \
00717 \
00718 pxor_r2r(xmm0, xmm3); \
00719 pslldq_i2r(14, xmm6); \
00720 \
00721 psrldq_i2r(4, xmm0); \
00722 por_r2r(xmm6, xmm4); \
00723 \
00724 por_r2r(xmm0, xmm2); \
00725 movdqu_r2m(xmm4, *(eax + 64)); \
00726 \
00727 movdqu_m2r(*(ecx + 80), xmm6); \
00728 pshufhw_r2r(xmm7, xmm7, 0x0D2); \
00729 \
00730 movdqu_m2r(*(ecx), xmm4); \
00731 movdqu_m2r(*(ecx+48), xmm0); \
00732 \
00733 pand_r2r(xmm5, xmm6); \
00734 pand_r2r(xmm7, xmm4); \
00735 \
00736 pand_r2r(xmm1, xmm0); \
00737 pxor_r2r(xmm6, xmm5); \
00738 \
00739 pxor_r2r(xmm4, xmm7); \
00740 pxor_r2r(xmm0, xmm1); \
00741 \
00742 pshuflw_r2r(xmm6, xmm6, 0x02B); \
00743 pslldq_i2r(10, xmm4); \
00744 \
00745 pshufhw_r2r(xmm6, xmm6, 0x0B1); \
00746 pslldq_i2r(4, xmm0); \
00747 \
00748 por_r2r(xmm4, xmm6); \
00749 por_r2r(xmm0, xmm2); \
00750 \
00751 por_r2r(xmm6, xmm2); \
00752 pshufhw_r2r(xmm1, xmm1, 0x0C9); \
00753 \
00754 movdqu_r2r(xmm3, xmm6); \
00755 movdqu_r2m(xmm2, *(eax+80)); \
00756 /* the remaining words are placed with byte shifts only, then merged into the last two rows */ \
00757 psrldq_i2r(12, xmm6); \
00758 pslldq_i2r(4, xmm3); \
00759 \
00760 pshuflw_r2r(xmm5, xmm5, 0x04E); \
00761 movdqu_r2r(xmm7, xmm4); \
00762 \
00763 movdqu_r2r(xmm5, xmm2); \
00764 psrldq_i2r(10, xmm7); \
00765 \
00766 pslldq_i2r(6, xmm4); \
00767 pslldq_i2r(12, xmm2); \
00768 \
00769 movdqu_r2r(xmm1, xmm0); \
00770 psrldq_i2r(12, xmm1); \
00771 \
00772 psrldq_i2r(6, xmm5); \
00773 psrldq_i2r(14, xmm3); \
00774 \
00775 pslldq_i2r(10, xmm7); \
00776 por_r2r(xmm6, xmm4); \
00777 \
00778 psrldq_i2r(10, xmm2); \
00779 pslldq_i2r(4, xmm0); \
00780 \
00781 pslldq_i2r(8, xmm1); \
00782 por_r2r(xmm7, xmm3); \
00783 \
00784 psrldq_i2r(6, xmm0); \
00785 pslldq_i2r(4, xmm5); \
00786 \
00787 por_r2r(xmm1, xmm4); \
00788 por_r2r(xmm2, xmm3); \
00789 \
00790 por_r2r(xmm5, xmm4); \
00791 por_r2r(xmm0, xmm3); \
00792 \
00793 movdqu_r2m(xmm4, *(eax+96)); \
00794 movdqu_r2m(xmm3, *(eax+112)); \
00795 \
00796 }
00797
00798
00799 void ff_vp3_idct_sse2(int16_t *input_data)
00800 {
00801 unsigned char *input_bytes = (unsigned char *)input_data;
00802 unsigned char *dequant_const_bytes = (unsigned char *)SSE2_dequant_const;
00803 unsigned char *output_data_bytes = (unsigned char *)input_data;
00804 unsigned char *idct_data_bytes = (unsigned char *)SSE2_idct_data;
00805 unsigned char *Eight = (unsigned char *)eight_data;
00806
00807 #define eax input_bytes
00808
00809 #define ecx dequant_const_bytes
00810 #define edx idct_data_bytes
00811
00812 #define I(i) (eax + 16 * i)
00813 #define O(i) (ebx + 16 * i)
00814 #define C(i) (edx + 16 * (i-1))
00815
00816
00817
00818 #undef ebx
00819 #define ebx output_data_bytes
00820
00821 SSE2_Row_IDCT();
00822
00823 SSE2_Transpose();
00824
00825 SSE2_Column_IDCT();
00826 }