00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include "../dsputil.h"
00024 #include "../mpegvideo.h"
00025 #include "../avcodec.h"
00026 #include "mmx.h"
00027
/* Tables defined in another translation unit (the names suggest zigzag scan
 * orders).  Neither is referenced in the visible part of this file; they may
 * be used by the template included further down -- TODO confirm. */
00028 extern uint8_t zigzag_direct_noperm[64];
00029 extern uint16_t inv_zigzag_direct16[64];
00030
/* 8-byte-aligned 64-bit constants: all bits set, and the value 1 in each of
 * the four 16-bit words.
 * NOTE(review): not referenced by any function visible here -- check
 * mpegvideo_mmx_template.c before removing them. */
00031 static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL;
00032 static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
00033
00034
/*
 * MMX dequantizer for an intra block (H.263 variant, per the function name).
 *
 * For every coefficient c handled by the asm loop the stored result is
 *     c*qmul + qadd   if c > 0
 *     c*qmul - qadd   if c < 0
 *     0               if c == 0
 * using 16-bit lane arithmetic, where qmul = 2*qscale and
 * qadd = (qscale-1)|1, or 0 when s->h263_aic is set.
 *
 * The DC value is computed in C before the loop (block[0] scaled by
 * s->y_dc_scale for n < 4, by s->c_dc_scale otherwise, or left unscaled when
 * s->h263_aic is set) and written to block[0] after the loop, replacing
 * whatever the loop stored there.
 *
 * s       codec context; reads h263_aic, ac_pred, y_dc_scale, c_dc_scale,
 *         block_last_index[] and inter_scantable.raster_end[]
 * block   coefficients, modified in place; the asm treats them as packed
 *         16-bit words and processes 8 of them (16 bytes) per iteration, so
 *         it may touch words beyond index nCoeffs up to the end of that
 *         16-byte group
 * n       block index (selects the DC scale and the last-index entry)
 * qscale  quantizer scale
 */
00035 static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
00036 DCTELEM *block, int n, int qscale)
00037 {
00038 long level, qmul, qadd, nCoeffs;
00039
00040 qmul = qscale << 1;
00041
00042 assert(s->block_last_index[n]>=0 || s->h263_aic);
00043
00044 if (!s->h263_aic) {
00045 if (n < 4)
00046 level = block[0] * s->y_dc_scale;
00047 else
00048 level = block[0] * s->c_dc_scale;
00049 qadd = (qscale - 1) | 1;
00050 }else{
00051 qadd = 0;
00052 level= block[0];
00053 }
/* With AC prediction every coefficient is processed; otherwise only up to the
 * raster position of the last coded coefficient. */
00054 if(s->ac_pred)
00055 nCoeffs=63;
00056 else
00057 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
00058
00059 asm volatile(
/* mm6 = qmul replicated into all four 16-bit lanes */
00060 "movd %1, %%mm6 \n\t"
00061 "packssdw %%mm6, %%mm6 \n\t"
00062 "packssdw %%mm6, %%mm6 \n\t"
/* mm5 = qadd replicated, mm7 = -qadd per lane, mm4 = 0 */
00063 "movd %2, %%mm5 \n\t"
00064 "pxor %%mm7, %%mm7 \n\t"
00065 "packssdw %%mm5, %%mm5 \n\t"
00066 "packssdw %%mm5, %%mm5 \n\t"
00067 "psubw %%mm5, %%mm7 \n\t"
00068 "pxor %%mm4, %%mm4 \n\t"
00069 ".balign 16\n\t"
/* loop: 16 bytes (eight 16-bit words) at %0 + %3 per iteration */
00070 "1: \n\t"
00071 "movq (%0, %3), %%mm0 \n\t"
00072 "movq 8(%0, %3), %%mm1 \n\t"
00073
/* mm0/mm1 = c * qmul (low 16 bits of each product) */
00074 "pmullw %%mm6, %%mm0 \n\t"
00075 "pmullw %%mm6, %%mm1 \n\t"
00076
00077 "movq (%0, %3), %%mm2 \n\t"
00078 "movq 8(%0, %3), %%mm3 \n\t"
00079
/* mm2/mm3 = all-ones mask in lanes where c > 0 */
00080 "pcmpgtw %%mm4, %%mm2 \n\t"
00081 "pcmpgtw %%mm4, %%mm3 \n\t"
00082
/* complement the product in positive lanes, add -qadd, complement back:
 * mm2/mm3 end up as c*qmul + qadd for c > 0 and c*qmul - qadd otherwise */
00083 "pxor %%mm2, %%mm0 \n\t"
00084 "pxor %%mm3, %%mm1 \n\t"
00085
00086 "paddw %%mm7, %%mm0 \n\t"
00087 "paddw %%mm7, %%mm1 \n\t"
00088
00089 "pxor %%mm0, %%mm2 \n\t"
00090 "pxor %%mm1, %%mm3 \n\t"
00091
/* lanes whose intermediate value equals -qadd (product was 0) are cleared,
 * so zero coefficients stay zero */
00092 "pcmpeqw %%mm7, %%mm0 \n\t"
00093 "pcmpeqw %%mm7, %%mm1 \n\t"
00094
00095 "pandn %%mm2, %%mm0 \n\t"
00096 "pandn %%mm3, %%mm1 \n\t"
00097
00098 "movq %%mm0, (%0, %3) \n\t"
00099 "movq %%mm1, 8(%0, %3) \n\t"
00100
/* advance by 16 bytes; keep looping while the byte offset is still <= 0 */
00101 "add $16, %3 \n\t"
00102 "jng 1b \n\t"
/* %0 = one element past... no: %0 = block + nCoeffs, %3 = -2*nCoeffs bytes,
 * so %0 + %3 starts at block when elements are 2 bytes wide.
 * NOTE(review): %3 is declared as an input operand but is modified by
 * "add $16, %3"; GCC's extended-asm rules forbid modifying inputs --
 * consider turning it into a "+r" operand. */
00103 ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs))
00104 : "memory"
00105 );
/* restore the separately computed DC value */
00106 block[0]= level;
00107 }
00108
00109
/*
 * MMX dequantizer for an inter block (H.263 variant, per the function name).
 *
 * Every coefficient c handled by the loop becomes
 *     c*qmul + qadd   if c > 0
 *     c*qmul - qadd   if c < 0
 *     0               if c == 0
 * in 16-bit lane arithmetic, with qmul = 2*qscale and qadd = (qscale-1)|1.
 * Unlike the intra routine there is no special handling of block[0].
 *
 * s       codec context; reads block_last_index[] and
 *         inter_scantable.raster_end[] (h263_aic only inside the assert)
 * block   coefficients, modified in place; treated as packed 16-bit words,
 *         8 words (16 bytes) per iteration, so words past nCoeffs up to the
 *         end of the last 16-byte group are processed too
 * n       block index
 * qscale  quantizer scale
 */
00110 static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
00111 DCTELEM *block, int n, int qscale)
00112 {
00113 long qmul, qadd, nCoeffs;
00114
00115 qmul = qscale << 1;
00116 qadd = (qscale - 1) | 1;
00117
00118 assert(s->block_last_index[n]>=0 || s->h263_aic);
00119
/* raster position of the last coded coefficient */
00120 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
00121
00122 asm volatile(
/* mm6 = qmul replicated into all four 16-bit lanes */
00123 "movd %1, %%mm6 \n\t"
00124 "packssdw %%mm6, %%mm6 \n\t"
00125 "packssdw %%mm6, %%mm6 \n\t"
/* mm5 = qadd replicated, mm7 = -qadd per lane, mm4 = 0 */
00126 "movd %2, %%mm5 \n\t"
00127 "pxor %%mm7, %%mm7 \n\t"
00128 "packssdw %%mm5, %%mm5 \n\t"
00129 "packssdw %%mm5, %%mm5 \n\t"
00130 "psubw %%mm5, %%mm7 \n\t"
00131 "pxor %%mm4, %%mm4 \n\t"
00132 ".balign 16\n\t"
/* loop: 16 bytes (eight 16-bit words) at %0 + %3 per iteration */
00133 "1: \n\t"
00134 "movq (%0, %3), %%mm0 \n\t"
00135 "movq 8(%0, %3), %%mm1 \n\t"
00136
/* mm0/mm1 = c * qmul (low 16 bits of each product) */
00137 "pmullw %%mm6, %%mm0 \n\t"
00138 "pmullw %%mm6, %%mm1 \n\t"
00139
00140 "movq (%0, %3), %%mm2 \n\t"
00141 "movq 8(%0, %3), %%mm3 \n\t"
00142
/* mm2/mm3 = all-ones mask in lanes where c > 0 */
00143 "pcmpgtw %%mm4, %%mm2 \n\t"
00144 "pcmpgtw %%mm4, %%mm3 \n\t"
00145
/* complement / add -qadd / complement back: mm2/mm3 become
 * c*qmul + qadd for c > 0 and c*qmul - qadd otherwise */
00146 "pxor %%mm2, %%mm0 \n\t"
00147 "pxor %%mm3, %%mm1 \n\t"
00148
00149 "paddw %%mm7, %%mm0 \n\t"
00150 "paddw %%mm7, %%mm1 \n\t"
00151
00152 "pxor %%mm0, %%mm2 \n\t"
00153 "pxor %%mm1, %%mm3 \n\t"
00154
/* clear lanes whose intermediate equals -qadd (product was 0) so that zero
 * coefficients stay zero */
00155 "pcmpeqw %%mm7, %%mm0 \n\t"
00156 "pcmpeqw %%mm7, %%mm1 \n\t"
00157
00158 "pandn %%mm2, %%mm0 \n\t"
00159 "pandn %%mm3, %%mm1 \n\t"
00160
00161 "movq %%mm0, (%0, %3) \n\t"
00162 "movq %%mm1, 8(%0, %3) \n\t"
00163
/* advance by 16 bytes; keep looping while the byte offset is still <= 0 */
00164 "add $16, %3 \n\t"
00165 "jng 1b \n\t"
/* NOTE(review): %3 is an input operand yet "add $16, %3" modifies it; GCC's
 * extended-asm rules forbid modifying inputs -- consider a "+r" operand. */
00166 ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs))
00167 : "memory"
00168 );
00169 }
00170
00171
00172
00173
00174
00175
00176
00177
00178
00179
00180
00181
00182
00183
00184
00185
00186
00187
00188
00189
00190
00191
00192
00193
00194
00195
00196
00197
00198
00199
00200
/*
 * MMX dequantizer for an intra block (MPEG-1 variant, per the function name).
 *
 * For each coefficient c with matrix entry m the loop stores, in 16-bit lane
 * arithmetic,
 *     sign(c) * ((((|c| * (m * qscale)) >> 3) - 1) | 1)
 * and leaves zero coefficients at zero.  block[0] is then overwritten with
 * the original block[0] times s->y_dc_scale (n < 4) or s->c_dc_scale.
 *
 * s       codec context; reads block_last_index[],
 *         intra_scantable.raster_end[], intra_matrix and the DC scales
 * block   coefficients, modified in place; treated as packed 16-bit words,
 *         8 words (16 bytes) per iteration
 * n       block index; s->block_last_index[n] must be >= 0
 * qscale  quantizer scale
 */
00201 static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
00202 DCTELEM *block, int n, int qscale)
00203 {
00204 long nCoeffs;
00205 const uint16_t *quant_matrix;
00206 int block0;
00207
00208 assert(s->block_last_index[n]>=0);
00209
/* number of coefficients up to and including the last coded raster position */
00210 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
00211
/* DC is scaled in C and restored after the loop */
00212 if (n < 4)
00213 block0 = block[0] * s->y_dc_scale;
00214 else
00215 block0 = block[0] * s->c_dc_scale;
00216
00217 quant_matrix = s->intra_matrix;
00218 asm volatile(
/* mm7 = 1 in every 16-bit lane (all-ones shifted right by 15) */
00219 "pcmpeqw %%mm7, %%mm7 \n\t"
00220 "psrlw $15, %%mm7 \n\t"
/* mm6 = qscale replicated into all four lanes */
00221 "movd %2, %%mm6 \n\t"
00222 "packssdw %%mm6, %%mm6 \n\t"
00223 "packssdw %%mm6, %%mm6 \n\t"
/* REG_a = negative byte offset (-2*nCoeffs), counted up towards 0 */
00224 "mov %3, %%"REG_a" \n\t"
00225 ".balign 16\n\t"
00226 "1: \n\t"
00227 "movq (%0, %%"REG_a"), %%mm0 \n\t"
00228 "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
/* mm4/mm5 = matrix entries * qscale */
00229 "movq (%1, %%"REG_a"), %%mm4 \n\t"
00230 "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
00231 "pmullw %%mm6, %%mm4 \n\t"
00232 "pmullw %%mm6, %%mm5 \n\t"
/* mm2/mm3 = sign masks (all-ones where c < 0); mm0/mm1 = |c| */
00233 "pxor %%mm2, %%mm2 \n\t"
00234 "pxor %%mm3, %%mm3 \n\t"
00235 "pcmpgtw %%mm0, %%mm2 \n\t"
00236 "pcmpgtw %%mm1, %%mm3 \n\t"
00237 "pxor %%mm2, %%mm0 \n\t"
00238 "pxor %%mm3, %%mm1 \n\t"
00239 "psubw %%mm2, %%mm0 \n\t"
00240 "psubw %%mm3, %%mm1 \n\t"
00241 "pmullw %%mm4, %%mm0 \n\t"
00242 "pmullw %%mm5, %%mm1 \n\t"
/* mm4/mm5 = masks of lanes whose original coefficient was 0 */
00243 "pxor %%mm4, %%mm4 \n\t"
00244 "pxor %%mm5, %%mm5 \n\t"
00245 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t"
00246 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t"
/* >> 3, then (x - 1) | 1 forces the magnitude odd */
00247 "psraw $3, %%mm0 \n\t"
00248 "psraw $3, %%mm1 \n\t"
00249 "psubw %%mm7, %%mm0 \n\t"
00250 "psubw %%mm7, %%mm1 \n\t"
00251 "por %%mm7, %%mm0 \n\t"
00252 "por %%mm7, %%mm1 \n\t"
/* re-apply the sign, then zero the lanes that were 0 on input */
00253 "pxor %%mm2, %%mm0 \n\t"
00254 "pxor %%mm3, %%mm1 \n\t"
00255 "psubw %%mm2, %%mm0 \n\t"
00256 "psubw %%mm3, %%mm1 \n\t"
00257 "pandn %%mm0, %%mm4 \n\t"
00258 "pandn %%mm1, %%mm5 \n\t"
00259 "movq %%mm4, (%0, %%"REG_a") \n\t"
00260 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
00261
/* advance by 16 bytes; loop while the offset is still negative */
00262 "add $16, %%"REG_a" \n\t"
00263 "js 1b \n\t"
00264 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
00265 : "%"REG_a, "memory"
00266 );
/* restore the separately scaled DC value */
00267 block[0]= block0;
00268 }
00269
/*
 * MMX dequantizer for an inter block (MPEG-1 variant, per the function name).
 *
 * For each coefficient c with matrix entry m the loop stores, in 16-bit lane
 * arithmetic,
 *     sign(c) * (((((2*|c| + 1) * (m * qscale)) >> 4) - 1) | 1)
 * and leaves zero coefficients at zero.  block[0] gets no special treatment.
 *
 * s       codec context; reads block_last_index[],
 *         intra_scantable.raster_end[] and inter_matrix
 * block   coefficients, modified in place; treated as packed 16-bit words,
 *         8 words (16 bytes) per iteration
 * n       block index; s->block_last_index[n] must be >= 0
 * qscale  quantizer scale
 *
 * NOTE(review): this inter routine reads s->intra_scantable rather than
 * s->inter_scantable -- confirm that is intentional.
 */
00270 static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
00271 DCTELEM *block, int n, int qscale)
00272 {
00273 long nCoeffs;
00274 const uint16_t *quant_matrix;
00275
00276 assert(s->block_last_index[n]>=0);
00277
/* number of coefficients up to and including the last coded raster position */
00278 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
00279
00280 quant_matrix = s->inter_matrix;
00281 asm volatile(
/* mm7 = 1 in every 16-bit lane; mm6 = qscale replicated */
00282 "pcmpeqw %%mm7, %%mm7 \n\t"
00283 "psrlw $15, %%mm7 \n\t"
00284 "movd %2, %%mm6 \n\t"
00285 "packssdw %%mm6, %%mm6 \n\t"
00286 "packssdw %%mm6, %%mm6 \n\t"
/* REG_a = negative byte offset (-2*nCoeffs), counted up towards 0 */
00287 "mov %3, %%"REG_a" \n\t"
00288 ".balign 16\n\t"
00289 "1: \n\t"
00290 "movq (%0, %%"REG_a"), %%mm0 \n\t"
00291 "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
/* mm4/mm5 = matrix entries * qscale */
00292 "movq (%1, %%"REG_a"), %%mm4 \n\t"
00293 "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
00294 "pmullw %%mm6, %%mm4 \n\t"
00295 "pmullw %%mm6, %%mm5 \n\t"
/* mm2/mm3 = sign masks (all-ones where c < 0); mm0/mm1 = |c| */
00296 "pxor %%mm2, %%mm2 \n\t"
00297 "pxor %%mm3, %%mm3 \n\t"
00298 "pcmpgtw %%mm0, %%mm2 \n\t"
00299 "pcmpgtw %%mm1, %%mm3 \n\t"
00300 "pxor %%mm2, %%mm0 \n\t"
00301 "pxor %%mm3, %%mm1 \n\t"
00302 "psubw %%mm2, %%mm0 \n\t"
00303 "psubw %%mm3, %%mm1 \n\t"
/* 2*|c| + 1, then multiply by m*qscale */
00304 "paddw %%mm0, %%mm0 \n\t"
00305 "paddw %%mm1, %%mm1 \n\t"
00306 "paddw %%mm7, %%mm0 \n\t"
00307 "paddw %%mm7, %%mm1 \n\t"
00308 "pmullw %%mm4, %%mm0 \n\t"
00309 "pmullw %%mm5, %%mm1 \n\t"
/* mm4/mm5 = masks of lanes whose original coefficient was 0 */
00310 "pxor %%mm4, %%mm4 \n\t"
00311 "pxor %%mm5, %%mm5 \n\t"
00312 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t"
00313 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t"
/* >> 4, then (x - 1) | 1 forces the magnitude odd */
00314 "psraw $4, %%mm0 \n\t"
00315 "psraw $4, %%mm1 \n\t"
00316 "psubw %%mm7, %%mm0 \n\t"
00317 "psubw %%mm7, %%mm1 \n\t"
00318 "por %%mm7, %%mm0 \n\t"
00319 "por %%mm7, %%mm1 \n\t"
/* re-apply the sign, then zero the lanes that were 0 on input */
00320 "pxor %%mm2, %%mm0 \n\t"
00321 "pxor %%mm3, %%mm1 \n\t"
00322 "psubw %%mm2, %%mm0 \n\t"
00323 "psubw %%mm3, %%mm1 \n\t"
00324 "pandn %%mm0, %%mm4 \n\t"
00325 "pandn %%mm1, %%mm5 \n\t"
00326 "movq %%mm4, (%0, %%"REG_a") \n\t"
00327 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
00328
/* advance by 16 bytes; loop while the offset is still negative */
00329 "add $16, %%"REG_a" \n\t"
00330 "js 1b \n\t"
00331 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
00332 : "%"REG_a, "memory"
00333 );
00334 }
00335
/*
 * MMX dequantizer for an intra block (MPEG-2 variant, per the function name).
 *
 * For each coefficient c with matrix entry m the loop stores, in 16-bit lane
 * arithmetic,
 *     sign(c) * ((|c| * (m * qscale)) >> 3)
 * and leaves zero coefficients at zero (no forcing to odd values here).
 * block[0] is then overwritten with the original block[0] times
 * s->y_dc_scale (n < 4) or s->c_dc_scale.
 *
 * s       codec context; reads alternate_scan, block_last_index[],
 *         intra_scantable.raster_end[], intra_matrix and the DC scales
 * block   coefficients, modified in place; treated as packed 16-bit words,
 *         8 words (16 bytes) per iteration
 * n       block index; s->block_last_index[n] must be >= 0
 * qscale  quantizer scale
 */
00336 static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
00337 DCTELEM *block, int n, int qscale)
00338 {
00339 long nCoeffs;
00340 const uint16_t *quant_matrix;
00341 int block0;
00342
00343 assert(s->block_last_index[n]>=0);
00344
/* with alternate scan the whole block is processed; otherwise stop at the
 * raster position of the last coded coefficient */
00345 if(s->alternate_scan) nCoeffs= 63;
00346 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
00347
/* DC is scaled in C and restored after the loop */
00348 if (n < 4)
00349 block0 = block[0] * s->y_dc_scale;
00350 else
00351 block0 = block[0] * s->c_dc_scale;
00352 quant_matrix = s->intra_matrix;
00353 asm volatile(
/* mm7 = 1 in every 16-bit lane (not used by the loop below);
 * mm6 = qscale replicated into all four lanes */
00354 "pcmpeqw %%mm7, %%mm7 \n\t"
00355 "psrlw $15, %%mm7 \n\t"
00356 "movd %2, %%mm6 \n\t"
00357 "packssdw %%mm6, %%mm6 \n\t"
00358 "packssdw %%mm6, %%mm6 \n\t"
/* REG_a = negative byte offset (-2*nCoeffs), counted up */
00359 "mov %3, %%"REG_a" \n\t"
00360 ".balign 16\n\t"
00361 "1: \n\t"
00362 "movq (%0, %%"REG_a"), %%mm0 \n\t"
00363 "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
/* mm4/mm5 = matrix entries * qscale */
00364 "movq (%1, %%"REG_a"), %%mm4 \n\t"
00365 "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
00366 "pmullw %%mm6, %%mm4 \n\t"
00367 "pmullw %%mm6, %%mm5 \n\t"
/* mm2/mm3 = sign masks (all-ones where c < 0); mm0/mm1 = |c| */
00368 "pxor %%mm2, %%mm2 \n\t"
00369 "pxor %%mm3, %%mm3 \n\t"
00370 "pcmpgtw %%mm0, %%mm2 \n\t"
00371 "pcmpgtw %%mm1, %%mm3 \n\t"
00372 "pxor %%mm2, %%mm0 \n\t"
00373 "pxor %%mm3, %%mm1 \n\t"
00374 "psubw %%mm2, %%mm0 \n\t"
00375 "psubw %%mm3, %%mm1 \n\t"
00376 "pmullw %%mm4, %%mm0 \n\t"
00377 "pmullw %%mm5, %%mm1 \n\t"
/* mm4/mm5 = masks of lanes whose original coefficient was 0 */
00378 "pxor %%mm4, %%mm4 \n\t"
00379 "pxor %%mm5, %%mm5 \n\t"
00380 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t"
00381 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t"
/* >> 3, re-apply the sign, zero the lanes that were 0 on input */
00382 "psraw $3, %%mm0 \n\t"
00383 "psraw $3, %%mm1 \n\t"
00384 "pxor %%mm2, %%mm0 \n\t"
00385 "pxor %%mm3, %%mm1 \n\t"
00386 "psubw %%mm2, %%mm0 \n\t"
00387 "psubw %%mm3, %%mm1 \n\t"
00388 "pandn %%mm0, %%mm4 \n\t"
00389 "pandn %%mm1, %%mm5 \n\t"
00390 "movq %%mm4, (%0, %%"REG_a") \n\t"
00391 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
00392
/* advance by 16 bytes; loop while the offset is still <= 0 */
00393 "add $16, %%"REG_a" \n\t"
00394 "jng 1b \n\t"
00395 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
00396 : "%"REG_a, "memory"
00397 );
/* restore the separately scaled DC value */
00398 block[0]= block0;
00399
00400 }
00401
/*
 * MMX dequantizer for an inter block (MPEG-2 variant, per the function name).
 *
 * For each coefficient c with matrix entry m the loop stores, in 16-bit lane
 * arithmetic,
 *     sign(c) * (((2*|c| + 1) * (m * qscale)) >> 4)      (logical shift)
 * and leaves zero coefficients at zero.  While doing so it XOR-accumulates
 * every stored word into mm7; after the loop the accumulated parity bit is
 * XORed into bit 0 of the 16-bit word at byte offset 126 from %0 + %3.
 * (This looks like MPEG-2 mismatch control on the last coefficient -- confirm
 * against the C reference implementation.)
 *
 * s       codec context; reads alternate_scan, block_last_index[],
 *         intra_scantable.raster_end[] and inter_matrix
 * block   coefficients, modified in place; treated as packed 16-bit words,
 *         8 words (16 bytes) per iteration
 * n       block index; s->block_last_index[n] must be >= 0
 * qscale  quantizer scale
 *
 * NOTE(review): this inter routine reads s->intra_scantable rather than
 * s->inter_scantable -- confirm that is intentional.
 */
00402 static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
00403 DCTELEM *block, int n, int qscale)
00404 {
00405 long nCoeffs;
00406 const uint16_t *quant_matrix;
00407
00408 assert(s->block_last_index[n]>=0);
00409
/* with alternate scan the whole block is processed; otherwise stop at the
 * raster position of the last coded coefficient */
00410 if(s->alternate_scan) nCoeffs= 63;
00411 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
00412
00413 quant_matrix = s->inter_matrix;
00414 asm volatile(
/* mm7 = 0x000000000000FFFF: XOR accumulator whose lowest word starts as all ones */
00415 "pcmpeqw %%mm7, %%mm7 \n\t"
00416 "psrlq $48, %%mm7 \n\t"
/* mm6 = qscale replicated into all four 16-bit lanes */
00417 "movd %2, %%mm6 \n\t"
00418 "packssdw %%mm6, %%mm6 \n\t"
00419 "packssdw %%mm6, %%mm6 \n\t"
/* REG_a = running negative byte offset; %3 itself stays unchanged */
00420 "mov %3, %%"REG_a" \n\t"
00421 ".balign 16\n\t"
00422 "1: \n\t"
00423 "movq (%0, %%"REG_a"), %%mm0 \n\t"
00424 "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
/* mm4/mm5 = matrix entries * qscale */
00425 "movq (%1, %%"REG_a"), %%mm4 \n\t"
00426 "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
00427 "pmullw %%mm6, %%mm4 \n\t"
00428 "pmullw %%mm6, %%mm5 \n\t"
/* mm2/mm3 = sign masks (all-ones where c < 0); mm0/mm1 = |c| */
00429 "pxor %%mm2, %%mm2 \n\t"
00430 "pxor %%mm3, %%mm3 \n\t"
00431 "pcmpgtw %%mm0, %%mm2 \n\t"
00432 "pcmpgtw %%mm1, %%mm3 \n\t"
00433 "pxor %%mm2, %%mm0 \n\t"
00434 "pxor %%mm3, %%mm1 \n\t"
00435 "psubw %%mm2, %%mm0 \n\t"
00436 "psubw %%mm3, %%mm1 \n\t"
/* 2*|c| * (m*qscale) + (m*qscale) == (2*|c| + 1) * (m*qscale) */
00437 "paddw %%mm0, %%mm0 \n\t"
00438 "paddw %%mm1, %%mm1 \n\t"
00439 "pmullw %%mm4, %%mm0 \n\t"
00440 "pmullw %%mm5, %%mm1 \n\t"
00441 "paddw %%mm4, %%mm0 \n\t"
00442 "paddw %%mm5, %%mm1 \n\t"
/* mm4/mm5 = masks of lanes whose original coefficient was 0 */
00443 "pxor %%mm4, %%mm4 \n\t"
00444 "pxor %%mm5, %%mm5 \n\t"
00445 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t"
00446 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t"
/* logical >> 4, re-apply the sign, zero the lanes that were 0 on input */
00447 "psrlw $4, %%mm0 \n\t"
00448 "psrlw $4, %%mm1 \n\t"
00449 "pxor %%mm2, %%mm0 \n\t"
00450 "pxor %%mm3, %%mm1 \n\t"
00451 "psubw %%mm2, %%mm0 \n\t"
00452 "psubw %%mm3, %%mm1 \n\t"
00453 "pandn %%mm0, %%mm4 \n\t"
00454 "pandn %%mm1, %%mm5 \n\t"
/* fold the results into the XOR accumulator before storing them */
00455 "pxor %%mm4, %%mm7 \n\t"
00456 "pxor %%mm5, %%mm7 \n\t"
00457 "movq %%mm4, (%0, %%"REG_a") \n\t"
00458 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
00459
/* advance by 16 bytes; loop while the offset is still <= 0 */
00460 "add $16, %%"REG_a" \n\t"
00461 "jng 1b \n\t"
/* load the 4 bytes at byte offset 124 from %0 + %3 (words 62 and 63 of the
 * block when elements are 2 bytes wide) */
00462 "movd 124(%0, %3), %%mm0 \n\t"
/* reduce mm7 so that bit 0 holds the XOR of bit 0 of all four lanes */
00463 "movq %%mm7, %%mm6 \n\t"
00464 "psrlq $32, %%mm7 \n\t"
00465 "pxor %%mm6, %%mm7 \n\t"
00466 "movq %%mm7, %%mm6 \n\t"
00467 "psrlq $16, %%mm7 \n\t"
00468 "pxor %%mm6, %%mm7 \n\t"
/* isolate that bit and move it to bit 16, i.e. bit 0 of the upper word of
 * the loaded dword, then toggle it there and write the dword back */
00469 "pslld $31, %%mm7 \n\t"
00470 "psrlq $15, %%mm7 \n\t"
00471 "pxor %%mm7, %%mm0 \n\t"
00472 "movd %%mm0, 124(%0, %3) \n\t"
00473
00474 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (-2*nCoeffs)
00475 : "%"REG_a, "memory"
00476 );
00477 }
00478
00479
00480
/**
 * Replicate the border pixels of an image plane into the margin around it
 * (MMX implementation).
 *
 * Step 1 (left/right): for each of the `height` rows, the first pixel of the
 * row is copied into the bytes just left of it and the last pixel into the
 * bytes just right of it.  When w == 8, 8 bytes are written on each side;
 * for any other w, 16 bytes are written on each side.
 *
 * Step 2 (top/bottom): the first row (including the side margins just
 * written) is copied upwards and the last row downwards, four rows per pass,
 * for i = 0, 4, 8, ... < w.  Each row copy moves 8 bytes at a time until
 * width + 2*w bytes are covered.
 *
 * All four asm statements store through pointers the compiler cannot see, so
 * each one declares a "memory" clobber (as the dequantizers in this file do).
 * `__asm__` is used instead of `asm` so the function also builds in strict
 * ISO modes.
 *
 * @param buf    pointer to the top-left pixel of the image (inside a larger
 *               buffer that has room for the margins)
 * @param wrap   distance in bytes between the starts of consecutive rows
 * @param width  image width in bytes; step 1 reads the 8 bytes ending at
 *               buf[width-1], so width is expected to be >= 8
 * @param height image height in rows
 * @param w      margin size; 8 selects the 8-byte side path, other values the
 *               16-byte one.  Rows are written in groups of 4.
 */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;

    /* left and right margins, one image row per iteration */
    ptr = buf;
    if (w == 8) {
        __asm__ volatile(
            "1:                             \n\t"
            /* broadcast the first byte of the row into all 8 bytes of mm0 */
            "movd (%0), %%mm0               \n\t"
            "punpcklbw %%mm0, %%mm0         \n\t"
            "punpcklwd %%mm0, %%mm0         \n\t"
            "punpckldq %%mm0, %%mm0         \n\t"
            "movq %%mm0, -8(%0)             \n\t"
            /* broadcast the last byte of the row into all 8 bytes of mm1 */
            "movq -8(%0, %2), %%mm1         \n\t"
            "punpckhbw %%mm1, %%mm1         \n\t"
            "punpckhwd %%mm1, %%mm1         \n\t"
            "punpckhdq %%mm1, %%mm1         \n\t"
            "movq %%mm1, (%0, %2)           \n\t"
            "add %1, %0                     \n\t"
            "cmp %3, %0                     \n\t"
            " jb 1b                         \n\t"
            : "+r" (ptr)
            : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)
            : "memory"
        );
    } else {
        __asm__ volatile(
            "1:                             \n\t"
            "movd (%0), %%mm0               \n\t"
            "punpcklbw %%mm0, %%mm0         \n\t"
            "punpcklwd %%mm0, %%mm0         \n\t"
            "punpckldq %%mm0, %%mm0         \n\t"
            "movq %%mm0, -8(%0)             \n\t"
            "movq %%mm0, -16(%0)            \n\t"
            "movq -8(%0, %2), %%mm1         \n\t"
            "punpckhbw %%mm1, %%mm1         \n\t"
            "punpckhwd %%mm1, %%mm1         \n\t"
            "punpckhdq %%mm1, %%mm1         \n\t"
            "movq %%mm1, (%0, %2)           \n\t"
            "movq %%mm1, 8(%0, %2)          \n\t"
            "add %1, %0                     \n\t"
            "cmp %3, %0                     \n\t"
            " jb 1b                         \n\t"
            : "+r" (ptr)
            : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)
            : "memory"
        );
    }

    for (i = 0; i < w; i += 4) {
        /* top: copy row 0 (from buf - w) into rows -(i+1) .. -(i+4) */
        ptr = buf - (i + 1) * wrap - w;
        __asm__ volatile(
            "1:                             \n\t"
            "movq (%1, %0), %%mm0           \n\t"
            "movq %%mm0, (%0)               \n\t"
            "movq %%mm0, (%0, %2)           \n\t"
            "movq %%mm0, (%0, %2, 2)        \n\t"
            "movq %%mm0, (%0, %3)           \n\t"
            "add $8, %0                     \n\t"
            "cmp %4, %0                     \n\t"
            " jb 1b                         \n\t"
            : "+r" (ptr)
            : "r" ((long)buf - (long)ptr - w), "r" ((long)-wrap), "r" ((long)-wrap*3), "r" (ptr+width+2*w)
            : "memory"
        );
        /* bottom: copy the last row into the rows i+1 .. i+4 below it */
        ptr = last_line + (i + 1) * wrap - w;
        __asm__ volatile(
            "1:                             \n\t"
            "movq (%1, %0), %%mm0           \n\t"
            "movq %%mm0, (%0)               \n\t"
            "movq %%mm0, (%0, %2)           \n\t"
            "movq %%mm0, (%0, %2, 2)        \n\t"
            "movq %%mm0, (%0, %3)           \n\t"
            "add $8, %0                     \n\t"
            "cmp %4, %0                     \n\t"
            " jb 1b                         \n\t"
            : "+r" (ptr)
            : "r" ((long)last_line - (long)ptr - w), "r" ((long)wrap), "r" ((long)wrap*3), "r" (ptr+width+2*w)
            : "memory"
        );
    }
}
00566
00567 static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){
00568 const int intra= s->mb_intra;
00569 int *sum= s->dct_error_sum[intra];
00570 uint16_t *offset= s->dct_offset[intra];
00571
00572 s->dct_count[intra]++;
00573
00574 asm volatile(
00575 "pxor %%mm7, %%mm7 \n\t"
00576 "1: \n\t"
00577 "pxor %%mm0, %%mm0 \n\t"
00578 "pxor %%mm1, %%mm1 \n\t"
00579 "movq (%0), %%mm2 \n\t"
00580 "movq 8(%0), %%mm3 \n\t"
00581 "pcmpgtw %%mm2, %%mm0 \n\t"
00582 "pcmpgtw %%mm3, %%mm1 \n\t"
00583 "pxor %%mm0, %%mm2 \n\t"
00584 "pxor %%mm1, %%mm3 \n\t"
00585 "psubw %%mm0, %%mm2 \n\t"
00586 "psubw %%mm1, %%mm3 \n\t"
00587 "movq %%mm2, %%mm4 \n\t"
00588 "movq %%mm3, %%mm5 \n\t"
00589 "psubusw (%2), %%mm2 \n\t"
00590 "psubusw 8(%2), %%mm3 \n\t"
00591 "pxor %%mm0, %%mm2 \n\t"
00592 "pxor %%mm1, %%mm3 \n\t"
00593 "psubw %%mm0, %%mm2 \n\t"
00594 "psubw %%mm1, %%mm3 \n\t"
00595 "movq %%mm2, (%0) \n\t"
00596 "movq %%mm3, 8(%0) \n\t"
00597 "movq %%mm4, %%mm2 \n\t"
00598 "movq %%mm5, %%mm3 \n\t"
00599 "punpcklwd %%mm7, %%mm4 \n\t"
00600 "punpckhwd %%mm7, %%mm2 \n\t"
00601 "punpcklwd %%mm7, %%mm5 \n\t"
00602 "punpckhwd %%mm7, %%mm3 \n\t"
00603 "paddd (%1), %%mm4 \n\t"
00604 "paddd 8(%1), %%mm2 \n\t"
00605 "paddd 16(%1), %%mm5 \n\t"
00606 "paddd 24(%1), %%mm3 \n\t"
00607 "movq %%mm4, (%1) \n\t"
00608 "movq %%mm2, 8(%1) \n\t"
00609 "movq %%mm5, 16(%1) \n\t"
00610 "movq %%mm3, 24(%1) \n\t"
00611 "add $16, %0 \n\t"
00612 "add $32, %1 \n\t"
00613 "add $16, %2 \n\t"
00614 "cmp %3, %0 \n\t"
00615 " jb 1b \n\t"
00616 : "+r" (block), "+r" (sum), "+r" (offset)
00617 : "r"(block+64)
00618 );
00619 }
00620
00621 static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){
00622 const int intra= s->mb_intra;
00623 int *sum= s->dct_error_sum[intra];
00624 uint16_t *offset= s->dct_offset[intra];
00625
00626 s->dct_count[intra]++;
00627
00628 asm volatile(
00629 "pxor %%xmm7, %%xmm7 \n\t"
00630 "1: \n\t"
00631 "pxor %%xmm0, %%xmm0 \n\t"
00632 "pxor %%xmm1, %%xmm1 \n\t"
00633 "movdqa (%0), %%xmm2 \n\t"
00634 "movdqa 16(%0), %%xmm3 \n\t"
00635 "pcmpgtw %%xmm2, %%xmm0 \n\t"
00636 "pcmpgtw %%xmm3, %%xmm1 \n\t"
00637 "pxor %%xmm0, %%xmm2 \n\t"
00638 "pxor %%xmm1, %%xmm3 \n\t"
00639 "psubw %%xmm0, %%xmm2 \n\t"
00640 "psubw %%xmm1, %%xmm3 \n\t"
00641 "movdqa %%xmm2, %%xmm4 \n\t"
00642 "movdqa %%xmm3, %%xmm5 \n\t"
00643 "psubusw (%2), %%xmm2 \n\t"
00644 "psubusw 16(%2), %%xmm3 \n\t"
00645 "pxor %%xmm0, %%xmm2 \n\t"
00646 "pxor %%xmm1, %%xmm3 \n\t"
00647 "psubw %%xmm0, %%xmm2 \n\t"
00648 "psubw %%xmm1, %%xmm3 \n\t"
00649 "movdqa %%xmm2, (%0) \n\t"
00650 "movdqa %%xmm3, 16(%0) \n\t"
00651 "movdqa %%xmm4, %%xmm6 \n\t"
00652 "movdqa %%xmm5, %%xmm0 \n\t"
00653 "punpcklwd %%xmm7, %%xmm4 \n\t"
00654 "punpckhwd %%xmm7, %%xmm6 \n\t"
00655 "punpcklwd %%xmm7, %%xmm5 \n\t"
00656 "punpckhwd %%xmm7, %%xmm0 \n\t"
00657 "paddd (%1), %%xmm4 \n\t"
00658 "paddd 16(%1), %%xmm6 \n\t"
00659 "paddd 32(%1), %%xmm5 \n\t"
00660 "paddd 48(%1), %%xmm0 \n\t"
00661 "movdqa %%xmm4, (%1) \n\t"
00662 "movdqa %%xmm6, 16(%1) \n\t"
00663 "movdqa %%xmm5, 32(%1) \n\t"
00664 "movdqa %%xmm0, 48(%1) \n\t"
00665 "add $32, %0 \n\t"
00666 "add $64, %1 \n\t"
00667 "add $32, %2 \n\t"
00668 "cmp %3, %0 \n\t"
00669 " jb 1b \n\t"
00670 : "+r" (block), "+r" (sum), "+r" (offset)
00671 : "r"(block+64)
00672 );
00673 }
00674
/* The template file is included three times.  RENAME()/RENAMEl() append a
 * per-variant suffix to the names it defines, which is presumably where
 * dct_quantize_MMX, dct_quantize_MMX2 and dct_quantize_SSE2 (used by
 * MPV_common_init_mmx below) come from -- confirm in the template. */

/* Pass 1: plain MMX, with HAVE_MMX2 explicitly undefined. */
00675 #undef HAVE_MMX2
00676 #define RENAME(a) a ## _MMX
00677 #define RENAMEl(a) a ## _mmx
00678 #include "mpegvideo_mmx_template.c"
00679
/* Pass 2: MMX2, with HAVE_MMX2 defined. */
00680 #define HAVE_MMX2
00681 #undef RENAME
00682 #undef RENAMEl
00683 #define RENAME(a) a ## _MMX2
00684 #define RENAMEl(a) a ## _mmx2
00685 #include "mpegvideo_mmx_template.c"
00686
/* Pass 3: SSE2 names.  HAVE_MMX2 is still defined from pass 2 and nothing
 * SSE2-specific is defined here.
 * NOTE(review): check how the template tells this pass apart from pass 2. */
00687 #undef RENAME
00688 #undef RENAMEl
00689 #define RENAME(a) a ## _SSE2
00690 #define RENAMEl(a) a ## _sse2
00691 #include "mpegvideo_mmx_template.c"
00692
00693 void MPV_common_init_mmx(MpegEncContext *s)
00694 {
00695 if (mm_flags & MM_MMX) {
00696 const int dct_algo = s->avctx->dct_algo;
00697
00698 s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
00699 s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
00700 s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
00701 s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
00702 s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
00703 s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
00704
00705 draw_edges = draw_edges_mmx;
00706
00707 if (mm_flags & MM_SSE2) {
00708 s->denoise_dct= denoise_dct_sse2;
00709 } else {
00710 s->denoise_dct= denoise_dct_mmx;
00711 }
00712
00713 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
00714 if(mm_flags & MM_SSE2){
00715 s->dct_quantize= dct_quantize_SSE2;
00716 } else if(mm_flags & MM_MMXEXT){
00717 s->dct_quantize= dct_quantize_MMX2;
00718 } else {
00719 s->dct_quantize= dct_quantize_MMX;
00720 }
00721 }
00722 }
00723 }