00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039 #include "config.h"
00040 #include <stdio.h>
00041 #include <math.h>
00042 #include <fenv.h>
00043 #include "global.h"
00044 #include "cpu_accel.h"
00045 #include "simd.h"
00046 #include "attributes.h"
00047 #include "mmx.h"
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074 int quant_non_intra_hv_3dnow(
00075 pict_data_s *picture,
00076 int16_t *src, int16_t *dst,
00077 int mquant,
00078 int *nonsat_mquant)
00079 {
00080 int saturated;
00081 int satlim = dctsatlim;
00082 float *i_quant_matf;
00083 int coeff_count = 64*block_count;
00084 uint32_t nzflag, flags;
00085 int16_t *psrc, *pdst;
00086 float *piqf;
00087 int i;
00088 uint32_t tmp;
00089
00090
00091
00092 __asm__ ( "movl %0, %%eax\n"
00093 "movd %%eax, %%mm6\n"
00094 : :"g" (1) : "eax" );
00095
00096 movd_m2r( satlim, mm1 );
00097 punpcklwd_r2r( mm1, mm1 );
00098 punpckldq_r2r( mm1, mm1 );
00099 restart:
00100 i_quant_matf = i_inter_q_tblf[mquant];
00101 flags = 0;
00102 piqf = i_quant_matf;
00103 saturated = 0;
00104 nzflag = 0;
00105 psrc = src;
00106 pdst = dst;
00107 for (i=0; i < coeff_count ; i+=4)
00108 {
00109
00110
00111
00112
00113
00114
00115
00116
00117 movq_m2r( *(mmx_t *)&psrc[0], mm2 );
00118 movq_r2r( mm2, mm7 );
00119 psraw_i2r( 16, mm7 );
00120 movq_r2r( mm2, mm3 );
00121 punpcklwd_r2r( mm7, mm2 );
00122 punpckhwd_r2r( mm7, mm3);
00123
00124
00125 pslld_i2r( 4, mm2 );
00126 pslld_i2r( 4, mm3 );
00127
00128
00129
00130
00131
00132
00133
00134 movq_m2r( *(mmx_t*)&piqf[0], mm4);
00135 pi2fd_r2r( mm2, mm2);
00136 movq_m2r( *(mmx_t*)&piqf[2], mm5);
00137 pi2fd_r2r( mm3, mm3);
00138
00139
00140
00141 pfmul_r2r( mm4, mm2 );
00142 pf2id_r2r( mm2, mm2);
00143 pfmul_r2r( mm5, mm3);
00144 pf2id_r2r( mm3, mm3);
00145
00146
00147
00148 packssdw_r2r( mm3, mm2);
00149
00150
00151
00152 movq_r2r( mm2, mm4 );
00153
00154 pxor_r2r( mm5, mm5 );
00155 pcmpgtw_r2r( mm1, mm4 );
00156 psubw_r2r( mm2, mm5 );
00157 pcmpgtw_r2r( mm1, mm5 );
00158 por_r2r( mm5, mm4 );
00159 movq_r2r( mm4, mm5 );
00160 psrlq_i2r( 32, mm5);
00161 por_r2r( mm5, mm4 );
00162
00163 movd_m2r( saturated, mm5 );
00164 por_r2r( mm4, mm5 );
00165 movd_r2m( mm5, saturated );
00166
00167
00168 movq_r2r( mm2, mm3 );
00169 movq_r2m( mm2, *(mmx_t*)pdst );
00170 psrlq_i2r( 32, mm3 );
00171 por_r2r( mm3, mm2 );
00172 movd_r2m( mm2, tmp );
00173 flags |= tmp;
00174
00175 piqf += 4;
00176 pdst += 4;
00177 psrc += 4;
00178
00179 if( (i & 63) == (63/4)*4 )
00180 {
00181
00182 if( saturated )
00183 {
00184 int new_mquant = next_larger_quant_hv( picture, mquant );
00185 if( new_mquant != mquant )
00186 {
00187 mquant = new_mquant;
00188 goto restart;
00189 }
00190 else
00191 {
00192 return quant_non_intra_hv(picture, src, dst, mquant,
00193 nonsat_mquant);
00194 }
00195 }
00196
00197 nzflag = (nzflag<<1) | !!flags;
00198 flags = 0;
00199 piqf = i_quant_matf;
00200 }
00201
00202 }
00203 femms();
00204
00205
00206 return nzflag;
00207 }
00208
00209
00210
00211
00212
00213
00214
00215
00216
00217 #if 0
00218
00219
00220
00221
00222 static int trunc_mxcsr = 0x7f80;
00223
00224 int quant_non_intra_hv_sse(
00225 pict_data_s *picture,
00226 int16_t *src, int16_t *dst,
00227 int mquant,
00228 int *nonsat_mquant)
00229 {
00230 int saturated;
00231 int satlim = dctsatlim;
00232 float *i_quant_matf;
00233 int coeff_count = 64*block_count;
00234 uint32_t nzflag, flags;
00235 int16_t *psrc, *pdst;
00236 float *piqf;
00237 int i;
00238 uint32_t tmp;
00239
00240
00241
00242 __asm__ ( "movl %0, %%eax\n"
00243 "movd %%eax, %%mm6\n"
00244 : :"g" (1) : "eax" );
00245
00246 __asm__ ( "ldmxcsr (%0)\n" : : "X" (trunc_mxcsr) );
00247
00248
00249 movd_m2r( satlim, mm1 );
00250 punpcklwd_r2r( mm1, mm1 );
00251 punpckldq_r2r( mm1, mm1 );
00252 restart:
00253 i_quant_matf = i_inter_q_tblf[mquant];
00254 flags = 0;
00255 piqf = i_quant_matf;
00256 saturated = 0;
00257 nzflag = 0;
00258 psrc = src;
00259 pdst = dst;
00260 for (i=0; i < coeff_count ; i+=4)
00261 {
00262
00263
00264
00265
00266 movq_m2r( *(mmx_t *)&psrc[0], mm2 );
00267 movq_r2r( mm2, mm7 );
00268 psraw_i2r( 16, mm7 );
00269 movq_r2r( mm2, mm3 );
00270 punpcklwd_r2r( mm7, mm2 );
00271 punpckhwd_r2r( mm7, mm3);
00272
00273
00274 pslld_i2r( 4, mm2 );
00275 pslld_i2r( 4, mm3 );
00276
00277
00278
00279
00280 cvtpi2ps_r2r( mm2, xmm2 );
00281 cvtpi2ps_r2r( mm3, xmm3 );
00282 shufps_r2ri( xmm3, xmm2, 0*1 + 1*4 + 0 * 16 + 1 * 64 );
00283
00284
00285
00286 mulps_m2r( *(mmx_t*)&piqf[0], xmm2 );
00287 cvtps2pi_r2r( xmm2, mm2 );
00288 shufps_r2ri( xmm2, xmm2, 2*1 + 3*4 + 0 * 16 + 1 * 64 );
00289 cvtps2pi_r2r( xmm2, mm3 );
00290
00291
00292 packssdw_r2r( mm3, mm2);
00293
00294
00295
00296 movq_r2r( mm2, mm4 );
00297
00298 pxor_r2r( mm5, mm5 );
00299 pcmpgtw_r2r( mm1, mm4 );
00300 psubw_r2r( mm2, mm5 );
00301 pcmpgtw_r2r( mm1, mm5 );
00302 por_r2r( mm5, mm4 );
00303 movq_r2r( mm4, mm5 );
00304 psrlq_i2r( 32, mm5);
00305 por_r2r( mm5, mm4 );
00306
00307 movd_m2r( saturated, mm5 );
00308 por_r2r( mm4, mm5 );
00309 movd_r2m( mm5, saturated );
00310
00311
00312 movq_r2r( mm2, mm3 );
00313 movq_r2m( mm2, *(mmx_t*)pdst );
00314 psrlq_i2r( 32, mm3 );
00315 por_r2r( mm3, mm2 );
00316 movd_r2m( mm2, tmp );
00317 flags |= tmp;
00318
00319 piqf += 4;
00320 pdst += 4;
00321 psrc += 4;
00322
00323 if( (i & 63) == (63/4)*4 )
00324 {
00325
00326 if( saturated )
00327 {
00328 int new_mquant = next_larger_quant_hv( picture, mquant );
00329 if( new_mquant != mquant )
00330 {
00331 mquant = new_mquant;
00332 goto restart;
00333 }
00334 else
00335 {
00336 return quant_non_intra_hv(picture, src, dst, mquant,
00337 nonsat_mquant);
00338 }
00339 }
00340
00341 nzflag = (nzflag<<1) | !!flags;
00342 flags = 0;
00343 piqf = i_quant_matf;
00344 }
00345
00346 }
00347 emms();
00348
00349
00350 return nzflag;
00351 }
00352
00353 #endif
00354
00355
00356
00357
00358
00359
00360
00361
00362
00363
00364
00365 int quant_non_intra_hv_mmx(
00366 pict_data_s *picture,
00367 int16_t *src, int16_t *dst,
00368 int mquant,
00369 int *nonsat_mquant)
00370 {
00371
00372 int nzflag;
00373 int clipvalue = dctsatlim;
00374 int flags = 0;
00375 int saturated = 0;
00376 uint16_t *quant_mat = inter_q;
00377 int comp;
00378 uint16_t *i_quant_mat = i_inter_q;
00379 int imquant;
00380 int16_t *psrc, *pdst;
00381
00382
00383 if( mquant == 2 )
00384 {
00385 return quant_non_intra_hv(picture, src, dst, mquant, nonsat_mquant);
00386 }
00387
00388
00389
00390
00391
00392
00393
00394
00395
00396 nzflag = 0;
00397 pdst = dst;
00398 psrc = src;
00399 comp = 0;
00400 do
00401 {
00402 imquant = (IQUANT_SCALE/mquant);
00403 flags = quantize_ni_mmx( pdst, psrc, quant_mat, i_quant_mat,
00404 imquant, mquant, clipvalue );
00405 nzflag = (nzflag << 1) |( !!(flags & 0xffff0000));
00406
00407
00408
00409
00410
00411
00412 if( (flags & 0xff00) != 0 )
00413 {
00414 int new_mquant = next_larger_quant_hv( picture, mquant );
00415 if( new_mquant != mquant )
00416 {
00417 mquant = new_mquant;
00418 }
00419 else
00420 {
00421 saturated = 1;
00422 break;
00423 }
00424
00425 comp = 0;
00426 nzflag = 0;
00427 pdst = dst;
00428 psrc = src;
00429 }
00430 else
00431 {
00432 ++comp;
00433 pdst += 64;
00434 psrc +=64;
00435 }
00436
00437
00438
00439 }
00440 while( comp < block_count && (flags & 0xff) == 0 );
00441
00442
00443
00444
00445 if( (flags & 0xff) != 0 || saturated)
00446 {
00447 return quant_non_intra_hv(picture, src, dst, mquant, nonsat_mquant);
00448 }
00449
00450 *nonsat_mquant = mquant;
00451 return nzflag;
00452 }
00453
00454
00455 void iquant1_intra(int16_t *src, int16_t *dst, int dc_prec, int mquant)
00456 {
00457 int i, val;
00458 uint16_t *quant_mat = intra_q;
00459
00460 dst[0] = src[0] << (3-dc_prec);
00461 for (i=1; i<64; i++)
00462 {
00463 val = (int)(src[i]*quant_mat[i]*mquant)/16;
00464
00465
00466 if ((val&1)==0 && val!=0)
00467 val+= (val>0) ? -1 : 1;
00468
00469
00470 dst[i] = (val>2047) ? 2047 : ((val<-2048) ? -2048 : val);
00471 }
00472 }