00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
/*
 * Assembly-text helpers for the asm() statements of dct_quantize below.
 * Each macro stringifies its register arguments and expands to one or two
 * instructions in AT&T operand order (source first, destination last).
 * They are #undef'd first so that the definitions given here are the ones
 * in effect for the code that follows.
 *
 * SPREADW(a)  - copy the lowest 16-bit word of register a into all four
 *               words of a.
 *               HAVE_MMX2: a single pshufw with selector 0.
 *               otherwise: two punpcklwd a,a steps
 *               (w0,w1,w2,w3 -> w0,w0,w1,w1 -> w0,w0,w0,w0).
 *
 * PMAXW(a,b)  - b = per-word maximum of a and b.
 *               HAVE_MMX2: pmaxsw, a signed maximum.
 *               otherwise: psubusw a,b followed by paddw a,b, i.e.
 *               b = (b -sat a) + a, which is an unsigned maximum.
 *               NOTE(review): the two variants only agree when both inputs
 *               are non-negative as signed words; the uses below appear to
 *               feed small scan positions only - confirm.
 */
00020 #undef SPREADW
00021 #undef PMAXW
00022 #ifdef HAVE_MMX2
00023 #define SPREADW(a) "pshufw $0, " #a ", " #a " \n\t"
00024 #define PMAXW(a,b) "pmaxsw " #a ", " #b " \n\t"
00025
00026 #else
00027 #define SPREADW(a) \
00028 "punpcklwd " #a ", " #a " \n\t"\
00029 "punpcklwd " #a ", " #a " \n\t"
00030 #define PMAXW(a,b) \
00031 "psubusw " #a ", " #b " \n\t"\
00032 "paddw " #a ", " #b " \n\t"
00033 #endif
00034
/**
 * Transform and quantize one block of 64 coefficients in place, using MMX
 * inline assembly.
 *
 * Steps, in order:
 *  1. RENAMEl(ff_fdct) is applied to block (presumably the forward DCT -
 *     its definition is not visible here), then s->denoise_dct is called
 *     when s->dct_error_sum is set.
 *  2. For intra blocks the first coefficient is quantized separately into
 *     `level` and block[0] is cleared before the vector loop runs.
 *  3. An asm loop walks all 64 coefficients, four 16-bit words at a time.
 *     It writes the signed quantized values to temp_block, clears block,
 *     ORs the quantized magnitudes together in mm4 and keeps the largest
 *     inv_zigzag_direct16[] entry seen at a non-zero output.
 *  4. A second asm statement compares the OR-ed magnitudes against
 *     s->max_qcoeff and stores the result in *overflow.
 *  5. Coefficients are copied back from temp_block to block, reordered
 *     according to s->dsp.idct_permutation_type, stopping as soon as the
 *     position found in step 3 has been covered.
 *
 * @param s        encoder context; read for the quantizer tables, DC scales
 *                 and the mode flags used below
 * @param block    64 coefficients (the asm accesses them as 16-bit words);
 *                 overwritten with the quantized, permuted result
 * @param n        block number; values below 4 select s->y_dc_scale, all
 *                 others s->c_dc_scale (only used for intra blocks)
 * @param qscale   first index into s->q_intra_matrix16 / s->q_inter_matrix16
 * @param overflow receives a non-zero value when, in at least one of the
 *                 four word lanes, the OR of the quantized magnitudes is
 *                 larger than s->max_qcoeff, and zero otherwise
 * @return last_non_zero_p1 - 1, where last_non_zero_p1 is the largest
 *         inv_zigzag_direct16[] value among the non-zero quantized
 *         coefficients (or its initial value, 1 for intra and 0 for inter,
 *         when none is larger). Presumably the scan index of the last
 *         non-zero coefficient - confirm against inv_zigzag_direct16.
 *
 * NOTE(review): none of the asm statements declares clobbers although they
 * modify MMX registers and store to block/temp_block, and the overflow
 * statements rely on mm4 surviving from the preceding asm statement.
 */
00035 static int RENAME(dct_quantize)(MpegEncContext *s,
00036 DCTELEM *block, int n,
00037 int qscale, int *overflow)
00038 {
00039 long last_non_zero_p1; // bound to the "a" register in the asm below
00040 int level=0, q; // level: quantized first coefficient of an intra block
00041 const uint16_t *qmat, *bias;
00042 __align8 int16_t temp_block[64]; // quantized output before permutation
00043
      /* Check that temp_block really is 8-byte aligned (it is accessed with movq).
         NOTE(review): the pointer is cast to int; only the low 3 bits are tested. */
00044 assert((7&(int)(&temp_block[0])) == 0);
00045
00046
00047 RENAMEl(ff_fdct) (block);
00048
00049 if(s->dct_error_sum)
00050 s->denoise_dct(s, block);
00051
00052 if (s->mb_intra) {
00053 int dummy; // receives eax so the compiler knows mul overwrites it
00054 if (n < 4)
00055 q = s->y_dc_scale;
00056 else
00057 q = s->c_dc_scale;
00058
00059 if (!s->h263_aic) {
      /* level = high 32 bits of ((block[0]>>2) + q) * inverse[q<<1].
         The disabled #else variant computes ((block[0]>>2) + q) / (q<<1) with
         divw, so inverse[] presumably holds scaled reciprocals - confirm. */
00060 #if 1
00061 asm volatile (
00062 "mul %%ecx \n\t" // edx:eax = eax * ecx
00063 : "=d" (level), "=a"(dummy)
00064 : "a" ((block[0]>>2) + q), "c" (inverse[q<<1])
00065 );
00066 #else
00067 asm volatile (
00068 "xorl %%edx, %%edx \n\t"
00069 "divw %%cx \n\t"
00070 "movzwl %%ax, %%eax \n\t"
00071 : "=a" (level)
00072 : "a" ((block[0]>>2) + q), "c" (q<<1)
00073 : "%edx"
00074 );
00075 #endif
00076 } else
00077
      /* h263_aic set: the first coefficient is only divided by 8 with rounding,
         q is not applied. */
00078 level = (block[0] + 4)>>3;
00079
      /* Clear the first coefficient so the vector loop below neither quantizes it
         nor lets it contribute to the overflow accumulator; `level` is stored
         back into block[0] after the loop. */
00080 block[0]=0;
00081
00082 last_non_zero_p1 = 1;
00083 bias = s->q_intra_matrix16[qscale][1];
00084 qmat = s->q_intra_matrix16[qscale][0];
00085 } else {
00086 last_non_zero_p1 = 0;
00087 bias = s->q_inter_matrix16[qscale][1];
00088 qmat = s->q_inter_matrix16[qscale][0];
00089 }
00090
00091 if((s->out_format == FMT_H263 || s->out_format == FMT_H261) && s->mpeg_quant==0){
00092
      /* Variant 1: one quantizer for all coefficients. Only the first four
         words of qmat and bias are loaded, once, before the loop.
         Operands: %1 = block+64, %2 = qmat, %3 = bias,
         %4 = inv_zigzag_direct16+64, %5 = temp_block+64.
         REG_a runs from -128 to 0 in steps of 8 bytes (16 iterations of four
         16-bit words). */
00093 asm volatile(
00094 "movd %%"REG_a", %%mm3 \n\t" // mm3 = last_non_zero_p1
00095 SPREADW(%%mm3)
00096 "pxor %%mm7, %%mm7 \n\t" // mm7 = 0
00097 "pxor %%mm4, %%mm4 \n\t" // mm4 = 0, OR accumulator of magnitudes
00098 "movq (%2), %%mm5 \n\t" // mm5 = qmat[0..3]
00099 "pxor %%mm6, %%mm6 \n\t"
00100 "psubw (%3), %%mm6 \n\t" // mm6 = -bias[0..3]
00101 "mov $-128, %%"REG_a" \n\t"
00102 ".balign 16 \n\t"
00103 "1: \n\t"
00104 "pxor %%mm1, %%mm1 \n\t"
00105 "movq (%1, %%"REG_a"), %%mm0 \n\t" // mm0 = four coefficients x
00106 "pcmpgtw %%mm0, %%mm1 \n\t" // mm1 = (x < 0) ? 0xFFFF : 0
00107 "pxor %%mm1, %%mm0 \n\t"
00108 "psubw %%mm1, %%mm0 \n\t" // mm0 = |x|
00109 "psubusw %%mm6, %%mm0 \n\t" // mm0 = |x| - (-bias), unsigned saturating
00110 "pmulhw %%mm5, %%mm0 \n\t" // mm0 = (mm0 * qmat) >> 16
00111 "por %%mm0, %%mm4 \n\t" // accumulate magnitudes for the overflow test
00112 "pxor %%mm1, %%mm0 \n\t"
00113 "psubw %%mm1, %%mm0 \n\t" // restore the sign of x
00114 "movq %%mm0, (%5, %%"REG_a") \n\t" // store to temp_block
00115 "pcmpeqw %%mm7, %%mm0 \n\t" // mm0 = (out == 0) ? 0xFFFF : 0
00116 "movq (%4, %%"REG_a"), %%mm1 \n\t" // mm1 = inv_zigzag_direct16 entries
00117 "movq %%mm7, (%1, %%"REG_a") \n\t" // clear these four words of block
00118 "pandn %%mm1, %%mm0 \n\t" // keep the entry only where out != 0
00119 PMAXW(%%mm0, %%mm3)
00120 "add $8, %%"REG_a" \n\t"
00121 " js 1b \n\t" // loop while the offset is still negative
      /* Reduce the four word lanes of mm3 to one maximum in the lowest word. */
00122 "movq %%mm3, %%mm0 \n\t"
00123 "psrlq $32, %%mm3 \n\t"
00124 PMAXW(%%mm0, %%mm3)
00125 "movq %%mm3, %%mm0 \n\t"
00126 "psrlq $16, %%mm3 \n\t"
00127 PMAXW(%%mm0, %%mm3)
00128 "movd %%mm3, %%"REG_a" \n\t"
00129 "movzb %%al, %%"REG_a" \n\t" // keep only the low byte
00130 : "+a" (last_non_zero_p1)
00131 : "r" (block+64), "r" (qmat), "r" (bias),
00132 "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
00133 );
00134
      /* Overflow test: mm4 (left over from the statement above) minus
         max_qcoeff with unsigned saturation is non-zero only in lanes where
         the OR-ed magnitudes exceed max_qcoeff; the lanes are packed to bytes
         and written to *overflow. */
00135 asm volatile(
00136 "movd %1, %%mm1 \n\t" // mm1 = max_qcoeff
00137 SPREADW(%%mm1)
00138 "psubusw %%mm1, %%mm4 \n\t"
00139 "packuswb %%mm4, %%mm4 \n\t"
00140 "movd %%mm4, %0 \n\t"
00141 : "=g" (*overflow)
00142 : "g" (s->max_qcoeff)
00143 );
00144 }else{
      /* Variant 2: per-coefficient tables. qmat and bias are read inside the
         loop at the same offset as the coefficients, and the bias is added
         with unsigned saturation instead of being pre-negated.
         Operands: %1 = block+64, %2 = qmat+64, %3 = bias+64,
         %4 = inv_zigzag_direct16+64, %5 = temp_block+64. */
00145 asm volatile(
00146 "movd %%"REG_a", %%mm3 \n\t" // mm3 = last_non_zero_p1
00147 SPREADW(%%mm3)
00148 "pxor %%mm7, %%mm7 \n\t" // mm7 = 0
00149 "pxor %%mm4, %%mm4 \n\t" // mm4 = 0, OR accumulator of magnitudes
00150 "mov $-128, %%"REG_a" \n\t"
00151 ".balign 16 \n\t"
00152 "1: \n\t"
00153 "pxor %%mm1, %%mm1 \n\t"
00154 "movq (%1, %%"REG_a"), %%mm0 \n\t" // mm0 = four coefficients x
00155 "pcmpgtw %%mm0, %%mm1 \n\t" // mm1 = (x < 0) ? 0xFFFF : 0
00156 "pxor %%mm1, %%mm0 \n\t"
00157 "psubw %%mm1, %%mm0 \n\t" // mm0 = |x|
00158 "movq (%3, %%"REG_a"), %%mm6 \n\t" // mm6 = bias for these coefficients
00159 "paddusw %%mm6, %%mm0 \n\t" // mm0 = |x| + bias, unsigned saturating
00160 "movq (%2, %%"REG_a"), %%mm5 \n\t" // mm5 = qmat for these coefficients
00161 "pmulhw %%mm5, %%mm0 \n\t" // mm0 = (mm0 * qmat) >> 16
00162 "por %%mm0, %%mm4 \n\t" // accumulate magnitudes for the overflow test
00163 "pxor %%mm1, %%mm0 \n\t"
00164 "psubw %%mm1, %%mm0 \n\t" // restore the sign of x
00165 "movq %%mm0, (%5, %%"REG_a") \n\t" // store to temp_block
00166 "pcmpeqw %%mm7, %%mm0 \n\t" // mm0 = (out == 0) ? 0xFFFF : 0
00167 "movq (%4, %%"REG_a"), %%mm1 \n\t" // mm1 = inv_zigzag_direct16 entries
00168 "movq %%mm7, (%1, %%"REG_a") \n\t" // clear these four words of block
00169 "pandn %%mm1, %%mm0 \n\t" // keep the entry only where out != 0
00170 PMAXW(%%mm0, %%mm3)
00171 "add $8, %%"REG_a" \n\t"
00172 " js 1b \n\t" // loop while the offset is still negative
      /* Reduce the four word lanes of mm3 to one maximum in the lowest word. */
00173 "movq %%mm3, %%mm0 \n\t"
00174 "psrlq $32, %%mm3 \n\t"
00175 PMAXW(%%mm0, %%mm3)
00176 "movq %%mm3, %%mm0 \n\t"
00177 "psrlq $16, %%mm3 \n\t"
00178 PMAXW(%%mm0, %%mm3)
00179 "movd %%mm3, %%"REG_a" \n\t"
00180 "movzb %%al, %%"REG_a" \n\t" // keep only the low byte
00181 : "+a" (last_non_zero_p1)
00182 : "r" (block+64), "r" (qmat+64), "r" (bias+64),
00183 "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
00184 );
00185
      /* Same overflow test as in the first variant; again relies on mm4 from
         the statement above. */
00186 asm volatile(
00187 "movd %1, %%mm1 \n\t" // mm1 = max_qcoeff
00188 SPREADW(%%mm1)
00189 "psubusw %%mm1, %%mm4 \n\t"
00190 "packuswb %%mm4, %%mm4 \n\t"
00191 "movd %%mm4, %0 \n\t"
00192 : "=g" (*overflow)
00193 : "g" (s->max_qcoeff)
00194 );
00195 }
00196
      /* block was zeroed by the loop; put the first coefficient back. */
00197 if(s->mb_intra) block[0]= level;
00198 else block[0]= temp_block[0];
00199
      /* Copy the remaining coefficients back into block. In every branch the
         temp_block indices are visited in the same order (0x01, 0x08, 0x10,
         0x09, 0x02, ...); only the destination index differs per permutation
         type. After each group the copy stops if last_non_zero_p1 shows that
         nothing non-zero follows, leaving the rest of block at zero. */
00200 if(s->dsp.idct_permutation_type == FF_SIMPLE_IDCT_PERM){
00201 if(last_non_zero_p1 <= 1) goto end;
00202 block[0x08] = temp_block[0x01]; block[0x10] = temp_block[0x08];
00203 block[0x20] = temp_block[0x10];
00204 if(last_non_zero_p1 <= 4) goto end;
00205 block[0x18] = temp_block[0x09]; block[0x04] = temp_block[0x02];
00206 block[0x09] = temp_block[0x03];
00207 if(last_non_zero_p1 <= 7) goto end;
00208 block[0x14] = temp_block[0x0A]; block[0x28] = temp_block[0x11];
00209 block[0x12] = temp_block[0x18]; block[0x02] = temp_block[0x20];
00210 if(last_non_zero_p1 <= 11) goto end;
00211 block[0x1A] = temp_block[0x19]; block[0x24] = temp_block[0x12];
00212 block[0x19] = temp_block[0x0B]; block[0x01] = temp_block[0x04];
00213 block[0x0C] = temp_block[0x05];
00214 if(last_non_zero_p1 <= 16) goto end;
00215 block[0x11] = temp_block[0x0C]; block[0x29] = temp_block[0x13];
00216 block[0x16] = temp_block[0x1A]; block[0x0A] = temp_block[0x21];
00217 block[0x30] = temp_block[0x28]; block[0x22] = temp_block[0x30];
00218 block[0x38] = temp_block[0x29]; block[0x06] = temp_block[0x22];
00219 if(last_non_zero_p1 <= 24) goto end;
00220 block[0x1B] = temp_block[0x1B]; block[0x21] = temp_block[0x14];
00221 block[0x1C] = temp_block[0x0D]; block[0x05] = temp_block[0x06];
00222 block[0x0D] = temp_block[0x07]; block[0x15] = temp_block[0x0E];
00223 block[0x2C] = temp_block[0x15]; block[0x13] = temp_block[0x1C];
00224 if(last_non_zero_p1 <= 32) goto end;
00225 block[0x0B] = temp_block[0x23]; block[0x34] = temp_block[0x2A];
00226 block[0x2A] = temp_block[0x31]; block[0x32] = temp_block[0x38];
00227 block[0x3A] = temp_block[0x39]; block[0x26] = temp_block[0x32];
00228 block[0x39] = temp_block[0x2B]; block[0x03] = temp_block[0x24];
00229 if(last_non_zero_p1 <= 40) goto end;
00230 block[0x1E] = temp_block[0x1D]; block[0x25] = temp_block[0x16];
00231 block[0x1D] = temp_block[0x0F]; block[0x2D] = temp_block[0x17];
00232 block[0x17] = temp_block[0x1E]; block[0x0E] = temp_block[0x25];
00233 block[0x31] = temp_block[0x2C]; block[0x2B] = temp_block[0x33];
00234 if(last_non_zero_p1 <= 48) goto end;
00235 block[0x36] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B];
00236 block[0x23] = temp_block[0x34]; block[0x3C] = temp_block[0x2D];
00237 block[0x07] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
00238 block[0x0F] = temp_block[0x27]; block[0x35] = temp_block[0x2E];
00239 if(last_non_zero_p1 <= 56) goto end;
00240 block[0x2E] = temp_block[0x35]; block[0x33] = temp_block[0x3C];
00241 block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36];
00242 block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37];
00243 block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
00244 }else if(s->dsp.idct_permutation_type == FF_LIBMPEG2_IDCT_PERM){
00245 if(last_non_zero_p1 <= 1) goto end;
00246 block[0x04] = temp_block[0x01];
00247 block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
00248 if(last_non_zero_p1 <= 4) goto end;
00249 block[0x0C] = temp_block[0x09]; block[0x01] = temp_block[0x02];
00250 block[0x05] = temp_block[0x03];
00251 if(last_non_zero_p1 <= 7) goto end;
00252 block[0x09] = temp_block[0x0A]; block[0x14] = temp_block[0x11];
00253 block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
00254 if(last_non_zero_p1 <= 11) goto end;
00255 block[0x1C] = temp_block[0x19];
00256 block[0x11] = temp_block[0x12]; block[0x0D] = temp_block[0x0B];
00257 block[0x02] = temp_block[0x04]; block[0x06] = temp_block[0x05];
00258 if(last_non_zero_p1 <= 16) goto end;
00259 block[0x0A] = temp_block[0x0C]; block[0x15] = temp_block[0x13];
00260 block[0x19] = temp_block[0x1A]; block[0x24] = temp_block[0x21];
00261 block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
00262 block[0x2C] = temp_block[0x29]; block[0x21] = temp_block[0x22];
00263 if(last_non_zero_p1 <= 24) goto end;
00264 block[0x1D] = temp_block[0x1B]; block[0x12] = temp_block[0x14];
00265 block[0x0E] = temp_block[0x0D]; block[0x03] = temp_block[0x06];
00266 block[0x07] = temp_block[0x07]; block[0x0B] = temp_block[0x0E];
00267 block[0x16] = temp_block[0x15]; block[0x1A] = temp_block[0x1C];
00268 if(last_non_zero_p1 <= 32) goto end;
00269 block[0x25] = temp_block[0x23]; block[0x29] = temp_block[0x2A];
00270 block[0x34] = temp_block[0x31]; block[0x38] = temp_block[0x38];
00271 block[0x3C] = temp_block[0x39]; block[0x31] = temp_block[0x32];
00272 block[0x2D] = temp_block[0x2B]; block[0x22] = temp_block[0x24];
00273 if(last_non_zero_p1 <= 40) goto end;
00274 block[0x1E] = temp_block[0x1D]; block[0x13] = temp_block[0x16];
00275 block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
00276 block[0x1B] = temp_block[0x1E]; block[0x26] = temp_block[0x25];
00277 block[0x2A] = temp_block[0x2C]; block[0x35] = temp_block[0x33];
00278 if(last_non_zero_p1 <= 48) goto end;
00279 block[0x39] = temp_block[0x3A]; block[0x3D] = temp_block[0x3B];
00280 block[0x32] = temp_block[0x34]; block[0x2E] = temp_block[0x2D];
00281 block[0x23] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
00282 block[0x27] = temp_block[0x27]; block[0x2B] = temp_block[0x2E];
00283 if(last_non_zero_p1 <= 56) goto end;
00284 block[0x36] = temp_block[0x35]; block[0x3A] = temp_block[0x3C];
00285 block[0x3E] = temp_block[0x3D]; block[0x33] = temp_block[0x36];
00286 block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
00287 block[0x3B] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
00288 }else{
      /* Any other permutation type: source and destination index are equal. */
00289 if(last_non_zero_p1 <= 1) goto end;
00290 block[0x01] = temp_block[0x01];
00291 block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
00292 if(last_non_zero_p1 <= 4) goto end;
00293 block[0x09] = temp_block[0x09]; block[0x02] = temp_block[0x02];
00294 block[0x03] = temp_block[0x03];
00295 if(last_non_zero_p1 <= 7) goto end;
00296 block[0x0A] = temp_block[0x0A]; block[0x11] = temp_block[0x11];
00297 block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
00298 if(last_non_zero_p1 <= 11) goto end;
00299 block[0x19] = temp_block[0x19];
00300 block[0x12] = temp_block[0x12]; block[0x0B] = temp_block[0x0B];
00301 block[0x04] = temp_block[0x04]; block[0x05] = temp_block[0x05];
00302 if(last_non_zero_p1 <= 16) goto end;
00303 block[0x0C] = temp_block[0x0C]; block[0x13] = temp_block[0x13];
00304 block[0x1A] = temp_block[0x1A]; block[0x21] = temp_block[0x21];
00305 block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
00306 block[0x29] = temp_block[0x29]; block[0x22] = temp_block[0x22];
00307 if(last_non_zero_p1 <= 24) goto end;
00308 block[0x1B] = temp_block[0x1B]; block[0x14] = temp_block[0x14];
00309 block[0x0D] = temp_block[0x0D]; block[0x06] = temp_block[0x06];
00310 block[0x07] = temp_block[0x07]; block[0x0E] = temp_block[0x0E];
00311 block[0x15] = temp_block[0x15]; block[0x1C] = temp_block[0x1C];
00312 if(last_non_zero_p1 <= 32) goto end;
00313 block[0x23] = temp_block[0x23]; block[0x2A] = temp_block[0x2A];
00314 block[0x31] = temp_block[0x31]; block[0x38] = temp_block[0x38];
00315 block[0x39] = temp_block[0x39]; block[0x32] = temp_block[0x32];
00316 block[0x2B] = temp_block[0x2B]; block[0x24] = temp_block[0x24];
00317 if(last_non_zero_p1 <= 40) goto end;
00318 block[0x1D] = temp_block[0x1D]; block[0x16] = temp_block[0x16];
00319 block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
00320 block[0x1E] = temp_block[0x1E]; block[0x25] = temp_block[0x25];
00321 block[0x2C] = temp_block[0x2C]; block[0x33] = temp_block[0x33];
00322 if(last_non_zero_p1 <= 48) goto end;
00323 block[0x3A] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B];
00324 block[0x34] = temp_block[0x34]; block[0x2D] = temp_block[0x2D];
00325 block[0x26] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
00326 block[0x27] = temp_block[0x27]; block[0x2E] = temp_block[0x2E];
00327 if(last_non_zero_p1 <= 56) goto end;
00328 block[0x35] = temp_block[0x35]; block[0x3C] = temp_block[0x3C];
00329 block[0x3D] = temp_block[0x3D]; block[0x36] = temp_block[0x36];
00330 block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
00331 block[0x3E] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
00332 }
00333 end:
00334
00335
00336
00337
00338
00339
00340
00341
      /* Convert the "plus one" value kept during the scan into the return value. */
00342 return last_non_zero_p1 - 1;
00343 }