00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030 #include "config.h"
00031 #include "global.h"
00032 #include <stdio.h>
00033 #include <math.h>
00034 #include "cpu_accel.h"
00035
#ifdef X86_CPU
/* MMX kernels; declared here, defined outside this file. */
extern void fdct_mmx(int16_t *blk);
extern void idct_mmx(int16_t *blk, unsigned char *temp);

extern void add_pred_mmx(uint8_t *pred, uint8_t *cur, int lx, int16_t *blk);
extern void sub_pred_mmx(uint8_t *pred, uint8_t *cur, int lx, int16_t *blk);
#endif

/* Plain C DCT kernels; declared here, defined outside this file. */
extern void fdct(int16_t *blk);
extern void idct(int16_t *blk, unsigned char *temp);

/* File-local C versions of the prediction add / subtract kernels. */
static void add_pred(uint8_t *pred, uint8_t *cur, int lx, int16_t *blk);
static void sub_pred(uint8_t *pred, uint8_t *cur, int lx, int16_t *blk);

/*
 * Dispatch pointers.  init_transform_hv() points each of them at either the
 * MMX or the C implementation; the engine loops call through them.
 */
static void (*pfdct)(int16_t *blk);
static void (*pidct)(int16_t *blk, unsigned char *temp);
static void (*padd_pred)(uint8_t *pred, uint8_t *cur, int lx, int16_t *blk);
static void (*psub_pred)(uint8_t *pred, uint8_t *cur, int lx, int16_t *blk);
00068
00069
00070
00071
00072
00073
00074
00075 void init_transform_hv()
00076 {
00077 int flags;
00078 flags = cpu_accel();
00079
00080 #ifdef X86_CPU
00081 if( (flags & ACCEL_X86_MMX) )
00082 {
00083 if(verbose) fprintf( stderr, "SETTING MMX for TRANSFORM!\n");
00084 pfdct = fdct_mmx;
00085 pidct = idct_mmx;
00086 padd_pred = add_pred_mmx;
00087 psub_pred = sub_pred_mmx;
00088 }
00089 else
00090 #endif
00091 {
00092 pfdct = fdct;
00093 pidct = idct;
00094 padd_pred = add_pred;
00095 psub_pred = sub_pred;
00096
00097 }
00098 }
00099
00100
00101 static void add_pred(unsigned char *pred,
00102 unsigned char *cur,
00103 int lx,
00104 short *blk)
00105 {
00106 register int j;
00107
00108 for (j=0; j<8; j++)
00109 {
00110
00111
00112
00113
00114 cur[0] = clp[blk[0] + pred[0]];
00115 cur[1] = clp[blk[1] + pred[1]];
00116 cur[2] = clp[blk[2] + pred[2]];
00117 cur[3] = clp[blk[3] + pred[3]];
00118 cur[4] = clp[blk[4] + pred[4]];
00119 cur[5] = clp[blk[5] + pred[5]];
00120 cur[6] = clp[blk[6] + pred[6]];
00121 cur[7] = clp[blk[7] + pred[7]];
00122
00123 blk += 8;
00124 cur += lx;
00125 pred += lx;
00126 }
00127 }
00128
00129
/*
 * Compute an 8x8 block of prediction errors: blk = cur - pred.
 *
 *   pred  first sample of the prediction; rows are lx bytes apart
 *   cur   first sample of the current picture; rows are lx bytes apart
 *   lx    distance in bytes between successive rows of pred and cur
 *   blk   receives 64 differences stored row after row (8 per row)
 *
 * The parameter types are spelled exactly as in the file-level prototype and
 * the psub_pred pointer (uint8_t / int16_t) so that the declaration and the
 * definition agree on every platform.
 */
static void sub_pred(uint8_t *pred, uint8_t *cur, int lx, int16_t *blk)
{
    int row, col;

    for (row = 0; row < 8; row++)
    {
        /* Operands promote to int, so the difference lies in [-255, 255]. */
        for (col = 0; col < 8; col++)
            blk[col] = (int16_t)(cur[col] - pred[col]);

        blk += 8;
        cur += lx;
        pred += lx;
    }
}
00157
00158 void transform_engine_loop(transform_engine_t *engine)
00159 {
00160 while(!engine->done)
00161 {
00162 pthread_mutex_lock(&(engine->input_lock));
00163
00164 if(!engine->done)
00165 {
00166 pict_data_s *picture = engine->picture;
00167 uint8_t **pred = engine->pred;
00168 uint8_t **cur = engine->cur;
00169 mbinfo_s *mbi = picture->mbinfo;
00170 int16_t (*blocks)[64] = picture->blocks;
00171 int i, j, i1, j1, k, n, cc, offs, lx;
00172
00173 k = (engine->start_row / 16) * (width / 16);
00174
00175 for(j = engine->start_row; j < engine->end_row; j += 16)
00176 for(i = 0; i < width; i += 16)
00177 {
00178 mbi[k].dctblocks = &blocks[k * block_count];
00179
00180 for(n = 0; n < block_count; n++)
00181 {
00182
00183 cc = (n < 4) ? 0 : (n & 1) + 1;
00184 if(cc == 0)
00185 {
00186
00187
00188
00189 if ((picture->pict_struct == FRAME_PICTURE) && mbi[k].dct_type)
00190 {
00191
00192 offs = i + ((n & 1) << 3) + width * (j + ((n & 2) >> 1));
00193 lx = width << 1;
00194 }
00195 else
00196 {
00197
00198 offs = i + ((n & 1) << 3) + width2 * (j + ((n & 2) << 2));
00199 lx = width2;
00200 }
00201
00202 if (picture->pict_struct == BOTTOM_FIELD)
00203 offs += width;
00204 }
00205 else
00206 {
00207
00208
00209 i1 = (chroma_format == CHROMA444) ? i : i >> 1;
00210 j1 = (chroma_format != CHROMA420) ? j : j >> 1;
00211
00212 if ((picture->pict_struct==FRAME_PICTURE) && mbi[k].dct_type
00213 && (chroma_format!=CHROMA420))
00214 {
00215
00216 offs = i1 + (n&8) + chrom_width*(j1+((n&2)>>1));
00217 lx = chrom_width<<1;
00218 }
00219 else
00220 {
00221
00222 offs = i1 + (n&8) + chrom_width2*(j1+((n&2)<<2));
00223 lx = chrom_width2;
00224 }
00225
00226 if(picture->pict_struct==BOTTOM_FIELD)
00227 offs += chrom_width;
00228 }
00229
00230 (*psub_pred)(pred[cc]+offs,cur[cc]+offs,lx,
00231 blocks[k*block_count+n]);
00232 (*pfdct)(blocks[k*block_count+n]);
00233 }
00234
00235 k++;
00236 }
00237 }
00238 pthread_mutex_unlock(&(engine->output_lock));
00239 }
00240 }
00241
00242
00243 void transform(pict_data_s *picture,
00244 uint8_t *pred[], uint8_t *cur[])
00245 {
00246 int i;
00247
00248 for(i = 0; i < processors; i++)
00249 {
00250 transform_engines[i].picture = picture;
00251 transform_engines[i].pred = pred;
00252 transform_engines[i].cur = cur;
00253 pthread_mutex_unlock(&(transform_engines[i].input_lock));
00254 }
00255
00256
00257 for(i = 0; i < processors; i++)
00258 {
00259 pthread_mutex_lock(&(transform_engines[i].output_lock));
00260 }
00261 }
00262
00263
00264
00265 void start_transform_engines()
00266 {
00267 int i;
00268 int rows_per_processor = (int)((float)height2 / 16 / processors + 0.5);
00269 int current_row = 0;
00270 pthread_attr_t attr;
00271 pthread_mutexattr_t mutex_attr;
00272
00273 pthread_mutexattr_init(&mutex_attr);
00274 pthread_attr_init(&attr);
00275 transform_engines = calloc(1, sizeof(transform_engine_t) * processors);
00276 for(i = 0; i < processors; i++)
00277 {
00278 transform_engines[i].start_row = current_row * 16;
00279 current_row += rows_per_processor;
00280 if(current_row > height2 / 16) current_row = height2 / 16;
00281 transform_engines[i].end_row = current_row * 16;
00282 pthread_mutex_init(&(transform_engines[i].input_lock), &mutex_attr);
00283 pthread_mutex_lock(&(transform_engines[i].input_lock));
00284 pthread_mutex_init(&(transform_engines[i].output_lock), &mutex_attr);
00285 pthread_mutex_lock(&(transform_engines[i].output_lock));
00286 transform_engines[i].done = 0;
00287 pthread_create(&(transform_engines[i].tid),
00288 &attr,
00289 (void*)transform_engine_loop,
00290 &transform_engines[i]);
00291 }
00292 }
00293
00294 void stop_transform_engines()
00295 {
00296 int i;
00297 for(i = 0; i < processors; i++)
00298 {
00299 transform_engines[i].done = 1;
00300 pthread_mutex_unlock(&(transform_engines[i].input_lock));
00301 pthread_join(transform_engines[i].tid, 0);
00302 pthread_mutex_destroy(&(transform_engines[i].input_lock));
00303 pthread_mutex_destroy(&(transform_engines[i].output_lock));
00304 }
00305 free(transform_engines);
00306 }
00307
00308
00309
00310
00311
00312
00313
00314
00315
00316
00317 void itransform_engine_loop(transform_engine_t *engine)
00318 {
00319 while(!engine->done)
00320 {
00321 pthread_mutex_lock(&(engine->input_lock));
00322
00323 if(!engine->done)
00324 {
00325 pict_data_s *picture = engine->picture;
00326 uint8_t **pred = engine->pred;
00327 uint8_t **cur = engine->cur;
00328 int i, j, i1, j1, k, n, cc, offs, lx;
00329 mbinfo_s *mbi = picture->mbinfo;
00330
00331
00332 int16_t (*blocks)[64] = picture->qblocks;
00333
00334 k = (engine->start_row / 16) * (width / 16);
00335
00336 for(j = engine->start_row; j < engine->end_row; j += 16)
00337 for(i = 0; i < width; i += 16)
00338 {
00339 for(n = 0; n < block_count; n++)
00340 {
00341 cc = (n < 4) ? 0 : (n & 1) + 1;
00342
00343 if(cc == 0)
00344 {
00345
00346 if((picture->pict_struct == FRAME_PICTURE) && mbi[k].dct_type)
00347 {
00348
00349 offs = i + ((n & 1) << 3) + width * (j + ((n & 2) >> 1));
00350 lx = width<<1;
00351 }
00352 else
00353 {
00354
00355 offs = i + ((n & 1) << 3) + width2 * (j + ((n & 2) << 2));
00356 lx = width2;
00357 }
00358
00359 if(picture->pict_struct == BOTTOM_FIELD)
00360 offs += width;
00361 }
00362 else
00363 {
00364
00365
00366
00367 i1 = (chroma_format==CHROMA444) ? i : i>>1;
00368 j1 = (chroma_format!=CHROMA420) ? j : j>>1;
00369
00370 if((picture->pict_struct == FRAME_PICTURE) && mbi[k].dct_type
00371 && (chroma_format != CHROMA420))
00372 {
00373
00374 offs = i1 + (n & 8) + chrom_width * (j1 + ((n & 2) >> 1));
00375 lx = chrom_width << 1;
00376 }
00377 else
00378 {
00379
00380 offs = i1 + (n&8) + chrom_width2 * (j1 + ((n & 2) << 2));
00381 lx = chrom_width2;
00382 }
00383
00384 if(picture->pict_struct == BOTTOM_FIELD)
00385 offs += chrom_width;
00386 }
00387
00388
00389 (*pidct)(blocks[k*block_count+n], engine->temp);
00390 (*padd_pred)(pred[cc]+offs,cur[cc]+offs,lx,blocks[k*block_count+n]);
00391
00392 }
00393
00394 k++;
00395 }
00396 }
00397 pthread_mutex_unlock(&(engine->output_lock));
00398 }
00399 }
00400
00401 void itransform(pict_data_s *picture,
00402 uint8_t *pred[], uint8_t *cur[])
00403 {
00404 int i;
00405
00406 for(i = 0; i < processors; i++)
00407 {
00408 itransform_engines[i].picture = picture;
00409 itransform_engines[i].cur = cur;
00410 itransform_engines[i].pred = pred;
00411 pthread_mutex_unlock(&(itransform_engines[i].input_lock));
00412 }
00413
00414
00415 for(i = 0; i < processors; i++)
00416 {
00417 pthread_mutex_lock(&(itransform_engines[i].output_lock));
00418 }
00419 }
00420
00421 void start_itransform_engines()
00422 {
00423 int i;
00424 int rows_per_processor = (int)((float)height2 / 16 / processors + 0.5);
00425 int current_row = 0;
00426 pthread_attr_t attr;
00427 pthread_mutexattr_t mutex_attr;
00428
00429 pthread_mutexattr_init(&mutex_attr);
00430 pthread_attr_init(&attr);
00431 itransform_engines = calloc(1, sizeof(transform_engine_t) * processors);
00432 for(i = 0; i < processors; i++)
00433 {
00434 itransform_engines[i].start_row = current_row * 16;
00435 current_row += rows_per_processor;
00436 if(current_row > height2 / 16) current_row = height2 / 16;
00437 itransform_engines[i].end_row = current_row * 16;
00438 pthread_mutex_init(&(itransform_engines[i].input_lock), &mutex_attr);
00439 pthread_mutex_lock(&(itransform_engines[i].input_lock));
00440 pthread_mutex_init(&(itransform_engines[i].output_lock), &mutex_attr);
00441 pthread_mutex_lock(&(itransform_engines[i].output_lock));
00442 itransform_engines[i].done = 0;
00443 pthread_create(&(itransform_engines[i].tid),
00444 &attr,
00445 (void*)itransform_engine_loop,
00446 &itransform_engines[i]);
00447 }
00448 }
00449
00450 void stop_itransform_engines()
00451 {
00452 int i;
00453 for(i = 0; i < processors; i++)
00454 {
00455 itransform_engines[i].done = 1;
00456 pthread_mutex_unlock(&(itransform_engines[i].input_lock));
00457 pthread_join(itransform_engines[i].tid, 0);
00458 pthread_mutex_destroy(&(itransform_engines[i].input_lock));
00459 pthread_mutex_destroy(&(itransform_engines[i].output_lock));
00460 }
00461 free(itransform_engines);
00462 }
00463
00464
00465
00466
00467
00468
00469
00470
00471
00472
00473 void dct_type_estimation(
00474 pict_data_s *picture,
00475 uint8_t *pred, uint8_t *cur
00476 )
00477 {
00478
00479 struct mbinfo *mbi = picture->mbinfo;
00480
00481 int16_t blk0[128], blk1[128];
00482 int i, j, i0, j0, k, offs, s0, s1, sq0, sq1, s01;
00483 double d, r;
00484
00485 k = 0;
00486
00487 for (j0=0; j0<height2; j0+=16)
00488 for (i0=0; i0<width; i0+=16)
00489 {
00490 if (picture->frame_pred_dct || picture->pict_struct!=FRAME_PICTURE)
00491 mbi[k].dct_type = 0;
00492 else
00493 {
00494
00495
00496
00497
00498
00499 for (j=0; j<8; j++)
00500 {
00501 offs = width*((j<<1)+j0) + i0;
00502 for (i=0; i<16; i++)
00503 {
00504 blk0[16*j+i] = cur[offs] - pred[offs];
00505 blk1[16*j+i] = cur[offs+width] - pred[offs+width];
00506 offs++;
00507 }
00508 }
00509
00510 s0=s1=sq0=sq1=s01=0;
00511
00512 for (i=0; i<128; i++)
00513 {
00514 s0+= blk0[i];
00515 sq0+= blk0[i]*blk0[i];
00516 s1+= blk1[i];
00517 sq1+= blk1[i]*blk1[i];
00518 s01+= blk0[i]*blk1[i];
00519 }
00520
00521 d = (sq0-(s0*s0)/128.0)*(sq1-(s1*s1)/128.0);
00522
00523 if (d>0.0)
00524 {
00525 r = (s01-(s0*s1)/128.0)/sqrt(d);
00526 if (r>0.5)
00527 mbi[k].dct_type = 0;
00528 else
00529 mbi[k].dct_type = 1;
00530 }
00531 else
00532 mbi[k].dct_type = 1;
00533 }
00534 k++;
00535 }
00536 }