00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038 #include <stdlib.h>
00039 #include <string.h>
00040 #include "../dsputil.h"
00041
00042 #include "gcc_fixes.h"
00043
00044 #include "dsputil_altivec.h"
00045
/*
 * Shorthand names for the AltiVec vector types used throughout this file
 * (s/u = signed/unsigned, 8/16/32 = element width in bits).
 *
 * NOTE(review): const_vector_s16_t expands to `const_vector signed short`.
 * `const_vector` is not defined in this file; it is presumably supplied by
 * gcc_fixes.h -- confirm before touching this line.
 */
00046 #define vector_s16_t vector signed short
00047 #define const_vector_s16_t const_vector signed short
00048 #define vector_u16_t vector unsigned short
00049 #define vector_s8_t vector signed char
00050 #define vector_u8_t vector unsigned char
00051 #define vector_s32_t vector signed int
00052 #define vector_u32_t vector unsigned int
00053
/*
 * IDCT_HALF: one 1-D pass of the 8-point inverse DCT, carried out on all
 * eight 16-bit lanes of each vector at once.
 *
 * Reads:    vx0..vx7, the coefficients a0, a1, a2, ma2, c4, mc4 and `zero`.
 * Writes:   vy0..vy7.
 * Clobbers: t0..t8.  Several temporaries (t1, t3, t7, t8, ...) are
 *           overwritten as soon as their previous value has been consumed,
 *           so the statements must not be reordered.
 *
 * Every name above must already be declared in the expanding scope; the
 * IDCT macro takes care of that.
 *
 * All arithmetic saturates: vec_adds/vec_subs are saturating add/subtract,
 * and vec_mradds (a, b, c) is saturate (((a * b + 0x4000) >> 15) + c), i.e.
 * a rounded fixed-point multiply followed by an add.  A subtraction is
 * expressed either by passing vec_subs (zero, x) as the addend or by using
 * a negated coefficient (ma2, mc4).
 */
00054 #define IDCT_HALF \
00055 /* odd inputs (1, 7) and (5, 3): paired multiply-adds with a1 and a2 */ \
00056 t1 = vec_mradds (a1, vx7, vx1 ); \
00057 t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7)); \
00058 t7 = vec_mradds (a2, vx5, vx3); \
00059 t3 = vec_mradds (ma2, vx3, vx5); \
00060 /* even inputs (0, 4) and (2, 6), then first combine of the odd terms */ \
00061 \
00062 t5 = vec_adds (vx0, vx4); \
00063 t0 = vec_subs (vx0, vx4); \
00064 t2 = vec_mradds (a0, vx6, vx2); \
00065 t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6)); \
00066 t6 = vec_adds (t8, t3); \
00067 t3 = vec_subs (t8, t3); \
00068 t8 = vec_subs (t1, t7); \
00069 t1 = vec_adds (t1, t7); \
00070 /* second round of sums and differences */ \
00071 \
00072 t7 = vec_adds (t5, t2); \
00073 t2 = vec_subs (t5, t2); \
00074 t5 = vec_adds (t0, t4); \
00075 t0 = vec_subs (t0, t4); \
00076 t4 = vec_subs (t8, t3); \
00077 t3 = vec_adds (t8, t3); \
00078 /* outputs; vy1/vy6 and vy2/vy5 take a last multiply-add by c4 / mc4 */ \
00079 \
00080 vy0 = vec_adds (t7, t1); \
00081 vy7 = vec_subs (t7, t1); \
00082 vy1 = vec_mradds (c4, t3, t5); \
00083 vy6 = vec_mradds (mc4, t3, t5); \
00084 vy2 = vec_mradds (c4, t4, t0); \
00085 vy5 = vec_mradds (mc4, t4, t0); \
00086 vy3 = vec_adds (t2, t6); \
00087 vy4 = vec_subs (t2, t6);
00088
00089
/*
 * IDCT: complete 8x8 inverse DCT of `block`.
 *
 * `block` must be in scope and indexable as eight vector_s16_t values
 * (block[0]..block[7]); the macro only reads it.  The macro also uses the
 * file-scope `constants` table and the IDCT_HALF macro.
 *
 * It expands to local declarations followed by statements, so it has to be
 * placed where declarations are still allowed, and it can be used only once
 * per scope.  Steps, in order:
 *   1. splat the scalar coefficients out of constants[0]; `bias` is 32-bit
 *      element 3 of that vector (its 16-bit values 32 and 31) replicated
 *      across the whole vector;
 *   2. pre-scale: shift each input left by 4, then vec_mradds it with the
 *      constants[] vector chosen for that input
 *      (0, 4 -> [1];  1, 7 -> [2];  2, 6 -> [3];  3, 5 -> [4]);
 *   3. first IDCT_HALF pass (vx* -> vy*);
 *   4. three rounds of vec_mergeh/vec_mergel, which together transpose the
 *      8x8 block of 16-bit values; `bias` is added to vx0 in the last round
 *      (presumably the rounding term for step 6 -- confirm);
 *   5. second IDCT_HALF pass;
 *   6. arithmetic shift right by 6.
 *
 * Result: vx0..vx7 hold the eight output vectors as signed 16-bit values.
 * All declared locals, `zero` included, stay visible to the code that
 * follows the expansion.
 */
00090 #define IDCT \
00091 vector_s16_t vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \
00092 vector_s16_t vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \
00093 vector_s16_t a0, a1, a2, ma2, c4, mc4, zero, bias; \
00094 vector_s16_t t0, t1, t2, t3, t4, t5, t6, t7, t8; \
00095 vector_u16_t shift; \
00096 /* scalar coefficients, one 16-bit lane of constants[0] each */ \
00097 c4 = vec_splat (constants[0], 0); \
00098 a0 = vec_splat (constants[0], 1); \
00099 a1 = vec_splat (constants[0], 2); \
00100 a2 = vec_splat (constants[0], 3); \
00101 mc4 = vec_splat (constants[0], 4); \
00102 ma2 = vec_splat (constants[0], 5); \
00103 bias = (vector_s16_t)vec_splat ((vector_s32_t)constants[0], 3); \
00104 \
00105 zero = vec_splat_s16 (0); \
00106 shift = vec_splat_u16 (4); \
00107 /* pre-scale the inputs: (block[i] << 4) times a per-lane factor */ \
00108 vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero); \
00109 vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero); \
00110 vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero); \
00111 vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero); \
00112 vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero); \
00113 vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero); \
00114 vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero); \
00115 vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero); \
00116 /* first 1-D pass: vx* -> vy* */ \
00117 IDCT_HALF \
00118 /* transpose, merge round 1 of 3 */ \
00119 vx0 = vec_mergeh (vy0, vy4); \
00120 vx1 = vec_mergel (vy0, vy4); \
00121 vx2 = vec_mergeh (vy1, vy5); \
00122 vx3 = vec_mergel (vy1, vy5); \
00123 vx4 = vec_mergeh (vy2, vy6); \
00124 vx5 = vec_mergel (vy2, vy6); \
00125 vx6 = vec_mergeh (vy3, vy7); \
00126 vx7 = vec_mergel (vy3, vy7); \
00127 /* transpose, merge round 2 of 3 */ \
00128 vy0 = vec_mergeh (vx0, vx4); \
00129 vy1 = vec_mergel (vx0, vx4); \
00130 vy2 = vec_mergeh (vx1, vx5); \
00131 vy3 = vec_mergel (vx1, vx5); \
00132 vy4 = vec_mergeh (vx2, vx6); \
00133 vy5 = vec_mergel (vx2, vx6); \
00134 vy6 = vec_mergeh (vx3, vx7); \
00135 vy7 = vec_mergel (vx3, vx7); \
00136 /* transpose, merge round 3 of 3; bias is added to vx0 only */ \
00137 vx0 = vec_adds (vec_mergeh (vy0, vy4), bias); \
00138 vx1 = vec_mergel (vy0, vy4); \
00139 vx2 = vec_mergeh (vy1, vy5); \
00140 vx3 = vec_mergel (vy1, vy5); \
00141 vx4 = vec_mergeh (vy2, vy6); \
00142 vx5 = vec_mergel (vy2, vy6); \
00143 vx6 = vec_mergeh (vy3, vy7); \
00144 vx7 = vec_mergel (vy3, vy7); \
00145 /* second 1-D pass: vx* -> vy* */ \
00146 IDCT_HALF \
00147 /* final scaling: arithmetic shift right by 6, results back in vx* */ \
00148 shift = vec_splat_u16 (6); \
00149 vx0 = vec_sra (vy0, shift); \
00150 vx1 = vec_sra (vy1, shift); \
00151 vx2 = vec_sra (vy2, shift); \
00152 vx3 = vec_sra (vy3, shift); \
00153 vx4 = vec_sra (vy4, shift); \
00154 vx5 = vec_sra (vy5, shift); \
00155 vx6 = vec_sra (vy6, shift); \
00156 vx7 = vec_sra (vy7, shift);
00157
00158
/*
 * Coefficient table consumed by the IDCT macro.
 *
 * constants[0] packs the scalar coefficients, one per 16-bit lane, in the
 * order the IDCT macro splats them:
 *   lane 0 = c4, lane 1 = a0, lane 2 = a1, lane 3 = a2,
 *   lane 4 = mc4 (-c4), lane 5 = ma2 (-a2),
 *   lanes 6-7 = the (32, 31) pair that becomes `bias`.
 * constants[1..4] are the per-lane pre-scale multipliers: [1] is applied to
 * inputs 0 and 4, [2] to inputs 1 and 7, [3] to inputs 2 and 6, [4] to
 * inputs 3 and 5.
 *
 * The lane positions are hard-wired into the vec_splat indices in IDCT, so
 * the order of the values must not change.
 *
 * NOTE(review): the values look like Q15 fixed point, matching the >> 15 in
 * vec_mradds (e.g. 23170 ~= 32768 * cos (pi / 4), 13573 ~= 32768 *
 * tan (pi / 8)) -- confirm against the derivation of the algorithm.
 * AVV is not defined in this file; presumably it is the vector-literal
 * helper from gcc_fixes.h.
 */
00159 static const_vector_s16_t constants[5] = {
00160 (vector_s16_t) AVV(23170, 13573, 6518, 21895, -23170, -21895, 32, 31),
00161 (vector_s16_t) AVV(16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725),
00162 (vector_s16_t) AVV(22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521),
00163 (vector_s16_t) AVV(21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692),
00164 (vector_s16_t) AVV(19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722)
00165 };
00166
/**
 * Inverse-DCT an 8x8 coefficient block and store the result at dest,
 * replacing the previous contents.
 *
 * Each output row is packed to 8 unsigned bytes with saturation and written
 * with two 4-byte element stores.
 *
 * @param dest   address of the first output row; advanced by stride after
 *               every row except the last
 * @param stride distance in bytes between consecutive output rows
 * @param block  eight vectors of signed 16-bit coefficients; the AltiVec
 *               path only reads them
 *
 * NOTE(review): vec_ste selects which vector element to store from the low
 * bits of the store address. The packed row is duplicated in both halves of
 * tmp, so the stores are correct for rows that start at offset 0 or 8 within
 * a 16-byte line; dest and stride therefore have to keep every row 8-byte
 * aligned -- confirm that callers guarantee this.
 */
00167 void idct_put_altivec(uint8_t* dest, int stride, vector_s16_t* block)
00168 {
00169 POWERPC_PERF_DECLARE(altivec_idct_put_num, 1);
/* Build-time switch: hand the block to the C routine simple_idct_put(). */
00170 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
00171 POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1);
00172 void simple_idct_put(uint8_t *dest, int line_size, int16_t *block);
00173 simple_idct_put(dest, stride, (int16_t*)block);
00174 POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1);
00175 #else
00176 vector_u8_t tmp;
00177
/*
 * NOTE(review): IDCT below starts with declarations. The guard presumably
 * keeps an empty statement from preceding them when performance reporting
 * is compiled out -- confirm against the POWERPC_PERF_* definitions.
 */
00178 #ifdef POWERPC_PERFORMANCE_REPORT
00179 POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1);
00180 #endif
/* Declares the working vectors and leaves the eight result rows in vx0..vx7. */
00181 IDCT
00182
/*
 * COPY: pack one row of 16-bit results to unsigned bytes with saturation.
 * vec_packsu (src, src) puts the same 8 bytes in both halves of tmp; the two
 * vec_ste calls then store the 4-byte elements that belong at dest and
 * dest + 4. The macro stays defined after this function.
 */
00183 #define COPY(dest,src) \
00184 tmp = vec_packsu (src, src); \
00185 vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest); \
00186 vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
00187
00188 COPY (dest, vx0) dest += stride;
00189 COPY (dest, vx1) dest += stride;
00190 COPY (dest, vx2) dest += stride;
00191 COPY (dest, vx3) dest += stride;
00192 COPY (dest, vx4) dest += stride;
00193 COPY (dest, vx5) dest += stride;
00194 COPY (dest, vx6) dest += stride;
00195 COPY (dest, vx7)
00196
00197 POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1);
00198 #endif
00199 }
00200
/**
 * Inverse-DCT an 8x8 coefficient block and add the result to the bytes
 * already stored at dest.
 *
 * For each row the 8 existing bytes are zero-extended to 16 bits, added to
 * the IDCT output with signed saturation, packed back to unsigned bytes with
 * saturation and written with two 4-byte element stores.
 *
 * @param dest   address of the first row to update; advanced by stride after
 *               every row except the last
 * @param stride distance in bytes between consecutive rows
 * @param block  eight vectors of signed 16-bit coefficients; the AltiVec
 *               path only reads them
 *
 * NOTE(review): the permute vectors are computed once, from dest and
 * dest + stride, and then reused for every even and every odd row
 * respectively. That is only correct if the position of a row inside its
 * 16-byte line repeats every two rows (2 * stride a multiple of 16) and if
 * all 8 bytes of a row lie in one 16-byte line (8-byte aligned rows, which
 * the vec_ste stores need as well) -- confirm that callers guarantee this.
 */
00201 void idct_add_altivec(uint8_t* dest, int stride, vector_s16_t* block)
00202 {
00203 POWERPC_PERF_DECLARE(altivec_idct_add_num, 1);
/* Build-time switch: hand the block to the C routine simple_idct_add(). */
00204 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
00205 POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1);
00206 void simple_idct_add(uint8_t *dest, int line_size, int16_t *block);
00207 simple_idct_add(dest, stride, (int16_t*)block);
00208 POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1);
00209 #else
00210 vector_u8_t tmp;
00211 vector_s16_t tmp2, tmp3;
00212 vector_u8_t perm0;
00213 vector_u8_t perm1;
00214 vector_u8_t p0, p1, p;
00215
/*
 * NOTE(review): IDCT below starts with declarations. The guard presumably
 * keeps an empty statement from preceding them when performance reporting
 * is compiled out -- confirm against the POWERPC_PERF_* definitions.
 */
00216 #ifdef POWERPC_PERFORMANCE_REPORT
00217 POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1);
00218 #endif
00219
/* Declares the working vectors (including `zero`) and leaves the eight
   result rows in vx0..vx7. */
00220 IDCT
00221
/*
 * Build the permute vectors that widen 8 destination bytes to 16 bits.
 * vec_lvsl yields the byte indices starting at the address's offset within
 * its 16-byte line. Interleaving them with 0xFF gives (0xFF, i), (0xFF, i+1),
 * ... and, in vec_perm (tmp, zero, perm), index 0xFF picks a byte of `zero`,
 * so every selected byte ends up zero-extended.
 * perm0 is derived from dest (rows 0, 2, 4, 6), perm1 from dest + stride
 * (rows 1, 3, 5, 7).
 */
00222 p0 = vec_lvsl (0, dest);
00223 p1 = vec_lvsl (stride, dest);
00224 p = vec_splat_u8 (-1);
00225 perm0 = vec_mergeh (p, p0);
00226 perm1 = vec_mergeh (p, p1);
00227
/*
 * ADD: update one row. vec_ld fetches the 16-byte line containing dest,
 * perm extracts and widens the 8 bytes of the row, vec_adds adds the IDCT
 * output with saturation, vec_packsu clamps back to unsigned bytes (same 8
 * bytes in both halves of tmp) and the two vec_ste calls store the 4-byte
 * elements that belong at dest and dest + 4. Uses tmp, tmp2, tmp3 and `zero`
 * from the enclosing scope; the macro stays defined after this function.
 */
00228 #define ADD(dest,src,perm) \
00229 \
00230 tmp = vec_ld (0, dest); \
00231 tmp2 = (vector_s16_t)vec_perm (tmp, (vector_u8_t)zero, perm); \
00232 tmp3 = vec_adds (tmp2, src); \
00233 tmp = vec_packsu (tmp3, tmp3); \
00234 vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest); \
00235 vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
00236
00237 ADD (dest, vx0, perm0) dest += stride;
00238 ADD (dest, vx1, perm1) dest += stride;
00239 ADD (dest, vx2, perm0) dest += stride;
00240 ADD (dest, vx3, perm1) dest += stride;
00241 ADD (dest, vx4, perm0) dest += stride;
00242 ADD (dest, vx5, perm1) dest += stride;
00243 ADD (dest, vx6, perm0) dest += stride;
00244 ADD (dest, vx7, perm1)
00245
00246 POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1);
00247 #endif
00248 }
00249