00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include "../dsputil.h"
00022
00023 #include "gcc_fixes.h"
00024
00025 #include "dsputil_altivec.h"
00026
00027
00028
00029
00030
00031
/*
 * Butterfly: (pre, pim) = (pre1, pim1) + (qre1, qim1)
 *            (qre, qim) = (pre1, pim1) - (qre1, qim1)
 *
 * pre/pim/qre/qim are assignable outputs; pre1/pim1/qre1/qim1 are inputs.
 * All four inputs are copied into temporaries before any output is written,
 * so the outputs may name the same objects as the inputs (in-place use).
 *
 * The body is wrapped in do { } while (0) so that "BF(...);" is a single
 * statement and can be used safely as the unbraced body of an if/else.
 * The temporaries are named ax, ay, bx, by: do not pass expressions that
 * refer to caller variables with those names.
 */
#define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \
do {\
    FFTSample ax, ay, bx, by;\
    bx = (pre1);\
    by = (pim1);\
    ax = (qre1);\
    ay = (qim1);\
    (pre) = (bx + ax);\
    (pim) = (by + ay);\
    (qre) = (bx - ax);\
    (qim) = (by - ay);\
} while (0)
/* Product of a and b; both operands are parenthesized before multiplying. */
#define MUL16(a,b) ((a) * (b))

/*
 * Complex multiply: (pre + i*pim) = (are + i*aim) * (bre + i*bim)
 *
 *   pre = are*bre - aim*bim
 *   pim = are*bim + bre*aim
 *
 * pre is assigned before pim is computed, so pre must not name the same
 * object as any of are/aim/bre/bim (no in-place use).
 *
 * The body is wrapped in do { } while (0) so that "CMUL(...);" is a single
 * statement and can be used safely as the unbraced body of an if/else.
 */
#define CMUL(pre, pim, are, aim, bre, bim) \
do {\
    (pre) = (MUL16(are, bre) - MUL16(aim, bim));\
    (pim) = (MUL16(are, bim) + MUL16(bre, aim));\
} while (0)
00050
00051
00063 void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z)
00064 {
00065 POWERPC_PERF_DECLARE(altivec_fft_num, s->nbits >= 6);
00066 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
00067 int ln = s->nbits;
00068 int j, np, np2;
00069 int nblocks, nloops;
00070 register FFTComplex *p, *q;
00071 FFTComplex *exptab = s->exptab;
00072 int l;
00073 FFTSample tmp_re, tmp_im;
00074
00075 POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6);
00076
00077 np = 1 << ln;
00078
00079
00080
00081 p=&z[0];
00082 j=(np >> 1);
00083 do {
00084 BF(p[0].re, p[0].im, p[1].re, p[1].im,
00085 p[0].re, p[0].im, p[1].re, p[1].im);
00086 p+=2;
00087 } while (--j != 0);
00088
00089
00090
00091
00092 p=&z[0];
00093 j=np >> 2;
00094 if (s->inverse) {
00095 do {
00096 BF(p[0].re, p[0].im, p[2].re, p[2].im,
00097 p[0].re, p[0].im, p[2].re, p[2].im);
00098 BF(p[1].re, p[1].im, p[3].re, p[3].im,
00099 p[1].re, p[1].im, -p[3].im, p[3].re);
00100 p+=4;
00101 } while (--j != 0);
00102 } else {
00103 do {
00104 BF(p[0].re, p[0].im, p[2].re, p[2].im,
00105 p[0].re, p[0].im, p[2].re, p[2].im);
00106 BF(p[1].re, p[1].im, p[3].re, p[3].im,
00107 p[1].re, p[1].im, p[3].im, -p[3].re);
00108 p+=4;
00109 } while (--j != 0);
00110 }
00111
00112
00113 nblocks = np >> 3;
00114 nloops = 1 << 2;
00115 np2 = np >> 1;
00116 do {
00117 p = z;
00118 q = z + nloops;
00119 for (j = 0; j < nblocks; ++j) {
00120 BF(p->re, p->im, q->re, q->im,
00121 p->re, p->im, q->re, q->im);
00122
00123 p++;
00124 q++;
00125 for(l = nblocks; l < np2; l += nblocks) {
00126 CMUL(tmp_re, tmp_im, exptab[l].re, exptab[l].im, q->re, q->im);
00127 BF(p->re, p->im, q->re, q->im,
00128 p->re, p->im, tmp_re, tmp_im);
00129 p++;
00130 q++;
00131 }
00132
00133 p += nloops;
00134 q += nloops;
00135 }
00136 nblocks = nblocks >> 1;
00137 nloops = nloops << 1;
00138 } while (nblocks != 0);
00139
00140 POWERPC_PERF_STOP_COUNT(altivec_fft_num, s->nbits >= 6);
00141
00142 #else
00143 #ifdef CONFIG_DARWIN
00144 register const vector float vczero = (const vector float)(0.);
00145 #else
00146 register const vector float vczero = (const vector float){0.,0.,0.,0.};
00147 #endif
00148
00149 int ln = s->nbits;
00150 int j, np, np2;
00151 int nblocks, nloops;
00152 register FFTComplex *p, *q;
00153 FFTComplex *cptr, *cptr1;
00154 int k;
00155
00156 POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6);
00157
00158 np = 1 << ln;
00159
00160 {
00161 vector float *r, a, b, a1, c1, c2;
00162
00163 r = (vector float *)&z[0];
00164
00165 c1 = vcii(p,p,n,n);
00166
00167 if (s->inverse)
00168 {
00169 c2 = vcii(p,p,n,p);
00170 }
00171 else
00172 {
00173 c2 = vcii(p,p,p,n);
00174 }
00175
00176 j = (np >> 2);
00177 do {
00178 a = vec_ld(0, r);
00179 a1 = vec_ld(sizeof(vector float), r);
00180
00181 b = vec_perm(a,a,vcprmle(1,0,3,2));
00182 a = vec_madd(a,c1,b);
00183
00184
00185 b = vec_perm(a1,a1,vcprmle(1,0,3,2));
00186 b = vec_madd(a1,c1,b);
00187
00188
00189
00190 b = vec_perm(b,b,vcprmle(2,3,1,0));
00191
00192
00193 vec_st(vec_madd(b,c2,a), 0, r);
00194 vec_st(vec_nmsub(b,c2,a), sizeof(vector float), r);
00195
00196 r += 2;
00197 } while (--j != 0);
00198 }
00199
00200
00201 nblocks = np >> 3;
00202 nloops = 1 << 2;
00203 np2 = np >> 1;
00204
00205 cptr1 = s->exptab1;
00206 do {
00207 p = z;
00208 q = z + nloops;
00209 j = nblocks;
00210 do {
00211 cptr = cptr1;
00212 k = nloops >> 1;
00213 do {
00214 vector float a,b,c,t1;
00215
00216 a = vec_ld(0, (float*)p);
00217 b = vec_ld(0, (float*)q);
00218
00219
00220 c = vec_ld(0, (float*)cptr);
00221
00222 t1 = vec_madd(c, vec_perm(b,b,vcprmle(2,2,0,0)),vczero);
00223 c = vec_ld(sizeof(vector float), (float*)cptr);
00224
00225 b = vec_madd(c, vec_perm(b,b,vcprmle(3,3,1,1)),t1);
00226
00227
00228 vec_st(vec_add(a,b), 0, (float*)p);
00229 vec_st(vec_sub(a,b), 0, (float*)q);
00230
00231 p += 2;
00232 q += 2;
00233 cptr += 4;
00234 } while (--k);
00235
00236 p += nloops;
00237 q += nloops;
00238 } while (--j);
00239 cptr1 += nloops * 2;
00240 nblocks = nblocks >> 1;
00241 nloops = nloops << 1;
00242 } while (nblocks != 0);
00243
00244 POWERPC_PERF_STOP_COUNT(altivec_fft_num, s->nbits >= 6);
00245
00246 #endif
00247 }