00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #include "../dsputil.h"
00020 #include <math.h>
00021
00022 #ifdef HAVE_BUILTIN_VECTOR
00023
00024 #include <xmmintrin.h>
00025
/*
 * Per-lane sign masks (+1.0 / -1.0), 16-byte aligned so they can be read
 * directly as one __m128.  Each name spells out the signs of lanes 0..3:
 * 'p' is +1 and 'm' is -1.
 */

/* lanes: +1 +1 +1 -1 */
static const float p1p1p1m1[4] __attribute__((aligned(16))) = {
    1.0f, 1.0f, 1.0f, -1.0f
};

/* lanes: +1 +1 -1 +1 */
static const float p1p1m1p1[4] __attribute__((aligned(16))) = {
    1.0f, 1.0f, -1.0f, 1.0f
};

/* lanes: +1 +1 -1 -1 */
static const float p1p1m1m1[4] __attribute__((aligned(16))) = {
    1.0f, 1.0f, -1.0f, -1.0f
};
00034
00035 #if 0
/*
 * Debug aid: print the four float lanes of 'a' in memory order on one line,
 * prefixed by 'str'.
 */
static void print_v4sf(const char *str, __m128 a)
{
    float lane[4];

    /* Copy the vector out to plain floats instead of aliasing it. */
    _mm_storeu_ps(lane, a);
    printf("%s: %f %f %f %f\n",
           str, lane[0], lane[1], lane[2], lane[3]);
}
00042 #endif
00043
00044
00045 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
00046 {
00047 int ln = s->nbits;
00048 int j, np, np2;
00049 int nblocks, nloops;
00050 register FFTComplex *p, *q;
00051 FFTComplex *cptr, *cptr1;
00052 int k;
00053
00054 np = 1 << ln;
00055
00056 {
00057 __m128 *r, a, b, a1, c1, c2;
00058
00059 r = (__m128 *)&z[0];
00060 c1 = *(__m128 *)p1p1m1m1;
00061 c2 = *(__m128 *)p1p1p1m1;
00062 if (s->inverse)
00063 c2 = *(__m128 *)p1p1m1p1;
00064 else
00065 c2 = *(__m128 *)p1p1p1m1;
00066
00067 j = (np >> 2);
00068 do {
00069 a = r[0];
00070 b = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2));
00071 a = _mm_mul_ps(a, c1);
00072
00073 a = _mm_add_ps(a, b);
00074
00075 a1 = r[1];
00076 b = _mm_shuffle_ps(a1, a1, _MM_SHUFFLE(1, 0, 3, 2));
00077 a1 = _mm_mul_ps(a1, c1);
00078
00079 b = _mm_add_ps(a1, b);
00080
00081
00082 b = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 1, 0));
00083 b = _mm_mul_ps(b, c2);
00084
00085
00086 r[0] = _mm_add_ps(a, b);
00087 r[1] = _mm_sub_ps(a, b);
00088 r += 2;
00089 } while (--j != 0);
00090 }
00091
00092
00093 nblocks = np >> 3;
00094 nloops = 1 << 2;
00095 np2 = np >> 1;
00096
00097 cptr1 = s->exptab1;
00098 do {
00099 p = z;
00100 q = z + nloops;
00101 j = nblocks;
00102 do {
00103 cptr = cptr1;
00104 k = nloops >> 1;
00105 do {
00106 __m128 a, b, c, t1, t2;
00107
00108 a = *(__m128 *)p;
00109 b = *(__m128 *)q;
00110
00111
00112 c = *(__m128 *)cptr;
00113
00114 t1 = _mm_mul_ps(c,
00115 _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 2, 0, 0)));
00116 c = *(__m128 *)(cptr + 2);
00117
00118 t2 = _mm_mul_ps(c,
00119 _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 3, 1, 1)));
00120 b = _mm_add_ps(t1, t2);
00121
00122
00123 *(__m128 *)p = _mm_add_ps(a, b);
00124 *(__m128 *)q = _mm_sub_ps(a, b);
00125
00126 p += 2;
00127 q += 2;
00128 cptr += 4;
00129 } while (--k);
00130
00131 p += nloops;
00132 q += nloops;
00133 } while (--j);
00134 cptr1 += nloops * 2;
00135 nblocks = nblocks >> 1;
00136 nloops = nloops << 1;
00137 } while (nblocks != 0);
00138 }
00139
00140 #endif