00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041 #ifndef _FFT_1D_R2_H_
00042 #define _FFT_1D_R2_H_ 1
00043
00044 #include "fft_1d.h"
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090
00091
00092
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108
00109 static __inline void _fft_1d_r2(vector float *out, vector float *in, vector float *W, int log2_size)
00110 {
00111 int i, j, k;
00112 int stage, offset;
00113 int i_rev;
00114 int n, n_2, n_4, n_8, n_16, n_3_16;
00115 int w_stride, w_2stride, w_3stride, w_4stride;
00116 int stride, stride_2, stride_4, stride_3_4;
00117 vector float *W0, *W1, *W2, *W3;
00118 vector float *re0, *re1, *re2, *re3;
00119 vector float *im0, *im1, *im2, *im3;
00120 vector float *in0, *in1, *in2, *in3, *in4, *in5, *in6, *in7;
00121 vector float *out0, *out1, *out2, *out3;
00122 vector float tmp0, tmp1;
00123 vector float w0_re, w0_im, w1_re, w1_im;
00124 vector float w0, w1, w2, w3;
00125 vector float src_lo0, src_lo1, src_lo2, src_lo3;
00126 vector float src_hi0, src_hi1, src_hi2, src_hi3;
00127 vector float dst_lo0, dst_lo1, dst_lo2, dst_lo3;
00128 vector float dst_hi0, dst_hi1, dst_hi2, dst_hi3;
00129 vector float out_re_lo0, out_re_lo1, out_re_lo2, out_re_lo3;
00130 vector float out_im_lo0, out_im_lo1, out_im_lo2, out_im_lo3;
00131 vector float out_re_hi0, out_re_hi1, out_re_hi2, out_re_hi3;
00132 vector float out_im_hi0, out_im_hi1, out_im_hi2, out_im_hi3;
00133 vector float re_lo0, re_lo1, re_lo2, re_lo3;
00134 vector float im_lo0, im_lo1, im_lo2, im_lo3;
00135 vector float re_hi0, re_hi1, re_hi2, re_hi3;
00136 vector float im_hi0, im_hi1, im_hi2, im_hi3;
00137 vector float pq_lo0, pq_lo1, pq_lo2, pq_lo3;
00138 vector float pq_hi0, pq_hi1, pq_hi2, pq_hi3;
00139 vector float re[MAX_FFT_1D_SIZE/4], im[MAX_FFT_1D_SIZE/4];
00140 vector float ppmm = (vector float) { 1.0f, 1.0f, -1.0f, -1.0f};
00141 vector float pmmp = (vector float) { 1.0f, -1.0f, -1.0f, 1.0f};
00142 vector unsigned char reverse;
00143 vector unsigned char shuf_lo = (vector unsigned char) {
00144 0, 1, 2, 3, 4, 5, 6, 7,
00145 16,17,18,19, 20,21,22,23};
00146 vector unsigned char shuf_hi = (vector unsigned char) {
00147 8, 9,10,11, 12,13,14,15,
00148 24,25,26,27, 28,29,30,31};
00149 vector unsigned char shuf_0202 = (vector unsigned char) {
00150 0, 1, 2, 3, 8, 9,10,11,
00151 0, 1, 2, 3, 8, 9,10,11};
00152 vector unsigned char shuf_1313 = (vector unsigned char) {
00153 4, 5, 6, 7, 12,13,14,15,
00154 4, 5, 6, 7, 12,13,14,15};
00155 vector unsigned char shuf_0303 = (vector unsigned char) {
00156 0, 1, 2, 3, 12,13,14,15,
00157 0, 1, 2, 3, 12,13,14,15};
00158 vector unsigned char shuf_1212 = (vector unsigned char) {
00159 4, 5, 6, 7, 8, 9,10,11,
00160 4, 5, 6, 7, 8, 9,10,11};
00161 vector unsigned char shuf_0415 = (vector unsigned char) {
00162 0, 1, 2, 3, 16,17,18,19,
00163 4, 5, 6, 7, 20,21,22,23};
00164 vector unsigned char shuf_2637 = (vector unsigned char) {
00165 8, 9,10,11, 24,25,26,27,
00166 12,13,14,15,28,29,30,31};
00167 vector unsigned char shuf_0246 = (vector unsigned char) {
00168 0, 1, 2, 3, 8, 9,10,11,
00169 16,17,18,19,24,25,26,27};
00170 vector unsigned char shuf_1357 = (vector unsigned char) {
00171 4, 5, 6, 7, 12,13,14,15,
00172 20,21,22,23,28,29,30,31};
00173
00174 n = 1 << log2_size;
00175 n_2 = n >> 1;
00176 n_4 = n >> 2;
00177 n_8 = n >> 3;
00178 n_16 = n >> 4;
00179
00180 n_3_16 = n_8 + n_16;
00181
00182
00183
00184
00185 reverse = spu_or(spu_slqwbyte(spu_splats((unsigned char)0x80), log2_size),
00186 spu_rlmaskqwbyte(((vec_uchar16){15,14,13,12, 11,10,9,8,
00187 7, 6, 5, 4, 3, 2,1,0}),
00188 log2_size-16));
00189
00190
00191
00192
00193
00194
00195
00196
00197
00198
00199
00200
00201
00202
00203
00204 i = 0;
00205 i_rev = 0;
00206
00207 in0 = in;
00208 in1 = in + n_8;
00209 in2 = in + n_16;
00210 in3 = in + n_3_16;
00211
00212 in4 = in + n_4;
00213 in5 = in1 + n_4;
00214 in6 = in2 + n_4;
00215 in7 = in3 + n_4;
00216
00217 re0 = re;
00218 re1 = re + n_8;
00219 im0 = im;
00220 im1 = im + n_8;
00221
00222 w0_re = (vector float) { 1.0f, INV_SQRT_2, 0.0f, -INV_SQRT_2};
00223 w0_im = (vector float) { 0.0f, -INV_SQRT_2, -1.0f, -INV_SQRT_2};
00224
00225 do {
00226 src_lo0 = in0[i_rev];
00227 src_lo1 = in1[i_rev];
00228 src_lo2 = in2[i_rev];
00229 src_lo3 = in3[i_rev];
00230
00231 src_hi0 = in4[i_rev];
00232 src_hi1 = in5[i_rev];
00233 src_hi2 = in6[i_rev];
00234 src_hi3 = in7[i_rev];
00235
00236
00237
00238 dst_lo0 = spu_shuffle(src_lo0, src_hi0, shuf_lo);
00239 dst_hi0 = spu_shuffle(src_lo0, src_hi0, shuf_hi);
00240 dst_lo1 = spu_shuffle(src_lo1, src_hi1, shuf_lo);
00241 dst_hi1 = spu_shuffle(src_lo1, src_hi1, shuf_hi);
00242 dst_lo2 = spu_shuffle(src_lo2, src_hi2, shuf_lo);
00243 dst_hi2 = spu_shuffle(src_lo2, src_hi2, shuf_hi);
00244 dst_lo3 = spu_shuffle(src_lo3, src_hi3, shuf_lo);
00245 dst_hi3 = spu_shuffle(src_lo3, src_hi3, shuf_hi);
00246
00247
00248
00249
00250
00251 pq_lo0 = spu_madd(ppmm, dst_lo0, spu_rlqwbyte(dst_lo0, 8));
00252 pq_hi0 = spu_madd(ppmm, dst_hi0, spu_rlqwbyte(dst_hi0, 8));
00253 pq_lo1 = spu_madd(ppmm, dst_lo1, spu_rlqwbyte(dst_lo1, 8));
00254 pq_hi1 = spu_madd(ppmm, dst_hi1, spu_rlqwbyte(dst_hi1, 8));
00255 pq_lo2 = spu_madd(ppmm, dst_lo2, spu_rlqwbyte(dst_lo2, 8));
00256 pq_hi2 = spu_madd(ppmm, dst_hi2, spu_rlqwbyte(dst_hi2, 8));
00257 pq_lo3 = spu_madd(ppmm, dst_lo3, spu_rlqwbyte(dst_lo3, 8));
00258 pq_hi3 = spu_madd(ppmm, dst_hi3, spu_rlqwbyte(dst_hi3, 8));
00259
00260
00261
00262
00263
00264
00265
00266
00267
00268
00269 re_lo0 = spu_madd(ppmm,
00270 spu_shuffle(pq_lo1, pq_lo1, shuf_0303),
00271 spu_shuffle(pq_lo0, pq_lo0, shuf_0202));
00272 im_lo0 = spu_madd(pmmp,
00273 spu_shuffle(pq_lo1, pq_lo1, shuf_1212),
00274 spu_shuffle(pq_lo0, pq_lo0, shuf_1313));
00275
00276 re_lo1 = spu_madd(ppmm,
00277 spu_shuffle(pq_lo3, pq_lo3, shuf_0303),
00278 spu_shuffle(pq_lo2, pq_lo2, shuf_0202));
00279 im_lo1 = spu_madd(pmmp,
00280 spu_shuffle(pq_lo3, pq_lo3, shuf_1212),
00281 spu_shuffle(pq_lo2, pq_lo2, shuf_1313));
00282
00283
00284 re_hi0 = spu_madd(ppmm,
00285 spu_shuffle(pq_hi1, pq_hi1, shuf_0303),
00286 spu_shuffle(pq_hi0, pq_hi0, shuf_0202));
00287 im_hi0 = spu_madd(pmmp,
00288 spu_shuffle(pq_hi1, pq_hi1, shuf_1212),
00289 spu_shuffle(pq_hi0, pq_hi0, shuf_1313));
00290
00291 re_hi1 = spu_madd(ppmm,
00292 spu_shuffle(pq_hi3, pq_hi3, shuf_0303),
00293 spu_shuffle(pq_hi2, pq_hi2, shuf_0202));
00294 im_hi1 = spu_madd(pmmp,
00295 spu_shuffle(pq_hi3, pq_hi3, shuf_1212),
00296 spu_shuffle(pq_hi2, pq_hi2, shuf_1313));
00297
00298
00299
00300
00301 FFT_1D_BUTTERFLY(re0[0], im0[0], re0[1], im0[1], re_lo0, im_lo0, re_lo1, im_lo1, w0_re, w0_im);
00302 FFT_1D_BUTTERFLY(re1[0], im1[0], re1[1], im1[1], re_hi0, im_hi0, re_hi1, im_hi1, w0_re, w0_im);
00303
00304 re0 += 2;
00305 re1 += 2;
00306 im0 += 2;
00307 im1 += 2;
00308
00309 i += 8;
00310 i_rev = BIT_SWAP(i, reverse) / 2;
00311 } while (i < n_2);
00312
00313
00314
00315 for (stage=4, stride=4; stage<log2_size-1; stage++, stride += stride) {
00316 w_stride = n_2 >> stage;
00317 w_2stride = n >> stage;
00318 w_3stride = w_stride + w_2stride;
00319 w_4stride = w_2stride + w_2stride;
00320
00321 W0 = W;
00322 W1 = W + w_stride;
00323 W2 = W + w_2stride;
00324 W3 = W + w_3stride;
00325
00326 stride_2 = stride >> 1;
00327 stride_4 = stride >> 2;
00328 stride_3_4 = stride_2 + stride_4;
00329
00330 re0 = re; im0 = im;
00331 re1 = re + stride_2; im1 = im + stride_2;
00332 re2 = re + stride_4; im2 = im + stride_4;
00333 re3 = re + stride_3_4; im3 = im + stride_3_4;
00334
00335 for (i=0, offset=0; i<stride_4; i++, offset += w_4stride) {
00336
00337
00338 w0 = W0[offset];
00339 w1 = W1[offset];
00340 w2 = W2[offset];
00341 w3 = W3[offset];
00342
00343 tmp0 = spu_shuffle(w0, w2, shuf_0415);
00344 tmp1 = spu_shuffle(w1, w3, shuf_0415);
00345
00346 w0_re = spu_shuffle(tmp0, tmp1, shuf_0415);
00347 w0_im = spu_shuffle(tmp0, tmp1, shuf_2637);
00348
00349 j = i;
00350 k = i + stride;
00351 do {
00352 re_lo0 = re0[j]; im_lo0 = im0[j];
00353 re_lo1 = re1[j]; im_lo1 = im1[j];
00354
00355 re_hi0 = re2[j]; im_hi0 = im2[j];
00356 re_hi1 = re3[j]; im_hi1 = im3[j];
00357
00358 re_lo2 = re0[k]; im_lo2 = im0[k];
00359 re_lo3 = re1[k]; im_lo3 = im1[k];
00360
00361 re_hi2 = re2[k]; im_hi2 = im2[k];
00362 re_hi3 = re3[k]; im_hi3 = im3[k];
00363
00364 FFT_1D_BUTTERFLY (re0[j], im0[j], re1[j], im1[j], re_lo0, im_lo0, re_lo1, im_lo1, w0_re, w0_im);
00365 FFT_1D_BUTTERFLY_HI(re2[j], im2[j], re3[j], im3[j], re_hi0, im_hi0, re_hi1, im_hi1, w0_re, w0_im);
00366
00367 FFT_1D_BUTTERFLY (re0[k], im0[k], re1[k], im1[k], re_lo2, im_lo2, re_lo3, im_lo3, w0_re, w0_im);
00368 FFT_1D_BUTTERFLY_HI(re2[k], im2[k], re3[k], im3[k], re_hi2, im_hi2, re_hi3, im_hi3, w0_re, w0_im);
00369
00370 j += 2 * stride;
00371 k += 2 * stride;
00372 } while (j < n_4);
00373 }
00374 }
00375
00376
00377
00378
00379
00380 w_stride = n_2 >> stage;
00381 w_2stride = n >> stage;
00382 w_3stride = w_stride + w_2stride;
00383 w_4stride = w_2stride + w_2stride;
00384
00385 stride_2 = stride >> 1;
00386 stride_4 = stride >> 2;
00387
00388 stride_3_4 = stride_2 + stride_4;
00389
00390 re0 = re; im0 = im;
00391 re1 = re + stride_2; im1 = im + stride_2;
00392 re2 = re + stride_4; im2 = im + stride_4;
00393 re3 = re + stride_3_4; im3 = im + stride_3_4;
00394
00395 for (i=0, offset=0; i<stride_4; i++, offset += w_4stride) {
00396
00397
00398 w0 = W[offset];
00399 w1 = W[offset + w_stride];
00400 w2 = W[offset + w_2stride];
00401 w3 = W[offset + w_3stride];
00402
00403 tmp0 = spu_shuffle(w0, w2, shuf_0415);
00404 tmp1 = spu_shuffle(w1, w3, shuf_0415);
00405
00406 w0_re = spu_shuffle(tmp0, tmp1, shuf_0415);
00407 w0_im = spu_shuffle(tmp0, tmp1, shuf_2637);
00408
00409 j = i;
00410 k = i + stride;
00411
00412 re_lo0 = re0[j]; im_lo0 = im0[j];
00413 re_lo1 = re1[j]; im_lo1 = im1[j];
00414
00415 re_hi0 = re2[j]; im_hi0 = im2[j];
00416 re_hi1 = re3[j]; im_hi1 = im3[j];
00417
00418 re_lo2 = re0[k]; im_lo2 = im0[k];
00419 re_lo3 = re1[k]; im_lo3 = im1[k];
00420
00421 re_hi2 = re2[k]; im_hi2 = im2[k];
00422 re_hi3 = re3[k]; im_hi3 = im3[k];
00423
00424 FFT_1D_BUTTERFLY (re0[j], im0[j], re1[j], im1[j], re_lo0, im_lo0, re_lo1, im_lo1, w0_re, w0_im);
00425 FFT_1D_BUTTERFLY_HI(re2[j], im2[j], re3[j], im3[j], re_hi0, im_hi0, re_hi1, im_hi1, w0_re, w0_im);
00426
00427 FFT_1D_BUTTERFLY (re0[k], im0[k], re1[k], im1[k], re_lo2, im_lo2, re_lo3, im_lo3, w0_re, w0_im);
00428 FFT_1D_BUTTERFLY_HI(re2[k], im2[k], re3[k], im3[k], re_hi2, im_hi2, re_hi3, im_hi3, w0_re, w0_im);
00429 }
00430
00431
00432
00433
00434
00435
00436
00437
00438
00439
00440 re0 = re;
00441 re1 = re + n_8;
00442 re2 = re + n_16;
00443 re3 = re + n_3_16;
00444
00445 im0 = im;
00446 im1 = im + n_8;
00447 im2 = im + n_16;
00448 im3 = im + n_3_16;
00449
00450 out0 = out;
00451 out1 = out + n_4;
00452 out2 = out + n_8;
00453 out3 = out1 + n_8;
00454
00455 i = n_16;
00456
00457 do {
00458
00459
00460 w0 = W[0];
00461 w1 = W[1];
00462 w2 = W[2];
00463 w3 = W[3];
00464
00465 W += 4;
00466
00467 w0_re = spu_shuffle(w0, w1, shuf_0246);
00468 w0_im = spu_shuffle(w0, w1, shuf_1357);
00469 w1_re = spu_shuffle(w2, w3, shuf_0246);
00470 w1_im = spu_shuffle(w2, w3, shuf_1357);
00471
00472
00473
00474 re_lo0 = re0[0]; im_lo0 = im0[0];
00475 re_lo1 = re1[0]; im_lo1 = im1[0];
00476 re_lo2 = re0[1]; im_lo2 = im0[1];
00477 re_lo3 = re1[1]; im_lo3 = im1[1];
00478
00479 re_hi0 = re2[0]; im_hi0 = im2[0];
00480 re_hi1 = re3[0]; im_hi1 = im3[0];
00481 re_hi2 = re2[1]; im_hi2 = im2[1];
00482 re_hi3 = re3[1]; im_hi3 = im3[1];
00483
00484 re0 += 2; im0 += 2;
00485 re1 += 2; im1 += 2;
00486 re2 += 2; im2 += 2;
00487 re3 += 2; im3 += 2;
00488
00489
00490
00491 FFT_1D_BUTTERFLY (out_re_lo0, out_im_lo0, out_re_lo1, out_im_lo1, re_lo0, im_lo0, re_lo1, im_lo1, w0_re, w0_im);
00492 FFT_1D_BUTTERFLY (out_re_lo2, out_im_lo2, out_re_lo3, out_im_lo3, re_lo2, im_lo2, re_lo3, im_lo3, w1_re, w1_im);
00493
00494 FFT_1D_BUTTERFLY_HI(out_re_hi0, out_im_hi0, out_re_hi1, out_im_hi1, re_hi0, im_hi0, re_hi1, im_hi1, w0_re, w0_im);
00495 FFT_1D_BUTTERFLY_HI(out_re_hi2, out_im_hi2, out_re_hi3, out_im_hi3, re_hi2, im_hi2, re_hi3, im_hi3, w1_re, w1_im);
00496
00497
00498
00499
00500 out0[0] = spu_shuffle(out_re_lo0, out_im_lo0, shuf_0415);
00501 out0[1] = spu_shuffle(out_re_lo0, out_im_lo0, shuf_2637);
00502 out0[2] = spu_shuffle(out_re_lo2, out_im_lo2, shuf_0415);
00503 out0[3] = spu_shuffle(out_re_lo2, out_im_lo2, shuf_2637);
00504
00505 out1[0] = spu_shuffle(out_re_lo1, out_im_lo1, shuf_0415);
00506 out1[1] = spu_shuffle(out_re_lo1, out_im_lo1, shuf_2637);
00507 out1[2] = spu_shuffle(out_re_lo3, out_im_lo3, shuf_0415);
00508 out1[3] = spu_shuffle(out_re_lo3, out_im_lo3, shuf_2637);
00509
00510 out2[0] = spu_shuffle(out_re_hi0, out_im_hi0, shuf_0415);
00511 out2[1] = spu_shuffle(out_re_hi0, out_im_hi0, shuf_2637);
00512 out2[2] = spu_shuffle(out_re_hi2, out_im_hi2, shuf_0415);
00513 out2[3] = spu_shuffle(out_re_hi2, out_im_hi2, shuf_2637);
00514
00515 out3[0] = spu_shuffle(out_re_hi1, out_im_hi1, shuf_0415);
00516 out3[1] = spu_shuffle(out_re_hi1, out_im_hi1, shuf_2637);
00517 out3[2] = spu_shuffle(out_re_hi3, out_im_hi3, shuf_0415);
00518 out3[3] = spu_shuffle(out_re_hi3, out_im_hi3, shuf_2637);
00519
00520 out0 += 4;
00521 out1 += 4;
00522 out2 += 4;
00523 out3 += 4;
00524
00525 i -= 2;
00526 } while (i);
00527 }
00528
00529 #endif