NFFT Logo 3.2.3
nfft.c
1 /*
2  * Copyright (c) 2002, 2012 Jens Keiner, Stefan Kunis, Daniel Potts
3  *
4  * This program is free software; you can redistribute it and/or modify it under
5  * the terms of the GNU General Public License as published by the Free Software
6  * Foundation; either version 2 of the License, or (at your option) any later
7  * version.
8  *
9  * This program is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11  * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
12  * details.
13  *
14  * You should have received a copy of the GNU General Public License along with
15  * this program; if not, write to the Free Software Foundation, Inc., 51
16  * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17  */
18 
19 /* $Id: nfft.c 3896 2012-10-10 12:19:26Z tovo $ */
20 
21 /* Nonequispaced FFT */
22 
23 /* Authors: D. Potts, S. Kunis 2002-2009, Jens Keiner 2009, Toni Volkmer 2012 */
24 
25 /* configure header */
26 #include "config.h"
27 
28 /* complex datatype (maybe) */
29 #ifdef HAVE_COMPLEX_H
30 #include<complex.h>
31 #endif
32 
33 /* NFFT headers */
34 #include "nfft3util.h"
35 #include "nfft3.h"
36 #include "infft.h"
37 
38 #ifdef _OPENMP
39 #include <omp.h>
40 #endif
41 
42 #ifdef OMP_ASSERT
43 #include <assert.h>
44 #endif
45 
46 
61 static void nfft_sort_nodes_for_better_cache_handle(int d,
62  const int *n, int m, int local_x_num, const R *local_x, int *ar_x)
63 {
64  int u_j[d], i, j, help, rhigh;
65  int *ar_x_temp;
66  int nprod;
67 
68  for(i = 0; i < local_x_num; i++) {
69  ar_x[2*i] = 0;
70  ar_x[2*i+1] = i;
71  for(j = 0; j < d; j++) {
72  help = (int) floor( n[j]*local_x[d*i+j] - m);
73  u_j[j] = (help%n[j]+n[j])%n[j];
74 
75  ar_x[2*i] += u_j[j];
76  if (j+1 < d)
77  ar_x[2*i] *= n[j+1];
78  }
79  }
80 
81  for (j = 0, nprod = 1; j < d; j++)
82  nprod *= n[j];
83 
84  rhigh = (int) ceil(log2(nprod)) - 1;
85 
86  ar_x_temp = (int *) nfft_malloc(2*local_x_num*sizeof(int));
87  nfft_sort_node_indices_radix_lsdf(local_x_num, ar_x, ar_x_temp, rhigh);
88 #ifdef OMP_ASSERT
89  for (i = 1; i < local_x_num; i++)
90  assert(ar_x[2*(i-1)] <= ar_x[2*i]);
91 #endif
92  nfft_free(ar_x_temp);
93 }
94 
103 static void nfft_sort_nodes(const nfft_plan *ths)
104 {
105  if (ths->nfft_flags & NFFT_SORT_NODES)
106  nfft_sort_nodes_for_better_cache_handle(ths->d, ths->n, ths->m, ths->M_total, ths->x, ths->index_x);
107 }
108 
130 /* some macros to initialize arrays before executing a transformation */
131 #define MACRO_ndft_init_result_trafo memset(f,0,ths->M_total*sizeof(C));
132 #define MACRO_ndft_init_result_conjugated MACRO_ndft_init_result_trafo
133 #define MACRO_ndft_init_result_adjoint memset(f_hat,0,ths->N_total*sizeof(C));
134 #define MACRO_ndft_init_result_transposed MACRO_ndft_init_result_adjoint
135 
136 /* exponent of complex exponentials */
137 #define MACRO_ndft_sign_trafo K2PI*ths->x[j*ths->d+t]
138 #define MACRO_ndft_sign_conjugated -K2PI*ths->x[j*ths->d+t]
139 #define MACRO_ndft_sign_adjoint K2PI*ths->x[j*ths->d+t]
140 #define MACRO_ndft_sign_transposed -K2PI*ths->x[j*ths->d+t]
141 
142 #define MACRO_init_k_N_Omega_x(which_one) \
143 { \
144  for (t = 0; t < ths->d; t++) \
145  { \
146  k[t] = -ths->N[t]/2; \
147  x[t] = MACRO_ndft_sign_ ## which_one; \
148  Omega[t+1] = k[t]*x[t] + Omega[t]; \
149  } \
150  omega = Omega[ths->d]; \
151 } \
152 
153 #define MACRO_count_k_N_Omega \
154 { \
155  for (t = ths->d-1; (t >= 1) && (k[t] == ths->N[t]/2-1); t--) \
156  k[t]-= ths->N[t]-1; \
157  \
158  k[t]++; \
159 \
160  for (t2 = t; t2 < ths->d; t2++) \
161  Omega[t2+1] = k[t2]*x[t2] + Omega[t2]; \
162  \
163  omega = Omega[ths->d]; \
164 }
165 
166 #define MACRO_ndft_compute_trafo f[j] += f_hat[k_L]*CEXP(-II*omega);
167 #define MACRO_ndft_compute_conjugated MACRO_ndft_compute_trafo
168 #define MACRO_ndft_compute_adjoint f_hat[k_L] += f[j]*CEXP(+II*omega);
169 #define MACRO_ndft_compute_transposed MACRO_ndft_compute_adjoint
170 
171 #define MACRO_ndft(which_one) \
172 void nfft_ ## which_one ## _direct (nfft_plan *ths) \
173 { \
174  C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f; \
175  \
176  MACRO_ndft_init_result_ ## which_one \
177  \
178  if (ths->d == 1) \
179  { \
180  /* specialize for univariate case, rationale: faster */ \
181  const int t = 0; \
182  int j; \
183  for (j = 0; j < ths->M_total; j++) \
184  { \
185  int k_L; \
186  for(k_L = 0; k_L < ths->N_total; k_L++) \
187  { \
188  R omega = (k_L - (ths->N_total/2)) * MACRO_ndft_sign_ ## which_one; \
189  MACRO_ndft_compute_ ## which_one; \
190  } \
191  } \
192  } \
193  else \
194  { \
195  /* multivariate case */ \
196  int j; \
197  for (j = 0; j < ths->M_total; j++) \
198  { \
199  R x[ths->d], omega, Omega[ths->d+1]; \
200  int t, t2, k_L, k[ths->d]; \
201  Omega[0] = K(0.0); \
202  MACRO_init_k_N_Omega_x(which_one); \
203  for(k_L = 0; k_L < ths->N_total; k_L++) \
204  { \
205  MACRO_ndft_compute_ ## which_one; \
206  MACRO_count_k_N_Omega; \
207  } \
208  } \
209  } \
210 }
211 
212 // macro expanded for OpenMP parallelization
213 //MACRO_ndft(trafo)
214 void nfft_trafo_direct (nfft_plan *ths)
215 {
216  C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f;
217 
218  memset(f,0,ths->M_total*sizeof(C));
219 
220  if (ths->d == 1)
221  {
222  /* specialize for univariate case, rationale: faster */
223  int j;
224  #pragma omp parallel for default(shared) private(j)
225  for (j = 0; j < ths->M_total; j++)
226  {
227  int k_L;
228  for(k_L = 0; k_L < ths->N_total; k_L++)
229  {
230  R omega = (k_L - ths->N_total/2) * K2PI * ths->x[j];
231  f[j] += f_hat[k_L]*cexp(-( 1.0iF)*omega);
232  }
233  }
234  }
235  else
236  {
237  /* multivariate case */
238  int j;
239  #pragma omp parallel for default(shared) private(j)
240  for (j = 0; j < ths->M_total; j++)
241  {
242  R x[ths->d], omega, Omega[ths->d+1];
243  int t, t2, k_L, k[ths->d];
244  Omega[0] = ((R) 0.0);
245  for (t = 0; t < ths->d; t++)
246  {
247  k[t] = -ths->N[t]/2;
248  x[t] = K2PI*ths->x[j*ths->d+t];
249  Omega[t+1] = k[t]*x[t] + Omega[t];
250  }
251  omega = Omega[ths->d];
252 
253  for(k_L = 0; k_L < ths->N_total; k_L++)
254  {
255  f[j] += f_hat[k_L]*cexp(-( 1.0iF)*omega);
256  {
257  for (t = ths->d-1; (t >= 1) && (k[t] == ths->N[t]/2-1); t--)
258  k[t]-= ths->N[t]-1;
259 
260  k[t]++;
261 
262  for (t2 = t; t2 < ths->d; t2++)
263  Omega[t2+1] = k[t2]*x[t2] + Omega[t2];
264 
265  omega = Omega[ths->d];
266  }
267  }
268  }
269  }
270 }
271 
272 // macro expanded for OpenMP parallelization since parallel calculation
273 // requires outer loop over frequencies and inner loop over nodes.
274 //MACRO_ndft(adjoint)
275 void nfft_adjoint_direct (nfft_plan *ths)
276 {
277  C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f;
278 
279  memset(f_hat,0,ths->N_total*sizeof(C));
280 
281  if (ths->d == 1)
282  {
283  /* specialize for univariate case, rationale: faster */
284 #ifdef _OPENMP
285  int k_L;
286  #pragma omp parallel for default(shared) private(k_L)
287  for(k_L = 0; k_L < ths->N_total; k_L++)
288  {
289  int j;
290  for (j = 0; j < ths->M_total; j++)
291  {
292  R omega = (k_L - (ths->N_total/2)) * K2PI * ths->x[j];
293  f_hat[k_L] += f[j]*cexp(+( 1.0iF)*omega);
294  }
295  }
296 #else
297  int j;
298  for (j = 0; j < ths->M_total; j++)
299  {
300  int k_L;
301  for(k_L = 0; k_L < ths->N_total; k_L++)
302  {
303  R omega = (k_L - (ths->N_total/2)) * K2PI * ths->x[j];
304  f_hat[k_L] += f[j]*cexp(+( 1.0iF)*omega);
305  }
306  }
307 #endif
308  }
309  else
310  {
311  /* multivariate case */
312  int j, k_L;
313 #ifdef _OPENMP
314  #pragma omp parallel for default(shared) private(j, k_L)
315  for(k_L = 0; k_L < ths->N_total; k_L++)
316  {
317  int k[ths->d], k_temp, t;
318 
319  k_temp = k_L;
320 
321  for (t = ths->d-1; t >= 0; t--)
322  {
323  k[t] = k_temp % ths->N[t] - ths->N[t]/2;
324  k_temp /= ths->N[t];
325  }
326 
327  for (j = 0; j < ths->M_total; j++)
328  {
329  R omega = 0.0;
330  for (t = 0; t < ths->d; t++)
331  omega += k[t] * K2PI * ths->x[j*ths->d+t];
332  f_hat[k_L] += f[j]*cexp(+( 1.0iF)*omega);
333  }
334  }
335 #else
336  for (j = 0; j < ths->M_total; j++)
337  {
338  R x[ths->d], omega, Omega[ths->d+1];
339  int t, t2, k[ths->d];
340  Omega[0] = ((R) 0.0);
341  for (t = 0; t < ths->d; t++)
342  {
343  k[t] = -ths->N[t]/2;
344  x[t] = K2PI * ths->x[j*ths->d+t];
345  Omega[t+1] = k[t]*x[t] + Omega[t];
346  }
347  omega = Omega[ths->d];
348  for(k_L = 0; k_L < ths->N_total; k_L++)
349  {
350  f_hat[k_L] += f[j]*cexp(+( 1.0iF)*omega);
351 
352  for (t = ths->d-1; (t >= 1) && (k[t] == ths->N[t]/2-1); t--)
353  k[t]-= ths->N[t]-1;
354 
355  k[t]++;
356 
357  for (t2 = t; t2 < ths->d; t2++)
358  Omega[t2+1] = k[t2]*x[t2] + Omega[t2];
359 
360  omega = Omega[ths->d];
361  }
362  }
363 #endif
364  }
365 }
366 
392 static inline void nfft_uo(const nfft_plan *ths, const int j, int *up, int *op,
393  const int act_dim)
394 {
395  const R xj = ths->x[j * ths->d + act_dim];
396  int c = LRINT(FLOOR(xj * ths->n[act_dim]));
397 
398  (*up) = c - (ths->m);
399  (*op) = c + 1 + (ths->m);
400 }
401 
402 static inline void nfft_uo2(int *u, int *o, const R x, const int n, const int m)
403 {
404  int c = LRINT(FLOOR(x * n));
405 
406  *u = (c - m + n) % n;
407  *o = (c + 1 + m + n) % n;
408 }
409 
410 #define MACRO_nfft_D_compute_A \
411 { \
412  g_hat[k_plain[ths->d]] = f_hat[ks_plain[ths->d]] * c_phi_inv_k[ths->d]; \
413 }
414 
415 #define MACRO_nfft_D_compute_T \
416 { \
417  f_hat[ks_plain[ths->d]] = g_hat[k_plain[ths->d]] * c_phi_inv_k[ths->d]; \
418 }
419 
420 #define MACRO_nfft_D_init_result_A memset(g_hat,0,ths->n_total*sizeof(C));
421 
422 #define MACRO_nfft_D_init_result_T memset(f_hat,0,ths->N_total*sizeof(C));
423 
424 #define MACRO_with_PRE_PHI_HUT * ths->c_phi_inv[t2][ks[t2]];
425 
426 #define MACRO_without_PRE_PHI_HUT / (PHI_HUT(ks[t2]-(ths->N[t2]/2),t2));
427 
428 #define MACRO_init_k_ks \
429 { \
430  for (t = ths->d-1; 0 <= t; t--) \
431  { \
432  kp[t] = k[t] = 0; \
433  ks[t] = ths->N[t]/2; \
434  } \
435  t++; \
436 }
437 
438 #define MACRO_update_c_phi_inv_k(which_one) \
439 { \
440  for (t2 = t; t2 < ths->d; t2++) \
441  { \
442  c_phi_inv_k[t2+1] = c_phi_inv_k[t2] MACRO_ ##which_one; \
443  ks_plain[t2+1] = ks_plain[t2]*ths->N[t2] + ks[t2]; \
444  k_plain[t2+1] = k_plain[t2]*ths->n[t2] + k[t2]; \
445  } \
446 }
447 
448 #define MACRO_count_k_ks \
449 { \
450  for (t = ths->d-1; (t > 0) && (kp[t] == ths->N[t]-1); t--) \
451  { \
452  kp[t] = k[t] = 0; \
453  ks[t]= ths->N[t]/2; \
454  } \
455  \
456  kp[t]++; k[t]++; ks[t]++; \
457  if(kp[t] == ths->N[t]/2) \
458  { \
459  k[t] = ths->n[t] - ths->N[t]/2; \
460  ks[t] = 0; \
461  } \
462 } \
463 
464 
468 #define MACRO_nfft_D(which_one) \
469 static inline void nfft_D_serial_ ## which_one (nfft_plan *ths) \
470 { \
471  C *f_hat, *g_hat; \
472  R c_phi_inv_k[ths->d+1]; \
473  int t, t2; \
474  int k_L; \
475  int kp[ths->d]; \
476  int k[ths->d]; \
477  int ks[ths->d]; \
478  int k_plain[ths->d+1]; \
479  int ks_plain[ths->d+1]; \
480  \
481  f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat; \
482  MACRO_nfft_D_init_result_ ## which_one; \
483  \
484  c_phi_inv_k[0] = K(1.0); \
485  k_plain[0] = 0; \
486  ks_plain[0] = 0; \
487  \
488  if (ths->nfft_flags & PRE_PHI_HUT) \
489  { \
490  MACRO_init_k_ks; \
491  \
492  for (k_L = 0; k_L < ths->N_total; k_L++) \
493  { \
494  MACRO_update_c_phi_inv_k(with_PRE_PHI_HUT); \
495  MACRO_nfft_D_compute_ ## which_one; \
496  MACRO_count_k_ks; \
497  } /* for(k_L) */ \
498  } /* if(PRE_PHI_HUT) */ \
499  else \
500  { \
501  MACRO_init_k_ks; \
502  for (k_L = 0; k_L < ths->N_total; k_L++) \
503  { \
504  MACRO_update_c_phi_inv_k(without_PRE_PHI_HUT); \
505  MACRO_nfft_D_compute_ ## which_one; \
506  MACRO_count_k_ks; \
507  } /* for(k_L) */ \
508  } /* else(PRE_PHI_HUT) */ \
509 } /* nfft_D */
510 
511 #ifdef _OPENMP
512 static void nfft_D_openmp_A(nfft_plan *ths)
513 {
514  C *f_hat, *g_hat;
515  int k_L;
517  f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat;
518  memset(g_hat,0,ths->n_total*sizeof(C));
519 
520  if (ths->nfft_flags & PRE_PHI_HUT)
521  {
522  #pragma omp parallel for default(shared) private(k_L)
523  for (k_L = 0; k_L < ths->N_total; k_L++)
524  {
525  int kp[ths->d]; //0..N-1
526  int k[ths->d];
527  int ks[ths->d];
528  R c_phi_inv_k_val = K(1.0);
529  int k_plain_val = 0;
530  int ks_plain_val = 0;
531  int t;
532  int k_temp = k_L;
533 
534  for (t = ths->d-1; t >= 0; t--)
535  {
536  kp[t] = k_temp % ths->N[t];
537  if (kp[t] >= ths->N[t]/2)
538  k[t] = ths->n[t] - ths->N[t] + kp[t];
539  else
540  k[t] = kp[t];
541  ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
542  k_temp /= ths->N[t];
543  }
544 
545  for (t = 0; t < ths->d; t++)
546  {
547  c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]];
548  ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
549  k_plain_val = k_plain_val*ths->n[t] + k[t];
550  }
551 
552  g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
553  } /* for(k_L) */
554  } /* if(PRE_PHI_HUT) */
555  else
556  {
557  #pragma omp parallel for default(shared) private(k_L)
558  for (k_L = 0; k_L < ths->N_total; k_L++)
559  {
560  int kp[ths->d]; //0..N-1
561  int k[ths->d];
562  int ks[ths->d];
563  R c_phi_inv_k_val = K(1.0);
564  int k_plain_val = 0;
565  int ks_plain_val = 0;
566  int t;
567  int k_temp = k_L;
568 
569  for (t = ths->d-1; t >= 0; t--)
570  {
571  kp[t] = k_temp % ths->N[t];
572  if (kp[t] >= ths->N[t]/2)
573  k[t] = ths->n[t] - ths->N[t] + kp[t];
574  else
575  k[t] = kp[t];
576  ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
577  k_temp /= ths->N[t];
578  }
579 
580  for (t = 0; t < ths->d; t++)
581  {
582  c_phi_inv_k_val /= (PHI_HUT(ks[t]-(ths->N[t]/2),t));
583  ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
584  k_plain_val = k_plain_val*ths->n[t] + k[t];
585  }
586 
587  g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
588  } /* for(k_L) */
589  } /* else(PRE_PHI_HUT) */
590 }
591 #endif
592 
593 #ifndef _OPENMP
594 MACRO_nfft_D(A)
595 #endif
596 
597 static void nfft_D_A(nfft_plan *ths)
598 {
599 #ifdef _OPENMP
600  nfft_D_openmp_A(ths);
601 #else
602  nfft_D_serial_A(ths);
603 #endif
604 }
605 
606 #ifdef _OPENMP
607 static void nfft_D_openmp_T(nfft_plan *ths)
608 {
609  C *f_hat, *g_hat;
610  int k_L;
612  f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat;
613  memset(f_hat,0,ths->N_total*sizeof(C));
614 
615  if (ths->nfft_flags & PRE_PHI_HUT)
616  {
617  #pragma omp parallel for default(shared) private(k_L)
618  for (k_L = 0; k_L < ths->N_total; k_L++)
619  {
620  int kp[ths->d]; //0..N-1
621  int k[ths->d];
622  int ks[ths->d];
623  R c_phi_inv_k_val = K(1.0);
624  int k_plain_val = 0;
625  int ks_plain_val = 0;
626  int t;
627  int k_temp = k_L;
628 
629  for (t = ths->d-1; t >= 0; t--)
630  {
631  kp[t] = k_temp % ths->N[t];
632  if (kp[t] >= ths->N[t]/2)
633  k[t] = ths->n[t] - ths->N[t] + kp[t];
634  else
635  k[t] = kp[t];
636  ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
637  k_temp /= ths->N[t];
638  }
639 
640  for (t = 0; t < ths->d; t++)
641  {
642  c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]];
643  ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
644  k_plain_val = k_plain_val*ths->n[t] + k[t];
645  }
646 
647  f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
648  } /* for(k_L) */
649  } /* if(PRE_PHI_HUT) */
650  else
651  {
652  #pragma omp parallel for default(shared) private(k_L)
653  for (k_L = 0; k_L < ths->N_total; k_L++)
654  {
655  int kp[ths->d]; //0..N-1
656  int k[ths->d];
657  int ks[ths->d];
658  R c_phi_inv_k_val = K(1.0);
659  int k_plain_val = 0;
660  int ks_plain_val = 0;
661  int t;
662  int k_temp = k_L;
663 
664  for (t = ths->d-1; t >= 0; t--)
665  {
666  kp[t] = k_temp % ths->N[t];
667  if (kp[t] >= ths->N[t]/2)
668  k[t] = ths->n[t] - ths->N[t] + kp[t];
669  else
670  k[t] = kp[t];
671  ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
672  k_temp /= ths->N[t];
673  }
674 
675  for (t = 0; t < ths->d; t++)
676  {
677  c_phi_inv_k_val /= (PHI_HUT(ks[t]-(ths->N[t]/2),t));
678  ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
679  k_plain_val = k_plain_val*ths->n[t] + k[t];
680  }
681 
682  f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
683  } /* for(k_L) */
684  } /* else(PRE_PHI_HUT) */
685 }
686 #endif
687 
688 #ifndef _OPENMP
689 MACRO_nfft_D(T)
690 #endif
691 
692 static void nfft_D_T(nfft_plan *ths)
693 {
694 #ifdef _OPENMP
695  nfft_D_openmp_T(ths);
696 #else
697  nfft_D_serial_T(ths);
698 #endif
699 }
700 
704 #define MACRO_nfft_B_init_result_A memset(f,0,ths->M_total*sizeof(C));
705 #define MACRO_nfft_B_init_result_T memset(g,0,ths->n_total*sizeof(C));
706 
707 #define MACRO_nfft_B_PRE_FULL_PSI_compute_A \
708 { \
709  (*fj) += ths->psi[ix] * g[ths->psi_index_g[ix]]; \
710 }
711 
712 #define MACRO_nfft_B_PRE_FULL_PSI_compute_T \
713 { \
714  g[ths->psi_index_g[ix]] += ths->psi[ix] * (*fj); \
715 }
716 
717 #define MACRO_nfft_B_compute_A \
718 { \
719  (*fj) += phi_prod[ths->d] * g[ll_plain[ths->d]]; \
720 }
721 
722 #define MACRO_nfft_B_compute_T \
723 { \
724  g[ll_plain[ths->d]] += phi_prod[ths->d] * (*fj); \
725 }
726 
727 #define MACRO_with_FG_PSI fg_psi[t2][lj[t2]]
728 
729 #define MACRO_with_PRE_PSI ths->psi[(j*ths->d+t2) * (2*ths->m+2)+lj[t2]]
730 
731 #define MACRO_without_PRE_PSI PHI(ths->x[j*ths->d+t2] \
732  - ((R)l[t2])/((R)ths->n[t2]), t2)
733 
734 #define MACRO_init_uo_l_lj_t \
735 { \
736  for (t = ths->d-1; t >= 0; t--) \
737  { \
738  nfft_uo(ths,j,&u[t],&o[t],t); \
739  l[t] = u[t]; \
740  lj[t] = 0; \
741  } /* for(t) */ \
742  t++; \
743 }
744 
745 #define MACRO_update_phi_prod_ll_plain(which_one) { \
746  for(t2=t; t2<ths->d; t2++) \
747  { \
748  phi_prod[t2+1] = phi_prod[t2]* MACRO_ ## which_one; \
749  ll_plain[t2+1] = ll_plain[t2]*ths->n[t2] +(l[t2]+ths->n[t2])%ths->n[t2];\
750  } /* for(t2) */ \
751 }
752 
753 #define MACRO_count_uo_l_lj_t { \
754  for(t = ths->d-1; (t > 0) && (l[t] == o[t]); t--) \
755  { \
756  l[t] = u[t]; \
757  lj[t] = 0; \
758  } /* for(t) */ \
759  \
760  l[t]++; \
761  lj[t]++; \
762 }
763 
764 #define MACRO_nfft_B(which_one) \
765 static inline void nfft_B_serial_ ## which_one (nfft_plan *ths) \
766 { \
767  int lprod; /* 'regular bandwidth' of matrix B */ \
768  int u[ths->d], o[ths->d]; /* multi band with respect to x_j */ \
769  int t, t2; /* index dimensions */ \
770  int j; /* index nodes */ \
771  int l_L, ix; /* index one row of B */ \
772  int l[ths->d]; /* multi index u<=l<=o */ \
773  int lj[ths->d]; /* multi index 0<=lj<u+o+1 */ \
774  int ll_plain[ths->d+1]; /* postfix plain index in g */ \
775  R phi_prod[ths->d+1]; /* postfix product of PHI */ \
776  C *f, *g; /* local copy */ \
777  C *fj; /* local copy */ \
778  R y[ths->d]; \
779  R fg_psi[ths->d][2*ths->m+2]; \
780  R fg_exp_l[ths->d][2*ths->m+2]; \
781  int l_fg,lj_fg; \
782  R tmpEXP1, tmpEXP2, tmpEXP2sq, tmp1, tmp2, tmp3; \
783  R ip_w; \
784  int ip_u; \
785  int ip_s = ths->K/(ths->m+2); \
786  \
787  f = (C*)ths->f; g = (C*)ths->g; \
788  \
789  MACRO_nfft_B_init_result_ ## which_one; \
790  \
791  if (ths->nfft_flags & PRE_FULL_PSI) \
792  { \
793  for (ix = 0, j = 0, fj = f; j < ths->M_total; j++, fj++) \
794  for (l_L = 0; l_L < ths->psi_index_f[j]; l_L++, ix++) \
795  MACRO_nfft_B_PRE_FULL_PSI_compute_ ## which_one; \
796  return; \
797  } \
798  \
799  phi_prod[0] = K(1.0); \
800  ll_plain[0] = 0; \
801  \
802  for (t = 0, lprod = 1; t < ths->d; t++) \
803  lprod *= (2*ths->m+2); \
804  \
805  if (ths->nfft_flags & PRE_PSI) \
806  { \
807  for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
808  { \
809  MACRO_init_uo_l_lj_t; \
810  \
811  for (l_L = 0; l_L < lprod; l_L++) \
812  { \
813  MACRO_update_phi_prod_ll_plain(with_PRE_PSI); \
814  \
815  MACRO_nfft_B_compute_ ## which_one; \
816  \
817  MACRO_count_uo_l_lj_t; \
818  } /* for(l_L) */ \
819  } /* for(j) */ \
820  return; \
821  } /* if(PRE_PSI) */ \
822  \
823  if (ths->nfft_flags & PRE_FG_PSI) \
824  { \
825  for(t2 = 0; t2 < ths->d; t2++) \
826  { \
827  tmpEXP2 = EXP(K(-1.0)/ths->b[t2]); \
828  tmpEXP2sq = tmpEXP2*tmpEXP2; \
829  tmp2 = K(1.0); \
830  tmp3 = K(1.0); \
831  fg_exp_l[t2][0] = K(1.0); \
832  for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++) \
833  { \
834  tmp3 = tmp2*tmpEXP2; \
835  tmp2 *= tmpEXP2sq; \
836  fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3; \
837  } \
838  } \
839  for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
840  { \
841  MACRO_init_uo_l_lj_t; \
842  \
843  for (t2 = 0; t2 < ths->d; t2++) \
844  { \
845  fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)]; \
846  tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1]; \
847  tmp1 = K(1.0); \
848  for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
849  { \
850  tmp1 *= tmpEXP1; \
851  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
852  } \
853  } \
854  \
855  for (l_L= 0; l_L < lprod; l_L++) \
856  { \
857  MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
858  \
859  MACRO_nfft_B_compute_ ## which_one; \
860  \
861  MACRO_count_uo_l_lj_t; \
862  } /* for(l_L) */ \
863  } /* for(j) */ \
864  return; \
865  } /* if(PRE_FG_PSI) */ \
866  \
867  if (ths->nfft_flags & FG_PSI) \
868  { \
869  for (t2 = 0; t2 < ths->d; t2++) \
870  { \
871  tmpEXP2 = EXP(K(-1.0)/ths->b[t2]); \
872  tmpEXP2sq = tmpEXP2*tmpEXP2; \
873  tmp2 = K(1.0); \
874  tmp3 = K(1.0); \
875  fg_exp_l[t2][0] = K(1.0); \
876  for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++) \
877  { \
878  tmp3 = tmp2*tmpEXP2; \
879  tmp2 *= tmpEXP2sq; \
880  fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3; \
881  } \
882  } \
883  for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
884  { \
885  MACRO_init_uo_l_lj_t; \
886  \
887  for (t2 = 0; t2 < ths->d; t2++) \
888  { \
889  fg_psi[t2][0] = (PHI((ths->x[j*ths->d+t2]-((R)u[t2])/ths->n[t2]),t2));\
890  \
891  tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2]) \
892  /ths->b[t2]); \
893  tmp1 = K(1.0); \
894  for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
895  { \
896  tmp1 *= tmpEXP1; \
897  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
898  } \
899  } \
900  \
901  for (l_L = 0; l_L < lprod; l_L++) \
902  { \
903  MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
904  \
905  MACRO_nfft_B_compute_ ## which_one; \
906  \
907  MACRO_count_uo_l_lj_t; \
908  } /* for(l_L) */ \
909  } /* for(j) */ \
910  return; \
911  } /* if(FG_PSI) */ \
912  \
913  if (ths->nfft_flags & PRE_LIN_PSI) \
914  { \
915  for (j = 0, fj=f; j<ths->M_total; j++, fj++) \
916  { \
917  MACRO_init_uo_l_lj_t; \
918  \
919  for (t2 = 0; t2 < ths->d; t2++) \
920  { \
921  y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2]) \
922  * ((R)ths->K))/(ths->m+2); \
923  ip_u = LRINT(FLOOR(y[t2])); \
924  ip_w = y[t2]-ip_u; \
925  for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++) \
926  { \
927  fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)] \
928  * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)] \
929  * (ip_w); \
930  } \
931  } \
932  \
933  for (l_L = 0; l_L < lprod; l_L++) \
934  { \
935  MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
936  \
937  MACRO_nfft_B_compute_ ## which_one; \
938  \
939  MACRO_count_uo_l_lj_t; \
940  } /* for(l_L) */ \
941  } /* for(j) */ \
942  return; \
943  } /* if(PRE_LIN_PSI) */ \
944  \
945  /* no precomputed psi at all */ \
946  for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
947  { \
948  MACRO_init_uo_l_lj_t; \
949  \
950  for (l_L = 0; l_L < lprod; l_L++) \
951  { \
952  MACRO_update_phi_prod_ll_plain(without_PRE_PSI); \
953  \
954  MACRO_nfft_B_compute_ ## which_one; \
955  \
956  MACRO_count_uo_l_lj_t; \
957  } /* for(l_L) */ \
958  } /* for(j) */ \
959 } /* nfft_B */ \
960 
961 #ifndef _OPENMP
962 MACRO_nfft_B(A)
963 #endif
964 
965 #ifdef _OPENMP
966 static inline void nfft_B_openmp_A (nfft_plan *ths)
967 {
968  int lprod; /* 'regular bandwidth' of matrix B */
969  int k;
970 
971  memset(ths->f,0,ths->M_total*sizeof(C));
972 
973  for (k = 0, lprod = 1; k < ths->d; k++)
974  lprod *= (2*ths->m+2);
975 
976  if (ths->nfft_flags & PRE_FULL_PSI)
977  {
978  #pragma omp parallel for default(shared) private(k)
979  for (k = 0; k < ths->M_total; k++)
980  {
981  int l;
982  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
983  ths->f[j] = K(0.0);
984  for (l = 0; l < lprod; l++)
985  ths->f[j] += ths->psi[j*lprod+l] * ths->g[ths->psi_index_g[j*lprod+l]];
986  }
987  return;
988  }
989 
990  if (ths->nfft_flags & PRE_PSI)
991  {
992  #pragma omp parallel for default(shared) private(k)
993  for (k = 0; k < ths->M_total; k++)
994  {
995  int u[ths->d], o[ths->d]; /* multi band with respect to x_j */
996  int t, t2; /* index dimensions */
997  int l_L; /* index one row of B */
998  int l[ths->d]; /* multi index u<=l<=o */
999  int lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1000  int ll_plain[ths->d+1]; /* postfix plain index in g */
1001  R phi_prod[ths->d+1]; /* postfix product of PHI */
1002  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1003 
1004  phi_prod[0] = K(1.0);
1005  ll_plain[0] = 0;
1006 
1007  MACRO_init_uo_l_lj_t;
1008 
1009  for (l_L = 0; l_L < lprod; l_L++)
1010  {
1011  MACRO_update_phi_prod_ll_plain(with_PRE_PSI);
1012 
1013  ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1014 
1015  MACRO_count_uo_l_lj_t;
1016  } /* for(l_L) */
1017  } /* for(j) */
1018  return;
1019  } /* if(PRE_PSI) */
1020 
1021  if (ths->nfft_flags & PRE_FG_PSI)
1022  {
1023  int t, t2; /* index dimensions */
1024  R fg_exp_l[ths->d][2*ths->m+2];
1025 
1026  for(t2 = 0; t2 < ths->d; t2++)
1027  {
1028  int lj_fg;
1029  R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
1030  R tmpEXP2sq = tmpEXP2*tmpEXP2;
1031  R tmp2 = K(1.0);
1032  R tmp3 = K(1.0);
1033  fg_exp_l[t2][0] = K(1.0);
1034  for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
1035  {
1036  tmp3 = tmp2*tmpEXP2;
1037  tmp2 *= tmpEXP2sq;
1038  fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
1039  }
1040  }
1041 
1042  #pragma omp parallel for default(shared) private(k,t,t2)
1043  for (k = 0; k < ths->M_total; k++)
1044  {
1045  int ll_plain[ths->d+1]; /* postfix plain index in g */
1046  R phi_prod[ths->d+1]; /* postfix product of PHI */
1047  int u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1048  int l[ths->d]; /* multi index u<=l<=o */
1049  int lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1050  R fg_psi[ths->d][2*ths->m+2];
1051  R tmpEXP1, tmp1;
1052  int l_fg,lj_fg;
1053  int l_L;
1054  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1055 
1056  phi_prod[0] = K(1.0);
1057  ll_plain[0] = 0;
1058 
1059  MACRO_init_uo_l_lj_t;
1060 
1061  for (t2 = 0; t2 < ths->d; t2++)
1062  {
1063  fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)];
1064  tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1];
1065  tmp1 = K(1.0);
1066  for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1067  {
1068  tmp1 *= tmpEXP1;
1069  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1070  }
1071  }
1072 
1073  for (l_L= 0; l_L < lprod; l_L++)
1074  {
1075  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1076 
1077  ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1078 
1079  MACRO_count_uo_l_lj_t;
1080  } /* for(l_L) */
1081  } /* for(j) */
1082  return;
1083  } /* if(PRE_FG_PSI) */
1084 
1085  if (ths->nfft_flags & FG_PSI)
1086  {
1087  int t, t2; /* index dimensions */
1088  R fg_exp_l[ths->d][2*ths->m+2];
1089 
1090  nfft_sort_nodes(ths);
1091 
1092  for (t2 = 0; t2 < ths->d; t2++)
1093  {
1094  int lj_fg;
1095  R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
1096  R tmpEXP2sq = tmpEXP2*tmpEXP2;
1097  R tmp2 = K(1.0);
1098  R tmp3 = K(1.0);
1099  fg_exp_l[t2][0] = K(1.0);
1100  for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
1101  {
1102  tmp3 = tmp2*tmpEXP2;
1103  tmp2 *= tmpEXP2sq;
1104  fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
1105  }
1106  }
1107 
1108  #pragma omp parallel for default(shared) private(k,t,t2)
1109  for (k = 0; k < ths->M_total; k++)
1110  {
1111  int ll_plain[ths->d+1]; /* postfix plain index in g */
1112  R phi_prod[ths->d+1]; /* postfix product of PHI */
1113  int u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1114  int l[ths->d]; /* multi index u<=l<=o */
1115  int lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1116  R fg_psi[ths->d][2*ths->m+2];
1117  R tmpEXP1, tmp1;
1118  int l_fg,lj_fg;
1119  int l_L;
1120  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1121 
1122  phi_prod[0] = K(1.0);
1123  ll_plain[0] = 0;
1124 
1125  MACRO_init_uo_l_lj_t;
1126 
1127  for (t2 = 0; t2 < ths->d; t2++)
1128  {
1129  fg_psi[t2][0] = (PHI((ths->x[j*ths->d+t2]-((R)u[t2])/ths->n[t2]),t2));
1130 
1131  tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2])
1132  /ths->b[t2]);
1133  tmp1 = K(1.0);
1134  for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1135  {
1136  tmp1 *= tmpEXP1;
1137  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1138  }
1139  }
1140 
1141  for (l_L = 0; l_L < lprod; l_L++)
1142  {
1143  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1144 
1145  ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1146 
1147  MACRO_count_uo_l_lj_t;
1148  } /* for(l_L) */
1149  } /* for(j) */
1150  return;
1151  } /* if(FG_PSI) */
1152 
1153  if (ths->nfft_flags & PRE_LIN_PSI)
1154  {
1155  nfft_sort_nodes(ths);
1156 
1157  #pragma omp parallel for default(shared) private(k)
1158  for (k = 0; k<ths->M_total; k++)
1159  {
1160  int u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1161  int t, t2; /* index dimensions */
1162  int l_L; /* index one row of B */
1163  int l[ths->d]; /* multi index u<=l<=o */
1164  int lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1165  int ll_plain[ths->d+1]; /* postfix plain index in g */
1166  R phi_prod[ths->d+1]; /* postfix product of PHI */
1167  R y[ths->d];
1168  R fg_psi[ths->d][2*ths->m+2];
1169  int l_fg,lj_fg;
1170  R ip_w;
1171  int ip_u;
1172  int ip_s = ths->K/(ths->m+2);
1173  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1174 
1175  phi_prod[0] = K(1.0);
1176  ll_plain[0] = 0;
1177 
1178  MACRO_init_uo_l_lj_t;
1179 
1180  for (t2 = 0; t2 < ths->d; t2++)
1181  {
1182  y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2])
1183  * ((R)ths->K))/(ths->m+2);
1184  ip_u = LRINT(FLOOR(y[t2]));
1185  ip_w = y[t2]-ip_u;
1186  for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++)
1187  {
1188  fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)]
1189  * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)]
1190  * (ip_w);
1191  }
1192  }
1193 
1194  for (l_L = 0; l_L < lprod; l_L++)
1195  {
1196  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1197 
1198  ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1199 
1200  MACRO_count_uo_l_lj_t;
1201  } /* for(l_L) */
1202  } /* for(j) */
1203  return;
1204  } /* if(PRE_LIN_PSI) */
1205 
1206  /* no precomputed psi at all */
1207  nfft_sort_nodes(ths);
1208 
1209  #pragma omp parallel for default(shared) private(k)
1210  for (k = 0; k < ths->M_total; k++)
1211  {
1212  int u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1213  int t, t2; /* index dimensions */
1214  int l_L; /* index one row of B */
1215  int l[ths->d]; /* multi index u<=l<=o */
1216  int lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1217  int ll_plain[ths->d+1]; /* postfix plain index in g */
1218  R phi_prod[ths->d+1]; /* postfix product of PHI */
1219  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1220 
1221  phi_prod[0] = K(1.0);
1222  ll_plain[0] = 0;
1223 
1224  MACRO_init_uo_l_lj_t;
1225 
1226  for (l_L = 0; l_L < lprod; l_L++)
1227  {
1228  MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
1229 
1230  ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1231 
1232  MACRO_count_uo_l_lj_t;
1233  } /* for(l_L) */
1234  } /* for(j) */
1235 }
1236 #endif
1237 
1238 static void nfft_B_A(nfft_plan *ths)
1239 {
1240 #ifdef _OPENMP
1241  nfft_B_openmp_A(ths);
1242 #else
1243  nfft_B_serial_A(ths);
1244 #endif
1245 }
1246 
1247 #ifdef _OPENMP
1248 
1263 static inline int index_x_binary_search(const int *ar_x, const int len, const int key)
1264 {
1265  int left = 0, right = len - 1;
1266 
1267  if (len == 1)
1268  return 0;
1269 
1270  while (left < right - 1)
1271  {
1272  int i = (left + right) / 2;
1273  if (ar_x[2*i] >= key)
1274  right = i;
1275  else if (ar_x[2*i] < key)
1276  left = i;
1277  }
1278 
1279  if (ar_x[2*left] < key && left != len-1)
1280  return left+1;
1281 
1282  return left;
1283 }
1284 #endif
1285 
1286 #ifdef _OPENMP
1287 
1302 static void nfft_adjoint_B_omp_blockwise_init(int *my_u0, int *my_o0, int *min_u_a, int *max_u_a, int *min_u_b, int *max_u_b, const int d, const int *n, const int m)
1303 {
1304  const int n0 = n[0];
1305  int k;
1306  int nthreads = omp_get_num_threads();
1307  int nthreads_used = MIN(nthreads, n0);
1308  int size_per_thread = n0 / nthreads_used;
1309  int size_left = n0 - size_per_thread * nthreads_used;
1310  int size_g[nthreads_used];
1311  int offset_g[nthreads_used];
1312  int my_id = omp_get_thread_num();
1313  int n_prod_rest = 1;
1314 
1315  for (k = 1; k < d; k++)
1316  n_prod_rest *= n[k];
1317 
1318  *min_u_a = -1;
1319  *max_u_a = -1;
1320  *min_u_b = -1;
1321  *max_u_b = -1;
1322  *my_u0 = -1;
1323  *my_o0 = -1;
1324 
1325  if (my_id < nthreads_used)
1326  {
1327  const int m22 = 2 * m + 2;
1328 
1329  offset_g[0] = 0;
1330  for (k = 0; k < nthreads_used; k++)
1331  {
1332  if (k > 0)
1333  offset_g[k] = offset_g[k-1] + size_g[k-1];
1334  size_g[k] = size_per_thread;
1335  if (size_left > 0)
1336  {
1337  size_g[k]++;
1338  size_left--;
1339  }
1340  }
1341 
1342  *my_u0 = offset_g[my_id];
1343  *my_o0 = offset_g[my_id] + size_g[my_id] - 1;
1344 
1345  if (nthreads_used > 1)
1346  {
1347  *max_u_a = n_prod_rest*(offset_g[my_id] + size_g[my_id]) - 1;
1348  *min_u_a = n_prod_rest*(offset_g[my_id] - m22 + 1);
1349  }
1350  else
1351  {
1352  *min_u_a = 0;
1353  *max_u_a = n_prod_rest * n0 - 1;
1354  }
1355 
1356  if (*min_u_a < 0)
1357  {
1358  *min_u_b = n_prod_rest * (offset_g[my_id] - m22 + 1 + n0);
1359  *max_u_b = n_prod_rest * n0 - 1;
1360  *min_u_a = 0;
1361  }
1362 
1363  if (*min_u_b != -1 && *min_u_b <= *max_u_a)
1364  {
1365  *max_u_a = *max_u_b;
1366  *min_u_b = -1;
1367  *max_u_b = -1;
1368  }
1369 #ifdef OMP_ASSERT
1370  assert(*min_u_a <= *max_u_a);
1371  assert(*min_u_b <= *max_u_b);
1372  assert(*min_u_b == -1 || *max_u_a < *min_u_b);
1373 #endif
1374  }
1375 }
1376 #endif
1377 
1386 static void nfft_adjoint_B_compute_full_psi(
1387  C *g, const int *psi_index_g, const R *psi, const C *f,
1388  const int M, const int d, const int *n, const int m, const int nfft_flags, const int *index_x)
1389 {
1390  int k;
1391  int lprod, lprod_m1;
1392  {
1393  int t;
1394  for(t = 0, lprod = 1; t < d; t++)
1395  lprod *= 2 * m + 2;
1396  }
1397  lprod_m1 = lprod / (2 * m + 2);
1398 
1399 #ifdef _OPENMP
1400  if (nfft_flags & NFFT_OMP_BLOCKWISE_ADJOINT)
1401  {
1402  #pragma omp parallel private(k)
1403  {
1404  int my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b;
1405  const int *ar_x = index_x;
1406  int n_prod_rest = 1;
1407 
1408  for (k = 1; k < d; k++)
1409  n_prod_rest *= n[k];
1410 
1411  nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, &min_u_b, &max_u_b, d, n, m);
1412 
1413  if (min_u_a != -1)
1414  {
1415  k = index_x_binary_search(ar_x, M, min_u_a);
1416 #ifdef OMP_ASSERT
1417  assert(ar_x[2*k] >= min_u_a || k == M-1);
1418  if (k > 0)
1419  assert(ar_x[2*k-2] < min_u_a);
1420 #endif
1421  while (k < M)
1422  {
1423  int l0, lrest;
1424  int u_prod = ar_x[2*k];
1425  int j = ar_x[2*k+1];
1426 
1427  if (u_prod < min_u_a || u_prod > max_u_a)
1428  break;
1429 
1430  for (l0 = 0; l0 < 2 * m + 2; l0++)
1431  {
1432  const int start_index = psi_index_g[j * lprod + l0 * lprod_m1];
1433 
1434  if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
1435  continue;
1436 
1437  for (lrest = 0; lrest < lprod_m1; lrest++)
1438  {
1439  const int l = l0 * lprod_m1 + lrest;
1440  g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1441  }
1442  }
1443 
1444  k++;
1445  }
1446  }
1447 
1448  if (min_u_b != -1)
1449  {
1450  k = index_x_binary_search(ar_x, M, min_u_b);
1451 #ifdef OMP_ASSERT
1452  assert(ar_x[2*k] >= min_u_b || k == M-1);
1453  if (k > 0)
1454  assert(ar_x[2*k-2] < min_u_b);
1455 #endif
1456  while (k < M)
1457  {
1458  int l0, lrest;
1459  int u_prod = ar_x[2*k];
1460  int j = ar_x[2*k+1];
1461 
1462  if (u_prod < min_u_b || u_prod > max_u_b)
1463  break;
1464 
1465  for (l0 = 0; l0 < 2 * m + 2; l0++)
1466  {
1467  const int start_index = psi_index_g[j * lprod + l0 * lprod_m1];
1468 
1469  if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
1470  continue;
1471  for (lrest = 0; lrest < lprod_m1; lrest++)
1472  {
1473  const int l = l0 * lprod_m1 + lrest;
1474  g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1475  }
1476  }
1477 
1478  k++;
1479  }
1480  }
1481  } /* omp parallel */
1482  return;
1483  } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */
1484 #endif
1485 
1486  #pragma omp parallel for default(shared) private(k)
1487  for (k = 0; k < M; k++)
1488  {
1489  int l;
1490  int j = (nfft_flags & NFFT_SORT_NODES) ? index_x[2*k+1] : k;
1491 
1492  for (l = 0; l < lprod; l++)
1493  {
1494 #ifdef _OPENMP
1495  C val = psi[j * lprod + l] * f[j];
1496  C *gref = g + psi_index_g[j * lprod + l];
1497  R *gref_real = (R*) gref;
1498 
1499  #pragma omp atomic
1500  gref_real[0] += creal(val);
1501 
1502  #pragma omp atomic
1503  gref_real[1] += cimag(val);
1504 #else
1505  g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1506 #endif
1507  }
1508  }
1509 }
1510 
1511 #ifndef _OPENMP
1512 MACRO_nfft_B(T)
1513 #endif
1514 
1515 #ifdef _OPENMP
1516 static inline void nfft_B_openmp_T(nfft_plan *ths)
1517 {
1518  int lprod; /* 'regular bandwidth' of matrix B */
1519  int k;
1520 
1521  memset(ths->g,0,ths->n_total*sizeof(C));
1522 
1523  for (k = 0, lprod = 1; k < ths->d; k++)
1524  lprod *= (2*ths->m+2);
1525 
1526  if (ths->nfft_flags & PRE_FULL_PSI)
1527  {
1528  nfft_adjoint_B_compute_full_psi(ths->g, ths->psi_index_g, ths->psi, ths->f, ths->M_total,
1529  ths->d, ths->n, ths->m, ths->nfft_flags, ths->index_x);
1530  return;
1531  }
1532 
1533  if (ths->nfft_flags & PRE_PSI)
1534  {
1535  #pragma omp parallel for default(shared) private(k)
1536  for (k = 0; k < ths->M_total; k++)
1537  {
1538  int u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1539  int t, t2; /* index dimensions */
1540  int l_L; /* index one row of B */
1541  int l[ths->d]; /* multi index u<=l<=o */
1542  int lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1543  int ll_plain[ths->d+1]; /* postfix plain index in g */
1544  R phi_prod[ths->d+1]; /* postfix product of PHI */
1545  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1546 
1547  phi_prod[0] = K(1.0);
1548  ll_plain[0] = 0;
1549 
1550  MACRO_init_uo_l_lj_t;
1551 
1552  for (l_L = 0; l_L < lprod; l_L++)
1553  {
1554  C *lhs;
1555  R *lhs_real;
1556  C val;
1557 
1558  MACRO_update_phi_prod_ll_plain(with_PRE_PSI);
1559 
1560  lhs = ths->g + ll_plain[ths->d];
1561  lhs_real = (R*)lhs;
1562  val = phi_prod[ths->d] * ths->f[j];
1563 
1564  #pragma omp atomic
1565  lhs_real[0] += creal(val);
1566 
1567  #pragma omp atomic
1568  lhs_real[1] += cimag(val);
1569 
1570  MACRO_count_uo_l_lj_t;
1571  } /* for(l_L) */
1572  } /* for(j) */
1573  return;
1574  } /* if(PRE_PSI) */
1575 
1576  if (ths->nfft_flags & PRE_FG_PSI)
1577  {
1578  int t, t2; /* index dimensions */
1579  R fg_exp_l[ths->d][2*ths->m+2];
1580  for(t2 = 0; t2 < ths->d; t2++)
1581  {
1582  int lj_fg;
1583  R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
1584  R tmpEXP2sq = tmpEXP2*tmpEXP2;
1585  R tmp2 = K(1.0);
1586  R tmp3 = K(1.0);
1587  fg_exp_l[t2][0] = K(1.0);
1588  for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
1589  {
1590  tmp3 = tmp2*tmpEXP2;
1591  tmp2 *= tmpEXP2sq;
1592  fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
1593  }
1594  }
1595 
1596  #pragma omp parallel for default(shared) private(k,t,t2)
1597  for (k = 0; k < ths->M_total; k++)
1598  {
1599  int ll_plain[ths->d+1]; /* postfix plain index in g */
1600  R phi_prod[ths->d+1]; /* postfix product of PHI */
1601  int u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1602  int l[ths->d]; /* multi index u<=l<=o */
1603  int lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1604  R fg_psi[ths->d][2*ths->m+2];
1605  R tmpEXP1, tmp1;
1606  int l_fg,lj_fg;
1607  int l_L;
1608  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1609 
1610  phi_prod[0] = K(1.0);
1611  ll_plain[0] = 0;
1612 
1613  MACRO_init_uo_l_lj_t;
1614 
1615  for (t2 = 0; t2 < ths->d; t2++)
1616  {
1617  fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)];
1618  tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1];
1619  tmp1 = K(1.0);
1620  for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1621  {
1622  tmp1 *= tmpEXP1;
1623  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1624  }
1625  }
1626 
1627  for (l_L= 0; l_L < lprod; l_L++)
1628  {
1629  C *lhs;
1630  R *lhs_real;
1631  C val;
1632 
1633  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1634 
1635  lhs = ths->g + ll_plain[ths->d];
1636  lhs_real = (R*)lhs;
1637  val = phi_prod[ths->d] * ths->f[j];
1638 
1639  #pragma omp atomic
1640  lhs_real[0] += creal(val);
1641 
1642  #pragma omp atomic
1643  lhs_real[1] += cimag(val);
1644 
1645  MACRO_count_uo_l_lj_t;
1646  } /* for(l_L) */
1647  } /* for(j) */
1648  return;
1649  } /* if(PRE_FG_PSI) */
1650 
1651  if (ths->nfft_flags & FG_PSI)
1652  {
1653  int t, t2; /* index dimensions */
1654  R fg_exp_l[ths->d][2*ths->m+2];
1655 
1656  nfft_sort_nodes(ths);
1657 
1658  for (t2 = 0; t2 < ths->d; t2++)
1659  {
1660  int lj_fg;
1661  R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
1662  R tmpEXP2sq = tmpEXP2*tmpEXP2;
1663  R tmp2 = K(1.0);
1664  R tmp3 = K(1.0);
1665  fg_exp_l[t2][0] = K(1.0);
1666  for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
1667  {
1668  tmp3 = tmp2*tmpEXP2;
1669  tmp2 *= tmpEXP2sq;
1670  fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
1671  }
1672  }
1673 
1674  #pragma omp parallel for default(shared) private(k,t,t2)
1675  for (k = 0; k < ths->M_total; k++)
1676  {
1677  int ll_plain[ths->d+1]; /* postfix plain index in g */
1678  R phi_prod[ths->d+1]; /* postfix product of PHI */
1679  int u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1680  int l[ths->d]; /* multi index u<=l<=o */
1681  int lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1682  R fg_psi[ths->d][2*ths->m+2];
1683  R tmpEXP1, tmp1;
1684  int l_fg,lj_fg;
1685  int l_L;
1686  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1687 
1688  phi_prod[0] = K(1.0);
1689  ll_plain[0] = 0;
1690 
1691  MACRO_init_uo_l_lj_t;
1692 
1693  for (t2 = 0; t2 < ths->d; t2++)
1694  {
1695  fg_psi[t2][0] = (PHI((ths->x[j*ths->d+t2]-((R)u[t2])/ths->n[t2]),t2));
1696 
1697  tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2])
1698  /ths->b[t2]);
1699  tmp1 = K(1.0);
1700  for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1701  {
1702  tmp1 *= tmpEXP1;
1703  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1704  }
1705  }
1706 
1707  for (l_L = 0; l_L < lprod; l_L++)
1708  {
1709  C *lhs;
1710  R *lhs_real;
1711  C val;
1712 
1713  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1714 
1715  lhs = ths->g + ll_plain[ths->d];
1716  lhs_real = (R*)lhs;
1717  val = phi_prod[ths->d] * ths->f[j];
1718 
1719  #pragma omp atomic
1720  lhs_real[0] += creal(val);
1721 
1722  #pragma omp atomic
1723  lhs_real[1] += cimag(val);
1724 
1725  MACRO_count_uo_l_lj_t;
1726  } /* for(l_L) */
1727  } /* for(j) */
1728  return;
1729  } /* if(FG_PSI) */
1730 
1731  if (ths->nfft_flags & PRE_LIN_PSI)
1732  {
1733  nfft_sort_nodes(ths);
1734 
1735  #pragma omp parallel for default(shared) private(k)
1736  for (k = 0; k<ths->M_total; k++)
1737  {
1738  int u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1739  int t, t2; /* index dimensions */
1740  int l_L; /* index one row of B */
1741  int l[ths->d]; /* multi index u<=l<=o */
1742  int lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1743  int ll_plain[ths->d+1]; /* postfix plain index in g */
1744  R phi_prod[ths->d+1]; /* postfix product of PHI */
1745  R y[ths->d];
1746  R fg_psi[ths->d][2*ths->m+2];
1747  int l_fg,lj_fg;
1748  R ip_w;
1749  int ip_u;
1750  int ip_s = ths->K/(ths->m+2);
1751  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1752 
1753  phi_prod[0] = K(1.0);
1754  ll_plain[0] = 0;
1755 
1756  MACRO_init_uo_l_lj_t;
1757 
1758  for (t2 = 0; t2 < ths->d; t2++)
1759  {
1760  y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2])
1761  * ((R)ths->K))/(ths->m+2);
1762  ip_u = LRINT(FLOOR(y[t2]));
1763  ip_w = y[t2]-ip_u;
1764  for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++)
1765  {
1766  fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)]
1767  * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)]
1768  * (ip_w);
1769  }
1770  }
1771 
1772  for (l_L = 0; l_L < lprod; l_L++)
1773  {
1774  C *lhs;
1775  R *lhs_real;
1776  C val;
1777 
1778  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1779 
1780  lhs = ths->g + ll_plain[ths->d];
1781  lhs_real = (R*)lhs;
1782  val = phi_prod[ths->d] * ths->f[j];
1783 
1784  #pragma omp atomic
1785  lhs_real[0] += creal(val);
1786 
1787  #pragma omp atomic
1788  lhs_real[1] += cimag(val);
1789 
1790  MACRO_count_uo_l_lj_t;
1791  } /* for(l_L) */
1792  } /* for(j) */
1793  return;
1794  } /* if(PRE_LIN_PSI) */
1795 
1796  /* no precomputed psi at all */
1797  nfft_sort_nodes(ths);
1798 
1799  #pragma omp parallel for default(shared) private(k)
1800  for (k = 0; k < ths->M_total; k++)
1801  {
1802  int u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1803  int t, t2; /* index dimensions */
1804  int l_L; /* index one row of B */
1805  int l[ths->d]; /* multi index u<=l<=o */
1806  int lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1807  int ll_plain[ths->d+1]; /* postfix plain index in g */
1808  R phi_prod[ths->d+1]; /* postfix product of PHI */
1809  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1810 
1811  phi_prod[0] = K(1.0);
1812  ll_plain[0] = 0;
1813 
1814  MACRO_init_uo_l_lj_t;
1815 
1816  for (l_L = 0; l_L < lprod; l_L++)
1817  {
1818  C *lhs;
1819  R *lhs_real;
1820  C val;
1821 
1822  MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
1823 
1824  lhs = ths->g + ll_plain[ths->d];
1825  lhs_real = (R*)lhs;
1826  val = phi_prod[ths->d] * ths->f[j];
1827 
1828  #pragma omp atomic
1829  lhs_real[0] += creal(val);
1830 
1831  #pragma omp atomic
1832  lhs_real[1] += cimag(val);
1833 
1834  MACRO_count_uo_l_lj_t;
1835  } /* for(l_L) */
1836  } /* for(j) */
1837 }
1838 #endif
1839 
1840 static void nfft_B_T(nfft_plan *ths)
1841 {
1842 #ifdef _OPENMP
1843  nfft_B_openmp_T(ths);
1844 #else
1845  nfft_B_serial_T(ths);
1846 #endif
1847 }
1848 
1849 /* ## specialized version for d=1 ########################################### */
1850 
1851 static void nfft_1d_init_fg_exp_l(R *fg_exp_l, const int m, const R b)
1852 {
1853  const int tmp2 = 2*m+2;
1854  int l;
1855  R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
1856 
1857  fg_exp_b0 = EXP(K(-1.0)/b);
1858  fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
1859  fg_exp_b1 = fg_exp_b2 =fg_exp_l[0] = K(1.0);
1860 
1861  for (l = 1; l < tmp2; l++)
1862  {
1863  fg_exp_b2 = fg_exp_b1*fg_exp_b0;
1864  fg_exp_b1 *= fg_exp_b0_sq;
1865  fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
1866  }
1867 }
1868 
1869 
1870 static void nfft_trafo_1d_compute(C *fj, const C *g,const R *psij_const,
1871  const R *xj, const int n, const int m)
1872 {
1873  int u, o, l;
1874  const C *gj;
1875  const R *psij;
1876  psij = psij_const;
1877 
1878  nfft_uo2(&u, &o, *xj, n, m);
1879 
1880  if (u < o)
1881  {
1882  for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l <= 2*m+1; l++)
1883  (*fj) += (*psij++) * (*gj++);
1884  }
1885  else
1886  {
1887  for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l < 2*m+1 - o; l++)
1888  (*fj) += (*psij++) * (*gj++);
1889  for (l = 0, gj = g; l <= o; l++)
1890  (*fj) += (*psij++) * (*gj++);
1891  }
1892 }
1893 
1894 #ifndef _OPENMP
1895 static void nfft_adjoint_1d_compute_serial(const C *fj, C *g,const R *psij_const,
1896  const R *xj, const int n, const int m)
1897 {
1898  int u,o,l;
1899  C *gj;
1900  const R *psij;
1901  psij=psij_const;
1902 
1903  nfft_uo2(&u,&o,*xj, n, m);
1904 
1905  if(u<o)
1906  {
1907  for (l = 0, gj = g+u; l <= 2*m+1; l++)
1908  (*gj++) += (*psij++) * (*fj);
1909  }
1910  else
1911  {
1912  for (l = 0, gj = g+u; l < 2*m+1-o; l++)
1913  (*gj++) += (*psij++) * (*fj);
1914  for (l = 0, gj = g; l <= o; l++)
1915  (*gj++) += (*psij++) * (*fj);
1916  }
1917 }
1918 #endif
1919 
1920 #ifdef _OPENMP
1921 /* adjoint NFFT one-dimensional case with OpenMP atomic operations */
1922 static void nfft_adjoint_1d_compute_omp_atomic(const C f, C *g,const R *psij_const,
1923  const R *xj, const int n, const int m)
1924 {
1925  int u,o,l;
1926  C *gj;
1927  int index_temp[2*m+2];
1928 
1929  nfft_uo2(&u,&o,*xj, n, m);
1930 
1931  for (l=0; l<=2*m+1; l++)
1932  index_temp[l] = (l+u)%n;
1933 
1934  for (l = 0, gj = g+u; l <= 2*m+1; l++)
1935  {
1936  int i = index_temp[l];
1937  C *lhs = g+i;
1938  R *lhs_real = (R*)lhs;
1939  C val = psij_const[l] * f;
1940  #pragma omp atomic
1941  lhs_real[0] += creal(val);
1942 
1943  #pragma omp atomic
1944  lhs_real[1] += cimag(val);
1945  }
1946 }
1947 #endif
1948 
1949 #ifdef _OPENMP
1950 
1965 static void nfft_adjoint_1d_compute_omp_blockwise(const C f, C *g,const R *psij_const,
1966  const R *xj, const int n, const int m, const int my_u0, const int my_o0)
1967 {
1968  int ar_u,ar_o,l;
1969 
1970  nfft_uo2(&ar_u,&ar_o,*xj, n, m);
1971 
1972  if(ar_u<ar_o)
1973  {
1974  int u = MAX(my_u0,ar_u);
1975  int o = MIN(my_o0,ar_o);
1976  int offset_psij = u-ar_u;
1977 #ifdef OMP_ASSERT
1978  assert(offset_psij >= 0);
1979  assert(o-u <= 2*m+1);
1980  assert(offset_psij+o-u <= 2*m+1);
1981 #endif
1982 
1983  for (l = 0; l <= o-u; l++)
1984  g[u+l] += psij_const[offset_psij+l] * f;
1985  }
1986  else
1987  {
1988  int u = MAX(my_u0,ar_u);
1989  int o = my_o0;
1990  int offset_psij = u-ar_u;
1991 #ifdef OMP_ASSERT
1992  assert(offset_psij >= 0);
1993  assert(o-u <= 2*m+1);
1994  assert(offset_psij+o-u <= 2*m+1);
1995 #endif
1996 
1997  for (l = 0; l <= o-u; l++)
1998  g[u+l] += psij_const[offset_psij+l] * f;
1999 
2000  u = my_u0;
2001  o = MIN(my_o0,ar_o);
2002  offset_psij += my_u0-ar_u+n;
2003 
2004 #ifdef OMP_ASSERT
2005  if (u<=o)
2006  {
2007  assert(o-u <= 2*m+1);
2008  if (offset_psij+o-u > 2*m+1)
2009  {
2010  fprintf(stderr, "ERR: %d %d %d %d %d %d %d\n", ar_u, ar_o, my_u0, my_o0, u, o, offset_psij);
2011  }
2012  assert(offset_psij+o-u <= 2*m+1);
2013  }
2014 #endif
2015  for (l = 0; l <= o-u; l++)
2016  g[u+l] += psij_const[offset_psij+l] * f;
2017  }
2018 }
2019 #endif
2020 
2021 static void nfft_trafo_1d_B(nfft_plan *ths)
2022 {
2023  const int n = ths->n[0], M = ths->M_total, m = ths->m, m2p2 = 2*m+2;
2024  const C *g = (C*)ths->g;
2025 
2026  if (ths->nfft_flags & PRE_FULL_PSI)
2027  {
2028  int k;
2029  #pragma omp parallel for default(shared) private(k)
2030  for (k = 0; k < M; k++)
2031  {
2032  int l;
2033  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2034  ths->f[j] = K(0.0);
2035  for (l = 0; l < m2p2; l++)
2036  ths->f[j] += ths->psi[j*m2p2+l] * g[ths->psi_index_g[j*m2p2+l]];
2037  }
2038  return;
2039  } /* if(PRE_FULL_PSI) */
2040 
2041  if (ths->nfft_flags & PRE_PSI)
2042  {
2043  int k;
2044  #pragma omp parallel for default(shared) private(k)
2045  for (k = 0; k < M; k++)
2046  {
2047  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2048  nfft_trafo_1d_compute(&ths->f[j], g, ths->psi + j * (2 * m + 2),
2049  &ths->x[j], n, m);
2050  }
2051  return;
2052  } /* if(PRE_PSI) */
2053 
2054  if (ths->nfft_flags & PRE_FG_PSI)
2055  {
2056  int k;
2057  R fg_exp_l[m2p2];
2058 
2059  nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2060 
2061  #pragma omp parallel for default(shared) private(k)
2062  for (k = 0; k < M; k++)
2063  {
2064  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2065  const R fg_psij0 = ths->psi[2 * j], fg_psij1 = ths->psi[2 * j + 1];
2066  R fg_psij2 = K(1.0);
2067  R psij_const[m2p2];
2068  int l;
2069 
2070  psij_const[0] = fg_psij0;
2071 
2072  for (l = 1; l < m2p2; l++)
2073  {
2074  fg_psij2 *= fg_psij1;
2075  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2076  }
2077 
2078  nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2079  }
2080 
2081  return;
2082  } /* if(PRE_FG_PSI) */
2083 
2084  if (ths->nfft_flags & FG_PSI)
2085  {
2086  int k;
2087  R fg_exp_l[m2p2];
2088 
2089  nfft_sort_nodes(ths);
2090 
2091  nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2092 
2093  #pragma omp parallel for default(shared) private(k)
2094  for (k = 0; k < M; k++)
2095  {
2096  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2097  int u, o, l;
2098  R fg_psij0, fg_psij1, fg_psij2;
2099  R psij_const[m2p2];
2100 
2101  nfft_uo(ths, (int)j, &u, &o, 0);
2102  fg_psij0 = (PHI(ths->x[j]-((R)u)/n,0));
2103  fg_psij1 = EXP(K(2.0) * (n * ths->x[j] - u) / ths->b[0]);
2104  fg_psij2 = K(1.0);
2105 
2106  psij_const[0] = fg_psij0;
2107 
2108  for (l = 1; l < m2p2; l++)
2109  {
2110  fg_psij2 *= fg_psij1;
2111  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2112  }
2113 
2114  nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2115  }
2116  return;
2117  } /* if(FG_PSI) */
2118 
2119  if (ths->nfft_flags & PRE_LIN_PSI)
2120  {
2121  const int K = ths->K, ip_s = K / (m + 2);
2122  int k;
2123 
2124  nfft_sort_nodes(ths);
2125 
2126  #pragma omp parallel for default(shared) private(k)
2127  for (k = 0; k < M; k++)
2128  {
2129  int u, o, l;
2130  R ip_y, ip_w;
2131  int ip_u;
2132  R psij_const[m2p2];
2133  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2134 
2135  nfft_uo(ths, (int)j, &u, &o, 0);
2136 
2137  ip_y = FABS(n * ths->x[j] - u) * ((R)ip_s);
2138  ip_u = LRINT(FLOOR(ip_y));
2139  ip_w = ip_y - ip_u;
2140 
2141  for (l = 0; l < m2p2; l++)
2142  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
2143  + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
2144 
2145  nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2146  }
2147  return;
2148  } /* if(PRE_LIN_PSI) */
2149  else
2150  {
2151  /* no precomputed psi at all */
2152  int k;
2153 
2154  nfft_sort_nodes(ths);
2155 
2156  #pragma omp parallel for default(shared) private(k)
2157  for (k = 0; k < M; k++)
2158  {
2159  R psij_const[m2p2];
2160  int u, o, l;
2161  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2162 
2163  nfft_uo(ths, (int)j, &u, &o, 0);
2164 
2165  for (l = 0; l < m2p2; l++)
2166  psij_const[l] = (PHI(ths->x[j]-((R)((u+l)))/n,0));
2167 
2168  nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2169  }
2170  }
2171 }
2172 
2173 
2174 #ifdef OMP_ASSERT
2175 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A \
2176 { \
2177  assert(ar_x[2*k] >= min_u_a || k == M-1); \
2178  if (k > 0) \
2179  assert(ar_x[2*k-2] < min_u_a); \
2180 }
2181 #else
2182 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A
2183 #endif
2184 
2185 #ifdef OMP_ASSERT
2186 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B \
2187 { \
2188  assert(ar_x[2*k] >= min_u_b || k == M-1); \
2189  if (k > 0) \
2190  assert(ar_x[2*k-2] < min_u_b); \
2191 }
2192 #else
2193 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B
2194 #endif
2195 
2196 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
2197 { \
2198  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, \
2199  ths->psi + j * (2 * m + 2), ths->x + j, n, m, my_u0, my_o0); \
2200 }
2201 
2202 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
2203 { \
2204  R psij_const[2 * m + 2]; \
2205  int u, o, l; \
2206  R fg_psij0 = ths->psi[2 * j]; \
2207  R fg_psij1 = ths->psi[2 * j + 1]; \
2208  R fg_psij2 = K(1.0); \
2209  \
2210  psij_const[0] = fg_psij0; \
2211  for (l = 1; l <= 2 * m + 1; l++) \
2212  { \
2213  fg_psij2 *= fg_psij1; \
2214  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; \
2215  } \
2216  \
2217  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2218  ths->x + j, n, m, my_u0, my_o0); \
2219 }
2220 
2221 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
2222 { \
2223  R psij_const[2 * m + 2]; \
2224  R fg_psij0, fg_psij1, fg_psij2; \
2225  int u, o, l; \
2226  \
2227  nfft_uo(ths, j, &u, &o, 0); \
2228  fg_psij0 = (PHI(ths->x[j]-((R)u)/n,0)); \
2229  fg_psij1 = EXP(K(2.0) * (n * (ths->x[j]) - u) / ths->b[0]); \
2230  fg_psij2 = K(1.0); \
2231  psij_const[0] = fg_psij0; \
2232  for (l = 1; l <= 2 * m + 1; l++) \
2233  { \
2234  fg_psij2 *= fg_psij1; \
2235  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; \
2236  } \
2237  \
2238  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2239  ths->x + j, n, m, my_u0, my_o0); \
2240 }
2241 
2242 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
2243 { \
2244  R psij_const[2 * m + 2]; \
2245  int ip_u; \
2246  R ip_y, ip_w; \
2247  int u, o, l; \
2248  \
2249  nfft_uo(ths, j, &u, &o, 0); \
2250  \
2251  ip_y = FABS(n * ths->x[j] - u) * ((R)ip_s); \
2252  ip_u = LRINT(FLOOR(ip_y)); \
2253  ip_w = ip_y - ip_u; \
2254  for (l = 0; l < 2 * m + 2; l++) \
2255  psij_const[l] \
2256  = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w) \
2257  + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w); \
2258  \
2259  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2260  ths->x + j, n, m, my_u0, my_o0); \
2261 }
2262 
2263 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
2264 { \
2265  R psij_const[2 * m + 2]; \
2266  int u, o, l; \
2267  \
2268  nfft_uo(ths, j, &u, &o, 0); \
2269  \
2270  for (l = 0; l <= 2 * m + 1; l++) \
2271  psij_const[l] = (PHI(ths->x[j]-((R)((u+l)))/n,0)); \
2272  \
2273  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2274  ths->x + j, n, m, my_u0, my_o0); \
2275 }
2276 
2277 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE(whichone) \
2278 { \
2279  if (ths->nfft_flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
2280  { \
2281  _Pragma("omp parallel private(k)") \
2282  { \
2283  int my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
2284  int *ar_x = ths->index_x; \
2285  \
2286  nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
2287  &min_u_b, &max_u_b, 1, &n, m); \
2288  \
2289  if (min_u_a != -1) \
2290  { \
2291  k = index_x_binary_search(ar_x, M, min_u_a); \
2292  \
2293  MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A \
2294  \
2295  while (k < M) \
2296  { \
2297  int u_prod = ar_x[2*k]; \
2298  int j = ar_x[2*k+1]; \
2299  \
2300  if (u_prod < min_u_a || u_prod > max_u_a) \
2301  break; \
2302  \
2303  MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
2304  \
2305  k++; \
2306  } \
2307  } \
2308  \
2309  if (min_u_b != -1) \
2310  { \
2311  k = index_x_binary_search(ar_x, M, min_u_b); \
2312  \
2313  MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B \
2314  \
2315  while (k < M) \
2316  { \
2317  int u_prod = ar_x[2*k]; \
2318  int j = ar_x[2*k+1]; \
2319  \
2320  if (u_prod < min_u_b || u_prod > max_u_b) \
2321  break; \
2322  \
2323  MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
2324  \
2325  k++; \
2326  } \
2327  } \
2328  } /* omp parallel */ \
2329  return; \
2330  } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */ \
2331 }
2332 
2333 static void nfft_adjoint_1d_B(nfft_plan *ths)
2334 {
2335  const int n = ths->n[0], M = ths->M_total, m = ths->m;
2336  int k;
2337  C *g = (C*)ths->g;
2338 
2339  memset(g,0,ths->n_total*sizeof(C));
2340 
2341  if (ths->nfft_flags & PRE_FULL_PSI)
2342  {
2343  nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
2344  1, ths->n, m, ths->nfft_flags, ths->index_x);
2345  return;
2346  } /* if(PRE_FULL_PSI) */
2347 
2348  if (ths->nfft_flags & PRE_PSI)
2349  {
2350 #ifdef _OPENMP
2351  MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE(PRE_PSI)
2352 #endif
2353 
2354  #pragma omp parallel for default(shared) private(k)
2355  for (k = 0; k < M; k++)
2356  {
2357  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2358 #ifdef _OPENMP
2359  nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, ths->psi + j * (2 * m + 2), ths->x + j, n, m);
2360 #else
2361  nfft_adjoint_1d_compute_serial(ths->f + j, g, ths->psi + j * (2 * m + 2), ths->x + j, n, m);
2362 #endif
2363  }
2364 
2365  return;
2366  } /* if(PRE_PSI) */
2367 
2368  if (ths->nfft_flags & PRE_FG_PSI)
2369  {
2370  R fg_exp_l[2 * m + 2];
2371 
2372  nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2373 
2374 #ifdef _OPENMP
2375  MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE(PRE_FG_PSI)
2376 #endif
2377 
2378 
2379  #pragma omp parallel for default(shared) private(k)
2380  for (k = 0; k < M; k++)
2381  {
2382  R psij_const[2 * m + 2];
2383  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2384  int l;
2385  R fg_psij0 = ths->psi[2 * j];
2386  R fg_psij1 = ths->psi[2 * j + 1];
2387  R fg_psij2 = K(1.0);
2388 
2389  psij_const[0] = fg_psij0;
2390  for (l = 1; l <= 2 * m + 1; l++)
2391  {
2392  fg_psij2 *= fg_psij1;
2393  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2394  }
2395 
2396 #ifdef _OPENMP
2397  nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2398 #else
2399  nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2400 #endif
2401  }
2402 
2403  return;
2404  } /* if(PRE_FG_PSI) */
2405 
2406  if (ths->nfft_flags & FG_PSI)
2407  {
2408  R fg_exp_l[2 * m + 2];
2409 
2410  nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2411 
2412  nfft_sort_nodes(ths);
2413 
2414 #ifdef _OPENMP
2415  MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE(FG_PSI)
2416 #endif
2417 
2418  #pragma omp parallel for default(shared) private(k)
2419  for (k = 0; k < M; k++)
2420  {
2421  int u,o,l;
2422  R psij_const[2 * m + 2];
2423  R fg_psij0, fg_psij1, fg_psij2;
2424  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2425 
2426  nfft_uo(ths, j, &u, &o, 0);
2427  fg_psij0 = (PHI(ths->x[j]-((R)u)/n,0));
2428  fg_psij1 = EXP(K(2.0) * (n * (ths->x[j]) - u) / ths->b[0]);
2429  fg_psij2 = K(1.0);
2430  psij_const[0] = fg_psij0;
2431  for (l = 1; l <= 2 * m + 1; l++)
2432  {
2433  fg_psij2 *= fg_psij1;
2434  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2435  }
2436 
2437 #ifdef _OPENMP
2438  nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2439 #else
2440  nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2441 #endif
2442  }
2443 
2444  return;
2445  } /* if(FG_PSI) */
2446 
2447  if (ths->nfft_flags & PRE_LIN_PSI)
2448  {
2449  const int K = ths->K;
2450  const int ip_s = K / (m + 2);
2451 
2452  nfft_sort_nodes(ths);
2453 
2454 #ifdef _OPENMP
2455  MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
2456 #endif
2457 
2458  #pragma openmp parallel for default(shared) private(k)
2459  for (k = 0; k < M; k++)
2460  {
2461  int u,o,l;
2462  int ip_u;
2463  R ip_y, ip_w;
2464  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2465  R psij_const[2 * m + 2];
2466 
2467  nfft_uo(ths, j, &u, &o, 0);
2468 
2469  ip_y = FABS(n * ths->x[j] - u) * ((R)ip_s);
2470  ip_u = LRINT(FLOOR(ip_y));
2471  ip_w = ip_y - ip_u;
2472  for (l = 0; l < 2 * m + 2; l++)
2473  psij_const[l]
2474  = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
2475  + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
2476 
2477 #ifdef _OPENMP
2478  nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2479 #else
2480  nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2481 #endif
2482  }
2483  return;
2484  } /* if(PRE_LIN_PSI) */
2485 
2486  /* no precomputed psi at all */
2487  nfft_sort_nodes(ths);
2488 
2489 #ifdef _OPENMP
2490  MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE(NO_PSI)
2491 #endif
2492 
2493  #pragma omp parallel for default(shared) private(k)
2494  for (k = 0; k < M; k++)
2495  {
2496  int u,o,l;
2497  R psij_const[2 * m + 2];
2498  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2499 
2500  nfft_uo(ths, j, &u, &o, 0);
2501 
2502  for (l = 0; l <= 2 * m + 1; l++)
2503  psij_const[l] = (PHI(ths->x[j]-((R)((u+l)))/n,0));
2504 
2505 #ifdef _OPENMP
2506  nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2507 #else
2508  nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2509 #endif
2510  }
2511 }
2512 
2513 void nfft_trafo_1d(nfft_plan *ths)
2514 {
2515  const int N = ths->N[0], N2 = N/2, n = ths->n[0];
2516  C *f_hat1 = (C*)ths->f_hat, *f_hat2 = (C*)&ths->f_hat[N2];
2517 
2518  ths->g_hat = ths->g1;
2519  ths->g = ths->g2;
2520 
2521  {
2522  C *g_hat1 = (C*)&ths->g_hat[n-N/2], *g_hat2 = (C*)ths->g_hat;
2523  R *c_phi_inv1, *c_phi_inv2;
2524 
2525  TIC(0)
2526 #ifdef _OPENMP
2527  {
2528  int k;
2529  #pragma omp parallel for default(shared) private(k)
2530  for (k = 0; k < ths->n_total; k++)
2531  ths->g_hat[k] = 0.0;
2532  }
2533 #else
2534  memset(ths->g_hat, 0, ths->n_total*sizeof(C));
2535 #endif
2536  if(ths->nfft_flags & PRE_PHI_HUT)
2537  {
2538  int k;
2539  c_phi_inv1 = ths->c_phi_inv[0];
2540  c_phi_inv2 = &ths->c_phi_inv[0][N2];
2541 
2542  #pragma omp parallel for default(shared) private(k)
2543  for (k = 0; k < N2; k++)
2544  {
2545  g_hat1[k] = f_hat1[k] * c_phi_inv1[k];
2546  g_hat2[k] = f_hat2[k] * c_phi_inv2[k];
2547  }
2548  }
2549  else
2550  {
2551  int k;
2552  #pragma omp parallel for default(shared) private(k)
2553  for (k = 0; k < N2; k++)
2554  {
2555  g_hat1[k] = f_hat1[k] / (PHI_HUT(k-N2,0));
2556  g_hat2[k] = f_hat2[k] / (PHI_HUT(k,0));
2557  }
2558  }
2559  TOC(0)
2560 
2561  TIC_FFTW(1)
2562  fftw_execute(ths->my_fftw_plan1);
2563  TOC_FFTW(1);
2564 
2565  TIC(2);
2566  nfft_trafo_1d_B(ths);
2567  TOC(2);
2568  }
2569 }
2570 
2571 void nfft_adjoint_1d(nfft_plan *ths)
2572 {
2573  int n,N;
2574  C *g_hat1,*g_hat2,*f_hat1,*f_hat2;
2575  R *c_phi_inv1, *c_phi_inv2;
2576 
2577  N=ths->N[0];
2578  n=ths->n[0];
2579 
2580  ths->g_hat=ths->g1;
2581  ths->g=ths->g2;
2582 
2583  f_hat1=(C*)ths->f_hat;
2584  f_hat2=(C*)&ths->f_hat[N/2];
2585  g_hat1=(C*)&ths->g_hat[n-N/2];
2586  g_hat2=(C*)ths->g_hat;
2587 
2588  TIC(2)
2589  nfft_adjoint_1d_B(ths);
2590  TOC(2)
2591 
2592  TIC_FFTW(1)
2593  fftw_execute(ths->my_fftw_plan2);
2594  TOC_FFTW(1);
2595 
2596  TIC(0)
2597  if(ths->nfft_flags & PRE_PHI_HUT)
2598  {
2599  int k;
2600  c_phi_inv1=ths->c_phi_inv[0];
2601  c_phi_inv2=&ths->c_phi_inv[0][N/2];
2602 
2603  #pragma omp parallel for default(shared) private(k)
2604  for (k = 0; k < N/2; k++)
2605  {
2606  f_hat1[k] = g_hat1[k] * c_phi_inv1[k];
2607  f_hat2[k] = g_hat2[k] * c_phi_inv2[k];
2608  }
2609  }
2610  else
2611  {
2612  int k;
2613 
2614  #pragma omp parallel for default(shared) private(k)
2615  for (k = 0; k < N/2; k++)
2616  {
2617  f_hat1[k] = g_hat1[k] / (PHI_HUT(k-N/2,0));
2618  f_hat2[k] = g_hat2[k] / (PHI_HUT(k,0));
2619  }
2620  }
2621  TOC(0)
2622 }
2623 
2624 
2625 /* ############################################################ SPECIFIC VERSIONS FOR d=2 */
2626 
2627 static void nfft_2d_init_fg_exp_l(R *fg_exp_l, const int m, const R b)
2628 {
2629  int l;
2630  R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
2631 
2632  fg_exp_b0 = EXP(K(-1.0)/b);
2633  fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
2634  fg_exp_b1 = K(1.0);
2635  fg_exp_b2 = K(1.0);
2636  fg_exp_l[0] = K(1.0);
2637  for(l=1; l <= 2*m+1; l++)
2638  {
2639  fg_exp_b2 = fg_exp_b1*fg_exp_b0;
2640  fg_exp_b1 *= fg_exp_b0_sq;
2641  fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
2642  }
2643 }
2644 
2645 static void nfft_trafo_2d_compute(C *fj, const C *g,
2646  const R *psij_const0, const R *psij_const1,
2647  const R *xj0, const R *xj1,
2648  const int n0, const int n1, const int m)
2649 {
2650  int u0,o0,l0,u1,o1,l1;
2651  const C *gj;
2652  const R *psij0,*psij1;
2653 
2654  psij0=psij_const0;
2655  psij1=psij_const1;
2656 
2657  nfft_uo2(&u0,&o0,*xj0, n0, m);
2658  nfft_uo2(&u1,&o1,*xj1, n1, m);
2659 
2660  *fj=0;
2661 
2662  if(u0<o0)
2663  if(u1<o1)
2664  for(l0=0; l0<=2*m+1; l0++,psij0++)
2665  {
2666  psij1=psij_const1;
2667  gj=g+(u0+l0)*n1+u1;
2668  for(l1=0; l1<=2*m+1; l1++)
2669  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2670  }
2671  else
2672  for(l0=0; l0<=2*m+1; l0++,psij0++)
2673  {
2674  psij1=psij_const1;
2675  gj=g+(u0+l0)*n1+u1;
2676  for(l1=0; l1<2*m+1-o1; l1++)
2677  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2678  gj=g+(u0+l0)*n1;
2679  for(l1=0; l1<=o1; l1++)
2680  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2681  }
2682  else
2683  if(u1<o1)
2684  {
2685  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2686  {
2687  psij1=psij_const1;
2688  gj=g+(u0+l0)*n1+u1;
2689  for(l1=0; l1<=2*m+1; l1++)
2690  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2691  }
2692  for(l0=0; l0<=o0; l0++,psij0++)
2693  {
2694  psij1=psij_const1;
2695  gj=g+l0*n1+u1;
2696  for(l1=0; l1<=2*m+1; l1++)
2697  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2698  }
2699  }
2700  else
2701  {
2702  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2703  {
2704  psij1=psij_const1;
2705  gj=g+(u0+l0)*n1+u1;
2706  for(l1=0; l1<2*m+1-o1; l1++)
2707  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2708  gj=g+(u0+l0)*n1;
2709  for(l1=0; l1<=o1; l1++)
2710  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2711  }
2712  for(l0=0; l0<=o0; l0++,psij0++)
2713  {
2714  psij1=psij_const1;
2715  gj=g+l0*n1+u1;
2716  for(l1=0; l1<2*m+1-o1; l1++)
2717  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2718  gj=g+l0*n1;
2719  for(l1=0; l1<=o1; l1++)
2720  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2721  }
2722  }
2723 }
2724 
2725 #ifdef _OPENMP
2726 /* adjoint NFFT two-dimensional case with OpenMP atomic operations */
2727 static void nfft_adjoint_2d_compute_omp_atomic(const C f, C *g,
2728  const R *psij_const0, const R *psij_const1,
2729  const R *xj0, const R *xj1,
2730  const int n0, const int n1, const int m)
2731 {
2732  int u0,o0,l0,u1,o1,l1;
2733  const int lprod = (2*m+2) * (2*m+2);
2734 
2735  unsigned long int index_temp0[2*m+2];
2736  unsigned long int index_temp1[2*m+2];
2737 
2738  nfft_uo2(&u0,&o0,*xj0, n0, m);
2739  nfft_uo2(&u1,&o1,*xj1, n1, m);
2740 
2741  for (l0=0; l0<=2*m+1; l0++)
2742  index_temp0[l0] = (u0+l0)%n0;
2743 
2744  for (l1=0; l1<=2*m+1; l1++)
2745  index_temp1[l1] = (u1+l1)%n1;
2746 
2747  for(l0=0; l0<=2*m+1; l0++)
2748  {
2749  for(l1=0; l1<=2*m+1; l1++)
2750  {
2751  unsigned long int i = index_temp0[l0] * n1 + index_temp1[l1];
2752  C *lhs = g+i;
2753  R *lhs_real = (R*)lhs;
2754  C val = psij_const0[l0] * psij_const1[l1] * f;
2755 
2756  #pragma omp atomic
2757  lhs_real[0] += creal(val);
2758 
2759  #pragma omp atomic
2760  lhs_real[1] += cimag(val);
2761  }
2762  }
2763 }
2764 #endif
2765 
2766 #ifdef _OPENMP
2767 
2785 static void nfft_adjoint_2d_compute_omp_blockwise(const C f, C *g,
2786  const R *psij_const0, const R *psij_const1,
2787  const R *xj0, const R *xj1,
2788  const int n0, const int n1, const int m,
2789  const int my_u0, const int my_o0)
2790 {
2791  int ar_u0,ar_o0,l0,u1,o1,l1;
2792  const int lprod = (2*m+2) * (2*m+2);
2793  unsigned long int index_temp1[2*m+2];
2794 
2795  nfft_uo2(&ar_u0,&ar_o0,*xj0, n0, m);
2796  nfft_uo2(&u1,&o1,*xj1, n1, m);
2797 
2798  for (l1=0; l1<=2*m+1; l1++)
2799  index_temp1[l1] = (u1+l1)%n1;
2800 
2801  if(ar_u0<ar_o0)
2802  {
2803  int u0 = MAX(my_u0,ar_u0);
2804  int o0 = MIN(my_o0,ar_o0);
2805  int offset_psij = u0-ar_u0;
2806 #ifdef OMP_ASSERT
2807  assert(offset_psij >= 0);
2808  assert(o0-u0 <= 2*m+1);
2809  assert(offset_psij+o0-u0 <= 2*m+1);
2810 #endif
2811 
2812  for (l0 = 0; l0 <= o0-u0; l0++)
2813  {
2814  unsigned long int i0 = (u0+l0) * n1;
2815  const C val0 = psij_const0[offset_psij+l0];
2816 
2817  for(l1=0; l1<=2*m+1; l1++)
2818  g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
2819  }
2820  }
2821  else
2822  {
2823  int u0 = MAX(my_u0,ar_u0);
2824  int o0 = my_o0;
2825  int offset_psij = u0-ar_u0;
2826 #ifdef OMP_ASSERT
2827  assert(offset_psij >= 0);
2828  assert(o0-u0 <= 2*m+1);
2829  assert(offset_psij+o0-u0 <= 2*m+1);
2830 #endif
2831 
2832  for (l0 = 0; l0 <= o0-u0; l0++)
2833  {
2834  unsigned long int i0 = (u0+l0) * n1;
2835  const C val0 = psij_const0[offset_psij+l0];
2836 
2837  for(l1=0; l1<=2*m+1; l1++)
2838  g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
2839  }
2840 
2841  u0 = my_u0;
2842  o0 = MIN(my_o0,ar_o0);
2843  offset_psij += my_u0-ar_u0+n0;
2844 
2845 #ifdef OMP_ASSERT
2846  if (u0<=o0)
2847  {
2848  assert(o0-u0 <= 2*m+1);
2849  assert(offset_psij+o0-u0 <= 2*m+1);
2850  }
2851 #endif
2852 
2853  for (l0 = 0; l0 <= o0-u0; l0++)
2854  {
2855  unsigned long int i0 = (u0+l0) * n1;
2856  const C val0 = psij_const0[offset_psij+l0];
2857 
2858  for(l1=0; l1<=2*m+1; l1++)
2859  g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
2860  }
2861  }
2862 }
2863 #endif
2864 
2865 #ifndef _OPENMP
2866 static void nfft_adjoint_2d_compute_serial(const C *fj, C *g,
2867  const R *psij_const0, const R *psij_const1,
2868  const R *xj0, const R *xj1,
2869  const int n0, const int n1, const int m)
2870 {
2871  int u0,o0,l0,u1,o1,l1;
2872  C *gj;
2873  const R *psij0,*psij1;
2874 
2875  psij0=psij_const0;
2876  psij1=psij_const1;
2877 
2878  nfft_uo2(&u0,&o0,*xj0, n0, m);
2879  nfft_uo2(&u1,&o1,*xj1, n1, m);
2880 
2881  if(u0<o0)
2882  if(u1<o1)
2883  for(l0=0; l0<=2*m+1; l0++,psij0++)
2884  {
2885  psij1=psij_const1;
2886  gj=g+(u0+l0)*n1+u1;
2887  for(l1=0; l1<=2*m+1; l1++)
2888  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2889  }
2890  else
2891  for(l0=0; l0<=2*m+1; l0++,psij0++)
2892  {
2893  psij1=psij_const1;
2894  gj=g+(u0+l0)*n1+u1;
2895  for(l1=0; l1<2*m+1-o1; l1++)
2896  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2897  gj=g+(u0+l0)*n1;
2898  for(l1=0; l1<=o1; l1++)
2899  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2900  }
2901  else
2902  if(u1<o1)
2903  {
2904  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2905  {
2906  psij1=psij_const1;
2907  gj=g+(u0+l0)*n1+u1;
2908  for(l1=0; l1<=2*m+1; l1++)
2909  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2910  }
2911  for(l0=0; l0<=o0; l0++,psij0++)
2912  {
2913  psij1=psij_const1;
2914  gj=g+l0*n1+u1;
2915  for(l1=0; l1<=2*m+1; l1++)
2916  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2917  }
2918  }
2919  else
2920  {
2921  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2922  {
2923  psij1=psij_const1;
2924  gj=g+(u0+l0)*n1+u1;
2925  for(l1=0; l1<2*m+1-o1; l1++)
2926  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2927  gj=g+(u0+l0)*n1;
2928  for(l1=0; l1<=o1; l1++)
2929  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2930  }
2931  for(l0=0; l0<=o0; l0++,psij0++)
2932  {
2933  psij1=psij_const1;
2934  gj=g+l0*n1+u1;
2935  for(l1=0; l1<2*m+1-o1; l1++)
2936  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2937  gj=g+l0*n1;
2938  for(l1=0; l1<=o1; l1++)
2939  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2940  }
2941  }
2942 }
2943 #endif
2944 
2945 static void nfft_trafo_2d_B(nfft_plan *ths)
2946 {
2947  const C *g = (C*)ths->g;
2948  const int N0 = ths->N[0];
2949  const int n0 = ths->n[0];
2950  const int N1 = ths->N[1];
2951  const int n1 = ths->n[1];
2952  const int M = ths->M_total;
2953  const int m = ths->m;
2954 
2955  int k;
2956 
2957  if(ths->nfft_flags & PRE_FULL_PSI)
2958  {
2959  const int lprod = (2*m+2) * (2*m+2);
2960  #pragma omp parallel for default(shared) private(k)
2961  for (k = 0; k < M; k++)
2962  {
2963  int l;
2964  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2965  ths->f[j] = K(0.0);
2966  for (l = 0; l < lprod; l++)
2967  ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]];
2968  }
2969  return;
2970  } /* if(PRE_FULL_PSI) */
2971 
2972  if(ths->nfft_flags & PRE_PSI)
2973  {
2974  #pragma omp parallel for default(shared) private(k)
2975  for (k = 0; k < M; k++)
2976  {
2977  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2978  nfft_trafo_2d_compute(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
2979  }
2980 
2981  return;
2982  } /* if(PRE_PSI) */
2983 
2984  if(ths->nfft_flags & PRE_FG_PSI)
2985  {
2986  R fg_exp_l[2*(2*m+2)];
2987 
2988  nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2989  nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
2990 
2991  #pragma omp parallel for default(shared) private(k)
2992  for (k = 0; k < M; k++)
2993  {
2994  R psij_const[2*(2*m+2)];
2995  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2996  int l;
2997  R fg_psij0 = ths->psi[2*j*2];
2998  R fg_psij1 = ths->psi[2*j*2+1];
2999  R fg_psij2 = K(1.0);
3000 
3001  psij_const[0] = fg_psij0;
3002  for(l=1; l<=2*m+1; l++)
3003  {
3004  fg_psij2 *= fg_psij1;
3005  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3006  }
3007 
3008  fg_psij0 = ths->psi[2*(j*2+1)];
3009  fg_psij1 = ths->psi[2*(j*2+1)+1];
3010  fg_psij2 = K(1.0);
3011  psij_const[2*m+2] = fg_psij0;
3012  for(l=1; l<=2*m+1; l++)
3013  {
3014  fg_psij2 *= fg_psij1;
3015  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3016  }
3017 
3018  nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3019  }
3020 
3021  return;
3022  } /* if(PRE_FG_PSI) */
3023 
3024  if(ths->nfft_flags & FG_PSI)
3025  {
3026  R fg_exp_l[2*(2*m+2)];
3027 
3028  nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3029  nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3030 
3031  nfft_sort_nodes(ths);
3032 
3033  #pragma omp parallel for default(shared) private(k)
3034  for (k = 0; k < M; k++)
3035  {
3036  int u, o, l;
3037  R fg_psij0, fg_psij1, fg_psij2;
3038  R psij_const[2*(2*m+2)];
3039  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3040 
3041  nfft_uo(ths,j,&u,&o,0);
3042  fg_psij0 = (PHI(ths->x[2*j]-((R)u)/n0,0));
3043  fg_psij1 = EXP(K(2.0)*(n0*(ths->x[2*j]) - u)/ths->b[0]);
3044  fg_psij2 = K(1.0);
3045  psij_const[0] = fg_psij0;
3046  for(l=1; l<=2*m+1; l++)
3047  {
3048  fg_psij2 *= fg_psij1;
3049  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3050  }
3051 
3052  nfft_uo(ths,j,&u,&o,1);
3053  fg_psij0 = (PHI(ths->x[2*j+1]-((R)u)/n1,1));
3054  fg_psij1 = EXP(K(2.0)*(n1*(ths->x[2*j+1]) - u)/ths->b[1]);
3055  fg_psij2 = K(1.0);
3056  psij_const[2*m+2] = fg_psij0;
3057  for(l=1; l<=2*m+1; l++)
3058  {
3059  fg_psij2 *= fg_psij1;
3060  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3061  }
3062 
3063  nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3064  }
3065 
3066  return;
3067  } /* if(FG_PSI) */
3068 
3069  if(ths->nfft_flags & PRE_LIN_PSI)
3070  {
3071  const int K = ths->K, ip_s = K / (m + 2);
3072 
3073  nfft_sort_nodes(ths);
3074 
3075  #pragma omp parallel for default(shared) private(k)
3076  for (k = 0; k < M; k++)
3077  {
3078  int u, o, l;
3079  R ip_y, ip_w;
3080  int ip_u;
3081  R psij_const[2*(2*m+2)];
3082  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3083 
3084  nfft_uo(ths,j,&u,&o,0);
3085  ip_y = FABS(n0*ths->x[2*j] - u)*((R)ip_s);
3086  ip_u = LRINT(FLOOR(ip_y));
3087  ip_w = ip_y-ip_u;
3088  for(l=0; l < 2*m+2; l++)
3089  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
3090 
3091  nfft_uo(ths,j,&u,&o,1);
3092  ip_y = FABS(n1*ths->x[2*j+1] - u)*((R)ip_s);
3093  ip_u = LRINT(FLOOR(ip_y));
3094  ip_w = ip_y-ip_u;
3095  for(l=0; l < 2*m+2; l++)
3096  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
3097 
3098  nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3099  }
3100  return;
3101  } /* if(PRE_LIN_PSI) */
3102 
3103  /* no precomputed psi at all */
3104 
3105  nfft_sort_nodes(ths);
3106 
3107  #pragma omp parallel for default(shared) private(k)
3108  for (k = 0; k < M; k++)
3109  {
3110  R psij_const[2*(2*m+2)];
3111  int u, o, l;
3112  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3113 
3114  nfft_uo(ths,j,&u,&o,0);
3115  for(l=0;l<=2*m+1;l++)
3116  psij_const[l]=(PHI(ths->x[2*j]-((R)((u+l)))/n0,0));
3117 
3118  nfft_uo(ths,j,&u,&o,1);
3119  for(l=0;l<=2*m+1;l++)
3120  psij_const[2*m+2+l]=(PHI(ths->x[2*j+1]-((R)((u+l)))/n1,1));
3121 
3122  nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3123  }
3124 }
3125 
3126 #ifdef OMP_ASSERT
3127 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A \
3128 { \
3129  assert(ar_x[2*k] >= min_u_a || k == M-1); \
3130  if (k > 0) \
3131  assert(ar_x[2*k-2] < min_u_a); \
3132 }
3133 #else
3134 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A
3135 #endif
3136 
3137 #ifdef OMP_ASSERT
3138 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B \
3139 { \
3140  assert(ar_x[2*k] >= min_u_b || k == M-1); \
3141  if (k > 0) \
3142  assert(ar_x[2*k-2] < min_u_b); \
3143 }
3144 #else
3145 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B
3146 #endif
3147 
3148 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
3149  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3150  ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), \
3151  ths->x+2*j, ths->x+2*j+1, n0, n1, m, my_u0, my_o0);
3152 
3153 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
3154 { \
3155  R psij_const[2*(2*m+2)]; \
3156  int u, o, l; \
3157  R fg_psij0 = ths->psi[2*j*2]; \
3158  R fg_psij1 = ths->psi[2*j*2+1]; \
3159  R fg_psij2 = K(1.0); \
3160  \
3161  psij_const[0] = fg_psij0; \
3162  for(l=1; l<=2*m+1; l++) \
3163  { \
3164  fg_psij2 *= fg_psij1; \
3165  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
3166  } \
3167  \
3168  fg_psij0 = ths->psi[2*(j*2+1)]; \
3169  fg_psij1 = ths->psi[2*(j*2+1)+1]; \
3170  fg_psij2 = K(1.0); \
3171  psij_const[2*m+2] = fg_psij0; \
3172  for(l=1; l<=2*m+1; l++) \
3173  { \
3174  fg_psij2 *= fg_psij1; \
3175  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
3176  } \
3177  \
3178  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3179  psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3180  n0, n1, m, my_u0, my_o0); \
3181 }
3182 
3183 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
3184 { \
3185  R psij_const[2*(2*m+2)]; \
3186  R fg_psij0, fg_psij1, fg_psij2; \
3187  int u, o, l; \
3188  \
3189  nfft_uo(ths,j,&u,&o,0); \
3190  fg_psij0 = (PHI(ths->x[2*j]-((R)u)/n0,0)); \
3191  fg_psij1 = EXP(K(2.0)*(n0*(ths->x[2*j]) - u)/ths->b[0]); \
3192  fg_psij2 = K(1.0); \
3193  psij_const[0] = fg_psij0; \
3194  for(l=1; l<=2*m+1; l++) \
3195  { \
3196  fg_psij2 *= fg_psij1; \
3197  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
3198  } \
3199  \
3200  nfft_uo(ths,j,&u,&o,1); \
3201  fg_psij0 = (PHI(ths->x[2*j+1]-((R)u)/n1,1)); \
3202  fg_psij1 = EXP(K(2.0)*(n1*(ths->x[2*j+1]) - u)/ths->b[1]); \
3203  fg_psij2 = K(1.0); \
3204  psij_const[2*m+2] = fg_psij0; \
3205  for(l=1; l<=2*m+1; l++) \
3206  { \
3207  fg_psij2 *= fg_psij1; \
3208  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
3209  } \
3210  \
3211  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3212  psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3213  n0, n1, m, my_u0, my_o0); \
3214 }
3215 
3216 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
3217 { \
3218  R psij_const[2*(2*m+2)]; \
3219  int u, o, l; \
3220  int ip_u; \
3221  R ip_y, ip_w; \
3222  \
3223  nfft_uo(ths,j,&u,&o,0); \
3224  ip_y = FABS(n0*(ths->x[2*j]) - u)*((R)ip_s); \
3225  ip_u = LRINT(FLOOR(ip_y)); \
3226  ip_w = ip_y-ip_u; \
3227  for(l=0; l < 2*m+2; l++) \
3228  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
3229  ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); \
3230  \
3231  nfft_uo(ths,j,&u,&o,1); \
3232  ip_y = FABS(n1*(ths->x[2*j+1]) - u)*((R)ip_s); \
3233  ip_u = LRINT(FLOOR(ip_y)); \
3234  ip_w = ip_y-ip_u; \
3235  for(l=0; l < 2*m+2; l++) \
3236  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
3237  ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
3238  \
3239  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3240  psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3241  n0, n1, m, my_u0, my_o0); \
3242 }
3243 
3244 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
3245 { \
3246  R psij_const[2*(2*m+2)]; \
3247  int u, o, l; \
3248  \
3249  nfft_uo(ths,j,&u,&o,0); \
3250  for(l=0;l<=2*m+1;l++) \
3251  psij_const[l]=(PHI(ths->x[2*j]-((R)((u+l)))/n0,0)); \
3252  \
3253  nfft_uo(ths,j,&u,&o,1); \
3254  for(l=0;l<=2*m+1;l++) \
3255  psij_const[2*m+2+l]=(PHI(ths->x[2*j+1]-((R)((u+l)))/n1,1)); \
3256  \
3257  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3258  psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3259  n0, n1, m, my_u0, my_o0); \
3260 }
3261 
3262 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE(whichone) \
3263 { \
3264  if (ths->nfft_flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
3265  { \
3266  _Pragma("omp parallel private(k)") \
3267  { \
3268  int my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
3269  int *ar_x = ths->index_x; \
3270  \
3271  nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
3272  &min_u_b, &max_u_b, 2, ths->n, m); \
3273  \
3274  if (min_u_a != -1) \
3275  { \
3276  k = index_x_binary_search(ar_x, M, min_u_a); \
3277  \
3278  MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A \
3279  \
3280  while (k < M) \
3281  { \
3282  int u_prod = ar_x[2*k]; \
3283  int j = ar_x[2*k+1]; \
3284  \
3285  if (u_prod < min_u_a || u_prod > max_u_a) \
3286  break; \
3287  \
3288  MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
3289  \
3290  k++; \
3291  } \
3292  } \
3293  \
3294  if (min_u_b != -1) \
3295  { \
3296  int k = index_x_binary_search(ar_x, M, min_u_b); \
3297  \
3298  MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B \
3299  \
3300  while (k < M) \
3301  { \
3302  int u_prod = ar_x[2*k]; \
3303  int j = ar_x[2*k+1]; \
3304  \
3305  if (u_prod < min_u_b || u_prod > max_u_b) \
3306  break; \
3307  \
3308  MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
3309  \
3310  k++; \
3311  } \
3312  } \
3313  } /* omp parallel */ \
3314  return; \
3315  } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */ \
3316 }
3317 
3318 
3319 static void nfft_adjoint_2d_B(nfft_plan *ths)
3320 {
3321  const int N0 = ths->N[0];
3322  const int n0 = ths->n[0];
3323  const int N1 = ths->N[1];
3324  const int n1 = ths->n[1];
3325  const int M = ths->M_total;
3326  const int m = ths->m;
3327  C* g = (C*) ths->g;
3328  int k;
3329 
3330  memset(g,0,ths->n_total*sizeof(C));
3331 
3332  if(ths->nfft_flags & PRE_FULL_PSI)
3333  {
3334  nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
3335  2, ths->n, m, ths->nfft_flags, ths->index_x);
3336  return;
3337  } /* if(PRE_FULL_PSI) */
3338 
3339  if(ths->nfft_flags & PRE_PSI)
3340  {
3341 #ifdef _OPENMP
3342  MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE(PRE_PSI)
3343 #endif
3344 
3345  #pragma omp parallel for default(shared) private(k)
3346  for (k = 0; k < M; k++)
3347  {
3348  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3349 #ifdef _OPENMP
3350  nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3351 #else
3352  nfft_adjoint_2d_compute_serial(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3353 #endif
3354  }
3355  return;
3356  } /* if(PRE_PSI) */
3357 
3358  if(ths->nfft_flags & PRE_FG_PSI)
3359  {
3360  R fg_exp_l[2*(2*m+2)];
3361 
3362  nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3363  nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3364 
3365 #ifdef _OPENMP
3366  MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE(PRE_FG_PSI)
3367 #endif
3368 
3369 
3370  #pragma omp parallel for default(shared) private(k)
3371  for (k = 0; k < M; k++)
3372  {
3373  R psij_const[2*(2*m+2)];
3374  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3375  int l;
3376  R fg_psij0 = ths->psi[2*j*2];
3377  R fg_psij1 = ths->psi[2*j*2+1];
3378  R fg_psij2 = K(1.0);
3379 
3380  psij_const[0] = fg_psij0;
3381  for(l=1; l<=2*m+1; l++)
3382  {
3383  fg_psij2 *= fg_psij1;
3384  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3385  }
3386 
3387  fg_psij0 = ths->psi[2*(j*2+1)];
3388  fg_psij1 = ths->psi[2*(j*2+1)+1];
3389  fg_psij2 = K(1.0);
3390  psij_const[2*m+2] = fg_psij0;
3391  for(l=1; l<=2*m+1; l++)
3392  {
3393  fg_psij2 *= fg_psij1;
3394  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3395  }
3396 
3397 #ifdef _OPENMP
3398  nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3399 #else
3400  nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3401 #endif
3402  }
3403 
3404  return;
3405  } /* if(PRE_FG_PSI) */
3406 
3407  if(ths->nfft_flags & FG_PSI)
3408  {
3409  R fg_exp_l[2*(2*m+2)];
3410 
3411  nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3412  nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3413 
3414  nfft_sort_nodes(ths);
3415 
3416 #ifdef _OPENMP
3417  MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE(FG_PSI)
3418 #endif
3419 
3420  #pragma omp parallel for default(shared) private(k)
3421  for (k = 0; k < M; k++)
3422  {
3423  int u, o, l;
3424  R fg_psij0, fg_psij1, fg_psij2;
3425  R psij_const[2*(2*m+2)];
3426  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3427 
3428  nfft_uo(ths,j,&u,&o,0);
3429  fg_psij0 = (PHI(ths->x[2*j]-((R)u)/n0,0));
3430  fg_psij1 = EXP(K(2.0)*(n0*(ths->x[2*j]) - u)/ths->b[0]);
3431  fg_psij2 = K(1.0);
3432  psij_const[0] = fg_psij0;
3433  for(l=1; l<=2*m+1; l++)
3434  {
3435  fg_psij2 *= fg_psij1;
3436  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3437  }
3438 
3439  nfft_uo(ths,j,&u,&o,1);
3440  fg_psij0 = (PHI(ths->x[2*j+1]-((R)u)/n1,1));
3441  fg_psij1 = EXP(K(2.0)*(n1*(ths->x[2*j+1]) - u)/ths->b[1]);
3442  fg_psij2 = K(1.0);
3443  psij_const[2*m+2] = fg_psij0;
3444  for(l=1; l<=2*m+1; l++)
3445  {
3446  fg_psij2 *= fg_psij1;
3447  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3448  }
3449 
3450 #ifdef _OPENMP
3451  nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3452 #else
3453  nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3454 #endif
3455  }
3456 
3457  return;
3458  } /* if(FG_PSI) */
3459 
3460  if(ths->nfft_flags & PRE_LIN_PSI)
3461  {
3462  const int K = ths->K;
3463  const int ip_s = K / (m + 2);
3464 
3465  nfft_sort_nodes(ths);
3466 
3467 #ifdef _OPENMP
3468  MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
3469 #endif
3470 
3471  #pragma openmp parallel for default(shared) private(k)
3472  for (k = 0; k < M; k++)
3473  {
3474  int u,o,l;
3475  int ip_u;
3476  R ip_y, ip_w;
3477  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3478  R psij_const[2*(2*m+2)];
3479 
3480  nfft_uo(ths,j,&u,&o,0);
3481  ip_y = FABS(n0*(ths->x[2*j]) - u)*((R)ip_s);
3482  ip_u = LRINT(FLOOR(ip_y));
3483  ip_w = ip_y-ip_u;
3484  for(l=0; l < 2*m+2; l++)
3485  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
3486  ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
3487 
3488  nfft_uo(ths,j,&u,&o,1);
3489  ip_y = FABS(n1*(ths->x[2*j+1]) - u)*((R)ip_s);
3490  ip_u = LRINT(FLOOR(ip_y));
3491  ip_w = ip_y-ip_u;
3492  for(l=0; l < 2*m+2; l++)
3493  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
3494  ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
3495 
3496 #ifdef _OPENMP
3497  nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3498 #else
3499  nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3500 #endif
3501  }
3502  return;
3503  } /* if(PRE_LIN_PSI) */
3504 
3505  /* no precomputed psi at all */
3506  nfft_sort_nodes(ths);
3507 
3508 #ifdef _OPENMP
3509  MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE(NO_PSI)
3510 #endif
3511 
3512  #pragma omp parallel for default(shared) private(k)
3513  for (k = 0; k < M; k++)
3514  {
3515  int u,o,l;
3516  R psij_const[2*(2*m+2)];
3517  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3518 
3519  nfft_uo(ths,j,&u,&o,0);
3520  for(l=0;l<=2*m+1;l++)
3521  psij_const[l]=(PHI(ths->x[2*j]-((R)((u+l)))/n0,0));
3522 
3523  nfft_uo(ths,j,&u,&o,1);
3524  for(l=0;l<=2*m+1;l++)
3525  psij_const[2*m+2+l]=(PHI(ths->x[2*j+1]-((R)((u+l)))/n1,1));
3526 
3527 #ifdef _OPENMP
3528  nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3529 #else
3530  nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3531 #endif
3532  }
3533 }
3534 
3535 
3536 void nfft_trafo_2d(nfft_plan *ths)
3537 {
3538  int k0,k1,n0,n1,N0,N1;
3539  C *g_hat,*f_hat;
3540  R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
3541  R ck01, ck02, ck11, ck12;
3542  C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
3543 
3544  ths->g_hat=ths->g1;
3545  ths->g=ths->g2;
3546 
3547  N0=ths->N[0];
3548  N1=ths->N[1];
3549  n0=ths->n[0];
3550  n1=ths->n[1];
3551 
3552  f_hat=(C*)ths->f_hat;
3553  g_hat=(C*)ths->g_hat;
3554 
3555  TIC(0)
3556 #ifdef _OPENMP
3557  #pragma omp parallel for default(shared) private(k0)
3558  for (k0 = 0; k0 < ths->n_total; k0++)
3559  ths->g_hat[k0] = 0.0;
3560 #else
3561  memset(ths->g_hat,0,ths->n_total*sizeof(C));
3562 #endif
3563  if(ths->nfft_flags & PRE_PHI_HUT)
3564  {
3565  c_phi_inv01=ths->c_phi_inv[0];
3566  c_phi_inv02=&ths->c_phi_inv[0][N0/2];
3567 
3568  #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12)
3569  for(k0=0;k0<N0/2;k0++)
3570  {
3571  ck01=c_phi_inv01[k0];
3572  ck02=c_phi_inv02[k0];
3573 
3574  c_phi_inv11=ths->c_phi_inv[1];
3575  c_phi_inv12=&ths->c_phi_inv[1][N1/2];
3576 
3577  g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
3578  f_hat11=f_hat + k0*N1;
3579  g_hat21=g_hat + k0*n1+n1-(N1/2);
3580  f_hat21=f_hat + ((N0/2)+k0)*N1;
3581  g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
3582  f_hat12=f_hat + k0*N1+(N1/2);
3583  g_hat22=g_hat + k0*n1;
3584  f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
3585 
3586  for(k1=0;k1<N1/2;k1++)
3587  {
3588  ck11=c_phi_inv11[k1];
3589  ck12=c_phi_inv12[k1];
3590 
3591  g_hat11[k1] = f_hat11[k1] * ck01 * ck11;
3592  g_hat21[k1] = f_hat21[k1] * ck02 * ck11;
3593  g_hat12[k1] = f_hat12[k1] * ck01 * ck12;
3594  g_hat22[k1] = f_hat22[k1] * ck02 * ck12;
3595  }
3596  }
3597  }
3598  else
3599  #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12)
3600  for(k0=0;k0<N0/2;k0++)
3601  {
3602  ck01=K(1.0)/(PHI_HUT(k0-N0/2,0));
3603  ck02=K(1.0)/(PHI_HUT(k0,0));
3604  for(k1=0;k1<N1/2;k1++)
3605  {
3606  ck11=K(1.0)/(PHI_HUT(k1-N1/2,1));
3607  ck12=K(1.0)/(PHI_HUT(k1,1));
3608  g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] = f_hat[k0*N1+k1] * ck01 * ck11;
3609  g_hat[k0*n1+n1-N1/2+k1] = f_hat[(N0/2+k0)*N1+k1] * ck02 * ck11;
3610  g_hat[(n0-N0/2+k0)*n1+k1] = f_hat[k0*N1+N1/2+k1] * ck01 * ck12;
3611  g_hat[k0*n1+k1] = f_hat[(N0/2+k0)*N1+N1/2+k1] * ck02 * ck12;
3612  }
3613  }
3614 
3615  TOC(0)
3616 
3617  TIC_FFTW(1)
3618  fftw_execute(ths->my_fftw_plan1);
3619  TOC_FFTW(1);
3620 
3621  TIC(2);
3622  nfft_trafo_2d_B(ths);
3623  TOC(2);
3624 }
3625 
3626 void nfft_adjoint_2d(nfft_plan *ths)
3627 {
3628  int k0,k1,n0,n1,N0,N1;
3629  C *g_hat,*f_hat;
3630  R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
3631  R ck01, ck02, ck11, ck12;
3632  C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
3633 
3634  ths->g_hat=ths->g1;
3635  ths->g=ths->g2;
3636 
3637  N0=ths->N[0];
3638  N1=ths->N[1];
3639  n0=ths->n[0];
3640  n1=ths->n[1];
3641 
3642  f_hat=(C*)ths->f_hat;
3643  g_hat=(C*)ths->g_hat;
3644 
3645  TIC(2);
3646  nfft_adjoint_2d_B(ths);
3647  TOC(2);
3648 
3649  TIC_FFTW(1)
3650  fftw_execute(ths->my_fftw_plan2);
3651  TOC_FFTW(1);
3652 
3653  TIC(0)
3654  if(ths->nfft_flags & PRE_PHI_HUT)
3655  {
3656  c_phi_inv01=ths->c_phi_inv[0];
3657  c_phi_inv02=&ths->c_phi_inv[0][N0/2];
3658 
3659  #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12)
3660  for(k0=0;k0<N0/2;k0++)
3661  {
3662  ck01=c_phi_inv01[k0];
3663  ck02=c_phi_inv02[k0];
3664 
3665  c_phi_inv11=ths->c_phi_inv[1];
3666  c_phi_inv12=&ths->c_phi_inv[1][N1/2];
3667 
3668  g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
3669  f_hat11=f_hat + k0*N1;
3670  g_hat21=g_hat + k0*n1+n1-(N1/2);
3671  f_hat21=f_hat + ((N0/2)+k0)*N1;
3672  g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
3673  f_hat12=f_hat + k0*N1+(N1/2);
3674  g_hat22=g_hat + k0*n1;
3675  f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
3676 
3677  for(k1=0;k1<N1/2;k1++)
3678  {
3679  ck11=c_phi_inv11[k1];
3680  ck12=c_phi_inv12[k1];
3681 
3682  f_hat11[k1] = g_hat11[k1] * ck01 * ck11;
3683  f_hat21[k1] = g_hat21[k1] * ck02 * ck11;
3684  f_hat12[k1] = g_hat12[k1] * ck01 * ck12;
3685  f_hat22[k1] = g_hat22[k1] * ck02 * ck12;
3686  }
3687  }
3688  }
3689  else
3690  #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12)
3691  for(k0=0;k0<N0/2;k0++)
3692  {
3693  ck01=K(1.0)/(PHI_HUT(k0-N0/2,0));
3694  ck02=K(1.0)/(PHI_HUT(k0,0));
3695  for(k1=0;k1<N1/2;k1++)
3696  {
3697  ck11=K(1.0)/(PHI_HUT(k1-N1/2,1));
3698  ck12=K(1.0)/(PHI_HUT(k1,1));
3699  f_hat[k0*N1+k1] = g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] * ck01 * ck11;
3700  f_hat[(N0/2+k0)*N1+k1] = g_hat[k0*n1+n1-N1/2+k1] * ck02 * ck11;
3701  f_hat[k0*N1+N1/2+k1] = g_hat[(n0-N0/2+k0)*n1+k1] * ck01 * ck12;
3702  f_hat[(N0/2+k0)*N1+N1/2+k1] = g_hat[k0*n1+k1] * ck02 * ck12;
3703  }
3704  }
3705  TOC(0)
3706 }
3707 
3708 /* ############################################################ SPECIFIC VERSIONS FOR d=3 */
3709 
3710 static void nfft_3d_init_fg_exp_l(R *fg_exp_l, const int m, const R b)
3711 {
3712  int l;
3713  R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
3714 
3715  fg_exp_b0 = EXP(-1.0/b);
3716  fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
3717  fg_exp_b1 = K(1.0);
3718  fg_exp_b2 = K(1.0);
3719  fg_exp_l[0] = K(1.0);
3720  for(l=1; l <= 2*m+1; l++)
3721  {
3722  fg_exp_b2 = fg_exp_b1*fg_exp_b0;
3723  fg_exp_b1 *= fg_exp_b0_sq;
3724  fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
3725  }
3726 }
3727 
3728 static void nfft_trafo_3d_compute(C *fj, const C *g,
3729  const R *psij_const0, const R *psij_const1, const R *psij_const2,
3730  const R *xj0, const R *xj1, const R *xj2,
3731  const int n0, const int n1, const int n2, const int m)
3732 {
3733  int u0,o0,l0,u1,o1,l1,u2,o2,l2;
3734  const C *gj;
3735  const R *psij0,*psij1,*psij2;
3736 
3737  psij0=psij_const0;
3738  psij1=psij_const1;
3739  psij2=psij_const2;
3740 
3741  nfft_uo2(&u0,&o0,*xj0, n0, m);
3742  nfft_uo2(&u1,&o1,*xj1, n1, m);
3743  nfft_uo2(&u2,&o2,*xj2, n2, m);
3744 
3745  *fj=0;
3746 
3747  if(u0<o0)
3748  if(u1<o1)
3749  if(u2<o2)
3750  for(l0=0; l0<=2*m+1; l0++,psij0++)
3751  {
3752  psij1=psij_const1;
3753  for(l1=0; l1<=2*m+1; l1++,psij1++)
3754  {
3755  psij2=psij_const2;
3756  gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
3757  for(l2=0; l2<=2*m+1; l2++)
3758  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3759  }
3760  }
3761  else/* asserts (u2>o2)*/
3762  for(l0=0; l0<=2*m+1; l0++,psij0++)
3763  {
3764  psij1=psij_const1;
3765  for(l1=0; l1<=2*m+1; l1++,psij1++)
3766  {
3767  psij2=psij_const2;
3768  gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
3769  for(l2=0; l2<2*m+1-o2; l2++)
3770  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3771  gj=g+((u0+l0)*n1+(u1+l1))*n2;
3772  for(l2=0; l2<=o2; l2++)
3773  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3774  }
3775  }
3776  else/* asserts (u1>o1)*/
3777  if(u2<o2)
3778  for(l0=0; l0<=2*m+1; l0++,psij0++)
3779  {
3780  psij1=psij_const1;
3781  for(l1=0; l1<2*m+1-o1; l1++,psij1++)
3782  {
3783  psij2=psij_const2;
3784  gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
3785  for(l2=0; l2<=2*m+1; l2++)
3786  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3787  }
3788  for(l1=0; l1<=o1; l1++,psij1++)
3789  {
3790  psij2=psij_const2;
3791  gj=g+((u0+l0)*n1+l1)*n2+u2;
3792  for(l2=0; l2<=2*m+1; l2++)
3793  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3794  }
3795  }
3796  else/* asserts (u2>o2) */
3797  {
3798  for(l0=0; l0<=2*m+1; l0++,psij0++)
3799  {
3800  psij1=psij_const1;
3801  for(l1=0; l1<2*m+1-o1; l1++,psij1++)
3802  {
3803  psij2=psij_const2;
3804  gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
3805  for(l2=0; l2<2*m+1-o2; l2++)
3806  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3807  gj=g+((u0+l0)*n1+(u1+l1))*n2;
3808  for(l2=0; l2<=o2; l2++)
3809  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3810  }
3811  for(l1=0; l1<=o1; l1++,psij1++)
3812  {
3813  psij2=psij_const2;
3814  gj=g+((u0+l0)*n1+l1)*n2+u2;
3815  for(l2=0; l2<2*m+1-o2; l2++)
3816  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3817  gj=g+((u0+l0)*n1+l1)*n2;
3818  for(l2=0; l2<=o2; l2++)
3819  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3820  }
3821  }
3822  }
3823  else/* asserts (u0>o0) */
3824  if(u1<o1)
3825  if(u2<o2)
3826  {
3827  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
3828  {
3829  psij1=psij_const1;
3830  for(l1=0; l1<=2*m+1; l1++,psij1++)
3831  {
3832  psij2=psij_const2;
3833  gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
3834  for(l2=0; l2<=2*m+1; l2++)
3835  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3836  }
3837  }
3838 
3839  for(l0=0; l0<=o0; l0++,psij0++)
3840  {
3841  psij1=psij_const1;
3842  for(l1=0; l1<=2*m+1; l1++,psij1++)
3843  {
3844  psij2=psij_const2;
3845  gj=g+(l0*n1+(u1+l1))*n2+u2;
3846  for(l2=0; l2<=2*m+1; l2++)
3847  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3848  }
3849  }
3850  }
3851  else/* asserts (u2>o2) */
3852  {
3853  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
3854  {
3855  psij1=psij_const1;
3856  for(l1=0; l1<=2*m+1; l1++,psij1++)
3857  {
3858  psij2=psij_const2;
3859  gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
3860  for(l2=0; l2<2*m+1-o2; l2++)
3861  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3862  gj=g+((u0+l0)*n1+(u1+l1))*n2;
3863  for(l2=0; l2<=o2; l2++)
3864  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3865  }
3866  }
3867 
3868  for(l0=0; l0<=o0; l0++,psij0++)
3869  {
3870  psij1=psij_const1;
3871  for(l1=0; l1<=2*m+1; l1++,psij1++)
3872  {
3873  psij2=psij_const2;
3874  gj=g+(l0*n1+(u1+l1))*n2+u2;
3875  for(l2=0; l2<2*m+1-o2; l2++)
3876  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3877  gj=g+(l0*n1+(u1+l1))*n2;
3878  for(l2=0; l2<=o2; l2++)
3879  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3880  }
3881  }
3882  }
3883  else/* asserts (u1>o1) */
3884  if(u2<o2)
3885  {
3886  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
3887  {
3888  psij1=psij_const1;
3889  for(l1=0; l1<2*m+1-o1; l1++,psij1++)
3890  {
3891  psij2=psij_const2;
3892  gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
3893  for(l2=0; l2<=2*m+1; l2++)
3894  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3895  }
3896  for(l1=0; l1<=o1; l1++,psij1++)
3897  {
3898  psij2=psij_const2;
3899  gj=g+((u0+l0)*n1+l1)*n2+u2;
3900  for(l2=0; l2<=2*m+1; l2++)
3901  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3902  }
3903  }
3904  for(l0=0; l0<=o0; l0++,psij0++)
3905  {
3906  psij1=psij_const1;
3907  for(l1=0; l1<2*m+1-o1; l1++,psij1++)
3908  {
3909  psij2=psij_const2;
3910  gj=g+(l0*n1+(u1+l1))*n2+u2;
3911  for(l2=0; l2<=2*m+1; l2++)
3912  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3913  }
3914  for(l1=0; l1<=o1; l1++,psij1++)
3915  {
3916  psij2=psij_const2;
3917  gj=g+(l0*n1+l1)*n2+u2;
3918  for(l2=0; l2<=2*m+1; l2++)
3919  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3920  }
3921  }
3922  }
3923  else/* asserts (u2>o2) */
3924  {
3925  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
3926  {
3927  psij1=psij_const1;
3928  for(l1=0; l1<2*m+1-o1; l1++,psij1++)
3929  {
3930  psij2=psij_const2;
3931  gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
3932  for(l2=0; l2<2*m+1-o2; l2++)
3933  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3934  gj=g+((u0+l0)*n1+(u1+l1))*n2;
3935  for(l2=0; l2<=o2; l2++)
3936  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3937  }
3938  for(l1=0; l1<=o1; l1++,psij1++)
3939  {
3940  psij2=psij_const2;
3941  gj=g+((u0+l0)*n1+l1)*n2+u2;
3942  for(l2=0; l2<2*m+1-o2; l2++)
3943  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3944  gj=g+((u0+l0)*n1+l1)*n2;
3945  for(l2=0; l2<=o2; l2++)
3946  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3947  }
3948  }
3949 
3950  for(l0=0; l0<=o0; l0++,psij0++)
3951  {
3952  psij1=psij_const1;
3953  for(l1=0; l1<2*m+1-o1; l1++,psij1++)
3954  {
3955  psij2=psij_const2;
3956  gj=g+(l0*n1+(u1+l1))*n2+u2;
3957  for(l2=0; l2<2*m+1-o2; l2++)
3958  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3959  gj=g+(l0*n1+(u1+l1))*n2;
3960  for(l2=0; l2<=o2; l2++)
3961  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3962  }
3963  for(l1=0; l1<=o1; l1++,psij1++)
3964  {
3965  psij2=psij_const2;
3966  gj=g+(l0*n1+l1)*n2+u2;
3967  for(l2=0; l2<2*m+1-o2; l2++)
3968  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3969  gj=g+(l0*n1+l1)*n2;
3970  for(l2=0; l2<=o2; l2++)
3971  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3972  }
3973  }
3974  }
3975 }
3976 
3977 #ifdef _OPENMP
3978 
3999 static void nfft_adjoint_3d_compute_omp_blockwise(const C f, C *g,
4000  const R *psij_const0, const R *psij_const1, const R *psij_const2,
4001  const R *xj0, const R *xj1, const R *xj2,
4002  const int n0, const int n1, const int n2, const int m,
4003  const int my_u0, const int my_o0)
4004 {
4005  int ar_u0,ar_o0,l0,u1,o1,l1,u2,o2,l2;
4006  const int lprod = (2*m+2) * (2*m+2) * (2*m+2);
4007 
4008  unsigned long int index_temp1[2*m+2];
4009  unsigned long int index_temp2[2*m+2];
4010 
4011  nfft_uo2(&ar_u0,&ar_o0,*xj0, n0, m);
4012  nfft_uo2(&u1,&o1,*xj1, n1, m);
4013  nfft_uo2(&u2,&o2,*xj2, n2, m);
4014 
4015  for (l1=0; l1<=2*m+1; l1++)
4016  index_temp1[l1] = (u1+l1)%n1;
4017 
4018  for (l2=0; l2<=2*m+1; l2++)
4019  index_temp2[l2] = (u2+l2)%n2;
4020 
4021  if(ar_u0<ar_o0)
4022  {
4023  int u0 = MAX(my_u0,ar_u0);
4024  int o0 = MIN(my_o0,ar_o0);
4025  int offset_psij = u0-ar_u0;
4026 #ifdef OMP_ASSERT
4027  assert(offset_psij >= 0);
4028  assert(o0-u0 <= 2*m+1);
4029  assert(offset_psij+o0-u0 <= 2*m+1);
4030 #endif
4031 
4032  for (l0 = 0; l0 <= o0-u0; l0++)
4033  {
4034  const unsigned long int i0 = (u0+l0) * n1;
4035  const C val0 = psij_const0[offset_psij+l0];
4036 
4037  for(l1=0; l1<=2*m+1; l1++)
4038  {
4039  const unsigned long int i1 = (i0 + index_temp1[l1]) * n2;
4040  const C val1 = psij_const1[l1];
4041 
4042  for(l2=0; l2<=2*m+1; l2++)
4043  g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4044  }
4045  }
4046  }
4047  else
4048  {
4049  int u0 = MAX(my_u0,ar_u0);
4050  int o0 = my_o0;
4051  int offset_psij = u0-ar_u0;
4052 #ifdef OMP_ASSERT
4053  assert(offset_psij >= 0);
4054  assert(o0-u0 <= 2*m+1);
4055  assert(offset_psij+o0-u0 <= 2*m+1);
4056 #endif
4057 
4058  for (l0 = 0; l0 <= o0-u0; l0++)
4059  {
4060  unsigned long int i0 = (u0+l0) * n1;
4061  const C val0 = psij_const0[offset_psij+l0];
4062 
4063  for(l1=0; l1<=2*m+1; l1++)
4064  {
4065  const unsigned long int i1 = (i0 + index_temp1[l1]) * n2;
4066  const C val1 = psij_const1[l1];
4067 
4068  for(l2=0; l2<=2*m+1; l2++)
4069  g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4070  }
4071  }
4072 
4073  u0 = my_u0;
4074  o0 = MIN(my_o0,ar_o0);
4075  offset_psij += my_u0-ar_u0+n0;
4076 
4077 #ifdef OMP_ASSERT
4078  if (u0<=o0)
4079  {
4080  assert(o0-u0 <= 2*m+1);
4081  assert(offset_psij+o0-u0 <= 2*m+1);
4082  }
4083 #endif
4084  for (l0 = 0; l0 <= o0-u0; l0++)
4085  {
4086  unsigned long int i0 = (u0+l0) * n1;
4087  const C val0 = psij_const0[offset_psij+l0];
4088 
4089  for(l1=0; l1<=2*m+1; l1++)
4090  {
4091  const unsigned long int i1 = (i0 + index_temp1[l1]) * n2;
4092  const C val1 = psij_const1[l1];
4093 
4094  for(l2=0; l2<=2*m+1; l2++)
4095  g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4096  }
4097  }
4098  }
4099 }
4100 #endif
4101 
4102 #ifdef _OPENMP
4103 /* adjoint NFFT three-dimensional case with OpenMP atomic operations */
4104 static void nfft_adjoint_3d_compute_omp_atomic(const C f, C *g,
4105  const R *psij_const0, const R *psij_const1, const R *psij_const2,
4106  const R *xj0, const R *xj1, const R *xj2,
4107  const int n0, const int n1, const int n2, const int m)
4108 {
4109  int u0,o0,l0,u1,o1,l1,u2,o2,l2;
4110  const int lprod = (2*m+2) * (2*m+2) * (2*m+2);
4111 
4112  unsigned long int index_temp0[2*m+2];
4113  unsigned long int index_temp1[2*m+2];
4114  unsigned long int index_temp2[2*m+2];
4115 
4116  nfft_uo2(&u0,&o0,*xj0, n0, m);
4117  nfft_uo2(&u1,&o1,*xj1, n1, m);
4118  nfft_uo2(&u2,&o2,*xj2, n2, m);
4119 
4120  for (l0=0; l0<=2*m+1; l0++)
4121  index_temp0[l0] = (u0+l0)%n0;
4122 
4123  for (l1=0; l1<=2*m+1; l1++)
4124  index_temp1[l1] = (u1+l1)%n1;
4125 
4126  for (l2=0; l2<=2*m+1; l2++)
4127  index_temp2[l2] = (u2+l2)%n2;
4128 
4129  for(l0=0; l0<=2*m+1; l0++)
4130  {
4131  for(l1=0; l1<=2*m+1; l1++)
4132  {
4133  for(l2=0; l2<=2*m+1; l2++)
4134  {
4135  unsigned long int i = (index_temp0[l0] * n1 + index_temp1[l1]) * n2 + index_temp2[l2];
4136  C *lhs = g+i;
4137  R *lhs_real = (R*)lhs;
4138  C val = psij_const0[l0] * psij_const1[l1] * psij_const2[l2] * f;
4139 
4140  #pragma omp atomic
4141  lhs_real[0] += creal(val);
4142 
4143  #pragma omp atomic
4144  lhs_real[1] += cimag(val);
4145  }
4146  }
4147  }
4148 }
4149 #endif
4150 
4151 #ifndef _OPENMP
4152 static void nfft_adjoint_3d_compute_serial(const C *fj, C *g,
4153  const R *psij_const0, const R *psij_const1, const R *psij_const2,
4154  const R *xj0, const R *xj1, const R *xj2,
4155  const int n0, const int n1, const int n2, const int m)
4156 {
4157  int u0,o0,l0,u1,o1,l1,u2,o2,l2;
4158  C *gj;
4159  const R *psij0,*psij1,*psij2;
4160 
4161  psij0=psij_const0;
4162  psij1=psij_const1;
4163  psij2=psij_const2;
4164 
4165  nfft_uo2(&u0,&o0,*xj0, n0, m);
4166  nfft_uo2(&u1,&o1,*xj1, n1, m);
4167  nfft_uo2(&u2,&o2,*xj2, n2, m);
4168 
4169  if(u0<o0)
4170  if(u1<o1)
4171  if(u2<o2)
4172  for(l0=0; l0<=2*m+1; l0++,psij0++)
4173  {
4174  psij1=psij_const1;
4175  for(l1=0; l1<=2*m+1; l1++,psij1++)
4176  {
4177  psij2=psij_const2;
4178  gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
4179  for(l2=0; l2<=2*m+1; l2++)
4180  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4181  }
4182  }
4183  else/* asserts (u2>o2)*/
4184  for(l0=0; l0<=2*m+1; l0++,psij0++)
4185  {
4186  psij1=psij_const1;
4187  for(l1=0; l1<=2*m+1; l1++,psij1++)
4188  {
4189  psij2=psij_const2;
4190  gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
4191  for(l2=0; l2<2*m+1-o2; l2++)
4192  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4193  gj=g+((u0+l0)*n1+(u1+l1))*n2;
4194  for(l2=0; l2<=o2; l2++)
4195  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4196  }
4197  }
4198  else/* asserts (u1>o1)*/
4199  if(u2<o2)
4200  for(l0=0; l0<=2*m+1; l0++,psij0++)
4201  {
4202  psij1=psij_const1;
4203  for(l1=0; l1<2*m+1-o1; l1++,psij1++)
4204  {
4205  psij2=psij_const2;
4206  gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
4207  for(l2=0; l2<=2*m+1; l2++)
4208  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4209  }
4210  for(l1=0; l1<=o1; l1++,psij1++)
4211  {
4212  psij2=psij_const2;
4213  gj=g+((u0+l0)*n1+l1)*n2+u2;
4214  for(l2=0; l2<=2*m+1; l2++)
4215  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4216  }
4217  }
4218  else/* asserts (u2>o2) */
4219  {
4220  for(l0=0; l0<=2*m+1; l0++,psij0++)
4221  {
4222  psij1=psij_const1;
4223  for(l1=0; l1<2*m+1-o1; l1++,psij1++)
4224  {
4225  psij2=psij_const2;
4226  gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
4227  for(l2=0; l2<2*m+1-o2; l2++)
4228  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4229  gj=g+((u0+l0)*n1+(u1+l1))*n2;
4230  for(l2=0; l2<=o2; l2++)
4231  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4232  }
4233  for(l1=0; l1<=o1; l1++,psij1++)
4234  {
4235  psij2=psij_const2;
4236  gj=g+((u0+l0)*n1+l1)*n2+u2;
4237  for(l2=0; l2<2*m+1-o2; l2++)
4238  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4239  gj=g+((u0+l0)*n1+l1)*n2;
4240  for(l2=0; l2<=o2; l2++)
4241  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4242  }
4243  }
4244  }
4245  else/* asserts (u0>o0) */
4246  if(u1<o1)
4247  if(u2<o2)
4248  {
4249  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
4250  {
4251  psij1=psij_const1;
4252  for(l1=0; l1<=2*m+1; l1++,psij1++)
4253  {
4254  psij2=psij_const2;
4255  gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
4256  for(l2=0; l2<=2*m+1; l2++)
4257  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4258  }
4259  }
4260 
4261  for(l0=0; l0<=o0; l0++,psij0++)
4262  {
4263  psij1=psij_const1;
4264  for(l1=0; l1<=2*m+1; l1++,psij1++)
4265  {
4266  psij2=psij_const2;
4267  gj=g+(l0*n1+(u1+l1))*n2+u2;
4268  for(l2=0; l2<=2*m+1; l2++)
4269  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4270  }
4271  }
4272  }
4273  else/* asserts (u2>o2) */
4274  {
4275  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
4276  {
4277  psij1=psij_const1;
4278  for(l1=0; l1<=2*m+1; l1++,psij1++)
4279  {
4280  psij2=psij_const2;
4281  gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
4282  for(l2=0; l2<2*m+1-o2; l2++)
4283  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4284  gj=g+((u0+l0)*n1+(u1+l1))*n2;
4285  for(l2=0; l2<=o2; l2++)
4286  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4287  }
4288  }
4289 
4290  for(l0=0; l0<=o0; l0++,psij0++)
4291  {
4292  psij1=psij_const1;
4293  for(l1=0; l1<=2*m+1; l1++,psij1++)
4294  {
4295  psij2=psij_const2;
4296  gj=g+(l0*n1+(u1+l1))*n2+u2;
4297  for(l2=0; l2<2*m+1-o2; l2++)
4298  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4299  gj=g+(l0*n1+(u1+l1))*n2;
4300  for(l2=0; l2<=o2; l2++)
4301  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4302  }
4303  }
4304  }
4305  else/* asserts (u1>o1) */
4306  if(u2<o2)
4307  {
4308  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
4309  {
4310  psij1=psij_const1;
4311  for(l1=0; l1<2*m+1-o1; l1++,psij1++)
4312  {
4313  psij2=psij_const2;
4314  gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
4315  for(l2=0; l2<=2*m+1; l2++)
4316  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4317  }
4318  for(l1=0; l1<=o1; l1++,psij1++)
4319  {
4320  psij2=psij_const2;
4321  gj=g+((u0+l0)*n1+l1)*n2+u2;
4322  for(l2=0; l2<=2*m+1; l2++)
4323  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4324  }
4325  }
4326  for(l0=0; l0<=o0; l0++,psij0++)
4327  {
4328  psij1=psij_const1;
4329  for(l1=0; l1<2*m+1-o1; l1++,psij1++)
4330  {
4331  psij2=psij_const2;
4332  gj=g+(l0*n1+(u1+l1))*n2+u2;
4333  for(l2=0; l2<=2*m+1; l2++)
4334  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4335  }
4336  for(l1=0; l1<=o1; l1++,psij1++)
4337  {
4338  psij2=psij_const2;
4339  gj=g+(l0*n1+l1)*n2+u2;
4340  for(l2=0; l2<=2*m+1; l2++)
4341  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4342  }
4343  }
4344  }
4345  else/* asserts (u2>o2) */
4346  {
4347  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
4348  {
4349  psij1=psij_const1;
4350  for(l1=0; l1<2*m+1-o1; l1++,psij1++)
4351  {
4352  psij2=psij_const2;
4353  gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
4354  for(l2=0; l2<2*m+1-o2; l2++)
4355  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4356  gj=g+((u0+l0)*n1+(u1+l1))*n2;
4357  for(l2=0; l2<=o2; l2++)
4358  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4359  }
4360  for(l1=0; l1<=o1; l1++,psij1++)
4361  {
4362  psij2=psij_const2;
4363  gj=g+((u0+l0)*n1+l1)*n2+u2;
4364  for(l2=0; l2<2*m+1-o2; l2++)
4365  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4366  gj=g+((u0+l0)*n1+l1)*n2;
4367  for(l2=0; l2<=o2; l2++)
4368  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4369  }
4370  }
4371 
4372  for(l0=0; l0<=o0; l0++,psij0++)
4373  {
4374  psij1=psij_const1;
4375  for(l1=0; l1<2*m+1-o1; l1++,psij1++)
4376  {
4377  psij2=psij_const2;
4378  gj=g+(l0*n1+(u1+l1))*n2+u2;
4379  for(l2=0; l2<2*m+1-o2; l2++)
4380  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4381  gj=g+(l0*n1+(u1+l1))*n2;
4382  for(l2=0; l2<=o2; l2++)
4383  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4384  }
4385  for(l1=0; l1<=o1; l1++,psij1++)
4386  {
4387  psij2=psij_const2;
4388  gj=g+(l0*n1+l1)*n2+u2;
4389  for(l2=0; l2<2*m+1-o2; l2++)
4390  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4391  gj=g+(l0*n1+l1)*n2;
4392  for(l2=0; l2<=o2; l2++)
4393  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4394  }
4395  }
4396  }
4397 }
4398 #endif
4399 
4400 static void nfft_trafo_3d_B(nfft_plan *ths)
4401 {
4402  const int N0 = ths->N[0];
4403  const int n0 = ths->n[0];
4404  const int N1 = ths->N[1];
4405  const int n1 = ths->n[1];
4406  const int N2 = ths->N[2];
4407  const int n2 = ths->n[2];
4408  const int M = ths->M_total;
4409  const int m = ths->m;
4410 
4411  const C* g = (C*) ths->g;
4412 
4413  int k;
4414 
4415  if(ths->nfft_flags & PRE_FULL_PSI)
4416  {
4417  const int lprod = (2*m+2) * (2*m+2) * (2*m+2);
4418  #pragma omp parallel for default(shared) private(k)
4419  for (k = 0; k < M; k++)
4420  {
4421  int l;
4422  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4423  ths->f[j] = K(0.0);
4424  for (l = 0; l < lprod; l++)
4425  ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]];
4426  }
4427  return;
4428  } /* if(PRE_FULL_PSI) */
4429 
4430  if(ths->nfft_flags & PRE_PSI)
4431  {
4432  #pragma omp parallel for default(shared) private(k)
4433  for (k = 0; k < M; k++)
4434  {
4435  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4436  nfft_trafo_3d_compute(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4437  }
4438  return;
4439  } /* if(PRE_PSI) */
4440 
4441  if(ths->nfft_flags & PRE_FG_PSI)
4442  {
4443  R fg_exp_l[3*(2*m+2)];
4444 
4445  nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4446  nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4447  nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4448 
4449  #pragma omp parallel for default(shared) private(k)
4450  for (k = 0; k < M; k++)
4451  {
4452  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4453  int l;
4454  R psij_const[3*(2*m+2)];
4455  R fg_psij0 = ths->psi[2*j*3];
4456  R fg_psij1 = ths->psi[2*j*3+1];
4457  R fg_psij2 = K(1.0);
4458 
4459  psij_const[0] = fg_psij0;
4460  for(l=1; l<=2*m+1; l++)
4461  {
4462  fg_psij2 *= fg_psij1;
4463  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4464  }
4465 
4466  fg_psij0 = ths->psi[2*(j*3+1)];
4467  fg_psij1 = ths->psi[2*(j*3+1)+1];
4468  fg_psij2 = K(1.0);
4469  psij_const[2*m+2] = fg_psij0;
4470  for(l=1; l<=2*m+1; l++)
4471  {
4472  fg_psij2 *= fg_psij1;
4473  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4474  }
4475 
4476  fg_psij0 = ths->psi[2*(j*3+2)];
4477  fg_psij1 = ths->psi[2*(j*3+2)+1];
4478  fg_psij2 = K(1.0);
4479  psij_const[2*(2*m+2)] = fg_psij0;
4480  for(l=1; l<=2*m+1; l++)
4481  {
4482  fg_psij2 *= fg_psij1;
4483  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4484  }
4485 
4486  nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4487  }
4488 
4489  return;
4490  } /* if(PRE_FG_PSI) */
4491 
4492  if(ths->nfft_flags & FG_PSI)
4493  {
4494  R fg_exp_l[3*(2*m+2)];
4495 
4496  nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4497  nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4498  nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4499 
4500  nfft_sort_nodes(ths);
4501 
4502  #pragma omp parallel for default(shared) private(k)
4503  for (k = 0; k < M; k++)
4504  {
4505  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4506  int u, o, l;
4507  R psij_const[3*(2*m+2)];
4508  R fg_psij0, fg_psij1, fg_psij2;
4509 
4510  nfft_uo(ths,j,&u,&o,0);
4511  fg_psij0 = (PHI(ths->x[3*j]-((R)u)/n0,0));
4512  fg_psij1 = EXP(K(2.0)*(n0*(ths->x[3*j]) - u)/ths->b[0]);
4513  fg_psij2 = K(1.0);
4514  psij_const[0] = fg_psij0;
4515  for(l=1; l<=2*m+1; l++)
4516  {
4517  fg_psij2 *= fg_psij1;
4518  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4519  }
4520 
4521  nfft_uo(ths,j,&u,&o,1);
4522  fg_psij0 = (PHI(ths->x[3*j+1]-((R)u)/n1,1));
4523  fg_psij1 = EXP(K(2.0)*(n1*(ths->x[3*j+1]) - u)/ths->b[1]);
4524  fg_psij2 = K(1.0);
4525  psij_const[2*m+2] = fg_psij0;
4526  for(l=1; l<=2*m+1; l++)
4527  {
4528  fg_psij2 *= fg_psij1;
4529  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4530  }
4531 
4532  nfft_uo(ths,j,&u,&o,2);
4533  fg_psij0 = (PHI(ths->x[3*j+2]-((R)u)/n2,2));
4534  fg_psij1 = EXP(K(2.0)*(n2*(ths->x[3*j+2]) - u)/ths->b[2]);
4535  fg_psij2 = K(1.0);
4536  psij_const[2*(2*m+2)] = fg_psij0;
4537  for(l=1; l<=2*m+1; l++)
4538  {
4539  fg_psij2 *= fg_psij1;
4540  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4541  }
4542 
4543  nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4544  }
4545 
4546  return;
4547  } /* if(FG_PSI) */
4548 
4549  if(ths->nfft_flags & PRE_LIN_PSI)
4550  {
4551  const int K = ths->K, ip_s = K / (m + 2);
4552 
4553  nfft_sort_nodes(ths);
4554 
4555  #pragma omp parallel for default(shared) private(k)
4556  for (k = 0; k < M; k++)
4557  {
4558  int u, o, l;
4559  R ip_y, ip_w;
4560  int ip_u;
4561  R psij_const[3*(2*m+2)];
4562  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4563 
4564  nfft_uo(ths,j,&u,&o,0);
4565  ip_y = FABS(n0*ths->x[3*j+0] - u)*((R)ip_s);
4566  ip_u = LRINT(FLOOR(ip_y));
4567  ip_w = ip_y-ip_u;
4568  for(l=0; l < 2*m+2; l++)
4569  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4570  ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
4571 
4572  nfft_uo(ths,j,&u,&o,1);
4573  ip_y = FABS(n1*ths->x[3*j+1] - u)*((R)ip_s);
4574  ip_u = LRINT(FLOOR(ip_y));
4575  ip_w = ip_y-ip_u;
4576  for(l=0; l < 2*m+2; l++)
4577  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4578  ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
4579 
4580  nfft_uo(ths,j,&u,&o,2);
4581  ip_y = FABS(n2*ths->x[3*j+2] - u)*((R)ip_s);
4582  ip_u = LRINT(FLOOR(ip_y));
4583  ip_w = ip_y-ip_u;
4584  for(l=0; l < 2*m+2; l++)
4585  psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4586  ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
4587 
4588  nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4589  }
4590  return;
4591  } /* if(PRE_LIN_PSI) */
4592 
4593  /* no precomputed psi at all */
4594 
4595  nfft_sort_nodes(ths);
4596 
4597  #pragma omp parallel for default(shared) private(k)
4598  for (k = 0; k < M; k++)
4599  {
4600  R psij_const[3*(2*m+2)];
4601  int u, o, l;
4602  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4603 
4604  nfft_uo(ths,j,&u,&o,0);
4605  for(l=0;l<=2*m+1;l++)
4606  psij_const[l]=(PHI(ths->x[3*j]-((R)((u+l)))/n0,0));
4607 
4608  nfft_uo(ths,j,&u,&o,1);
4609  for(l=0;l<=2*m+1;l++)
4610  psij_const[2*m+2+l]=(PHI(ths->x[3*j+1]-((R)((u+l)))/n1,1));
4611 
4612  nfft_uo(ths,j,&u,&o,2);
4613  for(l=0;l<=2*m+1;l++)
4614  psij_const[2*(2*m+2)+l]=(PHI(ths->x[3*j+2]-((R)((u+l)))/n2,2));
4615 
4616  nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4617  }
4618 }
4619 
4620 #ifdef OMP_ASSERT
4621 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A \
4622 { \
4623  assert(ar_x[2*k] >= min_u_a || k == M-1); \
4624  if (k > 0) \
4625  assert(ar_x[2*k-2] < min_u_a); \
4626 }
4627 #else
4628 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A
4629 #endif
4630 
4631 #ifdef OMP_ASSERT
4632 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B \
4633 { \
4634  assert(ar_x[2*k] >= min_u_b || k == M-1); \
4635  if (k > 0) \
4636  assert(ar_x[2*k-2] < min_u_b); \
4637 }
4638 #else
4639 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B
4640 #endif
4641 
4642 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
4643  nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4644  ths->psi+j*3*(2*m+2), \
4645  ths->psi+(j*3+1)*(2*m+2), \
4646  ths->psi+(j*3+2)*(2*m+2), \
4647  ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4648  n0, n1, n2, m, my_u0, my_o0);
4649 
4650 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
4651 { \
4652  int u, o, l; \
4653  R psij_const[3*(2*m+2)]; \
4654  R fg_psij0 = ths->psi[2*j*3]; \
4655  R fg_psij1 = ths->psi[2*j*3+1]; \
4656  R fg_psij2 = K(1.0); \
4657  \
4658  psij_const[0] = fg_psij0; \
4659  for(l=1; l<=2*m+1; l++) \
4660  { \
4661  fg_psij2 *= fg_psij1; \
4662  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
4663  } \
4664  \
4665  fg_psij0 = ths->psi[2*(j*3+1)]; \
4666  fg_psij1 = ths->psi[2*(j*3+1)+1]; \
4667  fg_psij2 = K(1.0); \
4668  psij_const[2*m+2] = fg_psij0; \
4669  for(l=1; l<=2*m+1; l++) \
4670  { \
4671  fg_psij2 *= fg_psij1; \
4672  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
4673  } \
4674  \
4675  fg_psij0 = ths->psi[2*(j*3+2)]; \
4676  fg_psij1 = ths->psi[2*(j*3+2)+1]; \
4677  fg_psij2 = K(1.0); \
4678  psij_const[2*(2*m+2)] = fg_psij0; \
4679  for(l=1; l<=2*m+1; l++) \
4680  { \
4681  fg_psij2 *= fg_psij1; \
4682  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \
4683  } \
4684  \
4685  nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4686  psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4687  ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4688  n0, n1, n2, m, my_u0, my_o0); \
4689 }
4690 
4691 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
4692 { \
4693  int u, o, l; \
4694  R psij_const[3*(2*m+2)]; \
4695  R fg_psij0, fg_psij1, fg_psij2; \
4696  \
4697  nfft_uo(ths,j,&u,&o,0); \
4698  fg_psij0 = (PHI(ths->x[3*j]-((R)u)/n0,0)); \
4699  fg_psij1 = EXP(K(2.0)*(n0*(ths->x[3*j]) - u)/ths->b[0]); \
4700  fg_psij2 = K(1.0); \
4701  psij_const[0] = fg_psij0; \
4702  for(l=1; l<=2*m+1; l++) \
4703  { \
4704  fg_psij2 *= fg_psij1; \
4705  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
4706  } \
4707  \
4708  nfft_uo(ths,j,&u,&o,1); \
4709  fg_psij0 = (PHI(ths->x[3*j+1]-((R)u)/n1,1)); \
4710  fg_psij1 = EXP(K(2.0)*(n1*(ths->x[3*j+1]) - u)/ths->b[1]); \
4711  fg_psij2 = K(1.0); \
4712  psij_const[2*m+2] = fg_psij0; \
4713  for(l=1; l<=2*m+1; l++) \
4714  { \
4715  fg_psij2 *= fg_psij1; \
4716  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
4717  } \
4718  \
4719  nfft_uo(ths,j,&u,&o,2); \
4720  fg_psij0 = (PHI(ths->x[3*j+2]-((R)u)/n2,2)); \
4721  fg_psij1 = EXP(K(2.0)*(n2*(ths->x[3*j+2]) - u)/ths->b[2]); \
4722  fg_psij2 = K(1.0); \
4723  psij_const[2*(2*m+2)] = fg_psij0; \
4724  for(l=1; l<=2*m+1; l++) \
4725  { \
4726  fg_psij2 *= fg_psij1; \
4727  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \
4728  } \
4729  \
4730  nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4731  psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4732  ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4733  n0, n1, n2, m, my_u0, my_o0); \
4734 }
4735 
4736 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
4737 { \
4738  int u, o, l; \
4739  R psij_const[3*(2*m+2)]; \
4740  int ip_u; \
4741  R ip_y, ip_w; \
4742  \
4743  nfft_uo(ths,j,&u,&o,0); \
4744  ip_y = FABS(n0*ths->x[3*j+0] - u)*((R)ip_s); \
4745  ip_u = LRINT(FLOOR(ip_y)); \
4746  ip_w = ip_y-ip_u; \
4747  for(l=0; l < 2*m+2; l++) \
4748  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
4749  ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); \
4750  \
4751  nfft_uo(ths,j,&u,&o,1); \
4752  ip_y = FABS(n1*ths->x[3*j+1] - u)*((R)ip_s); \
4753  ip_u = LRINT(FLOOR(ip_y)); \
4754  ip_w = ip_y-ip_u; \
4755  for(l=0; l < 2*m+2; l++) \
4756  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
4757  ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
4758  \
4759  nfft_uo(ths,j,&u,&o,2); \
4760  ip_y = FABS(n2*ths->x[3*j+2] - u)*((R)ip_s); \
4761  ip_u = LRINT(FLOOR(ip_y)); \
4762  ip_w = ip_y-ip_u; \
4763  for(l=0; l < 2*m+2; l++) \
4764  psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
4765  ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
4766  \
4767  nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4768  psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4769  ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4770  n0, n1, n2, m, my_u0, my_o0); \
4771 }
4772 
4773 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
4774 { \
4775  int u, o, l; \
4776  R psij_const[3*(2*m+2)]; \
4777  \
4778  nfft_uo(ths,j,&u,&o,0); \
4779  for(l=0;l<=2*m+1;l++) \
4780  psij_const[l]=(PHI(ths->x[3*j]-((R)((u+l)))/n0,0)); \
4781  \
4782  nfft_uo(ths,j,&u,&o,1); \
4783  for(l=0;l<=2*m+1;l++) \
4784  psij_const[2*m+2+l]=(PHI(ths->x[3*j+1]-((R)((u+l)))/n1,1)); \
4785  \
4786  nfft_uo(ths,j,&u,&o,2); \
4787  for(l=0;l<=2*m+1;l++) \
4788  psij_const[2*(2*m+2)+l]=(PHI(ths->x[3*j+2]-((R)((u+l)))/n2,2)); \
4789  \
4790  nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4791  psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4792  ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4793  n0, n1, n2, m, my_u0, my_o0); \
4794 }
4795 
4796 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE(whichone) \
4797 { \
4798  if (ths->nfft_flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
4799  { \
4800  _Pragma("omp parallel private(k)") \
4801  { \
4802  int my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
4803  int *ar_x = ths->index_x; \
4804  \
4805  nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
4806  &min_u_b, &max_u_b, 3, ths->n, m); \
4807  \
4808  if (min_u_a != -1) \
4809  { \
4810  k = index_x_binary_search(ar_x, M, min_u_a); \
4811  \
4812  MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A \
4813  \
4814  while (k < M) \
4815  { \
4816  int u_prod = ar_x[2*k]; \
4817  int j = ar_x[2*k+1]; \
4818  \
4819  if (u_prod < min_u_a || u_prod > max_u_a) \
4820  break; \
4821  \
4822  MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
4823  \
4824  k++; \
4825  } \
4826  } \
4827  \
4828  if (min_u_b != -1) \
4829  { \
4830  int k = index_x_binary_search(ar_x, M, min_u_b); \
4831  \
4832  MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B \
4833  \
4834  while (k < M) \
4835  { \
4836  int u_prod = ar_x[2*k]; \
4837  int j = ar_x[2*k+1]; \
4838  \
4839  if (u_prod < min_u_b || u_prod > max_u_b) \
4840  break; \
4841  \
4842  MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
4843  \
4844  k++; \
4845  } \
4846  } \
4847  } /* omp parallel */ \
4848  return; \
4849  } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */ \
4850 }
4851 
4852 static void nfft_adjoint_3d_B(nfft_plan *ths)
4853 {
4854  int k;
4855  const int N0 = ths->N[0];
4856  const int n0 = ths->n[0];
4857  const int N1 = ths->N[1];
4858  const int n1 = ths->n[1];
4859  const int N2 = ths->N[2];
4860  const int n2 = ths->n[2];
4861  const int M = ths->M_total;
4862  const int m = ths->m;
4863 
4864  C* g = (C*) ths->g;
4865 
4866  memset(g,0,ths->n_total*sizeof(C));
4867 
4868  if(ths->nfft_flags & PRE_FULL_PSI)
4869  {
4870  nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
4871  3, ths->n, m, ths->nfft_flags, ths->index_x);
4872  return;
4873  } /* if(PRE_FULL_PSI) */
4874 
4875  if(ths->nfft_flags & PRE_PSI)
4876  {
4877 #ifdef _OPENMP
4878  MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE(PRE_PSI)
4879 #endif
4880 
4881  #pragma omp parallel for default(shared) private(k)
4882  for (k = 0; k < M; k++)
4883  {
4884  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4885 #ifdef _OPENMP
4886  nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4887 #else
4888  nfft_adjoint_3d_compute_serial(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4889 #endif
4890  }
4891  return;
4892  } /* if(PRE_PSI) */
4893 
4894  if(ths->nfft_flags & PRE_FG_PSI)
4895  {
4896  R fg_exp_l[3*(2*m+2)];
4897 
4898  nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4899  nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4900  nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4901 
4902 #ifdef _OPENMP
4903  MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE(PRE_FG_PSI)
4904 #endif
4905 
4906  #pragma omp parallel for default(shared) private(k)
4907  for (k = 0; k < M; k++)
4908  {
4909  R psij_const[3*(2*m+2)];
4910  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4911  int l;
4912  R fg_psij0 = ths->psi[2*j*3];
4913  R fg_psij1 = ths->psi[2*j*3+1];
4914  R fg_psij2 = K(1.0);
4915 
4916  psij_const[0] = fg_psij0;
4917  for(l=1; l<=2*m+1; l++)
4918  {
4919  fg_psij2 *= fg_psij1;
4920  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4921  }
4922 
4923  fg_psij0 = ths->psi[2*(j*3+1)];
4924  fg_psij1 = ths->psi[2*(j*3+1)+1];
4925  fg_psij2 = K(1.0);
4926  psij_const[2*m+2] = fg_psij0;
4927  for(l=1; l<=2*m+1; l++)
4928  {
4929  fg_psij2 *= fg_psij1;
4930  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4931  }
4932 
4933  fg_psij0 = ths->psi[2*(j*3+2)];
4934  fg_psij1 = ths->psi[2*(j*3+2)+1];
4935  fg_psij2 = K(1.0);
4936  psij_const[2*(2*m+2)] = fg_psij0;
4937  for(l=1; l<=2*m+1; l++)
4938  {
4939  fg_psij2 *= fg_psij1;
4940  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4941  }
4942 
4943 #ifdef _OPENMP
4944  nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4945 #else
4946  nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4947 #endif
4948  }
4949 
4950  return;
4951  } /* if(PRE_FG_PSI) */
4952 
4953  if(ths->nfft_flags & FG_PSI)
4954  {
4955  R fg_exp_l[3*(2*m+2)];
4956 
4957  nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4958  nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4959  nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4960 
4961  nfft_sort_nodes(ths);
4962 
4963 #ifdef _OPENMP
4964  MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE(FG_PSI)
4965 #endif
4966 
4967  #pragma openmp parallel for default(shared) private(k)
4968  for (k = 0; k < M; k++)
4969  {
4970  int u,o,l;
4971  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4972  R psij_const[3*(2*m+2)];
4973  R fg_psij0, fg_psij1, fg_psij2;
4974 
4975  nfft_uo(ths,j,&u,&o,0);
4976  fg_psij0 = (PHI(ths->x[3*j]-((R)u)/n0,0));
4977  fg_psij1 = EXP(K(2.0)*(n0*(ths->x[3*j]) - u)/ths->b[0]);
4978  fg_psij2 = K(1.0);
4979  psij_const[0] = fg_psij0;
4980  for(l=1; l<=2*m+1; l++)
4981  {
4982  fg_psij2 *= fg_psij1;
4983  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4984  }
4985 
4986  nfft_uo(ths,j,&u,&o,1);
4987  fg_psij0 = (PHI(ths->x[3*j+1]-((R)u)/n1,1));
4988  fg_psij1 = EXP(K(2.0)*(n1*(ths->x[3*j+1]) - u)/ths->b[1]);
4989  fg_psij2 = K(1.0);
4990  psij_const[2*m+2] = fg_psij0;
4991  for(l=1; l<=2*m+1; l++)
4992  {
4993  fg_psij2 *= fg_psij1;
4994  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4995  }
4996 
4997  nfft_uo(ths,j,&u,&o,2);
4998  fg_psij0 = (PHI(ths->x[3*j+2]-((R)u)/n2,2));
4999  fg_psij1 = EXP(K(2.0)*(n2*(ths->x[3*j+2]) - u)/ths->b[2]);
5000  fg_psij2 = K(1.0);
5001  psij_const[2*(2*m+2)] = fg_psij0;
5002  for(l=1; l<=2*m+1; l++)
5003  {
5004  fg_psij2 *= fg_psij1;
5005  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
5006  }
5007 
5008 #ifdef _OPENMP
5009  nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5010 #else
5011  nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5012 #endif
5013  }
5014 
5015  return;
5016  } /* if(FG_PSI) */
5017 
5018  if(ths->nfft_flags & PRE_LIN_PSI)
5019  {
5020  const int K = ths->K;
5021  const int ip_s = K / (m + 2);
5022 
5023  nfft_sort_nodes(ths);
5024 
5025 #ifdef _OPENMP
5026  MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
5027 #endif
5028 
5029  #pragma openmp parallel for default(shared) private(k)
5030  for (k = 0; k < M; k++)
5031  {
5032  int u,o,l;
5033  int ip_u;
5034  R ip_y, ip_w;
5035  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5036  R psij_const[3*(2*m+2)];
5037 
5038  nfft_uo(ths,j,&u,&o,0);
5039  ip_y = FABS(n0*ths->x[3*j+0] - u)*((R)ip_s);
5040  ip_u = LRINT(FLOOR(ip_y));
5041  ip_w = ip_y-ip_u;
5042  for(l=0; l < 2*m+2; l++)
5043  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5044  ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
5045 
5046  nfft_uo(ths,j,&u,&o,1);
5047  ip_y = FABS(n1*ths->x[3*j+1] - u)*((R)ip_s);
5048  ip_u = LRINT(FLOOR(ip_y));
5049  ip_w = ip_y-ip_u;
5050  for(l=0; l < 2*m+2; l++)
5051  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5052  ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
5053 
5054  nfft_uo(ths,j,&u,&o,2);
5055  ip_y = FABS(n2*ths->x[3*j+2] - u)*((R)ip_s);
5056  ip_u = LRINT(FLOOR(ip_y));
5057  ip_w = ip_y-ip_u;
5058  for(l=0; l < 2*m+2; l++)
5059  psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5060  ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
5061 
5062 #ifdef _OPENMP
5063  nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5064 #else
5065  nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5066 #endif
5067  }
5068  return;
5069  } /* if(PRE_LIN_PSI) */
5070 
5071  /* no precomputed psi at all */
5072  nfft_sort_nodes(ths);
5073 
5074 #ifdef _OPENMP
5075  MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE(NO_PSI)
5076 #endif
5077 
5078  #pragma omp parallel for default(shared) private(k)
5079  for (k = 0; k < M; k++)
5080  {
5081  int u,o,l;
5082  R psij_const[3*(2*m+2)];
5083  int j = (ths->nfft_flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5084 
5085  nfft_uo(ths,j,&u,&o,0);
5086  for(l=0;l<=2*m+1;l++)
5087  psij_const[l]=(PHI(ths->x[3*j]-((R)((u+l)))/n0,0));
5088 
5089  nfft_uo(ths,j,&u,&o,1);
5090  for(l=0;l<=2*m+1;l++)
5091  psij_const[2*m+2+l]=(PHI(ths->x[3*j+1]-((R)((u+l)))/n1,1));
5092 
5093  nfft_uo(ths,j,&u,&o,2);
5094  for(l=0;l<=2*m+1;l++)
5095  psij_const[2*(2*m+2)+l]=(PHI(ths->x[3*j+2]-((R)((u+l)))/n2,2));
5096 
5097 #ifdef _OPENMP
5098  nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5099 #else
5100  nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5101 #endif
5102  }
5103 }
5104 
5105 
5106 void nfft_trafo_3d(nfft_plan *ths)
5107 {
5108  int k0,k1,k2,n0,n1,n2,N0,N1,N2;
5109  C *g_hat,*f_hat;
5110  R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
5111  R ck01, ck02, ck11, ck12, ck21, ck22;
5112  C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
5113  C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
5114 
5115  ths->g_hat=ths->g1;
5116  ths->g=ths->g2;
5117 
5118  N0=ths->N[0];
5119  N1=ths->N[1];
5120  N2=ths->N[2];
5121  n0=ths->n[0];
5122  n1=ths->n[1];
5123  n2=ths->n[2];
5124 
5125  f_hat=(C*)ths->f_hat;
5126  g_hat=(C*)ths->g_hat;
5127 
5128  TIC(0)
5129 #ifdef _OPENMP
5130  #pragma omp parallel for default(shared) private(k0)
5131  for (k0 = 0; k0 < ths->n_total; k0++)
5132  ths->g_hat[k0] = 0.0;
5133 #else
5134  memset(ths->g_hat,0,ths->n_total*sizeof(C));
5135 #endif
5136 
5137  if(ths->nfft_flags & PRE_PHI_HUT)
5138  {
5139  c_phi_inv01=ths->c_phi_inv[0];
5140  c_phi_inv02=&ths->c_phi_inv[0][N0/2];
5141 
5142  #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22)
5143  for(k0=0;k0<N0/2;k0++)
5144  {
5145  ck01=c_phi_inv01[k0];
5146  ck02=c_phi_inv02[k0];
5147  c_phi_inv11=ths->c_phi_inv[1];
5148  c_phi_inv12=&ths->c_phi_inv[1][N1/2];
5149 
5150  for(k1=0;k1<N1/2;k1++)
5151  {
5152  ck11=c_phi_inv11[k1];
5153  ck12=c_phi_inv12[k1];
5154  c_phi_inv21=ths->c_phi_inv[2];
5155  c_phi_inv22=&ths->c_phi_inv[2][N2/2];
5156 
5157  g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5158  f_hat111=f_hat + (k0*N1+k1)*N2;
5159  g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5160  f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
5161  g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
5162  f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
5163  g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
5164  f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
5165 
5166  g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
5167  f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
5168  g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
5169  f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
5170  g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
5171  f_hat122=f_hat + (k0*N1+N1/2+k1)*N2+(N2/2);
5172  g_hat222=g_hat + (k0*n1+k1)*n2;
5173  f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
5174 
5175  for(k2=0;k2<N2/2;k2++)
5176  {
5177  ck21=c_phi_inv21[k2];
5178  ck22=c_phi_inv22[k2];
5179 
5180  g_hat111[k2] = f_hat111[k2] * ck01 * ck11 * ck21;
5181  g_hat211[k2] = f_hat211[k2] * ck02 * ck11 * ck21;
5182  g_hat121[k2] = f_hat121[k2] * ck01 * ck12 * ck21;
5183  g_hat221[k2] = f_hat221[k2] * ck02 * ck12 * ck21;
5184 
5185  g_hat112[k2] = f_hat112[k2] * ck01 * ck11 * ck22;
5186  g_hat212[k2] = f_hat212[k2] * ck02 * ck11 * ck22;
5187  g_hat122[k2] = f_hat122[k2] * ck01 * ck12 * ck22;
5188  g_hat222[k2] = f_hat222[k2] * ck02 * ck12 * ck22;
5189  }
5190  }
5191  }
5192  }
5193  else
5194  #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22)
5195  for(k0=0;k0<N0/2;k0++)
5196  {
5197  ck01=K(1.0)/(PHI_HUT(k0-N0/2,0));
5198  ck02=K(1.0)/(PHI_HUT(k0,0));
5199  for(k1=0;k1<N1/2;k1++)
5200  {
5201  ck11=K(1.0)/(PHI_HUT(k1-N1/2,1));
5202  ck12=K(1.0)/(PHI_HUT(k1,1));
5203 
5204  for(k2=0;k2<N2/2;k2++)
5205  {
5206  ck21=K(1.0)/(PHI_HUT(k2-N2/2,2));
5207  ck22=K(1.0)/(PHI_HUT(k2,2));
5208 
5209  g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+k1)*N2+k2] * ck01 * ck11 * ck21;
5210  g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[((N0/2+k0)*N1+k1)*N2+k2] * ck02 * ck11 * ck21;
5211  g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+N1/2+k1)*N2+k2] * ck01 * ck12 * ck21;
5212  g_hat[(k0*n1+k1)*n2+n2-N2/2+k2] = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2] * ck02 * ck12 * ck21;
5213 
5214  g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2] = f_hat[(k0*N1+k1)*N2+N2/2+k2] * ck01 * ck11 * ck22;
5215  g_hat[(k0*n1+n1-N1/2+k1)*n2+k2] = f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2] * ck02 * ck11 * ck22;
5216  g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2] = f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2] * ck01 * ck12 * ck22;
5217  g_hat[(k0*n1+k1)*n2+k2] = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] * ck02 * ck12 * ck22;
5218  }
5219  }
5220  }
5221 
5222  TOC(0)
5223 
5224  TIC_FFTW(1)
5225  fftw_execute(ths->my_fftw_plan1);
5226  TOC_FFTW(1);
5227 
5228  TIC(2);
5229  nfft_trafo_3d_B(ths);
5230  TOC(2);
5231 }
5232 
5233 void nfft_adjoint_3d(nfft_plan *ths)
5234 {
5235  int k0,k1,k2,n0,n1,n2,N0,N1,N2;
5236  C *g_hat,*f_hat;
5237  R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
5238  R ck01, ck02, ck11, ck12, ck21, ck22;
5239  C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
5240  C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
5241 
5242  ths->g_hat=ths->g1;
5243  ths->g=ths->g2;
5244 
5245  N0=ths->N[0];
5246  N1=ths->N[1];
5247  N2=ths->N[2];
5248  n0=ths->n[0];
5249  n1=ths->n[1];
5250  n2=ths->n[2];
5251 
5252  f_hat=(C*)ths->f_hat;
5253  g_hat=(C*)ths->g_hat;
5254 
5255  TIC(2);
5256  nfft_adjoint_3d_B(ths);
5257  TOC(2);
5258 
5259  TIC_FFTW(1)
5260  fftw_execute(ths->my_fftw_plan2);
5261  TOC_FFTW(1);
5262 
5263  TIC(0)
5264  if(ths->nfft_flags & PRE_PHI_HUT)
5265  {
5266  c_phi_inv01=ths->c_phi_inv[0];
5267  c_phi_inv02=&ths->c_phi_inv[0][N0/2];
5268 
5269  #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22)
5270  for(k0=0;k0<N0/2;k0++)
5271  {
5272  ck01=c_phi_inv01[k0];
5273  ck02=c_phi_inv02[k0];
5274  c_phi_inv11=ths->c_phi_inv[1];
5275  c_phi_inv12=&ths->c_phi_inv[1][N1/2];
5276 
5277  for(k1=0;k1<N1/2;k1++)
5278  {
5279  ck11=c_phi_inv11[k1];
5280  ck12=c_phi_inv12[k1];
5281  c_phi_inv21=ths->c_phi_inv[2];
5282  c_phi_inv22=&ths->c_phi_inv[2][N2/2];
5283 
5284  g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5285  f_hat111=f_hat + (k0*N1+k1)*N2;
5286  g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5287  f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
5288  g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
5289  f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
5290  g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
5291  f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
5292 
5293  g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
5294  f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
5295  g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
5296  f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
5297  g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
5298  f_hat122=f_hat + (k0*N1+(N1/2)+k1)*N2+(N2/2);
5299  g_hat222=g_hat + (k0*n1+k1)*n2;
5300  f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
5301 
5302  for(k2=0;k2<N2/2;k2++)
5303  {
5304  ck21=c_phi_inv21[k2];
5305  ck22=c_phi_inv22[k2];
5306 
5307  f_hat111[k2] = g_hat111[k2] * ck01 * ck11 * ck21;
5308  f_hat211[k2] = g_hat211[k2] * ck02 * ck11 * ck21;
5309  f_hat121[k2] = g_hat121[k2] * ck01 * ck12 * ck21;
5310  f_hat221[k2] = g_hat221[k2] * ck02 * ck12 * ck21;
5311 
5312  f_hat112[k2] = g_hat112[k2] * ck01 * ck11 * ck22;
5313  f_hat212[k2] = g_hat212[k2] * ck02 * ck11 * ck22;
5314  f_hat122[k2] = g_hat122[k2] * ck01 * ck12 * ck22;
5315  f_hat222[k2] = g_hat222[k2] * ck02 * ck12 * ck22;
5316  }
5317  }
5318  }
5319  }
5320  else
5321  #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22)
5322  for(k0=0;k0<N0/2;k0++)
5323  {
5324  ck01=K(1.0)/(PHI_HUT(k0-N0/2,0));
5325  ck02=K(1.0)/(PHI_HUT(k0,0));
5326  for(k1=0;k1<N1/2;k1++)
5327  {
5328  ck11=K(1.0)/(PHI_HUT(k1-N1/2,1));
5329  ck12=K(1.0)/(PHI_HUT(k1,1));
5330 
5331  for(k2=0;k2<N2/2;k2++)
5332  {
5333  ck21=K(1.0)/(PHI_HUT(k2-N2/2,2));
5334  ck22=K(1.0)/(PHI_HUT(k2,2));
5335 
5336  f_hat[(k0*N1+k1)*N2+k2] = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck01 * ck11 * ck21;
5337  f_hat[((N0/2+k0)*N1+k1)*N2+k2] = g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck02 * ck11 * ck21;
5338  f_hat[(k0*N1+N1/2+k1)*N2+k2] = g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2] * ck01 * ck12 * ck21;
5339  f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2] = g_hat[(k0*n1+k1)*n2+n2-N2/2+k2] * ck02 * ck12 * ck21;
5340 
5341  f_hat[(k0*N1+k1)*N2+N2/2+k2] = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2] * ck01 * ck11 * ck22;
5342  f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2] = g_hat[(k0*n1+n1-N1/2+k1)*n2+k2] * ck02 * ck11 * ck22;
5343  f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2] * ck01 * ck12 * ck22;
5344  f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[(k0*n1+k1)*n2+k2] * ck02 * ck12 * ck22;
5345  }
5346  }
5347  }
5348 
5349  TOC(0)
5350 }
5351 
5355 {
5356  switch(ths->d)
5357  {
5358  case 1: nfft_trafo_1d(ths); break;
5359  case 2: nfft_trafo_2d(ths); break;
5360  case 3: nfft_trafo_3d(ths); break;
5361  default:
5362  /* use ths->my_fftw_plan1 */
5363  ths->g_hat=ths->g1;
5364  ths->g=ths->g2;
5365 
5369  TIC(0)
5370  nfft_D_A(ths);
5371  TOC(0)
5372 
5373 
5377  TIC_FFTW(1)
5378  fftw_execute(ths->my_fftw_plan1);
5379  TOC_FFTW(1)
5380 
5381 
5384  TIC(2)
5385  nfft_B_A(ths);
5386  TOC(2)
5387  }
5388 } /* nfft_trafo */
5389 
5390 void nfft_adjoint(nfft_plan *ths)
5391 {
5392  switch(ths->d)
5393  {
5394  case 1: nfft_adjoint_1d(ths); break;
5395  case 2: nfft_adjoint_2d(ths); break;
5396  case 3: nfft_adjoint_3d(ths); break;
5397  default:
5398  /* use ths->my_fftw_plan2 */
5399  ths->g_hat=ths->g1;
5400  ths->g=ths->g2;
5401 
5405  TIC(2)
5406  nfft_B_T(ths);
5407  TOC(2)
5408 
5413  TIC_FFTW(1)
5414  fftw_execute(ths->my_fftw_plan2);
5415  TOC_FFTW(1)
5416 
5420  TIC(0)
5421  nfft_D_T(ths);
5422  TOC(0)
5423  }
5424 } /* nfft_adjoint */
5425 
5426 
5429 static void nfft_precompute_phi_hut(nfft_plan *ths)
5430 {
5431  int ks[ths->d];
5432  int t;
5434  ths->c_phi_inv = (R**) nfft_malloc(ths->d*sizeof(R*));
5435 
5436  for(t=0; t<ths->d; t++)
5437  {
5438  ths->c_phi_inv[t]= (R*)nfft_malloc(ths->N[t]*sizeof(R));
5439  for(ks[t]=0; ks[t]<ths->N[t]; ks[t]++)
5440  ths->c_phi_inv[t][ks[t]]= K(1.0)/(PHI_HUT(ks[t]-ths->N[t]/2,t));
5441  }
5442 } /* nfft_phi_hut */
5443 
5450 {
5451  int t;
5452  int j;
5453  R step;
5455  for (t=0; t<ths->d; t++)
5456  {
5457  step=((R)(ths->m+2))/(((R)ths->K)*ths->n[t]);
5458  for(j=0;j<=ths->K;j++)
5459  {
5460  ths->psi[(ths->K+1)*t + j] = PHI(j*step,t);
5461  } /* for(j) */
5462  } /* for(t) */
5463 }
5464 
5465 static void nfft_precompute_fg_psi(nfft_plan *ths)
5466 {
5467  int t;
5468  int u, o;
5470  nfft_sort_nodes(ths);
5471 
5472  for (t=0; t<ths->d; t++)
5473  {
5474  int j;
5475  #pragma omp parallel for default(shared) private(j,u,o)
5476  for (j = 0; j < ths->M_total; j++)
5477  {
5478  nfft_uo(ths,j,&u,&o,t);
5479 
5480  ths->psi[2*(j*ths->d+t)]=
5481  (PHI((ths->x[j*ths->d+t]-((R)u)/ths->n[t]),t));
5482 
5483  ths->psi[2*(j*ths->d+t)+1]=
5484  EXP(K(2.0)*(ths->n[t]*ths->x[j*ths->d+t] - u) / ths->b[t]);
5485  } /* for(j) */
5486  }
5487  /* for(t) */
5488 } /* nfft_precompute_fg_psi */
5489 
5490 void nfft_precompute_psi(nfft_plan *ths)
5491 {
5492  int t;
5493  int l;
5494  int lj;
5495  int u, o;
5497  nfft_sort_nodes(ths);
5498 
5499  for (t=0; t<ths->d; t++)
5500  {
5501  int j;
5502  #pragma omp parallel for default(shared) private(j,l,lj,u,o)
5503  for (j = 0; j < ths->M_total; j++)
5504  {
5505  nfft_uo(ths,j,&u,&o,t);
5506 
5507  for(l=u, lj=0; l <= o; l++, lj++)
5508  ths->psi[(j*ths->d+t)*(2*ths->m+2)+lj]=
5509  (PHI((ths->x[j*ths->d+t]-((R)l)/ths->n[t]),t));
5510  } /* for(j) */
5511  }
5512  /* for(t) */
5513 } /* nfft_precompute_psi */
5514 
5515 #ifdef _OPENMP
5516 static void nfft_precompute_full_psi_omp(nfft_plan *ths)
5517 {
5518  int j;
5519  int lprod;
5521  {
5522  int t;
5523  for(t=0,lprod = 1; t<ths->d; t++)
5524  lprod *= 2*ths->m+2;
5525  }
5526 
5527  #pragma omp parallel for default(shared) private(j)
5528  for(j=0; j<ths->M_total; j++)
5529  {
5530  int t,t2;
5531  int l_L;
5532  int l[ths->d];
5533  int lj[ths->d];
5534  int ll_plain[ths->d+1];
5536  int u[ths->d], o[ths->d];
5538  R phi_prod[ths->d+1];
5539  int ix = j*lprod;
5540 
5541  phi_prod[0]=1;
5542  ll_plain[0]=0;
5543 
5544  MACRO_init_uo_l_lj_t;
5545 
5546  for(l_L=0; l_L<lprod; l_L++, ix++)
5547  {
5548  MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
5549 
5550  ths->psi_index_g[ix]=ll_plain[ths->d];
5551  ths->psi[ix]=phi_prod[ths->d];
5552 
5553  MACRO_count_uo_l_lj_t;
5554  } /* for(l_L) */
5555 
5556  ths->psi_index_f[j]=lprod;
5557  } /* for(j) */
5558 }
5559 #endif
5560 
5561 void nfft_precompute_full_psi(nfft_plan *ths)
5562 {
5563 #ifdef _OPENMP
5564  nfft_sort_nodes(ths);
5565 
5566  nfft_precompute_full_psi_omp(ths);
5567 #else
5568  int t,t2;
5569  int j;
5570  int l_L;
5571  int l[ths->d];
5572  int lj[ths->d];
5573  int ll_plain[ths->d+1];
5574  int lprod;
5575  int u[ths->d], o[ths->d];
5577  R phi_prod[ths->d+1];
5578 
5579  int ix,ix_old;
5580 
5581  nfft_sort_nodes(ths);
5582 
5583  phi_prod[0]=1;
5584  ll_plain[0]=0;
5585 
5586  for(t=0,lprod = 1; t<ths->d; t++)
5587  lprod *= 2*ths->m+2;
5588 
5589  for(j=0,ix=0,ix_old=0; j<ths->M_total; j++)
5590  {
5591  MACRO_init_uo_l_lj_t;
5592 
5593  for(l_L=0; l_L<lprod; l_L++, ix++)
5594  {
5595  MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
5596 
5597  ths->psi_index_g[ix]=ll_plain[ths->d];
5598  ths->psi[ix]=phi_prod[ths->d];
5599 
5600  MACRO_count_uo_l_lj_t;
5601  } /* for(l_L) */
5602 
5603 
5604  ths->psi_index_f[j]=ix-ix_old;
5605  ix_old=ix;
5606  } /* for(j) */
5607 #endif
5608 }
5609 
5610 void nfft_precompute_one_psi(nfft_plan *ths)
5611 {
5612  if(ths->nfft_flags & PRE_LIN_PSI)
5614  if(ths->nfft_flags & PRE_FG_PSI)
5615  nfft_precompute_fg_psi(ths);
5616  if(ths->nfft_flags & PRE_PSI)
5617  nfft_precompute_psi(ths);
5618  if(ths->nfft_flags & PRE_FULL_PSI)
5619  nfft_precompute_full_psi(ths);
5620 }
5621 
5622 
5623 static void nfft_init_help(nfft_plan *ths)
5624 {
5625  int t;
5626  int lprod;
5628  if (ths->nfft_flags & NFFT_OMP_BLOCKWISE_ADJOINT)
5629  ths->nfft_flags |= NFFT_SORT_NODES;
5630 
5631  ths->N_total=nfft_prod_int(ths->N, ths->d);
5632  ths->n_total=nfft_prod_int(ths->n, ths->d);
5633 
5634  ths->sigma = (R*) nfft_malloc(ths->d*sizeof(R));
5635  for(t = 0;t < ths->d; t++)
5636  ths->sigma[t] = ((R)ths->n[t])/ths->N[t];
5637 
5638  WINDOW_HELP_INIT;
5639 
5640  if(ths->nfft_flags & MALLOC_X)
5641  ths->x = (R*)nfft_malloc(ths->d*ths->M_total*sizeof(R));
5642 
5643  if(ths->nfft_flags & MALLOC_F_HAT)
5644  ths->f_hat = (fftw_complex*)nfft_malloc(ths->N_total*sizeof(C));
5645 
5646  if(ths->nfft_flags & MALLOC_F)
5647  ths->f = (fftw_complex*)nfft_malloc(ths->M_total*sizeof(C));
5648 
5649  if(ths->nfft_flags & PRE_PHI_HUT)
5650  nfft_precompute_phi_hut(ths);
5651 
5652  if(ths->nfft_flags & PRE_LIN_PSI)
5653  {
5654  ths->K=(1U<< 10)*(ths->m+2);
5655  ths->psi = (R*) nfft_malloc((ths->K+1)*ths->d*sizeof(R));
5656  }
5657 
5658  if(ths->nfft_flags & PRE_FG_PSI)
5659  ths->psi = (R*) nfft_malloc(ths->M_total*ths->d*2*sizeof(R));
5660 
5661  if(ths->nfft_flags & PRE_PSI)
5662  ths->psi = (R*) nfft_malloc(ths->M_total*ths->d*
5663  (2*ths->m+2)*sizeof(R));
5664 
5665  if(ths->nfft_flags & PRE_FULL_PSI)
5666  {
5667  for(t=0,lprod = 1; t<ths->d; t++)
5668  lprod *= 2*ths->m+2;
5669 
5670  ths->psi = (R*) nfft_malloc(ths->M_total*lprod*sizeof(R));
5671 
5672  ths->psi_index_f = (int*) nfft_malloc(ths->M_total*sizeof(int));
5673  ths->psi_index_g = (int*) nfft_malloc(ths->M_total*lprod*sizeof(int));
5674  }
5675 
5676  if(ths->nfft_flags & FFTW_INIT)
5677  {
5678 #ifdef _OPENMP
5679  int nthreads = nfft_get_omp_num_threads();
5680 #endif
5681 
5682  ths->g1=(fftw_complex*)nfft_malloc(ths->n_total*sizeof(C));
5683 
5684  if(ths->nfft_flags & FFT_OUT_OF_PLACE)
5685  ths->g2 = (fftw_complex*) nfft_malloc(ths->n_total*sizeof(C));
5686  else
5687  ths->g2 = ths->g1;
5688 
5689 #ifdef _OPENMP
5690 #pragma omp critical (nfft_omp_critical_fftw_plan)
5691 {
5692  fftw_plan_with_nthreads(nthreads);
5693 #endif
5694  ths->my_fftw_plan1 = fftw_plan_dft(ths->d, ths->n, ths->g1, ths->g2, FFTW_FORWARD, ths->fftw_flags);
5695  ths->my_fftw_plan2 = fftw_plan_dft(ths->d, ths->n, ths->g2, ths->g1,
5696  FFTW_BACKWARD, ths->fftw_flags);
5697 #ifdef _OPENMP
5698 }
5699 #endif
5700  }
5701 
5702  if(ths->nfft_flags & NFFT_SORT_NODES)
5703  ths->index_x = (int*) nfft_malloc(sizeof(int)*2*ths->M_total);
5704  else
5705  ths->index_x = NULL;
5706 
5707  ths->mv_trafo = (void (*) (void* ))nfft_trafo;
5708  ths->mv_adjoint = (void (*) (void* ))nfft_adjoint;
5709 }
5710 
5711 void nfft_init(nfft_plan *ths, int d, int *N, int M_total)
5712 {
5713  int t;
5715  ths->d = d;
5716 
5717  ths->N=(int*) nfft_malloc(d*sizeof(int));
5718 
5719  for (t = 0;t < d; t++)
5720  ths->N[t] = N[t];
5721 
5722  ths->M_total = M_total;
5723 
5724  ths->n = (int*) nfft_malloc(d*sizeof(int));
5725  for (t = 0;t < d; t++)
5726  ths->n[t] = 2*X(next_power_of_2)(ths->N[t]);
5727 
5728  ths->m = WINDOW_HELP_ESTIMATE_m;
5729 
5730  if (d > 1)
5731  {
5732 #ifdef _OPENMP
5733  ths->nfft_flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
5734  FFTW_INIT | FFT_OUT_OF_PLACE | NFFT_SORT_NODES |
5735  NFFT_OMP_BLOCKWISE_ADJOINT;
5736 #else
5737  ths->nfft_flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
5738  FFTW_INIT | FFT_OUT_OF_PLACE | NFFT_SORT_NODES;
5739 #endif
5740  }
5741  else
5742  ths->nfft_flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
5743  FFTW_INIT | FFT_OUT_OF_PLACE;
5744 
5745  ths->fftw_flags= FFTW_ESTIMATE| FFTW_DESTROY_INPUT;
5746 
5747  nfft_init_help(ths);
5748 }
5749 
5750 void nfft_init_guru(nfft_plan *ths, int d, int *N, int M_total, int *n,
5751  int m, unsigned nfft_flags, unsigned fftw_flags)
5752 {
5753  int t;
5755  ths->d =d;
5756  ths->N= (int*) nfft_malloc(ths->d*sizeof(int));
5757  for(t=0; t<d; t++)
5758  ths->N[t]= N[t];
5759  ths->M_total= M_total;
5760  ths->n= (int*) nfft_malloc(ths->d*sizeof(int));
5761  for(t=0; t<d; t++)
5762  ths->n[t]= n[t];
5763  ths->m= m;
5764  ths->nfft_flags= nfft_flags;
5765  ths->fftw_flags= fftw_flags;
5766 
5767  nfft_init_help(ths);
5768 }
5769 
5770 void nfft_init_1d(nfft_plan *ths, int N1, int M_total)
5771 {
5772  int N[1];
5773 
5774  N[0]=N1;
5775 
5776  nfft_init(ths, 1, N, M_total);
5777 }
5778 
5779 void nfft_init_2d(nfft_plan *ths, int N1, int N2, int M_total)
5780 {
5781  int N[2];
5782 
5783  N[0]=N1;
5784  N[1]=N2;
5785  nfft_init(ths,2,N,M_total);
5786 }
5787 
5788 void nfft_init_3d(nfft_plan *ths, int N1, int N2, int N3, int M_total)
5789 {
5790  int N[3];
5791 
5792  N[0]=N1;
5793  N[1]=N2;
5794  N[2]=N3;
5795  nfft_init(ths,3,N,M_total);
5796 }
5797 
5798 const char* nfft_check(nfft_plan *ths)
5799 {
5800  int j;
5801 
5802  for(j=0;j<ths->M_total*ths->d;j++)
5803  if((ths->x[j]<-K(0.5)) || (ths->x[j]>= K(0.5)))
5804  return "ths->x out of range [-0.5,0.5)";
5805 
5806  for(j=0;j<ths->d;j++)
5807  {
5808  if(ths->sigma[j]<=1)
5809  return "nfft_check: oversampling factor too small";
5810 
5811  if(ths->N[j]<=ths->m)
5812  return "Polynomial degree N is smaller than cut-off m";
5813 
5814  if(ths->N[j]%2==1)
5815  return "polynomial degree N has to be even";
5816  }
5817  return 0;
5818 }
5819 
5820 void nfft_finalize(nfft_plan *ths)
5821 {
5822  int t; /* index over dimensions */
5823 
5824  if(ths->nfft_flags & NFFT_SORT_NODES)
5825  nfft_free(ths->index_x);
5826 
5827  if(ths->nfft_flags & FFTW_INIT)
5828  {
5829 #pragma omp critical (nfft_omp_critical_fftw_plan)
5830 {
5831  fftw_destroy_plan(ths->my_fftw_plan2);
5832  fftw_destroy_plan(ths->my_fftw_plan1);
5833 }
5834 
5835  if(ths->nfft_flags & FFT_OUT_OF_PLACE)
5836  nfft_free(ths->g2);
5837 
5838  nfft_free(ths->g1);
5839  }
5840 
5841  if(ths->nfft_flags & PRE_FULL_PSI)
5842  {
5843  nfft_free(ths->psi_index_g);
5844  nfft_free(ths->psi_index_f);
5845  nfft_free(ths->psi);
5846  }
5847 
5848  if(ths->nfft_flags & PRE_PSI)
5849  nfft_free(ths->psi);
5850 
5851  if(ths->nfft_flags & PRE_FG_PSI)
5852  nfft_free(ths->psi);
5853 
5854  if(ths->nfft_flags & PRE_LIN_PSI)
5855  nfft_free(ths->psi);
5856 
5857  if(ths->nfft_flags & PRE_PHI_HUT)
5858  {
5859  for(t=0; t<ths->d; t++)
5860  nfft_free(ths->c_phi_inv[t]);
5861  nfft_free(ths->c_phi_inv);
5862  }
5863 
5864  if(ths->nfft_flags & MALLOC_F)
5865  nfft_free(ths->f);
5866 
5867  if(ths->nfft_flags & MALLOC_F_HAT)
5868  nfft_free(ths->f_hat);
5869 
5870  if(ths->nfft_flags & MALLOC_X)
5871  nfft_free(ths->x);
5872 
5873  WINDOW_HELP_FINALIZE;
5874 
5875  nfft_free(ths->sigma);
5876  nfft_free(ths->n);
5877  nfft_free(ths->N);
5878 }

Generated on Tue Apr 30 2013 by Doxygen 1.8.1