61 static void nfft_sort_nodes_for_better_cache_handle(
int d,
62 const int *n,
int m,
int local_x_num,
const R *local_x,
int *ar_x)
64 int u_j[d], i, j, help, rhigh;
68 for(i = 0; i < local_x_num; i++) {
71 for(j = 0; j < d; j++) {
72 help = (int) floor( n[j]*local_x[d*i+j] - m);
73 u_j[j] = (help%n[j]+n[j])%n[j];
81 for (j = 0, nprod = 1; j < d; j++)
84 rhigh = (int) ceil(log2(nprod)) - 1;
86 ar_x_temp = (
int *)
nfft_malloc(2*local_x_num*
sizeof(
int));
89 for (i = 1; i < local_x_num; i++)
90 assert(ar_x[2*(i-1)] <= ar_x[2*i]);
103 static void nfft_sort_nodes(
const nfft_plan *ths)
106 nfft_sort_nodes_for_better_cache_handle(ths->
d, ths->
n, ths->
m, ths->
M_total, ths->
x, ths->
index_x);
131 #define MACRO_ndft_init_result_trafo memset(f,0,ths->M_total*sizeof(C));
132 #define MACRO_ndft_init_result_conjugated MACRO_ndft_init_result_trafo
133 #define MACRO_ndft_init_result_adjoint memset(f_hat,0,ths->N_total*sizeof(C));
134 #define MACRO_ndft_init_result_transposed MACRO_ndft_init_result_adjoint
137 #define MACRO_ndft_sign_trafo K2PI*ths->x[j*ths->d+t]
138 #define MACRO_ndft_sign_conjugated -K2PI*ths->x[j*ths->d+t]
139 #define MACRO_ndft_sign_adjoint K2PI*ths->x[j*ths->d+t]
140 #define MACRO_ndft_sign_transposed -K2PI*ths->x[j*ths->d+t]
142 #define MACRO_init_k_N_Omega_x(which_one) \
144 for (t = 0; t < ths->d; t++) \
146 k[t] = -ths->N[t]/2; \
147 x[t] = MACRO_ndft_sign_ ## which_one; \
148 Omega[t+1] = k[t]*x[t] + Omega[t]; \
150 omega = Omega[ths->d]; \
153 #define MACRO_count_k_N_Omega \
155 for (t = ths->d-1; (t >= 1) && (k[t] == ths->N[t]/2-1); t--) \
156 k[t]-= ths->N[t]-1; \
160 for (t2 = t; t2 < ths->d; t2++) \
161 Omega[t2+1] = k[t2]*x[t2] + Omega[t2]; \
163 omega = Omega[ths->d]; \
166 #define MACRO_ndft_compute_trafo f[j] += f_hat[k_L]*CEXP(-II*omega);
167 #define MACRO_ndft_compute_conjugated MACRO_ndft_compute_trafo
168 #define MACRO_ndft_compute_adjoint f_hat[k_L] += f[j]*CEXP(+II*omega);
169 #define MACRO_ndft_compute_transposed MACRO_ndft_compute_adjoint
171 #define MACRO_ndft(which_one) \
172 void nfft_ ## which_one ## _direct (nfft_plan *ths) \
174 C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f; \
176 MACRO_ndft_init_result_ ## which_one \
183 for (j = 0; j < ths->M_total; j++) \
186 for(k_L = 0; k_L < ths->N_total; k_L++) \
188 R omega = (k_L - (ths->N_total/2)) * MACRO_ndft_sign_ ## which_one; \
189 MACRO_ndft_compute_ ## which_one; \
197 for (j = 0; j < ths->M_total; j++) \
199 R x[ths->d], omega, Omega[ths->d+1]; \
200 int t, t2, k_L, k[ths->d]; \
202 MACRO_init_k_N_Omega_x(which_one); \
203 for(k_L = 0; k_L < ths->N_total; k_L++) \
205 MACRO_ndft_compute_ ## which_one; \
206 MACRO_count_k_N_Omega; \
216 C *f_hat = (C*)ths->
f_hat, *f = (C*)ths->
f;
218 memset(f,0,ths->
M_total*
sizeof(C));
224 #pragma omp parallel for default(shared) private(j)
225 for (j = 0; j < ths->
M_total; j++)
228 for(k_L = 0; k_L < ths->
N_total; k_L++)
230 R omega = (k_L - ths->
N_total/2) * K2PI * ths->
x[j];
231 f[j] += f_hat[k_L]*cexp(-( 1.0iF)*omega);
239 #pragma omp parallel for default(shared) private(j)
240 for (j = 0; j < ths->
M_total; j++)
242 R x[ths->
d], omega, Omega[ths->
d+1];
243 int t, t2, k_L, k[ths->
d];
244 Omega[0] = ((R) 0.0);
245 for (t = 0; t < ths->
d; t++)
248 x[t] = K2PI*ths->
x[j*ths->
d+t];
249 Omega[t+1] = k[t]*x[t] + Omega[t];
251 omega = Omega[ths->
d];
253 for(k_L = 0; k_L < ths->
N_total; k_L++)
255 f[j] += f_hat[k_L]*cexp(-( 1.0iF)*omega);
257 for (t = ths->
d-1; (t >= 1) && (k[t] == ths->
N[t]/2-1); t--)
262 for (t2 = t; t2 < ths->
d; t2++)
263 Omega[t2+1] = k[t2]*x[t2] + Omega[t2];
265 omega = Omega[ths->
d];
275 void nfft_adjoint_direct (
nfft_plan *ths)
277 C *f_hat = (C*)ths->
f_hat, *f = (C*)ths->
f;
279 memset(f_hat,0,ths->
N_total*
sizeof(C));
286 #pragma omp parallel for default(shared) private(k_L)
287 for(k_L = 0; k_L < ths->
N_total; k_L++)
290 for (j = 0; j < ths->
M_total; j++)
292 R omega = (k_L - (ths->
N_total/2)) * K2PI * ths->
x[j];
293 f_hat[k_L] += f[j]*cexp(+( 1.0iF)*omega);
298 for (j = 0; j < ths->
M_total; j++)
301 for(k_L = 0; k_L < ths->
N_total; k_L++)
303 R omega = (k_L - (ths->
N_total/2)) * K2PI * ths->
x[j];
304 f_hat[k_L] += f[j]*cexp(+( 1.0iF)*omega);
314 #pragma omp parallel for default(shared) private(j, k_L)
315 for(k_L = 0; k_L < ths->
N_total; k_L++)
317 int k[ths->
d], k_temp, t;
321 for (t = ths->
d-1; t >= 0; t--)
323 k[t] = k_temp % ths->
N[t] - ths->
N[t]/2;
327 for (j = 0; j < ths->
M_total; j++)
330 for (t = 0; t < ths->
d; t++)
331 omega += k[t] * K2PI * ths->
x[j*ths->
d+t];
332 f_hat[k_L] += f[j]*cexp(+( 1.0iF)*omega);
336 for (j = 0; j < ths->
M_total; j++)
338 R x[ths->
d], omega, Omega[ths->
d+1];
339 int t, t2, k[ths->
d];
340 Omega[0] = ((R) 0.0);
341 for (t = 0; t < ths->
d; t++)
344 x[t] = K2PI * ths->
x[j*ths->
d+t];
345 Omega[t+1] = k[t]*x[t] + Omega[t];
347 omega = Omega[ths->
d];
348 for(k_L = 0; k_L < ths->
N_total; k_L++)
350 f_hat[k_L] += f[j]*cexp(+( 1.0iF)*omega);
352 for (t = ths->
d-1; (t >= 1) && (k[t] == ths->
N[t]/2-1); t--)
357 for (t2 = t; t2 < ths->
d; t2++)
358 Omega[t2+1] = k[t2]*x[t2] + Omega[t2];
360 omega = Omega[ths->
d];
392 static inline void nfft_uo(
const nfft_plan *ths,
const int j,
int *up,
int *op,
395 const R xj = ths->
x[j * ths->
d + act_dim];
396 int c = LRINT(FLOOR(xj * ths->
n[act_dim]));
398 (*up) = c - (ths->
m);
399 (*op) = c + 1 + (ths->
m);
402 static inline void nfft_uo2(
int *u,
int *o,
const R x,
const int n,
const int m)
404 int c = LRINT(FLOOR(x * n));
406 *u = (c - m + n) % n;
407 *o = (c + 1 + m + n) % n;
410 #define MACRO_nfft_D_compute_A \
412 g_hat[k_plain[ths->d]] = f_hat[ks_plain[ths->d]] * c_phi_inv_k[ths->d]; \
415 #define MACRO_nfft_D_compute_T \
417 f_hat[ks_plain[ths->d]] = g_hat[k_plain[ths->d]] * c_phi_inv_k[ths->d]; \
420 #define MACRO_nfft_D_init_result_A memset(g_hat,0,ths->n_total*sizeof(C));
422 #define MACRO_nfft_D_init_result_T memset(f_hat,0,ths->N_total*sizeof(C));
424 #define MACRO_with_PRE_PHI_HUT * ths->c_phi_inv[t2][ks[t2]];
426 #define MACRO_without_PRE_PHI_HUT / (PHI_HUT(ks[t2]-(ths->N[t2]/2),t2));
428 #define MACRO_init_k_ks \
430 for (t = ths->d-1; 0 <= t; t--) \
433 ks[t] = ths->N[t]/2; \
438 #define MACRO_update_c_phi_inv_k(which_one) \
440 for (t2 = t; t2 < ths->d; t2++) \
442 c_phi_inv_k[t2+1] = c_phi_inv_k[t2] MACRO_ ##which_one; \
443 ks_plain[t2+1] = ks_plain[t2]*ths->N[t2] + ks[t2]; \
444 k_plain[t2+1] = k_plain[t2]*ths->n[t2] + k[t2]; \
448 #define MACRO_count_k_ks \
450 for (t = ths->d-1; (t > 0) && (kp[t] == ths->N[t]-1); t--) \
453 ks[t]= ths->N[t]/2; \
456 kp[t]++; k[t]++; ks[t]++; \
457 if(kp[t] == ths->N[t]/2) \
459 k[t] = ths->n[t] - ths->N[t]/2; \
468 #define MACRO_nfft_D(which_one) \
469 static inline void nfft_D_serial_ ## which_one (nfft_plan *ths) \
472 R c_phi_inv_k[ths->d+1]; \
478 int k_plain[ths->d+1]; \
479 int ks_plain[ths->d+1]; \
481 f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat; \
482 MACRO_nfft_D_init_result_ ## which_one; \
484 c_phi_inv_k[0] = K(1.0); \
488 if (ths->nfft_flags & PRE_PHI_HUT) \
492 for (k_L = 0; k_L < ths->N_total; k_L++) \
494 MACRO_update_c_phi_inv_k(with_PRE_PHI_HUT); \
495 MACRO_nfft_D_compute_ ## which_one; \
502 for (k_L = 0; k_L < ths->N_total; k_L++) \
504 MACRO_update_c_phi_inv_k(without_PRE_PHI_HUT); \
505 MACRO_nfft_D_compute_ ## which_one; \
512 static void nfft_D_openmp_A(
nfft_plan *ths)
517 f_hat = (C*)ths->
f_hat; g_hat = (C*)ths->
g_hat;
518 memset(g_hat,0,ths->
n_total*
sizeof(C));
522 #pragma omp parallel for default(shared) private(k_L)
523 for (k_L = 0; k_L < ths->
N_total; k_L++)
528 R c_phi_inv_k_val = K(1.0);
530 int ks_plain_val = 0;
534 for (t = ths->
d-1; t >= 0; t--)
536 kp[t] = k_temp % ths->
N[t];
537 if (kp[t] >= ths->
N[t]/2)
538 k[t] = ths->
n[t] - ths->
N[t] + kp[t];
541 ks[t] = (kp[t] + ths->
N[t]/2) % ths->
N[t];
545 for (t = 0; t < ths->
d; t++)
547 c_phi_inv_k_val *= ths->
c_phi_inv[t][ks[t]];
548 ks_plain_val = ks_plain_val*ths->
N[t] + ks[t];
549 k_plain_val = k_plain_val*ths->
n[t] + k[t];
552 g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
557 #pragma omp parallel for default(shared) private(k_L)
558 for (k_L = 0; k_L < ths->
N_total; k_L++)
563 R c_phi_inv_k_val = K(1.0);
565 int ks_plain_val = 0;
569 for (t = ths->
d-1; t >= 0; t--)
571 kp[t] = k_temp % ths->
N[t];
572 if (kp[t] >= ths->
N[t]/2)
573 k[t] = ths->
n[t] - ths->
N[t] + kp[t];
576 ks[t] = (kp[t] + ths->
N[t]/2) % ths->
N[t];
580 for (t = 0; t < ths->
d; t++)
582 c_phi_inv_k_val /= (PHI_HUT(ks[t]-(ths->
N[t]/2),t));
583 ks_plain_val = ks_plain_val*ths->
N[t] + ks[t];
584 k_plain_val = k_plain_val*ths->
n[t] + k[t];
587 g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
600 nfft_D_openmp_A(ths);
602 nfft_D_serial_A(ths);
607 static void nfft_D_openmp_T(
nfft_plan *ths)
612 f_hat = (C*)ths->
f_hat; g_hat = (C*)ths->
g_hat;
613 memset(f_hat,0,ths->
N_total*
sizeof(C));
617 #pragma omp parallel for default(shared) private(k_L)
618 for (k_L = 0; k_L < ths->
N_total; k_L++)
623 R c_phi_inv_k_val = K(1.0);
625 int ks_plain_val = 0;
629 for (t = ths->
d-1; t >= 0; t--)
631 kp[t] = k_temp % ths->
N[t];
632 if (kp[t] >= ths->
N[t]/2)
633 k[t] = ths->
n[t] - ths->
N[t] + kp[t];
636 ks[t] = (kp[t] + ths->
N[t]/2) % ths->
N[t];
640 for (t = 0; t < ths->
d; t++)
642 c_phi_inv_k_val *= ths->
c_phi_inv[t][ks[t]];
643 ks_plain_val = ks_plain_val*ths->
N[t] + ks[t];
644 k_plain_val = k_plain_val*ths->
n[t] + k[t];
647 f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
652 #pragma omp parallel for default(shared) private(k_L)
653 for (k_L = 0; k_L < ths->
N_total; k_L++)
658 R c_phi_inv_k_val = K(1.0);
660 int ks_plain_val = 0;
664 for (t = ths->
d-1; t >= 0; t--)
666 kp[t] = k_temp % ths->
N[t];
667 if (kp[t] >= ths->
N[t]/2)
668 k[t] = ths->
n[t] - ths->
N[t] + kp[t];
671 ks[t] = (kp[t] + ths->
N[t]/2) % ths->
N[t];
675 for (t = 0; t < ths->
d; t++)
677 c_phi_inv_k_val /= (PHI_HUT(ks[t]-(ths->
N[t]/2),t));
678 ks_plain_val = ks_plain_val*ths->
N[t] + ks[t];
679 k_plain_val = k_plain_val*ths->
n[t] + k[t];
682 f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
695 nfft_D_openmp_T(ths);
697 nfft_D_serial_T(ths);
704 #define MACRO_nfft_B_init_result_A memset(f,0,ths->M_total*sizeof(C));
705 #define MACRO_nfft_B_init_result_T memset(g,0,ths->n_total*sizeof(C));
707 #define MACRO_nfft_B_PRE_FULL_PSI_compute_A \
709 (*fj) += ths->psi[ix] * g[ths->psi_index_g[ix]]; \
712 #define MACRO_nfft_B_PRE_FULL_PSI_compute_T \
714 g[ths->psi_index_g[ix]] += ths->psi[ix] * (*fj); \
717 #define MACRO_nfft_B_compute_A \
719 (*fj) += phi_prod[ths->d] * g[ll_plain[ths->d]]; \
722 #define MACRO_nfft_B_compute_T \
724 g[ll_plain[ths->d]] += phi_prod[ths->d] * (*fj); \
727 #define MACRO_with_FG_PSI fg_psi[t2][lj[t2]]
729 #define MACRO_with_PRE_PSI ths->psi[(j*ths->d+t2) * (2*ths->m+2)+lj[t2]]
731 #define MACRO_without_PRE_PSI PHI(ths->x[j*ths->d+t2] \
732 - ((R)l[t2])/((R)ths->n[t2]), t2)
734 #define MACRO_init_uo_l_lj_t \
736 for (t = ths->d-1; t >= 0; t--) \
738 nfft_uo(ths,j,&u[t],&o[t],t); \
745 #define MACRO_update_phi_prod_ll_plain(which_one) { \
746 for(t2=t; t2<ths->d; t2++) \
748 phi_prod[t2+1] = phi_prod[t2]* MACRO_ ## which_one; \
749 ll_plain[t2+1] = ll_plain[t2]*ths->n[t2] +(l[t2]+ths->n[t2])%ths->n[t2];\
753 #define MACRO_count_uo_l_lj_t { \
754 for(t = ths->d-1; (t > 0) && (l[t] == o[t]); t--) \
764 #define MACRO_nfft_B(which_one) \
765 static inline void nfft_B_serial_ ## which_one (nfft_plan *ths) \
768 int u[ths->d], o[ths->d]; \
774 int ll_plain[ths->d+1]; \
775 R phi_prod[ths->d+1]; \
779 R fg_psi[ths->d][2*ths->m+2]; \
780 R fg_exp_l[ths->d][2*ths->m+2]; \
782 R tmpEXP1, tmpEXP2, tmpEXP2sq, tmp1, tmp2, tmp3; \
785 int ip_s = ths->K/(ths->m+2); \
787 f = (C*)ths->f; g = (C*)ths->g; \
789 MACRO_nfft_B_init_result_ ## which_one; \
791 if (ths->nfft_flags & PRE_FULL_PSI) \
793 for (ix = 0, j = 0, fj = f; j < ths->M_total; j++, fj++) \
794 for (l_L = 0; l_L < ths->psi_index_f[j]; l_L++, ix++) \
795 MACRO_nfft_B_PRE_FULL_PSI_compute_ ## which_one; \
799 phi_prod[0] = K(1.0); \
802 for (t = 0, lprod = 1; t < ths->d; t++) \
803 lprod *= (2*ths->m+2); \
805 if (ths->nfft_flags & PRE_PSI) \
807 for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
809 MACRO_init_uo_l_lj_t; \
811 for (l_L = 0; l_L < lprod; l_L++) \
813 MACRO_update_phi_prod_ll_plain(with_PRE_PSI); \
815 MACRO_nfft_B_compute_ ## which_one; \
817 MACRO_count_uo_l_lj_t; \
823 if (ths->nfft_flags & PRE_FG_PSI) \
825 for(t2 = 0; t2 < ths->d; t2++) \
827 tmpEXP2 = EXP(K(-1.0)/ths->b[t2]); \
828 tmpEXP2sq = tmpEXP2*tmpEXP2; \
831 fg_exp_l[t2][0] = K(1.0); \
832 for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++) \
834 tmp3 = tmp2*tmpEXP2; \
836 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3; \
839 for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
841 MACRO_init_uo_l_lj_t; \
843 for (t2 = 0; t2 < ths->d; t2++) \
845 fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)]; \
846 tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1]; \
848 for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
851 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
855 for (l_L= 0; l_L < lprod; l_L++) \
857 MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
859 MACRO_nfft_B_compute_ ## which_one; \
861 MACRO_count_uo_l_lj_t; \
867 if (ths->nfft_flags & FG_PSI) \
869 for (t2 = 0; t2 < ths->d; t2++) \
871 tmpEXP2 = EXP(K(-1.0)/ths->b[t2]); \
872 tmpEXP2sq = tmpEXP2*tmpEXP2; \
875 fg_exp_l[t2][0] = K(1.0); \
876 for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++) \
878 tmp3 = tmp2*tmpEXP2; \
880 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3; \
883 for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
885 MACRO_init_uo_l_lj_t; \
887 for (t2 = 0; t2 < ths->d; t2++) \
889 fg_psi[t2][0] = (PHI((ths->x[j*ths->d+t2]-((R)u[t2])/ths->n[t2]),t2));\
891 tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2]) \
894 for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
897 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
901 for (l_L = 0; l_L < lprod; l_L++) \
903 MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
905 MACRO_nfft_B_compute_ ## which_one; \
907 MACRO_count_uo_l_lj_t; \
913 if (ths->nfft_flags & PRE_LIN_PSI) \
915 for (j = 0, fj=f; j<ths->M_total; j++, fj++) \
917 MACRO_init_uo_l_lj_t; \
919 for (t2 = 0; t2 < ths->d; t2++) \
921 y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2]) \
922 * ((R)ths->K))/(ths->m+2); \
923 ip_u = LRINT(FLOOR(y[t2])); \
925 for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++) \
927 fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)] \
928 * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)] \
933 for (l_L = 0; l_L < lprod; l_L++) \
935 MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
937 MACRO_nfft_B_compute_ ## which_one; \
939 MACRO_count_uo_l_lj_t; \
946 for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
948 MACRO_init_uo_l_lj_t; \
950 for (l_L = 0; l_L < lprod; l_L++) \
952 MACRO_update_phi_prod_ll_plain(without_PRE_PSI); \
954 MACRO_nfft_B_compute_ ## which_one; \
956 MACRO_count_uo_l_lj_t; \
966 static inline void nfft_B_openmp_A (
nfft_plan *ths)
971 memset(ths->
f,0,ths->
M_total*
sizeof(C));
973 for (k = 0, lprod = 1; k < ths->
d; k++)
974 lprod *= (2*ths->
m+2);
978 #pragma omp parallel for default(shared) private(k)
979 for (k = 0; k < ths->
M_total; k++)
984 for (l = 0; l < lprod; l++)
992 #pragma omp parallel for default(shared) private(k)
993 for (k = 0; k < ths->
M_total; k++)
995 int u[ths->
d], o[ths->
d];
1000 int ll_plain[ths->
d+1];
1001 R phi_prod[ths->
d+1];
1004 phi_prod[0] = K(1.0);
1007 MACRO_init_uo_l_lj_t;
1009 for (l_L = 0; l_L < lprod; l_L++)
1011 MACRO_update_phi_prod_ll_plain(with_PRE_PSI);
1013 ths->
f[j] += phi_prod[ths->
d] * ths->
g[ll_plain[ths->
d]];
1015 MACRO_count_uo_l_lj_t;
1024 R fg_exp_l[ths->
d][2*ths->
m+2];
1026 for(t2 = 0; t2 < ths->
d; t2++)
1029 R tmpEXP2 = EXP(K(-1.0)/ths->
b[t2]);
1030 R tmpEXP2sq = tmpEXP2*tmpEXP2;
1033 fg_exp_l[t2][0] = K(1.0);
1034 for(lj_fg = 1; lj_fg <= (2*ths->
m+2); lj_fg++)
1036 tmp3 = tmp2*tmpEXP2;
1038 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
1042 #pragma omp parallel for default(shared) private(k,t,t2)
1043 for (k = 0; k < ths->
M_total; k++)
1045 int ll_plain[ths->
d+1];
1046 R phi_prod[ths->
d+1];
1047 int u[ths->
d], o[ths->
d];
1050 R fg_psi[ths->
d][2*ths->
m+2];
1056 phi_prod[0] = K(1.0);
1059 MACRO_init_uo_l_lj_t;
1061 for (t2 = 0; t2 < ths->
d; t2++)
1063 fg_psi[t2][0] = ths->
psi[2*(j*ths->
d+t2)];
1064 tmpEXP1 = ths->
psi[2*(j*ths->
d+t2)+1];
1066 for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1069 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1073 for (l_L= 0; l_L < lprod; l_L++)
1075 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1077 ths->
f[j] += phi_prod[ths->
d] * ths->
g[ll_plain[ths->
d]];
1079 MACRO_count_uo_l_lj_t;
1088 R fg_exp_l[ths->
d][2*ths->
m+2];
1090 nfft_sort_nodes(ths);
1092 for (t2 = 0; t2 < ths->
d; t2++)
1095 R tmpEXP2 = EXP(K(-1.0)/ths->
b[t2]);
1096 R tmpEXP2sq = tmpEXP2*tmpEXP2;
1099 fg_exp_l[t2][0] = K(1.0);
1100 for (lj_fg = 1; lj_fg <= (2*ths->
m+2); lj_fg++)
1102 tmp3 = tmp2*tmpEXP2;
1104 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
1108 #pragma omp parallel for default(shared) private(k,t,t2)
1109 for (k = 0; k < ths->
M_total; k++)
1111 int ll_plain[ths->
d+1];
1112 R phi_prod[ths->
d+1];
1113 int u[ths->
d], o[ths->
d];
1116 R fg_psi[ths->
d][2*ths->
m+2];
1122 phi_prod[0] = K(1.0);
1125 MACRO_init_uo_l_lj_t;
1127 for (t2 = 0; t2 < ths->
d; t2++)
1129 fg_psi[t2][0] = (PHI((ths->
x[j*ths->
d+t2]-((R)u[t2])/ths->
n[t2]),t2));
1131 tmpEXP1 = EXP(K(2.0)*(ths->
n[t2]*ths->
x[j*ths->
d+t2] - u[t2])
1134 for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1137 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1141 for (l_L = 0; l_L < lprod; l_L++)
1143 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1145 ths->
f[j] += phi_prod[ths->
d] * ths->
g[ll_plain[ths->
d]];
1147 MACRO_count_uo_l_lj_t;
1155 nfft_sort_nodes(ths);
1157 #pragma omp parallel for default(shared) private(k)
1158 for (k = 0; k<ths->
M_total; k++)
1160 int u[ths->
d], o[ths->
d];
1165 int ll_plain[ths->
d+1];
1166 R phi_prod[ths->
d+1];
1168 R fg_psi[ths->
d][2*ths->
m+2];
1172 int ip_s = ths->
K/(ths->
m+2);
1175 phi_prod[0] = K(1.0);
1178 MACRO_init_uo_l_lj_t;
1180 for (t2 = 0; t2 < ths->
d; t2++)
1182 y[t2] = ((ths->
n[t2]*ths->
x[j*ths->
d+t2]-(R)u[t2])
1183 * ((R)ths->
K))/(ths->
m+2);
1184 ip_u = LRINT(FLOOR(y[t2]));
1186 for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++)
1188 fg_psi[t2][lj_fg] = ths->
psi[(ths->
K+1)*t2 + ABS(ip_u-lj_fg*ip_s)]
1189 * (1-ip_w) + ths->
psi[(ths->
K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)]
1194 for (l_L = 0; l_L < lprod; l_L++)
1196 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1198 ths->
f[j] += phi_prod[ths->
d] * ths->
g[ll_plain[ths->
d]];
1200 MACRO_count_uo_l_lj_t;
1207 nfft_sort_nodes(ths);
1209 #pragma omp parallel for default(shared) private(k)
1210 for (k = 0; k < ths->
M_total; k++)
1212 int u[ths->
d], o[ths->
d];
1217 int ll_plain[ths->
d+1];
1218 R phi_prod[ths->
d+1];
1221 phi_prod[0] = K(1.0);
1224 MACRO_init_uo_l_lj_t;
1226 for (l_L = 0; l_L < lprod; l_L++)
1228 MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
1230 ths->
f[j] += phi_prod[ths->
d] * ths->
g[ll_plain[ths->
d]];
1232 MACRO_count_uo_l_lj_t;
1241 nfft_B_openmp_A(ths);
1243 nfft_B_serial_A(ths);
1263 static inline int index_x_binary_search(
const int *ar_x,
const int len,
const int key)
1265 int left = 0, right = len - 1;
1270 while (left < right - 1)
1272 int i = (left + right) / 2;
1273 if (ar_x[2*i] >= key)
1275 else if (ar_x[2*i] < key)
1279 if (ar_x[2*left] < key && left != len-1)
1302 static void nfft_adjoint_B_omp_blockwise_init(
int *my_u0,
int *my_o0,
int *min_u_a,
int *max_u_a,
int *min_u_b,
int *max_u_b,
const int d,
const int *n,
const int m)
1304 const int n0 = n[0];
1306 int nthreads = omp_get_num_threads();
1307 int nthreads_used = MIN(nthreads, n0);
1308 int size_per_thread = n0 / nthreads_used;
1309 int size_left = n0 - size_per_thread * nthreads_used;
1310 int size_g[nthreads_used];
1311 int offset_g[nthreads_used];
1312 int my_id = omp_get_thread_num();
1313 int n_prod_rest = 1;
1315 for (k = 1; k < d; k++)
1316 n_prod_rest *= n[k];
1325 if (my_id < nthreads_used)
1327 const int m22 = 2 * m + 2;
1330 for (k = 0; k < nthreads_used; k++)
1333 offset_g[k] = offset_g[k-1] + size_g[k-1];
1334 size_g[k] = size_per_thread;
1342 *my_u0 = offset_g[my_id];
1343 *my_o0 = offset_g[my_id] + size_g[my_id] - 1;
1345 if (nthreads_used > 1)
1347 *max_u_a = n_prod_rest*(offset_g[my_id] + size_g[my_id]) - 1;
1348 *min_u_a = n_prod_rest*(offset_g[my_id] - m22 + 1);
1353 *max_u_a = n_prod_rest * n0 - 1;
1358 *min_u_b = n_prod_rest * (offset_g[my_id] - m22 + 1 + n0);
1359 *max_u_b = n_prod_rest * n0 - 1;
1363 if (*min_u_b != -1 && *min_u_b <= *max_u_a)
1365 *max_u_a = *max_u_b;
1370 assert(*min_u_a <= *max_u_a);
1371 assert(*min_u_b <= *max_u_b);
1372 assert(*min_u_b == -1 || *max_u_a < *min_u_b);
1386 static void nfft_adjoint_B_compute_full_psi(
1387 C *g,
const int *psi_index_g,
const R *psi,
const C *f,
1388 const int M,
const int d,
const int *n,
const int m,
const int nfft_flags,
const int *index_x)
1391 int lprod, lprod_m1;
1394 for(t = 0, lprod = 1; t < d; t++)
1397 lprod_m1 = lprod / (2 * m + 2);
1400 if (nfft_flags & NFFT_OMP_BLOCKWISE_ADJOINT)
1402 #pragma omp parallel private(k)
1404 int my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b;
1405 const int *ar_x = index_x;
1406 int n_prod_rest = 1;
1408 for (k = 1; k < d; k++)
1409 n_prod_rest *= n[k];
1411 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, &min_u_b, &max_u_b, d, n, m);
1415 k = index_x_binary_search(ar_x, M, min_u_a);
1417 assert(ar_x[2*k] >= min_u_a || k == M-1);
1419 assert(ar_x[2*k-2] < min_u_a);
1424 int u_prod = ar_x[2*k];
1425 int j = ar_x[2*k+1];
1427 if (u_prod < min_u_a || u_prod > max_u_a)
1430 for (l0 = 0; l0 < 2 * m + 2; l0++)
1432 const int start_index = psi_index_g[j * lprod + l0 * lprod_m1];
1434 if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
1437 for (lrest = 0; lrest < lprod_m1; lrest++)
1439 const int l = l0 * lprod_m1 + lrest;
1440 g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1450 k = index_x_binary_search(ar_x, M, min_u_b);
1452 assert(ar_x[2*k] >= min_u_b || k == M-1);
1454 assert(ar_x[2*k-2] < min_u_b);
1459 int u_prod = ar_x[2*k];
1460 int j = ar_x[2*k+1];
1462 if (u_prod < min_u_b || u_prod > max_u_b)
1465 for (l0 = 0; l0 < 2 * m + 2; l0++)
1467 const int start_index = psi_index_g[j * lprod + l0 * lprod_m1];
1469 if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
1471 for (lrest = 0; lrest < lprod_m1; lrest++)
1473 const int l = l0 * lprod_m1 + lrest;
1474 g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1486 #pragma omp parallel for default(shared) private(k)
1487 for (k = 0; k < M; k++)
1490 int j = (nfft_flags & NFFT_SORT_NODES) ? index_x[2*k+1] : k;
1492 for (l = 0; l < lprod; l++)
1495 C val = psi[j * lprod + l] * f[j];
1496 C *gref = g + psi_index_g[j * lprod + l];
1497 R *gref_real = (R*) gref;
1500 gref_real[0] += creal(val);
1503 gref_real[1] += cimag(val);
1505 g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1516 static inline void nfft_B_openmp_T(
nfft_plan *ths)
1521 memset(ths->
g,0,ths->
n_total*
sizeof(C));
1523 for (k = 0, lprod = 1; k < ths->
d; k++)
1524 lprod *= (2*ths->
m+2);
1535 #pragma omp parallel for default(shared) private(k)
1536 for (k = 0; k < ths->
M_total; k++)
1538 int u[ths->
d], o[ths->
d];
1543 int ll_plain[ths->
d+1];
1544 R phi_prod[ths->
d+1];
1547 phi_prod[0] = K(1.0);
1550 MACRO_init_uo_l_lj_t;
1552 for (l_L = 0; l_L < lprod; l_L++)
1558 MACRO_update_phi_prod_ll_plain(with_PRE_PSI);
1560 lhs = ths->
g + ll_plain[ths->
d];
1562 val = phi_prod[ths->
d] * ths->
f[j];
1565 lhs_real[0] += creal(val);
1568 lhs_real[1] += cimag(val);
1570 MACRO_count_uo_l_lj_t;
1579 R fg_exp_l[ths->
d][2*ths->
m+2];
1580 for(t2 = 0; t2 < ths->
d; t2++)
1583 R tmpEXP2 = EXP(K(-1.0)/ths->
b[t2]);
1584 R tmpEXP2sq = tmpEXP2*tmpEXP2;
1587 fg_exp_l[t2][0] = K(1.0);
1588 for(lj_fg = 1; lj_fg <= (2*ths->
m+2); lj_fg++)
1590 tmp3 = tmp2*tmpEXP2;
1592 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
1596 #pragma omp parallel for default(shared) private(k,t,t2)
1597 for (k = 0; k < ths->
M_total; k++)
1599 int ll_plain[ths->
d+1];
1600 R phi_prod[ths->
d+1];
1601 int u[ths->
d], o[ths->
d];
1604 R fg_psi[ths->
d][2*ths->
m+2];
1610 phi_prod[0] = K(1.0);
1613 MACRO_init_uo_l_lj_t;
1615 for (t2 = 0; t2 < ths->
d; t2++)
1617 fg_psi[t2][0] = ths->
psi[2*(j*ths->
d+t2)];
1618 tmpEXP1 = ths->
psi[2*(j*ths->
d+t2)+1];
1620 for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1623 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1627 for (l_L= 0; l_L < lprod; l_L++)
1633 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1635 lhs = ths->
g + ll_plain[ths->
d];
1637 val = phi_prod[ths->
d] * ths->
f[j];
1640 lhs_real[0] += creal(val);
1643 lhs_real[1] += cimag(val);
1645 MACRO_count_uo_l_lj_t;
1654 R fg_exp_l[ths->
d][2*ths->
m+2];
1656 nfft_sort_nodes(ths);
1658 for (t2 = 0; t2 < ths->
d; t2++)
1661 R tmpEXP2 = EXP(K(-1.0)/ths->
b[t2]);
1662 R tmpEXP2sq = tmpEXP2*tmpEXP2;
1665 fg_exp_l[t2][0] = K(1.0);
1666 for (lj_fg = 1; lj_fg <= (2*ths->
m+2); lj_fg++)
1668 tmp3 = tmp2*tmpEXP2;
1670 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
1674 #pragma omp parallel for default(shared) private(k,t,t2)
1675 for (k = 0; k < ths->
M_total; k++)
1677 int ll_plain[ths->
d+1];
1678 R phi_prod[ths->
d+1];
1679 int u[ths->
d], o[ths->
d];
1682 R fg_psi[ths->
d][2*ths->
m+2];
1688 phi_prod[0] = K(1.0);
1691 MACRO_init_uo_l_lj_t;
1693 for (t2 = 0; t2 < ths->
d; t2++)
1695 fg_psi[t2][0] = (PHI((ths->
x[j*ths->
d+t2]-((R)u[t2])/ths->
n[t2]),t2));
1697 tmpEXP1 = EXP(K(2.0)*(ths->
n[t2]*ths->
x[j*ths->
d+t2] - u[t2])
1700 for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1703 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1707 for (l_L = 0; l_L < lprod; l_L++)
1713 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1715 lhs = ths->
g + ll_plain[ths->
d];
1717 val = phi_prod[ths->
d] * ths->
f[j];
1720 lhs_real[0] += creal(val);
1723 lhs_real[1] += cimag(val);
1725 MACRO_count_uo_l_lj_t;
1733 nfft_sort_nodes(ths);
1735 #pragma omp parallel for default(shared) private(k)
1736 for (k = 0; k<ths->
M_total; k++)
1738 int u[ths->
d], o[ths->
d];
1743 int ll_plain[ths->
d+1];
1744 R phi_prod[ths->
d+1];
1746 R fg_psi[ths->
d][2*ths->
m+2];
1750 int ip_s = ths->
K/(ths->
m+2);
1753 phi_prod[0] = K(1.0);
1756 MACRO_init_uo_l_lj_t;
1758 for (t2 = 0; t2 < ths->
d; t2++)
1760 y[t2] = ((ths->
n[t2]*ths->
x[j*ths->
d+t2]-(R)u[t2])
1761 * ((R)ths->
K))/(ths->
m+2);
1762 ip_u = LRINT(FLOOR(y[t2]));
1764 for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++)
1766 fg_psi[t2][lj_fg] = ths->
psi[(ths->
K+1)*t2 + ABS(ip_u-lj_fg*ip_s)]
1767 * (1-ip_w) + ths->
psi[(ths->
K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)]
1772 for (l_L = 0; l_L < lprod; l_L++)
1778 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1780 lhs = ths->
g + ll_plain[ths->
d];
1782 val = phi_prod[ths->
d] * ths->
f[j];
1785 lhs_real[0] += creal(val);
1788 lhs_real[1] += cimag(val);
1790 MACRO_count_uo_l_lj_t;
1797 nfft_sort_nodes(ths);
1799 #pragma omp parallel for default(shared) private(k)
1800 for (k = 0; k < ths->
M_total; k++)
1802 int u[ths->
d], o[ths->
d];
1807 int ll_plain[ths->
d+1];
1808 R phi_prod[ths->
d+1];
1811 phi_prod[0] = K(1.0);
1814 MACRO_init_uo_l_lj_t;
1816 for (l_L = 0; l_L < lprod; l_L++)
1822 MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
1824 lhs = ths->
g + ll_plain[ths->
d];
1826 val = phi_prod[ths->
d] * ths->
f[j];
1829 lhs_real[0] += creal(val);
1832 lhs_real[1] += cimag(val);
1834 MACRO_count_uo_l_lj_t;
1843 nfft_B_openmp_T(ths);
1845 nfft_B_serial_T(ths);
1851 static void nfft_1d_init_fg_exp_l(R *fg_exp_l,
const int m,
const R b)
1853 const int tmp2 = 2*m+2;
1855 R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
1857 fg_exp_b0 = EXP(K(-1.0)/b);
1858 fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
1859 fg_exp_b1 = fg_exp_b2 =fg_exp_l[0] = K(1.0);
1861 for (l = 1; l < tmp2; l++)
1863 fg_exp_b2 = fg_exp_b1*fg_exp_b0;
1864 fg_exp_b1 *= fg_exp_b0_sq;
1865 fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
1870 static void nfft_trafo_1d_compute(C *fj,
const C *g,
const R *psij_const,
1871 const R *xj,
const int n,
const int m)
1878 nfft_uo2(&u, &o, *xj, n, m);
1882 for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l <= 2*m+1; l++)
1883 (*fj) += (*psij++) * (*gj++);
1887 for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l < 2*m+1 - o; l++)
1888 (*fj) += (*psij++) * (*gj++);
1889 for (l = 0, gj = g; l <= o; l++)
1890 (*fj) += (*psij++) * (*gj++);
1895 static void nfft_adjoint_1d_compute_serial(
const C *fj, C *g,
const R *psij_const,
1896 const R *xj,
const int n,
const int m)
1903 nfft_uo2(&u,&o,*xj, n, m);
1907 for (l = 0, gj = g+u; l <= 2*m+1; l++)
1908 (*gj++) += (*psij++) * (*fj);
1912 for (l = 0, gj = g+u; l < 2*m+1-o; l++)
1913 (*gj++) += (*psij++) * (*fj);
1914 for (l = 0, gj = g; l <= o; l++)
1915 (*gj++) += (*psij++) * (*fj);
1922 static void nfft_adjoint_1d_compute_omp_atomic(
const C f, C *g,
const R *psij_const,
1923 const R *xj,
const int n,
const int m)
1927 int index_temp[2*m+2];
1929 nfft_uo2(&u,&o,*xj, n, m);
1931 for (l=0; l<=2*m+1; l++)
1932 index_temp[l] = (l+u)%n;
1934 for (l = 0, gj = g+u; l <= 2*m+1; l++)
1936 int i = index_temp[l];
1938 R *lhs_real = (R*)lhs;
1939 C val = psij_const[l] * f;
1941 lhs_real[0] += creal(val);
1944 lhs_real[1] += cimag(val);
1965 static void nfft_adjoint_1d_compute_omp_blockwise(
const C f, C *g,
const R *psij_const,
1966 const R *xj,
const int n,
const int m,
const int my_u0,
const int my_o0)
1970 nfft_uo2(&ar_u,&ar_o,*xj, n, m);
1974 int u = MAX(my_u0,ar_u);
1975 int o = MIN(my_o0,ar_o);
1976 int offset_psij = u-ar_u;
1978 assert(offset_psij >= 0);
1979 assert(o-u <= 2*m+1);
1980 assert(offset_psij+o-u <= 2*m+1);
1983 for (l = 0; l <= o-u; l++)
1984 g[u+l] += psij_const[offset_psij+l] * f;
1988 int u = MAX(my_u0,ar_u);
1990 int offset_psij = u-ar_u;
1992 assert(offset_psij >= 0);
1993 assert(o-u <= 2*m+1);
1994 assert(offset_psij+o-u <= 2*m+1);
1997 for (l = 0; l <= o-u; l++)
1998 g[u+l] += psij_const[offset_psij+l] * f;
2001 o = MIN(my_o0,ar_o);
2002 offset_psij += my_u0-ar_u+n;
2007 assert(o-u <= 2*m+1);
2008 if (offset_psij+o-u > 2*m+1)
2010 fprintf(stderr,
"ERR: %d %d %d %d %d %d %d\n", ar_u, ar_o, my_u0, my_o0, u, o, offset_psij);
2012 assert(offset_psij+o-u <= 2*m+1);
2015 for (l = 0; l <= o-u; l++)
2016 g[u+l] += psij_const[offset_psij+l] * f;
2021 static void nfft_trafo_1d_B(
nfft_plan *ths)
2023 const int n = ths->
n[0], M = ths->
M_total, m = ths->
m, m2p2 = 2*m+2;
2024 const C *g = (C*)ths->
g;
2029 #pragma omp parallel for default(shared) private(k)
2030 for (k = 0; k < M; k++)
2035 for (l = 0; l < m2p2; l++)
2044 #pragma omp parallel for default(shared) private(k)
2045 for (k = 0; k < M; k++)
2048 nfft_trafo_1d_compute(&ths->
f[j], g, ths->
psi + j * (2 * m + 2),
2059 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->
b[0]);
2061 #pragma omp parallel for default(shared) private(k)
2062 for (k = 0; k < M; k++)
2065 const R fg_psij0 = ths->
psi[2 * j], fg_psij1 = ths->
psi[2 * j + 1];
2066 R fg_psij2 = K(1.0);
2070 psij_const[0] = fg_psij0;
2072 for (l = 1; l < m2p2; l++)
2074 fg_psij2 *= fg_psij1;
2075 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2078 nfft_trafo_1d_compute(&ths->
f[j], g, psij_const, &ths->
x[j], n, m);
2089 nfft_sort_nodes(ths);
2091 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->
b[0]);
2093 #pragma omp parallel for default(shared) private(k)
2094 for (k = 0; k < M; k++)
2098 R fg_psij0, fg_psij1, fg_psij2;
2101 nfft_uo(ths, (
int)j, &u, &o, 0);
2102 fg_psij0 = (PHI(ths->
x[j]-((R)u)/n,0));
2103 fg_psij1 = EXP(K(2.0) * (n * ths->
x[j] - u) / ths->
b[0]);
2106 psij_const[0] = fg_psij0;
2108 for (l = 1; l < m2p2; l++)
2110 fg_psij2 *= fg_psij1;
2111 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2114 nfft_trafo_1d_compute(&ths->
f[j], g, psij_const, &ths->
x[j], n, m);
2121 const int K = ths->
K, ip_s = K / (m + 2);
2124 nfft_sort_nodes(ths);
2126 #pragma omp parallel for default(shared) private(k)
2127 for (k = 0; k < M; k++)
2135 nfft_uo(ths, (
int)j, &u, &o, 0);
2137 ip_y = FABS(n * ths->
x[j] - u) * ((R)ip_s);
2138 ip_u = LRINT(FLOOR(ip_y));
2141 for (l = 0; l < m2p2; l++)
2142 psij_const[l] = ths->
psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
2143 + ths->
psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
2145 nfft_trafo_1d_compute(&ths->
f[j], g, psij_const, &ths->
x[j], n, m);
2154 nfft_sort_nodes(ths);
2156 #pragma omp parallel for default(shared) private(k)
2157 for (k = 0; k < M; k++)
2163 nfft_uo(ths, (
int)j, &u, &o, 0);
2165 for (l = 0; l < m2p2; l++)
2166 psij_const[l] = (PHI(ths->
x[j]-((R)((u+l)))/n,0));
2168 nfft_trafo_1d_compute(&ths->
f[j], g, psij_const, &ths->
x[j], n, m);
2175 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A \
2177 assert(ar_x[2*k] >= min_u_a || k == M-1); \
2179 assert(ar_x[2*k-2] < min_u_a); \
2182 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A
2186 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B \
2188 assert(ar_x[2*k] >= min_u_b || k == M-1); \
2190 assert(ar_x[2*k-2] < min_u_b); \
2193 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B
2196 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
2198 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, \
2199 ths->psi + j * (2 * m + 2), ths->x + j, n, m, my_u0, my_o0); \
2202 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
2204 R psij_const[2 * m + 2]; \
2206 R fg_psij0 = ths->psi[2 * j]; \
2207 R fg_psij1 = ths->psi[2 * j + 1]; \
2208 R fg_psij2 = K(1.0); \
2210 psij_const[0] = fg_psij0; \
2211 for (l = 1; l <= 2 * m + 1; l++) \
2213 fg_psij2 *= fg_psij1; \
2214 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; \
2217 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2218 ths->x + j, n, m, my_u0, my_o0); \
2221 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
2223 R psij_const[2 * m + 2]; \
2224 R fg_psij0, fg_psij1, fg_psij2; \
2227 nfft_uo(ths, j, &u, &o, 0); \
2228 fg_psij0 = (PHI(ths->x[j]-((R)u)/n,0)); \
2229 fg_psij1 = EXP(K(2.0) * (n * (ths->x[j]) - u) / ths->b[0]); \
2230 fg_psij2 = K(1.0); \
2231 psij_const[0] = fg_psij0; \
2232 for (l = 1; l <= 2 * m + 1; l++) \
2234 fg_psij2 *= fg_psij1; \
2235 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; \
2238 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2239 ths->x + j, n, m, my_u0, my_o0); \
2242 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
2244 R psij_const[2 * m + 2]; \
2249 nfft_uo(ths, j, &u, &o, 0); \
2251 ip_y = FABS(n * ths->x[j] - u) * ((R)ip_s); \
2252 ip_u = LRINT(FLOOR(ip_y)); \
2253 ip_w = ip_y - ip_u; \
2254 for (l = 0; l < 2 * m + 2; l++) \
2256 = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w) \
2257 + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w); \
2259 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2260 ths->x + j, n, m, my_u0, my_o0); \
2263 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
2265 R psij_const[2 * m + 2]; \
2268 nfft_uo(ths, j, &u, &o, 0); \
2270 for (l = 0; l <= 2 * m + 1; l++) \
2271 psij_const[l] = (PHI(ths->x[j]-((R)((u+l)))/n,0)); \
2273 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2274 ths->x + j, n, m, my_u0, my_o0); \
2277 #define MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE(whichone) \
2279 if (ths->nfft_flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
2281 _Pragma("omp parallel private(k)") \
2283 int my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
2284 int *ar_x = ths->index_x; \
2286 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
2287 &min_u_b, &max_u_b, 1, &n, m); \
2289 if (min_u_a != -1) \
2291 k = index_x_binary_search(ar_x, M, min_u_a); \
2293 MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A \
2297 int u_prod = ar_x[2*k]; \
2298 int j = ar_x[2*k+1]; \
2300 if (u_prod < min_u_a || u_prod > max_u_a) \
2303 MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
2309 if (min_u_b != -1) \
2311 k = index_x_binary_search(ar_x, M, min_u_b); \
2313 MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B \
2317 int u_prod = ar_x[2*k]; \
2318 int j = ar_x[2*k+1]; \
2320 if (u_prod < min_u_b || u_prod > max_u_b) \
2323 MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
2333 static void nfft_adjoint_1d_B(
nfft_plan *ths)
2335 const int n = ths->
n[0], M = ths->
M_total, m = ths->
m;
2339 memset(g,0,ths->
n_total*
sizeof(C));
2343 nfft_adjoint_B_compute_full_psi(g, ths->
psi_index_g, ths->
psi, ths->
f, M,
2351 MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE(PRE_PSI)
2354 #pragma omp parallel for default(shared) private(k)
2355 for (k = 0; k < M; k++)
2359 nfft_adjoint_1d_compute_omp_atomic(ths->
f[j], g, ths->
psi + j * (2 * m + 2), ths->
x + j, n, m);
2361 nfft_adjoint_1d_compute_serial(ths->
f + j, g, ths->
psi + j * (2 * m + 2), ths->
x + j, n, m);
2370 R fg_exp_l[2 * m + 2];
2372 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->
b[0]);
2375 MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE(PRE_FG_PSI)
2379 #pragma omp parallel for default(shared) private(k)
2380 for (k = 0; k < M; k++)
2382 R psij_const[2 * m + 2];
2385 R fg_psij0 = ths->
psi[2 * j];
2386 R fg_psij1 = ths->
psi[2 * j + 1];
2387 R fg_psij2 = K(1.0);
2389 psij_const[0] = fg_psij0;
2390 for (l = 1; l <= 2 * m + 1; l++)
2392 fg_psij2 *= fg_psij1;
2393 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2397 nfft_adjoint_1d_compute_omp_atomic(ths->
f[j], g, psij_const, ths->
x + j, n, m);
2399 nfft_adjoint_1d_compute_serial(ths->
f + j, g, psij_const, ths->
x + j, n, m);
2408 R fg_exp_l[2 * m + 2];
2410 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->
b[0]);
2412 nfft_sort_nodes(ths);
2415 MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE(FG_PSI)
2418 #pragma omp parallel for default(shared) private(k)
2419 for (k = 0; k < M; k++)
2422 R psij_const[2 * m + 2];
2423 R fg_psij0, fg_psij1, fg_psij2;
2426 nfft_uo(ths, j, &u, &o, 0);
2427 fg_psij0 = (PHI(ths->
x[j]-((R)u)/n,0));
2428 fg_psij1 = EXP(K(2.0) * (n * (ths->
x[j]) - u) / ths->
b[0]);
2430 psij_const[0] = fg_psij0;
2431 for (l = 1; l <= 2 * m + 1; l++)
2433 fg_psij2 *= fg_psij1;
2434 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2438 nfft_adjoint_1d_compute_omp_atomic(ths->
f[j], g, psij_const, ths->
x + j, n, m);
2440 nfft_adjoint_1d_compute_serial(ths->
f + j, g, psij_const, ths->
x + j, n, m);
2449 const int K = ths->
K;
2450 const int ip_s = K / (m + 2);
2452 nfft_sort_nodes(ths);
2455 MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
2458 #pragma openmp parallel for default(shared) private(k)
2459 for (k = 0; k < M; k++)
2465 R psij_const[2 * m + 2];
2467 nfft_uo(ths, j, &u, &o, 0);
2469 ip_y = FABS(n * ths->
x[j] - u) * ((R)ip_s);
2470 ip_u = LRINT(FLOOR(ip_y));
2472 for (l = 0; l < 2 * m + 2; l++)
2474 = ths->
psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
2475 + ths->
psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
2478 nfft_adjoint_1d_compute_omp_atomic(ths->
f[j], g, psij_const, ths->
x + j, n, m);
2480 nfft_adjoint_1d_compute_serial(ths->
f + j, g, psij_const, ths->
x + j, n, m);
2487 nfft_sort_nodes(ths);
2490 MACRO_nfft_adjoint_1d_B_OMP_BLOCKWISE(NO_PSI)
2493 #pragma omp parallel for default(shared) private(k)
2494 for (k = 0; k < M; k++)
2497 R psij_const[2 * m + 2];
2500 nfft_uo(ths, j, &u, &o, 0);
2502 for (l = 0; l <= 2 * m + 1; l++)
2503 psij_const[l] = (PHI(ths->
x[j]-((R)((u+l)))/n,0));
2506 nfft_adjoint_1d_compute_omp_atomic(ths->
f[j], g, psij_const, ths->
x + j, n, m);
2508 nfft_adjoint_1d_compute_serial(ths->
f + j, g, psij_const, ths->
x + j, n, m);
2515 const int N = ths->
N[0], N2 = N/2, n = ths->
n[0];
2516 C *f_hat1 = (C*)ths->
f_hat, *f_hat2 = (C*)&ths->
f_hat[N2];
2522 C *g_hat1 = (C*)&ths->
g_hat[n-N/2], *g_hat2 = (C*)ths->
g_hat;
2523 R *c_phi_inv1, *c_phi_inv2;
2529 #pragma omp parallel for default(shared) private(k)
2530 for (k = 0; k < ths->
n_total; k++)
2531 ths->
g_hat[k] = 0.0;
2542 #pragma omp parallel for default(shared) private(k)
2543 for (k = 0; k < N2; k++)
2545 g_hat1[k] = f_hat1[k] * c_phi_inv1[k];
2546 g_hat2[k] = f_hat2[k] * c_phi_inv2[k];
2552 #pragma omp parallel for default(shared) private(k)
2553 for (k = 0; k < N2; k++)
2555 g_hat1[k] = f_hat1[k] / (PHI_HUT(k-N2,0));
2556 g_hat2[k] = f_hat2[k] / (PHI_HUT(k,0));
2562 fftw_execute(ths->my_fftw_plan1);
2566 nfft_trafo_1d_B(ths);
2574 C *g_hat1,*g_hat2,*f_hat1,*f_hat2;
2575 R *c_phi_inv1, *c_phi_inv2;
2583 f_hat1=(C*)ths->f_hat;
2584 f_hat2=(C*)&ths->f_hat[N/2];
2585 g_hat1=(C*)&ths->g_hat[n-N/2];
2586 g_hat2=(C*)ths->g_hat;
2589 nfft_adjoint_1d_B(ths);
2593 fftw_execute(ths->my_fftw_plan2);
2597 if(ths->nfft_flags & PRE_PHI_HUT)
2600 c_phi_inv1=ths->c_phi_inv[0];
2601 c_phi_inv2=&ths->c_phi_inv[0][N/2];
2603 #pragma omp parallel for default(shared) private(k)
2604 for (k = 0; k < N/2; k++)
2606 f_hat1[k] = g_hat1[k] * c_phi_inv1[k];
2607 f_hat2[k] = g_hat2[k] * c_phi_inv2[k];
2614 #pragma omp parallel for default(shared) private(k)
2615 for (k = 0; k < N/2; k++)
2617 f_hat1[k] = g_hat1[k] / (PHI_HUT(k-N/2,0));
2618 f_hat2[k] = g_hat2[k] / (PHI_HUT(k,0));
2627 static void nfft_2d_init_fg_exp_l(R *fg_exp_l,
const int m,
const R b)
2630 R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
2632 fg_exp_b0 = EXP(K(-1.0)/b);
2633 fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
2636 fg_exp_l[0] = K(1.0);
2637 for(l=1; l <= 2*m+1; l++)
2639 fg_exp_b2 = fg_exp_b1*fg_exp_b0;
2640 fg_exp_b1 *= fg_exp_b0_sq;
2641 fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
2645 static void nfft_trafo_2d_compute(C *fj,
const C *g,
2646 const R *psij_const0,
const R *psij_const1,
2647 const R *xj0,
const R *xj1,
2648 const int n0,
const int n1,
const int m)
2650 int u0,o0,l0,u1,o1,l1;
2652 const R *psij0,*psij1;
2657 nfft_uo2(&u0,&o0,*xj0, n0, m);
2658 nfft_uo2(&u1,&o1,*xj1, n1, m);
2664 for(l0=0; l0<=2*m+1; l0++,psij0++)
2668 for(l1=0; l1<=2*m+1; l1++)
2669 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2672 for(l0=0; l0<=2*m+1; l0++,psij0++)
2676 for(l1=0; l1<2*m+1-o1; l1++)
2677 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2679 for(l1=0; l1<=o1; l1++)
2680 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2685 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2689 for(l1=0; l1<=2*m+1; l1++)
2690 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2692 for(l0=0; l0<=o0; l0++,psij0++)
2696 for(l1=0; l1<=2*m+1; l1++)
2697 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2702 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2706 for(l1=0; l1<2*m+1-o1; l1++)
2707 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2709 for(l1=0; l1<=o1; l1++)
2710 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2712 for(l0=0; l0<=o0; l0++,psij0++)
2716 for(l1=0; l1<2*m+1-o1; l1++)
2717 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2719 for(l1=0; l1<=o1; l1++)
2720 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2727 static void nfft_adjoint_2d_compute_omp_atomic(
const C f, C *g,
2728 const R *psij_const0,
const R *psij_const1,
2729 const R *xj0,
const R *xj1,
2730 const int n0,
const int n1,
const int m)
2732 int u0,o0,l0,u1,o1,l1;
2733 const int lprod = (2*m+2) * (2*m+2);
2735 unsigned long int index_temp0[2*m+2];
2736 unsigned long int index_temp1[2*m+2];
2738 nfft_uo2(&u0,&o0,*xj0, n0, m);
2739 nfft_uo2(&u1,&o1,*xj1, n1, m);
2741 for (l0=0; l0<=2*m+1; l0++)
2742 index_temp0[l0] = (u0+l0)%n0;
2744 for (l1=0; l1<=2*m+1; l1++)
2745 index_temp1[l1] = (u1+l1)%n1;
2747 for(l0=0; l0<=2*m+1; l0++)
2749 for(l1=0; l1<=2*m+1; l1++)
2751 unsigned long int i = index_temp0[l0] * n1 + index_temp1[l1];
2753 R *lhs_real = (R*)lhs;
2754 C val = psij_const0[l0] * psij_const1[l1] * f;
2757 lhs_real[0] += creal(val);
2760 lhs_real[1] += cimag(val);
2785 static void nfft_adjoint_2d_compute_omp_blockwise(
const C f, C *g,
2786 const R *psij_const0,
const R *psij_const1,
2787 const R *xj0,
const R *xj1,
2788 const int n0,
const int n1,
const int m,
2789 const int my_u0,
const int my_o0)
2791 int ar_u0,ar_o0,l0,u1,o1,l1;
2792 const int lprod = (2*m+2) * (2*m+2);
2793 unsigned long int index_temp1[2*m+2];
2795 nfft_uo2(&ar_u0,&ar_o0,*xj0, n0, m);
2796 nfft_uo2(&u1,&o1,*xj1, n1, m);
2798 for (l1=0; l1<=2*m+1; l1++)
2799 index_temp1[l1] = (u1+l1)%n1;
2803 int u0 = MAX(my_u0,ar_u0);
2804 int o0 = MIN(my_o0,ar_o0);
2805 int offset_psij = u0-ar_u0;
2807 assert(offset_psij >= 0);
2808 assert(o0-u0 <= 2*m+1);
2809 assert(offset_psij+o0-u0 <= 2*m+1);
2812 for (l0 = 0; l0 <= o0-u0; l0++)
2814 unsigned long int i0 = (u0+l0) * n1;
2815 const C val0 = psij_const0[offset_psij+l0];
2817 for(l1=0; l1<=2*m+1; l1++)
2818 g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
2823 int u0 = MAX(my_u0,ar_u0);
2825 int offset_psij = u0-ar_u0;
2827 assert(offset_psij >= 0);
2828 assert(o0-u0 <= 2*m+1);
2829 assert(offset_psij+o0-u0 <= 2*m+1);
2832 for (l0 = 0; l0 <= o0-u0; l0++)
2834 unsigned long int i0 = (u0+l0) * n1;
2835 const C val0 = psij_const0[offset_psij+l0];
2837 for(l1=0; l1<=2*m+1; l1++)
2838 g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
2842 o0 = MIN(my_o0,ar_o0);
2843 offset_psij += my_u0-ar_u0+n0;
2848 assert(o0-u0 <= 2*m+1);
2849 assert(offset_psij+o0-u0 <= 2*m+1);
2853 for (l0 = 0; l0 <= o0-u0; l0++)
2855 unsigned long int i0 = (u0+l0) * n1;
2856 const C val0 = psij_const0[offset_psij+l0];
2858 for(l1=0; l1<=2*m+1; l1++)
2859 g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
2866 static void nfft_adjoint_2d_compute_serial(
const C *fj, C *g,
2867 const R *psij_const0,
const R *psij_const1,
2868 const R *xj0,
const R *xj1,
2869 const int n0,
const int n1,
const int m)
2871 int u0,o0,l0,u1,o1,l1;
2873 const R *psij0,*psij1;
2878 nfft_uo2(&u0,&o0,*xj0, n0, m);
2879 nfft_uo2(&u1,&o1,*xj1, n1, m);
2883 for(l0=0; l0<=2*m+1; l0++,psij0++)
2887 for(l1=0; l1<=2*m+1; l1++)
2888 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2891 for(l0=0; l0<=2*m+1; l0++,psij0++)
2895 for(l1=0; l1<2*m+1-o1; l1++)
2896 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2898 for(l1=0; l1<=o1; l1++)
2899 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2904 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2908 for(l1=0; l1<=2*m+1; l1++)
2909 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2911 for(l0=0; l0<=o0; l0++,psij0++)
2915 for(l1=0; l1<=2*m+1; l1++)
2916 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2921 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2925 for(l1=0; l1<2*m+1-o1; l1++)
2926 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2928 for(l1=0; l1<=o1; l1++)
2929 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2931 for(l0=0; l0<=o0; l0++,psij0++)
2935 for(l1=0; l1<2*m+1-o1; l1++)
2936 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2938 for(l1=0; l1<=o1; l1++)
2939 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2945 static void nfft_trafo_2d_B(
nfft_plan *ths)
2947 const C *g = (C*)ths->
g;
2948 const int N0 = ths->
N[0];
2949 const int n0 = ths->
n[0];
2950 const int N1 = ths->
N[1];
2951 const int n1 = ths->
n[1];
2953 const int m = ths->
m;
2959 const int lprod = (2*m+2) * (2*m+2);
2960 #pragma omp parallel for default(shared) private(k)
2961 for (k = 0; k < M; k++)
2966 for (l = 0; l < lprod; l++)
2974 #pragma omp parallel for default(shared) private(k)
2975 for (k = 0; k < M; k++)
2978 nfft_trafo_2d_compute(ths->
f+j, g, ths->
psi+j*2*(2*m+2), ths->
psi+(j*2+1)*(2*m+2), ths->
x+2*j, ths->
x+2*j+1, n0, n1, m);
2986 R fg_exp_l[2*(2*m+2)];
2988 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->
b[0]);
2989 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->
b[1]);
2991 #pragma omp parallel for default(shared) private(k)
2992 for (k = 0; k < M; k++)
2994 R psij_const[2*(2*m+2)];
2997 R fg_psij0 = ths->
psi[2*j*2];
2998 R fg_psij1 = ths->
psi[2*j*2+1];
2999 R fg_psij2 = K(1.0);
3001 psij_const[0] = fg_psij0;
3002 for(l=1; l<=2*m+1; l++)
3004 fg_psij2 *= fg_psij1;
3005 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3008 fg_psij0 = ths->
psi[2*(j*2+1)];
3009 fg_psij1 = ths->
psi[2*(j*2+1)+1];
3011 psij_const[2*m+2] = fg_psij0;
3012 for(l=1; l<=2*m+1; l++)
3014 fg_psij2 *= fg_psij1;
3015 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3018 nfft_trafo_2d_compute(ths->
f+j, g, psij_const, psij_const+2*m+2, ths->
x+2*j, ths->
x+2*j+1, n0, n1, m);
3026 R fg_exp_l[2*(2*m+2)];
3028 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->
b[0]);
3029 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->
b[1]);
3031 nfft_sort_nodes(ths);
3033 #pragma omp parallel for default(shared) private(k)
3034 for (k = 0; k < M; k++)
3037 R fg_psij0, fg_psij1, fg_psij2;
3038 R psij_const[2*(2*m+2)];
3041 nfft_uo(ths,j,&u,&o,0);
3042 fg_psij0 = (PHI(ths->
x[2*j]-((R)u)/n0,0));
3043 fg_psij1 = EXP(K(2.0)*(n0*(ths->
x[2*j]) - u)/ths->
b[0]);
3045 psij_const[0] = fg_psij0;
3046 for(l=1; l<=2*m+1; l++)
3048 fg_psij2 *= fg_psij1;
3049 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3052 nfft_uo(ths,j,&u,&o,1);
3053 fg_psij0 = (PHI(ths->
x[2*j+1]-((R)u)/n1,1));
3054 fg_psij1 = EXP(K(2.0)*(n1*(ths->
x[2*j+1]) - u)/ths->
b[1]);
3056 psij_const[2*m+2] = fg_psij0;
3057 for(l=1; l<=2*m+1; l++)
3059 fg_psij2 *= fg_psij1;
3060 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3063 nfft_trafo_2d_compute(ths->
f+j, g, psij_const, psij_const+2*m+2, ths->
x+2*j, ths->
x+2*j+1, n0, n1, m);
3071 const int K = ths->
K, ip_s = K / (m + 2);
3073 nfft_sort_nodes(ths);
3075 #pragma omp parallel for default(shared) private(k)
3076 for (k = 0; k < M; k++)
3081 R psij_const[2*(2*m+2)];
3084 nfft_uo(ths,j,&u,&o,0);
3085 ip_y = FABS(n0*ths->
x[2*j] - u)*((R)ip_s);
3086 ip_u = LRINT(FLOOR(ip_y));
3088 for(l=0; l < 2*m+2; l++)
3089 psij_const[l] = ths->
psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->
psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
3091 nfft_uo(ths,j,&u,&o,1);
3092 ip_y = FABS(n1*ths->
x[2*j+1] - u)*((R)ip_s);
3093 ip_u = LRINT(FLOOR(ip_y));
3095 for(l=0; l < 2*m+2; l++)
3096 psij_const[2*m+2+l] = ths->
psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->
psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
3098 nfft_trafo_2d_compute(ths->
f+j, g, psij_const, psij_const+2*m+2, ths->
x+2*j, ths->
x+2*j+1, n0, n1, m);
3105 nfft_sort_nodes(ths);
3107 #pragma omp parallel for default(shared) private(k)
3108 for (k = 0; k < M; k++)
3110 R psij_const[2*(2*m+2)];
3114 nfft_uo(ths,j,&u,&o,0);
3115 for(l=0;l<=2*m+1;l++)
3116 psij_const[l]=(PHI(ths->
x[2*j]-((R)((u+l)))/n0,0));
3118 nfft_uo(ths,j,&u,&o,1);
3119 for(l=0;l<=2*m+1;l++)
3120 psij_const[2*m+2+l]=(PHI(ths->
x[2*j+1]-((R)((u+l)))/n1,1));
3122 nfft_trafo_2d_compute(ths->
f+j, g, psij_const, psij_const+2*m+2, ths->
x+2*j, ths->
x+2*j+1, n0, n1, m);
3127 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A \
3129 assert(ar_x[2*k] >= min_u_a || k == M-1); \
3131 assert(ar_x[2*k-2] < min_u_a); \
3134 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A
3138 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B \
3140 assert(ar_x[2*k] >= min_u_b || k == M-1); \
3142 assert(ar_x[2*k-2] < min_u_b); \
3145 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B
3148 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
3149 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3150 ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), \
3151 ths->x+2*j, ths->x+2*j+1, n0, n1, m, my_u0, my_o0);
3153 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
3155 R psij_const[2*(2*m+2)]; \
3157 R fg_psij0 = ths->psi[2*j*2]; \
3158 R fg_psij1 = ths->psi[2*j*2+1]; \
3159 R fg_psij2 = K(1.0); \
3161 psij_const[0] = fg_psij0; \
3162 for(l=1; l<=2*m+1; l++) \
3164 fg_psij2 *= fg_psij1; \
3165 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
3168 fg_psij0 = ths->psi[2*(j*2+1)]; \
3169 fg_psij1 = ths->psi[2*(j*2+1)+1]; \
3170 fg_psij2 = K(1.0); \
3171 psij_const[2*m+2] = fg_psij0; \
3172 for(l=1; l<=2*m+1; l++) \
3174 fg_psij2 *= fg_psij1; \
3175 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
3178 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3179 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3180 n0, n1, m, my_u0, my_o0); \
3183 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
3185 R psij_const[2*(2*m+2)]; \
3186 R fg_psij0, fg_psij1, fg_psij2; \
3189 nfft_uo(ths,j,&u,&o,0); \
3190 fg_psij0 = (PHI(ths->x[2*j]-((R)u)/n0,0)); \
3191 fg_psij1 = EXP(K(2.0)*(n0*(ths->x[2*j]) - u)/ths->b[0]); \
3192 fg_psij2 = K(1.0); \
3193 psij_const[0] = fg_psij0; \
3194 for(l=1; l<=2*m+1; l++) \
3196 fg_psij2 *= fg_psij1; \
3197 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
3200 nfft_uo(ths,j,&u,&o,1); \
3201 fg_psij0 = (PHI(ths->x[2*j+1]-((R)u)/n1,1)); \
3202 fg_psij1 = EXP(K(2.0)*(n1*(ths->x[2*j+1]) - u)/ths->b[1]); \
3203 fg_psij2 = K(1.0); \
3204 psij_const[2*m+2] = fg_psij0; \
3205 for(l=1; l<=2*m+1; l++) \
3207 fg_psij2 *= fg_psij1; \
3208 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
3211 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3212 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3213 n0, n1, m, my_u0, my_o0); \
3216 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
3218 R psij_const[2*(2*m+2)]; \
3223 nfft_uo(ths,j,&u,&o,0); \
3224 ip_y = FABS(n0*(ths->x[2*j]) - u)*((R)ip_s); \
3225 ip_u = LRINT(FLOOR(ip_y)); \
3227 for(l=0; l < 2*m+2; l++) \
3228 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
3229 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); \
3231 nfft_uo(ths,j,&u,&o,1); \
3232 ip_y = FABS(n1*(ths->x[2*j+1]) - u)*((R)ip_s); \
3233 ip_u = LRINT(FLOOR(ip_y)); \
3235 for(l=0; l < 2*m+2; l++) \
3236 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
3237 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
3239 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3240 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3241 n0, n1, m, my_u0, my_o0); \
3244 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
3246 R psij_const[2*(2*m+2)]; \
3249 nfft_uo(ths,j,&u,&o,0); \
3250 for(l=0;l<=2*m+1;l++) \
3251 psij_const[l]=(PHI(ths->x[2*j]-((R)((u+l)))/n0,0)); \
3253 nfft_uo(ths,j,&u,&o,1); \
3254 for(l=0;l<=2*m+1;l++) \
3255 psij_const[2*m+2+l]=(PHI(ths->x[2*j+1]-((R)((u+l)))/n1,1)); \
3257 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3258 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3259 n0, n1, m, my_u0, my_o0); \
3262 #define MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE(whichone) \
3264 if (ths->nfft_flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
3266 _Pragma("omp parallel private(k)") \
3268 int my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
3269 int *ar_x = ths->index_x; \
3271 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
3272 &min_u_b, &max_u_b, 2, ths->n, m); \
3274 if (min_u_a != -1) \
3276 k = index_x_binary_search(ar_x, M, min_u_a); \
3278 MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A \
3282 int u_prod = ar_x[2*k]; \
3283 int j = ar_x[2*k+1]; \
3285 if (u_prod < min_u_a || u_prod > max_u_a) \
3288 MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
3294 if (min_u_b != -1) \
3296 int k = index_x_binary_search(ar_x, M, min_u_b); \
3298 MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B \
3302 int u_prod = ar_x[2*k]; \
3303 int j = ar_x[2*k+1]; \
3305 if (u_prod < min_u_b || u_prod > max_u_b) \
3308 MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
3319 static void nfft_adjoint_2d_B(
nfft_plan *ths)
3321 const int N0 = ths->
N[0];
3322 const int n0 = ths->
n[0];
3323 const int N1 = ths->
N[1];
3324 const int n1 = ths->
n[1];
3326 const int m = ths->
m;
3330 memset(g,0,ths->
n_total*
sizeof(C));
3334 nfft_adjoint_B_compute_full_psi(g, ths->
psi_index_g, ths->
psi, ths->
f, M,
3342 MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE(PRE_PSI)
3345 #pragma omp parallel for default(shared) private(k)
3346 for (k = 0; k < M; k++)
3350 nfft_adjoint_2d_compute_omp_atomic(ths->
f[j], g, ths->
psi+j*2*(2*m+2), ths->
psi+(j*2+1)*(2*m+2), ths->
x+2*j, ths->
x+2*j+1, n0, n1, m);
3352 nfft_adjoint_2d_compute_serial(ths->
f+j, g, ths->
psi+j*2*(2*m+2), ths->
psi+(j*2+1)*(2*m+2), ths->
x+2*j, ths->
x+2*j+1, n0, n1, m);
3360 R fg_exp_l[2*(2*m+2)];
3362 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->
b[0]);
3363 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->
b[1]);
3366 MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE(PRE_FG_PSI)
3370 #pragma omp parallel for default(shared) private(k)
3371 for (k = 0; k < M; k++)
3373 R psij_const[2*(2*m+2)];
3376 R fg_psij0 = ths->
psi[2*j*2];
3377 R fg_psij1 = ths->
psi[2*j*2+1];
3378 R fg_psij2 = K(1.0);
3380 psij_const[0] = fg_psij0;
3381 for(l=1; l<=2*m+1; l++)
3383 fg_psij2 *= fg_psij1;
3384 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3387 fg_psij0 = ths->
psi[2*(j*2+1)];
3388 fg_psij1 = ths->
psi[2*(j*2+1)+1];
3390 psij_const[2*m+2] = fg_psij0;
3391 for(l=1; l<=2*m+1; l++)
3393 fg_psij2 *= fg_psij1;
3394 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3398 nfft_adjoint_2d_compute_omp_atomic(ths->
f[j], g, psij_const, psij_const+2*m+2, ths->
x+2*j, ths->
x+2*j+1, n0, n1, m);
3400 nfft_adjoint_2d_compute_serial(ths->
f+j, g, psij_const, psij_const+2*m+2, ths->
x+2*j, ths->
x+2*j+1, n0, n1, m);
3409 R fg_exp_l[2*(2*m+2)];
3411 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->
b[0]);
3412 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->
b[1]);
3414 nfft_sort_nodes(ths);
3417 MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE(FG_PSI)
3420 #pragma omp parallel for default(shared) private(k)
3421 for (k = 0; k < M; k++)
3424 R fg_psij0, fg_psij1, fg_psij2;
3425 R psij_const[2*(2*m+2)];
3428 nfft_uo(ths,j,&u,&o,0);
3429 fg_psij0 = (PHI(ths->
x[2*j]-((R)u)/n0,0));
3430 fg_psij1 = EXP(K(2.0)*(n0*(ths->
x[2*j]) - u)/ths->
b[0]);
3432 psij_const[0] = fg_psij0;
3433 for(l=1; l<=2*m+1; l++)
3435 fg_psij2 *= fg_psij1;
3436 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3439 nfft_uo(ths,j,&u,&o,1);
3440 fg_psij0 = (PHI(ths->
x[2*j+1]-((R)u)/n1,1));
3441 fg_psij1 = EXP(K(2.0)*(n1*(ths->
x[2*j+1]) - u)/ths->
b[1]);
3443 psij_const[2*m+2] = fg_psij0;
3444 for(l=1; l<=2*m+1; l++)
3446 fg_psij2 *= fg_psij1;
3447 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3451 nfft_adjoint_2d_compute_omp_atomic(ths->
f[j], g, psij_const, psij_const+2*m+2, ths->
x+2*j, ths->
x+2*j+1, n0, n1, m);
3453 nfft_adjoint_2d_compute_serial(ths->
f+j, g, psij_const, psij_const+2*m+2, ths->
x+2*j, ths->
x+2*j+1, n0, n1, m);
3462 const int K = ths->
K;
3463 const int ip_s = K / (m + 2);
3465 nfft_sort_nodes(ths);
3468 MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
3471 #pragma openmp parallel for default(shared) private(k)
3472 for (k = 0; k < M; k++)
3478 R psij_const[2*(2*m+2)];
3480 nfft_uo(ths,j,&u,&o,0);
3481 ip_y = FABS(n0*(ths->
x[2*j]) - u)*((R)ip_s);
3482 ip_u = LRINT(FLOOR(ip_y));
3484 for(l=0; l < 2*m+2; l++)
3485 psij_const[l] = ths->
psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
3486 ths->
psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
3488 nfft_uo(ths,j,&u,&o,1);
3489 ip_y = FABS(n1*(ths->
x[2*j+1]) - u)*((R)ip_s);
3490 ip_u = LRINT(FLOOR(ip_y));
3492 for(l=0; l < 2*m+2; l++)
3493 psij_const[2*m+2+l] = ths->
psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
3494 ths->
psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
3497 nfft_adjoint_2d_compute_omp_atomic(ths->
f[j], g, psij_const, psij_const+2*m+2, ths->
x+2*j, ths->
x+2*j+1, n0, n1, m);
3499 nfft_adjoint_2d_compute_serial(ths->
f+j, g, psij_const, psij_const+2*m+2, ths->
x+2*j, ths->
x+2*j+1, n0, n1, m);
3506 nfft_sort_nodes(ths);
3509 MACRO_nfft_adjoint_2d_B_OMP_BLOCKWISE(NO_PSI)
3512 #pragma omp parallel for default(shared) private(k)
3513 for (k = 0; k < M; k++)
3516 R psij_const[2*(2*m+2)];
3519 nfft_uo(ths,j,&u,&o,0);
3520 for(l=0;l<=2*m+1;l++)
3521 psij_const[l]=(PHI(ths->
x[2*j]-((R)((u+l)))/n0,0));
3523 nfft_uo(ths,j,&u,&o,1);
3524 for(l=0;l<=2*m+1;l++)
3525 psij_const[2*m+2+l]=(PHI(ths->
x[2*j+1]-((R)((u+l)))/n1,1));
3528 nfft_adjoint_2d_compute_omp_atomic(ths->
f[j], g, psij_const, psij_const+2*m+2, ths->
x+2*j, ths->
x+2*j+1, n0, n1, m);
3530 nfft_adjoint_2d_compute_serial(ths->
f+j, g, psij_const, psij_const+2*m+2, ths->
x+2*j, ths->
x+2*j+1, n0, n1, m);
3538 int k0,k1,n0,n1,N0,N1;
3540 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
3541 R ck01, ck02, ck11, ck12;
3542 C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
3552 f_hat=(C*)ths->
f_hat;
3553 g_hat=(C*)ths->
g_hat;
3557 #pragma omp parallel for default(shared) private(k0)
3558 for (k0 = 0; k0 < ths->
n_total; k0++)
3559 ths->
g_hat[k0] = 0.0;
3568 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12)
3569 for(k0=0;k0<N0/2;k0++)
3571 ck01=c_phi_inv01[k0];
3572 ck02=c_phi_inv02[k0];
3577 g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
3578 f_hat11=f_hat + k0*N1;
3579 g_hat21=g_hat + k0*n1+n1-(N1/2);
3580 f_hat21=f_hat + ((N0/2)+k0)*N1;
3581 g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
3582 f_hat12=f_hat + k0*N1+(N1/2);
3583 g_hat22=g_hat + k0*n1;
3584 f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
3586 for(k1=0;k1<N1/2;k1++)
3588 ck11=c_phi_inv11[k1];
3589 ck12=c_phi_inv12[k1];
3591 g_hat11[k1] = f_hat11[k1] * ck01 * ck11;
3592 g_hat21[k1] = f_hat21[k1] * ck02 * ck11;
3593 g_hat12[k1] = f_hat12[k1] * ck01 * ck12;
3594 g_hat22[k1] = f_hat22[k1] * ck02 * ck12;
3599 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12)
3600 for(k0=0;k0<N0/2;k0++)
3602 ck01=K(1.0)/(PHI_HUT(k0-N0/2,0));
3603 ck02=K(1.0)/(PHI_HUT(k0,0));
3604 for(k1=0;k1<N1/2;k1++)
3606 ck11=K(1.0)/(PHI_HUT(k1-N1/2,1));
3607 ck12=K(1.0)/(PHI_HUT(k1,1));
3608 g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] = f_hat[k0*N1+k1] * ck01 * ck11;
3609 g_hat[k0*n1+n1-N1/2+k1] = f_hat[(N0/2+k0)*N1+k1] * ck02 * ck11;
3610 g_hat[(n0-N0/2+k0)*n1+k1] = f_hat[k0*N1+N1/2+k1] * ck01 * ck12;
3611 g_hat[k0*n1+k1] = f_hat[(N0/2+k0)*N1+N1/2+k1] * ck02 * ck12;
3618 fftw_execute(ths->my_fftw_plan1);
3622 nfft_trafo_2d_B(ths);
3628 int k0,k1,n0,n1,N0,N1;
3630 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
3631 R ck01, ck02, ck11, ck12;
3632 C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
3642 f_hat=(C*)ths->f_hat;
3643 g_hat=(C*)ths->g_hat;
3646 nfft_adjoint_2d_B(ths);
3650 fftw_execute(ths->my_fftw_plan2);
3654 if(ths->nfft_flags & PRE_PHI_HUT)
3656 c_phi_inv01=ths->c_phi_inv[0];
3657 c_phi_inv02=&ths->c_phi_inv[0][N0/2];
3659 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12)
3660 for(k0=0;k0<N0/2;k0++)
3662 ck01=c_phi_inv01[k0];
3663 ck02=c_phi_inv02[k0];
3665 c_phi_inv11=ths->c_phi_inv[1];
3666 c_phi_inv12=&ths->c_phi_inv[1][N1/2];
3668 g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
3669 f_hat11=f_hat + k0*N1;
3670 g_hat21=g_hat + k0*n1+n1-(N1/2);
3671 f_hat21=f_hat + ((N0/2)+k0)*N1;
3672 g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
3673 f_hat12=f_hat + k0*N1+(N1/2);
3674 g_hat22=g_hat + k0*n1;
3675 f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
3677 for(k1=0;k1<N1/2;k1++)
3679 ck11=c_phi_inv11[k1];
3680 ck12=c_phi_inv12[k1];
3682 f_hat11[k1] = g_hat11[k1] * ck01 * ck11;
3683 f_hat21[k1] = g_hat21[k1] * ck02 * ck11;
3684 f_hat12[k1] = g_hat12[k1] * ck01 * ck12;
3685 f_hat22[k1] = g_hat22[k1] * ck02 * ck12;
3690 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12)
3691 for(k0=0;k0<N0/2;k0++)
3693 ck01=K(1.0)/(PHI_HUT(k0-N0/2,0));
3694 ck02=K(1.0)/(PHI_HUT(k0,0));
3695 for(k1=0;k1<N1/2;k1++)
3697 ck11=K(1.0)/(PHI_HUT(k1-N1/2,1));
3698 ck12=K(1.0)/(PHI_HUT(k1,1));
3699 f_hat[k0*N1+k1] = g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] * ck01 * ck11;
3700 f_hat[(N0/2+k0)*N1+k1] = g_hat[k0*n1+n1-N1/2+k1] * ck02 * ck11;
3701 f_hat[k0*N1+N1/2+k1] = g_hat[(n0-N0/2+k0)*n1+k1] * ck01 * ck12;
3702 f_hat[(N0/2+k0)*N1+N1/2+k1] = g_hat[k0*n1+k1] * ck02 * ck12;
3710 static void nfft_3d_init_fg_exp_l(R *fg_exp_l,
const int m,
const R b)
3713 R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
3715 fg_exp_b0 = EXP(-1.0/b);
3716 fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
3719 fg_exp_l[0] = K(1.0);
3720 for(l=1; l <= 2*m+1; l++)
3722 fg_exp_b2 = fg_exp_b1*fg_exp_b0;
3723 fg_exp_b1 *= fg_exp_b0_sq;
3724 fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
3728 static void nfft_trafo_3d_compute(C *fj,
const C *g,
3729 const R *psij_const0,
const R *psij_const1,
const R *psij_const2,
3730 const R *xj0,
const R *xj1,
const R *xj2,
3731 const int n0,
const int n1,
const int n2,
const int m)
3733 int u0,o0,l0,u1,o1,l1,u2,o2,l2;
3735 const R *psij0,*psij1,*psij2;
3741 nfft_uo2(&u0,&o0,*xj0, n0, m);
3742 nfft_uo2(&u1,&o1,*xj1, n1, m);
3743 nfft_uo2(&u2,&o2,*xj2, n2, m);
3750 for(l0=0; l0<=2*m+1; l0++,psij0++)
3753 for(l1=0; l1<=2*m+1; l1++,psij1++)
3756 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
3757 for(l2=0; l2<=2*m+1; l2++)
3758 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3762 for(l0=0; l0<=2*m+1; l0++,psij0++)
3765 for(l1=0; l1<=2*m+1; l1++,psij1++)
3768 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
3769 for(l2=0; l2<2*m+1-o2; l2++)
3770 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3771 gj=g+((u0+l0)*n1+(u1+l1))*n2;
3772 for(l2=0; l2<=o2; l2++)
3773 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3778 for(l0=0; l0<=2*m+1; l0++,psij0++)
3781 for(l1=0; l1<2*m+1-o1; l1++,psij1++)
3784 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
3785 for(l2=0; l2<=2*m+1; l2++)
3786 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3788 for(l1=0; l1<=o1; l1++,psij1++)
3791 gj=g+((u0+l0)*n1+l1)*n2+u2;
3792 for(l2=0; l2<=2*m+1; l2++)
3793 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3798 for(l0=0; l0<=2*m+1; l0++,psij0++)
3801 for(l1=0; l1<2*m+1-o1; l1++,psij1++)
3804 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
3805 for(l2=0; l2<2*m+1-o2; l2++)
3806 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3807 gj=g+((u0+l0)*n1+(u1+l1))*n2;
3808 for(l2=0; l2<=o2; l2++)
3809 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3811 for(l1=0; l1<=o1; l1++,psij1++)
3814 gj=g+((u0+l0)*n1+l1)*n2+u2;
3815 for(l2=0; l2<2*m+1-o2; l2++)
3816 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3817 gj=g+((u0+l0)*n1+l1)*n2;
3818 for(l2=0; l2<=o2; l2++)
3819 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3827 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
3830 for(l1=0; l1<=2*m+1; l1++,psij1++)
3833 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
3834 for(l2=0; l2<=2*m+1; l2++)
3835 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3839 for(l0=0; l0<=o0; l0++,psij0++)
3842 for(l1=0; l1<=2*m+1; l1++,psij1++)
3845 gj=g+(l0*n1+(u1+l1))*n2+u2;
3846 for(l2=0; l2<=2*m+1; l2++)
3847 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3853 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
3856 for(l1=0; l1<=2*m+1; l1++,psij1++)
3859 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
3860 for(l2=0; l2<2*m+1-o2; l2++)
3861 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3862 gj=g+((u0+l0)*n1+(u1+l1))*n2;
3863 for(l2=0; l2<=o2; l2++)
3864 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3868 for(l0=0; l0<=o0; l0++,psij0++)
3871 for(l1=0; l1<=2*m+1; l1++,psij1++)
3874 gj=g+(l0*n1+(u1+l1))*n2+u2;
3875 for(l2=0; l2<2*m+1-o2; l2++)
3876 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3877 gj=g+(l0*n1+(u1+l1))*n2;
3878 for(l2=0; l2<=o2; l2++)
3879 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3886 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
3889 for(l1=0; l1<2*m+1-o1; l1++,psij1++)
3892 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
3893 for(l2=0; l2<=2*m+1; l2++)
3894 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3896 for(l1=0; l1<=o1; l1++,psij1++)
3899 gj=g+((u0+l0)*n1+l1)*n2+u2;
3900 for(l2=0; l2<=2*m+1; l2++)
3901 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3904 for(l0=0; l0<=o0; l0++,psij0++)
3907 for(l1=0; l1<2*m+1-o1; l1++,psij1++)
3910 gj=g+(l0*n1+(u1+l1))*n2+u2;
3911 for(l2=0; l2<=2*m+1; l2++)
3912 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3914 for(l1=0; l1<=o1; l1++,psij1++)
3917 gj=g+(l0*n1+l1)*n2+u2;
3918 for(l2=0; l2<=2*m+1; l2++)
3919 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3925 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
3928 for(l1=0; l1<2*m+1-o1; l1++,psij1++)
3931 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
3932 for(l2=0; l2<2*m+1-o2; l2++)
3933 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3934 gj=g+((u0+l0)*n1+(u1+l1))*n2;
3935 for(l2=0; l2<=o2; l2++)
3936 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3938 for(l1=0; l1<=o1; l1++,psij1++)
3941 gj=g+((u0+l0)*n1+l1)*n2+u2;
3942 for(l2=0; l2<2*m+1-o2; l2++)
3943 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3944 gj=g+((u0+l0)*n1+l1)*n2;
3945 for(l2=0; l2<=o2; l2++)
3946 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3950 for(l0=0; l0<=o0; l0++,psij0++)
3953 for(l1=0; l1<2*m+1-o1; l1++,psij1++)
3956 gj=g+(l0*n1+(u1+l1))*n2+u2;
3957 for(l2=0; l2<2*m+1-o2; l2++)
3958 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3959 gj=g+(l0*n1+(u1+l1))*n2;
3960 for(l2=0; l2<=o2; l2++)
3961 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3963 for(l1=0; l1<=o1; l1++,psij1++)
3966 gj=g+(l0*n1+l1)*n2+u2;
3967 for(l2=0; l2<2*m+1-o2; l2++)
3968 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3970 for(l2=0; l2<=o2; l2++)
3971 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3999 static void nfft_adjoint_3d_compute_omp_blockwise(
const C f, C *g,
4000 const R *psij_const0,
const R *psij_const1,
const R *psij_const2,
4001 const R *xj0,
const R *xj1,
const R *xj2,
4002 const int n0,
const int n1,
const int n2,
const int m,
4003 const int my_u0,
const int my_o0)
4005 int ar_u0,ar_o0,l0,u1,o1,l1,u2,o2,l2;
4006 const int lprod = (2*m+2) * (2*m+2) * (2*m+2);
4008 unsigned long int index_temp1[2*m+2];
4009 unsigned long int index_temp2[2*m+2];
4011 nfft_uo2(&ar_u0,&ar_o0,*xj0, n0, m);
4012 nfft_uo2(&u1,&o1,*xj1, n1, m);
4013 nfft_uo2(&u2,&o2,*xj2, n2, m);
4015 for (l1=0; l1<=2*m+1; l1++)
4016 index_temp1[l1] = (u1+l1)%n1;
4018 for (l2=0; l2<=2*m+1; l2++)
4019 index_temp2[l2] = (u2+l2)%n2;
4023 int u0 = MAX(my_u0,ar_u0);
4024 int o0 = MIN(my_o0,ar_o0);
4025 int offset_psij = u0-ar_u0;
4027 assert(offset_psij >= 0);
4028 assert(o0-u0 <= 2*m+1);
4029 assert(offset_psij+o0-u0 <= 2*m+1);
4032 for (l0 = 0; l0 <= o0-u0; l0++)
4034 const unsigned long int i0 = (u0+l0) * n1;
4035 const C val0 = psij_const0[offset_psij+l0];
4037 for(l1=0; l1<=2*m+1; l1++)
4039 const unsigned long int i1 = (i0 + index_temp1[l1]) * n2;
4040 const C val1 = psij_const1[l1];
4042 for(l2=0; l2<=2*m+1; l2++)
4043 g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4049 int u0 = MAX(my_u0,ar_u0);
4051 int offset_psij = u0-ar_u0;
4053 assert(offset_psij >= 0);
4054 assert(o0-u0 <= 2*m+1);
4055 assert(offset_psij+o0-u0 <= 2*m+1);
4058 for (l0 = 0; l0 <= o0-u0; l0++)
4060 unsigned long int i0 = (u0+l0) * n1;
4061 const C val0 = psij_const0[offset_psij+l0];
4063 for(l1=0; l1<=2*m+1; l1++)
4065 const unsigned long int i1 = (i0 + index_temp1[l1]) * n2;
4066 const C val1 = psij_const1[l1];
4068 for(l2=0; l2<=2*m+1; l2++)
4069 g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4074 o0 = MIN(my_o0,ar_o0);
4075 offset_psij += my_u0-ar_u0+n0;
4080 assert(o0-u0 <= 2*m+1);
4081 assert(offset_psij+o0-u0 <= 2*m+1);
4084 for (l0 = 0; l0 <= o0-u0; l0++)
4086 unsigned long int i0 = (u0+l0) * n1;
4087 const C val0 = psij_const0[offset_psij+l0];
4089 for(l1=0; l1<=2*m+1; l1++)
4091 const unsigned long int i1 = (i0 + index_temp1[l1]) * n2;
4092 const C val1 = psij_const1[l1];
4094 for(l2=0; l2<=2*m+1; l2++)
4095 g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4104 static void nfft_adjoint_3d_compute_omp_atomic(
const C f, C *g,
4105 const R *psij_const0,
const R *psij_const1,
const R *psij_const2,
4106 const R *xj0,
const R *xj1,
const R *xj2,
4107 const int n0,
const int n1,
const int n2,
const int m)
4109 int u0,o0,l0,u1,o1,l1,u2,o2,l2;
4110 const int lprod = (2*m+2) * (2*m+2) * (2*m+2);
4112 unsigned long int index_temp0[2*m+2];
4113 unsigned long int index_temp1[2*m+2];
4114 unsigned long int index_temp2[2*m+2];
4116 nfft_uo2(&u0,&o0,*xj0, n0, m);
4117 nfft_uo2(&u1,&o1,*xj1, n1, m);
4118 nfft_uo2(&u2,&o2,*xj2, n2, m);
4120 for (l0=0; l0<=2*m+1; l0++)
4121 index_temp0[l0] = (u0+l0)%n0;
4123 for (l1=0; l1<=2*m+1; l1++)
4124 index_temp1[l1] = (u1+l1)%n1;
4126 for (l2=0; l2<=2*m+1; l2++)
4127 index_temp2[l2] = (u2+l2)%n2;
4129 for(l0=0; l0<=2*m+1; l0++)
4131 for(l1=0; l1<=2*m+1; l1++)
4133 for(l2=0; l2<=2*m+1; l2++)
4135 unsigned long int i = (index_temp0[l0] * n1 + index_temp1[l1]) * n2 + index_temp2[l2];
4137 R *lhs_real = (R*)lhs;
4138 C val = psij_const0[l0] * psij_const1[l1] * psij_const2[l2] * f;
4141 lhs_real[0] += creal(val);
4144 lhs_real[1] += cimag(val);
4152 static void nfft_adjoint_3d_compute_serial(
const C *fj, C *g,
4153 const R *psij_const0,
const R *psij_const1,
const R *psij_const2,
4154 const R *xj0,
const R *xj1,
const R *xj2,
4155 const int n0,
const int n1,
const int n2,
const int m)
4157 int u0,o0,l0,u1,o1,l1,u2,o2,l2;
4159 const R *psij0,*psij1,*psij2;
4165 nfft_uo2(&u0,&o0,*xj0, n0, m);
4166 nfft_uo2(&u1,&o1,*xj1, n1, m);
4167 nfft_uo2(&u2,&o2,*xj2, n2, m);
4172 for(l0=0; l0<=2*m+1; l0++,psij0++)
4175 for(l1=0; l1<=2*m+1; l1++,psij1++)
4178 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
4179 for(l2=0; l2<=2*m+1; l2++)
4180 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4184 for(l0=0; l0<=2*m+1; l0++,psij0++)
4187 for(l1=0; l1<=2*m+1; l1++,psij1++)
4190 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
4191 for(l2=0; l2<2*m+1-o2; l2++)
4192 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4193 gj=g+((u0+l0)*n1+(u1+l1))*n2;
4194 for(l2=0; l2<=o2; l2++)
4195 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4200 for(l0=0; l0<=2*m+1; l0++,psij0++)
4203 for(l1=0; l1<2*m+1-o1; l1++,psij1++)
4206 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
4207 for(l2=0; l2<=2*m+1; l2++)
4208 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4210 for(l1=0; l1<=o1; l1++,psij1++)
4213 gj=g+((u0+l0)*n1+l1)*n2+u2;
4214 for(l2=0; l2<=2*m+1; l2++)
4215 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4220 for(l0=0; l0<=2*m+1; l0++,psij0++)
4223 for(l1=0; l1<2*m+1-o1; l1++,psij1++)
4226 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
4227 for(l2=0; l2<2*m+1-o2; l2++)
4228 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4229 gj=g+((u0+l0)*n1+(u1+l1))*n2;
4230 for(l2=0; l2<=o2; l2++)
4231 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4233 for(l1=0; l1<=o1; l1++,psij1++)
4236 gj=g+((u0+l0)*n1+l1)*n2+u2;
4237 for(l2=0; l2<2*m+1-o2; l2++)
4238 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4239 gj=g+((u0+l0)*n1+l1)*n2;
4240 for(l2=0; l2<=o2; l2++)
4241 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4249 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
4252 for(l1=0; l1<=2*m+1; l1++,psij1++)
4255 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
4256 for(l2=0; l2<=2*m+1; l2++)
4257 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4261 for(l0=0; l0<=o0; l0++,psij0++)
4264 for(l1=0; l1<=2*m+1; l1++,psij1++)
4267 gj=g+(l0*n1+(u1+l1))*n2+u2;
4268 for(l2=0; l2<=2*m+1; l2++)
4269 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4275 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
4278 for(l1=0; l1<=2*m+1; l1++,psij1++)
4281 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
4282 for(l2=0; l2<2*m+1-o2; l2++)
4283 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4284 gj=g+((u0+l0)*n1+(u1+l1))*n2;
4285 for(l2=0; l2<=o2; l2++)
4286 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4290 for(l0=0; l0<=o0; l0++,psij0++)
4293 for(l1=0; l1<=2*m+1; l1++,psij1++)
4296 gj=g+(l0*n1+(u1+l1))*n2+u2;
4297 for(l2=0; l2<2*m+1-o2; l2++)
4298 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4299 gj=g+(l0*n1+(u1+l1))*n2;
4300 for(l2=0; l2<=o2; l2++)
4301 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4308 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
4311 for(l1=0; l1<2*m+1-o1; l1++,psij1++)
4314 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
4315 for(l2=0; l2<=2*m+1; l2++)
4316 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4318 for(l1=0; l1<=o1; l1++,psij1++)
4321 gj=g+((u0+l0)*n1+l1)*n2+u2;
4322 for(l2=0; l2<=2*m+1; l2++)
4323 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4326 for(l0=0; l0<=o0; l0++,psij0++)
4329 for(l1=0; l1<2*m+1-o1; l1++,psij1++)
4332 gj=g+(l0*n1+(u1+l1))*n2+u2;
4333 for(l2=0; l2<=2*m+1; l2++)
4334 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4336 for(l1=0; l1<=o1; l1++,psij1++)
4339 gj=g+(l0*n1+l1)*n2+u2;
4340 for(l2=0; l2<=2*m+1; l2++)
4341 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4347 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
4350 for(l1=0; l1<2*m+1-o1; l1++,psij1++)
4353 gj=g+((u0+l0)*n1+(u1+l1))*n2+u2;
4354 for(l2=0; l2<2*m+1-o2; l2++)
4355 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4356 gj=g+((u0+l0)*n1+(u1+l1))*n2;
4357 for(l2=0; l2<=o2; l2++)
4358 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4360 for(l1=0; l1<=o1; l1++,psij1++)
4363 gj=g+((u0+l0)*n1+l1)*n2+u2;
4364 for(l2=0; l2<2*m+1-o2; l2++)
4365 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4366 gj=g+((u0+l0)*n1+l1)*n2;
4367 for(l2=0; l2<=o2; l2++)
4368 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4372 for(l0=0; l0<=o0; l0++,psij0++)
4375 for(l1=0; l1<2*m+1-o1; l1++,psij1++)
4378 gj=g+(l0*n1+(u1+l1))*n2+u2;
4379 for(l2=0; l2<2*m+1-o2; l2++)
4380 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4381 gj=g+(l0*n1+(u1+l1))*n2;
4382 for(l2=0; l2<=o2; l2++)
4383 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4385 for(l1=0; l1<=o1; l1++,psij1++)
4388 gj=g+(l0*n1+l1)*n2+u2;
4389 for(l2=0; l2<2*m+1-o2; l2++)
4390 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4392 for(l2=0; l2<=o2; l2++)
4393 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4400 static void nfft_trafo_3d_B(
nfft_plan *ths)
4402 const int N0 = ths->
N[0];
4403 const int n0 = ths->
n[0];
4404 const int N1 = ths->
N[1];
4405 const int n1 = ths->
n[1];
4406 const int N2 = ths->
N[2];
4407 const int n2 = ths->
n[2];
4409 const int m = ths->
m;
4411 const C* g = (C*) ths->
g;
4417 const int lprod = (2*m+2) * (2*m+2) * (2*m+2);
4418 #pragma omp parallel for default(shared) private(k)
4419 for (k = 0; k < M; k++)
4424 for (l = 0; l < lprod; l++)
4432 #pragma omp parallel for default(shared) private(k)
4433 for (k = 0; k < M; k++)
4436 nfft_trafo_3d_compute(ths->
f+j, g, ths->
psi+j*3*(2*m+2), ths->
psi+(j*3+1)*(2*m+2), ths->
psi+(j*3+2)*(2*m+2), ths->
x+3*j, ths->
x+3*j+1, ths->
x+3*j+2, n0, n1, n2, m);
4443 R fg_exp_l[3*(2*m+2)];
4445 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->
b[0]);
4446 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->
b[1]);
4447 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->
b[2]);
4449 #pragma omp parallel for default(shared) private(k)
4450 for (k = 0; k < M; k++)
4454 R psij_const[3*(2*m+2)];
4455 R fg_psij0 = ths->
psi[2*j*3];
4456 R fg_psij1 = ths->
psi[2*j*3+1];
4457 R fg_psij2 = K(1.0);
4459 psij_const[0] = fg_psij0;
4460 for(l=1; l<=2*m+1; l++)
4462 fg_psij2 *= fg_psij1;
4463 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4466 fg_psij0 = ths->
psi[2*(j*3+1)];
4467 fg_psij1 = ths->
psi[2*(j*3+1)+1];
4469 psij_const[2*m+2] = fg_psij0;
4470 for(l=1; l<=2*m+1; l++)
4472 fg_psij2 *= fg_psij1;
4473 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4476 fg_psij0 = ths->
psi[2*(j*3+2)];
4477 fg_psij1 = ths->
psi[2*(j*3+2)+1];
4479 psij_const[2*(2*m+2)] = fg_psij0;
4480 for(l=1; l<=2*m+1; l++)
4482 fg_psij2 *= fg_psij1;
4483 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4486 nfft_trafo_3d_compute(ths->
f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->
x+3*j, ths->
x+3*j+1, ths->
x+3*j+2, n0, n1, n2, m);
4494 R fg_exp_l[3*(2*m+2)];
4496 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->
b[0]);
4497 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->
b[1]);
4498 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->
b[2]);
4500 nfft_sort_nodes(ths);
4502 #pragma omp parallel for default(shared) private(k)
4503 for (k = 0; k < M; k++)
4507 R psij_const[3*(2*m+2)];
4508 R fg_psij0, fg_psij1, fg_psij2;
4510 nfft_uo(ths,j,&u,&o,0);
4511 fg_psij0 = (PHI(ths->
x[3*j]-((R)u)/n0,0));
4512 fg_psij1 = EXP(K(2.0)*(n0*(ths->
x[3*j]) - u)/ths->
b[0]);
4514 psij_const[0] = fg_psij0;
4515 for(l=1; l<=2*m+1; l++)
4517 fg_psij2 *= fg_psij1;
4518 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4521 nfft_uo(ths,j,&u,&o,1);
4522 fg_psij0 = (PHI(ths->
x[3*j+1]-((R)u)/n1,1));
4523 fg_psij1 = EXP(K(2.0)*(n1*(ths->
x[3*j+1]) - u)/ths->
b[1]);
4525 psij_const[2*m+2] = fg_psij0;
4526 for(l=1; l<=2*m+1; l++)
4528 fg_psij2 *= fg_psij1;
4529 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4532 nfft_uo(ths,j,&u,&o,2);
4533 fg_psij0 = (PHI(ths->
x[3*j+2]-((R)u)/n2,2));
4534 fg_psij1 = EXP(K(2.0)*(n2*(ths->
x[3*j+2]) - u)/ths->
b[2]);
4536 psij_const[2*(2*m+2)] = fg_psij0;
4537 for(l=1; l<=2*m+1; l++)
4539 fg_psij2 *= fg_psij1;
4540 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4543 nfft_trafo_3d_compute(ths->
f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->
x+3*j, ths->
x+3*j+1, ths->
x+3*j+2, n0, n1, n2, m);
4551 const int K = ths->
K, ip_s = K / (m + 2);
4553 nfft_sort_nodes(ths);
4555 #pragma omp parallel for default(shared) private(k)
4556 for (k = 0; k < M; k++)
4561 R psij_const[3*(2*m+2)];
4564 nfft_uo(ths,j,&u,&o,0);
4565 ip_y = FABS(n0*ths->
x[3*j+0] - u)*((R)ip_s);
4566 ip_u = LRINT(FLOOR(ip_y));
4568 for(l=0; l < 2*m+2; l++)
4569 psij_const[l] = ths->
psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4570 ths->
psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
4572 nfft_uo(ths,j,&u,&o,1);
4573 ip_y = FABS(n1*ths->
x[3*j+1] - u)*((R)ip_s);
4574 ip_u = LRINT(FLOOR(ip_y));
4576 for(l=0; l < 2*m+2; l++)
4577 psij_const[2*m+2+l] = ths->
psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4578 ths->
psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
4580 nfft_uo(ths,j,&u,&o,2);
4581 ip_y = FABS(n2*ths->
x[3*j+2] - u)*((R)ip_s);
4582 ip_u = LRINT(FLOOR(ip_y));
4584 for(l=0; l < 2*m+2; l++)
4585 psij_const[2*(2*m+2)+l] = ths->
psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4586 ths->
psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
4588 nfft_trafo_3d_compute(ths->
f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->
x+3*j, ths->
x+3*j+1, ths->
x+3*j+2, n0, n1, n2, m);
4595 nfft_sort_nodes(ths);
4597 #pragma omp parallel for default(shared) private(k)
4598 for (k = 0; k < M; k++)
4600 R psij_const[3*(2*m+2)];
4604 nfft_uo(ths,j,&u,&o,0);
4605 for(l=0;l<=2*m+1;l++)
4606 psij_const[l]=(PHI(ths->
x[3*j]-((R)((u+l)))/n0,0));
4608 nfft_uo(ths,j,&u,&o,1);
4609 for(l=0;l<=2*m+1;l++)
4610 psij_const[2*m+2+l]=(PHI(ths->
x[3*j+1]-((R)((u+l)))/n1,1));
4612 nfft_uo(ths,j,&u,&o,2);
4613 for(l=0;l<=2*m+1;l++)
4614 psij_const[2*(2*m+2)+l]=(PHI(ths->
x[3*j+2]-((R)((u+l)))/n2,2));
4616 nfft_trafo_3d_compute(ths->
f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->
x+3*j, ths->
x+3*j+1, ths->
x+3*j+2, n0, n1, n2, m);
4621 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A \
4623 assert(ar_x[2*k] >= min_u_a || k == M-1); \
4625 assert(ar_x[2*k-2] < min_u_a); \
4628 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A
4632 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B \
4634 assert(ar_x[2*k] >= min_u_b || k == M-1); \
4636 assert(ar_x[2*k-2] < min_u_b); \
4639 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B
4642 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
4643 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4644 ths->psi+j*3*(2*m+2), \
4645 ths->psi+(j*3+1)*(2*m+2), \
4646 ths->psi+(j*3+2)*(2*m+2), \
4647 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4648 n0, n1, n2, m, my_u0, my_o0);
4650 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
4653 R psij_const[3*(2*m+2)]; \
4654 R fg_psij0 = ths->psi[2*j*3]; \
4655 R fg_psij1 = ths->psi[2*j*3+1]; \
4656 R fg_psij2 = K(1.0); \
4658 psij_const[0] = fg_psij0; \
4659 for(l=1; l<=2*m+1; l++) \
4661 fg_psij2 *= fg_psij1; \
4662 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
4665 fg_psij0 = ths->psi[2*(j*3+1)]; \
4666 fg_psij1 = ths->psi[2*(j*3+1)+1]; \
4667 fg_psij2 = K(1.0); \
4668 psij_const[2*m+2] = fg_psij0; \
4669 for(l=1; l<=2*m+1; l++) \
4671 fg_psij2 *= fg_psij1; \
4672 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
4675 fg_psij0 = ths->psi[2*(j*3+2)]; \
4676 fg_psij1 = ths->psi[2*(j*3+2)+1]; \
4677 fg_psij2 = K(1.0); \
4678 psij_const[2*(2*m+2)] = fg_psij0; \
4679 for(l=1; l<=2*m+1; l++) \
4681 fg_psij2 *= fg_psij1; \
4682 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \
4685 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4686 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4687 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4688 n0, n1, n2, m, my_u0, my_o0); \
4691 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
4694 R psij_const[3*(2*m+2)]; \
4695 R fg_psij0, fg_psij1, fg_psij2; \
4697 nfft_uo(ths,j,&u,&o,0); \
4698 fg_psij0 = (PHI(ths->x[3*j]-((R)u)/n0,0)); \
4699 fg_psij1 = EXP(K(2.0)*(n0*(ths->x[3*j]) - u)/ths->b[0]); \
4700 fg_psij2 = K(1.0); \
4701 psij_const[0] = fg_psij0; \
4702 for(l=1; l<=2*m+1; l++) \
4704 fg_psij2 *= fg_psij1; \
4705 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
4708 nfft_uo(ths,j,&u,&o,1); \
4709 fg_psij0 = (PHI(ths->x[3*j+1]-((R)u)/n1,1)); \
4710 fg_psij1 = EXP(K(2.0)*(n1*(ths->x[3*j+1]) - u)/ths->b[1]); \
4711 fg_psij2 = K(1.0); \
4712 psij_const[2*m+2] = fg_psij0; \
4713 for(l=1; l<=2*m+1; l++) \
4715 fg_psij2 *= fg_psij1; \
4716 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
4719 nfft_uo(ths,j,&u,&o,2); \
4720 fg_psij0 = (PHI(ths->x[3*j+2]-((R)u)/n2,2)); \
4721 fg_psij1 = EXP(K(2.0)*(n2*(ths->x[3*j+2]) - u)/ths->b[2]); \
4722 fg_psij2 = K(1.0); \
4723 psij_const[2*(2*m+2)] = fg_psij0; \
4724 for(l=1; l<=2*m+1; l++) \
4726 fg_psij2 *= fg_psij1; \
4727 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \
4730 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4731 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4732 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4733 n0, n1, n2, m, my_u0, my_o0); \
4736 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
4739 R psij_const[3*(2*m+2)]; \
4743 nfft_uo(ths,j,&u,&o,0); \
4744 ip_y = FABS(n0*ths->x[3*j+0] - u)*((R)ip_s); \
4745 ip_u = LRINT(FLOOR(ip_y)); \
4747 for(l=0; l < 2*m+2; l++) \
4748 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
4749 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); \
4751 nfft_uo(ths,j,&u,&o,1); \
4752 ip_y = FABS(n1*ths->x[3*j+1] - u)*((R)ip_s); \
4753 ip_u = LRINT(FLOOR(ip_y)); \
4755 for(l=0; l < 2*m+2; l++) \
4756 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
4757 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
4759 nfft_uo(ths,j,&u,&o,2); \
4760 ip_y = FABS(n2*ths->x[3*j+2] - u)*((R)ip_s); \
4761 ip_u = LRINT(FLOOR(ip_y)); \
4763 for(l=0; l < 2*m+2; l++) \
4764 psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
4765 ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
4767 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4768 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4769 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4770 n0, n1, n2, m, my_u0, my_o0); \
4773 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
4776 R psij_const[3*(2*m+2)]; \
4778 nfft_uo(ths,j,&u,&o,0); \
4779 for(l=0;l<=2*m+1;l++) \
4780 psij_const[l]=(PHI(ths->x[3*j]-((R)((u+l)))/n0,0)); \
4782 nfft_uo(ths,j,&u,&o,1); \
4783 for(l=0;l<=2*m+1;l++) \
4784 psij_const[2*m+2+l]=(PHI(ths->x[3*j+1]-((R)((u+l)))/n1,1)); \
4786 nfft_uo(ths,j,&u,&o,2); \
4787 for(l=0;l<=2*m+1;l++) \
4788 psij_const[2*(2*m+2)+l]=(PHI(ths->x[3*j+2]-((R)((u+l)))/n2,2)); \
4790 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4791 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4792 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4793 n0, n1, n2, m, my_u0, my_o0); \
4796 #define MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE(whichone) \
4798 if (ths->nfft_flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
4800 _Pragma("omp parallel private(k)") \
4802 int my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
4803 int *ar_x = ths->index_x; \
4805 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
4806 &min_u_b, &max_u_b, 3, ths->n, m); \
4808 if (min_u_a != -1) \
4810 k = index_x_binary_search(ar_x, M, min_u_a); \
4812 MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A \
4816 int u_prod = ar_x[2*k]; \
4817 int j = ar_x[2*k+1]; \
4819 if (u_prod < min_u_a || u_prod > max_u_a) \
4822 MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
4828 if (min_u_b != -1) \
4830 int k = index_x_binary_search(ar_x, M, min_u_b); \
4832 MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B \
4836 int u_prod = ar_x[2*k]; \
4837 int j = ar_x[2*k+1]; \
4839 if (u_prod < min_u_b || u_prod > max_u_b) \
4842 MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
4852 static void nfft_adjoint_3d_B(
nfft_plan *ths)
4855 const int N0 = ths->
N[0];
4856 const int n0 = ths->
n[0];
4857 const int N1 = ths->
N[1];
4858 const int n1 = ths->
n[1];
4859 const int N2 = ths->
N[2];
4860 const int n2 = ths->
n[2];
4862 const int m = ths->
m;
4866 memset(g,0,ths->
n_total*
sizeof(C));
4870 nfft_adjoint_B_compute_full_psi(g, ths->
psi_index_g, ths->
psi, ths->
f, M,
4878 MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE(PRE_PSI)
4881 #pragma omp parallel for default(shared) private(k)
4882 for (k = 0; k < M; k++)
4886 nfft_adjoint_3d_compute_omp_atomic(ths->
f[j], g, ths->
psi+j*3*(2*m+2), ths->
psi+(j*3+1)*(2*m+2), ths->
psi+(j*3+2)*(2*m+2), ths->
x+3*j, ths->
x+3*j+1, ths->
x+3*j+2, n0, n1, n2, m);
4888 nfft_adjoint_3d_compute_serial(ths->
f+j, g, ths->
psi+j*3*(2*m+2), ths->
psi+(j*3+1)*(2*m+2), ths->
psi+(j*3+2)*(2*m+2), ths->
x+3*j, ths->
x+3*j+1, ths->
x+3*j+2, n0, n1, n2, m);
4896 R fg_exp_l[3*(2*m+2)];
4898 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->
b[0]);
4899 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->
b[1]);
4900 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->
b[2]);
4903 MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE(PRE_FG_PSI)
4906 #pragma omp parallel for default(shared) private(k)
4907 for (k = 0; k < M; k++)
4909 R psij_const[3*(2*m+2)];
4912 R fg_psij0 = ths->
psi[2*j*3];
4913 R fg_psij1 = ths->
psi[2*j*3+1];
4914 R fg_psij2 = K(1.0);
4916 psij_const[0] = fg_psij0;
4917 for(l=1; l<=2*m+1; l++)
4919 fg_psij2 *= fg_psij1;
4920 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4923 fg_psij0 = ths->
psi[2*(j*3+1)];
4924 fg_psij1 = ths->
psi[2*(j*3+1)+1];
4926 psij_const[2*m+2] = fg_psij0;
4927 for(l=1; l<=2*m+1; l++)
4929 fg_psij2 *= fg_psij1;
4930 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4933 fg_psij0 = ths->
psi[2*(j*3+2)];
4934 fg_psij1 = ths->
psi[2*(j*3+2)+1];
4936 psij_const[2*(2*m+2)] = fg_psij0;
4937 for(l=1; l<=2*m+1; l++)
4939 fg_psij2 *= fg_psij1;
4940 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4944 nfft_adjoint_3d_compute_omp_atomic(ths->
f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->
x+3*j, ths->
x+3*j+1, ths->
x+3*j+2, n0, n1, n2, m);
4946 nfft_adjoint_3d_compute_serial(ths->
f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->
x+3*j, ths->
x+3*j+1, ths->
x+3*j+2, n0, n1, n2, m);
4955 R fg_exp_l[3*(2*m+2)];
4957 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->
b[0]);
4958 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->
b[1]);
4959 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->
b[2]);
4961 nfft_sort_nodes(ths);
4964 MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE(FG_PSI)
4967 #pragma openmp parallel for default(shared) private(k)
4968 for (k = 0; k < M; k++)
4972 R psij_const[3*(2*m+2)];
4973 R fg_psij0, fg_psij1, fg_psij2;
4975 nfft_uo(ths,j,&u,&o,0);
4976 fg_psij0 = (PHI(ths->
x[3*j]-((R)u)/n0,0));
4977 fg_psij1 = EXP(K(2.0)*(n0*(ths->
x[3*j]) - u)/ths->
b[0]);
4979 psij_const[0] = fg_psij0;
4980 for(l=1; l<=2*m+1; l++)
4982 fg_psij2 *= fg_psij1;
4983 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4986 nfft_uo(ths,j,&u,&o,1);
4987 fg_psij0 = (PHI(ths->
x[3*j+1]-((R)u)/n1,1));
4988 fg_psij1 = EXP(K(2.0)*(n1*(ths->
x[3*j+1]) - u)/ths->
b[1]);
4990 psij_const[2*m+2] = fg_psij0;
4991 for(l=1; l<=2*m+1; l++)
4993 fg_psij2 *= fg_psij1;
4994 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4997 nfft_uo(ths,j,&u,&o,2);
4998 fg_psij0 = (PHI(ths->
x[3*j+2]-((R)u)/n2,2));
4999 fg_psij1 = EXP(K(2.0)*(n2*(ths->
x[3*j+2]) - u)/ths->
b[2]);
5001 psij_const[2*(2*m+2)] = fg_psij0;
5002 for(l=1; l<=2*m+1; l++)
5004 fg_psij2 *= fg_psij1;
5005 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
5009 nfft_adjoint_3d_compute_omp_atomic(ths->
f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->
x+3*j, ths->
x+3*j+1, ths->
x+3*j+2, n0, n1, n2, m);
5011 nfft_adjoint_3d_compute_serial(ths->
f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->
x+3*j, ths->
x+3*j+1, ths->
x+3*j+2, n0, n1, n2, m);
5020 const int K = ths->
K;
5021 const int ip_s = K / (m + 2);
5023 nfft_sort_nodes(ths);
5026 MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
5029 #pragma openmp parallel for default(shared) private(k)
5030 for (k = 0; k < M; k++)
5036 R psij_const[3*(2*m+2)];
5038 nfft_uo(ths,j,&u,&o,0);
5039 ip_y = FABS(n0*ths->
x[3*j+0] - u)*((R)ip_s);
5040 ip_u = LRINT(FLOOR(ip_y));
5042 for(l=0; l < 2*m+2; l++)
5043 psij_const[l] = ths->
psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5044 ths->
psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
5046 nfft_uo(ths,j,&u,&o,1);
5047 ip_y = FABS(n1*ths->
x[3*j+1] - u)*((R)ip_s);
5048 ip_u = LRINT(FLOOR(ip_y));
5050 for(l=0; l < 2*m+2; l++)
5051 psij_const[2*m+2+l] = ths->
psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5052 ths->
psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
5054 nfft_uo(ths,j,&u,&o,2);
5055 ip_y = FABS(n2*ths->
x[3*j+2] - u)*((R)ip_s);
5056 ip_u = LRINT(FLOOR(ip_y));
5058 for(l=0; l < 2*m+2; l++)
5059 psij_const[2*(2*m+2)+l] = ths->
psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5060 ths->
psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
5063 nfft_adjoint_3d_compute_omp_atomic(ths->
f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->
x+3*j, ths->
x+3*j+1, ths->
x+3*j+2, n0, n1, n2, m);
5065 nfft_adjoint_3d_compute_serial(ths->
f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->
x+3*j, ths->
x+3*j+1, ths->
x+3*j+2, n0, n1, n2, m);
5072 nfft_sort_nodes(ths);
5075 MACRO_nfft_adjoint_3d_B_OMP_BLOCKWISE(NO_PSI)
5078 #pragma omp parallel for default(shared) private(k)
5079 for (k = 0; k < M; k++)
5082 R psij_const[3*(2*m+2)];
5085 nfft_uo(ths,j,&u,&o,0);
5086 for(l=0;l<=2*m+1;l++)
5087 psij_const[l]=(PHI(ths->
x[3*j]-((R)((u+l)))/n0,0));
5089 nfft_uo(ths,j,&u,&o,1);
5090 for(l=0;l<=2*m+1;l++)
5091 psij_const[2*m+2+l]=(PHI(ths->
x[3*j+1]-((R)((u+l)))/n1,1));
5093 nfft_uo(ths,j,&u,&o,2);
5094 for(l=0;l<=2*m+1;l++)
5095 psij_const[2*(2*m+2)+l]=(PHI(ths->
x[3*j+2]-((R)((u+l)))/n2,2));
5098 nfft_adjoint_3d_compute_omp_atomic(ths->
f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->
x+3*j, ths->
x+3*j+1, ths->
x+3*j+2, n0, n1, n2, m);
5100 nfft_adjoint_3d_compute_serial(ths->
f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->
x+3*j, ths->
x+3*j+1, ths->
x+3*j+2, n0, n1, n2, m);
5108 int k0,k1,k2,n0,n1,n2,N0,N1,N2;
5110 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
5111 R ck01, ck02, ck11, ck12, ck21, ck22;
5112 C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
5113 C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
5125 f_hat=(C*)ths->
f_hat;
5126 g_hat=(C*)ths->
g_hat;
5130 #pragma omp parallel for default(shared) private(k0)
5131 for (k0 = 0; k0 < ths->
n_total; k0++)
5132 ths->
g_hat[k0] = 0.0;
5142 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22)
5143 for(k0=0;k0<N0/2;k0++)
5145 ck01=c_phi_inv01[k0];
5146 ck02=c_phi_inv02[k0];
5150 for(k1=0;k1<N1/2;k1++)
5152 ck11=c_phi_inv11[k1];
5153 ck12=c_phi_inv12[k1];
5157 g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5158 f_hat111=f_hat + (k0*N1+k1)*N2;
5159 g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5160 f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
5161 g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
5162 f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
5163 g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
5164 f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
5166 g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
5167 f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
5168 g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
5169 f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
5170 g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
5171 f_hat122=f_hat + (k0*N1+N1/2+k1)*N2+(N2/2);
5172 g_hat222=g_hat + (k0*n1+k1)*n2;
5173 f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
5175 for(k2=0;k2<N2/2;k2++)
5177 ck21=c_phi_inv21[k2];
5178 ck22=c_phi_inv22[k2];
5180 g_hat111[k2] = f_hat111[k2] * ck01 * ck11 * ck21;
5181 g_hat211[k2] = f_hat211[k2] * ck02 * ck11 * ck21;
5182 g_hat121[k2] = f_hat121[k2] * ck01 * ck12 * ck21;
5183 g_hat221[k2] = f_hat221[k2] * ck02 * ck12 * ck21;
5185 g_hat112[k2] = f_hat112[k2] * ck01 * ck11 * ck22;
5186 g_hat212[k2] = f_hat212[k2] * ck02 * ck11 * ck22;
5187 g_hat122[k2] = f_hat122[k2] * ck01 * ck12 * ck22;
5188 g_hat222[k2] = f_hat222[k2] * ck02 * ck12 * ck22;
5194 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22)
5195 for(k0=0;k0<N0/2;k0++)
5197 ck01=K(1.0)/(PHI_HUT(k0-N0/2,0));
5198 ck02=K(1.0)/(PHI_HUT(k0,0));
5199 for(k1=0;k1<N1/2;k1++)
5201 ck11=K(1.0)/(PHI_HUT(k1-N1/2,1));
5202 ck12=K(1.0)/(PHI_HUT(k1,1));
5204 for(k2=0;k2<N2/2;k2++)
5206 ck21=K(1.0)/(PHI_HUT(k2-N2/2,2));
5207 ck22=K(1.0)/(PHI_HUT(k2,2));
5209 g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+k1)*N2+k2] * ck01 * ck11 * ck21;
5210 g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[((N0/2+k0)*N1+k1)*N2+k2] * ck02 * ck11 * ck21;
5211 g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+N1/2+k1)*N2+k2] * ck01 * ck12 * ck21;
5212 g_hat[(k0*n1+k1)*n2+n2-N2/2+k2] = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2] * ck02 * ck12 * ck21;
5214 g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2] = f_hat[(k0*N1+k1)*N2+N2/2+k2] * ck01 * ck11 * ck22;
5215 g_hat[(k0*n1+n1-N1/2+k1)*n2+k2] = f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2] * ck02 * ck11 * ck22;
5216 g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2] = f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2] * ck01 * ck12 * ck22;
5217 g_hat[(k0*n1+k1)*n2+k2] = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] * ck02 * ck12 * ck22;
5225 fftw_execute(ths->my_fftw_plan1);
5229 nfft_trafo_3d_B(ths);
5235 int k0,k1,k2,n0,n1,n2,N0,N1,N2;
5237 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
5238 R ck01, ck02, ck11, ck12, ck21, ck22;
5239 C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
5240 C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
5252 f_hat=(C*)ths->f_hat;
5253 g_hat=(C*)ths->g_hat;
5256 nfft_adjoint_3d_B(ths);
5260 fftw_execute(ths->my_fftw_plan2);
5264 if(ths->nfft_flags & PRE_PHI_HUT)
5266 c_phi_inv01=ths->c_phi_inv[0];
5267 c_phi_inv02=&ths->c_phi_inv[0][N0/2];
5269 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22)
5270 for(k0=0;k0<N0/2;k0++)
5272 ck01=c_phi_inv01[k0];
5273 ck02=c_phi_inv02[k0];
5274 c_phi_inv11=ths->c_phi_inv[1];
5275 c_phi_inv12=&ths->c_phi_inv[1][N1/2];
5277 for(k1=0;k1<N1/2;k1++)
5279 ck11=c_phi_inv11[k1];
5280 ck12=c_phi_inv12[k1];
5281 c_phi_inv21=ths->c_phi_inv[2];
5282 c_phi_inv22=&ths->c_phi_inv[2][N2/2];
5284 g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5285 f_hat111=f_hat + (k0*N1+k1)*N2;
5286 g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5287 f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
5288 g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
5289 f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
5290 g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
5291 f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
5293 g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
5294 f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
5295 g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
5296 f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
5297 g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
5298 f_hat122=f_hat + (k0*N1+(N1/2)+k1)*N2+(N2/2);
5299 g_hat222=g_hat + (k0*n1+k1)*n2;
5300 f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
5302 for(k2=0;k2<N2/2;k2++)
5304 ck21=c_phi_inv21[k2];
5305 ck22=c_phi_inv22[k2];
5307 f_hat111[k2] = g_hat111[k2] * ck01 * ck11 * ck21;
5308 f_hat211[k2] = g_hat211[k2] * ck02 * ck11 * ck21;
5309 f_hat121[k2] = g_hat121[k2] * ck01 * ck12 * ck21;
5310 f_hat221[k2] = g_hat221[k2] * ck02 * ck12 * ck21;
5312 f_hat112[k2] = g_hat112[k2] * ck01 * ck11 * ck22;
5313 f_hat212[k2] = g_hat212[k2] * ck02 * ck11 * ck22;
5314 f_hat122[k2] = g_hat122[k2] * ck01 * ck12 * ck22;
5315 f_hat222[k2] = g_hat222[k2] * ck02 * ck12 * ck22;
5321 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22)
5322 for(k0=0;k0<N0/2;k0++)
5324 ck01=K(1.0)/(PHI_HUT(k0-N0/2,0));
5325 ck02=K(1.0)/(PHI_HUT(k0,0));
5326 for(k1=0;k1<N1/2;k1++)
5328 ck11=K(1.0)/(PHI_HUT(k1-N1/2,1));
5329 ck12=K(1.0)/(PHI_HUT(k1,1));
5331 for(k2=0;k2<N2/2;k2++)
5333 ck21=K(1.0)/(PHI_HUT(k2-N2/2,2));
5334 ck22=K(1.0)/(PHI_HUT(k2,2));
5336 f_hat[(k0*N1+k1)*N2+k2] = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck01 * ck11 * ck21;
5337 f_hat[((N0/2+k0)*N1+k1)*N2+k2] = g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck02 * ck11 * ck21;
5338 f_hat[(k0*N1+N1/2+k1)*N2+k2] = g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2] * ck01 * ck12 * ck21;
5339 f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2] = g_hat[(k0*n1+k1)*n2+n2-N2/2+k2] * ck02 * ck12 * ck21;
5341 f_hat[(k0*N1+k1)*N2+N2/2+k2] = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2] * ck01 * ck11 * ck22;
5342 f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2] = g_hat[(k0*n1+n1-N1/2+k1)*n2+k2] * ck02 * ck11 * ck22;
5343 f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2] * ck01 * ck12 * ck22;
5344 f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[(k0*n1+k1)*n2+k2] * ck02 * ck12 * ck22;
5358 case 1: nfft_trafo_1d(ths);
break;
5359 case 2: nfft_trafo_2d(ths);
break;
5360 case 3: nfft_trafo_3d(ths);
break;
5378 fftw_execute(ths->my_fftw_plan1);
5394 case 1: nfft_adjoint_1d(ths);
break;
5395 case 2: nfft_adjoint_2d(ths);
break;
5396 case 3: nfft_adjoint_3d(ths);
break;
5414 fftw_execute(ths->my_fftw_plan2);
5429 static
void nfft_precompute_phi_hut(
nfft_plan *ths)
5434 ths->c_phi_inv = (R**)
nfft_malloc(ths->d*
sizeof(R*));
5436 for(t=0; t<ths->d; t++)
5438 ths->c_phi_inv[t]= (R*)
nfft_malloc(ths->N[t]*
sizeof(R));
5439 for(ks[t]=0; ks[t]<ths->N[t]; ks[t]++)
5440 ths->c_phi_inv[t][ks[t]]= K(1.0)/(PHI_HUT(ks[t]-ths->N[t]/2,t));
5455 for (t=0; t<ths->
d; t++)
5457 step=((R)(ths->
m+2))/(((R)ths->
K)*ths->
n[t]);
5458 for(j=0;j<=ths->
K;j++)
5460 ths->
psi[(ths->
K+1)*t + j] = PHI(j*step,t);
5465 static void nfft_precompute_fg_psi(
nfft_plan *ths)
5470 nfft_sort_nodes(ths);
5472 for (t=0; t<ths->
d; t++)
5475 #pragma omp parallel for default(shared) private(j,u,o)
5476 for (j = 0; j < ths->
M_total; j++)
5478 nfft_uo(ths,j,&u,&o,t);
5480 ths->
psi[2*(j*ths->
d+t)]=
5481 (PHI((ths->
x[j*ths->
d+t]-((R)u)/ths->
n[t]),t));
5483 ths->
psi[2*(j*ths->
d+t)+1]=
5484 EXP(K(2.0)*(ths->
n[t]*ths->
x[j*ths->
d+t] - u) / ths->
b[t]);
5490 void nfft_precompute_psi(
nfft_plan *ths)
5497 nfft_sort_nodes(ths);
5499 for (t=0; t<ths->
d; t++)
5502 #pragma omp parallel for default(shared) private(j,l,lj,u,o)
5503 for (j = 0; j < ths->
M_total; j++)
5505 nfft_uo(ths,j,&u,&o,t);
5507 for(l=u, lj=0; l <= o; l++, lj++)
5508 ths->
psi[(j*ths->
d+t)*(2*ths->
m+2)+lj]=
5509 (PHI((ths->
x[j*ths->
d+t]-((R)l)/ths->
n[t]),t));
5516 static void nfft_precompute_full_psi_omp(
nfft_plan *ths)
5523 for(t=0,lprod = 1; t<ths->
d; t++)
5524 lprod *= 2*ths->
m+2;
5527 #pragma omp parallel for default(shared) private(j)
5534 int ll_plain[ths->
d+1];
5536 int u[ths->
d], o[ths->
d];
5538 R phi_prod[ths->
d+1];
5544 MACRO_init_uo_l_lj_t;
5546 for(l_L=0; l_L<lprod; l_L++, ix++)
5548 MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
5551 ths->
psi[ix]=phi_prod[ths->
d];
5553 MACRO_count_uo_l_lj_t;
5561 void nfft_precompute_full_psi(
nfft_plan *ths)
5564 nfft_sort_nodes(ths);
5566 nfft_precompute_full_psi_omp(ths);
5573 int ll_plain[ths->
d+1];
5575 int u[ths->
d], o[ths->
d];
5577 R phi_prod[ths->
d+1];
5581 nfft_sort_nodes(ths);
5586 for(t=0,lprod = 1; t<ths->
d; t++)
5587 lprod *= 2*ths->
m+2;
5589 for(j=0,ix=0,ix_old=0; j<ths->M_total; j++)
5591 MACRO_init_uo_l_lj_t;
5593 for(l_L=0; l_L<lprod; l_L++, ix++)
5595 MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
5598 ths->
psi[ix]=phi_prod[ths->
d];
5600 MACRO_count_uo_l_lj_t;
5610 void nfft_precompute_one_psi(
nfft_plan *ths)
5615 nfft_precompute_fg_psi(ths);
5617 nfft_precompute_psi(ths);
5619 nfft_precompute_full_psi(ths);
5623 static void nfft_init_help(
nfft_plan *ths)
5628 if (ths->
nfft_flags & NFFT_OMP_BLOCKWISE_ADJOINT)
5635 for(t = 0;t < ths->
d; t++)
5636 ths->
sigma[t] = ((R)ths->
n[t])/ths->
N[t];
5650 nfft_precompute_phi_hut(ths);
5654 ths->
K=(1U<< 10)*(ths->
m+2);
5663 (2*ths->
m+2)*
sizeof(R));
5667 for(t=0,lprod = 1; t<ths->
d; t++)
5668 lprod *= 2*ths->
m+2;
5679 int nthreads = nfft_get_omp_num_threads();
5690 #pragma omp critical (nfft_omp_critical_fftw_plan)
5692 fftw_plan_with_nthreads(nthreads);
5694 ths->my_fftw_plan1 = fftw_plan_dft(ths->
d, ths->
n, ths->
g1, ths->
g2, FFTW_FORWARD, ths->
fftw_flags);
5695 ths->my_fftw_plan2 = fftw_plan_dft(ths->
d, ths->
n, ths->
g2, ths->
g1,
5708 ths->
mv_adjoint = (void (*) (
void* ))nfft_adjoint;
5711 void nfft_init(
nfft_plan *ths,
int d,
int *N,
int M_total)
5719 for (t = 0;t < d; t++)
5725 for (t = 0;t < d; t++)
5726 ths->
n[t] = 2*X(next_power_of_2)(ths->
N[t]);
5728 ths->
m = WINDOW_HELP_ESTIMATE_m;
5733 ths->
nfft_flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
5734 FFTW_INIT | FFT_OUT_OF_PLACE | NFFT_SORT_NODES |
5735 NFFT_OMP_BLOCKWISE_ADJOINT;
5737 ths->
nfft_flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
5738 FFTW_INIT | FFT_OUT_OF_PLACE | NFFT_SORT_NODES;
5742 ths->
nfft_flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
5743 FFTW_INIT | FFT_OUT_OF_PLACE;
5745 ths->
fftw_flags= FFTW_ESTIMATE| FFTW_DESTROY_INPUT;
5747 nfft_init_help(ths);
5750 void nfft_init_guru(
nfft_plan *ths,
int d,
int *N,
int M_total,
int *n,
5751 int m,
unsigned nfft_flags,
unsigned fftw_flags)
5767 nfft_init_help(ths);
5770 void nfft_init_1d(
nfft_plan *ths,
int N1,
int M_total)
5776 nfft_init(ths, 1, N, M_total);
5779 void nfft_init_2d(
nfft_plan *ths,
int N1,
int N2,
int M_total)
5785 nfft_init(ths,2,N,M_total);
5788 void nfft_init_3d(
nfft_plan *ths,
int N1,
int N2,
int N3,
int M_total)
5795 nfft_init(ths,3,N,M_total);
5803 if((ths->
x[j]<-K(0.5)) || (ths->
x[j]>= K(0.5)))
5804 return "ths->x out of range [-0.5,0.5)";
5806 for(j=0;j<ths->
d;j++)
5808 if(ths->
sigma[j]<=1)
5809 return "nfft_check: oversampling factor too small";
5811 if(ths->
N[j]<=ths->
m)
5812 return "Polynomial degree N is smaller than cut-off m";
5815 return "polynomial degree N has to be even";
5829 #pragma omp critical (nfft_omp_critical_fftw_plan)
5831 fftw_destroy_plan(ths->my_fftw_plan2);
5832 fftw_destroy_plan(ths->my_fftw_plan1);
5859 for(t=0; t<ths->
d; t++)
5873 WINDOW_HELP_FINALIZE;