51 return a >= b ? a : b;
58 else return (
double)n*
fak(n-1);
73 for (k=0; k<=m-r; k++) {
74 sum+=
binom(m+k,k)*pow((xx+1.0)/2.0,(
double)k);
76 return sum*pow((xx+1.0),(
double)r)*pow(1.0-xx,(
double)(m+1))/(1<<(m+1))/
fak(r);
80 double _Complex
regkern(kernel k,
double xx,
int p,
const double *param,
double a,
double b)
83 double _Complex sum=0.0;
89 if ((xx>=-0.5+b && xx<=-a) || (xx>=a && xx<=0.5-b)) {
93 sum=(k(-0.5,0,param)+k(0.5,0,param))/2.0
96 sum+=pow(-b/2.0,(
double)r)
102 else if ((xx>-a) && (xx<a)) {
103 for (r=0; r<p; r++) {
104 sum+=pow(a,(
double)r)
106 +k( a,r,param)*
BasisPoly(p-1,r,-xx/a)*(r & 1 ? -1 : 1));
111 sum=(k(-0.5,0,param)+k(0.5,0,param))/2.0
113 for (r=0; r<p; r++) {
114 sum+=pow(b/2.0,(
double)r)
120 return k(xx,0,param);
126 double _Complex
regkern1(kernel k,
double xx,
int p,
const double *param,
double a,
double b)
129 double _Complex sum=0.0;
135 if ((xx>=-0.5+b && xx<=-a) || (xx>=a && xx<=0.5-b))
137 return k(xx,0,param);
139 else if ((xx>-a) && (xx<a))
141 for (r=0; r<p; r++) {
142 sum+=pow(a,(
double)r)
144 +k( a,r,param)*
BasisPoly(p-1,r,-xx/a)*(r & 1 ? -1 : 1));
150 for (r=0; r<p; r++) {
151 sum+=pow(b,(
double)r)
152 *( k(0.5-b,r,param)*
BasisPoly(p-1,r,(xx+0.5)/b)
153 +k(-0.5+b,r,param)*
BasisPoly(p-1,r,-(xx+0.5)/b)*(r & 1 ? -1 : 1));
159 for (r=0; r<p; r++) {
160 sum+=pow(b,(
double)r)
161 *( k(0.5-b,r,param)*
BasisPoly(p-1,r,(xx-0.5)/b)
162 +k(-0.5+b,r,param)*
BasisPoly(p-1,r,-(xx-0.5)/b)*(r & 1 ? -1 : 1));
166 return k(xx,0,param);
170 double _Complex
regkern2(kernel k,
double xx,
int p,
const double *param,
double a,
double b)
173 double _Complex sum=0.0;
178 for (r=0; r<p; r++) {
179 sum+=pow(b,(
double)r)*k(0.5-b,r,param)
184 else if ((a<=xx) && (xx<=0.5-b)) {
185 return k(xx,0,param);
188 for (r=0; r<p; r++) {
189 sum+=pow(-a,(
double)r)*k(a,r,param)
194 else if ((0.5-b<xx) && (xx<=0.5)) {
195 for (r=0; r<p; r++) {
196 sum+=pow(b,(
double)r)*k(0.5-b,r,param)
207 double _Complex
regkern3(kernel k,
double xx,
int p,
const double *param,
double a,
double b)
210 double _Complex sum=0.0;
219 if ((a<=xx) && (xx<=0.5-b)) {
220 return k(xx,0,param);
223 for (r=0; r<p; r++) {
224 sum+=pow(-a,(
double)r)*k(a,r,param)
230 else if ((0.5-b<xx) && (xx<=0.5)) {
231 sum=k(0.5,0,param)*
BasisPoly(p-1,0,-2.0*xx/b+(1.0-b)/b);
233 for (r=0; r<p; r++) {
234 sum+=pow(b/2.0,(
double)r)
244 double _Complex
linintkern(
const double x,
const double _Complex *Add,
245 const int Ad,
const double a)
249 double _Complex f1,f2;
253 f1=Add[r];f2=Add[r+1];
257 return (-f1*c3+f2*c1);
260 double _Complex quadrintkern(
const double x,
const double _Complex *Add,
261 const int Ad,
const double a)
265 double _Complex f0,f1,f2;
269 if (r==0) {f0=Add[r+1];f1=Add[r];f2=Add[r+1];}
270 else { f0=Add[r-1];f1=Add[r];f2=Add[r+1];}
275 return (f0*c1*c3/2.0-f1*c2*c3+f2*c2*c1/2.0);
279 double _Complex
kubintkern(
const double x,
const double _Complex *Add,
280 const int Ad,
const double a)
282 double c,c1,c2,c3,c4;
284 double _Complex f0,f1,f2,f3;
287 if (r==0) {f0=Add[r+1];f1=Add[r];f2=Add[r+1];f3=Add[r+2];}
288 else { f0=Add[r-1];f1=Add[r];f2=Add[r+1];f3=Add[r+2];}
296 return(-f0*c1*c3*c4/6.0+f1*c2*c3*c4/2.0-f2*c2*c1*c4/2.0+f3*c2*c1*c3/6.0);
300 double _Complex
kubintkern1(
const double x,
const double _Complex *Add,
301 const int Ad,
const double a)
303 double c,c1,c2,c3,c4;
305 double _Complex f0,f1,f2,f3;
311 { f0=Add[r-1];f1=Add[r];f2=Add[r+1];f3=Add[r+2];}
319 return(-f0*c1*c3*c4/6.0+f1*c2*c3*c4/2.0-f2*c2*c1*c4/2.0+f3*c2*c1*c3/6.0);
323 void quicksort(
int d,
int t,
double *x,
double _Complex *alpha,
int N)
328 double pivot=x[(N/2)*d+t];
332 double _Complex temp2;
336 while (x[lpos*d+t]<pivot)
338 while (x[rpos*d+t]>pivot)
345 x[lpos*d+k]=x[rpos*d+k];
349 alpha[lpos]=alpha[rpos];
359 quicksort(d,t,x+lpos*d,alpha+lpos,N-lpos);
369 box_index = (
int *) malloc(ths->box_count *
sizeof(
int));
370 for (t=0; t < ths->box_count; t++)
373 for (l=0; l < ths->
N_total; l++)
376 for (t=0; t < ths->
d; t++)
378 val[t] = ths->
x[ths->
d * l + t] + 0.25 - ths->
eps_B/2.0;
379 ind *= ths->box_count_per_dim;
380 ind += (int) (val[t] / ths->
eps_I);
385 ths->box_offset[0] = 0;
386 for (t=1; t<=ths->box_count; t++)
388 ths->box_offset[t] = ths->box_offset[t-1] + box_index[t-1];
389 box_index[t-1] = ths->box_offset[t-1];
392 for (l=0; l < ths->
N_total; l++)
395 for (t=0; t < ths->
d; t++)
397 val[t] = ths->
x[ths->
d * l + t] + 0.25 - ths->
eps_B/2.0;
398 ind *= ths->box_count_per_dim;
399 ind += (int) (val[t] / ths->
eps_I);
402 ths->box_alpha[box_index[ind]] = ths->
alpha[l];
404 for (t=0; t < ths->
d; t++)
406 ths->box_x[ths->
d * box_index[ind] + t] = ths->
x[ths->
d * l + t];
414 static inline double _Complex
calc_SearchBox(
int d,
double *y,
double *x,
double _Complex *alpha,
int start,
int end_lt,
const double _Complex *Add,
const int Ad,
int p,
double a,
const kernel k,
const double *param,
const unsigned flags)
416 double _Complex result = 0.0;
421 for (m = start; m < end_lt; m++)
431 r+=(y[l]-x[m*d+l])*(y[l]-x[m*d+l]);
436 result += alpha[m]*k(r,0,param);
440 result -= alpha[m]*
regkern1(k,r,p,param,a,1.0/16.0);
447 result -= alpha[m]*
regkern(k,r,p,param,a,1.0/16.0);
451 #elif defined(NF_QUADR)
452 result -= alpha[m]*quadrintkern(r,Add,Ad,a);
453 #elif defined(NF_LIN)
456 #error define interpolation method
467 double _Complex val = 0.0;
469 int y_multiind[ths->
d];
470 int multiindex[ths->
d];
473 for (t=0; t < ths->
d; t++)
475 y_multiind[t] = ((y[t] + 0.25 - ths->
eps_B/2.0) / ths->
eps_I);
480 for (y_ind =
max_i(0, y_multiind[0]-1); y_ind < ths->box_count_per_dim && y_ind <= y_multiind[0]+1; y_ind++){
481 val +=
calc_SearchBox(ths->
d, y, ths->box_x, ths->box_alpha, ths->box_offset[y_ind], ths->box_offset[y_ind+1], ths->
Add, ths->
Ad, ths->
p, ths->
eps_I, ths->
k, ths->
kernel_param, ths->
flags);
486 for (multiindex[0] =
max_i(0, y_multiind[0]-1); multiindex[0] < ths->box_count_per_dim && multiindex[0] <= y_multiind[0]+1; multiindex[0]++)
487 for (multiindex[1] =
max_i(0, y_multiind[1]-1); multiindex[1] < ths->box_count_per_dim && multiindex[1] <= y_multiind[1]+1; multiindex[1]++)
489 y_ind = (ths->box_count_per_dim * multiindex[0]) + multiindex[1];
490 val +=
calc_SearchBox(ths->
d, y, ths->box_x, ths->box_alpha, ths->box_offset[y_ind], ths->box_offset[y_ind+1], ths->
Add, ths->
Ad, ths->
p, ths->
eps_I, ths->
k, ths->
kernel_param, ths->
flags);
495 for (multiindex[0] =
max_i(0, y_multiind[0]-1); multiindex[0] < ths->box_count_per_dim && multiindex[0] <= y_multiind[0]+1; multiindex[0]++)
496 for (multiindex[1] =
max_i(0, y_multiind[1]-1); multiindex[1] < ths->box_count_per_dim && multiindex[1] <= y_multiind[1]+1; multiindex[1]++)
497 for (multiindex[2] =
max_i(0, y_multiind[2]-1); multiindex[2] < ths->box_count_per_dim && multiindex[2] <= y_multiind[2]+1; multiindex[2]++)
499 y_ind = ((ths->box_count_per_dim * multiindex[0]) + multiindex[1]) * ths->box_count_per_dim + multiindex[2];
500 val +=
calc_SearchBox(ths->
d, y, ths->box_x, ths->box_alpha, ths->box_offset[y_ind], ths->box_offset[y_ind+1], ths->
Add, ths->
Ad, ths->
p, ths->
eps_I, ths->
k, ths->
kernel_param, ths->
flags);
510 void BuildTree(
int d,
int t,
double *x,
double _Complex *alpha,
int N)
519 BuildTree(d, (t+1)%d, x+(m+1)*d, alpha+(m+1), N-m-1);
524 double _Complex
SearchTree(
const int d,
const int t,
const double *x,
525 const double _Complex *alpha,
const double *xmin,
const double *xmax,
526 const int N,
const kernel k,
const double *param,
const int Ad,
527 const double _Complex *Add,
const int p,
const unsigned flags)
530 double Min=xmin[t], Max=xmax[t], Median=x[m*d+t];
531 double a=fabs(Max-Min)/2;
539 return SearchTree(d,(t+1)%d,x+(m+1)*d,alpha+(m+1),xmin,xmax,N-m-1,k,param,Ad,Add,p,flags);
541 return SearchTree(d,(t+1)%d,x,alpha,xmin,xmax,m,k,param,Ad,Add,p,flags);
544 double _Complex result = 0.0;
549 if ( x[m*d+l]>xmin[l] && x[m*d+l]<xmax[l] )
563 r+=(xmin[l]+a-x[m*d+l])*(xmin[l]+a-x[m*d+l]);
568 result += alpha[m]*k(r,0,param);
572 result -= alpha[m]*
regkern1(k,r,p,param,a,1.0/16.0);
579 result -= alpha[m]*
regkern(k,r,p,param,a,1.0/16.0);
583 #elif defined(NF_QUADR)
584 result -= alpha[m]*quadrintkern(r,Add,Ad,a);
585 #elif defined(NF_LIN)
588 #error define interpolation method
593 result +=
SearchTree(d,(t+1)%d,x+(m+1)*d,alpha+(m+1),xmin,xmax,N-m-1,k,param,Ad,Add,p,flags)
594 +
SearchTree(d,(t+1)%d,x,alpha,xmin,xmax,m,k,param,Ad,Add,p,flags);
600 void fastsum_init_guru(
fastsum_plan *ths,
int d,
int N_total,
int M_total, kernel k,
double *param,
unsigned flags,
int nn,
int m,
int p,
double eps_I,
double eps_B)
605 int sort_flags_trafo = 0;
606 int sort_flags_adjoint = 0;
608 int nthreads = nfft_get_omp_num_threads();
613 sort_flags_trafo = NFFT_SORT_NODES;
615 sort_flags_adjoint = NFFT_SORT_NODES | NFFT_OMP_BLOCKWISE_ADJOINT;
617 sort_flags_adjoint = NFFT_SORT_NODES;
626 ths->
x = (
double *)
nfft_malloc(d*N_total*(
sizeof(
double)));
627 ths->
alpha = (
double _Complex *)
nfft_malloc(N_total*(
sizeof(
double _Complex)));
629 ths->
y = (
double *)
nfft_malloc(d*M_total*(
sizeof(
double)));
630 ths->
f = (
double _Complex *)
nfft_malloc(M_total*(
sizeof(
double _Complex)));
646 ths->
Ad = 4*(ths->
p)*(ths->
p);
647 ths->
Add = (
double _Complex *)
nfft_malloc((ths->
Ad+5)*(
sizeof(
double _Complex)));
651 if (ths->
k == one_over_x)
656 case 2: delta = 1e-3;
658 case 3: delta = 1e-4;
660 case 4: delta = 1e-5;
662 case 5: delta = 1e-6;
664 case 6: delta = 1e-6;
666 case 7: delta = 1e-7;
668 default: delta = 1e-8;
672 ths->
Ad =
max_i(10, (
int) ceil(1.4/pow(delta,1.0/4.0)));
673 ths->
Add = (
double _Complex *)
nfft_malloc((ths->
Ad+3)*(
sizeof(
double _Complex)));
674 #elif defined(NF_QUADR)
675 ths->
Ad = (int) ceil(2.2/pow(delta,1.0/3.0));
676 ths->
Add = (
double _Complex *)
nfft_malloc((ths->
Ad+3)*(
sizeof(
double _Complex)));
677 #elif defined(NF_LIN)
678 ths->
Ad = (int) ceil(1.7/pow(delta,1.0/2.0));
679 ths->
Add = (
double _Complex *)
nfft_malloc((ths->
Ad+3)*(
sizeof(
double _Complex)));
681 #error define NF_LIN or NF_QUADR or NF_KUB
686 ths->
Ad = 2*(ths->
p)*(ths->
p);
687 ths->
Add = (
double _Complex *)
nfft_malloc((ths->
Ad+3)*(
sizeof(
double _Complex)));
699 nfft_init_guru(&(ths->
mv1), d, N, N_total, n, m,
701 PRE_PHI_HUT| PRE_PSI| MALLOC_X | MALLOC_F_HAT| MALLOC_F| FFTW_INIT | FFT_OUT_OF_PLACE,
702 FFTW_MEASURE| FFTW_DESTROY_INPUT);
703 nfft_init_guru(&(ths->
mv2), d, N, M_total, n, m,
705 PRE_PHI_HUT| PRE_PSI| MALLOC_X | MALLOC_F_HAT| MALLOC_F| FFTW_INIT | FFT_OUT_OF_PLACE,
706 FFTW_MEASURE| FFTW_DESTROY_INPUT);
713 ths->
b = (fftw_complex *)
nfft_malloc(n_total*
sizeof(fftw_complex));
715 #pragma omp critical (nfft_omp_critical_fftw_plan)
717 fftw_plan_with_nthreads(nthreads);
720 ths->fft_plan = fftw_plan_dft(d,N,ths->
b,ths->
b,FFTW_FORWARD,FFTW_ESTIMATE);
726 if (ths->
flags & NEARFIELD_BOXES)
728 ths->box_count_per_dim = floor((0.5 - ths->
eps_B) / ths->
eps_I) + 1;
730 for (t=0; t<ths->
d; t++)
731 ths->box_count *= ths->box_count_per_dim;
733 ths->box_offset = (
int *)
nfft_malloc((ths->box_count+1) *
sizeof(
int));
735 ths->box_alpha = (
double _Complex *)
nfft_malloc(ths->
N_total*(
sizeof(
double _Complex)));
752 nfft_finalize(&(ths->
mv1));
753 nfft_finalize(&(ths->
mv2));
756 #pragma omp critical (nfft_omp_critical_fftw_plan)
759 fftw_destroy_plan(ths->fft_plan);
766 if (ths->
flags & NEARFIELD_BOXES)
781 #pragma omp parallel for default(shared) private(j,k,t,r)
788 r = ths->
y[j] - ths->
x[k];
792 for (t=0; t<ths->
d; t++)
793 r += (ths->
y[j*ths->
d+t]-ths->
x[k*ths->
d+t])*(ths->
y[j*ths->
d+t]-ths->
x[k*ths->
d+t]);
818 if (ths->
flags & NEARFIELD_BOXES)
841 #pragma omp parallel for default(shared) private(k)
842 for (k=-ths->
Ad/2-2; k <= ths->Ad/2+2; k++)
845 #pragma omp parallel for default(shared) private(k)
846 for (k=0; k <= ths->
Ad+2; k++)
860 for (t=0; t<ths->
mv1.
d; t++)
868 nfft_precompute_psi(&(ths->
mv1));
871 nfft_precompute_full_psi(&(ths->
mv1));
886 for (t=0; t<ths->
mv2.
d; t++)
894 nfft_precompute_psi(&(ths->
mv2));
897 nfft_precompute_full_psi(&(ths->
mv2));
909 for (t=0; t<ths->
d; t++)
912 #pragma omp parallel
for default(shared)
private(j,k,t)
913 for (j=0; j<n_total; j++)
921 for (t=0; t<ths->
d; t++)
923 ths->
b[j] += ((double)(k % (ths->
n)) / (ths->
n) - 0.5) * ((double)(k % (ths->
n)) / (ths->
n) - 0.5);
931 fftw_execute(ths->fft_plan);
954 nfft_adjoint(&(ths->
mv1));
965 #pragma omp parallel for default(shared) private(k)
989 #pragma omp parallel for default(shared) private(j,k,t)
992 double ymin[ths->
d], ymax[ths->
d];
994 if (ths->
flags & NEARFIELD_BOXES)
1000 for (t=0; t<ths->
d; t++)
1002 ymin[t] = ths->
y[ths->
d*j+t] - ths->
eps_I;
1003 ymax[t] = ths->
y[ths->
d*j+t] + ths->
eps_I;
1005 ths->
f[j] = ths->
mv2.
f[j] +
SearchTree(ths->
d,0, ths->
x, ths->
alpha, ymin, ymax, ths->
N_total, ths->
k, ths->
kernel_param, ths->
Ad, ths->
Add, ths->
p, ths->
flags);