/* * Copyright (c) 1997-1999, 2003 Massachusetts Institute of Technology * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ #include #include #include #include #include #include "fftw-int.h" #include "fftw_mpi.h" #include "test_main.h" #define my_printf if (io_okay) printf #define my_fprintf if (io_okay) fprintf #define my_fflush if (io_okay) fflush int ncpus = 1; int my_cpu = 0; int only_parallel = 0; char fftw_prefix[] = "fftw_mpi"; /************************************************* * Speed tests *************************************************/ #define MPI_TIME_FFT(fft,a,n,t) \ { \ double ts,te; \ double total_t; \ int iters = 1,iter; \ zero_arr((n), (a)); \ do { \ MPI_Barrier(MPI_COMM_WORLD); \ ts = MPI_Wtime(); \ for (iter = 0; iter < iters; ++iter) fft; \ te = MPI_Wtime(); \ t = (total_t = (te - ts)) / iters; \ iters *= 2; \ } while (total_t < 2.0); \ } void zero_arr(int n, fftw_complex * a) { int i; for (i = 0; i < n; ++i) c_re(a[i]) = c_im(a[i]) = 0.0; } void test_speed_aux(int n, fftw_direction dir, int flags, int specific) { int local_n, local_start, local_n_after_transform, local_start_after_transform, total_local_size, nalloc; fftw_complex *in, *work; fftw_plan plan = 0; fftw_mpi_plan mpi_plan; double t, t0 = 0.0; if (specific || !(flags & FFTW_IN_PLACE)) return; if (io_okay && !only_parallel) plan = fftw_create_plan(n, dir, speed_flag | flags | wisdom_flag | no_vector_flag); mpi_plan = fftw_mpi_create_plan(MPI_COMM_WORLD, n, dir, speed_flag | flags | wisdom_flag | no_vector_flag); CHECK(mpi_plan, "failed to create plan!"); fftw_mpi_local_sizes(mpi_plan, &local_n, &local_start, &local_n_after_transform, &local_start_after_transform, &total_local_size); if (io_okay && !only_parallel) nalloc = n; else nalloc = total_local_size; in = (fftw_complex *) fftw_malloc(nalloc * howmany_fields * sizeof(fftw_complex)); work = (fftw_complex *) fftw_malloc(nalloc * howmany_fields * sizeof(fftw_complex)); if (io_okay) { WHEN_VERBOSE(2, fftw_mpi_print_plan(mpi_plan)); } if (io_okay && !only_parallel) { FFTW_TIME_FFT(fftw(plan, howmany_fields, in, howmany_fields, 1, work, 1, 0), in, n * howmany_fields, t0); fftw_destroy_plan(plan); WHEN_VERBOSE(1, printf("time for one fft (uniprocessor): %s\n", smart_sprint_time(t0))); } MPI_TIME_FFT(fftw_mpi(mpi_plan, howmany_fields, in, NULL), in, total_local_size * howmany_fields, t); if (io_okay) { WHEN_VERBOSE(1, printf("time for one fft (%d cpus): %s", ncpus, smart_sprint_time(t))); WHEN_VERBOSE(1, printf(" (%s/point)\n", smart_sprint_time(t / n))); WHEN_VERBOSE(1, printf("\"mflops\" = 5 (n log2 n) / (t in microseconds)" " = %f\n", howmany_fields * mflops(t, n))); if (!only_parallel) WHEN_VERBOSE(1, printf("parallel speedup: %f\n", t0 / t)); } MPI_TIME_FFT(fftw_mpi(mpi_plan, howmany_fields, in, work), in, total_local_size * howmany_fields, t); if (io_okay) { WHEN_VERBOSE(1, printf("w/WORK: time for one fft (%d cpus): %s", ncpus, smart_sprint_time(t))); WHEN_VERBOSE(1, printf(" (%s/point)\n", smart_sprint_time(t / n))); WHEN_VERBOSE(1, printf("w/WORK: \"mflops\" = 5 (n log2 n) / (t in microseconds)" " = %f\n", howmany_fields * mflops(t, n))); if (!only_parallel) WHEN_VERBOSE(1, printf("w/WORK: parallel speedup: %f\n", t0 / t)); } fftw_free(in); fftw_free(work); fftw_mpi_destroy_plan(mpi_plan); WHEN_VERBOSE(1, my_printf("\n")); } void test_speed_nd_aux(struct size sz, fftw_direction dir, int flags, int specific) { int local_nx, local_x_start, local_ny_after_transpose, local_y_start_after_transpose, total_local_size; fftw_complex *in, *work; fftwnd_plan plan = 0; fftwnd_mpi_plan mpi_plan; double t, t0 = 0.0; int i, N; if (sz.rank < 2) return; /* only bench in-place multi-dim transforms */ flags |= FFTW_IN_PLACE; N = 1; for (i = 0; i < sz.rank; ++i) N *= (sz.narray[i]); if (specific) { return; } else { if (io_okay && !only_parallel) plan = fftwnd_create_plan(sz.rank, sz.narray, dir, speed_flag | flags | wisdom_flag | no_vector_flag); mpi_plan = fftwnd_mpi_create_plan(MPI_COMM_WORLD, sz.rank, sz.narray, dir, speed_flag | flags | wisdom_flag | no_vector_flag); } CHECK(mpi_plan != NULL, "can't create plan"); fftwnd_mpi_local_sizes(mpi_plan, &local_nx, &local_x_start, &local_ny_after_transpose, &local_y_start_after_transpose, &total_local_size); if (io_okay && !only_parallel) in = (fftw_complex *) fftw_malloc(N * howmany_fields * sizeof(fftw_complex)); else in = (fftw_complex *) fftw_malloc(total_local_size * howmany_fields * sizeof(fftw_complex)); work = (fftw_complex *) fftw_malloc(total_local_size * howmany_fields * sizeof(fftw_complex)); if (io_okay && !only_parallel) { FFTW_TIME_FFT(fftwnd(plan, howmany_fields, in, howmany_fields, 1, 0, 0, 0), in, N * howmany_fields, t0); fftwnd_destroy_plan(plan); WHEN_VERBOSE(1, printf("time for one fft (uniprocessor): %s\n", smart_sprint_time(t0))); } MPI_TIME_FFT(fftwnd_mpi(mpi_plan, howmany_fields, in, NULL, FFTW_NORMAL_ORDER), in, total_local_size * howmany_fields, t); if (io_okay) { WHEN_VERBOSE(1, printf("NORMAL: time for one fft (%d cpus): %s", ncpus, smart_sprint_time(t))); WHEN_VERBOSE(1, printf(" (%s/point)\n", smart_sprint_time(t / N))); WHEN_VERBOSE(1, printf("NORMAL: \"mflops\" = 5 (N log2 N) / " "(t in microseconds)" " = %f\n", howmany_fields * mflops(t, N))); if (!only_parallel) WHEN_VERBOSE(1, printf("NORMAL: parallel speedup: %f\n", t0 / t)); } MPI_TIME_FFT(fftwnd_mpi(mpi_plan, howmany_fields, in, NULL, FFTW_TRANSPOSED_ORDER), in, total_local_size * howmany_fields, t); if (io_okay) { WHEN_VERBOSE(1, printf("TRANSP.: time for one fft (%d cpus): %s", ncpus, smart_sprint_time(t))); WHEN_VERBOSE(1, printf(" (%s/point)\n", smart_sprint_time(t / N))); WHEN_VERBOSE(1, printf("TRANSP.: \"mflops\" = 5 (N log2 N) / " "(t in microseconds)" " = %f\n", howmany_fields * mflops(t, N))); if (!only_parallel) WHEN_VERBOSE(1, printf("TRANSP.: parallel speedup: %f\n", t0 / t)); } MPI_TIME_FFT(fftwnd_mpi(mpi_plan, howmany_fields, in, work, FFTW_NORMAL_ORDER), in, total_local_size * howmany_fields, t); if (io_okay) { WHEN_VERBOSE(1, printf("NORMAL,w/WORK: time for one fft (%d cpus): %s", ncpus, smart_sprint_time(t))); WHEN_VERBOSE(1, printf(" (%s/point)\n", smart_sprint_time(t / N))); WHEN_VERBOSE(1, printf("NORMAL,w/WORK: \"mflops\" = 5 (N log2 N) / " "(t in microseconds)" " = %f\n", howmany_fields * mflops(t, N))); if (!only_parallel) WHEN_VERBOSE(1, printf("NORMAL,w/WORK: parallel speedup: %f\n", t0 / t)); } MPI_TIME_FFT(fftwnd_mpi(mpi_plan, howmany_fields, in, work, FFTW_TRANSPOSED_ORDER), in, total_local_size * howmany_fields, t); if (io_okay) { WHEN_VERBOSE(1, printf("TRANSP.,w/WORK: time for one fft (%d cpus): %s", ncpus, smart_sprint_time(t))); WHEN_VERBOSE(1, printf(" (%s/point)\n", smart_sprint_time(t / N))); WHEN_VERBOSE(1, printf("TRANSP.,w/WORK: \"mflops\" = 5 (N log2 N) / " "(t in microseconds)" " = %f\n", howmany_fields * mflops(t, N))); if (!only_parallel) WHEN_VERBOSE(1, printf("TRANSP.,w/WORK: parallel speedup: %f\n", t0 / t)); } fftwnd_mpi_destroy_plan(mpi_plan); fftw_free(in); fftw_free(work); WHEN_VERBOSE(1, my_printf("\n")); } /************************************************* * correctness tests *************************************************/ void test_out_of_place(int n, int istride, int ostride, int howmany, fftw_direction dir, fftw_plan validated_plan, int specific) { /* one-dim. out-of-place transforms will never be supported in MPI */ WHEN_VERBOSE(2, my_printf("N/A\n")); } void test_in_place(int n, int istride, int howmany, fftw_direction dir, fftw_plan validated_plan, int specific) { int local_n, local_start, local_n_after_transform, local_start_after_transform, total_local_size; fftw_complex *in1, *work = NULL, *in2, *out2; fftw_mpi_plan plan; int i; int flags = measure_flag | wisdom_flag | FFTW_IN_PLACE; if (specific) { WHEN_VERBOSE(2, my_printf("N/A\n")); return; } if (coinflip()) flags |= FFTW_THREADSAFE; plan = fftw_mpi_create_plan(MPI_COMM_WORLD, n, dir, flags); fftw_mpi_local_sizes(plan, &local_n, &local_start, &local_n_after_transform, &local_start_after_transform, &total_local_size); in1 = (fftw_complex *) fftw_malloc(total_local_size * sizeof(fftw_complex) * howmany); if (coinflip()) { WHEN_VERBOSE(2, my_printf("w/work...")); work = (fftw_complex *) fftw_malloc(total_local_size * sizeof(fftw_complex) * howmany); } in2 = (fftw_complex *) fftw_malloc(n * sizeof(fftw_complex) * howmany); out2 = (fftw_complex *) fftw_malloc(n * sizeof(fftw_complex) * howmany); /* generate random inputs */ for (i = 0; i < n * howmany; ++i) { c_re(in2[i]) = DRAND(); c_im(in2[i]) = DRAND(); } for (i = 0; i < local_n * howmany; ++i) { c_re(in1[i]) = c_re(in2[i + local_start*howmany]); c_im(in1[i]) = c_im(in2[i + local_start*howmany]); } /* fft-ize */ fftw_mpi(plan, howmany, in1, work); fftw_mpi_destroy_plan(plan); fftw(validated_plan, howmany, in2, howmany, 1, out2, howmany, 1); CHECK(compute_error_complex(in1, 1, out2 + local_start_after_transform*howmany, 1, howmany*local_n_after_transform) < TOLERANCE, "test_in_place: wrong answer"); WHEN_VERBOSE(2, my_printf("OK\n")); fftw_free(in1); fftw_free(work); fftw_free(in2); fftw_free(out2); } void test_out_of_place_both(int n, int istride, int ostride, int howmany, fftw_plan validated_plan_forward, fftw_plan validated_plan_backward) { } void test_in_place_both(int n, int istride, int howmany, fftw_plan validated_plan_forward, fftw_plan validated_plan_backward) { WHEN_VERBOSE(2, my_printf("TEST CORRECTNESS (in place, FFTW_FORWARD, %s) " "n = %d istride = %d howmany = %d\n", SPECIFICP(0), n, istride, howmany)); test_in_place(n, istride, howmany, FFTW_FORWARD, validated_plan_forward, 0); WHEN_VERBOSE(2, my_printf("TEST CORRECTNESS (in place, FFTW_BACKWARD, %s) " "n = %d istride = %d howmany = %d\n", SPECIFICP(0), n, istride, howmany)); test_in_place(n, istride, howmany, FFTW_BACKWARD, validated_plan_backward, 0); } void test_correctness(int n) { int howmany; fftw_plan validated_plan_forward, validated_plan_backward; WHEN_VERBOSE(1, my_printf("Testing correctness for n = %d...", n); my_fflush(stdout)); /* produce a good plan */ validated_plan_forward = fftw_create_plan(n, FFTW_FORWARD, measure_flag | wisdom_flag); validated_plan_backward = fftw_create_plan(n, FFTW_BACKWARD, measure_flag | wisdom_flag); for (howmany = 1; howmany <= MAX_HOWMANY; ++howmany) test_in_place_both(n, howmany, howmany, validated_plan_forward, validated_plan_backward); fftw_destroy_plan(validated_plan_forward); fftw_destroy_plan(validated_plan_backward); if (!(wisdom_flag & FFTW_USE_WISDOM) && chk_mem_leak) fftw_check_memory_leaks(); WHEN_VERBOSE(1, my_printf("OK\n")); } /************************************************* * multi-dimensional correctness tests *************************************************/ void testnd_out_of_place(int rank, int *n, fftw_direction dir, fftwnd_plan validated_plan) { } void testnd_in_place(int rank, int *n, fftw_direction dir, fftwnd_plan validated_plan, int alternate_api, int specific, int force_buffered) { int local_nx, local_x_start, local_ny_after_transpose, local_y_start_after_transpose, total_local_size; int istride; int N, dim, i; fftw_complex *in1, *work = 0, *in2; fftwnd_mpi_plan p = 0; int flags = measure_flag | wisdom_flag | FFTW_IN_PLACE; if (specific || rank < 2) return; if (coinflip()) flags |= FFTW_THREADSAFE; if (force_buffered) flags |= FFTWND_FORCE_BUFFERED; N = 1; for (dim = 0; dim < rank; ++dim) N *= n[dim]; if (alternate_api && (rank == 2 || rank == 3)) { if (rank == 2) p = fftw2d_mpi_create_plan(MPI_COMM_WORLD, n[0], n[1], dir, flags); else p = fftw3d_mpi_create_plan(MPI_COMM_WORLD, n[0], n[1], n[2], dir, flags); } else /* standard api */ p = fftwnd_mpi_create_plan(MPI_COMM_WORLD, rank, n, dir, flags); fftwnd_mpi_local_sizes(p, &local_nx, &local_x_start, &local_ny_after_transpose, &local_y_start_after_transpose, &total_local_size); in1 = (fftw_complex *) fftw_malloc(total_local_size * MAX_STRIDE * sizeof(fftw_complex)); if (coinflip()) { WHEN_VERBOSE(1, my_printf("w/work...")); work = (fftw_complex *) fftw_malloc(total_local_size * MAX_STRIDE * sizeof(fftw_complex)); } in2 = (fftw_complex *) fftw_malloc(N * sizeof(fftw_complex)); for (istride = 1; istride <= MAX_STRIDE; ++istride) { /* generate random inputs */ for (i = 0; i < N; ++i) { c_re(in2[i]) = DRAND(); c_im(in2[i]) = DRAND(); } for (i = 0; i < local_nx * (N/n[0]); ++i) { int j; for (j = 0; j < istride; ++j) { c_re(in1[i * istride + j]) = c_re((in2 + local_x_start * (N/n[0])) [i]); c_im(in1[i * istride + j]) = c_im((in2 + local_x_start * (N/n[0])) [i]); } } fftwnd_mpi(p, istride, in1, work, FFTW_NORMAL_ORDER); fftwnd(validated_plan, 1, in2, 1, 1, NULL, 0, 0); for (i = 0; i < istride; ++i) CHECK(compute_error_complex(in1 + i, istride, in2 + local_x_start * (N/n[0]), 1, local_nx * (N/n[0])) < TOLERANCE, "testnd_in_place: wrong answer"); } fftwnd_mpi_destroy_plan(p); fftw_free(in2); fftw_free(work); fftw_free(in1); } void testnd_correctness(struct size sz, fftw_direction dir, int alt_api, int specific, int force_buf) { fftwnd_plan validated_plan; validated_plan = fftwnd_create_plan(sz.rank, sz.narray, dir, measure_flag | wisdom_flag | FFTW_IN_PLACE); testnd_in_place(sz.rank, sz.narray, dir, validated_plan, alt_api, specific, force_buf); fftwnd_destroy_plan(validated_plan); } /************************************************* * planner tests *************************************************/ void test_planner(int rank) { /* * create and destroy many plans, at random. Check the * garbage-collecting allocator of twiddle factors */ int i, dim; int r, s; fftw_mpi_plan p[PLANNER_TEST_SIZE]; fftwnd_mpi_plan pnd[PLANNER_TEST_SIZE]; int *narr, maxdim; chk_mem_leak = 0; verbose--; please_wait(); if (rank < 1) rank = 1; narr = (int *) fftw_malloc(rank * sizeof(int)); for (i = 0; i < PLANNER_TEST_SIZE; ++i) { p[i] = (fftw_mpi_plan) 0; pnd[i] = (fftwnd_mpi_plan) 0; } if (PLANNER_TEST_SIZE >= 8) { p[0] = fftw_mpi_create_plan(MPI_COMM_WORLD, 1024, FFTW_FORWARD, 0); p[1] = fftw_mpi_create_plan(MPI_COMM_WORLD, 1024, FFTW_FORWARD, 0); p[2] = fftw_mpi_create_plan(MPI_COMM_WORLD, 1024, FFTW_BACKWARD, 0); p[3] = fftw_mpi_create_plan(MPI_COMM_WORLD, 1024, FFTW_BACKWARD, 0); p[4] = fftw_mpi_create_plan(MPI_COMM_WORLD, 1024, FFTW_FORWARD, 0); p[5] = fftw_mpi_create_plan(MPI_COMM_WORLD, 1024, FFTW_FORWARD, 0); p[6] = fftw_mpi_create_plan(MPI_COMM_WORLD, 1024, FFTW_BACKWARD, 0); p[7] = fftw_mpi_create_plan(MPI_COMM_WORLD, 1024, FFTW_BACKWARD, 0); } maxdim = (int) pow(8192, 1.0/rank); for (i = 0; i < PLANNER_TEST_SIZE * PLANNER_TEST_SIZE; ++i) { r = rand(); if (r < 0) r = -r; r = r % PLANNER_TEST_SIZE; for (dim = 0; dim < rank; ++dim) { do { s = rand(); if (s < 0) s = -s; s = s % maxdim + 1; } while (s == 0); narr[dim] = s; } if (rank == 1) { if (p[r]) fftw_mpi_destroy_plan(p[r]); p[r] = fftw_mpi_create_plan(MPI_COMM_WORLD, narr[0], random_dir(), measure_flag | wisdom_flag); } if (rank > 1) { if (pnd[r]) fftwnd_mpi_destroy_plan(pnd[r]); pnd[r] = fftwnd_mpi_create_plan(MPI_COMM_WORLD, rank, narr, random_dir(), measure_flag | wisdom_flag); } if (i % (PLANNER_TEST_SIZE * PLANNER_TEST_SIZE / 20) == 0) { WHEN_VERBOSE(0, my_printf("test planner: so far so good\n")); WHEN_VERBOSE(0, my_printf("test planner: iteration %d" " out of %d\n", i, PLANNER_TEST_SIZE * PLANNER_TEST_SIZE)); } } for (i = 0; i < PLANNER_TEST_SIZE; ++i) { if (p[i]) fftw_mpi_destroy_plan(p[i]); if (pnd[i]) fftwnd_mpi_destroy_plan(pnd[i]); } fftw_free(narr); verbose++; chk_mem_leak = 1; } /************************************************* * test initialization *************************************************/ void test_init(int *argc, char ***argv) { int i; unsigned int seed; MPI_Init(argc,argv); MPI_Comm_size(MPI_COMM_WORLD,&ncpus); MPI_Comm_rank(MPI_COMM_WORLD,&my_cpu); /* Only process 0 gets to do I/O: */ io_okay = my_cpu == 0; if (io_okay) for (i = 1; i < *argc; ++i) if (!strcmp((*argv)[i], "--only-parallel")) { only_parallel = 1; strcpy((*argv)[i], ""); } MPI_Bcast(&only_parallel, 1, MPI_INT, 0, MPI_COMM_WORLD); /* Make sure all processes use the same seed for random numbers: */ seed = time(NULL); MPI_Bcast(&seed, 1, MPI_INT, 0, MPI_COMM_WORLD); srand(seed); fftw_die_hook = fftw_mpi_die; /* call MPI_Abort on failure */ } void test_finish(void) { MPI_Finalize(); } void enter_paranoid_mode(void) { } /* in MPI, only process 0 is guaranteed to have access to the argument list */ int get_option(int argc, char **argv, char *argval, int argval_maxlen) { int c; int arglen; if (io_okay) { c = default_get_option(argc,argv,argval,argval_maxlen); arglen = strlen(argval) + 1; } MPI_Bcast(&c, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&arglen, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(argval, arglen, MPI_CHAR, 0, MPI_COMM_WORLD); return c; }