/*
 * Copyright (c) 1997 Massachusetts Institute of Technology
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to use, copy, modify, and distribute the Software without
 * restriction, provided the Software, including any modified copies made
 * under this license, is not distributed for a fee, subject to
 * the following conditions:
 * 
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE MASSACHUSETTS INSTITUTE OF TECHNOLOGY BE LIABLE
 * FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
 * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 * 
 * Except as contained in this notice, the name of the Massachusetts
 * Institute of Technology shall not be used in advertising or otherwise
 * to promote the sale, use or other dealings in this Software without
 * prior written authorization from the Massachusetts Institute of
 * Technology.
 *  
 */

#include <stdio.h>
#include <stdlib.h>

#include <math.h>

#include "bench_utils.h"
#include "rbench_ffts.h"

/***************************************************************************/
/* Prototypes for included FFTs: */

#include <rfftw.h>

/*
 * FFTPACK real-transform entry points (Fortran linkage): an initialisation
 * routine (*rffti) plus a forward (*rfftf) and a backward (*rfftb)
 * transform.  do_fftpack_rfft calls the drfft* routines when FFTW_REAL has
 * the size of a double and the srfft* routines when it has the size of a
 * float.
 *
 * The second (upper-case) argument of FORTRANIZE must be spelled exactly as
 * at the call sites (DRFFTI, DRFFTF, ...); otherwise a build in which
 * FORTRANIZE picks the upper-case spelling would call functions that were
 * never declared.
 */
void FORTRANIZE(drffti,DRFFTI)(int *n, FFTW_REAL * wsave);
void FORTRANIZE(drfftf,DRFFTF)(int *n, FFTW_REAL * c,
                               FFTW_REAL * wsave);
void FORTRANIZE(drfftb,DRFFTB)(int *n, FFTW_REAL * c,
                               FFTW_REAL * wsave);

void FORTRANIZE(srffti,SRFFTI)(int *n, FFTW_REAL * wsave);
void FORTRANIZE(srfftf,SRFFTF)(int *n, FFTW_REAL * c,
                               FFTW_REAL * wsave);
void FORTRANIZE(srfftb,SRFFTB)(int *n, FFTW_REAL * c,
                               FFTW_REAL * wsave);

#ifdef macintosh /* Mac compilers don't use pathnames */
#include "gsl_fft_real.h"
#include "gsl_fft_halfcomplex.h"
#else
#include "c_source/gsl/gsl_fft_real.h"
#include "c_source/gsl/gsl_fft_halfcomplex.h"
#endif

/* Forward / inverse routines benchmarked as "RMayer" (in-place on real). */
void  mayer_realfft_1(int n, FFTW_REAL *real);
void mayer_realifft_1(int n, FFTW_REAL *real);

/* Ooura real DFT, C and Fortran variants ("Ooura (C)" / "Ooura (F)").
   isgn selects the direction; ip and w are work arrays set up by a first
   call made with ip[0] == 0 (see do_ooura_rfft). */
void ooura_c_rdft(int n, int isgn, FFTW_REAL *a, int *ip, FFTW_REAL *w);
void FORTRANIZE(oourafrdft,OOURAFRDFT)(int *n, int *isgn,
				       FFTW_REAL *a, int *ip, FFTW_REAL *w);

/* "QFT": forward (rfqft) and inverse (riqft) transforms plus the
   initialisation routine that fills a caller-supplied work area. */
void    rfqft(FFTW_REAL *R, FFTW_REAL *I, FFTW_REAL *x, int n, int k);
void    riqft(FFTW_REAL *R, FFTW_REAL *I, FFTW_REAL *x, int n, int k);
int     qftinit(int n, void *work);

/* "Crandall (out-of-order)": table initialisation and the in-place
   real <-> Hermitian transforms. */
void crandall_init_fft(int,FFTW_REAL*,int*);
void crandall_fft_real_to_hermitian(FFTW_REAL*, int);
void crandall_fftinv_hermitian_to_real(FFTW_REAL*, int);

/* "Sorensen" (Fortran): table initialisation, forward and backward real
   transforms.  The four FFTW_REAL* table arguments are carved out of the
   work array by do_sorensen_rfft. */
void FORTRANIZE(ksorensenffti,KSORENSENFFTI)(int*,FFTW_REAL*,FFTW_REAL*,
					    FFTW_REAL*, FFTW_REAL*);
void FORTRANIZE(ksorensenrfftf,KSORENSENRFFTF)(FFTW_REAL*, int *, int *,
					       FFTW_REAL*,FFTW_REAL*,
					       FFTW_REAL*,FFTW_REAL*, int *);
void FORTRANIZE(ksorensenrfftb,KSORENSENRFFTB)(FFTW_REAL*, int *, int *,
					       FFTW_REAL*,FFTW_REAL*,
					       FFTW_REAL*,FFTW_REAL*, int *);

/* "Green": init/free pair plus forward and inverse real transforms.
   do_green_rfft passes M = log2(N) and treats a non-zero return from
   green_fftInit as failure. */
int green_fftInit(int M);
void green_fftFree(void);
void green_rffts(FFTW_REAL *ioptr, int M, int Rows);
void green_riffts(FFTW_REAL *ioptr, int M, int Rows);

/* "NR (C)" and "NR (F)" real transforms.  The C version is called with a
   pointer one element before the data (1-based indexing, see
   do_nrc_rfft). */
void nrc_realft(FFTW_REAL data[], unsigned int n, int isign);
void FORTRANIZE(nrfrealft,NRFREALFT)(FFTW_REAL*, int*, int*);

/* "Singleton" (Fortran): complex FFT on interleaved data ... */
void FORTRANIZE(singletonfft, SINGLETONFFT) (FFTW_REAL *, FFTW_REAL *,
					     int *, int *, int *, int *);

/* ... and the companion real-transform pass applied after (forward) or
   before (backward) it. */
void FORTRANIZE(singrealtr, SINGREALTR) (FFTW_REAL *, FFTW_REAL *,
					 int *, int *);

/* "Singleton (f2c)": C counterparts of the two routines above, taking
   their integer arguments by value. */
void go_fft(FFTW_REAL*, FFTW_REAL*, int, int, int, int);
void sing_realtr(FFTW_REAL*, FFTW_REAL*, int, int);

/* "Bloodworth" and "Bloodworth (FHT-based)" forward / reverse real FFTs. */
void Bloodworth_Q2_FwdRealFFT(FFTW_REAL*, int);
void Bloodworth_Q2_RevRealFFT(FFTW_REAL*, int);
void FwdRealFFT(FFTW_REAL*, int);
void RevRealFFT(FFTW_REAL*, int);

/***************************************************************************/

/* Defined elsewhere.  The Singleton drivers below run only when this
   returns non-zero for (N, 23) and otherwise report "can't handle prime
   factors > 23" -- presumably it tests that no prime factor of n exceeds
   maxprime; confirm against the definition. */
extern int check_prime_factors(int n, int maxprime);

/*
 * Benchmark FFTW's real-data transform (RFFTW).  Only rank-1 problems are
 * handled; any other rank returns immediately.
 *
 * With RFFTW_V2 defined, "specific" plans are created for the arr/work
 * buffers and run through rfftw_one.  Otherwise the older
 * rfftw_create_plan / rfftw interface is used and odd N is skipped with
 * "RFFTW requires even array size".
 *
 * Plans are created (and later destroyed) only when N != 0 and
 * fft_data_cur is set; DO_RBENCHMARK_ND is invoked in either case.  If
 * plan creation fails the error is reported via log_printf and printf and
 * the function returns without benchmarking.
 */
void do_fftw_rfft(int rank, int *n, int *n_rev, int N, short is_power_of_two,
		  FFTW_REAL *arr, FFTW_REAL *work,
		  int size_arr, int size_work,
		  short compute_accuracy, factor_type allowed_factors)
{
     rfftw_plan plan = NULL, iplan = NULL;

     if (rank != 1) return;

     FFT_NAME("FFTW");

#ifdef RFFTW_V2
     {
#else
     if (N % 2 == 0) {
#endif
	  if (N != 0 && fft_data_cur) {
#ifdef RFFTW_V2
	       plan = rfftw_create_plan_specific(N, FFTW_FORWARD, FFTW_MEASURE,
						 arr, 1, work, 1);
	       iplan = rfftw_create_plan_specific(N, FFTW_BACKWARD,
						  FFTW_MEASURE,
						  work, 1, arr, 1);
#else
	       plan = rfftw_create_plan(N, FFTW_FORWARD, FFTW_MEASURE,
					REAL_TO_COMPLEX);
	       iplan = rfftw_create_plan(N, FFTW_BACKWARD, FFTW_MEASURE,
					 COMPLEX_TO_REAL);
#endif
	       if (!plan || !iplan) {
                    /* One of the two plans may still have been created;
                       release it so the early return does not leak. */
                    if (plan)
                         rfftw_destroy_plan(plan);
                    if (iplan)
                         rfftw_destroy_plan(iplan);
                    log_printf("\n\nError creating FFTW plan!\n");
                    printf("\n\nError creating FFTW plan!\n");
                    return;
               }
	  }

#ifdef RFFTW_V2 
	  DO_RBENCHMARK_ND(rank, n, N, arr, work,
			   rfftw_one(plan, arr, work),
			   rfftw_one(iplan, work, arr),
			   1.0/N,
			   compute_accuracy);
#else
	  DO_RBENCHMARK_ND(rank, n, N, arr, work,
			   rfftw(plan, 1, (FFTW_COMPLEX*)arr,1,0,
				 (FFTW_COMPLEX*)work,1,0),
			   rfftw(iplan, 1, (FFTW_COMPLEX*)work,1,0,
				 (FFTW_COMPLEX*)arr,1,0),
			   1.0/N,
			   compute_accuracy);
#endif

	  /* Same condition as the one under which the plans were created. */
	  if (N != 0 && fft_data_cur) {
	       rfftw_destroy_plan(plan);
	       rfftw_destroy_plan(iplan);
	  }
     }
#ifndef RFFTW_V2
     else
	  skip_benchmark("RFFTW requires even array size");
#endif
}

/*
 * Benchmark the SCIPORT real FFT (Fortran; compiled only with HAVE_F77).
 * Handles rank 1 and power-of-two sizes only.  The double- or
 * single-precision routine pair is picked from sizeof(FFTW_REAL).  A first
 * call with init = 1 sets up the table area at work + N + 2; the timed
 * calls then run with init = 0.  Non-zero sizes below 8 are skipped.
 */
void do_sciport_rfft(int rank, int *n, int *n_rev, int N,short is_power_of_two,
		     FFTW_REAL *arr, FFTW_REAL *work,
		     int size_arr, int size_work,
		     short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_F77
     typedef void sciport_fn(int *init, int *is, int *n,
                             FFTW_REAL *x, FFTW_REAL *work,
                             FFTW_REAL *y);
     extern sciport_fn
          FORTRANIZE(spsrcfft2,SPSRCFFT2), FORTRANIZE(spdrcfft2,SPDRCFFT2);
     extern sciport_fn
          FORTRANIZE(spscrfft2,SPSCRFFT2), FORTRANIZE(spdcrfft2,SPDCRFFT2);
     sciport_fn *sp_forward, *sp_inverse;
     int init = 1, sign_fwd = -1, sign_inv = +1;

     if (rank != 1) return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("SCIPORT");
     if (!FFT_OK) return;

     /* Select the precision-specific real->complex / complex->real pair. */
     if (sizeof(FFTW_REAL) != sizeof(double)) {
          sp_forward = FORTRANIZE(spsrcfft2,SPSRCFFT2);
          sp_inverse = FORTRANIZE(spscrfft2,SPSCRFFT2);
     }
     else {
          sp_forward = FORTRANIZE(spdrcfft2,SPDRCFFT2);
          sp_inverse = FORTRANIZE(spdcrfft2,SPDCRFFT2);
     }

     /* Initialisation pass (init == 1). */
     if (N != 0) {
          sp_forward(&init, &sign_fwd, &N, arr, work + N + 2, work);
     }
     init = 0;

     if (N != 0 && N < 8) {
          skip_benchmark("requires N >= 8");
     }
     else {
          DO_RBENCHMARK_ND(rank, n, N, arr, work,
                           sp_forward(&init, &sign_fwd, &N, arr,
                                      work + N + 2, work),
                           sp_inverse(&init, &sign_inv, &N, work,
                                      work + N + 2, arr),
                           0.5/N,
                           compute_accuracy);
     }
#endif
}

/*
 * Benchmark FFTPACK's in-place real FFT (Fortran; compiled only with
 * HAVE_F77) for rank-1 data.  The d* routines are used when FFTW_REAL has
 * the size of a double, the s* routines when it has the size of a float;
 * any other size does nothing.  The *rffti routine fills `work` before
 * timing whenever N != 0.
 */
void do_fftpack_rfft(int rank, int *n, int *n_rev, int N,short is_power_of_two,
                    FFTW_REAL *arr, FFTW_REAL *work,
                    int size_arr, int size_work,
                    short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_F77
     if (rank != 1) {
          return;
     }

     FFT_NAME("FFTPACK");

     if (sizeof(FFTW_REAL) == sizeof(double)) {
          /* double precision */
          if (N != 0) {
               FORTRANIZE(drffti,DRFFTI)(&N, work);
          }
          DO_RBENCHMARK_ND(rank, n, N, arr, arr,
                           FORTRANIZE(drfftf,DRFFTF)(&N, arr, work),
                           FORTRANIZE(drfftb,DRFFTB)(&N, arr, work),
                           1.0/N,
                           compute_accuracy);
          return;
     }

     if (sizeof(FFTW_REAL) == sizeof(float)) {
          /* single precision */
          if (N != 0) {
               FORTRANIZE(srffti,SRFFTI)(&N, work);
          }
          DO_RBENCHMARK_ND(rank, n, N, arr, arr,
                           FORTRANIZE(srfftf,SRFFTF)(&N, arr, work),
                           FORTRANIZE(srfftb,SRFFTB)(&N, arr, work),
                           1.0/N,
                           compute_accuracy);
     }
#endif
}

/*
 * Benchmark the GSL real / half-complex FFT pair in place for rank-1 data.
 * The two wavetables are not allocated by GSL; their scratch and trig
 * pointers are aimed into the caller's `work` array before the GSL init
 * routines are called.  If either init routine reports an error the
 * benchmark is skipped.
 */
void do_gsl_rfft(int rank, int *n, int *n_rev, int N,
		 short is_power_of_two,
		 FFTW_REAL *arr, FFTW_REAL *work,
		 int size_arr, int size_work,
		 short compute_accuracy, factor_type allowed_factors)
{
     gsl_fft_real_wavetable real_wt;
     gsl_fft_halfcomplex_wavetable hc_wt;
     int init_err = 0;

     if (rank != 1) return;

     FFT_NAME("GSL");

     if (N != 0) {
          /* Deliberately reach inside the wavetable structs and reuse the
             work array instead of letting GSL allocate: this breaks the
             abstraction but saves memory. */
          real_wt.scratch = (real*) work;
          real_wt.trig = (complex*) (work + N);

          hc_wt.scratch = (complex*) (work + 2*N);
          hc_wt.trig = (complex*) (work + 4*N);

          /* Fill in the tables; stop at the first failure. */
          init_err = gsl_fft_real_init(N,&real_wt);
          if (init_err == 0) {
               init_err = gsl_fft_halfcomplex_init(N,&hc_wt);
          }
     }

     if (init_err != 0) {
          if (N != 0) {
               skip_benchmark("Error initializing GSL wavetables!");
          }
          return;
     }

     DO_RBENCHMARK_ND(rank, n, N, arr, arr,
                      gsl_fft_real((real*)arr,N,&real_wt),
                      gsl_fft_halfcomplex((real*)arr,N,&hc_wt),
                      1.0/N,
                      compute_accuracy);
}

/*
 * Benchmark the "RMayer" in-place real FFT for rank-1, power-of-two sizes.
 * Runs for N == 0 and for 2 < N <= 2^19; other sizes are skipped with a
 * message naming the violated bound.
 */
void do_rmayer_rfft(int rank, int *n, int *n_rev, int N,
		    short is_power_of_two,
		    FFTW_REAL *arr, FFTW_REAL *work,
		    int size_arr, int size_work,
		    short compute_accuracy, factor_type allowed_factors)
{
     if (rank != 1) return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("RMayer");
     if (!FFT_OK) return;

     if (N > (1<<19)) {
          skip_benchmark("can't handle N > 2^19");
     }
     else if (N != 0 && N <= 2) {
          skip_benchmark("can't handle N <= 2");
     }
     else {
          DO_RBENCHMARK_ND(rank, n, N, arr, arr,
                           mayer_realfft_1(N, arr),
                           mayer_realifft_1(N, arr),
                           1.0,
                           compute_accuracy);
     }
}

/*
 * Benchmark Ooura's real DFT in place for rank-1, power-of-two sizes:
 * first the C version ("Ooura (C)") and then, when HAVE_F77 is defined and
 * FFTW_REAL has the size of a double, the Fortran version ("Ooura (F)").
 *
 * An integer work array `ip` of sqrt(N/2) + 3 entries is allocated for
 * N != 0; setting ip[0] = 0 before a call makes that call (re)initialise
 * the tables held in ip and work.  The process exits if the allocation
 * fails.
 *
 * `ip` is released whenever it was allocated, independent of the value
 * FFT_OK has after the second FFT_REQUIRE_POWER_OF_TWO / FFT_NAME pair, so
 * it cannot leak if those macros change FFT_OK.
 */
void do_ooura_rfft(int rank, int *n, int *n_rev, int N,
		   short is_power_of_two,
		   FFTW_REAL *arr, FFTW_REAL *work,
		   int size_arr, int size_work,
		   short compute_accuracy, factor_type allowed_factors)
{
     if (rank != 1) return;
     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("Ooura (C)");
     if (FFT_OK) {
          int *ip = 0, len_ip;
#ifdef HAVE_F77
          /* Direction flags are passed by reference to the Fortran code. */
          int is1=-1, is2=+1;
#endif

          if (N != 0) {
               /* Explicit truncation; same value as the implicit
                  double -> int conversion of sqrt(N/2) + 3. */
               len_ip = (int) sqrt((double) (N/2)) + 3;
               ip = (int*) fftw_malloc(sizeof(int)*len_ip);
               if (!ip) {
                    printf("\nERROR!  Out of memory!\n");
                    exit(1);
               }
               ip[0] = 0;
               /* Warm-up call that builds the tables. */
               ooura_c_rdft(N, +1, arr, ip, work);
          }
          DO_RBENCHMARK_ND(rank, n, N, arr, arr,
			   ooura_c_rdft(N, +1, arr, ip, work),
			   ooura_c_rdft(N, -1, arr, ip, work),
			   2.0/N,
			   compute_accuracy);

#ifdef HAVE_F77
          if (sizeof(FFTW_REAL) == sizeof(double)) {
          FFT_REQUIRE_POWER_OF_TWO;
          FFT_NAME("Ooura (F)");
          if (FFT_OK && N != 0) {
               /* Force the Fortran routine to rebuild its tables. */
               ip[0] = 0;
               FORTRANIZE(oourafrdft,OOURAFRDFT)(&N, &is2, arr,
                                                     ip, work);
          }
          /* NOTE(review): unlike the other drivers, this benchmark is not
             guarded by FFT_OK; left as in the original. */
          DO_RBENCHMARK_ND(rank, n, N, arr, arr,
			   FORTRANIZE(oourafrdft,
				      OOURAFRDFT)(&N, &is2, arr,
						  ip, work),
			   FORTRANIZE(oourafrdft,
                                     OOURAFRDFT)(&N, &is1, arr,
						 ip, work),
			   2.0/N,
			   compute_accuracy);
	  }
#endif
          /* ip is non-null exactly when it was allocated above. */
          if (ip)
               fftw_free(ip);
     }
}

/*
 * Benchmark the "QFT" real transform for rank-1, power-of-two sizes.
 * Disabled entirely when pointers and ints differ in size (the routine is
 * buggy with 64-bit pointers).  The transform writes its two output halves
 * to work and work + N; qftinit is handed the area starting at
 * work + 2*N.  Non-zero sizes below 16 are skipped.
 */
void do_qft_rfft(int rank, int *n, int *n_rev, int N,
		 short is_power_of_two,
		 FFTW_REAL *arr, FFTW_REAL *work,
		 int size_arr, int size_work,
		 short compute_accuracy, factor_type allowed_factors)
{
     if (rank != 1) return;
     /* QFT is buggy with 64 bit pointers */
     if (sizeof (void *) != sizeof(int)) return;
     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("QFT");
     if (FFT_OK) {
          if (N != 0 && N < 16) {
               skip_benchmark("QFT requires N >= 16");
          }
          else {
               if (N != 0) {
                    qftinit(N, (void *) (work + 2*N));
               }

               DO_RBENCHMARK_ND(rank, n, N, arr, work,
                                rfqft(work, work + N, arr, N, N),
                                riqft(work, work + N, arr, N, N),
                                1.0,
                                compute_accuracy);
          }
     }
}

/*
 * Benchmark Crandall's in-place real <-> Hermitian FFT for rank-1,
 * power-of-two sizes.
 *
 * crandall_init_fft needs an int array of N entries in addition to `work`.
 * That array is placed in `work` after its first 3*N elements when enough
 * room is left there, and is heap-allocated otherwise.  (size_work is
 * presumably the number of FFTW_REAL elements in work -- confirm against
 * the caller.)
 *
 * The room check is done with a signed remainder first: multiplying a
 * negative (size_work - 3*N) by sizeof would convert it to a huge unsigned
 * value and wrongly select the in-place storage past the end of work.  The
 * element size used is that of FFTW_REAL, so a single-precision build does
 * not over-estimate the available bytes.
 */
void do_crandall_rfft(int rank, int *n, int *n_rev, int N,
		      short is_power_of_two,
		      FFTW_REAL *arr, FFTW_REAL *work,
		      int size_arr, int size_work,
		      short compute_accuracy, factor_type allowed_factors)
{
     if (rank != 1) return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("Crandall (out-of-order)");
     if (FFT_OK) {
	  int *istorage = 0, istorage_alloc = 0;

	  if (N != 0) {
	       /* Elements of work left over after the first 3*N. */
	       int spare = size_work - 3*N;

	       if (spare < 0 ||
		   (size_t) spare * sizeof(FFTW_REAL)
		   < (size_t) N * sizeof(int)) {
		    istorage = (int *) fftw_malloc(sizeof(int)*N);
		    if (!istorage) {
			 printf("\nERROR!  Out of memory!\n");
			 exit(1);
		    }
		    istorage_alloc = 1;
	       }
	       else
		    istorage = (int *) (work + 3*N);
	       
	       crandall_init_fft(N, work, istorage);
	  }
	  
	  DO_RBENCHMARK_ND(rank, n, N, arr, arr,
			   crandall_fft_real_to_hermitian(arr, N),
			   crandall_fftinv_hermitian_to_real(arr, N),
			   1.0,
			   compute_accuracy);

	  /* Only release storage that was heap-allocated above. */
	  if (istorage_alloc)
	       fftw_free(istorage);
     }
}

/*
 * Benchmark the "EMayer" in-place real FFT (Fortran 90; compiled only when
 * both USE_EMAYER and HAVE_F90 are defined).  Runs only when FFTW_REAL has
 * the size of a double, for rank-1 power-of-two sizes with N == 0 or
 * 2^4 <= N <= 2^19.
 *
 * NOTE(review): no prototype for the emayer routines is visible in this
 * file -- confirm one is provided by an included header.
 */
void do_emayer_rfft(int rank, int *n, int *n_rev, int N,
                   short is_power_of_two,
                      FFTW_REAL *arr, FFTW_REAL *work,
                      int size_arr, int size_work,
                      short compute_accuracy, factor_type allowed_factors)
{
#if defined(USE_EMAYER) && defined(HAVE_F90)
     if (sizeof(FFTW_REAL) != sizeof(double)) return;
     if (rank != 1) return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("EMayer");
     if (!FFT_OK) return;

     if (N == 0 || (N >= (1<<4) && N <= (1<<19))) {
          DO_RBENCHMARK_ND(rank, n, N, arr, arr,
                           FORTRANIZE(emayerfft_real,EMAYERFFT_REAL)
                           (arr,&N),
                           FORTRANIZE(emayerifft_real,EMAYERIFFT_REAL)
                           (arr,&N),
                           1.0/N,
                           compute_accuracy);
     }
     else {
          skip_benchmark("EMayer can only handle 2^4 <= N <= 2^19");
     }
#endif
}

/*
 * Benchmark Sorensen's in-place real FFT (Fortran; compiled only with
 * HAVE_F77).  Runs only for rank 1, FFTW_REAL the size of a double, and
 * power-of-two N.
 *
 * The four table arguments are sub-ranges of `work` starting at offsets
 * 0, N/8 + 1, N/4 + 1 and 3*N/8 + 1; they are filled by ksorensenffti when
 * N != 0.  `m` is the number of times N can be halved before reaching 1
 * (log2 of N for a power of two).  It starts at 0 so that the value whose
 * address is handed to the benchmark macro is never indeterminate, even
 * when N == 0 and the counting loop does not run.
 */
void do_sorensen_rfft(int rank, int *n, int *n_rev, int N,
		      short is_power_of_two,
		      FFTW_REAL *arr, FFTW_REAL *work,
		      int size_arr, int size_work,
		      short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_F77
     int m = 0;

     if (rank != 1 || sizeof(FFTW_REAL) != sizeof(double)) return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("Sorensen");
     if (!FFT_OK) return;

     if (N != 0) {
	  int nn = N;
	  FORTRANIZE(ksorensenffti,KSORENSENFFTI)(&N,
						  work, work + N/8+1,
						  work + N/4 + 1,
						  work + 3*N/8 + 1);
	  /* Count halvings of N. */
	  for (m = 0; nn > 1; ++m)
	       nn /= 2;
     }
     DO_RBENCHMARK_ND(rank, n, N, arr, arr,
		      FORTRANIZE(ksorensenrfftf,KSORENSENRFFTF)
		      (arr, &N,  &m, work, work + N/8+1, work + N/4 + 1,
		       work + 3*N/8 + 1, &N),
		      FORTRANIZE(ksorensenrfftb,KSORENSENRFFTB)
		      (arr, &N,  &m, work, work + N/8+1, work + N/4 + 1,
		       work + 3*N/8 + 1, &N),
		      1.0,
		      compute_accuracy);
#endif
}

/*
 * Benchmark the "Green" in-place real FFT for rank-1, power-of-two sizes.
 * Disabled when pointers and ints differ in size.  The routines take the
 * base-2 logarithm of N; green_fftInit is called with it for N > 0 and a
 * non-zero return causes the size to be skipped.  green_fftFree is called
 * after a benchmark that was preceded by a successful init.
 */
void do_green_rfft(int rank, int *n, int *n_rev, int N,
		   short is_power_of_two,
		   FFTW_REAL *arr, FFTW_REAL *work,
		   int size_arr, int size_work,
		   short compute_accuracy, factor_type allowed_factors)
{
     int log2_size = 0;

     if (rank != 1) return;
     /* green is buggy with 64 bit pointers */
     if (sizeof (void *) != sizeof(int)) return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("Green");
     if (!FFT_OK) return;

     if (N > 0) {
          int remaining;

          /* Count how many times N can be halved before reaching 1. */
          for (remaining = N; remaining > 1; remaining /= 2)
               log2_size += 1;

          if (green_fftInit(log2_size) != 0) {
               skip_benchmark("Green can't handle this size.");
               return;
          }
     }

     DO_RBENCHMARK_ND(rank, n, N, arr, arr,
                      green_rffts(arr, log2_size, 1),
                      green_riffts(arr, log2_size, 1),
                      1.0,
                      compute_accuracy);

     if (N > 0) {
          green_fftFree();
     }
}

/*
 * Benchmark the "NR (C)" in-place real FFT (compiled only with HAVE_NRC)
 * for rank-1, power-of-two sizes.  The routine indexes its data from 1, so
 * it is handed a pointer one element before `arr`.
 */
void do_nrc_rfft(int rank, int *n, int *n_rev, int N,
		 short is_power_of_two,
		 FFTW_REAL *arr, FFTW_REAL *work,
		 int size_arr, int size_work,
		 short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_NRC
     if (rank != 1) return;
     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("NR (C)");
     if (FFT_OK) {
          /* NRC is 1-based: shift the base so element 1 is arr[0]. */
          FFTW_REAL *one_based = arr - 1;

          DO_RBENCHMARK_ND(rank, n, N, arr, arr,
                           nrc_realft(one_based, N, +1),
                           nrc_realft(one_based, N, -1),
                           2.0/N,
                           compute_accuracy);
     }
#endif
}

/*
 * Benchmark the "NR (F)" in-place real FFT (compiled only when both
 * HAVE_F77 and HAVE_NRF are defined).  Runs for rank 1, FFTW_REAL the size
 * of a double, and power-of-two N.  Size and direction are passed by
 * reference, as the Fortran routine expects.
 */
void do_nrf_rfft(int rank, int *n, int *n_rev, int N,
		 short is_power_of_two,
		 FFTW_REAL *arr, FFTW_REAL *work,
		 int size_arr, int size_work,
		 short compute_accuracy, factor_type allowed_factors)
{
#if defined(HAVE_F77) && defined(HAVE_NRF)
     int sign_fwd = +1, sign_inv = -1;

     if (rank != 1) return;
     if (sizeof(FFTW_REAL) != sizeof(double)) return;
     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("NR (F)");
     if (FFT_OK) {
          DO_RBENCHMARK_ND(rank, n, N, arr, arr,
                           FORTRANIZE(nrfrealft,NRFREALFT)(arr, &N,
                                                           &sign_fwd),
                           FORTRANIZE(nrfrealft,NRFREALFT)(arr, &N,
                                                           &sign_inv),
                           2.0/N,
                           compute_accuracy);
     }
#endif
}

/*
 * Benchmark Singleton's FFT (Fortran; compiled only with HAVE_F77) as a
 * real transform, for rank 1 and FFTW_REAL the size of a double.
 *
 * The real array is treated as N/2 interleaved (even, odd) pairs.  The
 * forward direction runs the complex FFT followed by the real-transform
 * pass, the backward direction runs the same two steps in reverse order
 * with the opposite sign.  Odd N, and N rejected by
 * check_prime_factors(N, 23), are skipped.
 */
void do_singleton_rfft(int rank, int *n, int *n_rev, int N,
		       short is_power_of_two,
		       FFTW_REAL *arr, FFTW_REAL *work,
		       int size_arr, int size_work,
		       short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_F77
     int half, sign_fwd = +2, sign_inv = -2;

     if (rank != 1) return;
     if (sizeof(FFTW_REAL) != sizeof(double)) return;

     half = N / 2;

     FFT_NAME("Singleton");

     if (N % 2 != 0) {
          skip_benchmark("can't handle odd sizes");
     }
     else if (!check_prime_factors(N,23)) {
          skip_benchmark("can't handle prime factors > 23");
     }
     else {
          DO_RBENCHMARK_ND(rank, n, N, arr, arr,
                           {
                                FORTRANIZE(singletonfft,
                                           SINGLETONFFT)(arr, arr + 1,
                                                         &half, &half, &half,
                                                         &sign_fwd);
                                FORTRANIZE(singrealtr,
                                           SINGREALTR)(arr, arr + 1,
                                                       &half, &sign_fwd);
                           },
                           {
                                FORTRANIZE(singrealtr,
                                           SINGREALTR)(arr, arr + 1,
                                                       &half, &sign_inv);
                                FORTRANIZE(singletonfft,
                                           SINGLETONFFT)(arr, arr + 1,
                                                         &half, &half, &half,
                                                         &sign_inv);
                           },
                           0.5/N,
                           compute_accuracy);
     }
#endif
}

/*
 * Benchmark the f2c-translated Singleton FFT as a real transform for
 * rank-1 data.  Same scheme as the Fortran driver, but the size and sign
 * arguments are passed by value: forward is go_fft followed by
 * sing_realtr with +2, backward is sing_realtr followed by go_fft with -2,
 * all on N/2 interleaved pairs.  Odd N, and N rejected by
 * check_prime_factors(N, 23), are skipped.
 */
void do_singleton_f2c_rfft(int rank, int *n, int *n_rev, int N,
		       short is_power_of_two,
		       FFTW_REAL *arr, FFTW_REAL *work,
		       int size_arr, int size_work,
		       short compute_accuracy, factor_type allowed_factors)
{
     int half = N / 2;

     if (rank != 1) return;
     FFT_NAME("Singleton (f2c)");

     if (N % 2 != 0) {
          skip_benchmark("can't handle odd sizes");
     }
     else if (!check_prime_factors(N,23)) {
          skip_benchmark("can't handle prime factors > 23");
     }
     else {
          DO_RBENCHMARK_ND(rank, n, N, arr, arr,
                           {
                                go_fft(arr, arr + 1, half, half, half, +2);
                                sing_realtr(arr, arr + 1, half, +2);
                           },
                           {
                                sing_realtr(arr, arr + 1, half, -2);
                                go_fft(arr, arr + 1, half, half, half, -2);
                           },
                           0.5/N,
                           compute_accuracy);
     }
}

/*
 * Benchmark the two Bloodworth in-place real FFTs for rank-1,
 * power-of-two sizes: first the FHT-based FwdRealFFT / RevRealFFT pair
 * (non-zero sizes below 8 are skipped), then the Bloodworth_Q2_* pair.
 * Each is announced under its own name and gated by its own FFT_OK check.
 */
void do_bloodworth_rfft(int rank, int *n, int *n_rev, int N,
                       short is_power_of_two,
                       FFTW_REAL *arr, FFTW_REAL *work,
                       int size_arr, int size_work,
                       short compute_accuracy, factor_type allowed_factors)
{
     if (rank != 1) return;

     /* FHT-based variant. */
     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("Bloodworth (FHT-based)");
     if (FFT_OK) {
          if (N != 0 && N < 8) {
               skip_benchmark("can't handle N < 8");
          }
          else {
               DO_RBENCHMARK_ND(rank, n, N, arr, arr,
                                FwdRealFFT(arr,N),
                                RevRealFFT(arr,N),
                                1.0/N,
                                compute_accuracy);
          }
     }

     /* Q2 variant. */
     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("Bloodworth");
     if (FFT_OK) {
          DO_RBENCHMARK_ND(rank, n, N, arr, arr,
                           Bloodworth_Q2_FwdRealFFT(arr,N),
                           Bloodworth_Q2_RevRealFFT(arr,N),
                           1.0,
                           compute_accuracy);
     }
}

/*
 * Benchmark the SCSL in-place real FFT (compiled only with HAVE_LIBSCS)
 * for rank-1 data.  The single-precision pair (scfft / csfft) is used when
 * FFTW_REAL has the size of a float, the double-precision pair
 * (dzfft / zdfft) otherwise.  `work` serves as the routine's work area and
 * work + N as its table.  For N != 0 a call with isign == 0 is made before
 * timing; non-zero sizes of 2 or less are skipped.
 */
void do_scsl_rfft(int rank, int *n, int *n_rev, int N, 
		  short is_power_of_two,
		  FFTW_REAL *arr, FFTW_REAL *work,
		  int size_arr, int size_work,
		  short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_LIBSCS
     typedef void scsl_fn(int *isign, int *n, FFTW_REAL *scale,
                          FFTW_REAL *x, FFTW_REAL *y,
                          FFTW_REAL *table, FFTW_REAL *work,
                          int *isys);
     extern scsl_fn
          FORTRANIZE(scfft,SCFFT), FORTRANIZE(dzfft,DZFFT),
          FORTRANIZE(csfft,CSFFT), FORTRANIZE(zdfft,ZDFFT);
     scsl_fn *scsl_fft, *scsl_ifft;
     int isign = -1, isign2 = +1, n1 = N, isys = 0;
     FFTW_REAL scale = 1.0;
     FFTW_REAL *table, *work2;

     if (rank != 1) return;

     FFT_NAME("SCSL");

     work2 = work;
     table = work2 + N;

     /* Pick the precision-specific forward / inverse pair. */
     if (sizeof(FFTW_REAL) != sizeof(float)) {
          scsl_fft = FORTRANIZE(dzfft,DZFFT);
          scsl_ifft = FORTRANIZE(zdfft,ZDFFT);
     }
     else {
          scsl_fft = FORTRANIZE(scfft,SCFFT);
          scsl_ifft = FORTRANIZE(csfft,CSFFT);
     }

     if (N != 0) {
          /* isign == 0 call made before timing, then restore the
             forward sign. */
          n1 = N;
          isign = 0;
          scsl_fft(&isign, &n1, &scale, arr, arr,
                   table, work2, &isys);
          isign = -1;
     }

     if (N != 0 && N <= 2) {
          skip_benchmark("can't handle n <= 2");
     }
     else {
          DO_RBENCHMARK_ND(rank, n, N, arr, arr,
                           scsl_fft(&isign, &n1, &scale, arr, arr,
                                    table, work2, &isys),
                           scsl_ifft(&isign2, &n1, &scale, arr, arr,
                                     table, work2, &isys),
                           1.0/N,
                           compute_accuracy);
     }
#endif
}

/*
 * Benchmark the SGIMATH in-place real FFT (compiled only with
 * HAVE_LIBCOMPLIB_SGIMATH) for rank-1 data.  The dz / zd routines are used
 * when FFTW_REAL has the size of a double, the sc / cs routines otherwise.
 * The matching *1dui routine is called on `work` before timing whenever
 * N != 0.
 */
void do_sgimath_rfft(int rank, int *n, int *n_rev, int N, 
		    short is_power_of_two,
		    FFTW_REAL *arr, FFTW_REAL *work,
		    int size_arr, int size_work,
		    short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_LIBCOMPLIB_SGIMATH
     if (rank != 1) return;

     FFT_NAME("SGIMATH");

     if (sizeof(double) != sizeof(FFTW_REAL)) {
          /* single precision */
          FFTW_REAL *scfft1dui( int n, FFTW_REAL *save);
          int scfft1du(int sign, int n, FFTW_REAL *array,
                       int inc, FFTW_REAL *save);
          int csfft1du(int sign, int n, FFTW_REAL *array,
                       int inc, FFTW_REAL *save);

          if (N != 0) {
               scfft1dui(N, work);
          }
          DO_RBENCHMARK_ND(rank, n, N, arr, arr,
                           scfft1du(-1, N, arr, 1, work),
                           csfft1du(1, N, arr, 1, work),
                           1.0/N,
                           compute_accuracy);
     }
     else {
          /* double precision */
          FFTW_REAL *dzfft1dui( int n, FFTW_REAL *save);
          int dzfft1du(int sign, int n, FFTW_REAL *array,
                       int inc, FFTW_REAL *save);
          int zdfft1du(int sign, int n, FFTW_REAL *array,
                       int inc, FFTW_REAL *save);

          if (N != 0) {
               dzfft1dui(N, work);
          }
          DO_RBENCHMARK_ND(rank, n, N, arr, arr,
                           dzfft1du(-1, N, arr, 1, work),
                           zdfft1du(1, N, arr, 1, work),
                           1.0/N,
                           compute_accuracy);
     }
#endif
}

/*
 * Benchmark the ASCI Red Pentium Pro in-place real FFT (compiled only with
 * HAVE_ASCI_RED_FFT) for rank-1 data.  The dz / zd routines are used when
 * FFTW_REAL has the size of a double, the sc / cs routines otherwise.  For
 * N != 0 the forward routine is first called with a sign argument of 0
 * before the timed calls with -1 (forward) and +1 (backward).
 */
void do_asci_red_rfft(int rank, int *n, int *n_rev, int N,
		      short is_power_of_two,
		      FFTW_REAL *arr, FFTW_REAL *work,
		      int size_arr, int size_work,
		      short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_ASCI_RED_FFT
     int sign_setup = 0, sign_fwd = -1, sign_inv = +1;

     if (rank != 1) return;

     FFT_NAME("ASCI Red PPro FFT");

     if (sizeof(double) != sizeof(FFTW_REAL)) {
          /* single precision */
          void scfft1d_(FFTW_REAL*, int*, int*, FFTW_REAL*);
          void csfft1d_(FFTW_REAL*, int*, int*, FFTW_REAL*);

          if (N != 0) {
               scfft1d_(arr, &N, &sign_setup, work);
          }
          DO_RBENCHMARK_ND(rank, n, N, arr, arr,
                           scfft1d_(arr, &N, &sign_fwd, work),
                           csfft1d_(arr, &N, &sign_inv, work),
                           1.0,
                           compute_accuracy);
     }
     else {
          /* double precision */
          void dzfft1d_(FFTW_REAL*, int*, int*, FFTW_REAL*);
          void zdfft1d_(FFTW_REAL*, int*, int*, FFTW_REAL*);

          if (N != 0) {
               dzfft1d_(arr, &N, &sign_setup, work);
          }
          DO_RBENCHMARK_ND(rank, n, N, arr, arr,
                           dzfft1d_(arr, &N, &sign_fwd, work),
                           zdfft1d_(arr, &N, &sign_inv, work),
                           1.0,
                           compute_accuracy);
     }
#endif
}


