
/* Copyright (C) 2003-2008, Free Software Foundation, Inc.
   Contributed by Andy Vaught

  This file is part of g95.

  G95 is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2, or (at your option)
  any later version.

  G95 is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with g95; see the file COPYING.  If not, write to
  the Free Software Foundation, 59 Temple Place - Suite 330,
  Boston, MA 02111-1307, USA.

  In addition to the permissions in the GNU General Public License, the
  Free Software Foundation gives you unlimited permission to link the
  compiled version of this file into combinations with other programs,
  and to distribute those combinations without any restriction coming
  from the use of this file.  (The General Public License restrictions
  do apply in other respects; for example, they cover modification of
  the file, and distribution when not linked into a combined executable.)
*/



#ifdef SUBROUTINE

#if COMPLEX == 0

/* Real dot_product()
 *
 * Template body, compiled once for every SUBROUTINE / LOAD_A / LOAD_B
 * definition further down in this file.  Steps through dimension 0 of
 * both descriptors, multiplying one element of each on the x87 stack
 * and adding the product to a running sum kept in st(0).  Nothing is
 * popped or stored at the end, so the sum is still in st(0) when the
 * function returns.  The element count is taken from 'a' only. */

void SUBROUTINE(g95_array_descriptor *a, g95_array_descriptor *b) {
char *ap, *bp;     /* Referenced by name from LOAD_A / LOAD_B */
int temp;          /* Scratch word referenced by some LOAD_A / LOAD_B */
G95_DINT count, step_a, step_b;

    asm("fldz\n");     /* st(0) = 0, the running sum */

    step_a = a->info[0].mult;
    step_b = b->info[0].mult;

    /* Address of the first element of each array */
    ap = a->offset + a->info[0].mult * a->info[0].lbound;
    bp = b->offset + b->info[0].mult * b->info[0].lbound;

    for(count = a->info[0].ubound - a->info[0].lbound + 1; count > 0;
	count--) {
	asm(LOAD_A);
	asm(LOAD_B);

	/* Multiply the two loaded values, then add the product to the sum */
	asm("fmulp\n"
	    "faddp\n");

	ap += step_a;
	bp += step_b;
    }
}


#elif COMPLEX == 1

/* Complex from complex/real
 *
 * Template body for a two-component 'a' and a one-component 'b'.  Two
 * sums live on the x87 stack between iterations.  LOAD_A pushes two
 * values, the second of which is negated (fchs) before use; LOAD_B
 * pushes one.  Each pass adds first(a)*b to the deeper sum and
 * -second(a)*b to the top sum.  RESULT then writes both sums through
 * 'product'.  The element count is taken from 'a' only. */

void SUBROUTINE(char *product, g95_array_descriptor *a,
		g95_array_descriptor *b) {
char *ap, *bp;     /* Referenced by name from LOAD_A / LOAD_B */
G95_DINT left, inc_a, inc_b;

    /* Two zeroed accumulators */
    asm("fldz\n"
	"fldz\n");

    inc_a = a->info[0].mult;
    inc_b = b->info[0].mult;

    ap = a->offset + a->info[0].mult * a->info[0].lbound;
    bp = b->offset + b->info[0].mult * b->info[0].lbound;

    left = a->info[0].ubound - a->info[0].lbound + 1;

    for(; left > 0; left--) {
	asm(LOAD_A);
	asm("fchs\n");     /* Negate the second component of a */
	asm(LOAD_B);

	/* Stack here: b, -a2, a1, sum0, sum1.
	 * sum1 += a1*b, then sum0 += (-a2)*b, leaving only the two sums. */
	asm("fxch %st(2)\n"
	    "fmul %st(2)\n"
	    "fadd %st(4)\n"
	    "fxch %st(4)\n"
	    "fstp %st\n"
	    "fmulp %st(1)\n"
	    "faddp %st(1)\n");

	ap += inc_a;
	bp += inc_b;
    }

    asm(RESULT);
}


#elif COMPLEX == 2

/* Complex from real/complex
 *
 * Template body for a one-component 'a' and a two-component 'b'.  Two
 * sums live on the x87 stack between iterations.  LOAD_B is issued
 * first and pushes two values, LOAD_A then pushes one.  Each pass adds
 * a*first(b) to the deeper sum and a*second(b) to the top sum.  RESULT
 * then writes both sums through 'product'.  The element count is taken
 * from 'a' only. */

void SUBROUTINE(char *product, g95_array_descriptor *a,
		g95_array_descriptor *b) {
char *ap, *bp;     /* Referenced by name from LOAD_A / LOAD_B */
G95_DINT left, inc_a, inc_b;

    /* Two zeroed accumulators */
    asm("fldz\n"
	"fldz\n");

    inc_a = a->info[0].mult;
    inc_b = b->info[0].mult;

    ap = a->offset + a->info[0].mult * a->info[0].lbound;
    bp = b->offset + b->info[0].mult * b->info[0].lbound;

    left = a->info[0].ubound - a->info[0].lbound + 1;

    for(; left > 0; left--) {
	asm(LOAD_B);
	asm(LOAD_A);

	/* Stack here: a, b2, b1, sum0, sum1.
	 * sum1 += b1*a, then sum0 += b2*a, leaving only the two sums. */
	asm("fxch %st(2)\n"
	    "fmul %st(2)\n"
	    "fadd %st(4)\n"
	    "fxch %st(4)\n"
	    "fstp %st\n"
	    "fmulp %st(1)\n"
	    "faddp %st(1)\n");

	ap += inc_a;
	bp += inc_b;
    }

    asm(RESULT);
}


#elif COMPLEX == 3

/* Complex from complex/complex
 *
 * Template body for two-component 'a' and 'b'.  Two sums live on the
 * x87 stack between iterations.  LOAD_A and LOAD_B each push two
 * values (a1, a2 and b1, b2).  Each pass updates
 *
 *     deeper sum += a1*b1 + a2*b2
 *     top sum    += a1*b2 - a2*b1
 *
 * which is conjg(a)*b when the first component is the real part.  The
 * sums are finally stored through 'product' as two extended-precision
 * values, deeper sum first.  The element count is taken from 'a' only. */

void SUBROUTINE(char *product, g95_array_descriptor *a,
		g95_array_descriptor *b) {
G95_DINT n, da, db;
char *ap, *bp;     /* Referenced by name from LOAD_A / LOAD_B */

    /* Two zeroed accumulators */
    asm("fldz\n"
	"fldz\n");

    ap = a->offset + a->info[0].mult * a->info[0].lbound;
    bp = b->offset + b->info[0].mult * b->info[0].lbound;

    da = a->info[0].mult;
    db = b->info[0].mult;

    n = a->info[0].ubound - a->info[0].lbound + 1;

    while(n > 0) {
	asm(LOAD_A);
	asm(LOAD_B);

	/* Stack here: b2, b1, a2, a1, sum0, sum1.
	 *
	 * The last term has to be subtracted.  The AT&T spellings of
	 * fsubp/fsubrp with an %st(i) destination are reversed with
	 * respect to the Intel definitions, so the product is negated
	 * and added instead; that is exact and reads the same under
	 * any assembler. */
	asm("fld %st\n"
	    "fmul %st(3)\n"
	    "fadd %st(6)\n"
	    "fstp %st(6)\n"        /* sum1 += a2*b2 */
	    "fmul %st(3)\n"
	    "fadd %st(4)\n"
	    "fstp %st(4)\n"        /* sum0 += a1*b2 */
	    "fxch %st(2)\n"
	    "fmul %st(2)\n"
	    "fadd %st(4)\n"
	    "fstp %st(4)\n"        /* sum1 += a1*b1 */
	    "fmulp %st(1)\n"
	    "fchs\n"
	    "faddp %st(1)\n");     /* sum0 -= a2*b1 */

	ap += da;
	bp += db;

	n--;
    }

    /* The asm overwrites EAX and stores through 'product', so both are
     * declared as clobbers. */
    asm("mov %0, %" EAX "\n"
	"fxch %%st(1)\n"
	"fstpt (%" EAX ")\n"
	"fstpt " stringize(REAL10_SIZE) "(%" EAX ")\n"
	: : "m" (product) : EAX, "memory");
}


#endif

#undef SUBROUTINE
#undef LOAD_A
#undef LOAD_B
#undef RESULT

#else

#include "runtime.h"

#if HAVE_REAL_10

#define COMPLEX    0

/* real(10) . integer(1)
 * LOAD_A pushes an extended-precision value.  LOAD_B sign-extends one
 * byte into the 32-bit scratch variable 'temp' and pushes it with an
 * explicitly sized 32-bit integer load; an unsuffixed fild leaves the
 * operand width to the assembler's default. */

#define SUBROUTINE     prefix(dot_product_r10_i1)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (ap) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "movsbl (%" EAX "), %%eax\n" \
                 "mov %%eax, %1\n" \
                 "fildl %1\n" : : "m" (bp), "m" (temp) : EAX

#include "dot_product10.c"

/* real(10) . integer(2)
 * LOAD_A pushes an extended-precision value.  LOAD_B sign-extends one
 * 16-bit word into the 32-bit scratch variable 'temp' and pushes it
 * with an explicitly sized 32-bit integer load; an unsuffixed fild
 * leaves the operand width to the assembler's default. */

#define SUBROUTINE     prefix(dot_product_r10_i2)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (ap) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "movswl (%" EAX "), %%eax\n" \
                 "mov %%eax, %1\n" \
                 "fildl %1\n" : : "m" (bp), "m" (temp) : EAX

#include "dot_product10.c"

/* real(10) . integer(4)
 * LOAD_A pushes an extended-precision value.  LOAD_B pushes a 32-bit
 * integer straight from the array.  The 'l' suffix is required: an
 * unsuffixed fild with a memory operand does not state its width, and
 * GNU as falls back to the 16-bit form, which would read only the low
 * half of each element.  The 'temp' operand is not referenced. */

#define SUBROUTINE     prefix(dot_product_r10_i4)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (ap) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fildl ( %" EAX ")\n" : : "m" (bp), "m" (temp) : EAX

#include "dot_product10.c"

/* real(10) . integer(8)
 * LOAD_A pushes an extended-precision value (fldt); LOAD_B pushes a
 * 64-bit integer (fildll).  The 'temp' operand of LOAD_B is listed but
 * not referenced by the template. */

#define SUBROUTINE     prefix(dot_product_r10_i8)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (ap) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fildll ( %" EAX ")\n" : : "m" (bp), "m" (temp) : EAX

#include "dot_product10.c"

/* real(10) . real(4)
 * LOAD_A pushes an extended-precision value (fldt); LOAD_B pushes a
 * single-precision value (flds).  The 'temp' operand of LOAD_B is
 * listed but not referenced by the template. */

#define SUBROUTINE     prefix(dot_product_r10_r4)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (ap) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "flds ( %" EAX ")\n" : : "m" (bp), "m" (temp) : EAX

#include "dot_product10.c"

/* real(10) . real(8)
 * LOAD_A pushes an extended-precision value (fldt); LOAD_B pushes a
 * double-precision value (fldl).  The 'temp' operand of LOAD_B is
 * listed but not referenced by the template. */

#define SUBROUTINE     prefix(dot_product_r10_r8)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (ap) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldl ( %" EAX ")\n" : : "m" (bp), "m" (temp) : EAX

#include "dot_product10.c"

/* integer(1) . real(10)
 * LOAD_A sign-extends one byte into the 32-bit scratch variable 'temp'
 * and pushes it with an explicitly sized 32-bit integer load; an
 * unsuffixed fild leaves the operand width to the assembler's default.
 * LOAD_B pushes an extended-precision value. */

#define SUBROUTINE     prefix(dot_product_i1_r10)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "movsbl (%" EAX "), %%eax\n" \
                 "mov %%eax, %1\n" \
                 "fildl %1\n" : : "m" (ap), "m" (temp) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (bp) : EAX

#include "dot_product10.c"

/* integer(2) . real(10)
 * LOAD_A sign-extends one 16-bit word into the 32-bit scratch variable
 * 'temp' and pushes it with an explicitly sized 32-bit integer load; an
 * unsuffixed fild leaves the operand width to the assembler's default.
 * LOAD_B pushes an extended-precision value. */

#define SUBROUTINE     prefix(dot_product_i2_r10)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "movswl (%" EAX "), %%eax\n" \
                 "mov %%eax, %1\n" \
                 "fildl %1\n" : : "m" (ap), "m" (temp) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (bp) : EAX

#include "dot_product10.c"

/* integer(4) . real(10)
 * LOAD_A pushes a 32-bit integer straight from the array.  The 'l'
 * suffix is required: an unsuffixed fild with a memory operand does not
 * state its width, and GNU as falls back to the 16-bit form, which
 * would read only the low half of each element.  The 'temp' operand is
 * not referenced.  LOAD_B pushes an extended-precision value. */

#define SUBROUTINE     prefix(dot_product_i4_r10)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fildl ( %" EAX ")\n" : : "m" (ap), "m" (temp) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (bp) : EAX

#include "dot_product10.c"

/* integer(8) . real(10)
 * LOAD_A pushes a 64-bit integer (fildll); LOAD_B pushes an
 * extended-precision value (fldt).  The 'temp' operand of LOAD_A is
 * listed but not referenced by the template. */

#define SUBROUTINE     prefix(dot_product_i8_r10)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fildll ( %" EAX ")\n" : : "m" (ap), "m" (temp) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (bp) : EAX

#include "dot_product10.c"

/* real(4) . real(10)
 * LOAD_A pushes a single-precision value (flds); LOAD_B pushes an
 * extended-precision value (fldt).  The 'temp' operand of LOAD_A is
 * listed but not referenced by the template. */

#define SUBROUTINE     prefix(dot_product_r4_r10)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "flds ( %" EAX ")\n" : : "m" (ap), "m" (temp) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (bp) : EAX

#include "dot_product10.c"

/* real(8) . real(10)
 * LOAD_A pushes a double-precision value (fldl); LOAD_B pushes an
 * extended-precision value (fldt).  The 'temp' operand of LOAD_A is
 * listed but not referenced by the template. */

#define SUBROUTINE     prefix(dot_product_r8_r10)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldl ( %" EAX ")\n" : : "m" (ap), "m" (temp) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (bp) : EAX

#include "dot_product10.c"

/* real(10) . real(10)
 * Both LOAD_A and LOAD_B push an extended-precision value (fldt).  The
 * 'temp' operand of LOAD_B is listed but not referenced by the
 * template. */

#define SUBROUTINE     prefix(dot_product_r10_r10)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (ap) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (bp), "m" (temp) : EAX

#include "dot_product10.c"

#undef COMPLEX
#define COMPLEX 1

/* complex(4) . real(10)
 * LOAD_A pushes two single-precision values from offsets 0 and 4;
 * LOAD_B pushes one extended-precision value.  RESULT swaps the two
 * sums and stores them through 'product' as two single-precision
 * values.  RESULT overwrites EAX and writes memory, so both are
 * declared as clobbers, matching LOAD_A / LOAD_B.
 * NOTE(review): the result is written in single precision; confirm
 * this matches the result kind the caller allocates. */

#define SUBROUTINE    prefix(dot_product_z4_r10)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "flds ( %" EAX ")\n" \
                 "flds 4( %" EAX ")\n" : : "m" (ap) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (bp) : EAX

#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstps (%" EAX ")\n" \
                 "fstps 4(%" EAX ")\n" : : "m" (product) : EAX, "memory"

#include "dot_product10.c"

/* complex(8) . real(10)
 * LOAD_A pushes two double-precision values from offsets 0 and 8;
 * LOAD_B pushes one extended-precision value.  RESULT swaps the two
 * sums and stores them through 'product' as two double-precision
 * values.  RESULT overwrites EAX and writes memory, so both are
 * declared as clobbers, matching LOAD_A / LOAD_B.
 * NOTE(review): the result is written in double precision; confirm
 * this matches the result kind the caller allocates. */

#define SUBROUTINE    prefix(dot_product_z8_r10)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldl ( %" EAX ")\n" \
                 "fldl 8( %" EAX ")\n" : : "m" (ap) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (bp) : EAX

#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstpl (%" EAX ")\n" \
                 "fstpl 8(%" EAX ")\n" : : "m" (product) : EAX, "memory"

#include "dot_product10.c"


/* complex(10) . real(10)
 * LOAD_A pushes two extended-precision values from offsets 0 and
 * REAL10_SIZE; LOAD_B pushes one.  RESULT swaps the two sums and
 * stores them through 'product' as two extended-precision values.
 * RESULT overwrites EAX and writes memory, so both are declared as
 * clobbers, matching LOAD_A / LOAD_B. */

#define SUBROUTINE    prefix(dot_product_z10_r10)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (ap) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (bp) : EAX

#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstpt (%" EAX ")\n" \
                 "fstpt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                 : : "m" (product) : EAX, "memory"

#include "dot_product10.c"


#undef COMPLEX
#define COMPLEX 2


/* real(10) . complex(4)
 * LOAD_A pushes one extended-precision value; LOAD_B pushes two
 * single-precision values from offsets 0 and 4.  RESULT swaps the two
 * sums and stores them through 'product' as two single-precision
 * values.  RESULT overwrites EAX and writes memory, so both are
 * declared as clobbers, matching LOAD_A / LOAD_B.
 * NOTE(review): the result is written in single precision; confirm
 * this matches the result kind the caller allocates. */

#define SUBROUTINE    prefix(dot_product_r10_z4)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (ap) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "flds ( %" EAX ")\n" \
                 "flds 4( %" EAX ")\n" : : "m" (bp) : EAX

#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstps (%" EAX ")\n" \
                 "fstps 4(%" EAX ")\n" : : "m" (product) : EAX, "memory"

#include "dot_product10.c"

/* real(10) . complex(8)
 * LOAD_A pushes one extended-precision value; LOAD_B pushes two
 * double-precision values from offsets 0 and 8.  RESULT swaps the two
 * sums and stores them through 'product' as two double-precision
 * values.  RESULT overwrites EAX and writes memory, so both are
 * declared as clobbers, matching LOAD_A / LOAD_B.
 * NOTE(review): the result is written in double precision; confirm
 * this matches the result kind the caller allocates. */

#define SUBROUTINE    prefix(dot_product_r10_z8)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (ap) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldl ( %" EAX ")\n" \
                 "fldl 8( %" EAX ")\n" : : "m" (bp) : EAX

#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstpl (%" EAX ")\n" \
                 "fstpl 8(%" EAX ")\n" : : "m" (product) : EAX, "memory"

#include "dot_product10.c"


/* real(10) . complex(10)
 * LOAD_A pushes one extended-precision value; LOAD_B pushes two from
 * offsets 0 and REAL10_SIZE.  RESULT swaps the two sums and stores
 * them through 'product' as two extended-precision values.  RESULT
 * overwrites EAX and writes memory, so both are declared as clobbers,
 * matching LOAD_A / LOAD_B. */

#define SUBROUTINE    prefix(dot_product_r10_z10)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (ap) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (bp) : EAX

#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstpt (%" EAX ")\n" \
                 "fstpt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                 : : "m" (product) : EAX, "memory"

#include "dot_product10.c"

#undef COMPLEX
#define COMPLEX 3


/* complex(4) . complex(10)
 * LOAD_A pushes two single-precision values from offsets 0 and 4;
 * LOAD_B pushes two extended-precision values from offsets 0 and
 * REAL10_SIZE.  No RESULT macro is defined here. */

#define SUBROUTINE    prefix(dot_product_z4_z10)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "flds ( %" EAX ")\n" \
                 "flds 4( %" EAX ")\n" : : "m" (ap) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (bp) : EAX

#include "dot_product10.c"

/* complex(8) . complex(10)
 * LOAD_A pushes two double-precision values from offsets 0 and 8;
 * LOAD_B pushes two extended-precision values from offsets 0 and
 * REAL10_SIZE.  No RESULT macro is defined here. */

#define SUBROUTINE    prefix(dot_product_z8_z10)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldl ( %" EAX ")\n" \
                 "fldl 8( %" EAX ")\n" : : "m" (ap) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (bp) : EAX

#include "dot_product10.c"

/* complex(10) . complex(4)
 * LOAD_A pushes two extended-precision values from offsets 0 and
 * REAL10_SIZE; LOAD_B pushes two single-precision values from offsets
 * 0 and 4.  No RESULT macro is defined here. */

#define SUBROUTINE    prefix(dot_product_z10_z4)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (ap) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "flds ( %" EAX ")\n" \
                 "flds 4( %" EAX ")\n" : : "m" (bp) : EAX

#include "dot_product10.c"

/* complex(10) . complex(8)
 * LOAD_A pushes two extended-precision values from offsets 0 and
 * REAL10_SIZE.  LOAD_B pushes the two double-precision components of
 * a complex(8) element from offsets 0 and 8; these must be loaded with
 * fldl (64-bit), as in the other complex(8) loaders in this file.
 * flds would reinterpret the first four bytes of each component as a
 * single-precision value.  No RESULT macro is defined here. */

#define SUBROUTINE    prefix(dot_product_z10_z8)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (ap) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldl ( %" EAX ")\n" \
                 "fldl 8( %" EAX ")\n" : : "m" (bp) : EAX

#include "dot_product10.c"

/* complex(10) . complex(10)
 * LOAD_A and LOAD_B each push two extended-precision values from
 * offsets 0 and REAL10_SIZE.  No RESULT macro is defined here. */

#define SUBROUTINE    prefix(dot_product_z10_z10)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (ap) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (bp) : EAX

#include "dot_product10.c"

#endif
#endif

