From 1e70600316ab080d80e318f32868c12eb7d1f2da Mon Sep 17 00:00:00 2001
From: Alan Modra <amodra@gmail.com>
Date: Thu, 9 Feb 2017 08:41:51 +1030
Subject: [PATCH] Fix power8 asm()

Lots of issues here.
- The vsx regs weren't listed as clobbered.
- Poor choice of vsx regs, which along with the lack of clobbers led to
  trashing v0..v21 and fr14..fr23.  Ideally you'd let gcc choose all
  temp vsx regs, but asms currently have a limit of 30 i/o parms.
- Other regs were clobbered unnecessarily, seemingly in an attempt to
  clobber inputs, with gcc-7 complaining about the clobber of r2.
  (Changed inputs should also be listed as outputs or as an i/o.)
- "r" constraint used instead of "b" for gprs used in insns where the
  r0 encoding means zero rather than r0.
- There were unused asm inputs too.
- All memory was clobbered rather than hooking up memory outputs with
  proper memory constraints, and that and the lack of proper memory
  input constraints meant the asms needed to be volatile and their
  containing function noinline.
- Some parameters were being passed unnecessarily via memory.
- When a copy of a pointer input parm was needed, the value passed to
  the asm was incremented in C and decremented in asm, rather than
  using i/o parms, an early clobber constraint, or a temp output reg
  copied in the asm.  In most cases a small change to assembly could
  be made that obviated the need for the extra pointer.
- A number of functions did not compute the final sum or dot-product
  in assembly, instead using scalar code in C.
- dcbt was bogus.

I've also fixed formatting of the asm.
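
For reference, a minimal stand-alone sketch of the constraint style the
patch converts every kernel to (illustrative only, not part of the patch;
the helper name and two-element workload are hypothetical): the base
pointer uses "b" rather than "r" because an r0 encoding in the load's RA
field means zero, loaded memory is a real "m" input instead of a blanket
"memory" clobber, the result comes back in an "=d" output whose VSX
number is printed with %x0, and every vsx reg the asm writes is listed
as a clobber, so neither __volatile__ nor noinline is needed.

/* Illustrative sketch only (not from this patch): sum the two
   doubles at x using the same asm style as the fixed kernels.  */
#include <stdio.h>

static double
sum2 (double *x)
{
  double sum;
  __asm__ ("lxvd2x	32, 0, %1	\n\t"	/* load x[0],x[1] into vs32 */
	   "xxswapd	33, 32		\n\t"	/* other element into vs33's low slot */
	   "xsadddp	%x0, 32, 33"		/* scalar add; %x0 = sum's VSX reg */
	   : "=d" (sum)		/* result in an fpr, no store via memory */
	   : "b" (x),		/* "b": base reg must not be r0 */
	     "m" (*x)		/* memory input, so no "memory" clobber */
	   : "vs32", "vs33");	/* vsx regs the asm writes */
  return sum;
}

int
main (void)
{
  double x[2] = { 1.5, 2.5 };
  printf ("%g\n", sum2 (x));	/* 4 */
  return 0;
}
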
diff --git a/kernel/power/casum.c b/kernel/power/casum.c
index aeed0ca..d110858 100644
--- a/kernel/power/casum.c
+++ b/kernel/power/casum.c
@@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #ifndef HAVE_KERNEL_16
 
-static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
+static FLOAT casum_kernel_16(BLASLONG n, FLOAT *x1)
 {
 
 	BLASLONG i=0;
@@ -92,11 +92,7 @@ static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
 
 	}
 
-	svec[0] = sum0+sum1+sum2+sum3;
-	svec[1] = 0.0;
-	svec[2] = 0.0;
-	svec[3] = 0.0;
-
+	return sum0+sum1+sum2+sum3;
 }
 
 #endif
@@ -106,7 +102,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	BLASLONG i=0;
 	BLASLONG ip=0;
 	FLOAT sumf = 0.0;
-	FLOAT svec[4] __attribute__ ((aligned (16)));;
 	BLASLONG n1;
 	BLASLONG inc_x2;
 
@@ -119,8 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 		if ( n1 > 0 )
 		{
 
-			casum_kernel_16(n1, x, svec);
-			sumf = svec[0] + svec[1]+svec[2]+svec[3];
+			sumf = casum_kernel_16(n1, x);
 			i=n1;
 			ip = 2 * n1;
 		}
diff --git a/kernel/power/casum_microk_power8.c b/kernel/power/casum_microk_power8.c
index cb50234..38a1143 100644
--- a/kernel/power/casum_microk_power8.c
+++ b/kernel/power/casum_microk_power8.c
@@ -34,144 +34,145 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 **************************************************************************************/
 
 #define HAVE_KERNEL_16 1
-static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
 
-static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec)
+static float casum_kernel_16 (long n, float *x)
 {
-
-
-	BLASLONG i = n;
-	BLASLONG o16 = 16;
-	BLASLONG o32 = 32;
-	BLASLONG o48 = 48;
-	BLASLONG o64 = 64;
-	BLASLONG o80 = 80;
-	BLASLONG o96 = 96;
-	BLASLONG o112 = 112;
-	FLOAT *x1=x;
-	BLASLONG pre = 384;
-
-	__asm__  __volatile__
-	(
-
-	"dcbt		%2 , %4				    \n\t"
-
-	"xxlxor		32,32,32			    \n\t"
-	"xxlxor		33,33,33			    \n\t"
-	"xxlxor		34,34,34			    \n\t"
-	"xxlxor		35,35,35			    \n\t"
-	"xxlxor		36,36,36			    \n\t"
-	"xxlxor		37,37,37			    \n\t"
-	"xxlxor		38,38,38			    \n\t"
-	"xxlxor		39,39,39			    \n\t"
-
-	"lxvw4x		40, 0, %2			    \n\t"
-	"lxvw4x		41, %5, %2			    \n\t"
-	"lxvw4x		42, %6, %2			    \n\t"
-	"lxvw4x		43, %7, %2			    \n\t"
-	"lxvw4x		44, %8, %2			    \n\t"
-	"lxvw4x		45, %9, %2			    \n\t"
-	"lxvw4x		46, %10, %2			    \n\t"
-	"lxvw4x		47, %11, %2			    \n\t"
-
-	"addi		%2, %2, 128			    \n\t"
-
-	"addic.		%0 , %0	, -16  	 	             \n\t"
-	"ble		2f		             	     \n\t"
-
-	".align 5				            \n\t"
-	"1:				                    \n\t"
-
-	"dcbt		%2 , %4				    \n\t"
-
-	"xvabssp	48, 40				    \n\t"
-	"xvabssp	49, 41				    \n\t"
-	"xvabssp	50, 42				    \n\t"
-	"xvabssp	51, 43				    \n\t"
-
-	"lxvw4x		40, 0, %2			    \n\t"
-	"lxvw4x		41, %5, %2			    \n\t"
-
-	"xvabssp	52, 44				    \n\t"
-	"xvabssp	53, 45				    \n\t"
-
-	"lxvw4x		42, %6, %2			    \n\t"
-	"lxvw4x		43, %7, %2			    \n\t"
-
-	"xvabssp	54, 46				    \n\t"
-	"xvabssp	55, 47				    \n\t"
-
-	"lxvw4x		44, %8, %2			    \n\t"
-	"lxvw4x		45, %9, %2			    \n\t"
-
-	"xvaddsp	32, 32, 48		    \n\t"
-	"xvaddsp	33, 33, 49		    \n\t"
-
-	"lxvw4x		46, %10, %2			    \n\t"
-	"lxvw4x		47, %11, %2			    \n\t"
-
-	"xvaddsp	34, 34, 50		    \n\t"
-	"xvaddsp	35, 35, 51		    \n\t"
-	"addi		%2, %2, 128			    \n\t"
-	"xvaddsp	36, 36, 52		    \n\t"
-	"xvaddsp	37, 37, 53		    \n\t"
-	"addic.		%0 , %0	, -16  	 	             \n\t"
-	"xvaddsp	38, 38, 54		    \n\t"
-	"xvaddsp	39, 39, 55		    \n\t"
-
-	"bgt		1b		             	     \n\t"
-
-	"2:						     \n\t"
-
-
-	"xvabssp	48, 40				    \n\t"
-	"xvabssp	49, 41				    \n\t"
-	"xvabssp	50, 42				    \n\t"
-	"xvabssp	51, 43				    \n\t"
-	"xvabssp	52, 44				    \n\t"
-	"xvabssp	53, 45				    \n\t"
-	"xvabssp	54, 46				    \n\t"
-	"xvabssp	55, 47				    \n\t"
-
-	"xvaddsp	32, 32, 48		    \n\t"
-	"xvaddsp	33, 33, 49		    \n\t"
-	"xvaddsp	34, 34, 50		    \n\t"
-	"xvaddsp	35, 35, 51		    \n\t"
-	"xvaddsp	36, 36, 52		    \n\t"
-	"xvaddsp	37, 37, 53		    \n\t"
-	"xvaddsp	38, 38, 54		    \n\t"
-	"xvaddsp	39, 39, 55		    \n\t"
-
-	"xvaddsp	32, 32, 33		     \n\t"
-	"xvaddsp	34, 34, 35		     \n\t"
-	"xvaddsp	36, 36, 37		     \n\t"
-	"xvaddsp	38, 38, 39		     \n\t"
-
-	"xvaddsp	32, 32, 34		     \n\t"
-	"xvaddsp	36, 36, 38		     \n\t"
-
-	"xvaddsp	32, 32, 36		     \n\t"
-
-
-	"stxvw4x	32, 0, %3		     \n\t"
-
-	:
-        : 
-          "r" (i),	// 0	
-	  "r" (n),  	// 1
-          "r" (x1),     // 2
-          "r" (svec),   // 3
-          "r" (pre),    // 4
-	  "r" (o16),	// 5
-	  "r" (o32),	// 6
-	  "r" (o48),    // 7
-          "r" (o64),    // 8
-          "r" (o80),    // 9
-          "r" (o96),    // 10
-          "r" (o112)   // 11
-	: "cr0", "%0", "%2",  "memory"
-	);
-
-} 
-
-
+  float sum;
+  __vector float t0;
+  __vector float t1;
+  __vector float t2;
+  __vector float t3;
+
+  __asm__
+    (
+       "dcbt		0, %2		\n\t"
+
+       "xxlxor		32, 32,	32	\n\t"
+       "xxlxor		33, 33,	33	\n\t"
+       "xxlxor		34, 34,	34	\n\t"
+       "xxlxor		35, 35,	35	\n\t"
+       "xxlxor		36, 36,	36	\n\t"
+       "xxlxor		37, 37,	37	\n\t"
+       "xxlxor		38, 38,	38	\n\t"
+       "xxlxor		39, 39,	39	\n\t"
+
+       "lxvw4x		40, 0, %2	\n\t"
+       "lxvw4x		41, %8, %2	\n\t"
+       "lxvw4x		42, %9, %2	\n\t"
+       "lxvw4x		43, %10, %2	\n\t"
+       "lxvw4x		44, %11, %2	\n\t"
+       "lxvw4x		45, %12, %2	\n\t"
+       "lxvw4x		46, %13, %2	\n\t"
+       "lxvw4x		47, %14, %2	\n\t"
+
+       "addi		%2, %2, 128	\n\t"
+
+       "addic.		%1, %1, -16	\n\t"
+       "ble		2f		\n\t"
+
+       ".p2align	5		\n"
+     "1:				\n\t"
+
+       "xvabssp		48, 40		\n\t"
+       "xvabssp		49, 41		\n\t"
+       "xvabssp		50, 42		\n\t"
+       "xvabssp		51, 43		\n\t"
+
+       "lxvw4x		40, 0, %2	\n\t"
+       "lxvw4x		41, %8, %2	\n\t"
+
+       "xvabssp		%x3, 44		\n\t"
+       "xvabssp		%x4, 45		\n\t"
+
+       "lxvw4x		42, %9, %2	\n\t"
+       "lxvw4x		43, %10, %2	\n\t"
+
+       "xvabssp		%x5, 46		\n\t"
+       "xvabssp		%x6, 47		\n\t"
+
+       "lxvw4x		44, %11, %2	\n\t"
+       "lxvw4x		45, %12, %2	\n\t"
+
+       "xvaddsp		32, 32, 48	\n\t"
+       "xvaddsp		33, 33, 49	\n\t"
+
+       "lxvw4x		46, %13, %2	\n\t"
+       "lxvw4x		47, %14, %2	\n\t"
+
+       "xvaddsp		34, 34, 50	\n\t"
+       "xvaddsp		35, 35, 51	\n\t"
+       "addi		%2, %2, 128	\n\t"
+       "xvaddsp		36, 36, %x3	\n\t"
+       "xvaddsp		37, 37, %x4	\n\t"
+       "addic.		%1, %1, -16	\n\t"
+       "xvaddsp		38, 38, %x5	\n\t"
+       "xvaddsp		39, 39, %x6	\n\t"
+
+       "bgt		1b		\n"
+
+     "2:				\n\t"
+
+       "xvabssp		48, 40		\n\t"
+       "xvabssp		49, 41		\n\t"
+       "xvabssp		50, 42		\n\t"
+       "xvabssp		51, 43		\n\t"
+       "xvabssp		%x3, 44		\n\t"
+       "xvabssp		%x4, 45		\n\t"
+       "xvabssp		%x5, 46		\n\t"
+       "xvabssp		%x6, 47		\n\t"
+
+       "xvaddsp		32, 32, 48	\n\t"
+       "xvaddsp		33, 33, 49	\n\t"
+       "xvaddsp		34, 34, 50	\n\t"
+       "xvaddsp		35, 35, 51	\n\t"
+       "xvaddsp		36, 36, %x3	\n\t"
+       "xvaddsp		37, 37, %x4	\n\t"
+       "xvaddsp		38, 38, %x5	\n\t"
+       "xvaddsp		39, 39, %x6	\n\t"
+
+       "xvaddsp		32, 32, 33	\n\t"
+       "xvaddsp		34, 34, 35	\n\t"
+       "xvaddsp		36, 36, 37	\n\t"
+       "xvaddsp		38, 38, 39	\n\t"
+
+       "xvaddsp		32, 32, 34	\n\t"
+       "xvaddsp		36, 36, 38	\n\t"
+
+       "xvaddsp		32, 32, 36	\n\t"
+
+       "xxsldwi		33, 32, 32, 2	\n\t"
+       "xvaddsp		32, 32, 33	\n\t"
+
+       "xxsldwi		33, 32, 32, 1	\n\t"
+       "xvaddsp		32, 32, 33	\n\t"
+
+       "xscvspdp	%0, 32		\n"
+
+     "#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
+     "#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
+     :
+       "=f" (sum),	// 0
+       "+r" (n),	// 1
+       "+b" (x),	// 2
+       "=wa" (t0),	// 3
+       "=wa" (t1),	// 4
+       "=wa" (t2),	// 5
+       "=wa" (t3)	// 6
+     :
+       "m" (*x),
+       "b" (16),	// 8
+       "b" (32),	// 9
+       "b" (48),	// 10
+       "b" (64),	// 11
+       "b" (80),	// 12
+       "b" (96),	// 13
+       "b" (112)	// 14
+     :
+       "cr0",
+       "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+       "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+       "vs48","vs49","vs50","vs51"
+     );

+  return sum;
+}
diff --git a/kernel/power/ccopy_microk_power8.c b/kernel/power/ccopy_microk_power8.c
index 95b3559..b2b1bea 100644
--- a/kernel/power/ccopy_microk_power8.c
+++ b/kernel/power/ccopy_microk_power8.c
@@ -35,140 +35,121 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define HAVE_KERNEL_32 1
 
-static void ccopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void ccopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
+static void ccopy_kernel_32 (long n, float *x, float *y)
 {
-
-
-	BLASLONG i = n;
-	BLASLONG o16 = 16;
-	BLASLONG o32 = 32;
-	BLASLONG o48 = 48;
-	BLASLONG o64 = 64;
-	BLASLONG o80 = 80;
-	BLASLONG o96 = 96;
-	BLASLONG o112 = 112;
-	FLOAT *x1=x;
-	FLOAT *y1=y;
-	BLASLONG pre = 384;
-	BLASLONG alpha=0;
-
-	__asm__  __volatile__
-	(
-
-	"lxvw4x		40, 0, %2			    \n\t"
-	"lxvw4x		41, %5, %2			    \n\t"
-	"lxvw4x		42, %6, %2			    \n\t"
-	"lxvw4x		43, %7, %2			    \n\t"
-	"lxvw4x		44, %8, %2			    \n\t"
-	"lxvw4x		45, %9, %2			    \n\t"
-	"lxvw4x		46, %10, %2			    \n\t"
-	"lxvw4x		47, %11, %2			    \n\t"
-
-	"addi		%2, %2, 128			    \n\t"
-
-	"lxvw4x		50, 0, %2			    \n\t"
-	"lxvw4x		51, %5, %2			    \n\t"
-	"lxvw4x		52, %6, %2			    \n\t"
-	"lxvw4x		53, %7, %2			    \n\t"
-	"lxvw4x		54, %8, %2			    \n\t"
-	"lxvw4x		55, %9, %2			    \n\t"
-	"lxvw4x		56, %10, %2			    \n\t"
-	"lxvw4x		57, %11, %2			    \n\t"
-
-	"addi		%2, %2, 128			    \n\t"
-
-	"addic.		%0 , %0	, -32  	 	             \n\t"
-	"ble		2f		             	     \n\t"
-
-	".align 5				            \n\t"
-	"1:				                    \n\t"
-
-	"stxvw4x		40, 0, %1			    \n\t"
-	"stxvw4x		41, %5, %1			    \n\t"
-	"lxvw4x		40, 0, %2			    \n\t"
-	"lxvw4x		41, %5, %2			    \n\t"
-	"stxvw4x		42, %6, %1			    \n\t"
-	"stxvw4x		43, %7, %1			    \n\t"
-	"lxvw4x		42, %6, %2			    \n\t"
-	"lxvw4x		43, %7, %2			    \n\t"
-	"stxvw4x		44, %8, %1			    \n\t"
-	"stxvw4x		45, %9, %1			    \n\t"
-	"lxvw4x		44, %8, %2			    \n\t"
-	"lxvw4x		45, %9, %2			    \n\t"
-	"stxvw4x		46, %10, %1			    \n\t"
-	"stxvw4x		47, %11, %1			    \n\t"
-	"lxvw4x		46, %10, %2			    \n\t"
-	"lxvw4x		47, %11, %2			    \n\t"
-
-
-	"addi		%1, %1, 128			    \n\t"
-	"addi		%2, %2, 128			    \n\t"
-
-	"stxvw4x		50, 0, %1			    \n\t"
-	"stxvw4x		51, %5, %1			    \n\t"
-	"lxvw4x		50, 0, %2			    \n\t"
-	"lxvw4x		51, %5, %2			    \n\t"
-	"stxvw4x		52, %6, %1			    \n\t"
-	"stxvw4x		53, %7, %1			    \n\t"
-	"lxvw4x		52, %6, %2			    \n\t"
-	"lxvw4x		53, %7, %2			    \n\t"
-	"stxvw4x		54, %8, %1			    \n\t"
-	"stxvw4x		55, %9, %1			    \n\t"
-	"lxvw4x		54, %8, %2			    \n\t"
-	"lxvw4x		55, %9, %2			    \n\t"
-	"stxvw4x		56, %10, %1			    \n\t"
-	"stxvw4x		57, %11, %1			    \n\t"
-	"lxvw4x		56, %10, %2			    \n\t"
-	"lxvw4x		57, %11, %2			    \n\t"
-
-	"addi		%1, %1, 128			    \n\t"
-	"addi		%2, %2, 128			    \n\t"
-
-	"addic.		%0 , %0	, -32  	 	             \n\t"
-	"bgt		1b		             	     \n\t"
-
-	"2:						     \n\t"
-
-	"stxvw4x		40, 0, %1			    \n\t"
-	"stxvw4x		41, %5, %1			    \n\t"
-	"stxvw4x		42, %6, %1			    \n\t"
-	"stxvw4x		43, %7, %1			    \n\t"
-	"stxvw4x		44, %8, %1			    \n\t"
-	"stxvw4x		45, %9, %1			    \n\t"
-	"stxvw4x		46, %10, %1			    \n\t"
-	"stxvw4x		47, %11, %1			    \n\t"
-
-	"addi		%1, %1, 128			    \n\t"
-
-	"stxvw4x		50, 0, %1			    \n\t"
-	"stxvw4x		51, %5, %1			    \n\t"
-	"stxvw4x		52, %6, %1			    \n\t"
-	"stxvw4x		53, %7, %1			    \n\t"
-	"stxvw4x		54, %8, %1			    \n\t"
-	"stxvw4x		55, %9, %1			    \n\t"
-	"stxvw4x		56, %10, %1			    \n\t"
-	"stxvw4x		57, %11, %1			    \n\t"
-
-
-	:
-        : 
-          "r" (i),	// 0	
-	  "r" (y1),  	// 1
-          "r" (x1),     // 2
-          "r" (alpha),  // 3
-          "r" (pre),    // 4
-	  "r" (o16),	// 5
-	  "r" (o32),	// 6
-	  "r" (o48),    // 7
-          "r" (o64),    // 8
-          "r" (o80),    // 9
-          "r" (o96),    // 10
-          "r" (o112)    // 11
-	: "cr0", "%0", "%2" , "%1", "memory"
-	);
-
-} 
-
-
+  __asm__
+    (
+       "lxvw4x		32, 0, %2	\n\t"
+       "lxvw4x		33, %5, %2	\n\t"
+       "lxvw4x		34, %6, %2	\n\t"
+       "lxvw4x		35, %7, %2	\n\t"
+       "lxvw4x		36, %8, %2	\n\t"
+       "lxvw4x		37, %9, %2	\n\t"
+       "lxvw4x		38, %10, %2	\n\t"
+       "lxvw4x		39, %11, %2	\n\t"
+
+       "addi		%2, %2, 128	\n\t"
+
+       "lxvw4x		40, 0, %2	\n\t"
+       "lxvw4x		41, %5, %2	\n\t"
+       "lxvw4x		42, %6, %2	\n\t"
+       "lxvw4x		43, %7, %2	\n\t"
+       "lxvw4x		44, %8, %2	\n\t"
+       "lxvw4x		45, %9, %2	\n\t"
+       "lxvw4x		46, %10, %2	\n\t"
+       "lxvw4x		47, %11, %2	\n\t"
+
+       "addi		%2, %2, 128	\n\t"
+
+       "addic.		%1, %1, -32	\n\t"
+       "ble		2f		\n\t"
+
+       ".p2align	5		\n"
+     "1:				\n\t"
+
+       "stxvw4x		32, 0, %3	\n\t"
+       "stxvw4x		33, %5, %3	\n\t"
+       "lxvw4x		32, 0, %2	\n\t"
+       "lxvw4x		33, %5, %2	\n\t"
+       "stxvw4x		34, %6, %3	\n\t"
+       "stxvw4x		35, %7, %3	\n\t"
+       "lxvw4x		34, %6, %2	\n\t"
+       "lxvw4x		35, %7, %2	\n\t"
+       "stxvw4x		36, %8, %3	\n\t"
+       "stxvw4x		37, %9, %3	\n\t"
+       "lxvw4x		36, %8, %2	\n\t"
+       "lxvw4x		37, %9, %2	\n\t"
+       "stxvw4x		38, %10, %3	\n\t"
+       "stxvw4x		39, %11, %3	\n\t"
+       "lxvw4x		38, %10, %2	\n\t"
+       "lxvw4x		39, %11, %2	\n\t"
+
+       "addi		%3, %3, 128	\n\t"
+       "addi		%2, %2, 128	\n\t"
+
+       "stxvw4x		40, 0, %3	\n\t"
+       "stxvw4x		41, %5, %3	\n\t"
+       "lxvw4x		40, 0, %2	\n\t"
+       "lxvw4x		41, %5, %2	\n\t"
+       "stxvw4x		42, %6, %3	\n\t"
+       "stxvw4x		43, %7, %3	\n\t"
+       "lxvw4x		42, %6, %2	\n\t"
+       "lxvw4x		43, %7, %2	\n\t"
+       "stxvw4x		44, %8, %3	\n\t"
+       "stxvw4x		45, %9, %3	\n\t"
+       "lxvw4x		44, %8, %2	\n\t"
+       "lxvw4x		45, %9, %2	\n\t"
+       "stxvw4x		46, %10, %3	\n\t"
+       "stxvw4x		47, %11, %3	\n\t"
+       "lxvw4x		46, %10, %2	\n\t"
+       "lxvw4x		47, %11, %2	\n\t"
+
+       "addi		%3, %3, 128	\n\t"
+       "addi		%2, %2, 128	\n\t"
+
+       "addic.		%1, %1, -32	\n\t"
+       "bgt		1b		\n"
+
+     "2:				\n\t"
+
+       "stxvw4x		32, 0, %3	\n\t"
+       "stxvw4x		33, %5, %3	\n\t"
+       "stxvw4x		34, %6, %3	\n\t"
+       "stxvw4x		35, %7, %3	\n\t"
+       "stxvw4x		36, %8, %3	\n\t"
+       "stxvw4x		37, %9, %3	\n\t"
+       "stxvw4x		38, %10, %3	\n\t"
+       "stxvw4x		39, %11, %3	\n\t"
+
+       "addi		%3, %3, 128	\n\t"
+
+       "stxvw4x		40, 0, %3	\n\t"
+       "stxvw4x		41, %5, %3	\n\t"
+       "stxvw4x		42, %6, %3	\n\t"
+       "stxvw4x		43, %7, %3	\n\t"
+       "stxvw4x		44, %8, %3	\n\t"
+       "stxvw4x		45, %9, %3	\n\t"
+       "stxvw4x		46, %10, %3	\n\t"
+       "stxvw4x		47, %11, %3	\n"
+
+     "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
+     :
+       "=m" (*y),
+       "+r" (n),	// 1
+       "+b" (x),	// 2
+       "+b" (y)		// 3
+     :
+       "m" (*x),
+       "b" (16),	// 5
+       "b" (32),	// 6
+       "b" (48),	// 7
+       "b" (64),	// 8
+       "b" (80),	// 9
+       "b" (96),	// 10
+       "b" (112)	// 11
+     :
+       "cr0",
+       "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+       "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
+     );
+}
diff --git a/kernel/power/cswap_microk_power8.c b/kernel/power/cswap_microk_power8.c
index 90ab59c..1dd03dc 100644
--- a/kernel/power/cswap_microk_power8.c
+++ b/kernel/power/cswap_microk_power8.c
@@ -35,146 +35,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define HAVE_KERNEL_32 1
 
-static void cswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void cswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
+static void cswap_kernel_32 (long n, float *x, float *y)
 {
-
-
-	BLASLONG i = n;
-	BLASLONG o16 = 16;
-	BLASLONG o32 = 32;
-	BLASLONG o48 = 48;
-	BLASLONG o64 = 64;
-	BLASLONG o80 = 80;
-	BLASLONG o96 = 96;
-	BLASLONG o112 = 112;
-	FLOAT *x1=x;
-	FLOAT *y1=y;
-	FLOAT *x2=x+1;
-	FLOAT *y2=y+1;
-	BLASLONG pre = 384;
-	BLASLONG alpha=0;
-
-	__asm__  __volatile__
-	(
-
-	"addi		%3, %3, -4			    \n\t"	
-	"addi		%4, %4, -4			    \n\t"	
-
-	".align 5				            \n\t"
-	"1:				                    \n\t"
-
-	"lxvw4x		32, 0, %2			    \n\t"
-	"lxvw4x		33, %5, %2			    \n\t"
-	"lxvw4x		34, %6, %2			    \n\t"
-	"lxvw4x		35, %7, %2			    \n\t"
-	"lxvw4x		36, %8, %2			    \n\t"
-	"lxvw4x		37, %9, %2			    \n\t"
-	"lxvw4x		38, %10, %2			    \n\t"
-	"lxvw4x		39, %11, %2			    \n\t"
-
-	"addi		%2, %2, 128			    \n\t"
-
-	"lxvw4x		40, 0, %2			    \n\t"
-	"lxvw4x		41, %5, %2			    \n\t"
-	"lxvw4x		42, %6, %2			    \n\t"
-	"lxvw4x		43, %7, %2			    \n\t"
-	"lxvw4x		44, %8, %2			    \n\t"
-	"lxvw4x		45, %9, %2			    \n\t"
-	"lxvw4x		46, %10, %2			    \n\t"
-	"lxvw4x		47, %11, %2			    \n\t"
-
-	"addi		%2, %2, 128			    \n\t"
-
-	"lxvw4x		48, 0, %1			    \n\t"
-	"lxvw4x		49, %5, %1			    \n\t"
-	"lxvw4x		50, %6, %1			    \n\t"
-	"lxvw4x		51, %7, %1			    \n\t"
-	"lxvw4x		52, %8, %1			    \n\t"
-	"lxvw4x		53, %9, %1			    \n\t"
-	"lxvw4x		54, %10, %1			    \n\t"
-	"lxvw4x		55, %11, %1			    \n\t"
-
-	"addi		%1, %1, 128			    \n\t"
-
-	"lxvw4x		56, 0, %1			    \n\t"
-	"lxvw4x		57, %5, %1			    \n\t"
-	"lxvw4x		58, %6, %1			    \n\t"
-	"lxvw4x		59, %7, %1			    \n\t"
-	"lxvw4x		60, %8, %1			    \n\t"
-	"lxvw4x		61, %9, %1			    \n\t"
-	"lxvw4x		62, %10, %1			    \n\t"
-	"lxvw4x		63, %11, %1			    \n\t"
-
-	"addi		%1, %1, 128			    \n\t"
-
-	"stxvw4x		32, 0, %3			    \n\t"
-	"stxvw4x		33, %5, %3			    \n\t"
-	"stxvw4x		34, %6, %3			    \n\t"
-	"stxvw4x		35, %7, %3			    \n\t"
-	"stxvw4x		36, %8, %3			    \n\t"
-	"stxvw4x		37, %9, %3			    \n\t"
-	"stxvw4x		38, %10, %3			    \n\t"
-	"stxvw4x		39, %11, %3			    \n\t"
-
-	"addi		%3, %3, 128			    \n\t"
-
-	"stxvw4x		40, 0, %3			    \n\t"
-	"stxvw4x		41, %5, %3			    \n\t"
-	"stxvw4x		42, %6, %3			    \n\t"
-	"stxvw4x		43, %7, %3			    \n\t"
-	"stxvw4x		44, %8, %3			    \n\t"
-	"stxvw4x		45, %9, %3			    \n\t"
-	"stxvw4x		46, %10, %3			    \n\t"
-	"stxvw4x		47, %11, %3			    \n\t"
-
-	"addi		%3, %3, 128			    \n\t"
-
-	"stxvw4x		48, 0, %4			    \n\t"
-	"stxvw4x		49, %5, %4			    \n\t"
-	"stxvw4x		50, %6, %4			    \n\t"
-	"stxvw4x		51, %7, %4			    \n\t"
-	"stxvw4x		52, %8, %4			    \n\t"
-	"stxvw4x		53, %9, %4			    \n\t"
-	"stxvw4x		54, %10, %4			    \n\t"
-	"stxvw4x		55, %11, %4			    \n\t"
-
-	"addi		%4, %4, 128			    \n\t"
-
-	"stxvw4x		56, 0, %4			    \n\t"
-	"stxvw4x		57, %5, %4			    \n\t"
-	"stxvw4x		58, %6, %4			    \n\t"
-	"stxvw4x		59, %7, %4			    \n\t"
-	"stxvw4x		60, %8, %4			    \n\t"
-	"stxvw4x		61, %9, %4			    \n\t"
-	"stxvw4x		62, %10, %4			    \n\t"
-	"stxvw4x		63, %11, %4			    \n\t"
-
-	"addi		%4, %4, 128			    \n\t"
-
-	"addic.		%0 , %0	, -32  	 	             \n\t"
-	"bgt		1b		             	     \n\t"
-
-	"2:						     \n\t"
-
-	:
-        : 
-          "r" (i),	// 0	
-	  "r" (y1),  	// 1
-          "r" (x1),     // 2
-          "r" (y2),     // 3
-          "r" (x2),     // 4
-	  "r" (o16),	// 5
-	  "r" (o32),	// 6
-	  "r" (o48),    // 7
-          "r" (o64),    // 8
-          "r" (o80),    // 9
-          "r" (o96),    // 10
-          "r" (o112)    // 11
-	: "cr0", "%0", "%2" , "%1", "%3", "%4", "memory"
-	);
-
-} 
-
-
+  __asm__
+    (
+       ".p2align	5		\n"
+     "1:				\n\t"
+
+       "lxvw4x		32, 0, %4	\n\t"
+       "lxvw4x		33, %5, %4	\n\t"
+       "lxvw4x		34, %6, %4	\n\t"
+       "lxvw4x		35, %7, %4	\n\t"
+       "lxvw4x		36, %8, %4	\n\t"
+       "lxvw4x		37, %9, %4	\n\t"
+       "lxvw4x		38, %10, %4	\n\t"
+       "lxvw4x		39, %11, %4	\n\t"
+
+       "addi		%4, %4, 128	\n\t"
+
+       "lxvw4x		40, 0, %4	\n\t"
+       "lxvw4x		41, %5, %4	\n\t"
+       "lxvw4x		42, %6, %4	\n\t"
+       "lxvw4x		43, %7, %4	\n\t"
+       "lxvw4x		44, %8, %4	\n\t"
+       "lxvw4x		45, %9, %4	\n\t"
+       "lxvw4x		46, %10, %4	\n\t"
+       "lxvw4x		47, %11, %4	\n\t"
+
+       "addi		%4, %4, -128	\n\t"
+
+       "lxvw4x		48, 0, %3	\n\t"
+       "lxvw4x		49, %5, %3	\n\t"
+       "lxvw4x		50, %6, %3	\n\t"
+       "lxvw4x		51, %7, %3	\n\t"
+       "lxvw4x		0, %8, %3	\n\t"
+       "lxvw4x		1, %9, %3	\n\t"
+       "lxvw4x		2, %10, %3	\n\t"
+       "lxvw4x		3, %11, %3	\n\t"
+
+       "addi		%3, %3, 128	\n\t"
+
+       "lxvw4x		4, 0, %3	\n\t"
+       "lxvw4x		5, %5, %3	\n\t"
+       "lxvw4x		6, %6, %3	\n\t"
+       "lxvw4x		7, %7, %3	\n\t"
+       "lxvw4x		8, %8, %3	\n\t"
+       "lxvw4x		9, %9, %3	\n\t"
+       "lxvw4x		10, %10, %3	\n\t"
+       "lxvw4x		11, %11, %3	\n\t"
+
+       "addi		%3, %3, -128	\n\t"
+
+       "stxvw4x		32, 0, %3	\n\t"
+       "stxvw4x		33, %5, %3	\n\t"
+       "stxvw4x		34, %6, %3	\n\t"
+       "stxvw4x		35, %7, %3	\n\t"
+       "stxvw4x		36, %8, %3	\n\t"
+       "stxvw4x		37, %9, %3	\n\t"
+       "stxvw4x		38, %10, %3	\n\t"
+       "stxvw4x		39, %11, %3	\n\t"
+
+       "addi		%3, %3, 128	\n\t"
+
+       "stxvw4x		40, 0, %3	\n\t"
+       "stxvw4x		41, %5, %3	\n\t"
+       "stxvw4x		42, %6, %3	\n\t"
+       "stxvw4x		43, %7, %3	\n\t"
+       "stxvw4x		44, %8, %3	\n\t"
+       "stxvw4x		45, %9, %3	\n\t"
+       "stxvw4x		46, %10, %3	\n\t"
+       "stxvw4x		47, %11, %3	\n\t"
+
+       "addi		%3, %3, 128	\n\t"
+
+       "stxvw4x		48, 0, %4	\n\t"
+       "stxvw4x		49, %5, %4	\n\t"
+       "stxvw4x		50, %6, %4	\n\t"
+       "stxvw4x		51, %7, %4	\n\t"
+       "stxvw4x		0, %8, %4	\n\t"
+       "stxvw4x		1, %9, %4	\n\t"
+       "stxvw4x		2, %10, %4	\n\t"
+       "stxvw4x		3, %11, %4	\n\t"
+
+       "addi		%4, %4, 128	\n\t"
+
+       "stxvw4x		4, 0, %4	\n\t"
+       "stxvw4x		5, %5, %4	\n\t"
+       "stxvw4x		6, %6, %4	\n\t"
+       "stxvw4x		7, %7, %4	\n\t"
+       "stxvw4x		8, %8, %4	\n\t"
+       "stxvw4x		9, %9, %4	\n\t"
+       "stxvw4x		10, %10, %4	\n\t"
+       "stxvw4x		11, %11, %4	\n\t"
+
+       "addi		%4, %4, 128	\n\t"
+
+       "addic.		%2, %2, -32	\n\t"
+       "bgt		1b		\n"
+
+     "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
+     :
+       "+m" (*x),
+       "+m" (*y),
+       "+r" (n),	// 2
+       "+b" (x),	// 3
+       "+b" (y)		// 4
+     :
+       "b" (16),	// 5
+       "b" (32),	// 6
+       "b" (48),	// 7
+       "b" (64),	// 8
+       "b" (80),	// 9
+       "b" (96),	// 10
+       "b" (112)	// 11
+     :
+       "cr0",
+       "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+       "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+       "vs48","vs49","vs50","vs51","vs0","vs1","vs2","vs3",
+       "vs4","vs5","vs6","vs7","vs8","vs9","vs10","vs11"
+     );
+}
diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c
index 77f5345..73962c2 100644
--- a/kernel/power/dasum.c
+++ b/kernel/power/dasum.c
@@ -42,7 +42,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #else
 
-#define ABS fabsf
+#error supports double only
 
 #endif
 
@@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #ifndef HAVE_KERNEL_16
 
-static void dasum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
+static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1)
 {
 
 	BLASLONG i=0;
@@ -92,9 +92,7 @@ static void dasum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
 
 	}
 
-	svec[0] = sum0+sum1+sum2+sum3;
-	svec[1] = 0.0;
-
+	return sum0+sum1+sum2+sum3;
 }
 
 #endif
@@ -103,7 +101,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i=0;
 	FLOAT sumf = 0.0;
-	FLOAT svec[2] __attribute__ ((aligned (16)));;
 	BLASLONG n1;
 
 	if (n <= 0 || inc_x <= 0) return(sumf);
@@ -115,8 +112,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 		if ( n1 > 0 )
 		{
 
-			dasum_kernel_16(n1, x, svec);
-			sumf = svec[0] + svec[1];
+			sumf = dasum_kernel_16(n1, x);
 			i=n1;
 		}
 
diff --git a/kernel/power/dasum_microk_power8.c b/kernel/power/dasum_microk_power8.c
index cc38c4f..880d7d2 100644
--- a/kernel/power/dasum_microk_power8.c
+++ b/kernel/power/dasum_microk_power8.c
@@ -34,144 +34,142 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 **************************************************************************************/
 
 #define HAVE_KERNEL_16 1
-static void dasum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
 
-static void dasum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec)
+static double dasum_kernel_16 (long n, double *x)
 {
-
-
-	BLASLONG i = n;
-	BLASLONG o16 = 16;
-	BLASLONG o32 = 32;
-	BLASLONG o48 = 48;
-	BLASLONG o64 = 64;
-	BLASLONG o80 = 80;
-	BLASLONG o96 = 96;
-	BLASLONG o112 = 112;
-	FLOAT *x1=x;
-	BLASLONG pre = 384;
-
-	__asm__  __volatile__
-	(
-
-	"dcbt		%2 , %4				    \n\t"
-
-	"xxlxor		32,32,32			    \n\t"
-	"xxlxor		33,33,33			    \n\t"
-	"xxlxor		34,34,34			    \n\t"
-	"xxlxor		35,35,35			    \n\t"
-	"xxlxor		36,36,36			    \n\t"
-	"xxlxor		37,37,37			    \n\t"
-	"xxlxor		38,38,38			    \n\t"
-	"xxlxor		39,39,39			    \n\t"
-
-	"lxvd2x		40, 0, %2			    \n\t"
-	"lxvd2x		41, %5, %2			    \n\t"
-	"lxvd2x		42, %6, %2			    \n\t"
-	"lxvd2x		43, %7, %2			    \n\t"
-	"lxvd2x		44, %8, %2			    \n\t"
-	"lxvd2x		45, %9, %2			    \n\t"
-	"lxvd2x		46, %10, %2			    \n\t"
-	"lxvd2x		47, %11, %2			    \n\t"
-
-	"addi		%2, %2, 128			    \n\t"
-
-	"addic.		%0 , %0	, -16  	 	             \n\t"
-	"ble		2f		             	     \n\t"
-
-	".align 5				            \n\t"
-	"1:				                    \n\t"
-
-	"dcbt		%2 , %4				    \n\t"
-
-	"xvabsdp	48, 40				    \n\t"
-	"xvabsdp	49, 41				    \n\t"
-	"xvabsdp	50, 42				    \n\t"
-	"xvabsdp	51, 43				    \n\t"
-
-	"lxvd2x		40, 0, %2			    \n\t"
-	"lxvd2x		41, %5, %2			    \n\t"
-
-	"xvabsdp	52, 44				    \n\t"
-	"xvabsdp	53, 45				    \n\t"
-
-	"lxvd2x		42, %6, %2			    \n\t"
-	"lxvd2x		43, %7, %2			    \n\t"
-
-	"xvabsdp	54, 46				    \n\t"
-	"xvabsdp	55, 47				    \n\t"
-
-	"lxvd2x		44, %8, %2			    \n\t"
-	"lxvd2x		45, %9, %2			    \n\t"
-
-	"xvadddp	32, 32, 48		    \n\t"
-	"xvadddp	33, 33, 49		    \n\t"
-
-	"lxvd2x		46, %10, %2			    \n\t"
-	"lxvd2x		47, %11, %2			    \n\t"
-
-	"xvadddp	34, 34, 50		    \n\t"
-	"xvadddp	35, 35, 51		    \n\t"
-	"addi		%2, %2, 128			    \n\t"
-	"xvadddp	36, 36, 52		    \n\t"
-	"xvadddp	37, 37, 53		    \n\t"
-	"addic.		%0 , %0	, -16  	 	             \n\t"
-	"xvadddp	38, 38, 54		    \n\t"
-	"xvadddp	39, 39, 55		    \n\t"
-
-	"bgt		1b		             	     \n\t"
-
-	"2:						     \n\t"
-
-
-	"xvabsdp	48, 40				    \n\t"
-	"xvabsdp	49, 41				    \n\t"
-	"xvabsdp	50, 42				    \n\t"
-	"xvabsdp	51, 43				    \n\t"
-	"xvabsdp	52, 44				    \n\t"
-	"xvabsdp	53, 45				    \n\t"
-	"xvabsdp	54, 46				    \n\t"
-	"xvabsdp	55, 47				    \n\t"
-
-	"xvadddp	32, 32, 48		    \n\t"
-	"xvadddp	33, 33, 49		    \n\t"
-	"xvadddp	34, 34, 50		    \n\t"
-	"xvadddp	35, 35, 51		    \n\t"
-	"xvadddp	36, 36, 52		    \n\t"
-	"xvadddp	37, 37, 53		    \n\t"
-	"xvadddp	38, 38, 54		    \n\t"
-	"xvadddp	39, 39, 55		    \n\t"
-
-	"xvadddp	32, 32, 33		     \n\t"
-	"xvadddp	34, 34, 35		     \n\t"
-	"xvadddp	36, 36, 37		     \n\t"
-	"xvadddp	38, 38, 39		     \n\t"
-
-	"xvadddp	32, 32, 34		     \n\t"
-	"xvadddp	36, 36, 38		     \n\t"
-
-	"xvadddp	32, 32, 36		     \n\t"
-
-
-	"stxvd2x	32, 0, %3		     \n\t"
-
-	:
-        : 
-          "r" (i),	// 0	
-	  "r" (n),  	// 1
-          "r" (x1),     // 2
-          "r" (svec),   // 3
-          "r" (pre),    // 4
-	  "r" (o16),	// 5
-	  "r" (o32),	// 6
-	  "r" (o48),    // 7
-          "r" (o64),    // 8
-          "r" (o80),    // 9
-          "r" (o96),    // 10
-          "r" (o112)   // 11
-	: "cr0", "%0", "%2",  "memory"
-	);
-
-} 
+  double sum;
+  __vector double t0;
+  __vector double t1;
+  __vector double t2;
+  __vector double t3;
+
+  __asm__
+    (
+       "dcbt		0, %2		\n\t"
+
+       "xxlxor		32, 32,	32	\n\t"
+       "xxlxor		33, 33,	33	\n\t"
+       "xxlxor		34, 34,	34	\n\t"
+       "xxlxor		35, 35,	35	\n\t"
+       "xxlxor		36, 36,	36	\n\t"
+       "xxlxor		37, 37,	37	\n\t"
+       "xxlxor		38, 38,	38	\n\t"
+       "xxlxor		39, 39,	39	\n\t"
+
+       "lxvd2x		40, 0, %2	\n\t"
+       "lxvd2x		41, %8, %2	\n\t"
+       "lxvd2x		42, %9, %2	\n\t"
+       "lxvd2x		43, %10, %2	\n\t"
+       "lxvd2x		44, %11, %2	\n\t"
+       "lxvd2x		45, %12, %2	\n\t"
+       "lxvd2x		46, %13, %2	\n\t"
+       "lxvd2x		47, %14, %2	\n\t"
+
+       "addi		%2, %2, 128	\n\t"
+
+       "addic.		%1, %1, -16	\n\t"
+       "ble		2f		\n\t"
+
+       ".p2align	5		\n"
+     "1:				\n\t"
+
+       "xvabsdp		48, 40		\n\t"
+       "xvabsdp		49, 41		\n\t"
+       "xvabsdp		50, 42		\n\t"
+       "xvabsdp		51, 43		\n\t"
+
+       "lxvd2x		40, 0, %2	\n\t"
+       "lxvd2x		41, %8, %2	\n\t"
+
+       "xvabsdp		%x3, 44		\n\t"
+       "xvabsdp		%x4, 45		\n\t"
+
+       "lxvd2x		42, %9, %2	\n\t"
+       "lxvd2x		43, %10, %2	\n\t"
+
+       "xvabsdp		%x5, 46		\n\t"
+       "xvabsdp		%x6, 47		\n\t"
+
+       "lxvd2x		44, %11, %2	\n\t"
+       "lxvd2x		45, %12, %2	\n\t"
+
+       "xvadddp		32, 32, 48	\n\t"
+       "xvadddp		33, 33, 49	\n\t"
+
+       "lxvd2x		46, %13, %2	\n\t"
+       "lxvd2x		47, %14, %2	\n\t"
+
+       "xvadddp		34, 34, 50	\n\t"
+       "xvadddp		35, 35, 51	\n\t"
+       "addi		%2, %2, 128	\n\t"
+       "xvadddp		36, 36, %x3	\n\t"
+       "xvadddp		37, 37, %x4	\n\t"
+       "addic.		%1, %1, -16	\n\t"
+       "xvadddp		38, 38, %x5	\n\t"
+       "xvadddp		39, 39, %x6	\n\t"
+
+       "bgt		1b		\n"
+
+     "2:				\n\t"
+
+       "xvabsdp		48, 40		\n\t"
+       "xvabsdp		49, 41		\n\t"
+       "xvabsdp		50, 42		\n\t"
+       "xvabsdp		51, 43		\n\t"
+       "xvabsdp		%x3, 44		\n\t"
+       "xvabsdp		%x4, 45		\n\t"
+       "xvabsdp		%x5, 46		\n\t"
+       "xvabsdp		%x6, 47		\n\t"
+
+       "xvadddp		32, 32, 48	\n\t"
+       "xvadddp		33, 33, 49	\n\t"
+       "xvadddp		34, 34, 50	\n\t"
+       "xvadddp		35, 35, 51	\n\t"
+       "xvadddp		36, 36, %x3	\n\t"
+       "xvadddp		37, 37, %x4	\n\t"
+       "xvadddp		38, 38, %x5	\n\t"
+       "xvadddp		39, 39, %x6	\n\t"
+
+       "xvadddp		32, 32, 33	\n\t"
+       "xvadddp		34, 34, 35	\n\t"
+       "xvadddp		36, 36, 37	\n\t"
+       "xvadddp		38, 38, 39	\n\t"
+
+       "xvadddp		32, 32, 34	\n\t"
+       "xvadddp		36, 36, 38	\n\t"
+
+       "xvadddp		32, 32, 36	\n\t"
+
+       "xxswapd		33, 32		\n\t"
+       "xsadddp		%x0, 32, 33	\n"
+
+     "#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
+     "#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
+     :
+       "=d" (sum),	// 0
+       "+r" (n),	// 1
+       "+b" (x),	// 2
+       "=wa" (t0),	// 3
+       "=wa" (t1),	// 4
+       "=wa" (t2),	// 5
+       "=wa" (t3)	// 6
+     :
+       "m" (*x),
+       "b" (16),	// 8
+       "b" (32),	// 9
+       "b" (48),	// 10
+       "b" (64),	// 11
+       "b" (80),	// 12
+       "b" (96),	// 13
+       "b" (112)	// 14
+     :
+       "cr0",
+       "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+       "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+       "vs48","vs49","vs50","vs51"
+     );
+
+  return sum;
+}
 
 
diff --git a/kernel/power/daxpy.c b/kernel/power/daxpy.c
index 4365bd8..df0572e 100644
--- a/kernel/power/daxpy.c
+++ b/kernel/power/daxpy.c
@@ -43,21 +43,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #ifndef HAVE_KERNEL_8
 
-static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
 {
 	BLASLONG register i = 0;
-	FLOAT a = *alpha;
 
 	while(i < n)
         {
-              y[i]   += a * x[i];
-              y[i+1] += a * x[i+1];
-              y[i+2] += a * x[i+2];
-              y[i+3] += a * x[i+3];
-              y[i+4] += a * x[i+4];
-              y[i+5] += a * x[i+5];
-              y[i+6] += a * x[i+6];
-              y[i+7] += a * x[i+7];
+              y[i]   += alpha * x[i];
+              y[i+1] += alpha * x[i+1];
+              y[i+2] += alpha * x[i+2];
+              y[i+3] += alpha * x[i+3];
+              y[i+4] += alpha * x[i+4];
+              y[i+5] += alpha * x[i+5];
+              y[i+6] += alpha * x[i+6];
+              y[i+7] += alpha * x[i+7];
               i+=8 ;
 
        }
@@ -70,11 +69,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 {
 	BLASLONG i=0;
 	BLASLONG ix=0,iy=0;
-	FLOAT a2[4];
-	a2[0]=da;
-	a2[1]=da;
-	a2[2]=da;
-	a2[3]=da;
 
 	if ( n <= 0 )  return(0);
 
@@ -84,7 +78,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 		BLASLONG n1 = n & -16;
 
 		if ( n1 )
-			daxpy_kernel_8(n1, x, y , a2 );
+			daxpy_kernel_8(n1, x, y, da);
 
 		i = n1;
 		while(i < n)
diff --git a/kernel/power/daxpy_microk_power8.c b/kernel/power/daxpy_microk_power8.c
index bb3f73a..fb714a3 100644
--- a/kernel/power/daxpy_microk_power8.c
+++ b/kernel/power/daxpy_microk_power8.c
@@ -35,167 +35,183 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #define HAVE_KERNEL_8 1
-static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
 
-static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
 {
+  __vector double t0;
+  __vector double t1;
+  __vector double t2;
+  __vector double t3;
+  __vector double t4;
+  __vector double t5;
+  __vector double t6;
+  __vector double t7;
+  __vector double t8;
+  __vector double t9;
+  __vector double t10;
+  __vector double t11;
+  __vector double t12;
+  __vector double t13;
+  __vector double t14;
+  __vector double t15;
+  __vector double t16;
 
+  __asm__
+    (
+       "xxspltd		%x4, %x22, 0	\n\t"
 
-	BLASLONG i = n;
-	BLASLONG o16 = 16;
-	BLASLONG o32 = 32;
-	BLASLONG o48 = 48;
-	FLOAT *x1=x;
-	FLOAT *y1=y;
-	FLOAT *y2=y+1;
-	BLASLONG pre = 384;
+       "dcbt		0, %2		\n\t"
+       "dcbt		0, %3		\n\t"
 
-	__asm__  __volatile__
-	(
+       "lxvd2x		%x5, 0, %2	\n\t"
+       "lxvd2x		%x6, %23, %2	\n\t"
+       "lxvd2x		%x7, %24, %2	\n\t"
+       "lxvd2x		%x8, %25, %2	\n\t"
 
-	"lxsdx		33, %5, %4			    \n\t"
-	"xxspltd	32, 33, 0			    \n\t"
-	"addi		%8, %8, -8			    \n\t"
+       "lxvd2x		%x13, 0, %3	\n\t"
+       "lxvd2x		%x14, %23, %3	\n\t"
+       "lxvd2x		%x15, %24, %3	\n\t"
+       "lxvd2x		%x16, %25, %3	\n\t"
 
-	"dcbt		%2, %9				    \n\t"
-	"dcbt		%3, %9				    \n\t"
+       "addi		%2, %2, 64	\n\t"
+       "addi		%3, %3, 64	\n\t"
 
-	"lxvd2x		40, 0, %2			    \n\t"
-	"lxvd2x		41, %5, %2			    \n\t"
-	"lxvd2x		42, %6, %2			    \n\t"
-	"lxvd2x		43, %7, %2			    \n\t"
+       "lxvd2x		%x9, 0, %2	\n\t"
+       "lxvd2x		%x10, %23, %2	\n\t"
+       "lxvd2x		%x11, %24, %2	\n\t"
+       "lxvd2x		%x12, %25, %2	\n\t"
 
-	"lxvd2x		48, 0, %3			    \n\t"
-	"lxvd2x		49, %5, %3			    \n\t"
-	"lxvd2x		50, %6, %3			    \n\t"
-	"lxvd2x		51, %7, %3			    \n\t"
-
-	"addi		%2, %2, 64			    \n\t"
-	"addi		%3, %3, 64			    \n\t"
-
-	"lxvd2x		44, 0, %2			    \n\t"
-	"lxvd2x		45, %5, %2			    \n\t"
-	"lxvd2x		46, %6, %2			    \n\t"
-	"lxvd2x		47, %7, %2			    \n\t"
-
-	"lxvd2x		52, 0, %3			    \n\t"
-	"lxvd2x		53, %5, %3			    \n\t"
-	"lxvd2x		54, %6, %3			    \n\t"
-	"lxvd2x		55, %7, %3			    \n\t"
-
-	"addi		%2, %2, 64			    \n\t"
-	"addi		%3, %3, 64			    \n\t"
-
-	"addic.		%0 , %0	, -16  	 	             \n\t"
-	"ble		2f		             	     \n\t"
-
-	".align 5				            \n\t"
-	"1:				                    \n\t"
-
-	"dcbt		%2, %9				    \n\t"
-	"dcbt		%3, %9				    \n\t"
-
-	"xvmaddadp	48, 40, 32		    	    \n\t"
-	"xvmaddadp	49, 41, 32		    	    \n\t"
-
-	"lxvd2x		40, 0, %2			    \n\t"
-	"lxvd2x		41, %5, %2			    \n\t"
-
-	"stxvd2x	48,  0, %8			    \n\t"
-	"stxvd2x	49, %5, %8			    \n\t"
-
-	"xvmaddadp	50, 42, 32		    	    \n\t"
-	"xvmaddadp	51, 43, 32		    	    \n\t"
-
-	"lxvd2x		42, %6, %2			    \n\t"
-	"lxvd2x		43, %7, %2			    \n\t"
-
-	"stxvd2x	50, %6, %8			    \n\t"
-	"stxvd2x	51, %7, %8			    \n\t"
-
-	"lxvd2x		48, 0, %3			    \n\t"
-	"lxvd2x		49, %5, %3			    \n\t"
-	"lxvd2x		50, %6, %3			    \n\t"
-	"lxvd2x		51, %7, %3			    \n\t"
-
-	"addi		%2, %2, 64			    \n\t"
-	"addi		%8, %8, 64			    \n\t"
-
-	"xvmaddadp	52, 44, 32		    	    \n\t"
-	"addi		%3, %3, 64			    \n\t"
-	"xvmaddadp	53, 45, 32		    	    \n\t"
-
-	"lxvd2x		44, 0, %2			    \n\t"
-	"lxvd2x		45, %5, %2			    \n\t"
-
-	"stxvd2x	52,  0, %8			    \n\t"
-	"stxvd2x	53, %5, %8			    \n\t"
-
-	"xvmaddadp	54, 46, 32		    	    \n\t"
-	"xvmaddadp	55, 47, 32		    	    \n\t"
-
-	"lxvd2x		46, %6, %2			    \n\t"
-	"lxvd2x		47, %7, %2			    \n\t"
-
-	"stxvd2x	54, %6, %8			    \n\t"
-	"stxvd2x	55, %7, %8			    \n\t"
-
-	"addi		%2, %2, 64			    \n\t"
-	"addi		%8, %8, 64			    \n\t"
-
-	"lxvd2x		52, 0, %3			    \n\t"
-	"lxvd2x		53, %5, %3			    \n\t"
-	"lxvd2x		54, %6, %3			    \n\t"
-	"lxvd2x		55, %7, %3			    \n\t"
-
-	"addi		%3, %3, 64			    \n\t"
-
-
-	"addic.		%0 , %0	, -16  	 	             \n\t"
-	"bgt		1b		             	     \n\t"
-
-	"2:						     \n\t"
-
-	
-	"xvmaddadp	48, 40, 32		    	    \n\t"
-	"xvmaddadp	49, 41, 32		    	    \n\t"
-	"xvmaddadp	50, 42, 32		    	    \n\t"
-	"xvmaddadp	51, 43, 32		    	    \n\t"
-
-	"xvmaddadp	52, 44, 32		    	    \n\t"
-	"xvmaddadp	53, 45, 32		    	    \n\t"
-	"xvmaddadp	54, 46, 32		    	    \n\t"
-	"xvmaddadp	55, 47, 32		    	    \n\t"
-
-	"stxvd2x	48,  0, %8			    \n\t"
-	"stxvd2x	49, %5, %8			    \n\t"
-	"stxvd2x	50, %6, %8			    \n\t"
-	"stxvd2x	51, %7, %8			    \n\t"
-
-	"addi		%8, %8, 64			    \n\t"
-
-	"stxvd2x	52,  0, %8			    \n\t"
-	"stxvd2x	53, %5, %8			    \n\t"
-	"stxvd2x	54, %6, %8			    \n\t"
-	"stxvd2x	55, %7, %8			    \n\t"
-
-	"addi		%8, %8, 64			    \n\t"
-
-	:
-        : 
-          "r" (i),	// 0	
-	  "r" (n),  	// 1
-          "r" (x1),     // 2
-          "r" (y1),     // 3
-          "r" (alpha),    // 4
-	  "r" (o16),	// 5
-	  "r" (o32),	// 6
-	  "r" (o48),    // 7
-	  "r" (y2),     // 8
-	  "r" (pre)	// 9
-	: "cr0", "%0", "%2" , "%3", "%8", "memory"
-	);
-
-} 
+       "lxvd2x		%x17, 0, %3	\n\t"
+       "lxvd2x		%x18, %23, %3	\n\t"
+       "lxvd2x		%x19, %24, %3	\n\t"
+       "lxvd2x		%x20, %25, %3	\n\t"
+
+       "addi		%2, %2, 64	\n\t"
+       "addi		%3, %3, -64	\n\t"
+
+       "addic.		%1, %1, -16	\n\t"
+       "ble		2f		\n\t"
+
+       ".align 5			\n"
+     "1:				\n\t"
+
+       "xvmaddadp	%x13, %x5, %x4	\n\t"
+       "xvmaddadp	%x14, %x6, %x4	\n\t"
+
+       "lxvd2x		%x5, 0, %2	\n\t"
+       "lxvd2x		%x6, %23, %2	\n\t"
+
+       "stxvd2x		%x13, 0, %3	\n\t"
+       "stxvd2x		%x14, %23, %3	\n\t"
+
+       "xvmaddadp	%x15, %x7, %x4	\n\t"
+       "xvmaddadp	%x16, %x8, %x4	\n\t"
+
+       "lxvd2x		%x7, %24, %2	\n\t"
+       "lxvd2x		%x8, %25, %2	\n\t"
+
+       "stxvd2x		%x15, %24, %3	\n\t"
+       "stxvd2x		%x16, %25, %3	\n\t"
+
+       "addi		%2, %2, 64	\n\t"
+       "addi		%3, %3, 128	\n\t"
+
+       "lxvd2x		%x13, 0, %3	\n\t"
+       "lxvd2x		%x14, %23, %3	\n\t"
+       "lxvd2x		%x15, %24, %3	\n\t"
+       "lxvd2x		%x16, %25, %3	\n\t"
+
+       "addi		%3, %3, -64	\n\t"
+
+       "xvmaddadp	%x17, %x9, %x4	\n\t"
+       "xvmaddadp	%x18, %x10, %x4	\n\t"
+
+       "lxvd2x		%x9, 0, %2	\n\t"
+       "lxvd2x		%x10, %23, %2	\n\t"
+
+       "stxvd2x		%x17, 0, %3	\n\t"
+       "stxvd2x		%x18, %23, %3	\n\t"
+
+       "xvmaddadp	%x19, %x11, %x4	\n\t"
+       "xvmaddadp	%x20, %x12, %x4	\n\t"
+
+       "lxvd2x		%x11, %24, %2	\n\t"
+       "lxvd2x		%x12, %25, %2	\n\t"
+
+       "stxvd2x		%x19, %24, %3	\n\t"
+       "stxvd2x		%x20, %25, %3	\n\t"
+
+       "addi		%2, %2, 64	\n\t"
+       "addi		%3, %3, 128	\n\t"
+
+       "lxvd2x		%x17, 0, %3	\n\t"
+       "lxvd2x		%x18, %23, %3	\n\t"
+       "lxvd2x		%x19, %24, %3	\n\t"
+       "lxvd2x		%x20, %25, %3	\n\t"
+
+       "addi		%3, %3, -64	\n\t"
+
+       "addic.		%1, %1, -16	\n\t"
+       "bgt		1b		\n"
+
+     "2:				\n\t"
+
+       "xvmaddadp	%x13, %x5, %x4	\n\t"
+       "xvmaddadp	%x14, %x6, %x4	\n\t"
+       "xvmaddadp	%x15, %x7, %x4	\n\t"
+       "xvmaddadp	%x16, %x8, %x4	\n\t"
+
+       "xvmaddadp	%x17, %x9, %x4	\n\t"
+       "xvmaddadp	%x18, %x10, %x4	\n\t"
+       "xvmaddadp	%x19, %x11, %x4	\n\t"
+       "xvmaddadp	%x20, %x12, %x4	\n\t"
+
+       "stxvd2x		%x13, 0, %3	\n\t"
+       "stxvd2x		%x14, %23, %3	\n\t"
+       "stxvd2x		%x15, %24, %3	\n\t"
+       "stxvd2x		%x16, %25, %3	\n\t"
+
+       "addi		%3, %3, 64	\n\t"
+
+       "stxvd2x		%x17, 0, %3	\n\t"
+       "stxvd2x		%x18, %23, %3	\n\t"
+       "stxvd2x		%x19, %24, %3	\n\t"
+       "stxvd2x		%x20, %25, %3	\n"
+
+     "#n=%1 x=%21=%2 y=%0=%3 alpha=%22 o16=%23 o32=%24 o48=%25\n"
+     "#t0=%x4 t1=%x5 t2=%x6 t3=%x7 t4=%x8 t5=%x9 t6=%x10 t7=%x11 t8=%x12 t9=%x13 t10=%x14 t11=%x15 t12=%x16 t13=%x17 t14=%x18 t15=%x19 t16=%x20"
+     :
+       "+m" (*y),
+       "+r" (n),	// 1
+       "+b" (x),	// 2
+       "+b" (y),	// 3
+       "=wa" (t0),	// 4
+       "=wa" (t1),	// 5
+       "=wa" (t2),	// 6
+       "=wa" (t3),	// 7
+       "=wa" (t4),	// 8
+       "=wa" (t5),	// 9
+       "=wa" (t6),	// 10
+       "=wa" (t7),	// 11
+       "=wa" (t8),	// 12
+       "=wa" (t9),	// 13
+       "=wa" (t10),	// 14
+       "=wa" (t11),	// 15
+       "=wa" (t12),	// 16
+       "=wa" (t13),	// 17
+       "=wa" (t14),	// 18
+       "=wa" (t15),	// 19
+       "=wa" (t16)	// 20
+     :
+       "m" (*x),
+       "d" (alpha),	// 22
+       "b" (16),	// 23
+       "b" (32),	// 24
+       "b" (48)		// 25
+     :
+       "cr0"
+     );
+
+}
 
 
diff --git a/kernel/power/dcopy_microk_power8.c b/kernel/power/dcopy_microk_power8.c
index 04f7db5..261dc04 100644
--- a/kernel/power/dcopy_microk_power8.c
+++ b/kernel/power/dcopy_microk_power8.c
@@ -35,140 +35,121 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define HAVE_KERNEL_32 1
 
-static void dcopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void dcopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
+static void dcopy_kernel_32 (long n, double *x, double *y)
 {
-
-
-	BLASLONG i = n;
-	BLASLONG o16 = 16;
-	BLASLONG o32 = 32;
-	BLASLONG o48 = 48;
-	BLASLONG o64 = 64;
-	BLASLONG o80 = 80;
-	BLASLONG o96 = 96;
-	BLASLONG o112 = 112;
-	FLOAT *x1=x;
-	FLOAT *y1=y;
-	BLASLONG pre = 384;
-	BLASLONG alpha=0;
-
-	__asm__  __volatile__
-	(
-
-	"lxvd2x		40, 0, %2			    \n\t"
-	"lxvd2x		41, %5, %2			    \n\t"
-	"lxvd2x		42, %6, %2			    \n\t"
-	"lxvd2x		43, %7, %2			    \n\t"
-	"lxvd2x		44, %8, %2			    \n\t"
-	"lxvd2x		45, %9, %2			    \n\t"
-	"lxvd2x		46, %10, %2			    \n\t"
-	"lxvd2x		47, %11, %2			    \n\t"
-
-	"addi		%2, %2, 128			    \n\t"
-
-	"lxvd2x		50, 0, %2			    \n\t"
-	"lxvd2x		51, %5, %2			    \n\t"
-	"lxvd2x		52, %6, %2			    \n\t"
-	"lxvd2x		53, %7, %2			    \n\t"
-	"lxvd2x		54, %8, %2			    \n\t"
-	"lxvd2x		55, %9, %2			    \n\t"
-	"lxvd2x		56, %10, %2			    \n\t"
-	"lxvd2x		57, %11, %2			    \n\t"
-
-	"addi		%2, %2, 128			    \n\t"
-
-	"addic.		%0 , %0	, -32  	 	             \n\t"
-	"ble		2f		             	     \n\t"
-
-	".align 5				            \n\t"
-	"1:				                    \n\t"
-
-	"stxvd2x		40, 0, %1			    \n\t"
-	"stxvd2x		41, %5, %1			    \n\t"
-	"lxvd2x		40, 0, %2			    \n\t"
-	"lxvd2x		41, %5, %2			    \n\t"
-	"stxvd2x		42, %6, %1			    \n\t"
-	"stxvd2x		43, %7, %1			    \n\t"
-	"lxvd2x		42, %6, %2			    \n\t"
-	"lxvd2x		43, %7, %2			    \n\t"
-	"stxvd2x		44, %8, %1			    \n\t"
-	"stxvd2x		45, %9, %1			    \n\t"
-	"lxvd2x		44, %8, %2			    \n\t"
-	"lxvd2x		45, %9, %2			    \n\t"
-	"stxvd2x		46, %10, %1			    \n\t"
-	"stxvd2x		47, %11, %1			    \n\t"
-	"lxvd2x		46, %10, %2			    \n\t"
-	"lxvd2x		47, %11, %2			    \n\t"
-
-
-	"addi		%1, %1, 128			    \n\t"
-	"addi		%2, %2, 128			    \n\t"
-
-	"stxvd2x		50, 0, %1			    \n\t"
-	"stxvd2x		51, %5, %1			    \n\t"
-	"lxvd2x		50, 0, %2			    \n\t"
-	"lxvd2x		51, %5, %2			    \n\t"
-	"stxvd2x		52, %6, %1			    \n\t"
-	"stxvd2x		53, %7, %1			    \n\t"
-	"lxvd2x		52, %6, %2			    \n\t"
-	"lxvd2x		53, %7, %2			    \n\t"
-	"stxvd2x		54, %8, %1			    \n\t"
-	"stxvd2x		55, %9, %1			    \n\t"
-	"lxvd2x		54, %8, %2			    \n\t"
-	"lxvd2x		55, %9, %2			    \n\t"
-	"stxvd2x		56, %10, %1			    \n\t"
-	"stxvd2x		57, %11, %1			    \n\t"
-	"lxvd2x		56, %10, %2			    \n\t"
-	"lxvd2x		57, %11, %2			    \n\t"
-
-	"addi		%1, %1, 128			    \n\t"
-	"addi		%2, %2, 128			    \n\t"
-
-	"addic.		%0 , %0	, -32  	 	             \n\t"
-	"bgt		1b		             	     \n\t"
-
-	"2:						     \n\t"
-
-	"stxvd2x		40, 0, %1			    \n\t"
-	"stxvd2x		41, %5, %1			    \n\t"
-	"stxvd2x		42, %6, %1			    \n\t"
-	"stxvd2x		43, %7, %1			    \n\t"
-	"stxvd2x		44, %8, %1			    \n\t"
-	"stxvd2x		45, %9, %1			    \n\t"
-	"stxvd2x		46, %10, %1			    \n\t"
-	"stxvd2x		47, %11, %1			    \n\t"
-
-	"addi		%1, %1, 128			    \n\t"
-
-	"stxvd2x		50, 0, %1			    \n\t"
-	"stxvd2x		51, %5, %1			    \n\t"
-	"stxvd2x		52, %6, %1			    \n\t"
-	"stxvd2x		53, %7, %1			    \n\t"
-	"stxvd2x		54, %8, %1			    \n\t"
-	"stxvd2x		55, %9, %1			    \n\t"
-	"stxvd2x		56, %10, %1			    \n\t"
-	"stxvd2x		57, %11, %1			    \n\t"
-
-
-	:
-        : 
-          "r" (i),	// 0	
-	  "r" (y1),  	// 1
-          "r" (x1),     // 2
-          "r" (alpha),  // 3
-          "r" (pre),    // 4
-	  "r" (o16),	// 5
-	  "r" (o32),	// 6
-	  "r" (o48),    // 7
-          "r" (o64),    // 8
-          "r" (o80),    // 9
-          "r" (o96),    // 10
-          "r" (o112)    // 11
	: "cr0", "%0", "%2" , "%1", "memory"
-	);
-
-} 
-
-
+  __asm__
+    (
+       "lxvd2x		32, 0, %2	\n\t"
+       "lxvd2x		33, %5, %2	\n\t"
+       "lxvd2x		34, %6, %2	\n\t"
+       "lxvd2x		35, %7, %2	\n\t"
+       "lxvd2x		36, %8, %2	\n\t"
+       "lxvd2x		37, %9, %2	\n\t"
+       "lxvd2x		38, %10, %2	\n\t"
+       "lxvd2x		39, %11, %2	\n\t"
+
+       "addi		%2, %2, 128	\n\t"
+
+       "lxvd2x		40, 0, %2	\n\t"
+       "lxvd2x		41, %5, %2	\n\t"
+       "lxvd2x		42, %6, %2	\n\t"
+       "lxvd2x		43, %7, %2	\n\t"
+       "lxvd2x		44, %8, %2	\n\t"
+       "lxvd2x		45, %9, %2	\n\t"
+       "lxvd2x		46, %10, %2	\n\t"
+       "lxvd2x		47, %11, %2	\n\t"
+
+       "addi		%2, %2, 128	\n\t"
+
+       "addic.		%1, %1, -32	\n\t"
+       "ble		2f		\n\t"
+
+       ".p2align	5		\n"
+     "1:				\n\t"
+
+       "stxvd2x		32, 0, %3	\n\t"
+       "stxvd2x		33, %5, %3	\n\t"
+       "lxvd2x		32, 0, %2	\n\t"
+       "lxvd2x		33, %5, %2	\n\t"
+       "stxvd2x		34, %6, %3	\n\t"
+       "stxvd2x		35, %7, %3	\n\t"
+       "lxvd2x		34, %6, %2	\n\t"
+       "lxvd2x		35, %7, %2	\n\t"
+       "stxvd2x		36, %8, %3	\n\t"
+       "stxvd2x		37, %9, %3	\n\t"
+       "lxvd2x		36, %8, %2	\n\t"
+       "lxvd2x		37, %9, %2	\n\t"
+       "stxvd2x		38, %10, %3	\n\t"
+       "stxvd2x		39, %11, %3	\n\t"
+       "lxvd2x		38, %10, %2	\n\t"
+       "lxvd2x		39, %11, %2	\n\t"
+
+       "addi		%3, %3, 128	\n\t"
+       "addi		%2, %2, 128	\n\t"
+
+       "stxvd2x		40, 0, %3	\n\t"
+       "stxvd2x		41, %5, %3	\n\t"
+       "lxvd2x		40, 0, %2	\n\t"
+       "lxvd2x		41, %5, %2	\n\t"
+       "stxvd2x		42, %6, %3	\n\t"
+       "stxvd2x		43, %7, %3	\n\t"
+       "lxvd2x		42, %6, %2	\n\t"
+       "lxvd2x		43, %7, %2	\n\t"
+       "stxvd2x		44, %8, %3	\n\t"
+       "stxvd2x		45, %9, %3	\n\t"
+       "lxvd2x		44, %8, %2	\n\t"
+       "lxvd2x		45, %9, %2	\n\t"
+       "stxvd2x		46, %10, %3	\n\t"
+       "stxvd2x		47, %11, %3	\n\t"
+       "lxvd2x		46, %10, %2	\n\t"
+       "lxvd2x		47, %11, %2	\n\t"
+
+       "addi		%3, %3, 128	\n\t"
+       "addi		%2, %2, 128	\n\t"
+
+       "addic.		%1, %1, -32	\n\t"
+       "bgt		1b		\n"
+
+     "2:				\n\t"
+
+       "stxvd2x		32, 0, %3	\n\t"
+       "stxvd2x		33, %5, %3	\n\t"
+       "stxvd2x		34, %6, %3	\n\t"
+       "stxvd2x		35, %7, %3	\n\t"
+       "stxvd2x		36, %8, %3	\n\t"
+       "stxvd2x		37, %9, %3	\n\t"
+       "stxvd2x		38, %10, %3	\n\t"
+       "stxvd2x		39, %11, %3	\n\t"
+
+       "addi		%3, %3, 128	\n\t"
+
+       "stxvd2x		40, 0, %3	\n\t"
+       "stxvd2x		41, %5, %3	\n\t"
+       "stxvd2x		42, %6, %3	\n\t"
+       "stxvd2x		43, %7, %3	\n\t"
+       "stxvd2x		44, %8, %3	\n\t"
+       "stxvd2x		45, %9, %3	\n\t"
+       "stxvd2x		46, %10, %3	\n\t"
+       "stxvd2x		47, %11, %3	\n"
+
+     "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
+     :
+       "=m" (*y),
+       "+r" (n),	// 1
+       "+b" (x),	// 2
+       "+b" (y)		// 3
+     :
+       "m" (*x),
+       "b" (16),	// 5
+       "b" (32),	// 6
+       "b" (48),	// 7
+       "b" (64),	// 8
+       "b" (80),	// 9
+       "b" (96),	// 10
+       "b" (112)	// 11
+     :
+       "cr0",
+       "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+       "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
+     );
+}
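
Note the "b" constraints above: in addi, and in the RA field of the
indexed load/store instructions, an encoding of 0 means the literal
value zero rather than r0, so any gpr operand used in such a slot
must exclude r0.  A made-up two-liner (not from the patch) showing
the hazard:

	static long bump (long p)
	{
	  long r;
	  /* With "r" instead of "b", gcc could choose r0 for p and
	     the addi would compute 0 + 8 rather than p + 8.  "b"
	     restricts the choice to r1..r31.  */
	  __asm__ ("addi %0, %1, 8" : "=r" (r) : "b" (p));
	  return r;
	}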
diff --git a/kernel/power/ddot.c b/kernel/power/ddot.c
index cef60a2..e43470e 100644
--- a/kernel/power/ddot.c
+++ b/kernel/power/ddot.c
@@ -43,7 +43,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #ifndef HAVE_KERNEL_8
 
-static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
+static FLOAT ddot_kernel_8 (BLASLONG n, FLOAT *x, FLOAT *y)
 {
 	BLASLONG register i = 0;
 	FLOAT dot = 0.0;
@@ -62,8 +62,7 @@ static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
               i+=8 ;
 
        }
-       *d += dot;
-
+       return dot;
 }
 
 #endif
@@ -83,7 +82,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 		BLASLONG n1 = n & -16;
 
 		if ( n1 )
-			ddot_kernel_8(n1, x, y , &dot );
+			dot = ddot_kernel_8(n1, x, y);
 
 		i = n1;
 		while(i < n)
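
In ddot_microk_power8.c below, the kernel now accumulates and
reduces the dot product entirely in assembly and returns the scalar
result.  The horizontal sum done there with xvadddp/xxswapd/xsadddp
is equivalent to this C sketch (illustrative only, assuming gcc's
vector extensions and -mvsx):

	static double reduce8 (__vector double a0, __vector double a1,
			       __vector double a2, __vector double a3,
			       __vector double a4, __vector double a5,
			       __vector double a6, __vector double a7)
	{
	  /* Pairwise tree add of the eight accumulators, then add
	     the two doubleword lanes of the result.  */
	  __vector double s = ((a0 + a1) + (a2 + a3))
			      + ((a4 + a5) + (a6 + a7));
	  return s[0] + s[1];
	}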
diff --git a/kernel/power/ddot_microk_power8.c b/kernel/power/ddot_microk_power8.c
index b880492..4e6bc29 100644
--- a/kernel/power/ddot_microk_power8.c
+++ b/kernel/power/ddot_microk_power8.c
@@ -34,145 +34,138 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 **************************************************************************************/
 
 #define HAVE_KERNEL_8 1
-static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
 
-static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
+static double ddot_kernel_8 (long n, double *x, double *y)
 {
-
-
-	BLASLONG i = n;
-	BLASLONG o16 = 16;
-	BLASLONG o32 = 32;
-	BLASLONG o48 = 48;
-	BLASLONG o64 = 64;
-	BLASLONG o80 = 80;
-	BLASLONG o96 = 96;
-	BLASLONG o112 = 112;
-	FLOAT *x1=x;
-	FLOAT *y1=y;
-	BLASLONG pre = 384;
-
-	__asm__  __volatile__
-	(
-	"xxlxor		32,32,32			    \n\t"
-	"xxlxor		33,33,33			    \n\t"
-	"xxlxor		34,34,34			    \n\t"
-	"xxlxor		35,35,35			    \n\t"
-	"xxlxor		36,36,36			    \n\t"
-	"xxlxor		37,37,37			    \n\t"
-	"xxlxor		38,38,38			    \n\t"
-	"xxlxor		39,39,39			    \n\t"
-
-	"dcbt		%2, %12				    \n\t"
-	"dcbt		%3, %12				    \n\t"
-
-	"lxvd2x		40, 0, %2			    \n\t"
-	"lxvd2x		48, 0, %3			    \n\t"
-	"lxvd2x		41, %5, %2			    \n\t"
-	"lxvd2x		49, %5, %3			    \n\t"
-	"lxvd2x		42, %6, %2			    \n\t"
-	"lxvd2x		50, %6, %3			    \n\t"
-	"lxvd2x		43, %7, %2			    \n\t"
-	"lxvd2x		51, %7, %3			    \n\t"
-	"lxvd2x		44, %8, %2			    \n\t"
-	"lxvd2x		52, %8, %3			    \n\t"
-	"lxvd2x		45, %9, %2			    \n\t"
-	"lxvd2x		53, %9, %3			    \n\t"
-	"lxvd2x		46, %10, %2			    \n\t"
-	"lxvd2x		54, %10, %3			    \n\t"
-	"lxvd2x		47, %11, %2			    \n\t"
-	"lxvd2x		55, %11, %3			    \n\t"
-
-	"addi		%2, %2, 128			    \n\t"
-	"addi		%3, %3, 128			    \n\t"
-
-	"addic.		%0 , %0	, -16  	 	             \n\t"
-	"ble		2f		             	     \n\t"
-
-	".align 5				            \n\t"
-	"1:				                    \n\t"
-
-	"dcbt		%2, %12				    \n\t"
-	"dcbt		%3, %12				    \n\t"
-
-	"xvmaddadp	32, 40, 48		    \n\t"
-	"lxvd2x		40, 0, %2			    \n\t"
-	"lxvd2x		48, 0, %3			    \n\t"
-	"xvmaddadp	33, 41, 49		    \n\t"
-	"lxvd2x		41, %5, %2			    \n\t"
-	"lxvd2x		49, %5, %3			    \n\t"
-	"xvmaddadp	34, 42, 50		    \n\t"
-	"lxvd2x		42, %6, %2			    \n\t"
-	"lxvd2x		50, %6, %3			    \n\t"
-	"xvmaddadp	35, 43, 51		    \n\t"
-	"lxvd2x		43, %7, %2			    \n\t"
-	"lxvd2x		51, %7, %3			    \n\t"
-	"xvmaddadp	36, 44, 52		    \n\t"
-	"lxvd2x		44, %8, %2			    \n\t"
-	"lxvd2x		52, %8, %3			    \n\t"
-	"xvmaddadp	37, 45, 53		    \n\t"
-	"lxvd2x		45, %9, %2			    \n\t"
-	"lxvd2x		53, %9, %3			    \n\t"
-	"xvmaddadp	38, 46, 54		    \n\t"
-	"lxvd2x		46, %10, %2			    \n\t"
-	"lxvd2x		54, %10, %3			    \n\t"
-	"xvmaddadp	39, 47, 55		    \n\t"
-
-	"lxvd2x		47, %11, %2			    \n\t"
-	"lxvd2x		55, %11, %3			    \n\t"
-
-
-	"addi		%2, %2, 128			    \n\t"
-	"addi		%3, %3, 128			    \n\t"
-
-	"addic.		%0 , %0	, -16  	 	             \n\t"
-	"bgt		1b		             	     \n\t"
-
-	"2:						     \n\t"
-
-	"xvmaddadp	32, 40, 48		    \n\t"
-	"xvmaddadp	33, 41, 49		    \n\t"
-	"xvmaddadp	34, 42, 50		    \n\t"
-	"xvmaddadp	35, 43, 51		    \n\t"
-	"xvmaddadp	36, 44, 52		    \n\t"
-	"xvmaddadp	37, 45, 53		    \n\t"
-	"xvmaddadp	38, 46, 54		    \n\t"
-	"xvmaddadp	39, 47, 55		    \n\t"
-
-	"xvadddp	32, 32, 33		     \n\t"
-	"xvadddp	34, 34, 35		     \n\t"
-	"xvadddp	36, 36, 37		     \n\t"
-	"xvadddp	38, 38, 39		     \n\t"
-
-	"xvadddp	32, 32, 34		     \n\t"
-	"xvadddp	36, 36, 38		     \n\t"
-
-	"xvadddp	32, 32, 36		     \n\t"
-
-	"xxswapd	33, 32			     \n\t"
-
-	"xsadddp	32, 32, 33		     \n\t"
-
-	"stxsdx		32, 0, %4			     \n\t"
-
-	:
-        : 
-          "r" (i),	// 0	
-	  "r" (n),  	// 1
-          "r" (x1),     // 2
-          "r" (y1),     // 3
-          "r" (dot),    // 4
-	  "r" (o16),	// 5
-	  "r" (o32),	// 6
-	  "r" (o48),    // 7
-          "r" (o64),    // 8
-          "r" (o80),    // 9
-          "r" (o96),    // 10
-          "r" (o112),   // 11
-	  "r" (pre)	// 12
	: "cr0", "%0", "%2" , "%3", "memory"
-	);
-
-} 
-
-
+  double dot;
+  __vector double t0;
+  __vector double t1;
+  __vector double t2;
+  __vector double t3;
+
+  __asm__
+    (
+       "dcbt		0, %2		\n\t"
+       "dcbt		0, %3		\n\t"
+
+       "xxlxor		32, 32,	32	\n\t"
+       "xxlxor		33, 33,	33	\n\t"
+       "xxlxor		34, 34,	34	\n\t"
+       "xxlxor		35, 35,	35	\n\t"
+       "xxlxor		36, 36,	36	\n\t"
+       "xxlxor		37, 37,	37	\n\t"
+       "xxlxor		38, 38,	38	\n\t"
+       "xxlxor		39, 39,	39	\n\t"