diff -up mpich/src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c.vsx mpich/src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c
--- mpich/src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c.vsx 2020-05-27 13:16:25.000000000 +0200
+++ mpich/src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c 2020-09-21 15:23:52.106489501 +0200
@@ -184,7 +184,7 @@
#ifdef DOUBLE_PRECISION_REAL
#define offset 2
#define __SIMD_DATATYPE __vector double
-#define __SIMD_LOAD (__vector double) vec_ld
+#define _SIMD_LOAD (__vector double) vec_ld
#endif
#ifdef SINGLE_PRECISION_REAL
@@ -197,6 +197,7 @@
#define _SIMD_STORE vec_st
#define _SIMD_ADD vec_add
#define _SIMD_MUL vec_mul
+#define _SIMD_SUB vec_sub
#define _SIMD_SET1 vec_splats
#endif /* VEC_SET == SPARC64_SSE */
@@ -1629,7 +1630,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_
#undef ROW_LENGTH
-#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128
+#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128
#ifdef DOUBLE_PRECISION_REAL
#define ROW_LENGTH 6
#define STEP_SIZE 6
@@ -1640,7 +1641,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_
#define STEP_SIZE 12
#define UPPER_BOUND 8
#endif
-#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 */
+#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */
#if VEC_SET == AVX_256 || VEC_SET == AVX2_256
#ifdef DOUBLE_PRECISION_REAL
@@ -1680,14 +1681,14 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_
#undef ROW_LENGTH
-#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128
+#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128
#ifdef DOUBLE_PRECISION_REAL
#define ROW_LENGTH 4
#endif
#ifdef SINGLE_PRECISION_REAL
#define ROW_LENGTH 8
#endif
-#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 */
+#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */
#if VEC_SET == AVX_256 || VEC_SET == AVX2_256
#ifdef DOUBLE_PRECISION_REAL
@@ -1715,14 +1716,14 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_
}
#undef ROW_LENGTH
-#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128
+#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128
#ifdef DOUBLE_PRECISION_REAL
#define ROW_LENGTH 2
#endif
#ifdef SINGLE_PRECISION_REAL
#define ROW_LENGTH 4
#endif
-#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 */
+#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */
#if VEC_SET == AVX_256 || VEC_SET == AVX2_256
#ifdef DOUBLE_PRECISION_REAL
@@ -1772,7 +1773,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_
#ifdef BLOCK6
#undef ROW_LENGTH
-#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128
+#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128
#ifdef DOUBLE_PRECISION_REAL
#define ROW_LENGTH 4
#define STEP_SIZE 4
@@ -1783,7 +1784,7 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_
#define STEP_SIZE 8
#define UPPER_BOUND 4
#endif
-#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 */
+#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */
#if VEC_SET == AVX_256 || VEC_SET == AVX2_256
#ifdef DOUBLE_PRECISION_REAL
@@ -1822,14 +1823,14 @@ void CONCAT_7ARGS(PREFIX,_hh_trafo_real_
}
#undef ROW_LENGTH
-#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128
+#if VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128
#ifdef DOUBLE_PRECISION_REAL
#define ROW_LENGTH 2
#endif
#ifdef SINGLE_PRECISION_REAL
#define ROW_LENGTH 4
#endif
-#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == NEON_ARCH64_128 */
+#endif /* VEC_SET == SSE_128 || VEC_SET == SPARC64_SSE || VEC_SET == VSX_SSE || VEC_SET == NEON_ARCH64_128 */
#if VEC_SET == AVX_256 || VEC_SET == AVX2_256
#ifdef DOUBLE_PRECISION_REAL
diff -up mpich/src/elpa2/kernels/real_vsx_4hv_double_precision.c.vsx mpich/src/elpa2/kernels/real_vsx_4hv_double_precision.c
--- mpich/src/elpa2/kernels/real_vsx_4hv_double_precision.c.vsx 2020-05-27 13:16:25.000000000 +0200
+++ mpich/src/elpa2/kernels/real_vsx_4hv_double_precision.c 2020-09-21 15:15:22.777337971 +0200
@@ -49,11 +49,11 @@
#define REALCASE 1
#define DOUBLE_PRECISION 1
#define BLOCK4 1
-#define SIMD_SET VSX_SSE
+#define VEC_SET VSX_SSE
#include "../../general/precision_macros.h"
-#include "real_vsx_4hv_template.c"
+#include "real_128bit_256bit_512bit_BLOCK_template.c"
#undef BLOCK4
-#undef SIMD_SET
+#undef VEC_SET
#undef REALCASE
#undef DOUBLE_PRECISION
diff -up mpich/src/elpa2/kernels/real_vsx_4hv_single_precision.c.vsx mpich/src/elpa2/kernels/real_vsx_4hv_single_precision.c
--- mpich/src/elpa2/kernels/real_vsx_4hv_single_precision.c.vsx 2020-05-27 13:16:25.000000000 +0200
+++ mpich/src/elpa2/kernels/real_vsx_4hv_single_precision.c 2020-09-21 15:15:22.777337971 +0200
@@ -49,11 +49,11 @@
#define REALCASE 1
#define SINGLE_PRECISION 1
#define BLOCK4 1
-#define SIMD_SET VSX_SSE
+#define VEC_SET VSX_SSE
#include "../../general/precision_macros.h"
-#include "real_vsx_4hv_template.c"
+#include "real_128bit_256bit_512bit_BLOCK_template.c"
#undef BLOCK4
-#undef SIMD_SET
+#undef VEC_SET
#undef REALCASE
#undef SINGLE_PRECISION
diff -up mpich/src/elpa2/kernels/real_vsx_6hv_double_precision.c.vsx mpich/src/elpa2/kernels/real_vsx_6hv_double_precision.c
--- mpich/src/elpa2/kernels/real_vsx_6hv_double_precision.c.vsx 2020-05-27 13:16:25.000000000 +0200
+++ mpich/src/elpa2/kernels/real_vsx_6hv_double_precision.c 2020-09-21 15:15:22.777337971 +0200
@@ -49,11 +49,11 @@
#define REALCASE 1
#define DOUBLE_PRECISION 1
#define BLOCK6 1
-#define SIMD_SET VSX_SSE
+#define VEC_SET VSX_SSE
#include "../../general/precision_macros.h"
-#include "real_vsx_6hv_template.c"
+#include "real_128bit_256bit_512bit_BLOCK_template.c"
#undef BLOCK6
-#undef SIMD_SET
+#undef VEC_SET
#undef REALCASE
#undef DOUBLE_PRECISION
diff -up mpich/src/elpa2/kernels/real_vsx_6hv_single_precision.c.vsx mpich/src/elpa2/kernels/real_vsx_6hv_single_precision.c
--- mpich/src/elpa2/kernels/real_vsx_6hv_single_precision.c.vsx 2020-05-27 13:16:25.000000000 +0200
+++ mpich/src/elpa2/kernels/real_vsx_6hv_single_precision.c 2020-09-21 15:15:22.777337971 +0200
@@ -51,7 +51,7 @@
#define BLOCK6 1
#define VEC_SET VSX_SSE
#include "../../general/precision_macros.h"
-#include "real_vsx_6hv_template.c"
+#include "real_128bit_256bit_512bit_BLOCK_template.c"
#undef VEC_SET
#undef BLOCK6
#undef REALCASE