138493a
diff -up openssl-1.0.0d/engines/e_padlock.c.padlock64 openssl-1.0.0d/engines/e_padlock.c
138493a
--- openssl-1.0.0d/engines/e_padlock.c.padlock64	2011-01-30 02:05:38.000000000 +0100
138493a
+++ openssl-1.0.0d/engines/e_padlock.c	2011-04-28 21:03:26.000000000 +0200
138493a
@@ -101,10 +101,15 @@
138493a
    compiler choice is limited to GCC and Microsoft C. */
138493a
 #undef COMPILE_HW_PADLOCK
138493a
 #if !defined(I386_ONLY) && !defined(OPENSSL_NO_INLINE_ASM)
138493a
-# if (defined(__GNUC__) && (defined(__i386__) || defined(__i386))) || \
138493a
+# if (defined(__GNUC__) && __GNUC__>=2 && \
138493a
+	(defined(__i386__) || defined(__i386) || \
138493a
+	 defined(__x86_64__) || defined(__x86_64)) \
138493a
+     ) || \
138493a
      (defined(_MSC_VER) && defined(_M_IX86))
138493a
 #  define COMPILE_HW_PADLOCK
138493a
+#  ifdef OPENSSL_NO_DYNAMIC_ENGINE	/* ENGINE_padlock() is only defined under this same guard */
138493a
 static ENGINE *ENGINE_padlock (void);
138493a
+#  endif	/* OPENSSL_NO_DYNAMIC_ENGINE */
138493a
 # endif
138493a
 #endif
138493a
 
138493a
@@ -135,7 +140,7 @@ void ENGINE_load_padlock (void)
138493a
 # endif
138493a
 #elif defined(__GNUC__)
138493a
 # ifndef alloca
138493a
-#  define alloca(s) __builtin_alloca(s)
138493a
+#  define alloca(s) __builtin_alloca((s))
138493a
 # endif
138493a
 #endif
138493a
 
138493a
@@ -197,6 +202,7 @@ padlock_bind_helper(ENGINE *e)
138493a
 	return 1;
138493a
 }
138493a
 
138493a
+#ifdef OPENSSL_NO_DYNAMIC_ENGINE
138493a
 /* Constructor */
138493a
 static ENGINE *
138493a
 ENGINE_padlock(void)
138493a
@@ -214,6 +220,7 @@ ENGINE_padlock(void)
138493a
 
138493a
 	return eng;
138493a
 }
138493a
+#endif
138493a
 
138493a
 /* Check availability of the engine */
138493a
 static int
138493a
@@ -298,6 +305,7 @@ static volatile struct padlock_cipher_da
138493a
  * =======================================================
138493a
  */
138493a
 #if defined(__GNUC__) && __GNUC__>=2
138493a
+#if defined(__i386__) || defined(__i386)
138493a
 /*
138493a
  * As for excessive "push %ebx"/"pop %ebx" found all over.
138493a
  * When generating position-independent code GCC won't let
138493a
@@ -377,21 +385,6 @@ padlock_available(void)
138493a
 	return padlock_use_ace + padlock_use_rng;
138493a
 }
138493a
 
138493a
-#ifndef OPENSSL_NO_AES
138493a
-/* Our own htonl()/ntohl() */
138493a
-static inline void
138493a
-padlock_bswapl(AES_KEY *ks)
138493a
-{
138493a
-	size_t i = sizeof(ks->rd_key)/sizeof(ks->rd_key[0]);
138493a
-	unsigned int *key = ks->rd_key;
138493a
-
138493a
-	while (i--) {
138493a
-		asm volatile ("bswapl %0" : "+r"(*key));
138493a
-		key++;
138493a
-	}
138493a
-}
138493a
-#endif
138493a
-
138493a
 /* Force key reload from memory to the CPU microcode.
138493a
    Loading EFLAGS from the stack clears EFLAGS[30] 
138493a
    which does the trick. */
138493a
@@ -449,12 +442,127 @@ static inline void *name(size_t cnt,		\
138493a
 		: "edx", "cc", "memory");	\
138493a
 	return iv;				\
138493a
 }
138493a
+#endif
138493a
+
138493a
+#elif defined(__x86_64__) || defined(__x86_64)
138493a
+
138493a
+/* Probe CPUID for a Centaur (VIA) CPU and its PadLock units; sets padlock_use_ace
138493a
+   and padlock_use_rng and returns their sum (0 when nothing usable was found). */
138493a
+static int
138493a
+padlock_available(void)
138493a
+{
138493a
+	char vendor_string[16];
138493a
+	unsigned int eax, edx;
138493a
+
138493a
+	/* Are we running on the Centaur (VIA) CPU? */
138493a
+	eax = 0x00000000;
138493a
+	vendor_string[12] = 0;
138493a
+	asm volatile (	/* stores the 12 vendor id bytes through %1, hence the "memory" clobber */
138493a
+		"cpuid\n"
138493a
+		"movl	%%ebx,(%1)\n"
138493a
+		"movl	%%edx,4(%1)\n"
138493a
+		"movl	%%ecx,8(%1)\n"
138493a
+		: "+a"(eax) : "r"(vendor_string) : "rbx", "rcx", "rdx", "memory");
138493a
+	if (strcmp(vendor_string, "CentaurHauls") != 0)
138493a
+		return 0;
138493a
+
138493a
+	/* Check for Centaur Extended Feature Flags presence */
138493a
+	eax = 0xC0000000;
138493a
+	asm volatile ("cpuid"
138493a
+		: "+a"(eax) : : "rbx", "rcx", "rdx");
138493a
+	if (eax < 0xC0000001)
138493a
+		return 0;
138493a
+
138493a
+	/* Read the Centaur Extended Feature Flags */
138493a
+	eax = 0xC0000001;
138493a
+	asm volatile ("cpuid"
138493a
+		: "+a"(eax), "=d"(edx) : : "rbx", "rcx");
138493a
+
138493a
+	/* A unit is used only if both bits of its pair are set: 6-7 for ACE, 2-3 for RNG */
138493a
+	padlock_use_ace = ((edx & (0x3<<6)) == (0x3<<6));
138493a
+	padlock_use_rng = ((edx & (0x3<<2)) == (0x3<<2));
138493a
+
138493a
+	return padlock_use_ace + padlock_use_rng;
138493a
+}
138493a
 
138493a
+/* Force key reload from memory to the CPU microcode.
138493a
+   Loading EFLAGS from the stack clears EFLAGS[30], which does the trick;
138493a
+   %rsp first steps over the 128-byte x86-64 red zone (leaq keeps flags). */
138493a
+static inline void
138493a
+padlock_reload_key(void)
138493a
+{
138493a
+	asm volatile ("leaq -128(%%rsp),%%rsp; pushfq; popfq; leaq 128(%%rsp),%%rsp" : : : "cc");
138493a
+}
138493a
+
138493a
+#ifndef OPENSSL_NO_AES
138493a
+/*
138493a
+ * Heuristic key context tracing: when EFLAGS[30] is set and
138493a
+ * padlock_saved_context differs from cdata, EFLAGS is reloaded from the
138493a
+ * stack (popfq); cdata is then recorded as the saved context. Atomic swap
138493a
+ * instructions are not necessary: if padlock_saved_context was changed by
138493a
+ * another thread after we've read it and before we compare it with cdata,
138493a
+ * our key *shall* be reloaded upon thread context switch anyway. %rsp is
138493a
+ * moved below the 128-byte x86-64 red zone (leaq keeps flags) before pushfq.
138493a
+ */
138493a
+static inline void
138493a
+padlock_verify_context(struct padlock_cipher_data *cdata)
138493a
+{
138493a
+	asm volatile (
138493a
+	"leaq	-128(%%rsp),%%rsp\n	pushfq\n"	/* step over the red zone, then save EFLAGS */
138493a
+"	btl	$30,(%%rsp)\n"
138493a
+"	jnc	1f\n"
138493a
+"	cmpq	%2,%1\n"
138493a
+"	je	1f\n"
138493a
+"	popfq\n"
138493a
+"	subq	$8,%%rsp\n"
138493a
+"1:	leaq	136(%%rsp),%%rsp\n"	/* drop the 8-byte flags slot and undo the 128-byte skip */
138493a
+"	movq	%2,%0"
138493a
+	:"+m"(padlock_saved_context)
138493a
+	: "r"(padlock_saved_context), "r"(cdata) : "cc");
138493a
+}
138493a
+
138493a
+/* Template for padlock_xcrypt_* modes: defines name(cnt, cdata, out, inp), which runs rep_xcrypt and returns the resulting %rax as iv. */
138493a
+/* BIG FAT WARNING: 
138493a
+ * 	The offsets (16 and 32) used with the 'leaq' instructions
138493a
+ * 	describe items of the 'padlock_cipher_data'
138493a
+ * 	structure and must follow any change to its layout.
138493a
+ */
138493a
+#define PADLOCK_XCRYPT_ASM(name,rep_xcrypt)	\
138493a
+static inline void *name(size_t cnt,		\
138493a
+	struct padlock_cipher_data *cdata,	\
138493a
+	void *out, const void *inp) 		\
138493a
+{	void *iv; 				\
138493a
+	asm volatile ( "leaq	16(%0),%%rdx\n"	\
138493a
+		"	leaq	32(%0),%%rbx\n"	\
138493a
+			rep_xcrypt "\n"		\
138493a
+		: "=a"(iv), "=c"(cnt), "=D"(out), "=S"(inp) \
138493a
+		: "0"(cdata), "1"(cnt), "2"(out), "3"(inp)  \
138493a
+		: "rbx", "rdx", "cc", "memory");	\
138493a
+	return iv;				\
138493a
+}
138493a
+#endif
138493a
+
138493a
+#endif	/* cpu */
138493a
+
138493a
+#ifndef OPENSSL_NO_AES
138493a
 /* Generate all functions with appropriate opcodes */
138493a
 PADLOCK_XCRYPT_ASM(padlock_xcrypt_ecb, ".byte 0xf3,0x0f,0xa7,0xc8")	/* rep xcryptecb */
138493a
 PADLOCK_XCRYPT_ASM(padlock_xcrypt_cbc, ".byte 0xf3,0x0f,0xa7,0xd0")	/* rep xcryptcbc */
138493a
 PADLOCK_XCRYPT_ASM(padlock_xcrypt_cfb, ".byte 0xf3,0x0f,0xa7,0xe0")	/* rep xcryptcfb */
138493a
 PADLOCK_XCRYPT_ASM(padlock_xcrypt_ofb, ".byte 0xf3,0x0f,0xa7,0xe8")	/* rep xcryptofb */
138493a
+
138493a
+/* Our own htonl()/ntohl(): byte-swap every word of the key schedule in place */
138493a
+static inline void
138493a
+padlock_bswapl(AES_KEY *ks)
138493a
+{
138493a
+	unsigned int *word = ks->rd_key;
138493a
+	unsigned int *const stop = word + sizeof(ks->rd_key)/sizeof(ks->rd_key[0]);
138493a
+
138493a
+	/* bswapl reverses the byte order of one word held in a register */
138493a
+	for (; word != stop; word++) {
138493a
+		asm volatile ("bswapl %0" : "+r"(*word));
138493a
+	}
138493a
+}
138493a
 #endif
138493a
 
138493a
 /* The RNG call itself */
138493a
@@ -485,8 +593,8 @@ padlock_xstore(void *addr, unsigned int 
138493a
 static inline unsigned char *
138493a
 padlock_memcpy(void *dst,const void *src,size_t n)
138493a
 {
138493a
-	long       *d=dst;
138493a
-	const long *s=src;
138493a
+	size_t       *d=dst;
138493a
+	const size_t *s=src;
138493a
 
138493a
 	n /= sizeof(*d);
138493a
 	do { *d++ = *s++; } while (--n);