4e32573
commit 7cb126e7e7febf9dc3e369cc3e4885e34fb9433b
4e32573
Author: Noah Goldstein <goldstein.w.n@gmail.com>
4e32573
Date:   Wed Nov 10 16:18:56 2021 -0600
4e32573
4e32573
    x86: Shrink memcmp-sse4.S code size
4e32573
    
4e32573
    No bug.
4e32573
    
4e32573
    This implementation refactors memcmp-sse4.S primarily with minimizing
4e32573
    code size in mind. It does this by removing the lookup table logic and
4e32573
    removing the unrolled check from (256, 512] bytes.
4e32573
    
4e32573
    memcmp-sse4 code size reduction : -3487 bytes
4e32573
    wmemcmp-sse4 code size reduction: -1472 bytes
4e32573
    
4e32573
    The current memcmp-sse4.S implementation has a large code size
4e32573
    cost. This has serious adverse effects on the ICache / ITLB. While
4e32573
    in micro-benchmarks the implementation appears fast, traces of
4e32573
    real-world code have shown that the speed in micro benchmarks does not
4e32573
    translate when the ICache/ITLB are not primed, and that the cost
4e32573
    of the code size has measurable negative effects on overall
4e32573
    application performance.
4e32573
    
4e32573
    See https://research.google/pubs/pub48320/ for more details.
4e32573
    
4e32573
    Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
4e32573
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
4e32573
    (cherry picked from commit 2f9062d7171850451e6044ef78d91ff8c017b9c0)
4e32573
4e32573
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
4e32573
index b7ac034569ec6178..97c102a9c5ab2b91 100644
4e32573
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
4e32573
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
4e32573
@@ -25,14 +25,14 @@
4e32573
 #  define MEMCMP	__memcmp_sse4_1
4e32573
 # endif
4e32573
 
4e32573
-# define JMPTBL(I, B)	(I - B)
4e32573
+#ifdef USE_AS_WMEMCMP
4e32573
+# define CMPEQ	pcmpeqd
4e32573
+# define CHAR_SIZE	4
4e32573
+#else
4e32573
+# define CMPEQ	pcmpeqb
4e32573
+# define CHAR_SIZE	1
4e32573
+#endif
4e32573
 
4e32573
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
4e32573
-  lea		TABLE(%rip), %r11;				\
4e32573
-  movslq	(%r11, INDEX, SCALE), %rcx;			\
4e32573
-  add		%r11, %rcx;					\
4e32573
-  _CET_NOTRACK jmp *%rcx;					\
4e32573
-  ud2
4e32573
 
4e32573
 /* Warning!
4e32573
            wmemcmp has to use SIGNED comparison for elements.
4e32573
@@ -47,33 +47,253 @@ ENTRY (MEMCMP)
4e32573
 	/* Clear the upper 32 bits.  */
4e32573
 	mov	%edx, %edx
4e32573
 # endif
4e32573
-	pxor	%xmm0, %xmm0
4e32573
 	cmp	$79, %RDX_LP
4e32573
 	ja	L(79bytesormore)
4e32573
+
4e32573
+	cmp	$CHAR_SIZE, %RDX_LP
4e32573
+	jbe	L(firstbyte)
4e32573
+
4e32573
+	/* N in (CHAR_SIZE, 79] bytes.  */
4e32573
+	cmpl	$32, %edx
4e32573
+	ja	L(more_32_bytes)
4e32573
+
4e32573
+	cmpl	$16, %edx
4e32573
+	jae	L(16_to_32_bytes)
4e32573
+
4e32573
 # ifndef USE_AS_WMEMCMP
4e32573
-	cmp	$1, %RDX_LP
4e32573
-	je	L(firstbyte)
4e32573
+	cmpl	$8, %edx
4e32573
+	jae	L(8_to_16_bytes)
4e32573
+
4e32573
+	cmpl	$4, %edx
4e32573
+	jb	L(2_to_3_bytes)
4e32573
+
4e32573
+	movl	(%rdi), %eax
4e32573
+	movl	(%rsi), %ecx
4e32573
+
4e32573
+	bswap	%eax
4e32573
+	bswap	%ecx
4e32573
+
4e32573
+	shlq	$32, %rax
4e32573
+	shlq	$32, %rcx
4e32573
+
4e32573
+	movl	-4(%rdi, %rdx), %edi
4e32573
+	movl	-4(%rsi, %rdx), %esi
4e32573
+
4e32573
+	bswap	%edi
4e32573
+	bswap	%esi
4e32573
+
4e32573
+	orq	%rdi, %rax
4e32573
+	orq	%rsi, %rcx
4e32573
+	subq	%rcx, %rax
4e32573
+	cmovne	%edx, %eax
4e32573
+	sbbl	%ecx, %ecx
4e32573
+	orl	%ecx, %eax
4e32573
+	ret
4e32573
+
4e32573
+	.p2align 4,, 8
4e32573
+L(2_to_3_bytes):
4e32573
+	movzwl	(%rdi), %eax
4e32573
+	movzwl	(%rsi), %ecx
4e32573
+	shll	$8, %eax
4e32573
+	shll	$8, %ecx
4e32573
+	bswap	%eax
4e32573
+	bswap	%ecx
4e32573
+	movzbl	-1(%rdi, %rdx), %edi
4e32573
+	movzbl	-1(%rsi, %rdx), %esi
4e32573
+	orl	%edi, %eax
4e32573
+	orl	%esi, %ecx
4e32573
+	subl	%ecx, %eax
4e32573
+	ret
4e32573
+
4e32573
+	.p2align 4,, 8
4e32573
+L(8_to_16_bytes):
4e32573
+	movq	(%rdi), %rax
4e32573
+	movq	(%rsi), %rcx
4e32573
+
4e32573
+	bswap	%rax
4e32573
+	bswap	%rcx
4e32573
+
4e32573
+	subq	%rcx, %rax
4e32573
+	jne	L(8_to_16_bytes_done)
4e32573
+
4e32573
+	movq	-8(%rdi, %rdx), %rax
4e32573
+	movq	-8(%rsi, %rdx), %rcx
4e32573
+
4e32573
+	bswap	%rax
4e32573
+	bswap	%rcx
4e32573
+
4e32573
+	subq	%rcx, %rax
4e32573
+
4e32573
+L(8_to_16_bytes_done):
4e32573
+	cmovne	%edx, %eax
4e32573
+	sbbl	%ecx, %ecx
4e32573
+	orl	%ecx, %eax
4e32573
+	ret
4e32573
+# else
4e32573
+	xorl	%eax, %eax
4e32573
+	movl	(%rdi), %ecx
4e32573
+	cmpl	(%rsi), %ecx
4e32573
+	jne	L(8_to_16_bytes_done)
4e32573
+	movl	4(%rdi), %ecx
4e32573
+	cmpl	4(%rsi), %ecx
4e32573
+	jne	L(8_to_16_bytes_done)
4e32573
+	movl	-4(%rdi, %rdx), %ecx
4e32573
+	cmpl	-4(%rsi, %rdx), %ecx
4e32573
+	jne	L(8_to_16_bytes_done)
4e32573
+	ret
4e32573
 # endif
4e32573
-	add	%rdx, %rsi
4e32573
-	add	%rdx, %rdi
4e32573
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
4e32573
 
4e32573
-# ifndef USE_AS_WMEMCMP
4e32573
-	.p2align 4
4e32573
+	.p2align 4,, 3
4e32573
+L(ret_zero):
4e32573
+	xorl	%eax, %eax
4e32573
+L(zero):
4e32573
+	ret
4e32573
+
4e32573
+	.p2align 4,, 8
4e32573
 L(firstbyte):
4e32573
+	jb	L(ret_zero)
4e32573
+# ifdef USE_AS_WMEMCMP
4e32573
+	xorl	%eax, %eax
4e32573
+	movl	(%rdi), %ecx
4e32573
+	cmpl	(%rsi), %ecx
4e32573
+	je	L(zero)
4e32573
+L(8_to_16_bytes_done):
4e32573
+	setg	%al
4e32573
+	leal	-1(%rax, %rax), %eax
4e32573
+# else
4e32573
 	movzbl	(%rdi), %eax
4e32573
 	movzbl	(%rsi), %ecx
4e32573
 	sub	%ecx, %eax
4e32573
+# endif
4e32573
 	ret
4e32573
+
4e32573
+	.p2align 4
4e32573
+L(vec_return_begin_48):
4e32573
+	addq	$16, %rdi
4e32573
+	addq	$16, %rsi
4e32573
+L(vec_return_begin_32):
4e32573
+	bsfl	%eax, %eax
4e32573
+# ifdef USE_AS_WMEMCMP
4e32573
+	movl	32(%rdi, %rax), %ecx
4e32573
+	xorl	%edx, %edx
4e32573
+	cmpl	32(%rsi, %rax), %ecx
4e32573
+	setg	%dl
4e32573
+	leal	-1(%rdx, %rdx), %eax
4e32573
+# else
4e32573
+	movzbl	32(%rsi, %rax), %ecx
4e32573
+	movzbl	32(%rdi, %rax), %eax
4e32573
+	subl	%ecx, %eax
4e32573
+# endif
4e32573
+	ret
4e32573
+
4e32573
+	.p2align 4
4e32573
+L(vec_return_begin_16):
4e32573
+	addq	$16, %rdi
4e32573
+	addq	$16, %rsi
4e32573
+L(vec_return_begin):
4e32573
+	bsfl	%eax, %eax
4e32573
+# ifdef USE_AS_WMEMCMP
4e32573
+	movl	(%rdi, %rax), %ecx
4e32573
+	xorl	%edx, %edx
4e32573
+	cmpl	(%rsi, %rax), %ecx
4e32573
+	setg	%dl
4e32573
+	leal	-1(%rdx, %rdx), %eax
4e32573
+# else
4e32573
+	movzbl	(%rsi, %rax), %ecx
4e32573
+	movzbl	(%rdi, %rax), %eax
4e32573
+	subl	%ecx, %eax
4e32573
+# endif
4e32573
+	ret
4e32573
+
4e32573
+	.p2align 4
4e32573
+L(vec_return_end_16):
4e32573
+	subl	$16, %edx
4e32573
+L(vec_return_end):
4e32573
+	bsfl	%eax, %eax
4e32573
+	addl	%edx, %eax
4e32573
+# ifdef USE_AS_WMEMCMP
4e32573
+	movl	-16(%rdi, %rax), %ecx
4e32573
+	xorl	%edx, %edx
4e32573
+	cmpl	-16(%rsi, %rax), %ecx
4e32573
+	setg	%dl
4e32573
+	leal	-1(%rdx, %rdx), %eax
4e32573
+# else
4e32573
+	movzbl	-16(%rsi, %rax), %ecx
4e32573
+	movzbl	-16(%rdi, %rax), %eax
4e32573
+	subl	%ecx, %eax
4e32573
 # endif
4e32573
+	ret
4e32573
+
4e32573
+	.p2align 4,, 8
4e32573
+L(more_32_bytes):
4e32573
+	movdqu	(%rdi), %xmm0
4e32573
+	movdqu	(%rsi), %xmm1
4e32573
+	CMPEQ	%xmm0, %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin)
4e32573
+
4e32573
+	movdqu	16(%rdi), %xmm0
4e32573
+	movdqu	16(%rsi), %xmm1
4e32573
+	CMPEQ	%xmm0, %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin_16)
4e32573
+
4e32573
+	cmpl	$64, %edx
4e32573
+	jbe	L(32_to_64_bytes)
4e32573
+	movdqu	32(%rdi), %xmm0
4e32573
+	movdqu	32(%rsi), %xmm1
4e32573
+	CMPEQ	%xmm0, %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin_32)
4e32573
+
4e32573
+	.p2align 4,, 6
4e32573
+L(32_to_64_bytes):
4e32573
+	movdqu	-32(%rdi, %rdx), %xmm0
4e32573
+	movdqu	-32(%rsi, %rdx), %xmm1
4e32573
+	CMPEQ	%xmm0, %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_end_16)
4e32573
+
4e32573
+	movdqu	-16(%rdi, %rdx), %xmm0
4e32573
+	movdqu	-16(%rsi, %rdx), %xmm1
4e32573
+	CMPEQ	%xmm0, %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_end)
4e32573
+	ret
4e32573
+
4e32573
+	.p2align 4
4e32573
+L(16_to_32_bytes):
4e32573
+	movdqu	(%rdi), %xmm0
4e32573
+	movdqu	(%rsi), %xmm1
4e32573
+	CMPEQ	%xmm0, %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin)
4e32573
+
4e32573
+	movdqu	-16(%rdi, %rdx), %xmm0
4e32573
+	movdqu	-16(%rsi, %rdx), %xmm1
4e32573
+	CMPEQ	%xmm0, %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_end)
4e32573
+	ret
4e32573
+
4e32573
 
4e32573
 	.p2align 4
4e32573
 L(79bytesormore):
4e32573
+	movdqu	(%rdi), %xmm0
4e32573
 	movdqu	(%rsi), %xmm1
4e32573
-	movdqu	(%rdi), %xmm2
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(16bytesin256)
4e32573
+	CMPEQ	%xmm0, %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin)
4e32573
+
4e32573
+
4e32573
 	mov	%rsi, %rcx
4e32573
 	and	$-16, %rsi
4e32573
 	add	$16, %rsi
4e32573
@@ -86,1694 +306,499 @@ L(79bytesormore):
4e32573
 
4e32573
 	cmp	$128, %rdx
4e32573
 	ja	L(128bytesormore)
4e32573
-L(less128bytes):
4e32573
-	sub	$64, %rdx
4e32573
-
4e32573
-	movdqu	(%rdi), %xmm2
4e32573
-	pxor	(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(16bytesin256)
4e32573
 
4e32573
-	movdqu	16(%rdi), %xmm2
4e32573
-	pxor	16(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(32bytesin256)
4e32573
-
4e32573
-	movdqu	32(%rdi), %xmm2
4e32573
-	pxor	32(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(48bytesin256)
4e32573
-
4e32573
-	movdqu	48(%rdi), %xmm2
4e32573
-	pxor	48(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(64bytesin256)
4e32573
-	cmp	$32, %rdx
4e32573
-	jb	L(less32bytesin64)
4e32573
-
4e32573
-	movdqu	64(%rdi), %xmm2
4e32573
-	pxor	64(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(80bytesin256)
4e32573
-
4e32573
-	movdqu	80(%rdi), %xmm2
4e32573
-	pxor	80(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(96bytesin256)
4e32573
-	sub	$32, %rdx
4e32573
-	add	$32, %rdi
4e32573
-	add	$32, %rsi
4e32573
-L(less32bytesin64):
4e32573
-	add	$64, %rdi
4e32573
-	add	$64, %rsi
4e32573
-	add	%rdx, %rsi
4e32573
-	add	%rdx, %rdi
4e32573
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
4e32573
+	.p2align 4,, 6
4e32573
+L(less128bytes):
4e32573
+	movdqu	(%rdi), %xmm1
4e32573
+	CMPEQ	(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin)
4e32573
+
4e32573
+	movdqu	16(%rdi), %xmm1
4e32573
+	CMPEQ	16(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin_16)
4e32573
+
4e32573
+	movdqu	32(%rdi), %xmm1
4e32573
+	CMPEQ	32(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin_32)
4e32573
+
4e32573
+	movdqu	48(%rdi), %xmm1
4e32573
+	CMPEQ	48(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin_48)
4e32573
+
4e32573
+	cmp	$96, %rdx
4e32573
+	jb	L(32_to_64_bytes)
4e32573
+
4e32573
+	addq	$64, %rdi
4e32573
+	addq	$64, %rsi
4e32573
+	subq	$64, %rdx
4e32573
+
4e32573
+	.p2align 4,, 6
4e32573
+L(last_64_bytes):
4e32573
+	movdqu	(%rdi), %xmm1
4e32573
+	CMPEQ	(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin)
4e32573
+
4e32573
+	movdqu	16(%rdi), %xmm1
4e32573
+	CMPEQ	16(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin_16)
4e32573
+
4e32573
+	movdqu	-32(%rdi, %rdx), %xmm0
4e32573
+	movdqu	-32(%rsi, %rdx), %xmm1
4e32573
+	CMPEQ	%xmm0, %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_end_16)
4e32573
+
4e32573
+	movdqu	-16(%rdi, %rdx), %xmm0
4e32573
+	movdqu	-16(%rsi, %rdx), %xmm1
4e32573
+	CMPEQ	%xmm0, %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_end)
4e32573
+	ret
4e32573
 
4e32573
+	.p2align 4
4e32573
 L(128bytesormore):
4e32573
-	cmp	$512, %rdx
4e32573
-	ja	L(512bytesormore)
4e32573
 	cmp	$256, %rdx
4e32573
-	ja	L(less512bytes)
4e32573
+	ja	L(unaligned_loop)
4e32573
 L(less256bytes):
4e32573
-	sub	$128, %rdx
4e32573
-
4e32573
-	movdqu	(%rdi), %xmm2
4e32573
-	pxor	(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(16bytesin256)
4e32573
-
4e32573
-	movdqu	16(%rdi), %xmm2
4e32573
-	pxor	16(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(32bytesin256)
4e32573
-
4e32573
-	movdqu	32(%rdi), %xmm2
4e32573
-	pxor	32(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(48bytesin256)
4e32573
-
4e32573
-	movdqu	48(%rdi), %xmm2
4e32573
-	pxor	48(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(64bytesin256)
4e32573
-
4e32573
-	movdqu	64(%rdi), %xmm2
4e32573
-	pxor	64(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(80bytesin256)
4e32573
-
4e32573
-	movdqu	80(%rdi), %xmm2
4e32573
-	pxor	80(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(96bytesin256)
4e32573
-
4e32573
-	movdqu	96(%rdi), %xmm2
4e32573
-	pxor	96(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(112bytesin256)
4e32573
-
4e32573
-	movdqu	112(%rdi), %xmm2
4e32573
-	pxor	112(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(128bytesin256)
4e32573
-
4e32573
-	add	$128, %rsi
4e32573
-	add	$128, %rdi
4e32573
-
4e32573
-	cmp	$64, %rdx
4e32573
-	jae	L(less128bytes)
4e32573
-
4e32573
-	cmp	$32, %rdx
4e32573
-	jb	L(less32bytesin128)
4e32573
-
4e32573
-	movdqu	(%rdi), %xmm2
4e32573
-	pxor	(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(16bytesin256)
4e32573
-
4e32573
-	movdqu	16(%rdi), %xmm2
4e32573
-	pxor	16(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(32bytesin256)
4e32573
-	sub	$32, %rdx
4e32573
-	add	$32, %rdi
4e32573
-	add	$32, %rsi
4e32573
-L(less32bytesin128):
4e32573
-	add	%rdx, %rsi
4e32573
-	add	%rdx, %rdi
4e32573
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
4e32573
-
4e32573
-L(less512bytes):
4e32573
-	sub	$256, %rdx
4e32573
-	movdqu	(%rdi), %xmm2
4e32573
-	pxor	(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(16bytesin256)
4e32573
-
4e32573
-	movdqu	16(%rdi), %xmm2
4e32573
-	pxor	16(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(32bytesin256)
4e32573
-
4e32573
-	movdqu	32(%rdi), %xmm2
4e32573
-	pxor	32(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(48bytesin256)
4e32573
-
4e32573
-	movdqu	48(%rdi), %xmm2
4e32573
-	pxor	48(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(64bytesin256)
4e32573
-
4e32573
-	movdqu	64(%rdi), %xmm2
4e32573
-	pxor	64(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(80bytesin256)
4e32573
-
4e32573
-	movdqu	80(%rdi), %xmm2
4e32573
-	pxor	80(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(96bytesin256)
4e32573
-
4e32573
-	movdqu	96(%rdi), %xmm2
4e32573
-	pxor	96(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(112bytesin256)
4e32573
-
4e32573
-	movdqu	112(%rdi), %xmm2
4e32573
-	pxor	112(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(128bytesin256)
4e32573
-
4e32573
-	movdqu	128(%rdi), %xmm2
4e32573
-	pxor	128(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(144bytesin256)
4e32573
-
4e32573
-	movdqu	144(%rdi), %xmm2
4e32573
-	pxor	144(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(160bytesin256)
4e32573
-
4e32573
-	movdqu	160(%rdi), %xmm2
4e32573
-	pxor	160(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(176bytesin256)
4e32573
-
4e32573
-	movdqu	176(%rdi), %xmm2
4e32573
-	pxor	176(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(192bytesin256)
4e32573
-
4e32573
-	movdqu	192(%rdi), %xmm2
4e32573
-	pxor	192(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(208bytesin256)
4e32573
-
4e32573
-	movdqu	208(%rdi), %xmm2
4e32573
-	pxor	208(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(224bytesin256)
4e32573
-
4e32573
-	movdqu	224(%rdi), %xmm2
4e32573
-	pxor	224(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(240bytesin256)
4e32573
-
4e32573
-	movdqu	240(%rdi), %xmm2
4e32573
-	pxor	240(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(256bytesin256)
4e32573
-
4e32573
-	add	$256, %rsi
4e32573
-	add	$256, %rdi
4e32573
-
4e32573
-	cmp	$128, %rdx
4e32573
-	jae	L(less256bytes)
4e32573
+	movdqu	(%rdi), %xmm1
4e32573
+	CMPEQ	(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin)
4e32573
+
4e32573
+	movdqu	16(%rdi), %xmm1
4e32573
+	CMPEQ	16(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin_16)
4e32573
+
4e32573
+	movdqu	32(%rdi), %xmm1
4e32573
+	CMPEQ	32(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin_32)
4e32573
+
4e32573
+	movdqu	48(%rdi), %xmm1
4e32573
+	CMPEQ	48(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin_48)
4e32573
+
4e32573
+	addq	$64, %rdi
4e32573
+	addq	$64, %rsi
4e32573
+
4e32573
+	movdqu	(%rdi), %xmm1
4e32573
+	CMPEQ	(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin)
4e32573
+
4e32573
+	movdqu	16(%rdi), %xmm1
4e32573
+	CMPEQ	16(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin_16)
4e32573
+
4e32573
+	movdqu	32(%rdi), %xmm1
4e32573
+	CMPEQ	32(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin_32)
4e32573
+
4e32573
+	movdqu	48(%rdi), %xmm1
4e32573
+	CMPEQ	48(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin_48)
4e32573
+
4e32573
+	addq	$-128, %rdx
4e32573
+	subq	$-64, %rsi
4e32573
+	subq	$-64, %rdi
4e32573
 
4e32573
 	cmp	$64, %rdx
4e32573
-	jae	L(less128bytes)
4e32573
+	ja	L(less128bytes)
4e32573
 
4e32573
 	cmp	$32, %rdx
4e32573
-	jb	L(less32bytesin256)
4e32573
-
4e32573
-	movdqu	(%rdi), %xmm2
4e32573
-	pxor	(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(16bytesin256)
4e32573
-
4e32573
-	movdqu	16(%rdi), %xmm2
4e32573
-	pxor	16(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(32bytesin256)
4e32573
-	sub	$32, %rdx
4e32573
-	add	$32, %rdi
4e32573
-	add	$32, %rsi
4e32573
-L(less32bytesin256):
4e32573
-	add	%rdx, %rsi
4e32573
-	add	%rdx, %rdi
4e32573
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
4e32573
+	ja	L(last_64_bytes)
4e32573
+
4e32573
+	movdqu	-32(%rdi, %rdx), %xmm0
4e32573
+	movdqu	-32(%rsi, %rdx), %xmm1
4e32573
+	CMPEQ	%xmm0, %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_end_16)
4e32573
+
4e32573
+	movdqu	-16(%rdi, %rdx), %xmm0
4e32573
+	movdqu	-16(%rsi, %rdx), %xmm1
4e32573
+	CMPEQ	%xmm0, %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_end)
4e32573
+	ret
4e32573
 
4e32573
 	.p2align 4
4e32573
-L(512bytesormore):
4e32573
+L(unaligned_loop):
4e32573
 # ifdef DATA_CACHE_SIZE_HALF
4e32573
 	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
4e32573
 # else
4e32573
 	mov	__x86_data_cache_size_half(%rip), %R8_LP
4e32573
 # endif
4e32573
-	mov	%r8, %r9
4e32573
-	shr	$1, %r8
4e32573
-	add	%r9, %r8
4e32573
-	cmp	%r8, %rdx
4e32573
-	ja	L(L2_L3_cache_unaglined)
4e32573
+	movq	%r8, %r9
4e32573
+	addq	%r8, %r8
4e32573
+	addq	%r9, %r8
4e32573
+	cmpq	%r8, %rdx
4e32573
+	ja	L(L2_L3_cache_unaligned)
4e32573
 	sub	$64, %rdx
4e32573
 	.p2align 4
4e32573
 L(64bytesormore_loop):
4e32573
-	movdqu	(%rdi), %xmm2
4e32573
-	pxor	(%rsi), %xmm2
4e32573
-	movdqa	%xmm2, %xmm1
4e32573
+	movdqu	(%rdi), %xmm0
4e32573
+	movdqu	16(%rdi), %xmm1
4e32573
+	movdqu	32(%rdi), %xmm2
4e32573
+	movdqu	48(%rdi), %xmm3
4e32573
 
4e32573
-	movdqu	16(%rdi), %xmm3
4e32573
-	pxor	16(%rsi), %xmm3
4e32573
-	por	%xmm3, %xmm1
4e32573
+	CMPEQ	(%rsi), %xmm0
4e32573
+	CMPEQ	16(%rsi), %xmm1
4e32573
+	CMPEQ	32(%rsi), %xmm2
4e32573
+	CMPEQ	48(%rsi), %xmm3
4e32573
 
4e32573
-	movdqu	32(%rdi), %xmm4
4e32573
-	pxor	32(%rsi), %xmm4
4e32573
-	por	%xmm4, %xmm1
4e32573
+	pand	%xmm0, %xmm1
4e32573
+	pand	%xmm2, %xmm3
4e32573
+	pand	%xmm1, %xmm3
4e32573
 
4e32573
-	movdqu	48(%rdi), %xmm5
4e32573
-	pxor	48(%rsi), %xmm5
4e32573
-	por	%xmm5, %xmm1
4e32573
+	pmovmskb %xmm3, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(64bytesormore_loop_end)
4e32573
 
4e32573
-	ptest	%xmm1, %xmm0
4e32573
-	jnc	L(64bytesormore_loop_end)
4e32573
 	add	$64, %rsi
4e32573
 	add	$64, %rdi
4e32573
 	sub	$64, %rdx
4e32573
-	jae	L(64bytesormore_loop)
4e32573
+	ja	L(64bytesormore_loop)
4e32573
 
4e32573
-	add	$64, %rdx
4e32573
-	add	%rdx, %rsi
4e32573
-	add	%rdx, %rdi
4e32573
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
4e32573
+	.p2align 4,, 6
4e32573
+L(loop_tail):
4e32573
+	addq	%rdx, %rdi
4e32573
+	movdqu	(%rdi), %xmm0
4e32573
+	movdqu	16(%rdi), %xmm1
4e32573
+	movdqu	32(%rdi), %xmm2
4e32573
+	movdqu	48(%rdi), %xmm3
4e32573
+
4e32573
+	addq	%rdx, %rsi
4e32573
+	movdqu	(%rsi), %xmm4
4e32573
+	movdqu	16(%rsi), %xmm5
4e32573
+	movdqu	32(%rsi), %xmm6
4e32573
+	movdqu	48(%rsi), %xmm7
4e32573
+
4e32573
+	CMPEQ	%xmm4, %xmm0
4e32573
+	CMPEQ	%xmm5, %xmm1
4e32573
+	CMPEQ	%xmm6, %xmm2
4e32573
+	CMPEQ	%xmm7, %xmm3
4e32573
+
4e32573
+	pand	%xmm0, %xmm1
4e32573
+	pand	%xmm2, %xmm3
4e32573
+	pand	%xmm1, %xmm3
4e32573
+
4e32573
+	pmovmskb %xmm3, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(64bytesormore_loop_end)
4e32573
+	ret
4e32573
 
4e32573
-L(L2_L3_cache_unaglined):
4e32573
-	sub	$64, %rdx
4e32573
+L(L2_L3_cache_unaligned):
4e32573
+	subq	$64, %rdx
4e32573
 	.p2align 4
4e32573
 L(L2_L3_unaligned_128bytes_loop):
4e32573
 	prefetchnta 0x1c0(%rdi)
4e32573
 	prefetchnta 0x1c0(%rsi)
4e32573
-	movdqu	(%rdi), %xmm2
4e32573
-	pxor	(%rsi), %xmm2
4e32573
-	movdqa	%xmm2, %xmm1
4e32573
 
4e32573
-	movdqu	16(%rdi), %xmm3
4e32573
-	pxor	16(%rsi), %xmm3
4e32573
-	por	%xmm3, %xmm1
4e32573
+	movdqu	(%rdi), %xmm0
4e32573
+	movdqu	16(%rdi), %xmm1
4e32573
+	movdqu	32(%rdi), %xmm2
4e32573
+	movdqu	48(%rdi), %xmm3
4e32573
+
4e32573
+	CMPEQ	(%rsi), %xmm0
4e32573
+	CMPEQ	16(%rsi), %xmm1
4e32573
+	CMPEQ	32(%rsi), %xmm2
4e32573
+	CMPEQ	48(%rsi), %xmm3
4e32573
 
4e32573
-	movdqu	32(%rdi), %xmm4
4e32573
-	pxor	32(%rsi), %xmm4
4e32573
-	por	%xmm4, %xmm1
4e32573
+	pand	%xmm0, %xmm1
4e32573
+	pand	%xmm2, %xmm3
4e32573
+	pand	%xmm1, %xmm3
4e32573
 
4e32573
-	movdqu	48(%rdi), %xmm5
4e32573
-	pxor	48(%rsi), %xmm5
4e32573
-	por	%xmm5, %xmm1
4e32573
+	pmovmskb %xmm3, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(64bytesormore_loop_end)
4e32573
 
4e32573
-	ptest	%xmm1, %xmm0
4e32573
-	jnc	L(64bytesormore_loop_end)
4e32573
 	add	$64, %rsi
4e32573
 	add	$64, %rdi
4e32573
 	sub	$64, %rdx
4e32573
-	jae	L(L2_L3_unaligned_128bytes_loop)
4e32573
+	ja	L(L2_L3_unaligned_128bytes_loop)
4e32573
+	jmp	L(loop_tail)
4e32573
 
4e32573
-	add	$64, %rdx
4e32573
-	add	%rdx, %rsi
4e32573
-	add	%rdx, %rdi
4e32573
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
4e32573
 
4e32573
-/*
4e32573
- * This case is for machines which are sensitive for unaligned instructions.
4e32573
- */
4e32573
+	/* This case is for machines which are sensitive to unaligned
4e32573
+	 * instructions.  */
4e32573
 	.p2align 4
4e32573
 L(2aligned):
4e32573
 	cmp	$128, %rdx
4e32573
 	ja	L(128bytesormorein2aligned)
4e32573
 L(less128bytesin2aligned):
4e32573
-	sub	$64, %rdx
4e32573
-
4e32573
-	movdqa	(%rdi), %xmm2
4e32573
-	pxor	(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(16bytesin256)
4e32573
-
4e32573
-	movdqa	16(%rdi), %xmm2
4e32573
-	pxor	16(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(32bytesin256)
4e32573
-
4e32573
-	movdqa	32(%rdi), %xmm2
4e32573
-	pxor	32(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(48bytesin256)
4e32573
-
4e32573
-	movdqa	48(%rdi), %xmm2
4e32573
-	pxor	48(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(64bytesin256)
4e32573
-	cmp	$32, %rdx
4e32573
-	jb	L(less32bytesin64in2alinged)
4e32573
-
4e32573
-	movdqa	64(%rdi), %xmm2
4e32573
-	pxor	64(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(80bytesin256)
4e32573
-
4e32573
-	movdqa	80(%rdi), %xmm2
4e32573
-	pxor	80(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(96bytesin256)
4e32573
-	sub	$32, %rdx
4e32573
-	add	$32, %rdi
4e32573
-	add	$32, %rsi
4e32573
-L(less32bytesin64in2alinged):
4e32573
-	add	$64, %rdi
4e32573
-	add	$64, %rsi
4e32573
-	add	%rdx, %rsi
4e32573
-	add	%rdx, %rdi
4e32573
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
4e32573
+	movdqa	(%rdi), %xmm1
4e32573
+	CMPEQ	(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin)
4e32573
+
4e32573
+	movdqa	16(%rdi), %xmm1
4e32573
+	CMPEQ	16(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin_16)
4e32573
+
4e32573
+	movdqa	32(%rdi), %xmm1
4e32573
+	CMPEQ	32(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin_32)
4e32573
+
4e32573
+	movdqa	48(%rdi), %xmm1
4e32573
+	CMPEQ	48(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin_48)
4e32573
+
4e32573
+	cmp	$96, %rdx
4e32573
+	jb	L(32_to_64_bytes)
4e32573
+
4e32573
+	addq	$64, %rdi
4e32573
+	addq	$64, %rsi
4e32573
+	subq	$64, %rdx
4e32573
+
4e32573
+	.p2align 4,, 6
4e32573
+L(aligned_last_64_bytes):
4e32573
+	movdqa	(%rdi), %xmm1
4e32573
+	CMPEQ	(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin)
4e32573
+
4e32573
+	movdqa	16(%rdi), %xmm1
4e32573
+	CMPEQ	16(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin_16)
4e32573
+
4e32573
+	movdqu	-32(%rdi, %rdx), %xmm0
4e32573
+	movdqu	-32(%rsi, %rdx), %xmm1
4e32573
+	CMPEQ	%xmm0, %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_end_16)
4e32573
+
4e32573
+	movdqu	-16(%rdi, %rdx), %xmm0
4e32573
+	movdqu	-16(%rsi, %rdx), %xmm1
4e32573
+	CMPEQ	%xmm0, %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_end)
4e32573
+	ret
4e32573
 
4e32573
 	.p2align 4
4e32573
 L(128bytesormorein2aligned):
4e32573
-	cmp	$512, %rdx
4e32573
-	ja	L(512bytesormorein2aligned)
4e32573
 	cmp	$256, %rdx
4e32573
-	ja	L(256bytesormorein2aligned)
4e32573
+	ja	L(aligned_loop)
4e32573
 L(less256bytesin2alinged):
4e32573
-	sub	$128, %rdx
4e32573
-
4e32573
-	movdqa	(%rdi), %xmm2
4e32573
-	pxor	(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(16bytesin256)
4e32573
-
4e32573
-	movdqa	16(%rdi), %xmm2
4e32573
-	pxor	16(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(32bytesin256)
4e32573
-
4e32573
-	movdqa	32(%rdi), %xmm2
4e32573
-	pxor	32(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(48bytesin256)
4e32573
-
4e32573
-	movdqa	48(%rdi), %xmm2
4e32573
-	pxor	48(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(64bytesin256)
4e32573
-
4e32573
-	movdqa	64(%rdi), %xmm2
4e32573
-	pxor	64(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(80bytesin256)
4e32573
-
4e32573
-	movdqa	80(%rdi), %xmm2
4e32573
-	pxor	80(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(96bytesin256)
4e32573
-
4e32573
-	movdqa	96(%rdi), %xmm2
4e32573
-	pxor	96(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(112bytesin256)
4e32573
-
4e32573
-	movdqa	112(%rdi), %xmm2
4e32573
-	pxor	112(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(128bytesin256)
4e32573
-
4e32573
-	add	$128, %rsi
4e32573
-	add	$128, %rdi
4e32573
+	movdqa	(%rdi), %xmm1
4e32573
+	CMPEQ	(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin)
4e32573
+
4e32573
+	movdqa	16(%rdi), %xmm1
4e32573
+	CMPEQ	16(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin_16)
4e32573
+
4e32573
+	movdqa	32(%rdi), %xmm1
4e32573
+	CMPEQ	32(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin_32)
4e32573
+
4e32573
+	movdqa	48(%rdi), %xmm1
4e32573
+	CMPEQ	48(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin_48)
4e32573
+
4e32573
+	addq	$64, %rdi
4e32573
+	addq	$64, %rsi
4e32573
+
4e32573
+	movdqa	(%rdi), %xmm1
4e32573
+	CMPEQ	(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin)
4e32573
+
4e32573
+	movdqa	16(%rdi), %xmm1
4e32573
+	CMPEQ	16(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin_16)
4e32573
+
4e32573
+	movdqa	32(%rdi), %xmm1
4e32573
+	CMPEQ	32(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin_32)
4e32573
+
4e32573
+	movdqa	48(%rdi), %xmm1
4e32573
+	CMPEQ	48(%rsi), %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_begin_48)
4e32573
+
4e32573
+	addq	$-128, %rdx
4e32573
+	subq	$-64, %rsi
4e32573
+	subq	$-64, %rdi
4e32573
 
4e32573
 	cmp	$64, %rdx
4e32573
-	jae	L(less128bytesin2aligned)
4e32573
+	ja	L(less128bytesin2aligned)
4e32573
 
4e32573
 	cmp	$32, %rdx
4e32573
-	jb	L(less32bytesin128in2aligned)
4e32573
-
4e32573
-	movdqu	(%rdi), %xmm2
4e32573
-	pxor	(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(16bytesin256)
4e32573
-
4e32573
-	movdqu	16(%rdi), %xmm2
4e32573
-	pxor	16(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(32bytesin256)
4e32573
-	sub	$32, %rdx
4e32573
-	add	$32, %rdi
4e32573
-	add	$32, %rsi
4e32573
-L(less32bytesin128in2aligned):
4e32573
-	add	%rdx, %rsi
4e32573
-	add	%rdx, %rdi
4e32573
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
4e32573
-
4e32573
-	.p2align 4
4e32573
-L(256bytesormorein2aligned):
4e32573
-
4e32573
-	sub	$256, %rdx
4e32573
-	movdqa	(%rdi), %xmm2
4e32573
-	pxor	(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(16bytesin256)
4e32573
-
4e32573
-	movdqa	16(%rdi), %xmm2
4e32573
-	pxor	16(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(32bytesin256)
4e32573
-
4e32573
-	movdqa	32(%rdi), %xmm2
4e32573
-	pxor	32(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(48bytesin256)
4e32573
-
4e32573
-	movdqa	48(%rdi), %xmm2
4e32573
-	pxor	48(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(64bytesin256)
4e32573
-
4e32573
-	movdqa	64(%rdi), %xmm2
4e32573
-	pxor	64(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(80bytesin256)
4e32573
-
4e32573
-	movdqa	80(%rdi), %xmm2
4e32573
-	pxor	80(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(96bytesin256)
4e32573
-
4e32573
-	movdqa	96(%rdi), %xmm2
4e32573
-	pxor	96(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(112bytesin256)
4e32573
-
4e32573
-	movdqa	112(%rdi), %xmm2
4e32573
-	pxor	112(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(128bytesin256)
4e32573
-
4e32573
-	movdqa	128(%rdi), %xmm2
4e32573
-	pxor	128(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(144bytesin256)
4e32573
-
4e32573
-	movdqa	144(%rdi), %xmm2
4e32573
-	pxor	144(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(160bytesin256)
4e32573
-
4e32573
-	movdqa	160(%rdi), %xmm2
4e32573
-	pxor	160(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(176bytesin256)
4e32573
-
4e32573
-	movdqa	176(%rdi), %xmm2
4e32573
-	pxor	176(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(192bytesin256)
4e32573
-
4e32573
-	movdqa	192(%rdi), %xmm2
4e32573
-	pxor	192(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(208bytesin256)
4e32573
-
4e32573
-	movdqa	208(%rdi), %xmm2
4e32573
-	pxor	208(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(224bytesin256)
4e32573
-
4e32573
-	movdqa	224(%rdi), %xmm2
4e32573
-	pxor	224(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(240bytesin256)
4e32573
-
4e32573
-	movdqa	240(%rdi), %xmm2
4e32573
-	pxor	240(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(256bytesin256)
4e32573
-
4e32573
-	add	$256, %rsi
4e32573
-	add	$256, %rdi
4e32573
-
4e32573
-	cmp	$128, %rdx
4e32573
-	jae	L(less256bytesin2alinged)
4e32573
-
4e32573
-	cmp	$64, %rdx
4e32573
-	jae	L(less128bytesin2aligned)
4e32573
-
4e32573
-	cmp	$32, %rdx
4e32573
-	jb	L(less32bytesin256in2alinged)
4e32573
-
4e32573
-	movdqa	(%rdi), %xmm2
4e32573
-	pxor	(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(16bytesin256)
4e32573
-
4e32573
-	movdqa	16(%rdi), %xmm2
4e32573
-	pxor	16(%rsi), %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(32bytesin256)
4e32573
-	sub	$32, %rdx
4e32573
-	add	$32, %rdi
4e32573
-	add	$32, %rsi
4e32573
-L(less32bytesin256in2alinged):
4e32573
-	add	%rdx, %rsi
4e32573
-	add	%rdx, %rdi
4e32573
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
4e32573
+	ja	L(aligned_last_64_bytes)
4e32573
+
4e32573
+	movdqu	-32(%rdi, %rdx), %xmm0
4e32573
+	movdqu	-32(%rsi, %rdx), %xmm1
4e32573
+	CMPEQ	%xmm0, %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_end_16)
4e32573
+
4e32573
+	movdqu	-16(%rdi, %rdx), %xmm0
4e32573
+	movdqu	-16(%rsi, %rdx), %xmm1
4e32573
+	CMPEQ	%xmm0, %xmm1
4e32573
+	pmovmskb %xmm1, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(vec_return_end)
4e32573
+	ret
4e32573
 
4e32573
 	.p2align 4
4e32573
-L(512bytesormorein2aligned):
4e32573
+L(aligned_loop):
4e32573
 # ifdef DATA_CACHE_SIZE_HALF
4e32573
 	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
4e32573
 # else
4e32573
 	mov	__x86_data_cache_size_half(%rip), %R8_LP
4e32573
 # endif
4e32573
-	mov	%r8, %r9
4e32573
-	shr	$1, %r8
4e32573
-	add	%r9, %r8
4e32573
-	cmp	%r8, %rdx
4e32573
-	ja	L(L2_L3_cache_aglined)
4e32573
+	movq	%r8, %r9
4e32573
+	addq	%r8, %r8
4e32573
+	addq	%r9, %r8
4e32573
+	cmpq	%r8, %rdx
4e32573
+	ja	L(L2_L3_cache_aligned)
4e32573
 
4e32573
 	sub	$64, %rdx
4e32573
 	.p2align 4
4e32573
 L(64bytesormore_loopin2aligned):
4e32573
-	movdqa	(%rdi), %xmm2
4e32573
-	pxor	(%rsi), %xmm2
4e32573
-	movdqa	%xmm2, %xmm1
4e32573
-
4e32573
-	movdqa	16(%rdi), %xmm3
4e32573
-	pxor	16(%rsi), %xmm3
4e32573
-	por	%xmm3, %xmm1
4e32573
+	movdqa	(%rdi), %xmm0
4e32573
+	movdqa	16(%rdi), %xmm1
4e32573
+	movdqa	32(%rdi), %xmm2
4e32573
+	movdqa	48(%rdi), %xmm3
4e32573
 
4e32573
-	movdqa	32(%rdi), %xmm4
4e32573
-	pxor	32(%rsi), %xmm4
4e32573
-	por	%xmm4, %xmm1
4e32573
+	CMPEQ	(%rsi), %xmm0
4e32573
+	CMPEQ	16(%rsi), %xmm1
4e32573
+	CMPEQ	32(%rsi), %xmm2
4e32573
+	CMPEQ	48(%rsi), %xmm3
4e32573
 
4e32573
-	movdqa	48(%rdi), %xmm5
4e32573
-	pxor	48(%rsi), %xmm5
4e32573
-	por	%xmm5, %xmm1
4e32573
+	pand	%xmm0, %xmm1
4e32573
+	pand	%xmm2, %xmm3
4e32573
+	pand	%xmm1, %xmm3
4e32573
 
4e32573
-	ptest	%xmm1, %xmm0
4e32573
-	jnc	L(64bytesormore_loop_end)
4e32573
+	pmovmskb %xmm3, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(64bytesormore_loop_end)
4e32573
 	add	$64, %rsi
4e32573
 	add	$64, %rdi
4e32573
 	sub	$64, %rdx
4e32573
-	jae	L(64bytesormore_loopin2aligned)
4e32573
-
4e32573
-	add	$64, %rdx
4e32573
-	add	%rdx, %rsi
4e32573
-	add	%rdx, %rdi
4e32573
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
4e32573
-L(L2_L3_cache_aglined):
4e32573
-	sub	$64, %rdx
4e32573
+	ja	L(64bytesormore_loopin2aligned)
4e32573
+	jmp	L(loop_tail)
4e32573
 
4e32573
+L(L2_L3_cache_aligned):
4e32573
+	subq	$64, %rdx
4e32573
 	.p2align 4
4e32573
 L(L2_L3_aligned_128bytes_loop):
4e32573
 	prefetchnta 0x1c0(%rdi)
4e32573
 	prefetchnta 0x1c0(%rsi)
4e32573
-	movdqa	(%rdi), %xmm2
4e32573
-	pxor	(%rsi), %xmm2
4e32573
-	movdqa	%xmm2, %xmm1
4e32573
-
4e32573
-	movdqa	16(%rdi), %xmm3
4e32573
-	pxor	16(%rsi), %xmm3
4e32573
-	por	%xmm3, %xmm1
4e32573
+	movdqa	(%rdi), %xmm0
4e32573
+	movdqa	16(%rdi), %xmm1
4e32573
+	movdqa	32(%rdi), %xmm2
4e32573
+	movdqa	48(%rdi), %xmm3
4e32573
 
4e32573
-	movdqa	32(%rdi), %xmm4
4e32573
-	pxor	32(%rsi), %xmm4
4e32573
-	por	%xmm4, %xmm1
4e32573
+	CMPEQ	(%rsi), %xmm0
4e32573
+	CMPEQ	16(%rsi), %xmm1
4e32573
+	CMPEQ	32(%rsi), %xmm2
4e32573
+	CMPEQ	48(%rsi), %xmm3
4e32573
 
4e32573
-	movdqa	48(%rdi), %xmm5
4e32573
-	pxor	48(%rsi), %xmm5
4e32573
-	por	%xmm5, %xmm1
4e32573
+	pand	%xmm0, %xmm1
4e32573
+	pand	%xmm2, %xmm3
4e32573
+	pand	%xmm1, %xmm3
4e32573
 
4e32573
-	ptest	%xmm1, %xmm0
4e32573
-	jnc	L(64bytesormore_loop_end)
4e32573
-	add	$64, %rsi
4e32573
-	add	$64, %rdi
4e32573
-	sub	$64, %rdx
4e32573
-	jae	L(L2_L3_aligned_128bytes_loop)
4e32573
-
4e32573
-	add	$64, %rdx
4e32573
-	add	%rdx, %rsi
4e32573
-	add	%rdx, %rdi
4e32573
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
4e32573
+	pmovmskb %xmm3, %eax
4e32573
+	incw	%ax
4e32573
+	jnz	L(64bytesormore_loop_end)
4e32573
 
4e32573
+	addq	$64, %rsi
4e32573
+	addq	$64, %rdi
4e32573
+	subq	$64, %rdx
4e32573
+	ja	L(L2_L3_aligned_128bytes_loop)
4e32573
+	jmp	L(loop_tail)
4e32573
 
4e32573
 	.p2align 4
4e32573
 L(64bytesormore_loop_end):
4e32573
-	add	$16, %rdi
4e32573
-	add	$16, %rsi
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(16bytes)
4e32573
-
4e32573
-	add	$16, %rdi
4e32573
-	add	$16, %rsi
4e32573
-	ptest	%xmm3, %xmm0
4e32573
-	jnc	L(16bytes)
4e32573
-
4e32573
-	add	$16, %rdi
4e32573
-	add	$16, %rsi
4e32573
-	ptest	%xmm4, %xmm0
4e32573
-	jnc	L(16bytes)
4e32573
-
4e32573
-	add	$16, %rdi
4e32573
-	add	$16, %rsi
4e32573
-	jmp	L(16bytes)
4e32573
-
4e32573
-L(256bytesin256):
4e32573
-	add	$256, %rdi
4e32573
-	add	$256, %rsi
4e32573
-	jmp	L(16bytes)
4e32573
-L(240bytesin256):
4e32573
-	add	$240, %rdi
4e32573
-	add	$240, %rsi
4e32573
-	jmp	L(16bytes)
4e32573
-L(224bytesin256):
4e32573
-	add	$224, %rdi
4e32573
-	add	$224, %rsi
4e32573
-	jmp	L(16bytes)
4e32573
-L(208bytesin256):
4e32573
-	add	$208, %rdi
4e32573
-	add	$208, %rsi
4e32573
-	jmp	L(16bytes)
4e32573
-L(192bytesin256):
4e32573
-	add	$192, %rdi
4e32573
-	add	$192, %rsi
4e32573
-	jmp	L(16bytes)
4e32573
-L(176bytesin256):
4e32573
-	add	$176, %rdi
4e32573
-	add	$176, %rsi
4e32573
-	jmp	L(16bytes)
4e32573
-L(160bytesin256):
4e32573
-	add	$160, %rdi
4e32573
-	add	$160, %rsi
4e32573
-	jmp	L(16bytes)
4e32573
-L(144bytesin256):
4e32573
-	add	$144, %rdi
4e32573
-	add	$144, %rsi
4e32573
-	jmp	L(16bytes)
4e32573
-L(128bytesin256):
4e32573
-	add	$128, %rdi
4e32573
-	add	$128, %rsi
4e32573
-	jmp	L(16bytes)
4e32573
-L(112bytesin256):
4e32573
-	add	$112, %rdi
4e32573
-	add	$112, %rsi
4e32573
-	jmp	L(16bytes)
4e32573
-L(96bytesin256):
4e32573
-	add	$96, %rdi
4e32573
-	add	$96, %rsi
4e32573
-	jmp	L(16bytes)
4e32573
-L(80bytesin256):
4e32573
-	add	$80, %rdi
4e32573
-	add	$80, %rsi
4e32573
-	jmp	L(16bytes)
4e32573
-L(64bytesin256):
4e32573
-	add	$64, %rdi
4e32573
-	add	$64, %rsi
4e32573
-	jmp	L(16bytes)
4e32573
-L(48bytesin256):
4e32573
-	add	$16, %rdi
4e32573
-	add	$16, %rsi
4e32573
-L(32bytesin256):
4e32573
-	add	$16, %rdi
4e32573
-	add	$16, %rsi
4e32573
-L(16bytesin256):
4e32573
-	add	$16, %rdi
4e32573
-	add	$16, %rsi
4e32573
-L(16bytes):
4e32573
-	mov	-16(%rdi), %rax
4e32573
-	mov	-16(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-L(8bytes):
4e32573
-	mov	-8(%rdi), %rax
4e32573
-	mov	-8(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-	xor	%eax, %eax
4e32573
-	ret
4e32573
-
4e32573
-	.p2align 4
4e32573
-L(12bytes):
4e32573
-	mov	-12(%rdi), %rax
4e32573
-	mov	-12(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-L(4bytes):
4e32573
-	mov	-4(%rsi), %ecx
4e32573
-# ifndef USE_AS_WMEMCMP
4e32573
-	mov	-4(%rdi), %eax
4e32573
-	cmp	%eax, %ecx
4e32573
-# else
4e32573
-	cmp	-4(%rdi), %ecx
4e32573
-# endif
4e32573
-	jne	L(diffin4bytes)
4e32573
-L(0bytes):
4e32573
-	xor	%eax, %eax
4e32573
-	ret
4e32573
-
4e32573
-# ifndef USE_AS_WMEMCMP
4e32573
-/* unreal case for wmemcmp */
4e32573
-	.p2align 4
4e32573
-L(65bytes):
4e32573
-	movdqu	-65(%rdi), %xmm1
4e32573
-	movdqu	-65(%rsi), %xmm2
4e32573
-	mov	$-65, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(49bytes):
4e32573
-	movdqu	-49(%rdi), %xmm1
4e32573
-	movdqu	-49(%rsi), %xmm2
4e32573
-	mov	$-49, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(33bytes):
4e32573
-	movdqu	-33(%rdi), %xmm1
4e32573
-	movdqu	-33(%rsi), %xmm2
4e32573
-	mov	$-33, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(17bytes):
4e32573
-	mov	-17(%rdi), %rax
4e32573
-	mov	-17(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-L(9bytes):
4e32573
-	mov	-9(%rdi), %rax
4e32573
-	mov	-9(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-	movzbl	-1(%rdi), %eax
4e32573
-	movzbl	-1(%rsi), %edx
4e32573
-	sub	%edx, %eax
4e32573
-	ret
4e32573
-
4e32573
-	.p2align 4
4e32573
-L(13bytes):
4e32573
-	mov	-13(%rdi), %rax
4e32573
-	mov	-13(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-	mov	-8(%rdi), %rax
4e32573
-	mov	-8(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-	xor	%eax, %eax
4e32573
-	ret
4e32573
-
4e32573
-	.p2align 4
4e32573
-L(5bytes):
4e32573
-	mov	-5(%rdi), %eax
4e32573
-	mov	-5(%rsi), %ecx
4e32573
-	cmp	%eax, %ecx
4e32573
-	jne	L(diffin4bytes)
4e32573
-	movzbl	-1(%rdi), %eax
4e32573
-	movzbl	-1(%rsi), %edx
4e32573
-	sub	%edx, %eax
4e32573
-	ret
4e32573
-
4e32573
-	.p2align 4
4e32573
-L(66bytes):
4e32573
-	movdqu	-66(%rdi), %xmm1
4e32573
-	movdqu	-66(%rsi), %xmm2
4e32573
-	mov	$-66, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(50bytes):
4e32573
-	movdqu	-50(%rdi), %xmm1
4e32573
-	movdqu	-50(%rsi), %xmm2
4e32573
-	mov	$-50, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(34bytes):
4e32573
-	movdqu	-34(%rdi), %xmm1
4e32573
-	movdqu	-34(%rsi), %xmm2
4e32573
-	mov	$-34, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(18bytes):
4e32573
-	mov	-18(%rdi), %rax
4e32573
-	mov	-18(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-L(10bytes):
4e32573
-	mov	-10(%rdi), %rax
4e32573
-	mov	-10(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-	movzwl	-2(%rdi), %eax
4e32573
-	movzwl	-2(%rsi), %ecx
4e32573
-	cmp	%cl, %al
4e32573
-	jne	L(end)
4e32573
-	and	$0xffff, %eax
4e32573
-	and	$0xffff, %ecx
4e32573
-	sub	%ecx, %eax
4e32573
-	ret
4e32573
-
4e32573
-	.p2align 4
4e32573
-L(14bytes):
4e32573
-	mov	-14(%rdi), %rax
4e32573
-	mov	-14(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-	mov	-8(%rdi), %rax
4e32573
-	mov	-8(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-	xor	%eax, %eax
4e32573
-	ret
4e32573
-
4e32573
-	.p2align 4
4e32573
-L(6bytes):
4e32573
-	mov	-6(%rdi), %eax
4e32573
-	mov	-6(%rsi), %ecx
4e32573
-	cmp	%eax, %ecx
4e32573
-	jne	L(diffin4bytes)
4e32573
-L(2bytes):
4e32573
-	movzwl	-2(%rsi), %ecx
4e32573
-	movzwl	-2(%rdi), %eax
4e32573
-	cmp	%cl, %al
4e32573
-	jne	L(end)
4e32573
-	and	$0xffff, %eax
4e32573
-	and	$0xffff, %ecx
4e32573
-	sub	%ecx, %eax
4e32573
-	ret
4e32573
-
4e32573
-	.p2align 4
4e32573
-L(67bytes):
4e32573
-	movdqu	-67(%rdi), %xmm2
4e32573
-	movdqu	-67(%rsi), %xmm1
4e32573
-	mov	$-67, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(51bytes):
4e32573
-	movdqu	-51(%rdi), %xmm2
4e32573
-	movdqu	-51(%rsi), %xmm1
4e32573
-	mov	$-51, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(35bytes):
4e32573
-	movdqu	-35(%rsi), %xmm1
4e32573
-	movdqu	-35(%rdi), %xmm2
4e32573
-	mov	$-35, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(19bytes):
4e32573
-	mov	-19(%rdi), %rax
4e32573
-	mov	-19(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-L(11bytes):
4e32573
-	mov	-11(%rdi), %rax
4e32573
-	mov	-11(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-	mov	-4(%rdi), %eax
4e32573
-	mov	-4(%rsi), %ecx
4e32573
-	cmp	%eax, %ecx
4e32573
-	jne	L(diffin4bytes)
4e32573
-	xor	%eax, %eax
4e32573
-	ret
4e32573
-
4e32573
-	.p2align 4
4e32573
-L(15bytes):
4e32573
-	mov	-15(%rdi), %rax
4e32573
-	mov	-15(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-	mov	-8(%rdi), %rax
4e32573
-	mov	-8(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-	xor	%eax, %eax
4e32573
-	ret
4e32573
-
4e32573
-	.p2align 4
4e32573
-L(7bytes):
4e32573
-	mov	-7(%rdi), %eax
4e32573
-	mov	-7(%rsi), %ecx
4e32573
-	cmp	%eax, %ecx
4e32573
-	jne	L(diffin4bytes)
4e32573
-	mov	-4(%rdi), %eax
4e32573
-	mov	-4(%rsi), %ecx
4e32573
-	cmp	%eax, %ecx
4e32573
-	jne	L(diffin4bytes)
4e32573
-	xor	%eax, %eax
4e32573
-	ret
4e32573
-
4e32573
-	.p2align 4
4e32573
-L(3bytes):
4e32573
-	movzwl	-3(%rdi), %eax
4e32573
-	movzwl	-3(%rsi), %ecx
4e32573
-	cmp	%eax, %ecx
4e32573
-	jne	L(diffin2bytes)
4e32573
-L(1bytes):
4e32573
-	movzbl	-1(%rdi), %eax
4e32573
-	movzbl	-1(%rsi), %ecx
4e32573
-	sub	%ecx, %eax
4e32573
-	ret
4e32573
-# endif
4e32573
-
4e32573
-	.p2align 4
4e32573
-L(68bytes):
4e32573
-	movdqu	-68(%rdi), %xmm2
4e32573
-	movdqu	-68(%rsi), %xmm1
4e32573
-	mov	$-68, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(52bytes):
4e32573
-	movdqu	-52(%rdi), %xmm2
4e32573
-	movdqu	-52(%rsi), %xmm1
4e32573
-	mov	$-52, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(36bytes):
4e32573
-	movdqu	-36(%rdi), %xmm2
4e32573
-	movdqu	-36(%rsi), %xmm1
4e32573
-	mov	$-36, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(20bytes):
4e32573
-	movdqu	-20(%rdi), %xmm2
4e32573
-	movdqu	-20(%rsi), %xmm1
4e32573
-	mov	$-20, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-	mov	-4(%rsi), %ecx
4e32573
-
4e32573
-# ifndef USE_AS_WMEMCMP
4e32573
-	mov	-4(%rdi), %eax
4e32573
-	cmp	%eax, %ecx
4e32573
-# else
4e32573
-	cmp	-4(%rdi), %ecx
4e32573
-# endif
4e32573
-	jne	L(diffin4bytes)
4e32573
-	xor	%eax, %eax
4e32573
-	ret
4e32573
-
4e32573
-# ifndef USE_AS_WMEMCMP
4e32573
-/* unreal cases for wmemcmp */
4e32573
-	.p2align 4
4e32573
-L(69bytes):
4e32573
-	movdqu	-69(%rsi), %xmm1
4e32573
-	movdqu	-69(%rdi), %xmm2
4e32573
-	mov	$-69, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(53bytes):
4e32573
-	movdqu	-53(%rsi), %xmm1
4e32573
-	movdqu	-53(%rdi), %xmm2
4e32573
-	mov	$-53, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(37bytes):
4e32573
-	movdqu	-37(%rsi), %xmm1
4e32573
-	movdqu	-37(%rdi), %xmm2
4e32573
-	mov	$-37, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(21bytes):
4e32573
-	movdqu	-21(%rsi), %xmm1
4e32573
-	movdqu	-21(%rdi), %xmm2
4e32573
-	mov	$-21, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-	mov	-8(%rdi), %rax
4e32573
-	mov	-8(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-	xor	%eax, %eax
4e32573
-	ret
4e32573
-
4e32573
-	.p2align 4
4e32573
-L(70bytes):
4e32573
-	movdqu	-70(%rsi), %xmm1
4e32573
-	movdqu	-70(%rdi), %xmm2
4e32573
-	mov	$-70, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(54bytes):
4e32573
-	movdqu	-54(%rsi), %xmm1
4e32573
-	movdqu	-54(%rdi), %xmm2
4e32573
-	mov	$-54, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(38bytes):
4e32573
-	movdqu	-38(%rsi), %xmm1
4e32573
-	movdqu	-38(%rdi), %xmm2
4e32573
-	mov	$-38, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(22bytes):
4e32573
-	movdqu	-22(%rsi), %xmm1
4e32573
-	movdqu	-22(%rdi), %xmm2
4e32573
-	mov	$-22, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-	mov	-8(%rdi), %rax
4e32573
-	mov	-8(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-	xor	%eax, %eax
4e32573
-	ret
4e32573
-
4e32573
-	.p2align 4
4e32573
-L(71bytes):
4e32573
-	movdqu	-71(%rsi), %xmm1
4e32573
-	movdqu	-71(%rdi), %xmm2
4e32573
-	mov	$-71, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(55bytes):
4e32573
-	movdqu	-55(%rdi), %xmm2
4e32573
-	movdqu	-55(%rsi), %xmm1
4e32573
-	mov	$-55, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(39bytes):
4e32573
-	movdqu	-39(%rdi), %xmm2
4e32573
-	movdqu	-39(%rsi), %xmm1
4e32573
-	mov	$-39, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(23bytes):
4e32573
-	movdqu	-23(%rdi), %xmm2
4e32573
-	movdqu	-23(%rsi), %xmm1
4e32573
-	mov	$-23, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-	mov	-8(%rdi), %rax
4e32573
-	mov	-8(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-	xor	%eax, %eax
4e32573
-	ret
4e32573
-# endif
4e32573
-
4e32573
-	.p2align 4
4e32573
-L(72bytes):
4e32573
-	movdqu	-72(%rsi), %xmm1
4e32573
-	movdqu	-72(%rdi), %xmm2
4e32573
-	mov	$-72, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(56bytes):
4e32573
-	movdqu	-56(%rdi), %xmm2
4e32573
-	movdqu	-56(%rsi), %xmm1
4e32573
-	mov	$-56, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(40bytes):
4e32573
-	movdqu	-40(%rdi), %xmm2
4e32573
-	movdqu	-40(%rsi), %xmm1
4e32573
-	mov	$-40, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(24bytes):
4e32573
-	movdqu	-24(%rdi), %xmm2
4e32573
-	movdqu	-24(%rsi), %xmm1
4e32573
-	mov	$-24, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-
4e32573
-	mov	-8(%rsi), %rcx
4e32573
-	mov	-8(%rdi), %rax
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-	xor	%eax, %eax
4e32573
-	ret
4e32573
-
4e32573
-# ifndef USE_AS_WMEMCMP
4e32573
-/* unreal cases for wmemcmp */
4e32573
-	.p2align 4
4e32573
-L(73bytes):
4e32573
-	movdqu	-73(%rsi), %xmm1
4e32573
-	movdqu	-73(%rdi), %xmm2
4e32573
-	mov	$-73, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(57bytes):
4e32573
-	movdqu	-57(%rdi), %xmm2
4e32573
-	movdqu	-57(%rsi), %xmm1
4e32573
-	mov	$-57, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(41bytes):
4e32573
-	movdqu	-41(%rdi), %xmm2
4e32573
-	movdqu	-41(%rsi), %xmm1
4e32573
-	mov	$-41, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(25bytes):
4e32573
-	movdqu	-25(%rdi), %xmm2
4e32573
-	movdqu	-25(%rsi), %xmm1
4e32573
-	mov	$-25, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-	mov	-9(%rdi), %rax
4e32573
-	mov	-9(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-	movzbl	-1(%rdi), %eax
4e32573
-	movzbl	-1(%rsi), %ecx
4e32573
-	sub	%ecx, %eax
4e32573
-	ret
4e32573
-
4e32573
-	.p2align 4
4e32573
-L(74bytes):
4e32573
-	movdqu	-74(%rsi), %xmm1
4e32573
-	movdqu	-74(%rdi), %xmm2
4e32573
-	mov	$-74, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(58bytes):
4e32573
-	movdqu	-58(%rdi), %xmm2
4e32573
-	movdqu	-58(%rsi), %xmm1
4e32573
-	mov	$-58, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(42bytes):
4e32573
-	movdqu	-42(%rdi), %xmm2
4e32573
-	movdqu	-42(%rsi), %xmm1
4e32573
-	mov	$-42, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(26bytes):
4e32573
-	movdqu	-26(%rdi), %xmm2
4e32573
-	movdqu	-26(%rsi), %xmm1
4e32573
-	mov	$-26, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-	mov	-10(%rdi), %rax
4e32573
-	mov	-10(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-	movzwl	-2(%rdi), %eax
4e32573
-	movzwl	-2(%rsi), %ecx
4e32573
-	jmp	L(diffin2bytes)
4e32573
-
4e32573
-	.p2align 4
4e32573
-L(75bytes):
4e32573
-	movdqu	-75(%rsi), %xmm1
4e32573
-	movdqu	-75(%rdi), %xmm2
4e32573
-	mov	$-75, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(59bytes):
4e32573
-	movdqu	-59(%rdi), %xmm2
4e32573
-	movdqu	-59(%rsi), %xmm1
4e32573
-	mov	$-59, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(43bytes):
4e32573
-	movdqu	-43(%rdi), %xmm2
4e32573
-	movdqu	-43(%rsi), %xmm1
4e32573
-	mov	$-43, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(27bytes):
4e32573
-	movdqu	-27(%rdi), %xmm2
4e32573
-	movdqu	-27(%rsi), %xmm1
4e32573
-	mov	$-27, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-	mov	-11(%rdi), %rax
4e32573
-	mov	-11(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-	mov	-4(%rdi), %eax
4e32573
-	mov	-4(%rsi), %ecx
4e32573
-	cmp	%eax, %ecx
4e32573
-	jne	L(diffin4bytes)
4e32573
-	xor	%eax, %eax
4e32573
-	ret
4e32573
-# endif
4e32573
-	.p2align 4
4e32573
-L(76bytes):
4e32573
-	movdqu	-76(%rsi), %xmm1
4e32573
-	movdqu	-76(%rdi), %xmm2
4e32573
-	mov	$-76, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(60bytes):
4e32573
-	movdqu	-60(%rdi), %xmm2
4e32573
-	movdqu	-60(%rsi), %xmm1
4e32573
-	mov	$-60, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(44bytes):
4e32573
-	movdqu	-44(%rdi), %xmm2
4e32573
-	movdqu	-44(%rsi), %xmm1
4e32573
-	mov	$-44, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(28bytes):
4e32573
-	movdqu	-28(%rdi), %xmm2
4e32573
-	movdqu	-28(%rsi), %xmm1
4e32573
-	mov	$-28, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-	mov	-12(%rdi), %rax
4e32573
-	mov	-12(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-	mov	-4(%rsi), %ecx
4e32573
-# ifndef USE_AS_WMEMCMP
4e32573
-	mov	-4(%rdi), %eax
4e32573
-	cmp	%eax, %ecx
4e32573
-# else
4e32573
-	cmp	-4(%rdi), %ecx
4e32573
-# endif
4e32573
-	jne	L(diffin4bytes)
4e32573
-	xor	%eax, %eax
4e32573
-	ret
4e32573
-
4e32573
-# ifndef USE_AS_WMEMCMP
4e32573
-/* unreal cases for wmemcmp */
4e32573
-	.p2align 4
4e32573
-L(77bytes):
4e32573
-	movdqu	-77(%rsi), %xmm1
4e32573
-	movdqu	-77(%rdi), %xmm2
4e32573
-	mov	$-77, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(61bytes):
4e32573
-	movdqu	-61(%rdi), %xmm2
4e32573
-	movdqu	-61(%rsi), %xmm1
4e32573
-	mov	$-61, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(45bytes):
4e32573
-	movdqu	-45(%rdi), %xmm2
4e32573
-	movdqu	-45(%rsi), %xmm1
4e32573
-	mov	$-45, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(29bytes):
4e32573
-	movdqu	-29(%rdi), %xmm2
4e32573
-	movdqu	-29(%rsi), %xmm1
4e32573
-	mov	$-29, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-
4e32573
-	mov	-13(%rdi), %rax
4e32573
-	mov	-13(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-
4e32573
-	mov	-8(%rdi), %rax
4e32573
-	mov	-8(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-	xor	%eax, %eax
4e32573
-	ret
4e32573
-
4e32573
-	.p2align 4
4e32573
-L(78bytes):
4e32573
-	movdqu	-78(%rsi), %xmm1
4e32573
-	movdqu	-78(%rdi), %xmm2
4e32573
-	mov	$-78, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(62bytes):
4e32573
-	movdqu	-62(%rdi), %xmm2
4e32573
-	movdqu	-62(%rsi), %xmm1
4e32573
-	mov	$-62, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(46bytes):
4e32573
-	movdqu	-46(%rdi), %xmm2
4e32573
-	movdqu	-46(%rsi), %xmm1
4e32573
-	mov	$-46, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(30bytes):
4e32573
-	movdqu	-30(%rdi), %xmm2
4e32573
-	movdqu	-30(%rsi), %xmm1
4e32573
-	mov	$-30, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-	mov	-14(%rdi), %rax
4e32573
-	mov	-14(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-	mov	-8(%rdi), %rax
4e32573
-	mov	-8(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-	xor	%eax, %eax
4e32573
-	ret
4e32573
-
4e32573
-	.p2align 4
4e32573
-L(79bytes):
4e32573
-	movdqu	-79(%rsi), %xmm1
4e32573
-	movdqu	-79(%rdi), %xmm2
4e32573
-	mov	$-79, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(63bytes):
4e32573
-	movdqu	-63(%rdi), %xmm2
4e32573
-	movdqu	-63(%rsi), %xmm1
4e32573
-	mov	$-63, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(47bytes):
4e32573
-	movdqu	-47(%rdi), %xmm2
4e32573
-	movdqu	-47(%rsi), %xmm1
4e32573
-	mov	$-47, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(31bytes):
4e32573
-	movdqu	-31(%rdi), %xmm2
4e32573
-	movdqu	-31(%rsi), %xmm1
4e32573
-	mov	$-31, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-	mov	-15(%rdi), %rax
4e32573
-	mov	-15(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-	mov	-8(%rdi), %rax
4e32573
-	mov	-8(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-	xor	%eax, %eax
4e32573
-	ret
4e32573
-# endif
4e32573
-	.p2align 4
4e32573
-L(64bytes):
4e32573
-	movdqu	-64(%rdi), %xmm2
4e32573
-	movdqu	-64(%rsi), %xmm1
4e32573
-	mov	$-64, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(48bytes):
4e32573
-	movdqu	-48(%rdi), %xmm2
4e32573
-	movdqu	-48(%rsi), %xmm1
4e32573
-	mov	$-48, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-L(32bytes):
4e32573
-	movdqu	-32(%rdi), %xmm2
4e32573
-	movdqu	-32(%rsi), %xmm1
4e32573
-	mov	$-32, %dl
4e32573
-	pxor	%xmm1, %xmm2
4e32573
-	ptest	%xmm2, %xmm0
4e32573
-	jnc	L(less16bytes)
4e32573
-
4e32573
-	mov	-16(%rdi), %rax
4e32573
-	mov	-16(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-
4e32573
-	mov	-8(%rdi), %rax
4e32573
-	mov	-8(%rsi), %rcx
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-	xor	%eax, %eax
4e32573
-	ret
4e32573
-
4e32573
-/*
4e32573
- * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block.
4e32573
- */
4e32573
-	.p2align 3
4e32573
-L(less16bytes):
4e32573
-	movsbq	%dl, %rdx
4e32573
-	mov	(%rsi, %rdx), %rcx
4e32573
-	mov	(%rdi, %rdx), %rax
4e32573
-	cmp	%rax, %rcx
4e32573
-	jne	L(diffin8bytes)
4e32573
-	mov	8(%rsi, %rdx), %rcx
4e32573
-	mov	8(%rdi, %rdx), %rax
4e32573
-L(diffin8bytes):
4e32573
-	cmp	%eax, %ecx
4e32573
-	jne	L(diffin4bytes)
4e32573
-	shr	$32, %rcx
4e32573
-	shr	$32, %rax
4e32573
-
4e32573
+	pmovmskb %xmm0, %ecx
4e32573
+	incw	%cx
4e32573
+	jnz	L(loop_end_ret)
4e32573
+
4e32573
+	pmovmskb %xmm1, %ecx
4e32573
+	notw	%cx
4e32573
+	sall	$16, %ecx
4e32573
+	jnz	L(loop_end_ret)
4e32573
+
4e32573
+	pmovmskb %xmm2, %ecx
4e32573
+	notw	%cx
4e32573
+	shlq	$32, %rcx
4e32573
+	jnz	L(loop_end_ret)
4e32573
+
4e32573
+	addq	$48, %rdi
4e32573
+	addq	$48, %rsi
4e32573
+	movq	%rax, %rcx
4e32573
+
4e32573
+	.p2align 4,, 6
4e32573
+L(loop_end_ret):
4e32573
+	bsfq	%rcx, %rcx
4e32573
 # ifdef USE_AS_WMEMCMP
4e32573
-/* for wmemcmp */
4e32573
-	cmp	%eax, %ecx
4e32573
-	jne	L(diffin4bytes)
4e32573
-	xor	%eax, %eax
4e32573
-	ret
4e32573
-# endif
4e32573
-
4e32573
-L(diffin4bytes):
4e32573
-# ifndef USE_AS_WMEMCMP
4e32573
-	cmp	%cx, %ax
4e32573
-	jne	L(diffin2bytes)
4e32573
-	shr	$16, %ecx
4e32573
-	shr	$16, %eax
4e32573
-L(diffin2bytes):
4e32573
-	cmp	%cl, %al
4e32573
-	jne	L(end)
4e32573
-	and	$0xffff, %eax
4e32573
-	and	$0xffff, %ecx
4e32573
-	sub	%ecx, %eax
4e32573
-	ret
4e32573
-
4e32573
-	.p2align 4
4e32573
-L(end):
4e32573
-	and	$0xff, %eax
4e32573
-	and	$0xff, %ecx
4e32573
-	sub	%ecx, %eax
4e32573
-	ret
4e32573
+	movl	(%rdi, %rcx), %eax
4e32573
+	xorl	%edx, %edx
4e32573
+	cmpl	(%rsi, %rcx), %eax
4e32573
+	setg	%dl
4e32573
+	leal	-1(%rdx, %rdx), %eax
4e32573
 # else
4e32573
-
4e32573
-/* for wmemcmp */
4e32573
-	mov	$1, %eax
4e32573
-	jl	L(nequal_bigger)
4e32573
-	neg	%eax
4e32573
-	ret
4e32573
-
4e32573
-	.p2align 4
4e32573
-L(nequal_bigger):
4e32573
-	ret
4e32573
-
4e32573
-L(unreal_case):
4e32573
-	xor	%eax, %eax
4e32573
-	ret
4e32573
+	movzbl	(%rdi, %rcx), %eax
4e32573
+	movzbl	(%rsi, %rcx), %ecx
4e32573
+	subl	%ecx, %eax
4e32573
 # endif
4e32573
-
4e32573
+	ret
4e32573
 END (MEMCMP)
4e32573
-
4e32573
-	.section .rodata.sse4.1,"a",@progbits
4e32573
-	.p2align 3
4e32573
-# ifndef USE_AS_WMEMCMP
4e32573
-L(table_64bytes):
4e32573
-	.int	JMPTBL (L(0bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(1bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(2bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(3bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(4bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(5bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(6bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(7bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(8bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(9bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(10bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(11bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(12bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(13bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(14bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(15bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(16bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(17bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(18bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(19bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(20bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(21bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(22bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(23bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(24bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(25bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(26bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(27bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(28bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(29bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(30bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(31bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(32bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(33bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(34bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(35bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(36bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(37bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(38bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(39bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(40bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(41bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(42bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(43bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(44bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(45bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(46bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(47bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(48bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(49bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(50bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(51bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(52bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(53bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(54bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(55bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(56bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(57bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(58bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(59bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(60bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(61bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(62bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(63bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(64bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(65bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(66bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(67bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(68bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(69bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(70bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(71bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(72bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(73bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(74bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(75bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(76bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(77bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(78bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(79bytes), L(table_64bytes))
4e32573
-# else
4e32573
-L(table_64bytes):
4e32573
-	.int	JMPTBL (L(0bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(4bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(8bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(12bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(16bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(20bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(24bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(28bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(32bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(36bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(40bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(44bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(48bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(52bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(56bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(60bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(64bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(68bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(72bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(76bytes), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
4e32573
-# endif
4e32573
 #endif