Author: Florian Weimer <fweimer@redhat.com>
Date:   Wed Oct 18 11:12:29 2023 +0200

    Revert "x86: Prepare `strrchr-evex` and `strrchr-evex512` for AVX10"
    
    This reverts commit a3c50bf46a1ca6d9d2b7d879176d345abf95a9de.

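Note for readers comparing the removed and restored variants below: the per-vector step is the same in both. Each compares one vector against the broadcast search CHAR and against zero, then, once a zero CHAR is seen, masks off matches that lie past the terminator before taking the highest set bit (the blsmsk/and/bsr sequence visible on both the "-" and "+" sides). The files differ mainly in loop structure around that step. The following is a minimal scalar C sketch of that masking step only; it is illustrative, not part of the patch, and the names match_mask/zero_mask are hypothetical stand-ins for the KMOV results.

    /* Sketch: given the bitmask of search-CHAR matches and the bitmask of
       zero CHARs for one vector, return the offset of the last in-bounds
       match, or -1 if there is none.  Mirrors blsmsk/and/bsr above.  */
    long
    last_match_in_vec (unsigned long match_mask, unsigned long zero_mask)
    {
      if (zero_mask != 0)
        {
          /* blsmsk: set all bits up to and including the first zero CHAR,
             so matches past the end of the string are discarded.  */
          unsigned long in_bounds = zero_mask ^ (zero_mask - 1);
          match_mask &= in_bounds;
        }
      if (match_mask == 0)
        return -1;
      /* bsr: index of the highest set bit, i.e. the last match.  */
      return (long) (8 * sizeof match_mask - 1) - __builtin_clzl (match_mask);
    }

The loop structure around this step (2-vector versus 4-vector unrolling, and which file carries the real implementation) is what the revert changes; the masking logic itself is common to both sides of the diff.
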
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
index cd6a0a870a02b9bd..58b2853ab69265e8 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
@@ -1,4 +1,4 @@
-/* Implementation for strrchr using evex256 and evex512.
+/* Placeholder function, not used by any processor at the moment.
    Copyright (C) 2022-2023 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -16,6 +16,8 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
+/* UNUSED. Exists purely as reference implementation.  */
+
 #include <isa-level.h>
 
 #if ISA_SHOULD_BUILD (4)
@@ -23,351 +25,240 @@
 # include <sysdep.h>
 
 # ifdef USE_AS_WCSRCHR
-#  if VEC_SIZE == 64
-#   define RCX_M	cx
-#   define KORTEST_M	kortestw
-#  else
-#   define RCX_M	cl
-#   define KORTEST_M	kortestb
-#  endif
-
-#  define SHIFT_REG	VRCX
 #  define CHAR_SIZE	4
-#  define VPCMP		vpcmpd
-#  define VPMIN		vpminud
-#  define VPCOMPRESS	vpcompressd
-#  define VPTESTN	vptestnmd
-#  define VPTEST	vptestmd
-#  define VPBROADCAST	vpbroadcastd
+#  define VPBROADCAST   vpbroadcastd
 #  define VPCMPEQ	vpcmpeqd
-
+#  define VPMINU	vpminud
+#  define VPTESTN	vptestnmd
 # else
-#  define SHIFT_REG	VRDI
 #  define CHAR_SIZE	1
-#  define VPCMP		vpcmpb
-#  define VPMIN		vpminub
-#  define VPCOMPRESS	vpcompressb
-#  define VPTESTN	vptestnmb
-#  define VPTEST	vptestmb
-#  define VPBROADCAST	vpbroadcastb
+#  define VPBROADCAST   vpbroadcastb
 #  define VPCMPEQ	vpcmpeqb
-
-#  define RCX_M		VRCX
-#  define KORTEST_M	KORTEST
+#  define VPMINU	vpminub
+#  define VPTESTN	vptestnmb
 # endif
 
-# define VMATCH		VMM(0)
-# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 # define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
 	.section SECTION(.text), "ax", @progbits
-	/* Aligning entry point to 64 byte, provides better performance for
-	   one vector length string.  */
-ENTRY_P2ALIGN(STRRCHR, 6)
-	movl	%edi, %eax
-	/* Broadcast CHAR to VMATCH.  */
-	VPBROADCAST %esi, %VMATCH
+/* Aligning entry point to 64 byte, provides better performance for
+   one vector length string.  */
+ENTRY_P2ALIGN (STRRCHR, 6)
 
-	andl	$(PAGE_SIZE - 1), %eax
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	jg	L(cross_page_boundary)
+	/* Broadcast CHAR to VMM(0).  */
+	VPBROADCAST %esi, %VMM(0)
+	movl	%edi, %eax
+	sall	$20, %eax
+	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
+	ja	L(page_cross)
 
+L(page_cross_continue):
+	/* Compare [w]char for null, mask bit will be set for match.  */
 	VMOVU	(%rdi), %VMM(1)
-	/* k0 has a 1 for each zero CHAR in YMM1.  */
-	VPTESTN	%VMM(1), %VMM(1), %k0
-	KMOV	%k0, %VGPR(rsi)
-	test	%VGPR(rsi), %VGPR(rsi)
-	jz	L(aligned_more)
-	/* fallthrough: zero CHAR in first VEC.  */
-L(page_cross_return):
-	/* K1 has a 1 for each search CHAR match in VEC(1).  */
-	VPCMPEQ	%VMATCH, %VMM(1), %k1
-	KMOV	%k1, %VGPR(rax)
-	/* Build mask up until first zero CHAR (used to mask of
-	   potential search CHAR matches past the end of the string).  */
-	blsmsk	%VGPR(rsi), %VGPR(rsi)
-	/* Use `and` here to remove any out of bounds matches so we can
-	   do a reverse scan on `rax` to find the last match.  */
-	and	%VGPR(rsi), %VGPR(rax)
-	jz	L(ret0)
-	/* Get last match.  */
-	bsr	%VGPR(rax), %VGPR(rax)
+
+	VPTESTN	%VMM(1), %VMM(1), %k1
+	KMOV	%k1, %VRCX
+	test	%VRCX, %VRCX
+	jz	L(align_more)
+
+	VPCMPEQ	%VMM(1), %VMM(0), %k0
+	KMOV	%k0, %VRAX
+	BLSMSK	%VRCX, %VRCX
+	and	%VRCX, %VRAX
+	jz	L(ret)
+
+	BSR	%VRAX, %VRAX
 # ifdef USE_AS_WCSRCHR
 	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	addq	%rdi, %rax
+	add	%rdi, %rax
 # endif
-L(ret0):
+L(ret):
 	ret
 
-	/* Returns for first vec x1/x2/x3 have hard coded backward
-	   search path for earlier matches.  */
-	.p2align 4,, 6
-L(first_vec_x1):
-	VPCMPEQ	%VMATCH, %VMM(2), %k1
-	KMOV	%k1, %VGPR(rax)
-	blsmsk	%VGPR(rcx), %VGPR(rcx)
-	/* eax non-zero if search CHAR in range.  */
-	and	%VGPR(rcx), %VGPR(rax)
-	jnz	L(first_vec_x1_return)
-
-	/* fallthrough: no match in YMM2 then need to check for earlier
-	   matches (in YMM1).  */
-	.p2align 4,, 4
-L(first_vec_x0_test):
-	VPCMPEQ	%VMATCH, %VMM(1), %k1
-	KMOV	%k1, %VGPR(rax)
-	test	%VGPR(rax), %VGPR(rax)
-	jz	L(ret1)
-	bsr	%VGPR(rax), %VGPR(rax)
+L(vector_x2_end):
+	VPCMPEQ	%VMM(2), %VMM(0), %k2
+	KMOV	%k2, %VRAX
+	BLSMSK	%VRCX, %VRCX
+	and	%VRCX, %VRAX
+	jz	L(vector_x1_ret)
+
+	BSR	%VRAX, %VRAX
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	/* Check the first vector at very last to look for match.  */
+L(vector_x1_ret):
+	VPCMPEQ %VMM(1), %VMM(0), %k2
+	KMOV	%k2, %VRAX
+	test	%VRAX, %VRAX
+	jz	L(ret)
+
+	BSR	%VRAX, %VRAX
 # ifdef USE_AS_WCSRCHR
 	leaq	(%rsi, %rax, CHAR_SIZE), %rax
 # else
-	addq	%rsi, %rax
+	add	%rsi, %rax
 # endif
-L(ret1):
 	ret
 
-	.p2align 4,, 10
-L(first_vec_x3):
-	VPCMPEQ	%VMATCH, %VMM(4), %k1
-	KMOV	%k1, %VGPR(rax)
-	blsmsk	%VGPR(rcx), %VGPR(rcx)
-	/* If no search CHAR match in range check YMM1/YMM2/YMM3.  */
-	and	%VGPR(rcx), %VGPR(rax)
-	jz	L(first_vec_x1_or_x2)
-	bsr	%VGPR(rax), %VGPR(rax)
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
-	ret
-	.p2align 4,, 4
-
-L(first_vec_x2):
-	VPCMPEQ	%VMATCH, %VMM(3), %k1
-	KMOV	%k1, %VGPR(rax)
-	blsmsk	%VGPR(rcx), %VGPR(rcx)
-	/* Check YMM3 for last match first. If no match try YMM2/YMM1.  */
-	and	%VGPR(rcx), %VGPR(rax)
-	jz	L(first_vec_x0_x1_test)
-	bsr	%VGPR(rax), %VGPR(rax)
-	leaq	(VEC_SIZE * 2)(%r8, %rax, CHAR_SIZE), %rax
-	ret
-
-	.p2align 4,, 6
-L(first_vec_x0_x1_test):
-	VPCMPEQ	%VMATCH, %VMM(2), %k1
-	KMOV	%k1, %VGPR(rax)
-	/* Check YMM2 for last match first. If no match try YMM1.  */
-	test	%VGPR(rax), %VGPR(rax)
-	jz	L(first_vec_x0_test)
-	.p2align 4,, 4
-L(first_vec_x1_return):
-	bsr	%VGPR(rax), %VGPR(rax)
-	leaq	(VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
-	ret
-
-	.p2align 4,, 12
-L(aligned_more):
-L(page_cross_continue):
-	/* Need to keep original pointer incase VEC(1) has last match.  */
+L(align_more):
+	/* Zero r8 to store match result.  */
+	xorl	%r8d, %r8d
+	/* Save pointer of first vector, in case if no match found.  */
 	movq	%rdi, %rsi
+	/* Align pointer to vector size.  */
 	andq	$-VEC_SIZE, %rdi
-
-	VMOVU	VEC_SIZE(%rdi), %VMM(2)
+	/* Loop unroll for 2 vector loop.  */
+	VMOVA	(VEC_SIZE)(%rdi), %VMM(2)
 	VPTESTN	%VMM(2), %VMM(2), %k0
 	KMOV	%k0, %VRCX
-	movq	%rdi, %r8
 	test	%VRCX, %VRCX
-	jnz	L(first_vec_x1)
-
-	VMOVU	(VEC_SIZE * 2)(%rdi), %VMM(3)
-	VPTESTN	%VMM(3), %VMM(3), %k0
-	KMOV	%k0, %VRCX
-
-	test	%VRCX, %VRCX
-	jnz	L(first_vec_x2)
-
-	VMOVU	(VEC_SIZE * 3)(%rdi), %VMM(4)
-	VPTESTN	%VMM(4), %VMM(4), %k0
-	KMOV	%k0, %VRCX
-
-	/* Intentionally use 64-bit here.  EVEX256 version needs 1-byte
-	   padding for efficient nop before loop alignment.  */
-	test	%rcx, %rcx
-	jnz	L(first_vec_x3)
+	jnz	L(vector_x2_end)
 
+	/* Save pointer of second vector, in case if no match
+	   found.  */
+	movq	%rdi, %r9
+	/* Align address to VEC_SIZE * 2 for loop.  */
 	andq	$-(VEC_SIZE * 2), %rdi
-	.p2align 4
-L(first_aligned_loop):
-	/* Preserve VEC(1), VEC(2), VEC(3), and VEC(4) until we can
-	   gurantee they don't store a match.  */
-	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(5)
-	VMOVA	(VEC_SIZE * 5)(%rdi), %VMM(6)
-
-	VPCMP	$4, %VMM(5), %VMATCH, %k2
-	VPCMP	$4, %VMM(6), %VMATCH, %k3{%k2}
-
-	VPMIN	%VMM(5), %VMM(6), %VMM(7)
 
-	VPTEST	%VMM(7), %VMM(7), %k1{%k3}
-	subq	$(VEC_SIZE * -2), %rdi
-	KORTEST_M %k1, %k1
-	jc	L(first_aligned_loop)
+	.p2align 4,,11
+L(loop):
+	/* 2 vector loop, as it provide better performance as compared
+	   to 4 vector loop.  */
+	VMOVA	(VEC_SIZE * 2)(%rdi), %VMM(3)
+	VMOVA	(VEC_SIZE * 3)(%rdi), %VMM(4)
+	VPCMPEQ	%VMM(3), %VMM(0), %k1
+	VPCMPEQ	%VMM(4), %VMM(0), %k2
+	VPMINU	%VMM(3), %VMM(4), %VMM(5)
+	VPTESTN	%VMM(5), %VMM(5), %k0
+	KOR	%k1, %k2, %k3
+	subq	$-(VEC_SIZE * 2), %rdi
+	/* If k0 and k3 zero, match and end of string not found.  */
+	KORTEST	%k0, %k3
+	jz	L(loop)
+
+	/* If k0 is non zero, end of string found.  */
+	KORTEST %k0, %k0
+	jnz	L(endloop)
+
+	lea	VEC_SIZE(%rdi), %r8
+	/* A match found, it need to be stored in r8 before loop
+	   continue.  */
+	/* Check second vector first.  */
+	KMOV	%k2, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(loop_vec_x2_match)
 
-	VPTESTN	%VMM(7), %VMM(7), %k1
 	KMOV	%k1, %VRDX
-	test	%VRDX, %VRDX
-	jz	L(second_aligned_loop_prep)
+	/* Match is in first vector, rdi offset need to be subtracted
+	  by VEC_SIZE.  */
+	sub	$VEC_SIZE, %r8
+
+	/* If second vector doesn't have match, first vector must
+	   have match.  */
+L(loop_vec_x2_match):
+	BSR	%VRDX, %VRDX
+# ifdef USE_AS_WCSRCHR
+	sal	$2, %rdx
+# endif
+	add	%rdx, %r8
+	jmp	L(loop)
 
-	KORTEST_M %k3, %k3
-	jnc	L(return_first_aligned_loop)
+L(endloop):
+	/* Check if string end in first loop vector.  */
+	VPTESTN	%VMM(3), %VMM(3), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(loop_vector_x1_end)
 
-	.p2align 4,, 6
-L(first_vec_x1_or_x2_or_x3):
-	VPCMPEQ	%VMM(4), %VMATCH, %k4
-	KMOV	%k4, %VRAX
+	/* Check if it has match in first loop vector.  */
+	KMOV	%k1, %VRAX
 	test	%VRAX, %VRAX
-	jz	L(first_vec_x1_or_x2)
-	bsr	%VRAX, %VRAX
-	leaq	(VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
-	ret
+	jz	L(loop_vector_x2_end)
 
-	.p2align 4,, 8
-L(return_first_aligned_loop):
-	VPTESTN	%VMM(5), %VMM(5), %k0
+	BSR	%VRAX, %VRAX
+	leaq	(%rdi, %rax, CHAR_SIZE), %r8
+
+	/* String must end in second loop vector.  */
+L(loop_vector_x2_end):
+	VPTESTN	%VMM(4), %VMM(4), %k0
 	KMOV	%k0, %VRCX
-	blsmsk	%VRCX, %VRCX
-	jnc	L(return_first_new_match_first)
-	blsmsk	%VRDX, %VRDX
-	VPCMPEQ	%VMM(6), %VMATCH, %k0
-	KMOV	%k0, %VRAX
-	addq	$VEC_SIZE, %rdi
-	and	%VRDX, %VRAX
-	jnz	L(return_first_new_match_ret)
-	subq	$VEC_SIZE, %rdi
-L(return_first_new_match_first):
 	KMOV	%k2, %VRAX
-# ifdef USE_AS_WCSRCHR
-	xorl	$((1 << CHAR_PER_VEC)- 1), %VRAX
+	BLSMSK	%VRCX, %VRCX
+	/* Check if it has match in second loop vector.  */
 	and	%VRCX, %VRAX
-# else
-	andn	%VRCX, %VRAX, %VRAX
-# endif
-	jz	L(first_vec_x1_or_x2_or_x3)
-L(return_first_new_match_ret):
-	bsr	%VRAX, %VRAX
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
-	ret
+	jz	L(check_last_match)
 
-	.p2align 4,, 10
-L(first_vec_x1_or_x2):
-	VPCMPEQ	%VMM(3), %VMATCH, %k3
-	KMOV	%k3, %VRAX
-	test	%VRAX, %VRAX
-	jz	L(first_vec_x0_x1_test)
-	bsr	%VRAX, %VRAX
-	leaq	(VEC_SIZE * 2)(%r8, %rax, CHAR_SIZE), %rax
+	BSR	%VRAX, %VRAX
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4
-	/* We can throw away the work done for the first 4x checks here
-	   as we have a later match. This is the 'fast' path persay.  */
-L(second_aligned_loop_prep):
-L(second_aligned_loop_set_furthest_match):
-	movq	%rdi, %rsi
-	VMOVA	%VMM(5), %VMM(7)
-	VMOVA	%VMM(6), %VMM(8)
-	.p2align 4
-L(second_aligned_loop):
-	VMOVU	(VEC_SIZE * 4)(%rdi), %VMM(5)
-	VMOVU	(VEC_SIZE * 5)(%rdi), %VMM(6)
-	VPCMP	$4, %VMM(5), %VMATCH, %k2
-	VPCMP	$4, %VMM(6), %VMATCH, %k3{%k2}
-
-	VPMIN	%VMM(5), %VMM(6), %VMM(4)
-
-	VPTEST	%VMM(4), %VMM(4), %k1{%k3}
-	subq	$(VEC_SIZE * -2), %rdi
-	KMOV	%k1, %VRCX
-	inc	%RCX_M
-	jz	L(second_aligned_loop)
-	VPTESTN	%VMM(4), %VMM(4), %k1
-	KMOV	%k1, %VRDX
-	test	%VRDX, %VRDX
-	jz	L(second_aligned_loop_set_furthest_match)
-
-	KORTEST_M %k3, %k3
-	jnc	L(return_new_match)
-	/* branch here because there is a significant advantage interms
-	   of output dependency chance in using edx.  */
+	/* String end in first loop vector.  */
+L(loop_vector_x1_end):
+	KMOV	%k1, %VRAX
+	BLSMSK	%VRCX, %VRCX
+	/* Check if it has match in second loop vector.  */
+	and	%VRCX, %VRAX
+	jz	L(check_last_match)
 
-L(return_old_match):
-	VPCMPEQ	%VMM(8), %VMATCH, %k0
-	KMOV	%k0, %VRCX
-	bsr	%VRCX, %VRCX
-	jnz	L(return_old_match_ret)
+	BSR	%VRAX, %VRAX
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	VPCMPEQ	%VMM(7), %VMATCH, %k0
-	KMOV	%k0, %VRCX
-	bsr	%VRCX, %VRCX
-	subq	$VEC_SIZE, %rsi
-L(return_old_match_ret):
-	leaq	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %rax
+	/* No match in first and second loop vector.  */
+L(check_last_match):
+	/* Check if any match recorded in r8.  */
+	test	%r8, %r8
+	jz	L(vector_x2_ret)
+	movq	%r8, %rax
 	ret
 
-L(return_new_match):
-	VPTESTN	%VMM(5), %VMM(5), %k0
-	KMOV	%k0, %VRCX
-	blsmsk	%VRCX, %VRCX
-	jnc	L(return_new_match_first)
-	dec	%VRDX
-	VPCMPEQ	%VMM(6), %VMATCH, %k0
-	KMOV	%k0, %VRAX
-	addq	$VEC_SIZE, %rdi
-	and	%VRDX, %VRAX
-	jnz	L(return_new_match_ret)
-	subq	$VEC_SIZE, %rdi
-L(return_new_match_first):
+	/* No match recorded in r8. Check the second saved vector
+	   in beginning.  */
+L(vector_x2_ret):
+	VPCMPEQ %VMM(2), %VMM(0), %k2
 	KMOV	%k2, %VRAX
-# ifdef USE_AS_WCSRCHR
-	xorl	$((1 << CHAR_PER_VEC)- 1), %VRAX
-	and	%VRCX, %VRAX
-# else
-	andn	%VRCX, %VRAX, %VRAX
-# endif
-	jz	L(return_old_match)
-L(return_new_match_ret):
-	bsr	%VRAX, %VRAX
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	test	%VRAX, %VRAX
+	jz	L(vector_x1_ret)
+
+	/* Match found in the second saved vector.  */
+	BSR	%VRAX, %VRAX
+	leaq	(VEC_SIZE)(%r9, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4,, 4
-L(cross_page_boundary):
-	xorq	%rdi, %rax
-	mov	$-1, %VRDX
-	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
-	VPTESTN	%VMM(6), %VMM(6), %k0
-	KMOV	%k0, %VRSI
+L(page_cross):
+	mov	%rdi, %rax
+	movl	%edi, %ecx
 
 # ifdef USE_AS_WCSRCHR
-	movl	%edi, %ecx
-	and	$(VEC_SIZE - 1), %ecx
-	shrl	$2, %ecx
+	/* Calculate number of compare result bits to be skipped for
+	   wide string alignment adjustment.  */
+	andl	$(VEC_SIZE - 1), %ecx
+	sarl	$2, %ecx
 # endif
-	shlx	%SHIFT_REG, %VRDX, %VRDX
-
+	/* ecx contains number of w[char] to be skipped as a result
+	   of address alignment.  */
+	andq    $-VEC_SIZE, %rax
+	VMOVA	(%rax), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	SHR     %cl, %VRAX
+	jz	L(page_cross_continue)
+	VPCMPEQ	%VMM(1), %VMM(0), %k0
+	KMOV	%k0, %VRDX
+	SHR     %cl, %VRDX
+	BLSMSK	%VRAX, %VRAX
+	and	%VRDX, %VRAX
+	jz	L(ret)
+	BSR	%VRAX, %VRAX
 # ifdef USE_AS_WCSRCHR
-	kmovw	%edx, %k1
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	KMOV	%VRDX, %k1
+	add	%rdi, %rax
 # endif
 
-	VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
-	/* We could technically just jmp back after the vpcompress but
-	   it doesn't save any 16-byte blocks.  */
-	shrx	%SHIFT_REG, %VRSI, %VRSI
-	test	%VRSI, %VRSI
-	jnz	L(page_cross_return)
-	jmp	L(page_cross_continue)
-	/* 1-byte from cache line.  */
-END(STRRCHR)
+	ret
+END (STRRCHR)
 #endif
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
index 3bf6a5101422e4d1..85e3b0119f5dc923 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -1,8 +1,394 @@
+/* strrchr/wcsrchr optimized with 256-bit EVEX instructions.
+   Copyright (C) 2021-2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# include <sysdep.h>
+
 # ifndef STRRCHR
 #  define STRRCHR	__strrchr_evex
 # endif
 
-#include "x86-evex256-vecs.h"
-#include "reg-macros.h"
+# include "x86-evex256-vecs.h"
+
+# ifdef USE_AS_WCSRCHR
+#  define SHIFT_REG	rsi
+#  define kunpck_2x	kunpckbw
+#  define kmov_2x	kmovd
+#  define maskz_2x	ecx
+#  define maskm_2x	eax
+#  define CHAR_SIZE	4
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
+#  define VPBROADCAST	vpbroadcastd
+#  define VPCMPEQ	vpcmpeqd
+#  define VPCMP	vpcmpd
+
+#  define USE_WIDE_CHAR
+# else
+#  define SHIFT_REG	rdi
+#  define kunpck_2x	kunpckdq
+#  define kmov_2x	kmovq
+#  define maskz_2x	rcx
+#  define maskm_2x	rax
+
+#  define CHAR_SIZE	1
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
+#  define VPBROADCAST	vpbroadcastb
+#  define VPCMPEQ	vpcmpeqb
+#  define VPCMP	vpcmpb
+# endif
+
+# include "reg-macros.h"
+
+# define VMATCH	VMM(0)
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+# define PAGE_SIZE	4096
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN(STRRCHR, 6)
+	movl	%edi, %eax
+	/* Broadcast CHAR to VMATCH.  */
+	VPBROADCAST %esi, %VMATCH
+
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(cross_page_boundary)
+L(page_cross_continue):
+	VMOVU	(%rdi), %VMM(1)
+	/* k0 has a 1 for each zero CHAR in VEC(1).  */
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VRSI
+	test	%VRSI, %VRSI
+	jz	L(aligned_more)
+	/* fallthrough: zero CHAR in first VEC.  */
+	/* K1 has a 1 for each search CHAR match in VEC(1).  */
+	VPCMPEQ	%VMATCH, %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	/* Build mask up until first zero CHAR (used to mask of
+	   potential search CHAR matches past the end of the string).
+	 */
+	blsmsk	%VRSI, %VRSI
+	and	%VRSI, %VRAX
+	jz	L(ret0)
+	/* Get last match (the `and` removed any out of bounds matches).
+	 */
+	bsr	%VRAX, %VRAX
+# ifdef USE_AS_WCSRCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
+# endif
+L(ret0):
+	ret
+
+	/* Returns for first vec x1/x2/x3 have hard coded backward
+	   search path for earlier matches.  */
+	.p2align 4,, 6
+L(first_vec_x1):
+	VPCMPEQ	%VMATCH, %VMM(2), %k1
+	KMOV	%k1, %VRAX
+	blsmsk	%VRCX, %VRCX
+	/* eax non-zero if search CHAR in range.  */
+	and	%VRCX, %VRAX
+	jnz	L(first_vec_x1_return)
+
+	/* fallthrough: no match in VEC(2) then need to check for
+	   earlier matches (in VEC(1)).  */
+	.p2align 4,, 4
+L(first_vec_x0_test):
+	VPCMPEQ	%VMATCH, %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	test	%VRAX, %VRAX
+	jz	L(ret1)
+	bsr	%VRAX, %VRAX
+# ifdef USE_AS_WCSRCHR
+	leaq	(%rsi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rsi, %rax
+# endif
+L(ret1):
+	ret
+
+	.p2align 4,, 10
+L(first_vec_x1_or_x2):
+	VPCMPEQ	%VMM(3), %VMATCH, %k3
+	VPCMPEQ	%VMM(2), %VMATCH, %k2
+	/* K2 and K3 have 1 for any search CHAR match. Test if any
+	   matches between either of them. Otherwise check VEC(1).  */
+	KORTEST %k2, %k3
+	jz	L(first_vec_x0_test)
+
+	/* Guaranteed that VEC(2) and VEC(3) are within range so merge
+	   the two bitmasks then get last result.  */
+	kunpck_2x %k2, %k3, %k3
+	kmov_2x	%k3, %maskm_2x
+	bsr	%maskm_2x, %maskm_2x
+	leaq	(VEC_SIZE * 1)(%r8, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4,, 7
+L(first_vec_x3):
+	VPCMPEQ	%VMATCH, %VMM(4), %k1
+	KMOV	%k1, %VRAX
+	blsmsk	%VRCX, %VRCX
+	/* If no search CHAR match in range check VEC(1)/VEC(2)/VEC(3).
+	 */
+	and	%VRCX, %VRAX
+	jz	L(first_vec_x1_or_x2)
+	bsr	%VRAX, %VRAX
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+
+	.p2align 4,, 6
+L(first_vec_x0_x1_test):
+	VPCMPEQ	%VMATCH, %VMM(2), %k1
+	KMOV	%k1, %VRAX
+	/* Check VEC(2) for last match first. If no match try VEC(1).
+	 */
+	test	%VRAX, %VRAX
+	jz	L(first_vec_x0_test)
+	.p2align 4,, 4
+L(first_vec_x1_return):
+	bsr	%VRAX, %VRAX
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+
+	.p2align 4,, 10
+L(first_vec_x2):
+	VPCMPEQ	%VMATCH, %VMM(3), %k1
+	KMOV	%k1, %VRAX
+	blsmsk	%VRCX, %VRCX
+	/* Check VEC(3) for last match first. If no match try
+	   VEC(2)/VEC(1).  */
+	and	%VRCX, %VRAX
+	jz	L(first_vec_x0_x1_test)
+	bsr	%VRAX, %VRAX
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+
+	.p2align 4,, 12
+L(aligned_more):
+	/* Need to keep original pointer in case VEC(1) has last match.
+	 */
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rdi
+
+	VMOVU	VEC_SIZE(%rdi), %VMM(2)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x1)
+
+	VMOVU	(VEC_SIZE * 2)(%rdi), %VMM(3)
+	VPTESTN	%VMM(3), %VMM(3), %k0
+	KMOV	%k0, %VRCX
+
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x2)
+
+	VMOVU	(VEC_SIZE * 3)(%rdi), %VMM(4)
+	VPTESTN	%VMM(4), %VMM(4), %k0
+	KMOV	%k0, %VRCX
+	movq	%rdi, %r8
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x3)
+
+	andq	$-(VEC_SIZE * 2), %rdi
+	.p2align 4,, 10
+L(first_aligned_loop):
+	/* Preserve VEC(1), VEC(2), VEC(3), and VEC(4) until we can
+	   guarantee they don't store a match.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(5)
+	VMOVA	(VEC_SIZE * 5)(%rdi), %VMM(6)
+
+	VPCMPEQ	%VMM(5), %VMATCH, %k2
+	vpxord	%VMM(6), %VMATCH, %VMM(7)
+
+	VPMIN	%VMM(5), %VMM(6), %VMM(8)
+	VPMIN	%VMM(8), %VMM(7), %VMM(7)
+
+	VPTESTN	%VMM(7), %VMM(7), %k1
+	subq	$(VEC_SIZE * -2), %rdi
+	KORTEST %k1, %k2
+	jz	L(first_aligned_loop)
+
+	VPCMPEQ	%VMM(6), %VMATCH, %k3
+	VPTESTN	%VMM(8), %VMM(8), %k1
+
+	/* If k1 is zero, then we found a CHAR match but no null-term.
+	   We can now safely throw out VEC1-4.  */
+	KTEST	%k1, %k1
+	jz	L(second_aligned_loop_prep)
+
+	KORTEST %k2, %k3
+	jnz	L(return_first_aligned_loop)
+
+
+	.p2align 4,, 6
+L(first_vec_x1_or_x2_or_x3):
+	VPCMPEQ	%VMM(4), %VMATCH, %k4
+	KMOV	%k4, %VRAX
+	bsr	%VRAX, %VRAX
+	jz	L(first_vec_x1_or_x2)
+	leaq	(VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
+	ret
+
+
+	.p2align 4,, 8
+L(return_first_aligned_loop):
+	VPTESTN	%VMM(5), %VMM(5), %k0
+
+	/* Combined results from VEC5/6.  */
+	kunpck_2x %k0, %k1, %k0
+	kmov_2x	%k0, %maskz_2x
+
+	blsmsk	%maskz_2x, %maskz_2x
+	kunpck_2x %k2, %k3, %k3
+	kmov_2x	%k3, %maskm_2x
+	and	%maskz_2x, %maskm_2x
+	jz	L(first_vec_x1_or_x2_or_x3)
+
+	bsr	%maskm_2x, %maskm_2x
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4
+	/* We can throw away the work done for the first 4x checks here
+	   as we have a later match. This is the 'fast' path persay.
+	 */
+L(second_aligned_loop_prep):
+L(second_aligned_loop_set_furthest_match):
+	movq	%rdi, %rsi
+	/* Ideally we would safe k2/k3 but `kmov/kunpck` take uops on
+	   port0 and have noticeable overhead in the loop.  */
+	VMOVA	%VMM(5), %VMM(7)
+	VMOVA	%VMM(6), %VMM(8)
+	.p2align 4
+L(second_aligned_loop):
+	VMOVU	(VEC_SIZE * 4)(%rdi), %VMM(5)
+	VMOVU	(VEC_SIZE * 5)(%rdi), %VMM(6)
+	VPCMPEQ	%VMM(5), %VMATCH, %k2
+	vpxord	%VMM(6), %VMATCH, %VMM(3)
+
+	VPMIN	%VMM(5), %VMM(6), %VMM(4)
+	VPMIN	%VMM(3), %VMM(4), %VMM(3)
+
+	VPTESTN	%VMM(3), %VMM(3), %k1
+	subq	$(VEC_SIZE * -2), %rdi
+	KORTEST %k1, %k2
+	jz	L(second_aligned_loop)
+	VPCMPEQ	%VMM(6), %VMATCH, %k3
+	VPTESTN	%VMM(4), %VMM(4), %k1
+	KTEST	%k1, %k1
+	jz	L(second_aligned_loop_set_furthest_match)
+
+	/* branch here because we know we have a match in VEC7/8 but
+	   might not in VEC5/6 so the latter is expected to be less
+	   likely.  */
+	KORTEST %k2, %k3
+	jnz	L(return_new_match)
+
+L(return_old_match):
+	VPCMPEQ	%VMM(8), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+	bsr	%VRCX, %VRCX
+	jnz	L(return_old_match_ret)
+
+	VPCMPEQ	%VMM(7), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+	bsr	%VRCX, %VRCX
+	subq	$VEC_SIZE, %rsi
+L(return_old_match_ret):
+	leaq	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4,, 10
+L(return_new_match):
+	VPTESTN	%VMM(5), %VMM(5), %k0
+
+	/* Combined results from VEC5/6.  */
+	kunpck_2x %k0, %k1, %k0
+	kmov_2x	%k0, %maskz_2x
+
+	blsmsk	%maskz_2x, %maskz_2x
+	kunpck_2x %k2, %k3, %k3
+	kmov_2x	%k3, %maskm_2x
+
+	/* Match at end was out-of-bounds so use last known match.  */
+	and	%maskz_2x, %maskm_2x
+	jz	L(return_old_match)
+
+	bsr	%maskm_2x, %maskm_2x
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+L(cross_page_boundary):
+	/* eax contains all the page offset bits of src (rdi). `xor rdi,
+	   rax` sets pointer will all page offset bits cleared so
+	   offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
+	   before page cross (guaranteed to be safe to read). Doing this
+	   as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
+	   a bit of code size.  */
+	xorq	%rdi, %rax
+	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VRCX
+
+	/* Shift out zero CHAR matches that are before the beginning of
+	   src (rdi).  */
+# ifdef USE_AS_WCSRCHR
+	movl	%edi, %esi
+	andl	$(VEC_SIZE - 1), %esi
+	shrl	$2, %esi
+# endif
+	shrx	%VGPR(SHIFT_REG), %VRCX, %VRCX
+
+	test	%VRCX, %VRCX
+	jz	L(page_cross_continue)
 
-#include "strrchr-evex-base.S"
+	/* Found zero CHAR so need to test for search CHAR.  */
+	VPCMP	$0, %VMATCH, %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	/* Shift out search CHAR matches that are before the beginning of
+	   src (rdi).  */
+	shrx	%VGPR(SHIFT_REG), %VRAX, %VRAX
+
+	/* Check if any search CHAR match in range.  */
+	blsmsk	%VRCX, %VRCX
+	and	%VRCX, %VRAX
+	jz	L(ret3)
+	bsr	%VRAX, %VRAX
+# ifdef USE_AS_WCSRCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
+# endif
+L(ret3):
+	ret
+END(STRRCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex.S b/sysdeps/x86_64/multiarch/wcsrchr-evex.S
index a584cd3f430ba9d5..e5c5fe3bf28a5966 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/wcsrchr-evex.S
@@ -4,5 +4,4 @@
 
 #define STRRCHR	WCSRCHR
 #define USE_AS_WCSRCHR 1
-#define USE_WIDE_CHAR 1
 #include "strrchr-evex.S"