1/* strrchr/wcsrchr optimized with 256-bit EVEX instructions.
2 Copyright (C) 2021-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#if IS_IN (libc)
20
21# include <sysdep.h>
22
/* Name of the function emitted by this file.  It can be defined before
   this point to pick another name, otherwise it is __strrchr_evex.  */
23# ifndef STRRCHR
24# define STRRCHR __strrchr_evex
25# endif
26
/* Unaligned (VMOVU) and aligned (VMOVA) vector load mnemonics.  */
27# define VMOVU vmovdqu64
28# define VMOVA vmovdqa64
29
/* Element-size dependent choices.  With USE_AS_WCSRCHR the broadcast
   and the compares operate on dwords, and the cross-page shift count
   is kept in r8d because ecx still holds the byte offset there.
   Without it they operate on bytes and ecx itself is the shift
   count.  */
30# ifdef USE_AS_WCSRCHR
31# define VPBROADCAST vpbroadcastd
32# define VPCMP vpcmpd
33# define SHIFT_REG r8d
34# else
35# define VPBROADCAST vpbroadcastb
36# define VPCMP vpcmpb
37# define SHIFT_REG ecx
38# endif
39
/* Vector register assignment.  XMMZERO and YMMZERO name the 128-bit
   and 256-bit views of register 16, which the function clears and
   compares against to find null elements.  YMMMATCH holds the
   broadcast CHAR and YMM1 receives every vector load.  */
40# define XMMZERO xmm16
41# define YMMZERO ymm16
42# define YMMMATCH ymm17
43# define YMM1 ymm18
44
/* Number of bytes handled per vector load.  */
45# define VEC_SIZE 32
46
/* STRRCHR (__strrchr_evex unless renamed): find the last occurrence of
   CHAR in a null-terminated string.  Elements are bytes, or 4-byte
   values when USE_AS_WCSRCHR is defined.

   Inputs, as read by the code below:
     rdi  pointer to the start of the string
     esi  CHAR, broadcast to every element of YMMMATCH
   Result:
     rax  address of the last matching element located at or before
	  the first null element, or 0 when there is none.

   State carried across the search:
     edx  match mask of the most recent vector that held a CHAR and
	  no null, 0 while nothing has been remembered
     rsi  value rdi had when that mask was saved.  rdi has always been
	  advanced by VEC_SIZE past the vector by then, which is why
	  every result is formed as -VEC_SIZE(base, bit index).

   Registers written: rax, rcx, rdx, rsi, rdi, r8, k0, k1 and vector
   registers 16-18.  ENTRY, END and L() are not defined in this file,
   presumably they come from <sysdep.h>.  */
47 .section .text.evex,"ax",@progbits
48ENTRY (STRRCHR)
/* Keep the low 32 bits of the pointer in ecx for the offset test.  */
49 movl %edi, %ecx
50 /* Broadcast CHAR to YMMMATCH. */
51 VPBROADCAST %esi, %YMMMATCH
52
/* Clear register 16.  The compares below use it, as YMMZERO, to
   detect null elements.  */
53 vpxorq %XMMZERO, %XMMZERO, %XMMZERO
54
55 /* Check if we may cross page boundary with one vector load.
   ecx becomes rdi mod (2 * VEC_SIZE).  When that is at most VEC_SIZE
   the unaligned VEC_SIZE-byte load below ends inside the same
   2 * VEC_SIZE-aligned block.  Larger offsets take the aligned-load
   path instead.  */
56 andl $(2 * VEC_SIZE - 1), %ecx
57 cmpl $VEC_SIZE, %ecx
58 ja L(cros_page_boundary)
59
60 VMOVU (%rdi), %YMM1
61
62 /* Each bit in K0 represents a null byte in YMM1. */
63 VPCMP $0, %YMMZERO, %YMM1, %k0
64 /* Each bit in K1 represents a CHAR in YMM1. */
65 VPCMP $0, %YMMMATCH, %YMM1, %k1
66 kmovd %k0, %ecx
67 kmovd %k1, %eax
68
/* Step past the first vector now, so rdi - VEC_SIZE is its base in
   every path that follows.  */
69 addq $VEC_SIZE, %rdi
70
71 testl %eax, %eax
72 jnz L(first_vec)
73
/* No CHAR in this vector.  If it holds the terminator, CHAR does not
   occur in the string at all.  */
74 testl %ecx, %ecx
75 jnz L(return_null)
76
/* Neither found.  Round rdi down to a VEC_SIZE boundary for the
   aligned loop, which may therefore re-examine the tail of the vector
   just checked, and start with no remembered match (edx = 0).  */
77 andq $-VEC_SIZE, %rdi
78 xorl %edx, %edx
79 jmp L(aligned_loop)
80
81 .p2align 4
/* The first (unaligned) vector holds at least one CHAR.
   eax = match mask, ecx = null mask.  */
82L(first_vec):
83 /* Check if there is a null byte. */
84 testl %ecx, %ecx
85 jnz L(char_and_nul_in_first_vec)
86
87 /* Remember the match and keep searching.  rsi keeps the unaligned
   rdi (string start + VEC_SIZE) so that the mask saved in edx still
   indexes from the string start.  Only then is rdi aligned down.  */
88 movl %eax, %edx
89 movq %rdi, %rsi
90 andq $-VEC_SIZE, %rdi
91 jmp L(aligned_loop)
92
93 .p2align 4
/* Path taken when the start offset inside its 2 * VEC_SIZE block
   exceeds VEC_SIZE.  The vector containing the start is loaded with
   an aligned load and the mask bits of elements before the start are
   shifted out.  */
94L(cros_page_boundary):
/* ecx = byte offset of the start inside its VEC_SIZE-aligned vector,
   rdi = base address of that vector.  */
95 andl $(VEC_SIZE - 1), %ecx
96 andq $-VEC_SIZE, %rdi
97
98# ifdef USE_AS_WCSRCHR
99 /* NB: Divide the shift count by 4 since each bit in K0 and K1 represents one 4-byte element.  The byte offset itself stays in ecx.  */
101 movl %ecx, %SHIFT_REG
102 sarl $2, %SHIFT_REG
103# endif
104
105 VMOVA (%rdi), %YMM1
106
107 /* Each bit in K0 represents a null byte in YMM1. */
108 VPCMP $0, %YMMZERO, %YMM1, %k0
109 /* Each bit in K1 represents a CHAR in YMM1. */
110 VPCMP $0, %YMMMATCH, %YMM1, %k1
111 kmovd %k0, %edx
112 kmovd %k1, %eax
113
/* Discard the mask bits of elements that precede the string start.
   Bit 0 of both masks now corresponds to the first string element.  */
114 shrxl %SHIFT_REG, %edx, %edx
115 shrxl %SHIFT_REG, %eax, %eax
116 addq $VEC_SIZE, %rdi
117
118 /* Check if there is a CHAR. */
119 testl %eax, %eax
120 jnz L(found_char)
121
122 testl %edx, %edx
123 jnz L(return_null)
124
/* edx is zero here, which is also the no-remembered-match state the
   loop expects.  rdi is already aligned and points at the next
   vector.  */
125 jmp L(aligned_loop)
126
127 .p2align 4
/* Cross-page vector holds a CHAR.  eax = shifted match mask,
   edx = shifted null mask.  */
128L(found_char):
129 testl %edx, %edx
130 jnz L(char_and_nul)
131
132 /* Remember the match and keep searching.  rsi = rdi + start offset,
   so rsi - VEC_SIZE is the string start, which is what bit 0 of the
   shifted mask saved in edx refers to.  */
133 movl %eax, %edx
134 leaq (%rdi, %rcx), %rsi
135
136 .p2align 4
/* Main loop, unrolled four times.  Each step loads one aligned vector,
   advances rdi past it and leaves the loop as soon as that vector
   holds a CHAR or a null.  On exit eax is the match mask and k0 still
   holds the null mask, while ecx has been overwritten by the OR of
   both masks.  */
137L(aligned_loop):
138 VMOVA (%rdi), %YMM1
139 addq $VEC_SIZE, %rdi
140
141 /* Each bit in K0 represents a null byte in YMM1. */
142 VPCMP $0, %YMMZERO, %YMM1, %k0
143 /* Each bit in K1 represents a CHAR in YMM1. */
144 VPCMP $0, %YMMMATCH, %YMM1, %k1
145 kmovd %k0, %ecx
146 kmovd %k1, %eax
147 orl %eax, %ecx
148 jnz L(char_nor_null)
149
150 VMOVA (%rdi), %YMM1
151 add $VEC_SIZE, %rdi
152
153 /* Each bit in K0 represents a null byte in YMM1. */
154 VPCMP $0, %YMMZERO, %YMM1, %k0
155 /* Each bit in K1 represents a CHAR in YMM1. */
156 VPCMP $0, %YMMMATCH, %YMM1, %k1
157 kmovd %k0, %ecx
158 kmovd %k1, %eax
159 orl %eax, %ecx
160 jnz L(char_nor_null)
161
162 VMOVA (%rdi), %YMM1
163 addq $VEC_SIZE, %rdi
164
165 /* Each bit in K0 represents a null byte in YMM1. */
166 VPCMP $0, %YMMZERO, %YMM1, %k0
167 /* Each bit in K1 represents a CHAR in YMM1. */
168 VPCMP $0, %YMMMATCH, %YMM1, %k1
169 kmovd %k0, %ecx
170 kmovd %k1, %eax
171 orl %eax, %ecx
172 jnz L(char_nor_null)
173
174 VMOVA (%rdi), %YMM1
175 addq $VEC_SIZE, %rdi
176
177 /* Each bit in K0 represents a null byte in YMM1. */
178 VPCMP $0, %YMMZERO, %YMM1, %k0
179 /* Each bit in K1 represents a CHAR in YMM1. */
180 VPCMP $0, %YMMMATCH, %YMM1, %k1
181 kmovd %k0, %ecx
182 kmovd %k1, %eax
183 orl %eax, %ecx
184 jz L(aligned_loop)
185
186 .p2align 4
/* Despite the label name, this is reached when the vector just loaded
   holds a CHAR, a null, or both.  */
187L(char_nor_null):
188 /* Branch away if the vector holds a CHAR.  Otherwise it holds only
   the terminator and control falls into return_value.  */
189 testl %eax, %eax
190 jnz L(match)
/* The terminator was reached with no usable match in the final
   vector.  Return the remembered match, or null when edx is 0.  */
191L(return_value):
192 testl %edx, %edx
193 jz L(return_null)
194 movl %edx, %eax
195 movq %rsi, %rdi
/* The highest set bit is the last match inside the remembered
   vector.  */
196 bsrl %eax, %eax
197# ifdef USE_AS_WCSRCHR
198 /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
199 leaq -VEC_SIZE(%rdi, %rax, 4), %rax
200# else
201 leaq -VEC_SIZE(%rdi, %rax), %rax
202# endif
203 ret
204
205 .p2align 4
206L(match):
207 /* Find a CHAR. Check if there is a null byte.  The null mask is
   reloaded from k0 because the loop merged eax into ecx.  */
208 kmovd %k0, %ecx
209 testl %ecx, %ecx
210 jnz L(find_nul)
211
212 /* Remember the match and keep searching.  rdi already points past
   the matching vector, consistent with the -VEC_SIZE used when the
   result is formed.  */
213 movl %eax, %edx
214 movq %rdi, %rsi
215 jmp L(aligned_loop)
216
217 .p2align 4
/* The current loop vector holds both a CHAR and the terminator.
   eax = match mask, ecx = null mask.  */
218L(find_nul):
219 /* Mask out any matching bits after the null byte.
   r8d = (ecx - 1) ^ ecx has every bit set up to and including the
   lowest set bit of ecx, so a match on the null element itself is
   kept.  */
220 movl %ecx, %r8d
221 subl $1, %r8d
222 xorl %ecx, %r8d
223 andl %r8d, %eax
224 testl %eax, %eax
225 /* If there is no CHAR here, return the remembered one. */
226 jz L(return_value)
227 bsrl %eax, %eax
228# ifdef USE_AS_WCSRCHR
229 /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
230 leaq -VEC_SIZE(%rdi, %rax, 4), %rax
231# else
232 leaq -VEC_SIZE(%rdi, %rax), %rax
233# endif
234 ret
235
236 .p2align 4
237L(char_and_nul):
238 /* Find both a CHAR and a null byte.  Entry from the cross-page path:
   add the start offset to rdi so it matches the shifted masks, then
   move the null mask into ecx for the code shared with the
   first-vector case.  */
239 addq %rcx, %rdi
240 movl %edx, %ecx
/* Entry from the first-vector path.  eax = match mask,
   ecx = null mask, rdi - VEC_SIZE = address of the element that bit 0
   refers to.  Nothing has been remembered yet on either entry, so the
   fallback is a null result.  */
241L(char_and_nul_in_first_vec):
242 /* Mask out any matching bits after the null byte. */
243 movl %ecx, %r8d
244 subl $1, %r8d
245 xorl %ecx, %r8d
246 andl %r8d, %eax
247 testl %eax, %eax
248 /* Return null pointer if the null byte comes first. */
249 jz L(return_null)
250 bsrl %eax, %eax
251# ifdef USE_AS_WCSRCHR
252 /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
253 leaq -VEC_SIZE(%rdi, %rax, 4), %rax
254# else
255 leaq -VEC_SIZE(%rdi, %rax), %rax
256# endif
257 ret
258
259 .p2align 4
/* CHAR does not occur at or before the terminator: return 0.  */
260L(return_null):
261 xorl %eax, %eax
262 ret
263
264END (STRRCHR)
265#endif
266