/* memrchr optimized with SSE2.
   Copyright (C) 2017-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation
   so we need this to build for ISA V2 builds.  */
#if ISA_SHOULD_BUILD (2)

# ifndef MEMRCHR
#  define MEMRCHR __memrchr_sse2
# endif

# include <sysdep.h>
# define VEC_SIZE 16
# define PAGE_SIZE 4096
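
/* A rough, illustrative C-level sketch of what the assembly below
   implements: a scalar reference for the semantics, with the vector
   strategy noted in the comments.  The helper name is made up for the
   sketch and is not part of the build.

	#include <stddef.h>

	static void *
	memrchr_sketch (const void *s, int c, size_t n)
	{
	  const unsigned char *start = s;
	  const unsigned char *end = start + n;
	  // Vector strategy used below: 1) test the (possibly unaligned)
	  // VEC_SIZE bytes ending at `end', 2) align down and test one
	  // vector at a time, 3) main loop: test 4 aligned vectors (64
	  // bytes) per iteration walking backwards, 4) on a hit, bsr over
	  // the byte masks picks the highest matching byte.
	  while (end != start)
	    if (*--end == (unsigned char) c)
	      return (void *) end;
	  return NULL;
	}
*/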

	.text
ENTRY_P2ALIGN(MEMRCHR, 6)
# ifdef __ILP32__
	/* Clear upper bits.  */
	mov %RDX_LP, %RDX_LP
# endif
	movd %esi, %xmm0

	/* Get end pointer.  */
	leaq (%rdx, %rdi), %rcx

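	/* Broadcast the search byte (low byte of xmm0) to all VEC_SIZE bytes
	   of xmm0 so a single pcmpeqb compares a whole vector at once.  */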
	punpcklbw %xmm0, %xmm0
	punpcklwd %xmm0, %xmm0
	pshufd $0, %xmm0, %xmm0

	/* Check if we can load 1x VEC without crossing a page.  */
	testl $(PAGE_SIZE - VEC_SIZE), %ecx
	jz L(page_cross)
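	/* With PAGE_SIZE 4096 and VEC_SIZE 16 the test above masks bits 4-11
	   of the end pointer; they are all zero exactly when
	   endptr % PAGE_SIZE < VEC_SIZE, i.e. when the VEC_SIZE load ending at
	   endptr could touch the preceding (possibly unmapped) page.  */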

	/* NB: This load happens regardless of whether rdx (len) is zero.  Since
	   it doesn't cross a page and the standard guarantees that any pointer
	   has at least one valid byte, this load must be safe.  For the entire
	   history of the x86 memrchr implementation this has been possible, so
	   no code "should" be relying on a zero-length check before this load.
	   The zero-length check is moved to the page cross case because it is
	   1) pretty cold and 2) including it pushes the hot case
	   (len <= VEC_SIZE) into 2 cache lines.  */
	movups -(VEC_SIZE)(%rcx), %xmm1
	pcmpeqb %xmm0, %xmm1
	pmovmskb %xmm1, %eax

	subq $VEC_SIZE, %rdx
	ja L(more_1x_vec)
L(ret_vec_x0_test):
	/* Zero-flag set if eax (src) is zero.  Destination unchanged if src is
	   zero.  */
	bsrl %eax, %eax
	jz L(ret_0)
	/* Check if the CHAR match is in bounds.  Need to truly zero `eax` here
	   if out of bounds.  */
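	/* eax now holds the highest match offset within a VEC_SIZE window
	   whose first byte is at rdi + edx (edx may be negative when the
	   window starts before the buffer), so edx + eax is the match offset
	   from rdi and a negative sum means the match lies before the start of
	   the buffer.  E.g. on the entry path above with len = 10, edx = -6: a
	   match at window byte 12 is rdi + 6 (in bounds) while one at window
	   byte 3 would be rdi - 3 (out of bounds).  */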
	addl %edx, %eax
	jl L(zero_0)
	/* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
	   ptr.  */
	addq %rdi, %rax
L(ret_0):
	ret

	.p2align 4,, 5
L(ret_vec_x0):
	bsrl %eax, %eax
	leaq -(VEC_SIZE)(%rcx, %rax), %rax
	ret

	.p2align 4,, 2
L(zero_0):
	xorl %eax, %eax
	ret


	.p2align 4,, 8
L(more_1x_vec):
	testl %eax, %eax
	jnz L(ret_vec_x0)

	/* Align rcx (the end pointer) down to a VEC_SIZE boundary.  */
	decq %rcx
	andq $-VEC_SIZE, %rcx
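	/* The decq makes an already VEC_SIZE-aligned end pointer step back a
	   full vector (its last VEC_SIZE bytes were just checked); otherwise
	   rcx is simply rounded down so the movaps loads below are aligned.  */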

	movq %rcx, %rdx
	/* NB: We could consistently save 1-byte in this pattern with `movaps
	   %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`.  The reason against it is
	   that it adds more frontend uops (even if the moves can be eliminated)
	   and some percentage of the time actual backend uops.  */
	movaps -(VEC_SIZE)(%rcx), %xmm1
	pcmpeqb %xmm0, %xmm1
	subq %rdi, %rdx
	pmovmskb %xmm1, %eax

	cmpq $(VEC_SIZE * 2), %rdx
	ja L(more_2x_vec)
L(last_2x_vec):
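	/* edx is the number of in-range bytes below the aligned end pointer.
	   If at most one vector's worth remains, the vector just tested (its
	   mask is still in eax) is the last one that can contain in-range
	   bytes, so take the shared bounds-checked return.  */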
	subl $VEC_SIZE, %edx
	jbe L(ret_vec_x0_test)

	testl %eax, %eax
	jnz L(ret_vec_x0)

	movaps -(VEC_SIZE * 2)(%rcx), %xmm1
	pcmpeqb %xmm0, %xmm1
	pmovmskb %xmm1, %eax

	subl $VEC_SIZE, %edx
	bsrl %eax, %eax
	jz L(ret_1)
	addl %edx, %eax
	jl L(zero_0)
	addq %rdi, %rax
L(ret_1):
	ret

	/* Don't align.  Otherwise we lose the 2-byte encoding of the jump to
	   L(page_cross), which causes the hot path (length <= VEC_SIZE) to
	   span multiple cache lines.  Naturally aligned to 8 bytes modulo
	   16.  */
L(page_cross):
	/* Zero-length check.  */
	testq %rdx, %rdx
	jz L(zero_0)

	leaq -1(%rcx), %r8
	andq $-(VEC_SIZE), %r8

	movaps (%r8), %xmm1
	pcmpeqb %xmm0, %xmm1
	pmovmskb %xmm1, %esi
	/* Shift out the match bits for bytes at or above the end pointer (the
	   alignment offset is negative because we start from endptr and work
	   backwards).  */
	negl %ecx
	/* The shift is a 32-bit shift (the hardware masks the count to 0..31)
	   but VEC_SIZE is 16, so the count must be masked with VEC_SIZE - 1
	   explicitly.  */
	andl $(VEC_SIZE - 1), %ecx
	shl %cl, %esi
	movzwl %si, %eax
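	/* For example, if endptr % VEC_SIZE == 5: r8 = endptr - 5 and the
	   shift count is 11, so the bits for the 5 bytes below endptr (vector
	   offsets 0-4) move up to bits 11-15 while the bits for bytes at or
	   above endptr are pushed past bit 15 and dropped by the movzwl.  A
	   later endptr - VEC_SIZE + bsr then lands back on the matching
	   byte.  */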
	leaq (%rdi, %rdx), %rcx
	cmpq %rdi, %r8
	ja L(more_1x_vec)
	subl $VEC_SIZE, %edx
	bsrl %eax, %eax
	jz L(ret_2)
	addl %edx, %eax
	jl L(zero_1)
	addq %rdi, %rax
L(ret_2):
	ret

	/* Fits in the aligning bytes.  */
L(zero_1):
	xorl %eax, %eax
	ret

	.p2align 4,, 5
L(ret_vec_x1):
	bsrl %eax, %eax
	leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax
	ret

	.p2align 4,, 8
L(more_2x_vec):
	testl %eax, %eax
	jnz L(ret_vec_x0)

	movaps -(VEC_SIZE * 2)(%rcx), %xmm1
	pcmpeqb %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	testl %eax, %eax
	jnz L(ret_vec_x1)


	movaps -(VEC_SIZE * 3)(%rcx), %xmm1
	pcmpeqb %xmm0, %xmm1
	pmovmskb %xmm1, %eax

	subq $(VEC_SIZE * 4), %rdx
	ja L(more_4x_vec)
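	/* rdx was the byte count below the aligned end pointer and is now that
	   count minus VEC_SIZE * 4 (<= 0 here).  After adding VEC_SIZE back,
	   edx <= 0 means the vector at rcx - VEC_SIZE * 3 (its mask is already
	   in eax) is the last one that can contain in-range bytes, so take the
	   bounds-checked return.  */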

	addl $(VEC_SIZE), %edx
	jle L(ret_vec_x2_test)

L(last_vec):
	testl %eax, %eax
	jnz L(ret_vec_x2)

	movaps -(VEC_SIZE * 4)(%rcx), %xmm1
	pcmpeqb %xmm0, %xmm1
	pmovmskb %xmm1, %eax

	subl $(VEC_SIZE), %edx
	bsrl %eax, %eax
	jz L(ret_3)
	addl %edx, %eax
	jl L(zero_2)
	addq %rdi, %rax
L(ret_3):
	ret

	.p2align 4,, 6
L(ret_vec_x2_test):
	bsrl %eax, %eax
	jz L(zero_2)
	addl %edx, %eax
	jl L(zero_2)
	addq %rdi, %rax
	ret

L(zero_2):
	xorl %eax, %eax
	ret


	.p2align 4,, 5
L(ret_vec_x2):
	bsrl %eax, %eax
	leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax
	ret

	.p2align 4,, 5
L(ret_vec_x3):
	bsrl %eax, %eax
	leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
	ret

	.p2align 4,, 8
L(more_4x_vec):
	testl %eax, %eax
	jnz L(ret_vec_x2)

	movaps -(VEC_SIZE * 4)(%rcx), %xmm1
	pcmpeqb %xmm0, %xmm1
	pmovmskb %xmm1, %eax

	testl %eax, %eax
	jnz L(ret_vec_x3)

	addq $-(VEC_SIZE * 4), %rcx
	cmpq $(VEC_SIZE * 4), %rdx
	jbe L(last_4x_vec)

	/* Offset everything by 4x VEC_SIZE here to save a few bytes at the
	   end, keeping the code from spilling onto the next cache line.  */
	addq $(VEC_SIZE * 4 - 1), %rcx
	andq $-(VEC_SIZE * 4), %rcx
	leaq (VEC_SIZE * 4)(%rdi), %rdx
	andq $-(VEC_SIZE * 4), %rdx
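	/* rcx is rounded up to a 4 * VEC_SIZE boundary so the loop always
	   works on aligned 64-byte blocks (possibly re-scanning a few bytes
	   already known not to match), and rdx is the first 4 * VEC_SIZE
	   boundary strictly above rdi, where the loop stops.  The remaining
	   bytes down to rdi are handled by L(last_4x_vec).  */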

	.p2align 4,, 11
L(loop_4x_vec):
	movaps (VEC_SIZE * -1)(%rcx), %xmm1
	movaps (VEC_SIZE * -2)(%rcx), %xmm2
	movaps (VEC_SIZE * -3)(%rcx), %xmm3
	movaps (VEC_SIZE * -4)(%rcx), %xmm4
	pcmpeqb %xmm0, %xmm1
	pcmpeqb %xmm0, %xmm2
	pcmpeqb %xmm0, %xmm3
	pcmpeqb %xmm0, %xmm4

	por %xmm1, %xmm2
	por %xmm3, %xmm4
	por %xmm2, %xmm4

	pmovmskb %xmm4, %esi
	testl %esi, %esi
	jnz L(loop_end)

	addq $-(VEC_SIZE * 4), %rcx
	cmpq %rdx, %rcx
	jne L(loop_4x_vec)

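	/* No match in the loop: rcx == rdx (the aligned stop point).
	   Recompute the number of remaining bytes above rdi for the tail
	   below.  */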
	subl %edi, %edx

	/* Ends up being 1-byte nop.  */
	.p2align 4,, 2
L(last_4x_vec):
	movaps -(VEC_SIZE)(%rcx), %xmm1
	pcmpeqb %xmm0, %xmm1
	pmovmskb %xmm1, %eax

	cmpl $(VEC_SIZE * 2), %edx
	jbe L(last_2x_vec)

	testl %eax, %eax
	jnz L(ret_vec_x0)


	movaps -(VEC_SIZE * 2)(%rcx), %xmm1
	pcmpeqb %xmm0, %xmm1
	pmovmskb %xmm1, %eax

	testl %eax, %eax
	jnz L(ret_vec_end)

	movaps -(VEC_SIZE * 3)(%rcx), %xmm1
	pcmpeqb %xmm0, %xmm1
	pmovmskb %xmm1, %eax

	subl $(VEC_SIZE * 3), %edx
	ja L(last_vec)
	bsrl %eax, %eax
	jz L(ret_4)
	addl %edx, %eax
	jl L(zero_3)
	addq %rdi, %rax
L(ret_4):
	ret

	/* Ends up being 1-byte nop.  */
	.p2align 4,, 3
L(loop_end):
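	/* A match is somewhere in the 4 vectors just loaded; check them from
	   the highest address down.  The sall by 16 below both tests the mask
	   of the vector at rcx - VEC_SIZE for zero and pre-positions it so
	   that the shared -(VEC_SIZE * 2) base in L(ret_vec_end) produces the
	   right address.  */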
	pmovmskb %xmm1, %eax
	sall $16, %eax
	jnz L(ret_vec_end)

	pmovmskb %xmm2, %eax
	testl %eax, %eax
	jnz L(ret_vec_end)

	pmovmskb %xmm3, %eax
	/* Combine the last 2 VEC matches.  eax (VEC3) is shifted into bits
	   16-31 and esi still holds the combined match bits from the loop in
	   bits 0-15.  If eax is zero (no CHAR in VEC3) it won't affect the
	   result from esi (VEC4); if eax is non-zero then CHAR is in VEC3 and
	   bsrl will use that higher position.  */
	sall $16, %eax
	orl %esi, %eax
	bsrl %eax, %eax
	leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
	ret

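	/* Shared return: eax is either the mask of the vector at
	   rcx - VEC_SIZE * 2 (unshifted) or the mask of the vector at
	   rcx - VEC_SIZE shifted left by 16, so in both cases
	   rcx - VEC_SIZE * 2 + bsr is the match address.  */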
L(ret_vec_end):
	bsrl %eax, %eax
	leaq (VEC_SIZE * -2)(%rax, %rcx), %rax
	ret
	/* Used in L(last_4x_vec).  It is in the same cache line and just uses
	   spare aligning bytes.  */
L(zero_3):
	xorl %eax, %eax
	ret
	/* 2 bytes from the next cache line.  */
END(MEMRCHR)
#endif