/* strlen/strnlen/wcslen/wcsnlen optimized with 256/512-bit EVEX instructions.
   Copyright (C) 2021-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>

# ifndef STRLEN
#  define STRLEN	__strlen_evex
# endif

# ifndef VEC_SIZE
#  include "x86-evex256-vecs.h"
# endif

# ifdef USE_AS_WCSLEN
#  define VPCMPEQ	vpcmpeqd
#  define VPCMPNEQ	vpcmpneqd
#  define VPTESTN	vptestnmd
#  define VPTEST	vptestmd
#  define VPMINU	vpminud
#  define CHAR_SIZE	4
#  define CHAR_SIZE_SHIFT_REG(reg)	sar $2, %reg
# else
#  define VPCMPEQ	vpcmpeqb
#  define VPCMPNEQ	vpcmpneqb
#  define VPTESTN	vptestnmb
#  define VPTEST	vptestmb
#  define VPMINU	vpminub
#  define CHAR_SIZE	1
#  define CHAR_SIZE_SHIFT_REG(reg)

#  define REG_WIDTH	VEC_SIZE
# endif

# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

# include "reg-macros.h"

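/* When CHAR_PER_VEC == 64 the masks for the last two VECs cannot be
   combined into a single 64-bit register (the needed shift would be by
   CHAR_PER_VEC == 64 bits), so VEC2 is handled by an early (tail)
   branch and VEC3 by the fallthrough return; for smaller CHAR_PER_VEC
   the combined mask falls through to the VEC2 return instead.  */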
# if CHAR_PER_VEC == 64

#  define TAIL_RETURN_LBL	first_vec_x2
#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 2)

#  define FALLTHROUGH_RETURN_LBL	first_vec_x3
#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 3)

# else

#  define TAIL_RETURN_LBL	first_vec_x3
#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 3)

#  define FALLTHROUGH_RETURN_LBL	first_vec_x2
#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 2)
# endif

# define XZERO	VMM_128(0)
# define VZERO	VMM(0)
# define PAGE_SIZE	4096

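/* Overall scan strategy: if the first VEC_SIZE-byte load would cross a
   page, load from the aligned-down address and shift out the leading
   bytes; otherwise scan the first VEC at the given pointer, then four
   more VECs one at a time from the aligned pointer, and finally loop
   over 4 VECs per iteration, using VPMINU to fold each pair of VECs
   into a single null test.  */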
	.section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN (STRLEN, 6)
	movl	%edi, %eax
	vpxorq	%XZERO, %XZERO, %XZERO
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
	   null byte.  */
	VPCMPEQ	(%rdi), %VZERO, %k0
	KMOV	%k0, %VRAX
	test	%VRAX, %VRAX
	jz	L(aligned_more)
	bsf	%VRAX, %VRAX
	ret

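	/* %rcx holds the original pointer and %rdi the VEC_SIZE-aligned
	   one, so %edi - %ecx is the (non-positive) byte adjustment;
	   CHAR_SIZE_SHIFT_REG converts it to characters before adding
	   the vector offset and the bit index of the null.  */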
	.p2align 4,, 8
L(first_vec_x4):
	bsf	%VRAX, %VRAX
	subl	%ecx, %edi
	CHAR_SIZE_SHIFT_REG (edi)
	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
	ret


	/* In the strnlen build, aligned_more compares the remaining
	   length against 2 * CHAR_PER_VEC, 4 * CHAR_PER_VEC, and
	   8 * CHAR_PER_VEC before entering the loop.  */
	.p2align 4,, 10
L(aligned_more):
	movq	%rdi, %rcx
	andq	$(VEC_SIZE * -1), %rdi
L(cross_page_continue):
	/* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
	   rechecking bounds.  */
	VPCMPEQ	(VEC_SIZE * 1)(%rdi), %VZERO, %k0
	KMOV	%k0, %VRAX
	test	%VRAX, %VRAX
	jnz	L(first_vec_x1)

	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
	KMOV	%k0, %VRAX
	test	%VRAX, %VRAX
	jnz	L(first_vec_x2)

	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
	KMOV	%k0, %VRAX
	test	%VRAX, %VRAX
	jnz	L(first_vec_x3)

	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
	KMOV	%k0, %VRAX
	test	%VRAX, %VRAX
	jnz	L(first_vec_x4)

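	/* Advance %rdi by VEC_SIZE and round it down to a 4 * VEC_SIZE
	   boundary so the main loop can use fixed (VEC_SIZE * 4..7)
	   offsets.  Rounding down may rescan a few already-checked
	   bytes, which is harmless.  */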
	subq	$(VEC_SIZE * -1), %rdi

# if CHAR_PER_VEC == 64
	/* There are no partial-register stalls on the processors we use
	   evex512 on, and this saves code size.  */
	xorb	%dil, %dil
# else
	andq	$-(VEC_SIZE * 4), %rdi
# endif


	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
	VPMINU	(VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(3)
	VPMINU	(VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
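	/* An unsigned minimum of two VECs has a zero character exactly
	   where either input does, so one VPTESTN per pair is enough to
	   detect a null in either VEC.  */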
	VPTESTN	%VMM(2), %VMM(2), %k0
	VPTESTN	%VMM(4), %VMM(4), %k2

	subq	$-(VEC_SIZE * 4), %rdi
	KORTEST	%k0, %k2
	jz	L(loop_4x_vec)

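	/* A null character lies in one of the four VECs just scanned
	   (offsets 0..3 after the pointer update); narrow it down one
	   VEC at a time.  */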
	VPTESTN	%VMM(1), %VMM(1), %k1
	KMOV	%k1, %VRAX
	test	%VRAX, %VRAX
	jnz	L(first_vec_x0)

	KMOV	%k0, %VRAX
	test	%VRAX, %VRAX
	jnz	L(first_vec_x1)

	VPTESTN	%VMM(3), %VMM(3), %k0

# if CHAR_PER_VEC == 64
	KMOV	%k0, %VRAX
	test	%VRAX, %VRAX
	jnz	L(first_vec_x2)
	KMOV	%k2, %VRAX
# else
	/* The last 2x VEC masks can only be combined into one 64-bit
	   register when CHAR_PER_VEC <= 32, since the upper mask must
	   be shifted up by CHAR_PER_VEC bits.  */
	kmovd	%k2, %edx
	kmovd	%k0, %eax
	salq	$CHAR_PER_VEC, %rdx
	orq	%rdx, %rax
# endif

	/* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
	 */
	.p2align 4,, 2
L(FALLTHROUGH_RETURN_LBL):
	bsfq	%rax, %rax
	subq	%rcx, %rdi
	CHAR_SIZE_SHIFT_REG (rdi)
	leaq	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
	ret

	.p2align 4,, 8
L(first_vec_x0):
	bsf	%VRAX, %VRAX
	sub	%rcx, %rdi
	CHAR_SIZE_SHIFT_REG (rdi)
	addq	%rdi, %rax
	ret

	.p2align 4,, 10
L(first_vec_x1):
	bsf	%VRAX, %VRAX
	sub	%rcx, %rdi
	CHAR_SIZE_SHIFT_REG (rdi)
	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
	ret

	.p2align 4,, 10
	/* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM.
	 */
L(TAIL_RETURN_LBL):
	bsf	%VRAX, %VRAX
	sub	%VRCX, %VRDI
	CHAR_SIZE_SHIFT_REG (VRDI)
	lea	(TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
	ret

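	/* The first VEC_SIZE-byte load would cross a page boundary.
	   Load from the VEC_SIZE-aligned address below %rdi instead
	   (always safe) and discard the mask bits that correspond to
	   bytes before the start of the string.  */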
	.p2align 4,, 8
L(cross_page_boundary):
	movq	%rdi, %rcx
	/* Align data to VEC_SIZE.  */
	andq	$-VEC_SIZE, %rdi

	VPCMPEQ	(%rdi), %VZERO, %k0

	KMOV	%k0, %VRAX
# ifdef USE_AS_WCSLEN
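	/* For wcslen the mask has one bit per character (dword), so
	   convert the byte misalignment in %ecx to a character count
	   before shifting out the bits below the start of the string.  */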
	movl	%ecx, %edx
	shrl	$2, %edx
	andl	$(CHAR_PER_VEC - 1), %edx
	shrx	%edx, %eax, %eax
	testl	%eax, %eax
# else
	shr	%cl, %VRAX
# endif
	jz	L(cross_page_continue)
	bsf	%VRAX, %VRAX
	ret

END (STRLEN)
#endif