/* memrchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>
# include "evex256-vecs.h"
# if VEC_SIZE != 32
#  error "VEC_SIZE != 32 unimplemented"
# endif

# ifndef MEMRCHR
#  define MEMRCHR				__memrchr_evex
# endif

# define PAGE_SIZE			4096
# define VECMATCH			VEC(0)

	.section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN(MEMRCHR, 6)
# ifdef __ILP32__
	/* Clear upper bits.  */
	and	%RDX_LP, %RDX_LP
# else
	test	%RDX_LP, %RDX_LP
# endif
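	/* NB: under __ILP32__ the and both zeroes the upper 32 bits of the
	   length register and sets ZF, so the jz below works for either
	   path.  */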
	jz	L(zero_0)

	/* Get end pointer.  Minus one for two reasons: 1) it is necessary
	   for a correct page cross check and 2) it sets up the end pointer
	   so that the match position can be computed by subtracting lzcnt
	   directly.  */
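	/* NB: with rax == end - 1, a buffer that ends exactly on a page
	   boundary does not take the page cross path below (the VEC_SIZE
	   load ending at end - 1 stays within one page), and rax - lzcnt
	   directly yields the match pointer.  */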
	leaq	-1(%rdi, %rdx), %rax
	vpbroadcastb %esi, %VECMATCH

	/* Check if we can load 1x VEC without crossing a page.  */
	testl	$(PAGE_SIZE - VEC_SIZE), %eax
	jz	L(page_cross)
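	/* NB: PAGE_SIZE - VEC_SIZE == 0xfe0, so the test above is zero
	   exactly when (end - 1) % PAGE_SIZE < VEC_SIZE, i.e. when the
	   VEC_SIZE load ending at end - 1 may reach back into the previous
	   page.  */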

	/* Don't use rax for pointer here because EVEX has better encoding with
	   offset % VEC_SIZE == 0.  */
	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
	kmovd	%k0, %ecx

	/* Fall through for rdx (len) <= VEC_SIZE (expect small sizes).  */
	cmpq	$VEC_SIZE, %rdx
	ja	L(more_1x_vec)
L(ret_vec_x0_test):

	/* If ecx is zero (no matches) lzcnt will set it to 32 (VEC_SIZE),
	   which guarantees edx (len) is less than or equal to it, so the
	   branch below returns zero.  */
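	/* E.g. len == 5 with the only match at end - 3: the highest set bit
	   in ecx is bit 29, lzcnt gives 2, 2 < 5, and the return value is
	   rax - 2 == end - 3.  */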
	lzcntl	%ecx, %ecx
	cmpl	%ecx, %edx
	jle	L(zero_0)
	subq	%rcx, %rax
	ret

	/* Fits in aligning bytes of first cache line.  */
L(zero_0):
	xorl	%eax, %eax
	ret

	.p2align 4,, 9
L(ret_vec_x0_dec):
	decq	%rax
L(ret_vec_x0):
	lzcntl	%ecx, %ecx
	subq	%rcx, %rax
	ret

	.p2align 4,, 10
L(more_1x_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x0)

	/* Align rax (the current end pointer) down to VEC_SIZE.  */
	andq	$-VEC_SIZE, %rax

	/* Recompute length after aligning.  */
	movq	%rax, %rdx

	/* Need no matter what.  */
	vpcmpb	$0, -(VEC_SIZE)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	subq	%rdi, %rdx

	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
L(last_2x_vec):

	/* Must dec rax because L(ret_vec_x0_test) expects it.  */
	decq	%rax
	cmpl	$VEC_SIZE, %edx
	jbe	L(ret_vec_x0_test)

	testl	%ecx, %ecx
	jnz	L(ret_vec_x0)

	/* Don't use rax for pointer here because EVEX has better encoding with
	   offset % VEC_SIZE == 0.  */
	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0
	kmovd	%k0, %ecx
	/* NB: 64-bit lzcnt. This will naturally add 32 to position.  */
	lzcntq	%rcx, %rcx
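	/* NB: kmovd zero-extended the mask, so bits 32-63 of rcx are zero
	   and lzcntq returns 32 + lzcntl (64 if there is no match at all),
	   accounting for this VEC being one VEC_SIZE further from the
	   end.  */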
	cmpl	%ecx, %edx
	jle	L(zero_0)
	subq	%rcx, %rax
	ret

	/* Inexpensive place to put this regarding code size / target alignments
	   / ICache NLP. Necessary for 2-byte encoding of jump to page cross
	   case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
	   in first cache line.  */
L(page_cross):
	movq	%rax, %rsi
	andq	$-VEC_SIZE, %rsi
	vpcmpb	$0, (%rsi), %VECMATCH, %k0
	kmovd	%k0, %r8d
	/* Shift out negative alignment (because we are starting from endptr and
	   working backwards).  */
	movl	%eax, %ecx
	/* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
	notl	%ecx
	shlxl	%ecx, %r8d, %ecx
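	/* NB: shlx only uses the low 5 bits of the count, so the shift is by
	   31 - ((end - 1) % VEC_SIZE).  That moves the bit for the byte at
	   end - 1 up to bit 31 and discards bits for bytes at or past end,
	   so the lzcnt below gives the distance from end - 1 back to the
	   last match; the length check then rejects matches before the
	   start of the buffer.  */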
	cmpq	%rdi, %rsi
	ja	L(more_1x_vec)
	lzcntl	%ecx, %ecx
	cmpl	%ecx, %edx
	jle	L(zero_1)
	subq	%rcx, %rax
	ret

	/* Continue creating zero labels that fit in aligning bytes and get
	   2-byte encoding / are in the same cache line as condition.  */
L(zero_1):
	xorl	%eax, %eax
	ret

	.p2align 4,, 8
L(ret_vec_x1):
	/* This will naturally add 32 to position.  */
	bsrl	%ecx, %ecx
	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
	ret

	.p2align 4,, 8
L(more_2x_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x0_dec)

	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x1)

	/* Need no matter what.  */
	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	subq	$(VEC_SIZE * 4), %rdx
	ja	L(more_4x_vec)

	cmpl	$(VEC_SIZE * -1), %edx
	jle	L(ret_vec_x2_test)
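	/* NB: rdx now holds the aligned length minus VEC_SIZE * 4, so the
	   ja above fires when more than 4 VECs remain, while the jle
	   catches the case where at most 3 VECs remain and the compare
	   above was the last load needed.  */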
L(last_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)


	/* Need no matter what.  */
	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx
	lzcntl	%ecx, %ecx
	subq	$(VEC_SIZE * 3 + 1), %rax
	subq	%rcx, %rax
	cmpq	%rax, %rdi
	ja	L(zero_1)
	ret

	.p2align 4,, 8
L(ret_vec_x2_test):
	lzcntl	%ecx, %ecx
	subq	$(VEC_SIZE * 2 + 1), %rax
	subq	%rcx, %rax
	cmpq	%rax, %rdi
	ja	L(zero_1)
	ret

	.p2align 4,, 8
L(ret_vec_x2):
	bsrl	%ecx, %ecx
	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
	ret

	.p2align 4,, 8
L(ret_vec_x3):
	bsrl	%ecx, %ecx
	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
	ret

	.p2align 4,, 8
L(more_4x_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)

	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	testl	%ecx, %ecx
	jnz	L(ret_vec_x3)

	/* Check if near end before re-aligning (otherwise might do an
	   unnecessary loop iteration).  */
	addq	$-(VEC_SIZE * 4), %rax
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec)

	decq	%rax
	andq	$-(VEC_SIZE * 4), %rax
	movq	%rdi, %rdx
	/* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
	   lengths that overflow can be valid and break the comparison.  */
	andq	$-(VEC_SIZE * 4), %rdx

	.p2align 4
L(loop_4x_vec):
	/* Store 1 where not-equals and 0 where equals in k1 (used to mask
	   later on).  */
	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1

	/* VEC(2/3) will have zero-byte where we found a CHAR.  */
	vpxorq	(VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2)
	vpxorq	(VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3)
	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4

	/* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where
	   CHAR is found and VEC(2/3) have zero-byte where CHAR is found).  */
	vpminub	%VEC(2), %VEC(3), %VEC(3){%k1}{z}
	vptestnmb %VEC(3), %VEC(3), %k2
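	/* NB: a byte of VEC(3) is now zero iff CHAR was found at that lane
	   in any of the three higher loads, so the single vptestnmb above
	   covers three VECs and the kortestd below with k4 covers all
	   four.  */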

	/* Any 1s and we found CHAR.  */
	kortestd %k2, %k4
	jnz	L(loop_end)

	addq	$-(VEC_SIZE * 4), %rax
	cmpq	%rdx, %rax
	jne	L(loop_4x_vec)

	/* Need to re-adjust rdx / rax for L(last_4x_vec).  */
	subq	$-(VEC_SIZE * 4), %rdx
	movq	%rdx, %rax
	subl	%edi, %edx
L(last_4x_vec):

	/* Used no matter what.  */
	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	cmpl	$(VEC_SIZE * 2), %edx
	jbe	L(last_2x_vec)

	testl	%ecx, %ecx
	jnz	L(ret_vec_x0_dec)


	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	testl	%ecx, %ecx
	jnz	L(ret_vec_x1)

	/* Used no matter what.  */
	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	cmpl	$(VEC_SIZE * 3), %edx
	ja	L(last_vec)

	lzcntl	%ecx, %ecx
	subq	$(VEC_SIZE * 2 + 1), %rax
	subq	%rcx, %rax
	cmpq	%rax, %rdi
	jbe	L(ret_1)
	xorl	%eax, %eax
L(ret_1):
	ret

	.p2align 4,, 6
L(loop_end):
	kmovd	%k1, %ecx
	notl	%ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x0_end)

	vptestnmb %VEC(2), %VEC(2), %k0
	kmovd	%k0, %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x1_end)

	kmovd	%k2, %ecx
	kmovd	%k4, %esi
	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
	   then it won't affect the result in esi (VEC4). If ecx is non-zero
	   then CHAR is in VEC3 and bsrq will use that position.  */
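	/* E.g. if the highest match in the VEC_SIZE * 1 load is at lane 20,
	   bit 52 of the combined 64-bit value is set, bsrq returns
	   VEC_SIZE + 20, and rax + VEC_SIZE + 20 is exactly that byte.  */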
	salq	$32, %rcx
	orq	%rsi, %rcx
	bsrq	%rcx, %rcx
	addq	%rcx, %rax
	ret
	.p2align 4,, 4
L(ret_vec_x0_end):
	addq	$(VEC_SIZE), %rax
L(ret_vec_x1_end):
	bsrl	%ecx, %ecx
	leaq	(VEC_SIZE * 2)(%rax, %rcx), %rax
	ret

END(MEMRCHR)
#endif