1 | /* strcspn with SSE4.2 intrinsics |
2 | Copyright (C) 2009-2023 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <isa-level.h> |
20 | #if IS_IN (libc) || MINIMUM_X86_ISA_LEVEL >= 2 |
21 | |
22 | # include <nmmintrin.h> |
23 | # include <string.h> |
24 | # include "varshift.h" |
25 | |
26 | /* We use 0x2: |
27 | _SIDD_SBYTE_OPS |
28 | | _SIDD_CMP_EQUAL_ANY |
29 | | _SIDD_POSITIVE_POLARITY |
30 | | _SIDD_LEAST_SIGNIFICANT |
31 | on pcmpistri to compare xmm/mem128 |
32 | |
33 | 0 1 2 3 4 5 6 7 8 9 A B C D E F |
34 | X X X X X X X X X X X X X X X X |
35 | |
36 | against xmm |
37 | |
38 | 0 1 2 3 4 5 6 7 8 9 A B C D E F |
39 | A A A A A A A A A A A A A A A A |
40 | |
41 | to find out if the first 16byte data element has any byte A and |
42 | the offset of the first byte. There are 3 cases: |
43 | |
44 | 1. The first 16byte data element has the byte A at the offset X. |
45 | 2. The first 16byte data element has EOS and doesn't have the byte A. |
46 | 3. The first 16byte data element is valid and doesn't have the byte A. |
47 | |
   Here is the table of ECX, CFlag, ZFlag and SFlag for the 3 cases:

   case    ECX    CFlag    ZFlag    SFlag
    1       X       1       0/1       0
    2      16       0        1        0
    3      16       0        0        0
53 | |
54 | We exit from the loop for cases 1 and 2 with jbe which branches |
55 | when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset |
56 | X for case 1. */ |
57 | |
58 | # ifndef STRCSPN |
59 | # define STRCSPN __strcspn_sse42 |
60 | # endif |
61 | # ifndef STRCSPN_GENERIC |
62 | # define STRCSPN_GENERIC __strcspn_generic |
63 | # endif |
64 | |
65 | # ifdef USE_AS_STRPBRK |
66 | # define RETURN(val1, val2) return val1 |
67 | # else |
68 | # define RETURN(val1, val2) return val2 |
69 | # endif |
70 | |
71 | extern |
72 | # ifdef USE_AS_STRPBRK |
73 | char * |
74 | # else |
75 | size_t |
76 | # endif |
77 | STRCSPN_GENERIC (const char *, const char *) attribute_hidden; |
78 | |
79 | |
80 | # ifdef USE_AS_STRPBRK |
81 | char * |
82 | # else |
83 | size_t |
84 | # endif |
85 | __attribute__ ((section (".text.sse4.2" ))) |
86 | STRCSPN (const char *s, const char *a) |
87 | { |
88 | if (*a == 0) |
89 | RETURN (NULL, strlen (s)); |
90 | |
91 | const char *aligned; |
92 | __m128i mask, maskz, zero; |
93 | unsigned int maskz_bits; |
94 | unsigned int offset = (unsigned int) ((size_t) a & 15); |
95 | zero = _mm_set1_epi8 (0); |
96 | if (offset != 0) |
97 | { |
98 | /* Load masks. */ |
99 | aligned = (const char *) ((size_t) a & -16L); |
100 | __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); |
101 | maskz = _mm_cmpeq_epi8 (mask0, zero); |
102 | |
103 | /* Find where the NULL terminator is. */ |
104 | maskz_bits = _mm_movemask_epi8 (maskz) >> offset; |
105 | if (maskz_bits != 0) |
106 | { |
107 | mask = __m128i_shift_right (mask0, offset); |
108 | offset = (unsigned int) ((size_t) s & 15); |
109 | if (offset) |
110 | goto start_unaligned; |
111 | |
112 | aligned = s; |
113 | goto start_loop; |
114 | } |
115 | } |
116 | |
117 | /* A is aligned. */ |
118 | mask = _mm_loadu_si128 ((__m128i *) a); |
119 | /* Find where the NULL terminator is. */ |
120 | maskz = _mm_cmpeq_epi8 (mask, zero); |
121 | maskz_bits = _mm_movemask_epi8 (maskz); |
122 | if (maskz_bits == 0) |
123 | { |
124 | /* There is no NULL terminator. Don't use SSE4.2 if the length |
125 | of A > 16. */ |
126 | if (a[16] != 0) |
127 | return STRCSPN_GENERIC (s, a); |
128 | } |
129 | |
130 | aligned = s; |
131 | offset = (unsigned int) ((size_t) s & 15); |
132 | if (offset != 0) |
133 | { |
134 | start_unaligned: |
135 | /* Check partial string. */ |
136 | aligned = (const char *) ((size_t) s & -16L); |
137 | __m128i value = _mm_load_si128 ((__m128i *) aligned); |
138 | |
139 | value = __m128i_shift_right (value, offset); |
140 | |
141 | unsigned int length = _mm_cmpistri (mask, value, 0x2); |
142 | /* No need to check ZFlag since ZFlag is always 1. */ |
143 | unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); |
144 | if (cflag) |
145 | RETURN ((char *) (s + length), length); |
146 | /* Find where the NULL terminator is. */ |
147 | unsigned int index = _mm_cmpistri (value, value, 0x3a); |
148 | if (index < 16 - offset) |
149 | RETURN (NULL, index); |
150 | aligned += 16; |
151 | } |
152 | |
153 | start_loop: |
154 | while (1) |
155 | { |
156 | __m128i value = _mm_load_si128 ((__m128i *) aligned); |
157 | unsigned int index = _mm_cmpistri (mask, value, 0x2); |
158 | unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); |
159 | unsigned int zflag = _mm_cmpistrz (mask, value, 0x2); |
160 | if (cflag) |
161 | RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); |
162 | if (zflag) |
163 | RETURN (NULL, |
164 | /* Find where the NULL terminator is. */ |
165 | (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s)); |
166 | aligned += 16; |
167 | } |
168 | } |
169 | #endif |
170 | |