1 | /* strspn with SSE4.2 intrinsics |
2 | Copyright (C) 2009-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <nmmintrin.h> |
20 | #include <string.h> |
21 | #include "varshift.h" |
22 | |
23 | /* We use 0x12: |
24 | _SIDD_SBYTE_OPS |
25 | | _SIDD_CMP_EQUAL_ANY |
26 | | _SIDD_NEGATIVE_POLARITY |
27 | | _SIDD_LEAST_SIGNIFICANT |
28 | on pcmpistri to compare xmm/mem128 |
29 | |
30 | 0 1 2 3 4 5 6 7 8 9 A B C D E F |
31 | X X X X X X X X X X X X X X X X |
32 | |
33 | against xmm |
34 | |
35 | 0 1 2 3 4 5 6 7 8 9 A B C D E F |
36 | A A A A A A A A A A A A A A A A |
37 | |
38 | to find out whether the first 16-byte data element has any non-A byte and, |
39 | if so, the offset of the first such byte. There are 2 cases: |
40 | |
41 | 1. The first 16byte data element has the non-A byte, including |
42 | EOS, at the offset X. |
43 | 2. The first 16byte data element is valid and doesn't have the non-A |
44 | byte. |
45 | |
46 | Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: |
47 | |
48 | case ECX CFlag ZFlag SFlag |
49 | 1 X 1 0/1 0 |
50 | 2 16 0 0 0 |
51 | |
52 | We exit from the loop for case 1. */ |
53 | |
/* Fallback implementation.  STRSPN below calls it when the accept set
   does not fit in 16 bytes.  NOTE(review): attribute_hidden is a macro
   defined outside this file -- presumably it gives the symbol hidden
   visibility; confirm against its definition.  */
54 | extern size_t __strspn_generic (const char *, const char *) attribute_hidden; |
55 | |
/* STRSPN is the name of the function defined below.  It defaults to
   __strspn_sse42 unless it was already defined before this point.  */
56 | #ifndef STRSPN |
57 | # define STRSPN __strspn_sse42 |
58 | #endif |
59 | |
60 | size_t |
61 | __attribute__ ((section (".text.sse4.2" ))) |
62 | STRSPN (const char *s, const char *a) |
63 | { |
64 | if (*a == 0) |
65 | return 0; |
66 | |
67 | const char *aligned; |
68 | __m128i mask, maskz, zero; |
69 | unsigned int maskz_bits; |
70 | unsigned int offset = (int) ((size_t) a & 15); |
71 | zero = _mm_set1_epi8 (0); |
72 | if (offset != 0) |
73 | { |
74 | /* Load masks. */ |
75 | aligned = (const char *) ((size_t) a & -16L); |
76 | __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); |
77 | maskz = _mm_cmpeq_epi8 (mask0, zero); |
78 | |
79 | /* Find where the NULL terminator is. */ |
80 | maskz_bits = _mm_movemask_epi8 (maskz) >> offset; |
81 | if (maskz_bits != 0) |
82 | { |
83 | mask = __m128i_shift_right (mask0, offset); |
84 | offset = (unsigned int) ((size_t) s & 15); |
85 | if (offset) |
86 | goto start_unaligned; |
87 | |
88 | aligned = s; |
89 | goto start_loop; |
90 | } |
91 | } |
92 | |
93 | /* A is aligned. */ |
94 | mask = _mm_loadu_si128 ((__m128i *) a); |
95 | |
96 | /* Find where the NULL terminator is. */ |
97 | maskz = _mm_cmpeq_epi8 (mask, zero); |
98 | maskz_bits = _mm_movemask_epi8 (maskz); |
99 | if (maskz_bits == 0) |
100 | { |
101 | /* There is no NULL terminator. Don't use SSE4.2 if the length |
102 | of A > 16. */ |
103 | if (a[16] != 0) |
104 | return __strspn_generic (s, a); |
105 | } |
106 | aligned = s; |
107 | offset = (unsigned int) ((size_t) s & 15); |
108 | |
109 | if (offset != 0) |
110 | { |
111 | start_unaligned: |
112 | /* Check partial string. */ |
113 | aligned = (const char *) ((size_t) s & -16L); |
114 | __m128i value = _mm_load_si128 ((__m128i *) aligned); |
115 | __m128i adj_value = __m128i_shift_right (value, offset); |
116 | |
117 | unsigned int length = _mm_cmpistri (mask, adj_value, 0x12); |
118 | /* No need to check CFlag since it is always 1. */ |
119 | if (length < 16 - offset) |
120 | return length; |
121 | /* Find where the NULL terminator is. */ |
122 | maskz = _mm_cmpeq_epi8 (value, zero); |
123 | maskz_bits = _mm_movemask_epi8 (maskz) >> offset; |
124 | if (maskz_bits != 0) |
125 | return length; |
126 | aligned += 16; |
127 | } |
128 | |
129 | start_loop: |
130 | while (1) |
131 | { |
132 | __m128i value = _mm_load_si128 ((__m128i *) aligned); |
133 | unsigned int index = _mm_cmpistri (mask, value, 0x12); |
134 | unsigned int cflag = _mm_cmpistrc (mask, value, 0x12); |
135 | if (cflag) |
136 | return (size_t) (aligned + index - s); |
137 | aligned += 16; |
138 | } |
139 | } |
140 | |