1 | /* strspn with SSE4.2 intrinsics |
2 | Copyright (C) 2009-2021 Free Software Foundation, Inc. |
3 | Contributed by Intel Corporation. |
4 | This file is part of the GNU C Library. |
5 | |
6 | The GNU C Library is free software; you can redistribute it and/or |
7 | modify it under the terms of the GNU Lesser General Public |
8 | License as published by the Free Software Foundation; either |
9 | version 2.1 of the License, or (at your option) any later version. |
10 | |
11 | The GNU C Library is distributed in the hope that it will be useful, |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | Lesser General Public License for more details. |
15 | |
16 | You should have received a copy of the GNU Lesser General Public |
17 | License along with the GNU C Library; if not, see |
18 | <https://www.gnu.org/licenses/>. */ |
19 | |
20 | #include <nmmintrin.h> |
21 | #include <string.h> |
22 | #include "varshift.h" |
23 | |
24 | /* We use 0x12: |
25 | _SIDD_SBYTE_OPS |
26 | | _SIDD_CMP_EQUAL_ANY |
27 | | _SIDD_NEGATIVE_POLARITY |
28 | | _SIDD_LEAST_SIGNIFICANT |
29 | on pcmpistri to compare xmm/mem128 |
30 | |
31 | 0 1 2 3 4 5 6 7 8 9 A B C D E F |
32 | X X X X X X X X X X X X X X X X |
33 | |
34 | against xmm |
35 | |
36 | 0 1 2 3 4 5 6 7 8 9 A B C D E F |
37 | A A A A A A A A A A A A A A A A |
38 | |
39 | to find out if the first 16byte data element has any non-A byte and |
40 | the offset of the first byte. There are 2 cases: |
41 | |
42 | 1. The first 16byte data element has the non-A byte, including |
43 | EOS, at the offset X. |
44 | 2. The first 16byte data element is valid and doesn't have the non-A |
45 | byte. |
46 | |
47 | Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: |
48 | |
49 | case ECX CFlag ZFlag SFlag |
50 | 1 X 1 0/1 0 |
51 | 2 16 0 0 0 |
52 | |
53 | We exit from the loop for case 1. */ |
54 | |
55 | extern size_t __strspn_sse2 (const char *, const char *) attribute_hidden; |
56 | |
57 | |
58 | size_t |
59 | __attribute__ ((section (".text.sse4.2" ))) |
60 | __strspn_sse42 (const char *s, const char *a) |
61 | { |
62 | if (*a == 0) |
63 | return 0; |
64 | |
65 | const char *aligned; |
66 | __m128i mask; |
67 | int offset = (int) ((size_t) a & 15); |
68 | if (offset != 0) |
69 | { |
70 | /* Load masks. */ |
71 | aligned = (const char *) ((size_t) a & -16L); |
72 | __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); |
73 | |
74 | mask = __m128i_shift_right (mask0, offset); |
75 | |
76 | /* Find where the NULL terminator is. */ |
77 | int length = _mm_cmpistri (mask, mask, 0x3a); |
78 | if (length == 16 - offset) |
79 | { |
80 | /* There is no NULL terminator. */ |
81 | __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); |
82 | int index = _mm_cmpistri (mask1, mask1, 0x3a); |
83 | length += index; |
84 | |
85 | /* Don't use SSE4.2 if the length of A > 16. */ |
86 | if (length > 16) |
87 | return __strspn_sse2 (s, a); |
88 | |
89 | if (index != 0) |
90 | { |
91 | /* Combine mask0 and mask1. We could play games with |
92 | palignr, but frankly this data should be in L1 now |
93 | so do the merge via an unaligned load. */ |
94 | mask = _mm_loadu_si128 ((__m128i *) a); |
95 | } |
96 | } |
97 | } |
98 | else |
99 | { |
100 | /* A is aligned. */ |
101 | mask = _mm_load_si128 ((__m128i *) a); |
102 | |
103 | /* Find where the NULL terminator is. */ |
104 | int length = _mm_cmpistri (mask, mask, 0x3a); |
105 | if (length == 16) |
106 | { |
107 | /* There is no NULL terminator. Don't use SSE4.2 if the length |
108 | of A > 16. */ |
109 | if (a[16] != 0) |
110 | return __strspn_sse2 (s, a); |
111 | } |
112 | } |
113 | |
114 | offset = (int) ((size_t) s & 15); |
115 | if (offset != 0) |
116 | { |
117 | /* Check partial string. */ |
118 | aligned = (const char *) ((size_t) s & -16L); |
119 | __m128i value = _mm_load_si128 ((__m128i *) aligned); |
120 | |
121 | value = __m128i_shift_right (value, offset); |
122 | |
123 | int length = _mm_cmpistri (mask, value, 0x12); |
124 | /* No need to check CFlag since it is always 1. */ |
125 | if (length < 16 - offset) |
126 | return length; |
127 | /* Find where the NULL terminator is. */ |
128 | int index = _mm_cmpistri (value, value, 0x3a); |
129 | if (index < 16 - offset) |
130 | return length; |
131 | aligned += 16; |
132 | } |
133 | else |
134 | aligned = s; |
135 | |
136 | while (1) |
137 | { |
138 | __m128i value = _mm_load_si128 ((__m128i *) aligned); |
139 | int index = _mm_cmpistri (mask, value, 0x12); |
140 | int cflag = _mm_cmpistrc (mask, value, 0x12); |
141 | if (cflag) |
142 | return (size_t) (aligned + index - s); |
143 | aligned += 16; |
144 | } |
145 | } |
146 | |