1/* strspn with SSE4.2 intrinsics
2 Copyright (C) 2009-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <nmmintrin.h>
20#include <string.h>
21#include "varshift.h"
22
23/* We use 0x12:
24 _SIDD_SBYTE_OPS
25 | _SIDD_CMP_EQUAL_ANY
26 | _SIDD_NEGATIVE_POLARITY
27 | _SIDD_LEAST_SIGNIFICANT
28 on pcmpistri to compare xmm/mem128
29
30 0 1 2 3 4 5 6 7 8 9 A B C D E F
31 X X X X X X X X X X X X X X X X
32
33 against xmm
34
35 0 1 2 3 4 5 6 7 8 9 A B C D E F
36 A A A A A A A A A A A A A A A A
37
38 to find out if the first 16byte data element has any non-A byte and
39 the offset of the first byte. There are 2 cases:
40
41 1. The first 16byte data element has the non-A byte, including
42 EOS, at the offset X.
43 2. The first 16byte data element is valid and doesn't have the non-A
44 byte.
45
46 Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
47
48 case ECX CFlag ZFlag SFlag
49 1 X 1 0/1 0
50 2 16 0 0 0
51
52 We exit from the loop for case 1. */
53
54extern size_t __strspn_sse2 (const char *, const char *) attribute_hidden;
55
56
57size_t
58__attribute__ ((section (".text.sse4.2")))
59__strspn_sse42 (const char *s, const char *a)
60{
61 if (*a == 0)
62 return 0;
63
64 const char *aligned;
65 __m128i mask;
66 int offset = (int) ((size_t) a & 15);
67 if (offset != 0)
68 {
69 /* Load masks. */
70 aligned = (const char *) ((size_t) a & -16L);
71 __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
72
73 mask = __m128i_shift_right (mask0, offset);
74
75 /* Find where the NULL terminator is. */
76 int length = _mm_cmpistri (mask, mask, 0x3a);
77 if (length == 16 - offset)
78 {
79 /* There is no NULL terminator. */
80 __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
81 int index = _mm_cmpistri (mask1, mask1, 0x3a);
82 length += index;
83
84 /* Don't use SSE4.2 if the length of A > 16. */
85 if (length > 16)
86 return __strspn_sse2 (s, a);
87
88 if (index != 0)
89 {
90 /* Combine mask0 and mask1. We could play games with
91 palignr, but frankly this data should be in L1 now
92 so do the merge via an unaligned load. */
93 mask = _mm_loadu_si128 ((__m128i *) a);
94 }
95 }
96 }
97 else
98 {
99 /* A is aligned. */
100 mask = _mm_load_si128 ((__m128i *) a);
101
102 /* Find where the NULL terminator is. */
103 int length = _mm_cmpistri (mask, mask, 0x3a);
104 if (length == 16)
105 {
106 /* There is no NULL terminator. Don't use SSE4.2 if the length
107 of A > 16. */
108 if (a[16] != 0)
109 return __strspn_sse2 (s, a);
110 }
111 }
112
113 offset = (int) ((size_t) s & 15);
114 if (offset != 0)
115 {
116 /* Check partial string. */
117 aligned = (const char *) ((size_t) s & -16L);
118 __m128i value = _mm_load_si128 ((__m128i *) aligned);
119
120 value = __m128i_shift_right (value, offset);
121
122 int length = _mm_cmpistri (mask, value, 0x12);
123 /* No need to check CFlag since it is always 1. */
124 if (length < 16 - offset)
125 return length;
126 /* Find where the NULL terminator is. */
127 int index = _mm_cmpistri (value, value, 0x3a);
128 if (index < 16 - offset)
129 return length;
130 aligned += 16;
131 }
132 else
133 aligned = s;
134
135 while (1)
136 {
137 __m128i value = _mm_load_si128 ((__m128i *) aligned);
138 int index = _mm_cmpistri (mask, value, 0x12);
139 int cflag = _mm_cmpistrc (mask, value, 0x12);
140 if (cflag)
141 return (size_t) (aligned + index - s);
142 aligned += 16;
143 }
144}
145