strxfrm_l.c source code [glibc/string/strxfrm_l.c]

1	/ Copyright (C) 1995-2023 Free Software Foundation, Inc.*
2	This file is part of the GNU C Library.
3
4	The GNU C Library is free software; you can redistribute it and/or
5	modify it under the terms of the GNU Lesser General Public
6	License as published by the Free Software Foundation; either
7	version 2.1 of the License, or (at your option) any later version.
8
9	The GNU C Library is distributed in the hope that it will be useful,
10	but WITHOUT ANY WARRANTY; without even the implied warranty of
11	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12	Lesser General Public License for more details.
13
14	You should have received a copy of the GNU Lesser General Public
15	License along with the GNU C Library; if not, see
16	<https://www.gnu.org/licenses/>. /*
17
18	#include <assert.h>
19	#include <langinfo.h>
20	#include <locale.h>
21	#include <stddef.h>
22	#include <stdint.h>
23	#include <stdlib.h>
24	#include <string.h>
25	#include <sys/param.h>
26
27	#ifndef STRING_TYPE
28	# define STRING_TYPE char
29	# define USTRING_TYPE unsigned char
30	# define STRXFRM __strxfrm_l
31	# define STRLEN strlen
32	# define STPNCPY __stpncpy
33	# define WEIGHT_H "../locale/weight.h"
34	# define SUFFIX MB
35	# define L(arg) arg
36	#endif
37
38	#define CONCAT(a,b) CONCAT1(a,b)
39	#define CONCAT1(a,b) a##b
40
41	/ Maximum string size that is calculated with cached indices. Right now this*
42	is an arbitrary value open to optimizations. SMALL_STR_SIZE 4 has to be*
43	lower than __MAX_ALLOCA_CUTOFF. Keep localedata/xfrm-test.c in sync. /*
44	#define SMALL_STR_SIZE 4095
45
46	#include "../locale/localeinfo.h"
47	#include WEIGHT_H
48
49	/ Group locale data for shorter parameter lists. /
50	typedef struct
51	{
52	uint32_t nrules;
53	unsigned char *rulesets;
54	USTRING_TYPE *weights;
55	int32_t *table;
56	USTRING_TYPE *extra;
57	int32_t *indirect;
58	} locale_data_t;
59
60	#ifndef WIDE_CHAR_VERSION
61
62	/ We need UTF-8 encoding of numbers. /
63	static int
64	utf8_encode (char buf, int* val)
65	{
66	int retval;
67
68	if (val < `0x80`)
69	{
70	buf++ = (char*) val;
71	retval = `1`;
72	}
73	else
74	{
75	int step;
76
77	for (step = `2`; step < `6`; ++step)
78	if ((val & (~(uint32_t)`0` << (`5` * step + `1`))) == `0`)
79	break;
80	retval = step;
81
82	buf = (unsigned* char) (~`0xff` >> step);
83	--step;
84	do
85	{
86	buf[step] = `0x80` \| (val & `0x3f`);
87	val >>= `6`;
88	}
89	while (--step > `0`);
90	*buf \|= val;
91	}
92
93	return retval;
94	}
95	#endif
96
97	/ Find next weight and rule index. Inlined since called for every char. /
98	static __always_inline size_t
99	find_idx (const USTRING_TYPE *us, int32_t weight_idx,
100	unsigned char rule_idx, const* locale_data_t l_data, const* int pass)
101	{
102	int32_t tmp = findidx (l_data->table, l_data->indirect, l_data->extra, us,
103	-`1`);
104	*rule_idx = tmp >> `24`;
105	int32_t idx = tmp & `0xffffff`;
106	size_t len = l_data->weights[idx++];
107
108	/ Skip over indices of previous levels. /
109	for (int i = `0`; i < pass; i++)
110	{
111	idx += len;
112	len = l_data->weights[idx++];
113	}
114
115	*weight_idx = idx;
116	return len;
117	}
118
119	static int
120	find_position (const USTRING_TYPE us, const* locale_data_t *l_data,
121	const int pass)
122	{
123	int32_t weight_idx;
124	unsigned char rule_idx;
125	const USTRING_TYPE *usrc = us;
126
127	find_idx (&usrc, &weight_idx, &rule_idx, l_data, pass);
128	return l_data->rulesets[rule_idx * l_data->nrules + pass] & sort_position;
129	}
130
131	/ Do the transformation. /
132	static size_t
133	do_xfrm (const USTRING_TYPE usrc, STRING_TYPE dest, size_t n,
134	const locale_data_t *l_data)
135	{
136	int32_t weight_idx;
137	unsigned char rule_idx;
138	uint32_t pass;
139	size_t needed = `0`;
140	size_t last_needed;
141
142	/ Now the passes over the weights. /
143	for (pass = `0`; pass < l_data->nrules; ++pass)
144	{
145	size_t backw_len = `0`;
146	last_needed = needed;
147	const USTRING_TYPE *cur = usrc;
148	const USTRING_TYPE *backw_start = NULL;
149
150	/ We assume that if a rule has defined `position' in one section*
151	this is true for all of them. /*
152	int position = find_position (cur, l_data, pass);
153
154	if (position == `0`)
155	{
156	while (*cur != L(`'\0'`))
157	{
158	const USTRING_TYPE *pos = cur;
159	size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
160	pass);
161	int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
162
163	if ((rule & sort_forward) != `0`)
164	{
165	/ Handle the pushed backward sequence. /
166	if (backw_start != NULL)
167	{
168	for (size_t i = backw_len; i > `0`; )
169	{
170	int32_t weight_idx;
171	unsigned char rule_idx;
172	size_t len = find_idx (&backw_start, &weight_idx,
173	&rule_idx, l_data, pass);
174	if (needed + i < n)
175	for (size_t j = len; j > `0`; j--)
176	dest[needed + i - j] =
177	l_data->weights[weight_idx++];
178
179	i -= len;
180	}
181
182	needed += backw_len;
183	backw_start = NULL;
184	backw_len = `0`;
185	}
186
187	/ Now handle the forward element. /
188	if (needed + len < n)
189	while (len-- > `0`)
190	dest[needed++] = l_data->weights[weight_idx++];
191	else
192	/ No more characters fit into the buffer. /
193	needed += len;
194	}
195	else
196	{
197	/ Remember start of the backward sequence & track length. /
198	if (backw_start == NULL)
199	backw_start = pos;
200	backw_len += len;
201	}
202	}
203
204
205	/ Handle the pushed backward sequence. /
206	if (backw_start != NULL)
207	{
208	for (size_t i = backw_len; i > `0`; )
209	{
210	size_t len = find_idx (&backw_start, &weight_idx, &rule_idx,
211	l_data, pass);
212	if (needed + i < n)
213	for (size_t j = len; j > `0`; j--)
214	dest[needed + i - j] =
215	l_data->weights[weight_idx++];
216
217	i -= len;
218	}
219
220	needed += backw_len;
221	}
222	}
223	else
224	{
225	int val = `1`;
226	#ifndef WIDE_CHAR_VERSION
227	char buf[`7`];
228	size_t buflen;
229	#endif
230	size_t i;
231
232	while (*cur != L(`'\0'`))
233	{
234	const USTRING_TYPE *pos = cur;
235	size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
236	pass);
237	int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
238
239	if ((rule & sort_forward) != `0`)
240	{
241	/ Handle the pushed backward sequence. /
242	if (backw_start != NULL)
243	{
244	for (size_t p = backw_len; p > `0`; p--)
245	{
246	size_t len;
247	int32_t weight_idx;
248	unsigned char rule_idx;
249	const USTRING_TYPE *backw_cur = backw_start;
250
251	/ To prevent a warning init the used vars. /
252	len = find_idx (&backw_cur, &weight_idx,
253	&rule_idx, l_data, pass);
254
255	for (i = `1`; i < p; i++)
256	len = find_idx (&backw_cur, &weight_idx,
257	&rule_idx, l_data, pass);
258
259	if (len != `0`)
260	{
261	#ifdef WIDE_CHAR_VERSION
262	if (needed + `1` + len < n)
263	{
264	dest[needed] = val;
265	for (i = `0`; i < len; ++i)
266	dest[needed + `1` + i] =
267	l_data->weights[weight_idx + i];
268	}
269	needed += `1` + len;
270	#else
271	buflen = utf8_encode (buf, val);
272	if (needed + buflen + len < n)
273	{
274	for (i = `0`; i < buflen; ++i)
275	dest[needed + i] = buf[i];
276	for (i = `0`; i < len; ++i)
277	dest[needed + buflen + i] =
278	l_data->weights[weight_idx + i];
279	}
280	needed += buflen + len;
281	#endif
282	val = `1`;
283	}
284	else
285	++val;
286	}
287
288	backw_start = NULL;
289	backw_len = `0`;
290	}
291
292	/ Now handle the forward element. /
293	if (len != `0`)
294	{
295	#ifdef WIDE_CHAR_VERSION
296	if (needed + `1` + len < n)
297	{
298	dest[needed] = val;
299	for (i = `0`; i < len; ++i)
300	dest[needed + `1` + i] =
301	l_data->weights[weight_idx + i];
302	}
303	needed += `1` + len;
304	#else
305	buflen = utf8_encode (buf, val);
306	if (needed + buflen + len < n)
307	{
308	for (i = `0`; i < buflen; ++i)
309	dest[needed + i] = buf[i];
310	for (i = `0`; i < len; ++i)
311	dest[needed + buflen + i] =
312	l_data->weights[weight_idx + i];
313	}
314	needed += buflen + len;
315	#endif
316	val = `1`;
317	}
318	else
319	++val;
320	}
321	else
322	{
323	/ Remember start of the backward sequence & track length. /
324	if (backw_start == NULL)
325	backw_start = pos;
326	backw_len++;
327	}
328	}
329
330	/ Handle the pushed backward sequence. /
331	if (backw_start != NULL)
332	{
333	for (size_t p = backw_len; p > `0`; p--)
334	{
335	size_t len;
336	int32_t weight_idx;
337	unsigned char rule_idx;
338	const USTRING_TYPE *backw_cur = backw_start;
339
340	/ To prevent a warning init the used vars. /
341	len = find_idx (&backw_cur, &weight_idx,
342	&rule_idx, l_data, pass);
343
344	for (i = `1`; i < p; i++)
345	len = find_idx (&backw_cur, &weight_idx,
346	&rule_idx, l_data, pass);
347
348	if (len != `0`)
349	{
350	#ifdef WIDE_CHAR_VERSION
351	if (needed + `1` + len < n)
352	{
353	dest[needed] = val;
354	for (i = `0`; i < len; ++i)
355	dest[needed + `1` + i] =
356	l_data->weights[weight_idx + i];
357	}
358	needed += `1` + len;
359	#else
360	buflen = utf8_encode (buf, val);
361	if (needed + buflen + len < n)
362	{
363	for (i = `0`; i < buflen; ++i)
364	dest[needed + i] = buf[i];
365	for (i = `0`; i < len; ++i)
366	dest[needed + buflen + i] =
367	l_data->weights[weight_idx + i];
368	}
369	needed += buflen + len;
370	#endif
371	val = `1`;
372	}
373	else
374	++val;
375	}
376	}
377	}
378
379	/ Finally store the byte to separate the passes or terminate*
380	the string. /*
381	if (needed < n)
382	dest[needed] = pass + `1` < l_data->nrules ? L(`'\1'`) : L(`'\0'`);
383	++needed;
384	}
385
386	/ This is a little optimization: many collation specifications have*
387	a `position' rule at the end and if no non-ignored character
388	is found the last \1 byte is immediately followed by a \0 byte
389	signalling this. We can avoid the \1 byte(s). /*
390	if (needed > `2` && needed == last_needed + `1`)
391	{
392	/ Remove the \1 byte. /
393	if (--needed <= n)
394	dest[needed - `1`] = L(`'\0'`);
395	}
396
397	/ Return the number of bytes/words we need, but don't count the NUL*
398	byte/word at the end. /*
399	return needed - `1`;
400	}
401
402	/ Do the transformation using weight-index and rule cache. /
403	static size_t
404	do_xfrm_cached (STRING_TYPE dest, size_t n, const* locale_data_t *l_data,
405	size_t idxmax, int32_t idxarr, const* unsigned char *rulearr)
406	{
407	uint32_t nrules = l_data->nrules;
408	unsigned char *rulesets = l_data->rulesets;
409	USTRING_TYPE *weights = l_data->weights;
410	uint32_t pass;
411	size_t needed = `0`;
412	size_t last_needed;
413	size_t idxcnt;
414
415	/ Now the passes over the weights. /
416	for (pass = `0`; pass < nrules; ++pass)
417	{
418	size_t backw_stop = ~`0ul`;
419	int rule = rulesets[rulearr[`0`] * nrules + pass];
420	/ We assume that if a rule has defined `position' in one section*
421	this is true for all of them. /*
422	int position = rule & sort_position;
423
424	last_needed = needed;
425	if (position == `0`)
426	{
427	for (idxcnt = `0`; idxcnt < idxmax; ++idxcnt)
428	{
429	if ((rule & sort_forward) != `0`)
430	{
431	size_t len;
432
433	if (backw_stop != ~`0ul`)
434	{
435	/ Handle the pushed elements now. /
436	size_t backw;
437
438	for (backw = idxcnt; backw > backw_stop; )
439	{
440	--backw;
441	len = weights[idxarr[backw]++];
442
443	if (needed + len < n)
444	while (len-- > `0`)
445	dest[needed++] = weights[idxarr[backw]++];
446	else
447	{
448	/ No more characters fit into the buffer. /
449	needed += len;
450	idxarr[backw] += len;
451	}
452	}
453
454	backw_stop = ~`0ul`;
455	}
456
457	/ Now handle the forward element. /
458	len = weights[idxarr[idxcnt]++];
459	if (needed + len < n)
460	while (len-- > `0`)
461	dest[needed++] = weights[idxarr[idxcnt]++];
462	else
463	{
464	/ No more characters fit into the buffer. /
465	needed += len;
466	idxarr[idxcnt] += len;
467	}
468	}
469	else
470	{
471	/ Remember where the backwards series started. /
472	if (backw_stop == ~`0ul`)
473	backw_stop = idxcnt;
474	}
475
476	rule = rulesets[rulearr[idxcnt + `1`] * nrules + pass];
477	}
478
479
480	if (backw_stop != ~`0ul`)
481	{
482	/ Handle the pushed elements now. /
483	size_t backw;
484
485	backw = idxcnt;
486	while (backw > backw_stop)
487	{
488	size_t len = weights[idxarr[--backw]++];
489
490	if (needed + len < n)
491	while (len-- > `0`)
492	dest[needed++] = weights[idxarr[backw]++];
493	else
494	{
495	/ No more characters fit into the buffer. /
496	needed += len;
497	idxarr[backw] += len;
498	}
499	}
500	}
501	}
502	else
503	{
504	int val = `1`;
505	#ifndef WIDE_CHAR_VERSION
506	char buf[`7`];
507	size_t buflen;
508	#endif
509	size_t i;
510
511	for (idxcnt = `0`; idxcnt < idxmax; ++idxcnt)
512	{
513	if ((rule & sort_forward) != `0`)
514	{
515	size_t len;
516
517	if (backw_stop != ~`0ul`)
518	{
519	/ Handle the pushed elements now. /
520	size_t backw;
521
522	for (backw = idxcnt; backw > backw_stop; )
523	{
524	--backw;
525	len = weights[idxarr[backw]++];
526	if (len != `0`)
527	{
528	#ifdef WIDE_CHAR_VERSION
529	if (needed + `1` + len < n)
530	{
531	dest[needed] = val;
532	for (i = `0`; i < len; ++i)
533	dest[needed + `1` + i] =
534	weights[idxarr[backw] + i];
535	}
536	needed += `1` + len;
537	#else
538	buflen = utf8_encode (buf, val);
539	if (needed + buflen + len < n)
540	{
541	for (i = `0`; i < buflen; ++i)
542	dest[needed + i] = buf[i];
543	for (i = `0`; i < len; ++i)
544	dest[needed + buflen + i] =
545	weights[idxarr[backw] + i];
546	}
547	needed += buflen + len;
548	#endif
549	idxarr[backw] += len;
550	val = `1`;
551	}
552	else
553	++val;
554	}
555
556	backw_stop = ~`0ul`;
557	}
558
559	/ Now handle the forward element. /
560	len = weights[idxarr[idxcnt]++];
561	if (len != `0`)
562	{
563	#ifdef WIDE_CHAR_VERSION
564	if (needed + `1`+ len < n)
565	{
566	dest[needed] = val;
567	for (i = `0`; i < len; ++i)
568	dest[needed + `1` + i] =
569	weights[idxarr[idxcnt] + i];
570	}
571	needed += `1` + len;
572	#else
573	buflen = utf8_encode (buf, val);
574	if (needed + buflen + len < n)
575	{
576	for (i = `0`; i < buflen; ++i)
577	dest[needed + i] = buf[i];
578	for (i = `0`; i < len; ++i)
579	dest[needed + buflen + i] =
580	weights[idxarr[idxcnt] + i];
581	}
582	needed += buflen + len;
583	#endif
584	idxarr[idxcnt] += len;
585	val = `1`;
586	}
587	else
588	/ Note that we don't have to increment `idxarr[idxcnt]'*
589	since the length is zero. /*
590	++val;
591	}
592	else
593	{
594	/ Remember where the backwards series started. /
595	if (backw_stop == ~`0ul`)
596	backw_stop = idxcnt;
597	}
598
599	rule = rulesets[rulearr[idxcnt + `1`] * nrules + pass];
600	}
601
602	if (backw_stop != ~`0ul`)
603	{
604	/ Handle the pushed elements now. /
605	size_t backw;
606
607	backw = idxmax - `1`;
608	while (backw > backw_stop)
609	{
610	size_t len = weights[idxarr[--backw]++];
611	if (len != `0`)
612	{
613	#ifdef WIDE_CHAR_VERSION
614	if (needed + `1` + len < n)
615	{
616	dest[needed] = val;
617	for (i = `0`; i < len; ++i)
618	dest[needed + `1` + i] =
619	weights[idxarr[backw] + i];
620	}
621	needed += `1` + len;
622	#else
623	buflen = utf8_encode (buf, val);
624	if (needed + buflen + len < n)
625	{
626	for (i = `0`; i < buflen; ++i)
627	dest[needed + i] = buf[i];
628	for (i = `0`; i < len; ++i)
629	dest[needed + buflen + i] =
630	weights[idxarr[backw] + i];
631	}
632	needed += buflen + len;
633	#endif
634	idxarr[backw] += len;
635	val = `1`;
636	}
637	else
638	++val;
639	}
640	}
641	}
642
643	/ Finally store the byte to separate the passes or terminate*
644	the string. /*
645	if (needed < n)
646	dest[needed] = pass + `1` < nrules ? L(`'\1'`) : L(`'\0'`);
647	++needed;
648	}
649
650	/ This is a little optimization: many collation specifications have*
651	a `position' rule at the end and if no non-ignored character
652	is found the last \1 byte is immediately followed by a \0 byte
653	signalling this. We can avoid the \1 byte(s). /*
654	if (needed > `2` && needed == last_needed + `1`)
655	{
656	/ Remove the \1 byte. /
657	if (--needed <= n)
658	dest[needed - `1`] = L(`'\0'`);
659	}
660
661	/ Return the number of bytes/words we need, but don't count the NUL*
662	byte/word at the end. /*
663	return needed - `1`;
664	}
665
666	size_t
667	STRXFRM (STRING_TYPE dest, const* STRING_TYPE *src, size_t n, locale_t l)
668	{
669	locale_data_t l_data;
670	struct __locale_data *current = l->__locales[LC_COLLATE];
671	l_data.nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
672
673	/ Handle byte comparison case. /
674	if (l_data.nrules == `0`)
675	{
676	size_t srclen = STRLEN (src);
677
678	if (n != `0`)
679	STPNCPY (dest, src, MIN (srclen + `1`, n));
680
681	return srclen;
682	}
683
684	/ Handle an empty string, code hereafter relies on strlen (src) > 0. /
685	if (*src == L(`'\0'`))
686	{
687	if (n != `0`)
688	*dest = L(`'\0'`);
689	return `0`;
690	}
691
692	/ Get the locale data. /
693	l_data.rulesets = (unsigned char *)
694	current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
695	l_data.table = (int32_t *)
696	current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
697	l_data.weights = (USTRING_TYPE *)
698	current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
699	l_data.extra = (USTRING_TYPE *)
700	current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
701	l_data.indirect = (int32_t *)
702	current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
703
704	assert (((uintptr_t) l_data.table) % __alignof__ (l_data.table[`0`]) == `0`);
705	assert (((uintptr_t) l_data.weights) % __alignof__ (l_data.weights[`0`]) == `0`);
706	assert (((uintptr_t) l_data.extra) % __alignof__ (l_data.extra[`0`]) == `0`);
707	assert (((uintptr_t) l_data.indirect) % __alignof__ (l_data.indirect[`0`]) == `0`);
708
709	/ We need the elements of the string as unsigned values since they*
710	are used as indices. /*
711	const USTRING_TYPE usrc = (const* USTRING_TYPE *) src;
712
713	/ Allocate cache for small strings on the stack and fill it with weight and*
714	rule indices. If the cache size is not sufficient, continue with the
715	uncached xfrm version. /*
716	size_t idxmax = `0`;
717	const USTRING_TYPE *cur = usrc;
718	int32_t idxarr = alloca (SMALL_STR_SIZE sizeof (int32_t));
719	unsigned char *rulearr = alloca (SMALL_STR_SIZE + `1`);
720
721	do
722	{
723	int32_t tmp = findidx (l_data.table, l_data.indirect, l_data.extra, &cur,
724	-`1`);
725	rulearr[idxmax] = tmp >> `24`;
726	idxarr[idxmax] = tmp & `0xffffff`;
727
728	++idxmax;
729	}
730	while (*cur != L(`'\0'`) && idxmax < SMALL_STR_SIZE);
731
732	/ This element is only read, the value never used but to determine*
733	another value which then is ignored. /*
734	rulearr[idxmax] = `'\0'`;
735
736	/ Do the transformation. /
737	if (*cur == L(`'\0'`))
738	return do_xfrm_cached (dest, n, &l_data, idxmax, idxarr, rulearr);
739	else
740	return do_xfrm (usrc, dest, n, &l_data);
741	}
742	libc_hidden_def (STRXFRM)
743
744	#ifndef WIDE_CHAR_VERSION
745	weak_alias (__strxfrm_l, strxfrm_l)
746	#endif
747

Browse the source code of glibc/string/strxfrm_l.c