strxfrm_l.c source code [glibc/string/strxfrm_l.c]

1	/ Copyright (C) 1995-2021 Free Software Foundation, Inc.*
2	This file is part of the GNU C Library.
3	Written by Ulrich Drepper <drepper@gnu.org>, 1995.
4
5	The GNU C Library is free software; you can redistribute it and/or
6	modify it under the terms of the GNU Lesser General Public
7	License as published by the Free Software Foundation; either
8	version 2.1 of the License, or (at your option) any later version.
9
10	The GNU C Library is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	Lesser General Public License for more details.
14
15	You should have received a copy of the GNU Lesser General Public
16	License along with the GNU C Library; if not, see
17	<https://www.gnu.org/licenses/>. /*
18
19	#include <assert.h>
20	#include <langinfo.h>
21	#include <locale.h>
22	#include <stddef.h>
23	#include <stdint.h>
24	#include <stdlib.h>
25	#include <string.h>
26	#include <sys/param.h>
27
28	#ifndef STRING_TYPE
29	# define STRING_TYPE char
30	# define USTRING_TYPE unsigned char
31	# define STRXFRM __strxfrm_l
32	# define STRLEN strlen
33	# define STPNCPY __stpncpy
34	# define WEIGHT_H "../locale/weight.h"
35	# define SUFFIX MB
36	# define L(arg) arg
37	#endif
38
39	#define CONCAT(a,b) CONCAT1(a,b)
40	#define CONCAT1(a,b) a##b
41
42	/ Maximum string size that is calculated with cached indices. Right now this*
43	is an arbitrary value open to optimizations. SMALL_STR_SIZE 4 has to be*
44	lower than __MAX_ALLOCA_CUTOFF. Keep localedata/xfrm-test.c in sync. /*
45	#define SMALL_STR_SIZE 4095
46
47	#include "../locale/localeinfo.h"
48	#include WEIGHT_H
49
50	/ Group locale data for shorter parameter lists. /
51	typedef struct
52	{
53	uint_fast32_t nrules;
54	unsigned char *rulesets;
55	USTRING_TYPE *weights;
56	int32_t *table;
57	USTRING_TYPE *extra;
58	int32_t *indirect;
59	} locale_data_t;
60
61	#ifndef WIDE_CHAR_VERSION
62
63	/ We need UTF-8 encoding of numbers. /
64	static int
65	utf8_encode (char buf, int* val)
66	{
67	int retval;
68
69	if (val < `0x80`)
70	{
71	buf++ = (char*) val;
72	retval = `1`;
73	}
74	else
75	{
76	int step;
77
78	for (step = `2`; step < `6`; ++step)
79	if ((val & (~(uint32_t)`0` << (`5` * step + `1`))) == `0`)
80	break;
81	retval = step;
82
83	buf = (unsigned* char) (~`0xff` >> step);
84	--step;
85	do
86	{
87	buf[step] = `0x80` \| (val & `0x3f`);
88	val >>= `6`;
89	}
90	while (--step > `0`);
91	*buf \|= val;
92	}
93
94	return retval;
95	}
96	#endif
97
98	/ Find next weight and rule index. Inlined since called for every char. /
99	static __always_inline size_t
100	find_idx (const USTRING_TYPE *us, int32_t weight_idx,
101	unsigned char rule_idx, const* locale_data_t l_data, const* int pass)
102	{
103	int32_t tmp = findidx (l_data->table, l_data->indirect, l_data->extra, us,
104	-`1`);
105	*rule_idx = tmp >> `24`;
106	int32_t idx = tmp & `0xffffff`;
107	size_t len = l_data->weights[idx++];
108
109	/ Skip over indices of previous levels. /
110	for (int i = `0`; i < pass; i++)
111	{
112	idx += len;
113	len = l_data->weights[idx++];
114	}
115
116	*weight_idx = idx;
117	return len;
118	}
119
120	static int
121	find_position (const USTRING_TYPE us, const* locale_data_t *l_data,
122	const int pass)
123	{
124	int32_t weight_idx;
125	unsigned char rule_idx;
126	const USTRING_TYPE *usrc = us;
127
128	find_idx (&usrc, &weight_idx, &rule_idx, l_data, pass);
129	return l_data->rulesets[rule_idx * l_data->nrules + pass] & sort_position;
130	}
131
132	/ Do the transformation. /
133	static size_t
134	do_xfrm (const USTRING_TYPE usrc, STRING_TYPE dest, size_t n,
135	const locale_data_t *l_data)
136	{
137	int32_t weight_idx;
138	unsigned char rule_idx;
139	uint_fast32_t pass;
140	size_t needed = `0`;
141	size_t last_needed;
142
143	/ Now the passes over the weights. /
144	for (pass = `0`; pass < l_data->nrules; ++pass)
145	{
146	size_t backw_len = `0`;
147	last_needed = needed;
148	const USTRING_TYPE *cur = usrc;
149	const USTRING_TYPE *backw_start = NULL;
150
151	/ We assume that if a rule has defined `position' in one section*
152	this is true for all of them. /*
153	int position = find_position (cur, l_data, pass);
154
155	if (position == `0`)
156	{
157	while (*cur != L(`'\0'`))
158	{
159	const USTRING_TYPE *pos = cur;
160	size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
161	pass);
162	int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
163
164	if ((rule & sort_forward) != `0`)
165	{
166	/ Handle the pushed backward sequence. /
167	if (backw_start != NULL)
168	{
169	for (size_t i = backw_len; i > `0`; )
170	{
171	int32_t weight_idx;
172	unsigned char rule_idx;
173	size_t len = find_idx (&backw_start, &weight_idx,
174	&rule_idx, l_data, pass);
175	if (needed + i < n)
176	for (size_t j = len; j > `0`; j--)
177	dest[needed + i - j] =
178	l_data->weights[weight_idx++];
179
180	i -= len;
181	}
182
183	needed += backw_len;
184	backw_start = NULL;
185	backw_len = `0`;
186	}
187
188	/ Now handle the forward element. /
189	if (needed + len < n)
190	while (len-- > `0`)
191	dest[needed++] = l_data->weights[weight_idx++];
192	else
193	/ No more characters fit into the buffer. /
194	needed += len;
195	}
196	else
197	{
198	/ Remember start of the backward sequence & track length. /
199	if (backw_start == NULL)
200	backw_start = pos;
201	backw_len += len;
202	}
203	}
204
205
206	/ Handle the pushed backward sequence. /
207	if (backw_start != NULL)
208	{
209	for (size_t i = backw_len; i > `0`; )
210	{
211	size_t len = find_idx (&backw_start, &weight_idx, &rule_idx,
212	l_data, pass);
213	if (needed + i < n)
214	for (size_t j = len; j > `0`; j--)
215	dest[needed + i - j] =
216	l_data->weights[weight_idx++];
217
218	i -= len;
219	}
220
221	needed += backw_len;
222	}
223	}
224	else
225	{
226	int val = `1`;
227	#ifndef WIDE_CHAR_VERSION
228	char buf[`7`];
229	size_t buflen;
230	#endif
231	size_t i;
232
233	while (*cur != L(`'\0'`))
234	{
235	const USTRING_TYPE *pos = cur;
236	size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
237	pass);
238	int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
239
240	if ((rule & sort_forward) != `0`)
241	{
242	/ Handle the pushed backward sequence. /
243	if (backw_start != NULL)
244	{
245	for (size_t p = backw_len; p > `0`; p--)
246	{
247	size_t len;
248	int32_t weight_idx;
249	unsigned char rule_idx;
250	const USTRING_TYPE *backw_cur = backw_start;
251
252	/ To prevent a warning init the used vars. /
253	len = find_idx (&backw_cur, &weight_idx,
254	&rule_idx, l_data, pass);
255
256	for (i = `1`; i < p; i++)
257	len = find_idx (&backw_cur, &weight_idx,
258	&rule_idx, l_data, pass);
259
260	if (len != `0`)
261	{
262	#ifdef WIDE_CHAR_VERSION
263	if (needed + `1` + len < n)
264	{
265	dest[needed] = val;
266	for (i = `0`; i < len; ++i)
267	dest[needed + `1` + i] =
268	l_data->weights[weight_idx + i];
269	}
270	needed += `1` + len;
271	#else
272	buflen = utf8_encode (buf, val);
273	if (needed + buflen + len < n)
274	{
275	for (i = `0`; i < buflen; ++i)
276	dest[needed + i] = buf[i];
277	for (i = `0`; i < len; ++i)
278	dest[needed + buflen + i] =
279	l_data->weights[weight_idx + i];
280	}
281	needed += buflen + len;
282	#endif
283	val = `1`;
284	}
285	else
286	++val;
287	}
288
289	backw_start = NULL;
290	backw_len = `0`;
291	}
292
293	/ Now handle the forward element. /
294	if (len != `0`)
295	{
296	#ifdef WIDE_CHAR_VERSION
297	if (needed + `1` + len < n)
298	{
299	dest[needed] = val;
300	for (i = `0`; i < len; ++i)
301	dest[needed + `1` + i] =
302	l_data->weights[weight_idx + i];
303	}
304	needed += `1` + len;
305	#else
306	buflen = utf8_encode (buf, val);
307	if (needed + buflen + len < n)
308	{
309	for (i = `0`; i < buflen; ++i)
310	dest[needed + i] = buf[i];
311	for (i = `0`; i < len; ++i)
312	dest[needed + buflen + i] =
313	l_data->weights[weight_idx + i];
314	}
315	needed += buflen + len;
316	#endif
317	val = `1`;
318	}
319	else
320	++val;
321	}
322	else
323	{
324	/ Remember start of the backward sequence & track length. /
325	if (backw_start == NULL)
326	backw_start = pos;
327	backw_len++;
328	}
329	}
330
331	/ Handle the pushed backward sequence. /
332	if (backw_start != NULL)
333	{
334	for (size_t p = backw_len; p > `0`; p--)
335	{
336	size_t len;
337	int32_t weight_idx;
338	unsigned char rule_idx;
339	const USTRING_TYPE *backw_cur = backw_start;
340
341	/ To prevent a warning init the used vars. /
342	len = find_idx (&backw_cur, &weight_idx,
343	&rule_idx, l_data, pass);
344
345	for (i = `1`; i < p; i++)
346	len = find_idx (&backw_cur, &weight_idx,
347	&rule_idx, l_data, pass);
348
349	if (len != `0`)
350	{
351	#ifdef WIDE_CHAR_VERSION
352	if (needed + `1` + len < n)
353	{
354	dest[needed] = val;
355	for (i = `0`; i < len; ++i)
356	dest[needed + `1` + i] =
357	l_data->weights[weight_idx + i];
358	}
359	needed += `1` + len;
360	#else
361	buflen = utf8_encode (buf, val);
362	if (needed + buflen + len < n)
363	{
364	for (i = `0`; i < buflen; ++i)
365	dest[needed + i] = buf[i];
366	for (i = `0`; i < len; ++i)
367	dest[needed + buflen + i] =
368	l_data->weights[weight_idx + i];
369	}
370	needed += buflen + len;
371	#endif
372	val = `1`;
373	}
374	else
375	++val;
376	}
377	}
378	}
379
380	/ Finally store the byte to separate the passes or terminate*
381	the string. /*
382	if (needed < n)
383	dest[needed] = pass + `1` < l_data->nrules ? L(`'\1'`) : L(`'\0'`);
384	++needed;
385	}
386
387	/ This is a little optimization: many collation specifications have*
388	a `position' rule at the end and if no non-ignored character
389	is found the last \1 byte is immediately followed by a \0 byte
390	signalling this. We can avoid the \1 byte(s). /*
391	if (needed > `2` && needed == last_needed + `1`)
392	{
393	/ Remove the \1 byte. /
394	if (--needed <= n)
395	dest[needed - `1`] = L(`'\0'`);
396	}
397
398	/ Return the number of bytes/words we need, but don't count the NUL*
399	byte/word at the end. /*
400	return needed - `1`;
401	}
402
403	/ Do the transformation using weight-index and rule cache. /
404	static size_t
405	do_xfrm_cached (STRING_TYPE dest, size_t n, const* locale_data_t *l_data,
406	size_t idxmax, int32_t idxarr, const* unsigned char *rulearr)
407	{
408	uint_fast32_t nrules = l_data->nrules;
409	unsigned char *rulesets = l_data->rulesets;
410	USTRING_TYPE *weights = l_data->weights;
411	uint_fast32_t pass;
412	size_t needed = `0`;
413	size_t last_needed;
414	size_t idxcnt;
415
416	/ Now the passes over the weights. /
417	for (pass = `0`; pass < nrules; ++pass)
418	{
419	size_t backw_stop = ~`0ul`;
420	int rule = rulesets[rulearr[`0`] * nrules + pass];
421	/ We assume that if a rule has defined `position' in one section*
422	this is true for all of them. /*
423	int position = rule & sort_position;
424
425	last_needed = needed;
426	if (position == `0`)
427	{
428	for (idxcnt = `0`; idxcnt < idxmax; ++idxcnt)
429	{
430	if ((rule & sort_forward) != `0`)
431	{
432	size_t len;
433
434	if (backw_stop != ~`0ul`)
435	{
436	/ Handle the pushed elements now. /
437	size_t backw;
438
439	for (backw = idxcnt; backw > backw_stop; )
440	{
441	--backw;
442	len = weights[idxarr[backw]++];
443
444	if (needed + len < n)
445	while (len-- > `0`)
446	dest[needed++] = weights[idxarr[backw]++];
447	else
448	{
449	/ No more characters fit into the buffer. /
450	needed += len;
451	idxarr[backw] += len;
452	}
453	}
454
455	backw_stop = ~`0ul`;
456	}
457
458	/ Now handle the forward element. /
459	len = weights[idxarr[idxcnt]++];
460	if (needed + len < n)
461	while (len-- > `0`)
462	dest[needed++] = weights[idxarr[idxcnt]++];
463	else
464	{
465	/ No more characters fit into the buffer. /
466	needed += len;
467	idxarr[idxcnt] += len;
468	}
469	}
470	else
471	{
472	/ Remember where the backwards series started. /
473	if (backw_stop == ~`0ul`)
474	backw_stop = idxcnt;
475	}
476
477	rule = rulesets[rulearr[idxcnt + `1`] * nrules + pass];
478	}
479
480
481	if (backw_stop != ~`0ul`)
482	{
483	/ Handle the pushed elements now. /
484	size_t backw;
485
486	backw = idxcnt;
487	while (backw > backw_stop)
488	{
489	size_t len = weights[idxarr[--backw]++];
490
491	if (needed + len < n)
492	while (len-- > `0`)
493	dest[needed++] = weights[idxarr[backw]++];
494	else
495	{
496	/ No more characters fit into the buffer. /
497	needed += len;
498	idxarr[backw] += len;
499	}
500	}
501	}
502	}
503	else
504	{
505	int val = `1`;
506	#ifndef WIDE_CHAR_VERSION
507	char buf[`7`];
508	size_t buflen;
509	#endif
510	size_t i;
511
512	for (idxcnt = `0`; idxcnt < idxmax; ++idxcnt)
513	{
514	if ((rule & sort_forward) != `0`)
515	{
516	size_t len;
517
518	if (backw_stop != ~`0ul`)
519	{
520	/ Handle the pushed elements now. /
521	size_t backw;
522
523	for (backw = idxcnt; backw > backw_stop; )
524	{
525	--backw;
526	len = weights[idxarr[backw]++];
527	if (len != `0`)
528	{
529	#ifdef WIDE_CHAR_VERSION
530	if (needed + `1` + len < n)
531	{
532	dest[needed] = val;
533	for (i = `0`; i < len; ++i)
534	dest[needed + `1` + i] =
535	weights[idxarr[backw] + i];
536	}
537	needed += `1` + len;
538	#else
539	buflen = utf8_encode (buf, val);
540	if (needed + buflen + len < n)
541	{
542	for (i = `0`; i < buflen; ++i)
543	dest[needed + i] = buf[i];
544	for (i = `0`; i < len; ++i)
545	dest[needed + buflen + i] =
546	weights[idxarr[backw] + i];
547	}
548	needed += buflen + len;
549	#endif
550	idxarr[backw] += len;
551	val = `1`;
552	}
553	else
554	++val;
555	}
556
557	backw_stop = ~`0ul`;
558	}
559
560	/ Now handle the forward element. /
561	len = weights[idxarr[idxcnt]++];
562	if (len != `0`)
563	{
564	#ifdef WIDE_CHAR_VERSION
565	if (needed + `1`+ len < n)
566	{
567	dest[needed] = val;
568	for (i = `0`; i < len; ++i)
569	dest[needed + `1` + i] =
570	weights[idxarr[idxcnt] + i];
571	}
572	needed += `1` + len;
573	#else
574	buflen = utf8_encode (buf, val);
575	if (needed + buflen + len < n)
576	{
577	for (i = `0`; i < buflen; ++i)
578	dest[needed + i] = buf[i];
579	for (i = `0`; i < len; ++i)
580	dest[needed + buflen + i] =
581	weights[idxarr[idxcnt] + i];
582	}
583	needed += buflen + len;
584	#endif
585	idxarr[idxcnt] += len;
586	val = `1`;
587	}
588	else
589	/ Note that we don't have to increment `idxarr[idxcnt]'*
590	since the length is zero. /*
591	++val;
592	}
593	else
594	{
595	/ Remember where the backwards series started. /
596	if (backw_stop == ~`0ul`)
597	backw_stop = idxcnt;
598	}
599
600	rule = rulesets[rulearr[idxcnt + `1`] * nrules + pass];
601	}
602
603	if (backw_stop != ~`0ul`)
604	{
605	/ Handle the pushed elements now. /
606	size_t backw;
607
608	backw = idxmax - `1`;
609	while (backw > backw_stop)
610	{
611	size_t len = weights[idxarr[--backw]++];
612	if (len != `0`)
613	{
614	#ifdef WIDE_CHAR_VERSION
615	if (needed + `1` + len < n)
616	{
617	dest[needed] = val;
618	for (i = `0`; i < len; ++i)
619	dest[needed + `1` + i] =
620	weights[idxarr[backw] + i];
621	}
622	needed += `1` + len;
623	#else
624	buflen = utf8_encode (buf, val);
625	if (needed + buflen + len < n)
626	{
627	for (i = `0`; i < buflen; ++i)
628	dest[needed + i] = buf[i];
629	for (i = `0`; i < len; ++i)
630	dest[needed + buflen + i] =
631	weights[idxarr[backw] + i];
632	}
633	needed += buflen + len;
634	#endif
635	idxarr[backw] += len;
636	val = `1`;
637	}
638	else
639	++val;
640	}
641	}
642	}
643
644	/ Finally store the byte to separate the passes or terminate*
645	the string. /*
646	if (needed < n)
647	dest[needed] = pass + `1` < nrules ? L(`'\1'`) : L(`'\0'`);
648	++needed;
649	}
650
651	/ This is a little optimization: many collation specifications have*
652	a `position' rule at the end and if no non-ignored character
653	is found the last \1 byte is immediately followed by a \0 byte
654	signalling this. We can avoid the \1 byte(s). /*
655	if (needed > `2` && needed == last_needed + `1`)
656	{
657	/ Remove the \1 byte. /
658	if (--needed <= n)
659	dest[needed - `1`] = L(`'\0'`);
660	}
661
662	/ Return the number of bytes/words we need, but don't count the NUL*
663	byte/word at the end. /*
664	return needed - `1`;
665	}
666
667	size_t
668	STRXFRM (STRING_TYPE dest, const* STRING_TYPE *src, size_t n, locale_t l)
669	{
670	locale_data_t l_data;
671	struct __locale_data *current = l->__locales[LC_COLLATE];
672	l_data.nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
673
674	/ Handle byte comparison case. /
675	if (l_data.nrules == `0`)
676	{
677	size_t srclen = STRLEN (src);
678
679	if (n != `0`)
680	STPNCPY (dest, src, MIN (srclen + `1`, n));
681
682	return srclen;
683	}
684
685	/ Handle an empty string, code hereafter relies on strlen (src) > 0. /
686	if (*src == L(`'\0'`))
687	{
688	if (n != `0`)
689	*dest = L(`'\0'`);
690	return `0`;
691	}
692
693	/ Get the locale data. /
694	l_data.rulesets = (unsigned char *)
695	current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
696	l_data.table = (int32_t *)
697	current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
698	l_data.weights = (USTRING_TYPE *)
699	current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
700	l_data.extra = (USTRING_TYPE *)
701	current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
702	l_data.indirect = (int32_t *)
703	current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
704
705	assert (((uintptr_t) l_data.table) % __alignof__ (l_data.table[`0`]) == `0`);
706	assert (((uintptr_t) l_data.weights) % __alignof__ (l_data.weights[`0`]) == `0`);
707	assert (((uintptr_t) l_data.extra) % __alignof__ (l_data.extra[`0`]) == `0`);
708	assert (((uintptr_t) l_data.indirect) % __alignof__ (l_data.indirect[`0`]) == `0`);
709
710	/ We need the elements of the string as unsigned values since they*
711	are used as indices. /*
712	const USTRING_TYPE usrc = (const* USTRING_TYPE *) src;
713
714	/ Allocate cache for small strings on the stack and fill it with weight and*
715	rule indices. If the cache size is not sufficient, continue with the
716	uncached xfrm version. /*
717	size_t idxmax = `0`;
718	const USTRING_TYPE *cur = usrc;
719	int32_t idxarr = alloca (SMALL_STR_SIZE sizeof (int32_t));
720	unsigned char *rulearr = alloca (SMALL_STR_SIZE + `1`);
721
722	do
723	{
724	int32_t tmp = findidx (l_data.table, l_data.indirect, l_data.extra, &cur,
725	-`1`);
726	rulearr[idxmax] = tmp >> `24`;
727	idxarr[idxmax] = tmp & `0xffffff`;
728
729	++idxmax;
730	}
731	while (*cur != L(`'\0'`) && idxmax < SMALL_STR_SIZE);
732
733	/ This element is only read, the value never used but to determine*
734	another value which then is ignored. /*
735	rulearr[idxmax] = `'\0'`;
736
737	/ Do the transformation. /
738	if (*cur == L(`'\0'`))
739	return do_xfrm_cached (dest, n, &l_data, idxmax, idxarr, rulearr);
740	else
741	return do_xfrm (usrc, dest, n, &l_data);
742	}
743	libc_hidden_def (STRXFRM)
744
745	#ifndef WIDE_CHAR_VERSION
746	weak_alias (__strxfrm_l, strxfrm_l)
747	#endif
748

Browse the source code of glibc/string/strxfrm_l.c