vfs_utfconv.c source code [codebrowser/bsd/vfs/vfs_utfconv.c]

1	/*
2	* Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*/
28
29	/*
30	Includes Unicode 3.2 decomposition code derived from Core Foundation
31	*/
32
33	#include <sys/param.h>
34	#include <sys/utfconv.h>
35	#include <sys/errno.h>
36	#include <sys/malloc.h>
37	#include <libkern/OSByteOrder.h>
38
39	#if defined(KERNEL) && !defined(VFS_UTF8_UNIT_TEST)
40	#include <kern/assert.h>
41	#else
42	#include <assert.h>
43	#endif
44
45	/*
46	* UTF-8 (Unicode Transformation Format)
47	*
48	* UTF-8 is the Unicode Transformation Format that serializes a Unicode
49	* character as a sequence of one to four bytes. Only the shortest form
50	* required to represent the significant Unicode bits is legal.
51	*
52	* UTF-8 Multibyte Codes
53	*
54	* Bytes Bits Unicode Min Unicode Max UTF-8 Byte Sequence (binary)
55	* -----------------------------------------------------------------------------
56	* 1 7 0x0000 0x007F 0xxxxxxx
57	* 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
58	* 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
59	* 4 21 0x10000 0x10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
60	* -----------------------------------------------------------------------------
61	*/
62
63
/*
 * Number of UTF-8 bytes contributed by one UTF-16 code unit.
 *
 * A surrogate half (0xD800 - 0xDFFF) is counted as 2 bytes, so a
 * complete surrogate pair adds up to 4 bytes.
 */
#define UNICODE_TO_UTF8_LEN(c)  \
	((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : (((c) & 0xf800) == 0xd800 ? 2 : 3)))

/* Code unit the encoders substitute for an embedded NUL */
#define UCS_ALT_NULL	0x2400

/* Surrogate Pair Constants */
#define SP_HALF_SHIFT	10
#define SP_HALF_BASE	0x0010000u
#define SP_HALF_MASK	0x3FFu

#define SP_HIGH_FIRST	0xD800u
#define SP_HIGH_LAST	0xDBFFu
#define SP_LOW_FIRST	0xDC00u
#define SP_LOW_LAST	0xDFFFu
78
79
80	#include "vfs_utfconvdata.h"
81
82
83	/*
84	* Test for a combining character.
85	*
86	* Similar to __CFUniCharIsNonBaseCharacter except that
87	* unicode_combinable also includes Hangul Jamo characters.
88	*/
89	int
90	unicode_combinable(u_int16_t character)
91	{
92	const u_int8_t *bitmap = __CFUniCharCombiningBitmap;
93	u_int8_t value;
94
95	if (character < `0x0300`)
96	return (`0`);
97
98	value = bitmap[(character >> `8`) & `0xFF`];
99
100	if (value == `0xFF`) {
101	return (`1`);
102	} else if (value) {
103	bitmap = bitmap + ((value - `1`) * `32`) + `256`;
104	return (bitmap[(character & `0xFF`) / `8`] & (`1` << (character % `8`)) ? `1` : `0`);
105	}
106	return (`0`);
107	}
108
109	/*
110	* Test for a precomposed character.
111	*
112	* Similar to __CFUniCharIsDecomposableCharacter.
113	*/
114	int
115	unicode_decomposeable(u_int16_t character) {
116	const u_int8_t *bitmap = __CFUniCharDecomposableBitmap;
117	u_int8_t value;
118
119	if (character < `0x00C0`)
120	return (`0`);
121
122	value = bitmap[(character >> `8`) & `0xFF`];
123
124	if (value == `0xFF`) {
125	return (`1`);
126	} else if (value) {
127	bitmap = bitmap + ((value - `1`) * `32`) + `256`;
128	return (bitmap[(character & `0xFF`) / `8`] & (`1` << (character % `8`)) ? `1` : `0`);
129	}
130	return (`0`);
131	}
132
133
134	/*
135	* Get the combing class.
136	*
137	* Similar to CFUniCharGetCombiningPropertyForCharacter.
138	*/
139	static inline u_int8_t
140	get_combining_class(u_int16_t character) {
141	const u_int8_t *bitmap = __CFUniCharCombiningPropertyBitmap;
142
143	u_int8_t value = bitmap[(character >> `8`)];
144
145	if (value) {
146	bitmap = bitmap + (value * `256`);
147	return bitmap[character % `256`];
148	}
149	return (`0`);
150	}
151
152
/* Helpers defined further down in this file. */
static int unicode_decompose(u_int16_t character, u_int16_t *convertedChars);
static u_int16_t unicode_combine(u_int16_t base, u_int16_t combining);
static void prioritysort(u_int16_t* characters, int count);
static u_int16_t ucs_to_sfm(u_int16_t ucs_ch, int lastchar);
static u_int16_t sfm_to_ucs(u_int16_t ucs_ch);

/*
 * Indexed by (byte >> 3): the number of bytes the decoders expect to
 * follow that byte, or -1 for a byte that cannot start a sequence.
 *
 * NOTE(review): the -1 entries are tested with "< 0" by utf8_decodestr,
 * which relies on plain char being signed -- confirm for every target.
 */
char utf_extrabytes[32] = {
	/* 0x00 - 0x3f */	 0,  0,  0,  0,  0,  0,  0,  0,
	/* 0x40 - 0x7f */	 0,  0,  0,  0,  0,  0,  0,  0,
	/* 0x80 - 0xbf */	-1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xc0 - 0xff */	 1,  1,  1,  1,  2,  2,  3, -1
};

/* Upper-case hex digits, indexed by nibble value. */
const char hexdigits[16] = {
	'0', '1', '2', '3',
	'4', '5', '6', '7',
	'8', '9', 'A', 'B',
	'C', 'D', 'E', 'F'
};
173
174	/*
175	* utf8_encodelen - Calculate the UTF-8 encoding length
176	*
177	* This function takes a Unicode input string, ucsp, of ucslen bytes
178	* and calculates the size of the UTF-8 output in bytes (not including
179	* a NULL termination byte). The string must reside in kernel memory.
180	*
181	* If '/' chars are possible in the Unicode input then an alternate
182	* (replacement) char should be provided in altslash.
183	*
184	* FLAGS
185	* UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
186	*
187	* UTF_BIG_ENDIAN: Unicode byte order is always big endian
188	*
189	* UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
190	*
191	* UTF_DECOMPOSED: generate fully decomposed output
192	*
193	* UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it
194	*
195	* ERRORS
196	* None
197	*/
198	size_t
199	utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash, int flags)
200	{
201	u_int16_t ucs_ch;
202	u_int16_t * chp = NULL;
203	u_int16_t sequence[`8`];
204	int extra = `0`;
205	size_t charcnt;
206	int swapbytes = (flags & UTF_REVERSE_ENDIAN);
207	int decompose = (flags & UTF_DECOMPOSED);
208	size_t len;
209
210	charcnt = ucslen / `2`;
211	len = `0`;
212
213	while (charcnt-- > `0`) {
214	if (extra > `0`) {
215	--extra;
216	ucs_ch = *chp++;
217	} else {
218	ucs_ch = *ucsp++;
219	if (swapbytes) {
220	ucs_ch = OSSwapInt16(ucs_ch);
221	}
222	if (ucs_ch == `'/'`) {
223	ucs_ch = altslash ? altslash : `'_'`;
224	} else if (ucs_ch == `'\0'`) {
225	ucs_ch = UCS_ALT_NULL;
226	} else if (decompose && unicode_decomposeable(ucs_ch)) {
227	extra = unicode_decompose(ucs_ch, sequence) - `1`;
228	charcnt += extra;
229	ucs_ch = sequence[`0`];
230	chp = &sequence[`1`];
231	}
232	}
233	len += UNICODE_TO_UTF8_LEN(ucs_ch);
234	}
235
236	return (len);
237	}
238
239
240	/*
241	* utf8_encodestr - Encodes a Unicode string to UTF-8
242	*
243	* NOTES:
244	* The resulting UTF-8 string is NULL terminated.
245	*
246	* If '/' chars are allowed on disk then an alternate
247	* (replacement) char must be provided in altslash.
248	*
249	* input flags:
250	* UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
251	*
252	* UTF_BIG_ENDIAN: Unicode byte order is always big endian
253	*
254	* UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
255	*
256	* UTF_DECOMPOSED: generate fully decomposed output
257	*
258	* UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
259	*
260	* result:
261	* ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
262	*
263	* EINVAL: Illegal char found; char was replaced by an '_'.
264	*/
265	int
266	utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
267	size_t * utf8len, size_t buflen, u_int16_t altslash, int flags)
268	{
269	u_int8_t * bufstart;
270	u_int8_t * bufend;
271	u_int16_t ucs_ch;
272	u_int16_t * chp = NULL;
273	u_int16_t sequence[`8`];
274	int extra = `0`;
275	size_t charcnt;
276	int swapbytes = (flags & UTF_REVERSE_ENDIAN);
277	int nullterm = ((flags & UTF_NO_NULL_TERM) == `0`);
278	int decompose = (flags & UTF_DECOMPOSED);
279	int sfmconv = (flags & UTF_SFM_CONVERSIONS);
280	int result = `0`;
281
282	bufstart = utf8p;
283	bufend = bufstart + buflen;
284	if (nullterm)
285	--bufend;
286	charcnt = ucslen / `2`;
287
288	while (charcnt-- > `0`) {
289	if (extra > `0`) {
290	--extra;
291	ucs_ch = *chp++;
292	} else {
293	ucs_ch = swapbytes ? OSSwapInt16(ucsp++) : ucsp++;
294
295	if (decompose && unicode_decomposeable(ucs_ch)) {
296	extra = unicode_decompose(ucs_ch, sequence) - `1`;
297	charcnt += extra;
298	ucs_ch = sequence[`0`];
299	chp = &sequence[`1`];
300	}
301	}
302
303	/ Slash and NULL are not permitted /
304	if (ucs_ch == `'/'`) {
305	if (altslash)
306	ucs_ch = altslash;
307	else {
308	ucs_ch = `'_'`;
309	result = EINVAL;
310	}
311	} else if (ucs_ch == `'\0'`) {
312	ucs_ch = UCS_ALT_NULL;
313	}
314
315	if (ucs_ch < `0x0080`) {
316	if (utf8p >= bufend) {
317	result = ENAMETOOLONG;
318	break;
319	}
320	*utf8p++ = ucs_ch;
321
322	} else if (ucs_ch < `0x800`) {
323	if ((utf8p + `1`) >= bufend) {
324	result = ENAMETOOLONG;
325	break;
326	}
327	*utf8p++ = `0xc0` \| (ucs_ch >> `6`);
328	*utf8p++ = `0x80` \| (`0x3f` & ucs_ch);
329
330	} else {
331	/ These chars never valid Unicode. /
332	if (ucs_ch == `0xFFFE` \|\| ucs_ch == `0xFFFF`) {
333	result = EINVAL;
334	break;
335	}
336
337	/ Combine valid surrogate pairs /
338	if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST
339	&& charcnt > `0`) {
340	u_int16_t ch2;
341	u_int32_t pair;
342
343	ch2 = swapbytes ? OSSwapInt16(ucsp) : ucsp;
344	if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) {
345	pair = ((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT)
346	+ (ch2 - SP_LOW_FIRST) + SP_HALF_BASE;
347	if ((utf8p + `3`) >= bufend) {
348	result = ENAMETOOLONG;
349	break;
350	}
351	--charcnt;
352	++ucsp;
353	*utf8p++ = `0xf0` \| (pair >> `18`);
354	*utf8p++ = `0x80` \| (`0x3f` & (pair >> `12`));
355	*utf8p++ = `0x80` \| (`0x3f` & (pair >> `6`));
356	*utf8p++ = `0x80` \| (`0x3f` & pair);
357	continue;
358	}
359	} else if (sfmconv) {
360	ucs_ch = sfm_to_ucs(ucs_ch);
361	if (ucs_ch < `0x0080`) {
362	if (utf8p >= bufend) {
363	result = ENAMETOOLONG;
364	break;
365	}
366	*utf8p++ = ucs_ch;
367	continue;
368	}
369	}
370	if ((utf8p + `2`) >= bufend) {
371	result = ENAMETOOLONG;
372	break;
373	}
374	*utf8p++ = `0xe0` \| (ucs_ch >> `12`);
375	*utf8p++ = `0x80` \| (`0x3f` & (ucs_ch >> `6`));
376	*utf8p++ = `0x80` \| (`0x3f` & ucs_ch);
377	}
378	}
379
380	*utf8len = utf8p - bufstart;
381	if (nullterm)
382	*utf8p++ = `'\0'`;
383
384	return (result);
385	}
386
/*
 * Pushes a character taking account of combining character sequences.
 *
 * ucs_ch is stored at **ucsp and *ucsp is advanced by one.  *combcharcnt
 * counts the combinable characters stored back to back so far; when a
 * character that is not combinable arrives, a run longer than one is
 * put into canonical order with prioritysort() and the count restarts.
 */
static void push(uint16_t ucs_ch, int *combcharcnt, uint16_t **ucsp)
{
	/*
	 * Make multiple combining character sequences canonical
	 */
	if (unicode_combinable(ucs_ch)) {
		++*combcharcnt;		/* start tracking a run */
	} else if (*combcharcnt) {
		if (*combcharcnt > 1) {
			prioritysort(*ucsp - *combcharcnt, *combcharcnt);
		}
		*combcharcnt = 0;	/* start over */
	}

	*(*ucsp)++ = ucs_ch;
}
404
405	/*
406	* utf8_decodestr - Decodes a UTF-8 string back to Unicode
407	*
408	* NOTES:
409	* The input UTF-8 string does not need to be null terminated
410	* if utf8len is set.
411	*
412	* If '/' chars are allowed on disk then an alternate
413	* (replacement) char must be provided in altslash.
414	*
415	* input flags:
416	* UTF_REV_ENDIAN: Unicode byte order is opposite current runtime
417	*
418	* UTF_BIG_ENDIAN: Unicode byte order is always big endian
419	*
420	* UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
421	*
422	* UTF_DECOMPOSED: generate fully decomposed output (NFD)
423	*
424	* UTF_PRECOMPOSED: generate precomposed output (NFC)
425	*
426	* UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
427	*
428	* result:
429	* ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
430	*
431	* EINVAL: Illegal UTF-8 sequence found.
432	*/
433	int
434	utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
435	size_t ucslen, size_t buflen, u_int16_t altslash, int* flags)
436	{
437	u_int16_t* bufstart;
438	u_int16_t* bufend;
439	unsigned int ucs_ch;
440	unsigned int byte;
441	int combcharcnt = `0`;
442	int result = `0`;
443	int decompose, precompose, escaping;
444	int sfmconv;
445	int extrabytes;
446
447	decompose = (flags & UTF_DECOMPOSED);
448	precompose = (flags & UTF_PRECOMPOSED);
449	escaping = (flags & UTF_ESCAPE_ILLEGAL);
450	sfmconv = (flags & UTF_SFM_CONVERSIONS);
451
452	bufstart = ucsp;
453	bufend = (u_int16_t )((u_int8_t )ucsp + buflen);
454
455	while (utf8len-- > `0` && (byte = *utf8p++) != `'\0'`) {
456	if (ucsp >= bufend)
457	goto toolong;
458
459	/ check for ascii /
460	if (byte < `0x80`) {
461	ucs_ch = sfmconv ? ucs_to_sfm(byte, utf8len == `0`) : byte;
462	} else {
463	u_int32_t ch;
464
465	extrabytes = utf_extrabytes[byte >> `3`];
466	if ((extrabytes < `0`) \|\| ((int)utf8len < extrabytes)) {
467	goto escape;
468	}
469	utf8len -= extrabytes;
470
471	switch (extrabytes) {
472	case `1`:
473	ch = byte; ch <<= `6`; / 1st byte /
474	byte = utf8p++; /* 2nd byte /
475	if ((byte >> `6`) != `2`)
476	goto escape2;
477	ch += byte;
478	ch -= `0x00003080UL`;
479	if (ch < `0x0080`)
480	goto escape2;
481	ucs_ch = ch;
482	break;
483	case `2`:
484	ch = byte; ch <<= `6`; / 1st byte /
485	byte = utf8p++; /* 2nd byte /
486	if ((byte >> `6`) != `2`)
487	goto escape2;
488	ch += byte; ch <<= `6`;
489	byte = utf8p++; /* 3rd byte /
490	if ((byte >> `6`) != `2`)
491	goto escape3;
492	ch += byte;
493	ch -= `0x000E2080UL`;
494	if (ch < `0x0800`)
495	goto escape3;
496	if (ch >= `0xD800`) {
497	if (ch <= `0xDFFF`)
498	goto escape3;
499	if (ch == `0xFFFE` \|\| ch == `0xFFFF`)
500	goto escape3;
501	}
502	ucs_ch = ch;
503	break;
504	case `3`:
505	ch = byte; ch <<= `6`; / 1st byte /
506	byte = utf8p++; /* 2nd byte /
507	if ((byte >> `6`) != `2`)
508	goto escape2;
509	ch += byte; ch <<= `6`;
510	byte = utf8p++; /* 3rd byte /
511	if ((byte >> `6`) != `2`)
512	goto escape3;
513	ch += byte; ch <<= `6`;
514	byte = utf8p++; /* 4th byte /
515	if ((byte >> `6`) != `2`)
516	goto escape4;
517	ch += byte;
518	ch -= `0x03C82080UL` + SP_HALF_BASE;
519	ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
520	if (ucs_ch < SP_HIGH_FIRST \|\| ucs_ch > SP_HIGH_LAST)
521	goto escape4;
522	push(ucs_ch, &combcharcnt, &ucsp);
523	if (ucsp >= bufend)
524	goto toolong;
525	ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
526	if (ucs_ch < SP_LOW_FIRST \|\| ucs_ch > SP_LOW_LAST) {
527	--ucsp;
528	goto escape4;
529	}
530	*ucsp++ = ucs_ch;
531	continue;
532	default:
533	result = EINVAL;
534	goto exit;
535	}
536	if (decompose) {
537	if (unicode_decomposeable(ucs_ch)) {
538	u_int16_t sequence[`8`];
539	int count, i;
540
541	count = unicode_decompose(ucs_ch, sequence);
542
543	for (i = `0`; i < count; ++i) {
544	if (ucsp >= bufend)
545	goto toolong;
546
547	push(sequence[i], &combcharcnt, &ucsp);
548	}
549
550	continue;
551	}
552	} else if (precompose && (ucsp != bufstart)) {
553	u_int16_t composite, base;
554
555	if (unicode_combinable(ucs_ch)) {
556	base = ucsp[-`1`];
557	composite = unicode_combine(base, ucs_ch);
558	if (composite) {
559	--ucsp;
560	ucs_ch = composite;
561	}
562	}
563	}
564	if (ucs_ch == UCS_ALT_NULL)
565	ucs_ch = `'\0'`;
566	}
567	if (ucs_ch == altslash)
568	ucs_ch = `'/'`;
569
570	push(ucs_ch, &combcharcnt, &ucsp);
571	continue;
572
573	/*
574	* Escape illegal UTF-8 into something legal.
575	*/
576	escape4:
577	utf8p -= `3`;
578	goto escape;
579	escape3:
580	utf8p -= `2`;
581	goto escape;
582	escape2:
583	utf8p -= `1`;
584	escape:
585	if (!escaping) {
586	result = EINVAL;
587	goto exit;
588	}
589	if (extrabytes > `0`)
590	utf8len += extrabytes;
591	byte = *(utf8p - `1`);
592
593	if ((ucsp + `2`) >= bufend)
594	goto toolong;
595
596	/ Make a previous combining sequence canonical. /
597	if (combcharcnt > `1`) {
598	prioritysort(ucsp - combcharcnt, combcharcnt);
599	}
600	combcharcnt = `0`;
601
602	ucs_ch = `'%'`;
603	*ucsp++ = ucs_ch;
604	ucs_ch = hexdigits[byte >> `4`];
605	*ucsp++ = ucs_ch;
606	ucs_ch = hexdigits[byte & `0x0F`];
607	*ucsp++ = ucs_ch;
608	}
609	/*
610	* Make a previous combining sequence canonical
611	*/
612	if (combcharcnt > `1`) {
613	prioritysort(ucsp - combcharcnt, combcharcnt);
614	}
615
616	if (flags & UTF_REVERSE_ENDIAN) {
617	uint16_t *p = bufstart;
618	while (p < ucsp) {
619	p = OSSwapInt16(p);
620	++p;
621	}
622	}
623
624	exit:
625	ucslen = (u_int8_t)ucsp - (u_int8_t*)bufstart;
626
627	return (result);
628
629	toolong:
630	result = ENAMETOOLONG;
631	goto exit;
632	}
633
634
635	/*
636	* utf8_validatestr - Check for a valid UTF-8 string.
637	*/
638	int
639	utf8_validatestr(const u_int8_t* utf8p, size_t utf8len)
640	{
641	unsigned int byte;
642	u_int32_t ch;
643	unsigned int ucs_ch;
644	size_t extrabytes;
645
646	while (utf8len-- > `0` && (byte = *utf8p++) != `'\0'`) {
647	if (byte < `0x80`)
648	continue; / plain ascii /
649
650	extrabytes = utf_extrabytes[byte >> `3`];
651
652	if (utf8len < extrabytes)
653	goto invalid;
654	utf8len -= extrabytes;
655
656	switch (extrabytes) {
657	case `1`:
658	ch = byte; ch <<= `6`; / 1st byte /
659	byte = utf8p++; /* 2nd byte /
660	if ((byte >> `6`) != `2`)
661	goto invalid;
662	ch += byte;
663	ch -= `0x00003080UL`;
664	if (ch < `0x0080`)
665	goto invalid;
666	break;
667	case `2`:
668	ch = byte; ch <<= `6`; / 1st byte /
669	byte = utf8p++; /* 2nd byte /
670	if ((byte >> `6`) != `2`)
671	goto invalid;
672	ch += byte; ch <<= `6`;
673	byte = utf8p++; /* 3rd byte /
674	if ((byte >> `6`) != `2`)
675	goto invalid;
676	ch += byte;
677	ch -= `0x000E2080UL`;
678	if (ch < `0x0800`)
679	goto invalid;
680	if (ch >= `0xD800`) {
681	if (ch <= `0xDFFF`)
682	goto invalid;
683	if (ch == `0xFFFE` \|\| ch == `0xFFFF`)
684	goto invalid;
685	}
686	break;
687	case `3`:
688	ch = byte; ch <<= `6`; / 1st byte /
689	byte = utf8p++; /* 2nd byte /
690	if ((byte >> `6`) != `2`)
691	goto invalid;
692	ch += byte; ch <<= `6`;
693	byte = utf8p++; /* 3rd byte /
694	if ((byte >> `6`) != `2`)
695	goto invalid;
696	ch += byte; ch <<= `6`;
697	byte = utf8p++; /* 4th byte /
698	if ((byte >> `6`) != `2`)
699	goto invalid;
700	ch += byte;
701	ch -= `0x03C82080UL` + SP_HALF_BASE;
702	ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
703	if (ucs_ch < SP_HIGH_FIRST \|\| ucs_ch > SP_HIGH_LAST)
704	goto invalid;
705	ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
706	if (ucs_ch < SP_LOW_FIRST \|\| ucs_ch > SP_LOW_LAST)
707	goto invalid;
708	break;
709	default:
710	goto invalid;
711	}
712
713	}
714	return (`0`);
715	invalid:
716	return (EINVAL);
717	}
718
719	/*
720	* utf8_normalizestr - Normalize a UTF-8 string (NFC or NFD)
721	*
722	* This function takes an UTF-8 input string, instr, of inlen bytes
723	* and produces normalized UTF-8 output into a buffer of buflen bytes
724	* pointed to by outstr. The size of the output in bytes (not including
725	* a NULL termination byte) is returned in outlen. In-place conversions
726	* are not supported (i.e. instr != outstr).]
727
728	* FLAGS
729	* UTF_DECOMPOSED: output string will be fully decomposed (NFD)
730	*
731	* UTF_PRECOMPOSED: output string will be precomposed (NFC)
732	*
733	* UTF_NO_NULL_TERM: do not add null termination to output string
734	*
735	* UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
736	*
737	* ERRORS
738	* ENAMETOOLONG: output did not fit or input exceeded MAXPATHLEN bytes
739	*
740	* EINVAL: illegal UTF-8 sequence encountered or invalid flags
741	*/
742	int
743	utf8_normalizestr(const u_int8_t* instr, size_t inlen, u_int8_t* outstr,
744	size_t outlen, size_t buflen, int* flags)
745	{
746	u_int16_t unicodebuf[`32`];
747	u_int16_t* unistr = NULL;
748	size_t unicode_bytes;
749	size_t uft8_bytes;
750	size_t inbuflen;
751	u_int8_t outbufstart, outbufend;
752	const u_int8_t *inbufstart;
753	unsigned int byte;
754	int decompose, precompose;
755	int result = `0`;
756
757	if (flags & ~(UTF_DECOMPOSED \| UTF_PRECOMPOSED \| UTF_NO_NULL_TERM \| UTF_ESCAPE_ILLEGAL)) {
758	return (EINVAL);
759	}
760	decompose = (flags & UTF_DECOMPOSED);
761	precompose = (flags & UTF_PRECOMPOSED);
762	if ((decompose && precompose) \|\| (!decompose && !precompose)) {
763	return (EINVAL);
764	}
765	outbufstart = outstr;
766	outbufend = outbufstart + buflen;
767	inbufstart = instr;
768	inbuflen = inlen;
769
770	while (inlen-- > `0` && (byte = *instr++) != `'\0'`) {
771	if (outstr >= outbufend) {
772	result = ENAMETOOLONG;
773	goto exit;
774	}
775	if (byte >= `0x80`) {
776	goto nonASCII;
777	}
778	/ ASCII is already normalized. /
779	*outstr++ = byte;
780	}
781	exit:
782	*outlen = outstr - outbufstart;
783	if (((flags & UTF_NO_NULL_TERM) == `0`)) {
784	if (outstr < outbufend)
785	*outstr++ = `'\0'`;
786	else
787	result = ENAMETOOLONG;
788	}
789	return (result);
790
791
792	/*
793	* Non-ASCII uses the existing utf8_encodestr/utf8_decodestr
794	* functions to perform the normalization. Since this will
795	* presumably be used to normalize filenames in the back-end
796	* (on disk or over-the-wire), it should be fast enough.
797	*/
798	nonASCII:
799
800	/ Make sure the input size is reasonable. /
801	if (inbuflen > MAXPATHLEN) {
802	result = ENAMETOOLONG;
803	goto exit;
804	}
805	/*
806	* Compute worst case Unicode buffer size.
807	*
808	* For pre-composed output, every UTF-8 input byte will be at
809	* most 2 Unicode bytes. For decomposed output, 2 UTF-8 bytes
810	* (smallest composite char sequence) may yield 6 Unicode bytes
811	* (1 base char + 2 combining chars).
812	*/
813	unicode_bytes = precompose ? (inbuflen * `2`) : (inbuflen * `3`);
814
815	if (unicode_bytes <= sizeof(unicodebuf))
816	unistr = &unicodebuf[`0`];
817	else
818	MALLOC(unistr, uint16_t *, unicode_bytes, M_TEMP, M_WAITOK);
819
820	/ Normalize the string. /
821	result = utf8_decodestr(inbufstart, inbuflen, unistr, &unicode_bytes,
822	unicode_bytes, `0`, flags & ~UTF_NO_NULL_TERM);
823	if (result == `0`) {
824	/ Put results back into UTF-8. /
825	result = utf8_encodestr(unistr, unicode_bytes, outbufstart,
826	&uft8_bytes, buflen, `0`, UTF_NO_NULL_TERM);
827	outstr = outbufstart + uft8_bytes;
828	}
829	if (unistr && unistr != &unicodebuf[`0`]) {
830	FREE(unistr, M_TEMP);
831	}
832	goto exit;
833	}
834
835
836	/*
837	* Unicode 3.2 decomposition code (derived from Core Foundation)
838	*/
839
/* One entry of a lookup table sorted by ascending _key. */
typedef struct {
	u_int32_t _key;
	u_int32_t _value;
} unicode_mappings32;

/*
 * Binary-search theTable (numElem entries, ascending _key) for
 * character.  Returns the matching _value, or 0 when the table is
 * empty or holds no entry for character.
 */
static inline u_int32_t
getmappedvalue32(const unicode_mappings32 *theTable, u_int32_t numElem,
		u_int16_t character)
{
	const unicode_mappings32 *p, *q, *divider;

	/* An empty table has no first/last entry to compare against. */
	if (numElem == 0)
		return (0);

	/* Quick reject for anything outside the key range. */
	if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
		return (0);

	p = theTable;
	q = p + (numElem-1);
	while (p <= q) {
		divider = p + ((q - p) >> 1);	/* divide by 2 */
		if (character < divider->_key) { q = divider - 1; }
		else if (character > divider->_key) { p = divider + 1; }
		else { return (divider->_value); }
	}
	return (0);
}
864
/* Flag bit and count field tested in 16-bit decomposition table values. */
#define RECURSIVE_DECOMPOSITION	(1 << 15)
#define EXTRACT_COUNT(value)	(((value) >> 12) & 0x0007)

/* One entry of a lookup table sorted by ascending _key. */
typedef struct {
	u_int16_t _key;
	u_int16_t _value;
} unicode_mappings16;

/*
 * Binary-search theTable (numElem entries, ascending _key) for
 * character.  Returns the matching _value, or 0 when the table is
 * empty or holds no entry for character.
 */
static inline u_int16_t
getmappedvalue16(const unicode_mappings16 *theTable, u_int32_t numElem,
		u_int16_t character)
{
	const unicode_mappings16 *p, *q, *divider;

	/* An empty table has no first/last entry to compare against. */
	if (numElem == 0)
		return (0);

	/* Quick reject for anything outside the key range. */
	if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
		return (0);

	p = theTable;
	q = p + (numElem-1);
	while (p <= q) {
		divider = p + ((q - p) >> 1);	/* divide by 2 */
		if (character < divider->_key)
			q = divider - 1;
		else if (character > divider->_key)
			p = divider + 1;
		else
			return (divider->_value);
	}
	return (0);
}
895
896
897	static u_int32_t
898	unicode_recursive_decompose(u_int16_t character, u_int16_t *convertedChars)
899	{
900	u_int16_t value;
901	u_int32_t length;
902	u_int16_t firstChar;
903	u_int16_t theChar;
904	const u_int16_t *bmpMappings;
905	u_int32_t usedLength;
906
907	value = getmappedvalue16(
908	(const unicode_mappings16 *)__CFUniCharDecompositionTable,
909	__UniCharDecompositionTableLength, character);
910	length = EXTRACT_COUNT(value);
911	firstChar = value & `0x0FFF`;
912	theChar = firstChar;
913	bmpMappings = (length == `1` ? &theChar : __CFUniCharMultipleDecompositionTable + firstChar);
914	usedLength = `0`;
915
916	if (value & RECURSIVE_DECOMPOSITION) {
917	usedLength = unicode_recursive_decompose((u_int16_t)*bmpMappings, convertedChars);
918
919	--length; / Decrement for the first char /
920	if (!usedLength)
921	return `0`;
922	++bmpMappings;
923	convertedChars += usedLength;
924	}
925
926	usedLength += length;
927
928	while (length--)
929	(convertedChars++) = (bmpMappings++);
930
931	return (usedLength);
932	}
933
#define HANGUL_SBASE 0xAC00
#define HANGUL_LBASE 0x1100
#define HANGUL_VBASE 0x1161
#define HANGUL_TBASE 0x11A7

#define HANGUL_SCOUNT 11172
#define HANGUL_LCOUNT 19
#define HANGUL_VCOUNT 21
#define HANGUL_TCOUNT 28
#define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)

/*
 * unicode_decompose - decompose a composed Unicode char
 *
 * Composed Unicode characters are forbidden on
 * HFS Plus volumes. unicode_decompose will convert a
 * composed character into its correct decomposed
 * sequence.
 *
 * Hangul syllables are split arithmetically into two or three
 * jamo; everything else goes through the decomposition tables.
 * Returns the number of characters stored in convertedChars.
 *
 * Similar to CFUniCharDecomposeCharacter
 */
static int
unicode_decompose(u_int16_t character, u_int16_t *convertedChars)
{
	/*
	 * The syllable block holds HANGUL_SCOUNT characters starting at
	 * HANGUL_SBASE, so the upper bound is exclusive (the same test
	 * unicode_combine uses).
	 */
	if ((character >= HANGUL_SBASE) &&
	    (character < (HANGUL_SBASE + HANGUL_SCOUNT))) {
		u_int32_t length;

		character -= HANGUL_SBASE;
		/* A zero trailing index means the syllable has no third jamo. */
		length = (character % HANGUL_TCOUNT ? 3 : 2);

		*(convertedChars++) =
			character / HANGUL_NCOUNT + HANGUL_LBASE;
		*(convertedChars++) =
			(character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
		if (length > 2)
			*convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
		return (length);
	} else {
		return (unicode_recursive_decompose(character, convertedChars));
	}
}
976
977	/*
978	* unicode_combine - generate a precomposed Unicode char
979	*
980	* Precomposed Unicode characters are required for some volume
981	* formats and network protocols. unicode_combine will combine
982	* a decomposed character sequence into a single precomposed
983	* (composite) character.
984	*
985	* Similar toCFUniCharPrecomposeCharacter but unicode_combine
986	* also handles Hangul Jamo characters.
987	*/
988	static u_int16_t
989	unicode_combine(u_int16_t base, u_int16_t combining)
990	{
991	u_int32_t value;
992
993	/ Check HANGUL /
994	if ((combining >= HANGUL_VBASE) && (combining < (HANGUL_TBASE + HANGUL_TCOUNT))) {
995	/ 2 char Hangul sequences /
996	if ((combining < (HANGUL_VBASE + HANGUL_VCOUNT)) &&
997	(base >= HANGUL_LBASE && base < (HANGUL_LBASE + HANGUL_LCOUNT))) {
998	return (HANGUL_SBASE +
999	((base - HANGUL_LBASE)(HANGUL_VCOUNTHANGUL_TCOUNT)) +
1000	((combining - HANGUL_VBASE)*HANGUL_TCOUNT));
1001	}
1002
1003	/ 3 char Hangul sequences /
1004	if ((combining > HANGUL_TBASE) &&
1005	(base >= HANGUL_SBASE && base < (HANGUL_SBASE + HANGUL_SCOUNT))) {
1006	if ((base - HANGUL_SBASE) % HANGUL_TCOUNT)
1007	return (`0`);
1008	else
1009	return (base + (combining - HANGUL_TBASE));
1010	}
1011	}
1012
1013	value = getmappedvalue32(
1014	(const unicode_mappings32 *)__CFUniCharPrecompSourceTable,
1015	__CFUniCharPrecompositionTableLength, combining);
1016
1017	if (value) {
1018	value = getmappedvalue16(
1019	(const unicode_mappings16 *)
1020	((const u_int32_t *)__CFUniCharBMPPrecompDestinationTable + (value & `0xFFFF`)),
1021	(value >> `16`), base);
1022	}
1023	return (value);
1024	}
1025
1026
/*
 * prioritysort - order combining chars into canonical order
 *
 * Bubble-sorts the count characters at characters in place by
 * ascending combining class.  A character of class 0 is never moved
 * in front of its predecessor.
 *
 * Similar to CFUniCharPrioritySort
 */
static void
prioritysort(u_int16_t* characters, int count)
{
	u_int32_t p1, p2;
	u_int16_t *ch1, *ch2;
	u_int16_t *end;
	int changes = 0;

	/* Fewer than two characters: nothing to order (or even to read). */
	if (count < 2)
		return;

	end = characters + count;
	do {
		changes = 0;
		ch1 = characters;
		ch2 = characters + 1;
		p2 = get_combining_class(*ch1);
		while (ch2 < end) {
			p1 = p2;
			p2 = get_combining_class(*ch2);
			if (p1 > p2 && p2 != 0) {
				u_int32_t tmp;

				tmp = *ch1;
				*ch1 = *ch2;
				*ch2 = tmp;
				changes = 1;

				/*
				 * Make sure that p2 contains the combining class for the
				 * character now stored at *ch2.  This isn't required for
				 * correctness, but it will be more efficient if a character
				 * with a large combining class has to "bubble past" several
				 * characters with lower combining classes.
				 */
				p2 = p1;
			}
			++ch1;
			++ch2;
		}
	} while (changes);
}
1071
1072
1073	/*
1074	* Invalid NTFS filename characters are encoded using the
1075	* SFM (Services for Macintosh) private use Unicode characters.
1076	*
1077	* These should only be used for SMB, MSDOS or NTFS.
1078	*
1079	* Illegal NTFS Char SFM Unicode Char
1080	* ----------------------------------------
1081	* 0x01-0x1f 0xf001-0xf01f
1082	* '"' 0xf020
1083	* '*' 0xf021
1084	* '/' 0xf022
1085	* '<' 0xf023
1086	* '>' 0xf024
1087	* '?' 0xf025
1088	* '\' 0xf026
1089	* '\|' 0xf027
1090	* ' ' 0xf028 (Only if last char of the name)
1091	* '.' 0xf029 (Only if last char of the name)
1092	* ----------------------------------------
1093	*
1094	* Reference: http://support.microsoft.com/kb/q117258/
1095	*/
1096
#define MAX_SFM2MAC           0x29
#define SFMCODE_PREFIX_MASK   0xf000

/*
 * In the Mac OS 9 days the colon was illegal in a file name. For that reason
 * SFM had no conversion for the colon. There is a conversion for the
 * slash. In Mac OS X the slash is illegal in a file name. So for us the colon
 * is a slash and a slash is a colon. So we can just replace the slash with the
 * colon in our tables and everything will just work.
 */

/* Indexed by the low six bits of an SFM code unit (0x00 - MAX_SFM2MAC). */
static const u_int8_t
sfm2mac[] = {
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,		/* 00 - 07 */
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,		/* 08 - 0F */
	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,		/* 10 - 17 */
	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,		/* 18 - 1F */
	0x22, 0x2a, 0x3a, 0x3c, 0x3e, 0x3f, 0x5c, 0x7c,		/* 20 - 27 */
	0x20, 0x2e						/* 28 - 29 */
};
#define SFM2MAC_LEN	((sizeof(sfm2mac))/sizeof(sfm2mac[0]))

/*
 * Indexed by (ASCII char - 0x20).  An entry equal to its own character
 * means "no conversion"; any other entry is the low byte of the SFM code.
 */
static const u_int8_t
mac2sfm[] = {
	0x20, 0x21, 0x20, 0x23, 0x24, 0x25, 0x26, 0x27,		/* 20 - 27 */
	0x28, 0x29, 0x21, 0x2b, 0x2c, 0x2d, 0x2e, 0x22,		/* 28 - 2f */
	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,		/* 30 - 37 */
	0x38, 0x39, 0x22, 0x3b, 0x23, 0x3d, 0x24, 0x25,		/* 38 - 3f */
	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,		/* 40 - 47 */
	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,		/* 48 - 4f */
	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,		/* 50 - 57 */
	0x58, 0x59, 0x5a, 0x5b, 0x26, 0x5d, 0x5e, 0x5f,		/* 58 - 5f */
	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,		/* 60 - 67 */
	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,		/* 68 - 6f */
	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,		/* 70 - 77 */
	0x78, 0x79, 0x7a, 0x7b, 0x27, 0x7d, 0x7e, 0x7f		/* 78 - 7f */
};
#define MAC2SFM_LEN	((sizeof(mac2sfm))/sizeof(mac2sfm[0]))
1134
1135
1136	/*
1137	* Encode illegal NTFS filename characters into SFM Private Unicode characters
1138	*
1139	* Assumes non-zero ASCII input.
1140	*/
1141	static u_int16_t
1142	ucs_to_sfm(u_int16_t ucs_ch, int lastchar)
1143	{
1144	/ The last character of filename cannot be a space or period. /
1145	if (lastchar) {
1146	if (ucs_ch == `0x20`)
1147	return (`0xf028`);
1148	else if (ucs_ch == `0x2e`)
1149	return (`0xf029`);
1150	}
1151	/ 0x01 - 0x1f is simple transformation. /
1152	if (ucs_ch <= `0x1f`) {
1153	return (ucs_ch \| `0xf000`);
1154	} else / 0x20 - 0x7f / {
1155	u_int16_t lsb;
1156
1157	assert((ucs_ch - `0x0020`) < MAC2SFM_LEN);
1158	lsb = mac2sfm[ucs_ch - `0x0020`];
1159	if (lsb != ucs_ch)
1160	return(`0xf000` \| lsb);
1161	}
1162	return (ucs_ch);
1163	}
1164
1165	/*
1166	* Decode any SFM Private Unicode characters
1167	*/
1168	static u_int16_t
1169	sfm_to_ucs(u_int16_t ucs_ch)
1170	{
1171	if (((ucs_ch & `0xffC0`) == SFMCODE_PREFIX_MASK) &&
1172	((ucs_ch & `0x003f`) <= MAX_SFM2MAC)) {
1173	assert((ucs_ch & `0x003f`) < SFM2MAC_LEN);
1174	ucs_ch = sfm2mac[ucs_ch & `0x003f`];
1175	}
1176	return (ucs_ch);
1177	}
1178
1179
1180

Browse the source code of codebrowser/bsd/vfs/vfs_utfconv.c