Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: EcnlProtoTool/trunk/onigmo-5.15.0/src/enc/utf8.c@ 279

Last change on this file since 279 was 279, checked in by coas-nagasima, 7 years ago
ファイルを追加、更新。
Property svn:eol-style set to `native` Property svn:keywords set to `Id` Property svn:mime-type set to `text/x-csrc`
File size: 8.1 KB

Line
1	/**********************************************************************
2	utf8.c - Oniguruma (regular expression library)
3	**********************************************************************/
4	/*-
5	* Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6	* All rights reserved.
7	*
8	* Redistribution and use in source and binary forms, with or without
9	* modification, are permitted provided that the following conditions
10	* are met:
11	* 1. Redistributions of source code must retain the above copyright
12	* notice, this list of conditions and the following disclaimer.
13	* 2. Redistributions in binary form must reproduce the above copyright
14	* notice, this list of conditions and the following disclaimer in the
15	* documentation and/or other materials provided with the distribution.
16	*
17	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20	* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27	* SUCH DAMAGE.
28	*/
29
30	#include "regenc.h"
31
/* Enable the mapping of the invalid encoding bytes 0xfe and 0xff to
 * dedicated virtual code points (see mbc_to_code / code_to_mbc). */
#define USE_INVALID_CODE_SCHEME

#ifdef USE_INVALID_CODE_SCHEME
/* virtual code point standing in for the invalid byte 0xfe */
#define INVALID_CODE_FE   0xfffffffe
/* virtual code point standing in for the invalid byte 0xff */
#define INVALID_CODE_FF   0xffffffff
/* NOTE(review): not referenced in this file; presumably the largest
 * code point that is not one of the virtual values above. */
#define VALID_CODE_LIMIT  0x7fffffff
#endif

/* Non-zero unless the top two bits of byte c are 10, i.e. unless c is a
 * UTF-8 continuation byte. */
#define utf8_islead(c)    ((UChar )((c) & 0xc0) != 0x80)
42
/* Byte length of an encoded character, indexed by the value of its first
 * byte.  Bytes 0x80-0xbf, 0xfe and 0xff are given length 1. */
static const int EncLen_UTF8[] = {
  /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 - 0x2f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 - 0x3f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 - 0x4f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 - 0x5f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 - 0x6f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 - 0x7f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 - 0x8f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x90 - 0x9f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xa0 - 0xaf */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xb0 - 0xbf */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 - 0xcf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xd0 - 0xdf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xe0 - 0xef */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* 0xf0 - 0xff */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};
61
62	static int
63	mbc_enc_len(const UChar* p)
64	{
65	return EncLen_UTF8[*p];
66	}
67
68	static int
69	is_mbc_newline(const UChar* p, const UChar* end)
70	{
71	if (p < end) {
72	if (*p == 0x0a) return 1;
73
74	#ifdef USE_UNICODE_ALL_LINE_TERMINATORS
75	if (p == 0x0b \|\| p == 0x0c \|\| *p == 0x0d) return 1;
76	if (p + 1 < end) {
77	if ((p+1) == 0x85 && p == 0xc2) /* U+0085 */
78	return 1;
79	if (p + 2 < end) {
80	if (((p+2) == 0xa8 \|\| (p+2) == 0xa9)
81	&& (p+1) == 0x80 && p == 0xe2) /* U+2028, U+2029 */
82	return 1;
83	}
84	}
85	#endif
86	}
87
88	return 0;
89	}
90
91	static OnigCodePoint
92	mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED)
93	{
94	int c, len;
95	OnigCodePoint n;
96
97	len = mbc_enc_len(p);
98	c = *p++;
99	if (len > 1) {
100	len--;
101	n = c & ((1 << (6 - len)) - 1);
102	while (len--) {
103	c = *p++;
104	n = (n << 6) \| (c & ((1 << 6) - 1));
105	}
106	return n;
107	}
108	else {
109	#ifdef USE_INVALID_CODE_SCHEME
110	if (c > 0xfd) {
111	return ((c == 0xfe) ? INVALID_CODE_FE : INVALID_CODE_FF);
112	}
113	#endif
114	return (OnigCodePoint )c;
115	}
116	}
117
118	static int
119	code_to_mbclen(OnigCodePoint code)
120	{
121	if ((code & 0xffffff80) == 0) return 1;
122	else if ((code & 0xfffff800) == 0) return 2;
123	else if ((code & 0xffff0000) == 0) return 3;
124	else if ((code & 0xffe00000) == 0) return 4;
125	else if ((code & 0xfc000000) == 0) return 5;
126	else if ((code & 0x80000000) == 0) return 6;
127	#ifdef USE_INVALID_CODE_SCHEME
128	else if (code == INVALID_CODE_FE) return 1;
129	else if (code == INVALID_CODE_FF) return 1;
130	#endif
131	else
132	return ONIGERR_INVALID_CODE_POINT_VALUE;
133	}
134
135	static int
136	code_to_mbc(OnigCodePoint code, UChar *buf)
137	{
138	#define UTF8_TRAILS(code, shift) (UChar )((((code) >> (shift)) & 0x3f) \| 0x80)
139	#define UTF8_TRAIL0(code) (UChar )(((code) & 0x3f) \| 0x80)
140
141	if ((code & 0xffffff80) == 0) {
142	*buf = (UChar )code;
143	return 1;
144	}
145	else {
146	UChar *p = buf;
147
148	if ((code & 0xfffff800) == 0) {
149	*p++ = (UChar )(((code>>6)& 0x1f) \| 0xc0);
150	}
151	else if ((code & 0xffff0000) == 0) {
152	*p++ = (UChar )(((code>>12) & 0x0f) \| 0xe0);
153	*p++ = UTF8_TRAILS(code, 6);
154	}
155	else if ((code & 0xffe00000) == 0) {
156	*p++ = (UChar )(((code>>18) & 0x07) \| 0xf0);
157	*p++ = UTF8_TRAILS(code, 12);
158	*p++ = UTF8_TRAILS(code, 6);
159	}
160	else if ((code & 0xfc000000) == 0) {
161	*p++ = (UChar )(((code>>24) & 0x03) \| 0xf8);
162	*p++ = UTF8_TRAILS(code, 18);
163	*p++ = UTF8_TRAILS(code, 12);
164	*p++ = UTF8_TRAILS(code, 6);
165	}
166	else if ((code & 0x80000000) == 0) {
167	*p++ = (UChar )(((code>>30) & 0x01) \| 0xfc);
168	*p++ = UTF8_TRAILS(code, 24);
169	*p++ = UTF8_TRAILS(code, 18);
170	*p++ = UTF8_TRAILS(code, 12);
171	*p++ = UTF8_TRAILS(code, 6);
172	}
173	#ifdef USE_INVALID_CODE_SCHEME
174	else if (code == INVALID_CODE_FE) {
175	*p = 0xfe;
176	return 1;
177	}
178	else if (code == INVALID_CODE_FF) {
179	*p = 0xff;
180	return 1;
181	}
182	#endif
183	else {
184	return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
185	}
186
187	*p++ = UTF8_TRAIL0(code);
188	return (int )(p - buf);
189	}
190	}
191
192	static int
193	mbc_case_fold(OnigCaseFoldType flag, const UChar** pp,
194	const UChar* end, UChar* fold)
195	{
196	const UChar* p = *pp;
197
198	if (ONIGENC_IS_MBC_ASCII(p)) {
199	#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
200	if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
201	if (*p == 0x49) {
202	*fold++ = 0xc4;
203	*fold = 0xb1;
204	(*pp)++;
205	return 2;
206	}
207	}
208	#endif
209
210	fold = ONIGENC_ASCII_CODE_TO_LOWER_CASE(p);
211	(*pp)++;
212	return 1; /* return byte length of converted char to lower */
213	}
214	else {
215	return onigenc_unicode_mbc_case_fold(ONIG_ENCODING_UTF8, flag,
216	pp, end, fold);
217	}
218	}
219
#if 0
/*
 * Disabled code.  Advance *pp past the character it points at and report
 * whether that character is case-ambiguous.  Besides ASCII, only the
 * two-byte sequences with lead byte 0xc3 are examined.
 */
static int
is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end)
{
  const UChar* s = *pp;
  int trail;

  if (ONIGENC_IS_MBC_ASCII(s)) {
    (*pp)++;
    return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*s);
  }

  (*pp) += mbc_enc_len(s);

  if (*s != 0xc3) return FALSE;

  trail = *(s + 1);
  if (trail < 0x80) return FALSE;

  if (trail <= 0x9e) {                          /* upper */
    /* 0xc3 0x97 encodes U+00D7, which is excluded */
    return (trail == 0x97) ? FALSE : TRUE;
  }

  if (trail >= 0xa0 && trail <= 0xbe) {         /* lower */
    /* 0xc3 0xb7 encodes U+00F7, which is excluded */
    return (trail == 0xb7) ? FALSE : TRUE;
  }

  /* 0xc3 0x9f encodes U+00DF; ambiguous only with multi-char folding */
  if (trail == 0x9f &&
      (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
    return TRUE;
  }

  return FALSE;
}
#endif
255
256
257	static int
258	get_ctype_code_range(OnigCtype ctype, OnigCodePoint *sb_out,
259	const OnigCodePoint* ranges[])
260	{
261	*sb_out = 0x80;
262	return onigenc_unicode_ctype_code_range(ctype, ranges);
263	}
264
265
266	static UChar*
267	left_adjust_char_head(const UChar* start, const UChar* s)
268	{
269	const UChar *p;
270
271	if (s <= start) return (UChar* )s;
272	p = s;
273
274	while (!utf8_islead(*p) && p > start) p--;
275	return (UChar* )p;
276	}
277
278	static int
279	get_case_fold_codes_by_str(OnigCaseFoldType flag,
280	const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
281	{
282	return onigenc_unicode_get_case_fold_codes_by_str(ONIG_ENCODING_UTF8,
283	flag, p, end, items);
284	}
285
286	OnigEncodingType OnigEncodingUTF8 = {
287	mbc_enc_len,
288	"UTF-8", /* name */
289	6, /* max byte length */
290	1, /* min byte length */
291	is_mbc_newline,
292	mbc_to_code,
293	code_to_mbclen,
294	code_to_mbc,
295	mbc_case_fold,
296	onigenc_unicode_apply_all_case_fold,
297	get_case_fold_codes_by_str,
298	onigenc_unicode_property_name_to_ctype,
299	onigenc_unicode_is_code_ctype,
300	get_ctype_code_range,
301	left_adjust_char_head,
302	onigenc_always_true_is_allowed_reverse_match,
303	ONIGENC_FLAG_UNICODE,
304	};

Note: See TracBrowser for help on using the repository browser.

Download in other formats: