source: EcnlProtoTool/trunk/onigmo-6.1.3/src/enc/unicode.c@ 331

Last change on this file since 331 was 331, checked in by coas-nagasima, 6 years ago

prototoolに関連するプロジェクトをnewlibからmuslを使うよう変更・更新
ntshellをnewlibの下位の実装から、muslのsyscallの実装に変更・更新
以下のOSSをアップデート
・mruby-1.3.0
・musl-1.1.18
・onigmo-6.1.3
・tcc-0.9.27
以下のOSSを追加
・openssl-1.1.0e
・curl-7.57.0
・zlib-1.2.11
以下のmrbgemsを追加
・iij/mruby-digest
・iij/mruby-env
・iij/mruby-errno
・iij/mruby-iijson
・iij/mruby-ipaddr
・iij/mruby-mock
・iij/mruby-require
・iij/mruby-tls-openssl

  • Property svn:eol-style set to native
  • Property svn:mime-type set to text/x-csrc;charset=UTF-8
File size: 23.3 KB
Line 
1/**********************************************************************
2 unicode.c - Oniguruma (regular expression library)
3**********************************************************************/
4/*-
5 * Copyright (c) 2002-2013 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30#include "regint.h"
31
/* True when the table entry for `code` has the bit selected by `ctype` set.
 * `code` is used directly as an index, so it must be below 256. */
#define ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code,ctype) \
  ((EncUNICODE_ISO_8859_1_CtypeTable[code] & CTYPE_TO_BIT(ctype)) != 0)
#if 0
/* Disabled variant taking an already-computed bit mask instead of a ctype. */
#define ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(code,cbit) \
  ((EncUNICODE_ISO_8859_1_CtypeTable[code] & (cbit)) != 0)
#endif

/* Character-type bit sets for code points 0x00..0xFF, eight entries per row.
 * The comment at the end of each row gives the index of its first entry. */
static const unsigned short EncUNICODE_ISO_8859_1_CtypeTable[256] = {
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, /* 0x00 */
  0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008, /* 0x08 */
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, /* 0x10 */
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, /* 0x18 */
  0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, /* 0x20 */
  0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, /* 0x28 */
  0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, /* 0x30 */
  0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, /* 0x38 */
  0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2, /* 0x40 */
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, /* 0x48 */
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, /* 0x50 */
  0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0, /* 0x58 */
  0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2, /* 0x60 */
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, /* 0x68 */
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, /* 0x70 */
  0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008, /* 0x78 */
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0288, 0x0008, 0x0008, /* 0x80 */
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, /* 0x88 */
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, /* 0x90 */
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, /* 0x98 */
  0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, /* 0xa0 */
  0x00a0, 0x00a0, 0x30e2, 0x01a0, 0x00a0, 0x00a8, 0x00a0, 0x00a0, /* 0xa8 */
  0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x30e2, 0x00a0, 0x01a0, /* 0xb0 */
  0x00a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0, /* 0xb8 */
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, /* 0xc0 */
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, /* 0xc8 */
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0, /* 0xd0 */
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2, /* 0xd8 */
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, /* 0xe0 */
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, /* 0xe8 */
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0, /* 0xf0 */
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2  /* 0xf8 */
};
73
74typedef struct {
75 int n;
76 OnigCodePoint code[3];
77} CodePointList3;
78
79typedef struct {
80 OnigCodePoint from;
81 CodePointList3 to;
82} CaseFold_11_Type;
83
84typedef struct {
85 OnigCodePoint from;
86 CodePointList3 to;
87} CaseUnfold_11_Type;
88
89typedef struct {
90 int n;
91 OnigCodePoint code[2];
92} CodePointList2;
93
94typedef struct {
95 OnigCodePoint from[2];
96 CodePointList2 to;
97} CaseUnfold_12_Type;
98
99typedef struct {
100 OnigCodePoint from[3];
101 CodePointList2 to;
102} CaseUnfold_13_Type;
103
104static inline int
105bits_of(const OnigCodePoint c, const int n)
106{
107 return (c >> (2 - n) * 7) & 127;
108}
109
110static inline int
111bits_at(const OnigCodePoint *c, const int n)
112{
113 return bits_of(c[n / 3], n % 3);
114}
115
116static int
117code1_equal(const OnigCodePoint x, const OnigCodePoint y)
118{
119 if (x != y) return 0;
120 return 1;
121}
122
123static int
124code2_equal(const OnigCodePoint *x, const OnigCodePoint *y)
125{
126 if (x[0] != y[0]) return 0;
127 if (x[1] != y[1]) return 0;
128 return 1;
129}
130
131static int
132code3_equal(const OnigCodePoint *x, const OnigCodePoint *y)
133{
134 if (x[0] != y[0]) return 0;
135 if (x[1] != y[1]) return 0;
136 if (x[2] != y[2]) return 0;
137 return 1;
138}
139
/* Helper macros for the ONIGENC_CASE_* flags; they live here because no
 * other file needs them. */

/* Union of the flags that route a mapping through CaseMappingSpecials. */
#define ONIGENC_CASE_SPECIALS (ONIGENC_CASE_TITLECASE | ONIGENC_CASE_IS_TITLECASE | ONIGENC_CASE_UP_SPECIAL | ONIGENC_CASE_DOWN_SPECIAL)

/* Entries of the CaseMappingSpecials array (enc/unicode/casefold.h) pack a
 * length above bit SpecialsLengthOffset and a code point below it. */
#define SpecialsLengthOffset 25 /* must sit above the bits occupied by a Unicode code point */
#define SpecialsLengthExtract(n) ((n) >> SpecialsLengthOffset)
#define SpecialsCodepointExtract(n) ((n) & ((1 << SpecialsLengthOffset) - 1))
#define SpecialsLengthEncode(n) ((n) << SpecialsLengthOffset)

/* Pack / unpack the index field of width OnigSpecialIndexWidth located at
 * bit position OnigSpecialIndexShift. */
#define OnigSpecialIndexMask (((1 << OnigSpecialIndexWidth) - 1) << OnigSpecialIndexShift)
#define OnigSpecialIndexEncode(n) ((n) << OnigSpecialIndexShift)
#define OnigSpecialIndexDecode(n) (((n) & OnigSpecialIndexMask) >> OnigSpecialIndexShift)
153
154/* macros to shorten "enc/unicode/casefold.h", undefined immediately after including the file */
155#define U ONIGENC_CASE_UPCASE
156#define D ONIGENC_CASE_DOWNCASE
157#define F ONIGENC_CASE_FOLD
158#define ST ONIGENC_CASE_TITLECASE
159#define SU ONIGENC_CASE_UP_SPECIAL
160#define SL ONIGENC_CASE_DOWN_SPECIAL
161#define IT ONIGENC_CASE_IS_TITLECASE
162#define I(n) OnigSpecialIndexEncode(n)
163#define L(n) SpecialsLengthEncode(n)
164
165#include "casefold.h"
166
167#undef U
168#undef D
169#undef F
170#undef ST
171#undef SU
172#undef SL
173#undef IT
174#undef I
175#undef L
176
177#include "name2ctype.h"
178
179#define CODE_RANGES_NUM numberof(CodeRanges)
180
181extern int
182onigenc_unicode_is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSED)
183{
184 if (
185#ifdef USE_UNICODE_PROPERTIES
186 ctype <= ONIGENC_MAX_STD_CTYPE &&
187#endif
188 code < 256) {
189 return ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code, ctype);
190 }
191
192 if (ctype >= CODE_RANGES_NUM) {
193 return ONIGERR_TYPE_BUG;
194 }
195
196 return onig_is_in_code_range((UChar* )CodeRanges[ctype], code);
197}
198
199
200extern int
201onigenc_unicode_ctype_code_range(int ctype, const OnigCodePoint* ranges[])
202{
203 if (ctype >= CODE_RANGES_NUM) {
204 return ONIGERR_TYPE_BUG;
205 }
206
207 *ranges = CodeRanges[ctype];
208
209 return 0;
210}
211
212extern int
213onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,
214 const OnigCodePoint* ranges[],
215 OnigEncoding enc ARG_UNUSED)
216{
217 *sb_out = 0x00;
218 return onigenc_unicode_ctype_code_range(ctype, ranges);
219}
220
221#define PROPERTY_NAME_MAX_SIZE (MAX_WORD_LENGTH + 1)
222
223extern int
224onigenc_unicode_property_name_to_ctype(OnigEncoding enc, const UChar* name, const UChar* end)
225{
226 int len;
227 int ctype;
228 UChar buf[PROPERTY_NAME_MAX_SIZE];
229 const UChar *p;
230 OnigCodePoint code;
231
232 len = 0;
233 for (p = name; p < end; p += enclen(enc, p, end)) {
234 code = ONIGENC_MBC_TO_CODE(enc, p, end);
235 if (code == ' ' || code == '-' || code == '_')
236 continue;
237 if (code >= 0x80)
238 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
239
240 buf[len++] = ONIGENC_ASCII_CODE_TO_LOWER_CASE(code);
241 if (len >= PROPERTY_NAME_MAX_SIZE)
242 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
243 }
244
245 buf[len] = 0;
246
247 if ((ctype = uniname2ctype(buf, len)) < 0) {
248 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
249 }
250
251 return ctype;
252}
253
/* Short aliases for the table lookup functions used throughout this file. */
#define onigenc_unicode_fold_lookup onigenc_unicode_CaseFold_11_lookup
#define onigenc_unicode_unfold1_lookup onigenc_unicode_CaseUnfold_11_lookup
#define onigenc_unicode_unfold2_lookup onigenc_unicode_CaseUnfold_12_lookup
#define onigenc_unicode_unfold3_lookup onigenc_unicode_CaseUnfold_13_lookup

/* Code points that the case functions below treat specially. */
enum {
  I_WITH_DOT_ABOVE = 0x0130, /* U+0130 */
  DOTLESS_i        = 0x0131, /* U+0131 */
  DOT_ABOVE        = 0x0307  /* U+0307 */
};
264
265extern int
266onigenc_unicode_mbc_case_fold(OnigEncoding enc,
267 OnigCaseFoldType flag ARG_UNUSED, const UChar** pp, const UChar* end,
268 UChar* fold)
269{
270 const CodePointList3 *to;
271 OnigCodePoint code;
272 int i, len, rlen;
273 const UChar *p = *pp;
274
275 code = ONIGENC_MBC_TO_CODE(enc, p, end);
276 len = enclen(enc, p, end);
277 *pp += len;
278
279#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
280 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
281 if (code == 'I') {
282 return ONIGENC_CODE_TO_MBC(enc, DOTLESS_i, fold);
283 }
284 else if (code == I_WITH_DOT_ABOVE) {
285 return ONIGENC_CODE_TO_MBC(enc, 'i', fold);
286 }
287 }
288#endif
289
290 if ((to = onigenc_unicode_fold_lookup(code)) != 0) {
291 if (OnigCodePointCount(to->n) == 1) {
292 return ONIGENC_CODE_TO_MBC(enc, to->code[0], fold);
293 }
294#if 0
295 /* NO NEEDS TO CHECK */
296 else if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0)
297#else
298 else
299#endif
300 {
301 rlen = 0;
302 for (i = 0; i < OnigCodePointCount(to->n); i++) {
303 len = ONIGENC_CODE_TO_MBC(enc, to->code[i], fold);
304 fold += len;
305 rlen += len;
306 }
307 return rlen;
308 }
309 }
310
311 for (i = 0; i < len; i++) {
312 *fold++ = *p++;
313 }
314 return len;
315}
316
317extern int
318onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,
319 OnigApplyAllCaseFoldFunc f, void* arg,
320 OnigEncoding enc ARG_UNUSED)
321{
322 const CaseUnfold_11_Type* p11;
323 OnigCodePoint code;
324 int i, j, k, r;
325
326 for (i = 0; i < numberof(CaseUnfold_11); i++) {
327 p11 = &CaseUnfold_11[i];
328 for (j = 0; j < OnigCodePointCount(p11->to.n); j++) {
329 code = p11->from;
330 r = (*f)(p11->to.code[j], &code, 1, arg);
331 if (r != 0) return r;
332
333 code = p11->to.code[j];
334 r = (*f)(p11->from, &code, 1, arg);
335 if (r != 0) return r;
336
337 for (k = 0; k < j; k++) {
338 r = (*f)(p11->to.code[j], (OnigCodePoint* )(&p11->to.code[k]), 1, arg);
339 if (r != 0) return r;
340
341 r = (*f)(p11->to.code[k], (OnigCodePoint* )(&p11->to.code[j]), 1, arg);
342 if (r != 0) return r;
343 }
344 }
345 }
346
347#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
348 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
349 code = DOTLESS_i;
350 r = (*f)('I', &code, 1, arg);
351 if (r != 0) return r;
352 code = 'I';
353 r = (*f)(DOTLESS_i, &code, 1, arg);
354 if (r != 0) return r;
355
356 code = I_WITH_DOT_ABOVE;
357 r = (*f)('i', &code, 1, arg);
358 if (r != 0) return r;
359 code = 'i';
360 r = (*f)(I_WITH_DOT_ABOVE, &code, 1, arg);
361 if (r != 0) return r;
362 }
363 else {
364#endif
365 for (i = 0; i < numberof(CaseUnfold_11_Locale); i++) {
366 p11 = &CaseUnfold_11_Locale[i];
367 for (j = 0; j < OnigCodePointCount(p11->to.n); j++) {
368 code = p11->from;
369 r = (*f)(p11->to.code[j], &code, 1, arg);
370 if (r != 0) return r;
371
372 code = p11->to.code[j];
373 r = (*f)(p11->from, &code, 1, arg);
374 if (r != 0) return r;
375
376 for (k = 0; k < j; k++) {
377 r = (*f)(p11->to.code[j], (OnigCodePoint* )(&p11->to.code[k]),
378 1, arg);
379 if (r != 0) return r;
380
381 r = (*f)(p11->to.code[k], (OnigCodePoint* )(&p11->to.code[j]),
382 1, arg);
383 if (r != 0) return r;
384 }
385 }
386 }
387#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
388 }
389#endif
390
391 if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
392 for (i = 0; i < numberof(CaseUnfold_12); i++) {
393 for (j = 0; j < OnigCodePointCount(CaseUnfold_12[i].to.n); j++) {
394 r = (*f)(CaseUnfold_12[i].to.code[j],
395 (OnigCodePoint* )CaseUnfold_12[i].from, 2, arg);
396 if (r != 0) return r;
397
398 for (k = 0; k < OnigCodePointCount(CaseUnfold_12[i].to.n); k++) {
399 if (k == j) continue;
400
401 r = (*f)(CaseUnfold_12[i].to.code[j],
402 (OnigCodePoint* )(&CaseUnfold_12[i].to.code[k]), 1, arg);
403 if (r != 0) return r;
404 }
405 }
406 }
407
408#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
409 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) == 0) {
410#endif
411 for (i = 0; i < numberof(CaseUnfold_12_Locale); i++) {
412 for (j = 0; j < OnigCodePointCount(CaseUnfold_12_Locale[i].to.n); j++) {
413 r = (*f)(CaseUnfold_12_Locale[i].to.code[j],
414 (OnigCodePoint* )CaseUnfold_12_Locale[i].from, 2, arg);
415 if (r != 0) return r;
416
417 for (k = 0; k < OnigCodePointCount(CaseUnfold_12_Locale[i].to.n); k++) {
418 if (k == j) continue;
419
420 r = (*f)(CaseUnfold_12_Locale[i].to.code[j],
421 (OnigCodePoint* )(&CaseUnfold_12_Locale[i].to.code[k]),
422 1, arg);
423 if (r != 0) return r;
424 }
425 }
426 }
427#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
428 }
429#endif
430
431 for (i = 0; i < numberof(CaseUnfold_13); i++) {
432 for (j = 0; j < OnigCodePointCount(CaseUnfold_13[i].to.n); j++) {
433 r = (*f)(CaseUnfold_13[i].to.code[j],
434 (OnigCodePoint* )CaseUnfold_13[i].from, 3, arg);
435 if (r != 0) return r;
436
437 for (k = 0; k < OnigCodePointCount(CaseUnfold_13[i].to.n); k++) {
438 if (k == j) continue;
439
440 r = (*f)(CaseUnfold_13[i].to.code[j],
441 (OnigCodePoint* )(&CaseUnfold_13[i].to.code[k]), 1, arg);
442 if (r != 0) return r;
443 }
444 }
445 }
446 }
447
448 return 0;
449}
450
451#define CodePointListValidP(x) (OnigCodePointCount((x)->n) <= numberof((x)->code))
452
453extern int
454onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
455 OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end,
456 OnigCaseFoldCodeItem items[])
457{
458 int n, i, j, k, len;
459 OnigCodePoint code, codes[3];
460 const CodePointList3 *to, *z3;
461 const CodePointList2 *z2;
462
463 n = 0;
464
465 code = ONIGENC_MBC_TO_CODE(enc, p, end);
466 len = enclen(enc, p, end);
467
468#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
469 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
470 switch (code) {
471 case 'I':
472 items[0].byte_len = len;
473 items[0].code_len = 1;
474 items[0].code[0] = DOTLESS_i;
475 return 1;
476 case I_WITH_DOT_ABOVE:
477 items[0].byte_len = len;
478 items[0].code_len = 1;
479 items[0].code[0] = 'i';
480 return 1;
481 case DOTLESS_i:
482 items[0].byte_len = len;
483 items[0].code_len = 1;
484 items[0].code[0] = 'I';
485 return 1;
486 case 'i':
487 items[0].byte_len = len;
488 items[0].code_len = 1;
489 items[0].code[0] = I_WITH_DOT_ABOVE;
490 return 1;
491 }
492 }
493#endif
494
495 if ((to = onigenc_unicode_fold_lookup(code)) != 0) {
496 if (OnigCodePointCount(to->n) == 1) {
497 OnigCodePoint orig_code = code;
498
499 items[0].byte_len = len;
500 items[0].code_len = 1;
501 items[0].code[0] = to->code[0];
502 n++;
503
504 code = to->code[0];
505 if ((to = onigenc_unicode_unfold1_lookup(code)) != 0 &&
506 CodePointListValidP(to)) {
507 for (i = 0; i < OnigCodePointCount(to->n); i++) {
508 if (to->code[i] != orig_code) {
509 items[n].byte_len = len;
510 items[n].code_len = 1;
511 items[n].code[0] = to->code[i];
512 n++;
513 }
514 }
515 }
516 }
517 else if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
518 OnigCodePoint cs[3][4];
519 int fn, ncs[3];
520
521 for (fn = 0; fn < OnigCodePointCount(to->n); fn++) {
522 cs[fn][0] = to->code[fn];
523 if ((z3 = onigenc_unicode_unfold1_lookup(cs[fn][0])) != 0) {
524 for (i = 0; i < OnigCodePointCount(z3->n); i++) {
525 cs[fn][i+1] = z3->code[i];
526 }
527 ncs[fn] = OnigCodePointCount(z3->n) + 1;
528 }
529 else
530 ncs[fn] = 1;
531 }
532
533 if (fn == 2) {
534 for (i = 0; i < ncs[0]; i++) {
535 for (j = 0; j < ncs[1]; j++) {
536 items[n].byte_len = len;
537 items[n].code_len = 2;
538 items[n].code[0] = cs[0][i];
539 items[n].code[1] = cs[1][j];
540 n++;
541 }
542 }
543
544 if ((z2 = onigenc_unicode_unfold2_lookup(to->code)) != 0 &&
545 CodePointListValidP(z2)) {
546 for (i = 0; i < OnigCodePointCount(z2->n); i++) {
547 if (z2->code[i] == code) continue;
548
549 items[n].byte_len = len;
550 items[n].code_len = 1;
551 items[n].code[0] = z2->code[i];
552 n++;
553 }
554 }
555 }
556 else {
557 for (i = 0; i < ncs[0]; i++) {
558 for (j = 0; j < ncs[1]; j++) {
559 for (k = 0; k < ncs[2]; k++) {
560 items[n].byte_len = len;
561 items[n].code_len = 3;
562 items[n].code[0] = cs[0][i];
563 items[n].code[1] = cs[1][j];
564 items[n].code[2] = cs[2][k];
565 n++;
566 }
567 }
568 }
569
570 if ((z2 = onigenc_unicode_unfold3_lookup(to->code)) != 0 &&
571 CodePointListValidP(z2)) {
572 for (i = 0; i < OnigCodePointCount(z2->n); i++) {
573 if (z2->code[i] == code) continue;
574
575 items[n].byte_len = len;
576 items[n].code_len = 1;
577 items[n].code[0] = z2->code[i];
578 n++;
579 }
580 }
581 }
582
583 /* multi char folded code is not head of another folded multi char */
584 flag = 0; /* DISABLE_CASE_FOLD_MULTI_CHAR(flag); */
585 }
586 }
587 else {
588 if ((to = onigenc_unicode_unfold1_lookup(code)) != 0 &&
589 CodePointListValidP(to)) {
590 for (i = 0; i < OnigCodePointCount(to->n); i++) {
591 items[n].byte_len = len;
592 items[n].code_len = 1;
593 items[n].code[0] = to->code[i];
594 n++;
595 }
596 }
597 }
598
599
600 if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
601 p += len;
602 if (p < end) {
603 int clen;
604
605 codes[0] = code;
606 code = ONIGENC_MBC_TO_CODE(enc, p, end);
607 if ((to = onigenc_unicode_fold_lookup(code)) != 0
608 && OnigCodePointCount(to->n) == 1) {
609 codes[1] = to->code[0];
610 }
611 else
612 codes[1] = code;
613
614 clen = enclen(enc, p, end);
615 len += clen;
616 if ((z2 = onigenc_unicode_unfold2_lookup(codes)) != 0 &&
617 CodePointListValidP(z2)) {
618 for (i = 0; i < OnigCodePointCount(z2->n); i++) {
619 items[n].byte_len = len;
620 items[n].code_len = 1;
621 items[n].code[0] = z2->code[i];
622 n++;
623 }
624 }
625
626 p += clen;
627 if (p < end) {
628 code = ONIGENC_MBC_TO_CODE(enc, p, end);
629 if ((to = onigenc_unicode_fold_lookup(code)) != 0
630 && OnigCodePointCount(to->n) == 1) {
631 codes[2] = to->code[0];
632 }
633 else
634 codes[2] = code;
635
636 clen = enclen(enc, p, end);
637 len += clen;
638 if ((z2 = onigenc_unicode_unfold3_lookup(codes)) != 0 &&
639 CodePointListValidP(z2)) {
640 for (i = 0; i < OnigCodePointCount(z2->n); i++) {
641 items[n].byte_len = len;
642 items[n].code_len = 1;
643 items[n].code[0] = z2->code[i];
644 n++;
645 }
646 }
647 }
648 }
649 }
650
651 return n;
652}
653
654/* length in bytes for three characters in UTF-32; e.g. needed for ffi (U+FB03) */
655#define CASE_MAPPING_SLACK 12
656#define MODIFIED (flags |= ONIGENC_CASE_MODIFIED)
657extern int
658onigenc_unicode_case_map(OnigCaseFoldType* flagP,
659 const OnigUChar** pp, const OnigUChar* end,
660 OnigUChar* to, OnigUChar* to_end,
661 const struct OnigEncodingTypeST* enc)
662{
663 OnigCodePoint code;
664 OnigUChar *to_start = to;
665 OnigCaseFoldType flags = *flagP;
666 int codepoint_length;
667
668 to_end -= CASE_MAPPING_SLACK;
669 /* copy flags ONIGENC_CASE_UPCASE and ONIGENC_CASE_DOWNCASE over to
670 * ONIGENC_CASE_UP_SPECIAL and ONIGENC_CASE_DOWN_SPECIAL */
671 flags |= (flags & (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) << ONIGENC_CASE_SPECIAL_OFFSET;
672
673 while (*pp < end && to <= to_end) {
674 codepoint_length = ONIGENC_PRECISE_MBC_ENC_LEN(enc, *pp, end);
675 if (codepoint_length < 0)
676 return codepoint_length; /* encoding invalid */
677 code = ONIGENC_MBC_TO_CODE(enc, *pp, end);
678 *pp += codepoint_length;
679
680 if (code <= 'z') { /* ASCII comes first */
681 if (code >= 'a' && code <= 'z') {
682 if (flags & ONIGENC_CASE_UPCASE) {
683 MODIFIED;
684 if (flags & ONIGENC_CASE_FOLD_TURKISH_AZERI && code == 'i')
685 code = I_WITH_DOT_ABOVE;
686 else
687 code += 'A' - 'a';
688 }
689 }
690 else if (code >= 'A' && code <= 'Z') {
691 if (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD)) {
692 MODIFIED;
693 if (flags & ONIGENC_CASE_FOLD_TURKISH_AZERI && code == 'I')
694 code = DOTLESS_i;
695 else
696 code += 'a' - 'A';
697 }
698 }
699 }
700 else if (!(flags & ONIGENC_CASE_ASCII_ONLY) && code >= 0x00B5) { /* deal with non-ASCII; micron sign (U+00B5) is lowest affected */
701 const CodePointList3 *folded;
702
703 if (code == I_WITH_DOT_ABOVE) {
704 if (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD)) {
705 MODIFIED;
706 code = 'i';
707 if (!(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI)) { /* make dot above explicit */
708 to += ONIGENC_CODE_TO_MBC(enc, code, to);
709 code = DOT_ABOVE;
710 }
711 }
712 }
713 else if (code == DOTLESS_i) { /* handle this manually, because it isn't involved in folding */
714 if (flags & ONIGENC_CASE_UPCASE) {
715 MODIFIED;
716 code = 'I';
717 }
718 }
719 else if ((folded = onigenc_unicode_fold_lookup(code)) != 0) { /* data about character found in CaseFold_11_Table */
720 if ((flags & ONIGENC_CASE_TITLECASE) /* Titlecase needed, */
721 && (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE)) { /* but already Titlecase */
722 /* already Titlecase, no changes needed */
723 }
724 else if (flags & OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */
725 const OnigCodePoint *next;
726 int count;
727
728 MODIFIED;
729 if (flags & OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_SPECIALS) { /* special */
730 const OnigCodePoint *SpecialsStart = CaseMappingSpecials + OnigSpecialIndexDecode(folded->n);
731
732 if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE) { /* swapCASE available */
733 if ((flags & (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE))
734 == (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) /* swapCASE needed */
735 goto SpecialsCopy;
736 else /* swapCASE not needed */
737 SpecialsStart += SpecialsLengthExtract(*SpecialsStart);
738 }
739 if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_TITLECASE) { /* Titlecase available */
740 if (flags & ONIGENC_CASE_TITLECASE) /* Titlecase needed, but not yet Titlecase */
741 goto SpecialsCopy;
742 else /* Titlecase not needed */
743 SpecialsStart += SpecialsLengthExtract(*SpecialsStart);
744 }
745 if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_DOWN_SPECIAL) {
746 if (!(flags & ONIGENC_CASE_DOWN_SPECIAL))
747 SpecialsStart += SpecialsLengthExtract(*SpecialsStart);
748 }
749 /* here, we know we use ONIGENC_CASE_UP_SPECIAL, and the position is right */
750SpecialsCopy:
751 count = SpecialsLengthExtract(*SpecialsStart);
752 next = SpecialsStart;
753 code = SpecialsCodepointExtract(*next++);
754 }
755 else { /* no specials */
756 count = OnigCodePointCount(folded->n);
757 next = folded->code;
758 code = *next++;
759 }
760 if (count == 1)
761 ;
762 else if (count == 2) {
763 to += ONIGENC_CODE_TO_MBC(enc, code, to);
764 code = *next;
765 }
766 else { /* count == 3 */
767 to += ONIGENC_CODE_TO_MBC(enc, code, to);
768 to += ONIGENC_CODE_TO_MBC(enc, *next++, to);
769 code = *next;
770 }
771 }
772 }
773 else if ((folded = onigenc_unicode_unfold1_lookup(code)) != 0 /* data about character found in CaseUnfold_11_Table */
774 && flags & OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */
775 MODIFIED;
776 code = folded->code[(flags & OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_TITLECASE) ? 1 : 0];
777 }
778 }
779 to += ONIGENC_CODE_TO_MBC(enc, code, to);
780 /* switch from titlecase to lowercase for capitalize */
781 if (flags & ONIGENC_CASE_TITLECASE)
782 flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE |
783 ONIGENC_CASE_UP_SPECIAL | ONIGENC_CASE_DOWN_SPECIAL);
784 }
785 *flagP = flags;
786 return (int )(to - to_start);
787}
788
#if 0
/* Disabled: Unicode version constants.  When enabled, the values come from
 * the ONIG_UNICODE_VERSION_* macros if those are defined and otherwise
 * default to an empty string and a zero-filled array. */
const char onigenc_unicode_version_string[] =
#ifdef ONIG_UNICODE_VERSION_STRING
  ONIG_UNICODE_VERSION_STRING
#endif
  "";

const int onigenc_unicode_version_number[3] = {
#ifdef ONIG_UNICODE_VERSION_MAJOR
  ONIG_UNICODE_VERSION_MAJOR,
  ONIG_UNICODE_VERSION_MINOR,
  ONIG_UNICODE_VERSION_TEENY,
#else
  0
#endif
};
#endif
Note: See TracBrowser for help on using the repository browser.