Changeset 331 for EcnlProtoTool/trunk/onigmo-6.1.3/src/enc/unicode.c
- Timestamp:
- Jan 21, 2018, 12:10:09 AM (6 years ago)
- Location:
- EcnlProtoTool/trunk/onigmo-6.1.3
- Files:
-
- 1 edited
- 1 moved
Legend:
- Unmodified
- Added
- Removed
-
EcnlProtoTool/trunk/onigmo-6.1.3/src/enc/unicode.c
r321 r331 138 138 } 139 139 140 #include "enc/unicode/casefold.h" 141 142 #include "enc/unicode/name2ctype.h" 140 /* macros related to ONIGENC_CASE flags */ 141 /* defined here because not used in other files */ 142 #define ONIGENC_CASE_SPECIALS (ONIGENC_CASE_TITLECASE | ONIGENC_CASE_IS_TITLECASE | ONIGENC_CASE_UP_SPECIAL | ONIGENC_CASE_DOWN_SPECIAL) 143 144 /* macros for length in CaseMappingSpecials array in enc/unicode/casefold.h */ 145 #define SpecialsLengthOffset 25 /* needs to be higher than the 22 bits used for Unicode codepoints */ 146 #define SpecialsLengthExtract(n) ((n) >> SpecialsLengthOffset) 147 #define SpecialsCodepointExtract(n) ((n) & ((1 << SpecialsLengthOffset) - 1)) 148 #define SpecialsLengthEncode(n) ((n) << SpecialsLengthOffset) 149 150 #define OnigSpecialIndexMask (((1 << OnigSpecialIndexWidth) - 1) << OnigSpecialIndexShift) 151 #define OnigSpecialIndexEncode(n) ((n) << OnigSpecialIndexShift) 152 #define OnigSpecialIndexDecode(n) (((n) & OnigSpecialIndexMask) >> OnigSpecialIndexShift) 153 154 /* macros to shorten "enc/unicode/casefold.h", undefined immediately after including the file */ 155 #define U ONIGENC_CASE_UPCASE 156 #define D ONIGENC_CASE_DOWNCASE 157 #define F ONIGENC_CASE_FOLD 158 #define ST ONIGENC_CASE_TITLECASE 159 #define SU ONIGENC_CASE_UP_SPECIAL 160 #define SL ONIGENC_CASE_DOWN_SPECIAL 161 #define IT ONIGENC_CASE_IS_TITLECASE 162 #define I(n) OnigSpecialIndexEncode(n) 163 #define L(n) SpecialsLengthEncode(n) 164 165 #include "casefold.h" 166 167 #undef U 168 #undef D 169 #undef F 170 #undef ST 171 #undef SU 172 #undef SL 173 #undef IT 174 #undef I 175 #undef L 176 177 #include "name2ctype.h" 143 178 144 179 #define CODE_RANGES_NUM numberof(CodeRanges) 145 180 146 181 extern int 147 onigenc_unicode_is_code_ctype(OnigCodePoint code, unsigned int ctype )182 onigenc_unicode_is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSED) 148 183 { 149 184 if ( … … 177 212 extern int 178 213 onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out, 179 const OnigCodePoint* ranges[]) 214 const OnigCodePoint* ranges[], 215 OnigEncoding enc ARG_UNUSED) 180 216 { 181 217 *sb_out = 0x00; … … 183 219 } 184 220 185 #include "st.h"186 187 221 #define PROPERTY_NAME_MAX_SIZE (MAX_WORD_LENGTH + 1) 188 222 189 223 extern int 190 onigenc_unicode_property_name_to_ctype(OnigEncoding enc, UChar* name,UChar* end)224 onigenc_unicode_property_name_to_ctype(OnigEncoding enc, const UChar* name, const UChar* end) 191 225 { 192 226 int len; 193 227 int ctype; 194 228 UChar buf[PROPERTY_NAME_MAX_SIZE]; 195 UChar *p;229 const UChar *p; 196 230 OnigCodePoint code; 197 231 198 232 len = 0; 199 for (p = name; p < end; p += enclen(enc, p )) {233 for (p = name; p < end; p += enclen(enc, p, end)) { 200 234 code = ONIGENC_MBC_TO_CODE(enc, p, end); 201 235 if (code == ' ' || code == '-' || code == '_') … … 223 257 #define onigenc_unicode_unfold3_lookup onigenc_unicode_CaseUnfold_13_lookup 224 258 259 enum { 260 I_WITH_DOT_ABOVE = 0x0130, 261 DOTLESS_i = 0x0131, 262 DOT_ABOVE = 0x0307 263 }; 264 225 265 extern int 226 266 onigenc_unicode_mbc_case_fold(OnigEncoding enc, … … 234 274 235 275 code = ONIGENC_MBC_TO_CODE(enc, p, end); 236 len = enclen(enc, p );276 len = enclen(enc, p, end); 237 277 *pp += len; 238 278 239 279 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI 240 280 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { 241 if (code == 0x0049) {242 return ONIGENC_CODE_TO_MBC(enc, 0x0131, fold);243 } 244 else if (code == 0x0130) {245 return ONIGENC_CODE_TO_MBC(enc, 0x0069, fold);281 if (code == 'I') { 282 return ONIGENC_CODE_TO_MBC(enc, DOTLESS_i, fold); 283 } 284 else if (code == I_WITH_DOT_ABOVE) { 285 return ONIGENC_CODE_TO_MBC(enc, 'i', fold); 246 286 } 247 287 } … … 249 289 250 290 if ((to = onigenc_unicode_fold_lookup(code)) != 0) { 251 if ( to->n== 1) {291 if (OnigCodePointCount(to->n) == 1) { 252 292 return ONIGENC_CODE_TO_MBC(enc, to->code[0], fold); 253 293 } … … 260 300 { 261 301 rlen = 0; 262 for (i = 0; i < to->n; i++) {302 for (i = 0; i < OnigCodePointCount(to->n); i++) { 263 303 len = ONIGENC_CODE_TO_MBC(enc, to->code[i], fold); 264 304 fold += len; … … 277 317 extern int 278 318 onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, 279 OnigApplyAllCaseFoldFunc f, void* arg) 319 OnigApplyAllCaseFoldFunc f, void* arg, 320 OnigEncoding enc ARG_UNUSED) 280 321 { 281 322 const CaseUnfold_11_Type* p11; … … 285 326 for (i = 0; i < numberof(CaseUnfold_11); i++) { 286 327 p11 = &CaseUnfold_11[i]; 287 for (j = 0; j < p11->to.n; j++) {328 for (j = 0; j < OnigCodePointCount(p11->to.n); j++) { 288 329 code = p11->from; 289 330 r = (*f)(p11->to.code[j], &code, 1, arg); … … 306 347 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI 307 348 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { 308 code = 0x0131;309 r = (*f)( 0x0049, &code, 1, arg);349 code = DOTLESS_i; 350 r = (*f)('I', &code, 1, arg); 310 351 if (r != 0) return r; 311 code = 0x0049;312 r = (*f)( 0x0131, &code, 1, arg);352 code = 'I'; 353 r = (*f)(DOTLESS_i, &code, 1, arg); 313 354 if (r != 0) return r; 314 355 315 code = 0x0130;316 r = (*f)( 0x0069, &code, 1, arg);356 code = I_WITH_DOT_ABOVE; 357 r = (*f)('i', &code, 1, arg); 317 358 if (r != 0) return r; 318 code = 0x0069;319 r = (*f)( 0x0130, &code, 1, arg);359 code = 'i'; 360 r = (*f)(I_WITH_DOT_ABOVE, &code, 1, arg); 320 361 if (r != 0) return r; 321 362 } … … 324 365 for (i = 0; i < numberof(CaseUnfold_11_Locale); i++) { 325 366 p11 = &CaseUnfold_11_Locale[i]; 326 for (j = 0; j < p11->to.n; j++) {367 for (j = 0; j < OnigCodePointCount(p11->to.n); j++) { 327 368 code = p11->from; 328 369 r = (*f)(p11->to.code[j], &code, 1, arg); … … 350 391 if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { 351 392 for (i = 0; i < numberof(CaseUnfold_12); i++) { 352 for (j = 0; j < CaseUnfold_12[i].to.n; j++) {393 for (j = 0; j < OnigCodePointCount(CaseUnfold_12[i].to.n); j++) { 353 394 r = (*f)(CaseUnfold_12[i].to.code[j], 354 395 (OnigCodePoint* )CaseUnfold_12[i].from, 2, arg); 355 396 if (r != 0) return r; 356 397 357 for (k = 0; k < CaseUnfold_12[i].to.n; k++) {398 for (k = 0; k < OnigCodePointCount(CaseUnfold_12[i].to.n); k++) { 358 399 if (k == j) continue; 359 400 … … 369 410 #endif 370 411 for (i = 0; i < numberof(CaseUnfold_12_Locale); i++) { 371 for (j = 0; j < CaseUnfold_12_Locale[i].to.n; j++) {412 for (j = 0; j < OnigCodePointCount(CaseUnfold_12_Locale[i].to.n); j++) { 372 413 r = (*f)(CaseUnfold_12_Locale[i].to.code[j], 373 414 (OnigCodePoint* )CaseUnfold_12_Locale[i].from, 2, arg); 374 415 if (r != 0) return r; 375 416 376 for (k = 0; k < CaseUnfold_12_Locale[i].to.n; k++) {417 for (k = 0; k < OnigCodePointCount(CaseUnfold_12_Locale[i].to.n); k++) { 377 418 if (k == j) continue; 378 419 … … 389 430 390 431 for (i = 0; i < numberof(CaseUnfold_13); i++) { 391 for (j = 0; j < CaseUnfold_13[i].to.n; j++) {432 for (j = 0; j < OnigCodePointCount(CaseUnfold_13[i].to.n); j++) { 392 433 r = (*f)(CaseUnfold_13[i].to.code[j], 393 434 (OnigCodePoint* )CaseUnfold_13[i].from, 3, arg); 394 435 if (r != 0) return r; 395 436 396 for (k = 0; k < CaseUnfold_13[i].to.n; k++) {437 for (k = 0; k < OnigCodePointCount(CaseUnfold_13[i].to.n); k++) { 397 438 if (k == j) continue; 398 439 … … 407 448 return 0; 408 449 } 450 451 #define CodePointListValidP(x) (OnigCodePointCount((x)->n) <= numberof((x)->code)) 409 452 410 453 extern int … … 421 464 422 465 code = ONIGENC_MBC_TO_CODE(enc, p, end); 423 len = enclen(enc, p );466 len = enclen(enc, p, end); 424 467 425 468 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI 426 469 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { 427 if (code == 0x0049) { 470 switch (code) { 471 case 'I': 428 472 items[0].byte_len = len; 429 473 items[0].code_len = 1; 430 items[0].code[0] = 0x0131;474 items[0].code[0] = DOTLESS_i; 431 475 return 1; 432 } 433 else if (code == 0x0130) { 476 case I_WITH_DOT_ABOVE: 434 477 items[0].byte_len = len; 435 478 items[0].code_len = 1; 436 items[0].code[0] = 0x0069;479 items[0].code[0] = 'i'; 437 480 return 1; 438 } 439 else if (code == 0x0131) { 481 case DOTLESS_i: 440 482 items[0].byte_len = len; 441 483 items[0].code_len = 1; 442 items[0].code[0] = 0x0049;484 items[0].code[0] = 'I'; 443 485 return 1; 444 } 445 else if (code == 0x0069) { 486 case 'i': 446 487 items[0].byte_len = len; 447 488 items[0].code_len = 1; 448 items[0].code[0] = 0x0130;489 items[0].code[0] = I_WITH_DOT_ABOVE; 449 490 return 1; 450 491 } … … 453 494 454 495 if ((to = onigenc_unicode_fold_lookup(code)) != 0) { 455 if ( to->n== 1) {496 if (OnigCodePointCount(to->n) == 1) { 456 497 OnigCodePoint orig_code = code; 457 498 … … 462 503 463 504 code = to->code[0]; 464 if ((to = onigenc_unicode_unfold1_lookup(code)) != 0) { 465 for (i = 0; i < to->n; i++) { 505 if ((to = onigenc_unicode_unfold1_lookup(code)) != 0 && 506 CodePointListValidP(to)) { 507 for (i = 0; i < OnigCodePointCount(to->n); i++) { 466 508 if (to->code[i] != orig_code) { 467 509 items[n].byte_len = len; … … 477 519 int fn, ncs[3]; 478 520 479 for (fn = 0; fn < to->n; fn++) {521 for (fn = 0; fn < OnigCodePointCount(to->n); fn++) { 480 522 cs[fn][0] = to->code[fn]; 481 523 if ((z3 = onigenc_unicode_unfold1_lookup(cs[fn][0])) != 0) { 482 for (i = 0; i < z3->n; i++) {524 for (i = 0; i < OnigCodePointCount(z3->n); i++) { 483 525 cs[fn][i+1] = z3->code[i]; 484 526 } 485 ncs[fn] = z3->n+ 1;527 ncs[fn] = OnigCodePointCount(z3->n) + 1; 486 528 } 487 529 else … … 500 542 } 501 543 502 if ((z2 = onigenc_unicode_unfold2_lookup(to->code)) != 0) { 503 for (i = 0; i < z2->n; i++) { 544 if ((z2 = onigenc_unicode_unfold2_lookup(to->code)) != 0 && 545 CodePointListValidP(z2)) { 546 for (i = 0; i < OnigCodePointCount(z2->n); i++) { 504 547 if (z2->code[i] == code) continue; 505 548 … … 525 568 } 526 569 527 if ((z2 = onigenc_unicode_unfold3_lookup(to->code)) != 0) { 528 for (i = 0; i < z2->n; i++) { 570 if ((z2 = onigenc_unicode_unfold3_lookup(to->code)) != 0 && 571 CodePointListValidP(z2)) { 572 for (i = 0; i < OnigCodePointCount(z2->n); i++) { 529 573 if (z2->code[i] == code) continue; 530 574 … … 542 586 } 543 587 else { 544 if ((to = onigenc_unicode_unfold1_lookup(code)) != 0) { 545 for (i = 0; i < to->n; i++) { 588 if ((to = onigenc_unicode_unfold1_lookup(code)) != 0 && 589 CodePointListValidP(to)) { 590 for (i = 0; i < OnigCodePointCount(to->n); i++) { 546 591 items[n].byte_len = len; 547 592 items[n].code_len = 1; … … 561 606 code = ONIGENC_MBC_TO_CODE(enc, p, end); 562 607 if ((to = onigenc_unicode_fold_lookup(code)) != 0 563 && to->n== 1) {608 && OnigCodePointCount(to->n) == 1) { 564 609 codes[1] = to->code[0]; 565 610 } … … 567 612 codes[1] = code; 568 613 569 clen = enclen(enc, p );614 clen = enclen(enc, p, end); 570 615 len += clen; 571 if ((z2 = onigenc_unicode_unfold2_lookup(codes)) != 0) { 572 for (i = 0; i < z2->n; i++) { 616 if ((z2 = onigenc_unicode_unfold2_lookup(codes)) != 0 && 617 CodePointListValidP(z2)) { 618 for (i = 0; i < OnigCodePointCount(z2->n); i++) { 573 619 items[n].byte_len = len; 574 620 items[n].code_len = 1; … … 582 628 code = ONIGENC_MBC_TO_CODE(enc, p, end); 583 629 if ((to = onigenc_unicode_fold_lookup(code)) != 0 584 && to->n== 1) {630 && OnigCodePointCount(to->n) == 1) { 585 631 codes[2] = to->code[0]; 586 632 } … … 588 634 codes[2] = code; 589 635 590 clen = enclen(enc, p );636 clen = enclen(enc, p, end); 591 637 len += clen; 592 if ((z2 = onigenc_unicode_unfold3_lookup(codes)) != 0) { 593 for (i = 0; i < z2->n; i++) { 638 if ((z2 = onigenc_unicode_unfold3_lookup(codes)) != 0 && 639 CodePointListValidP(z2)) { 640 for (i = 0; i < OnigCodePointCount(z2->n); i++) { 594 641 items[n].byte_len = len; 595 642 items[n].code_len = 1; … … 604 651 return n; 605 652 } 653 654 /* length in bytes for three characters in UTF-32; e.g. needed for ffi (U+FB03) */ 655 #define CASE_MAPPING_SLACK 12 656 #define MODIFIED (flags |= ONIGENC_CASE_MODIFIED) 657 extern int 658 onigenc_unicode_case_map(OnigCaseFoldType* flagP, 659 const OnigUChar** pp, const OnigUChar* end, 660 OnigUChar* to, OnigUChar* to_end, 661 const struct OnigEncodingTypeST* enc) 662 { 663 OnigCodePoint code; 664 OnigUChar *to_start = to; 665 OnigCaseFoldType flags = *flagP; 666 int codepoint_length; 667 668 to_end -= CASE_MAPPING_SLACK; 669 /* copy flags ONIGENC_CASE_UPCASE and ONIGENC_CASE_DOWNCASE over to 670 * ONIGENC_CASE_UP_SPECIAL and ONIGENC_CASE_DOWN_SPECIAL */ 671 flags |= (flags & (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) << ONIGENC_CASE_SPECIAL_OFFSET; 672 673 while (*pp < end && to <= to_end) { 674 codepoint_length = ONIGENC_PRECISE_MBC_ENC_LEN(enc, *pp, end); 675 if (codepoint_length < 0) 676 return codepoint_length; /* encoding invalid */ 677 code = ONIGENC_MBC_TO_CODE(enc, *pp, end); 678 *pp += codepoint_length; 679 680 if (code <= 'z') { /* ASCII comes first */ 681 if (code >= 'a' && code <= 'z') { 682 if (flags & ONIGENC_CASE_UPCASE) { 683 MODIFIED; 684 if (flags & ONIGENC_CASE_FOLD_TURKISH_AZERI && code == 'i') 685 code = I_WITH_DOT_ABOVE; 686 else 687 code += 'A' - 'a'; 688 } 689 } 690 else if (code >= 'A' && code <= 'Z') { 691 if (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD)) { 692 MODIFIED; 693 if (flags & ONIGENC_CASE_FOLD_TURKISH_AZERI && code == 'I') 694 code = DOTLESS_i; 695 else 696 code += 'a' - 'A'; 697 } 698 } 699 } 700 else if (!(flags & ONIGENC_CASE_ASCII_ONLY) && code >= 0x00B5) { /* deal with non-ASCII; micron sign (U+00B5) is lowest affected */ 701 const CodePointList3 *folded; 702 703 if (code == I_WITH_DOT_ABOVE) { 704 if (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD)) { 705 MODIFIED; 706 code = 'i'; 707 if (!(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI)) { /* make dot above explicit */ 708 to += ONIGENC_CODE_TO_MBC(enc, code, to); 709 code = DOT_ABOVE; 710 } 711 } 712 } 713 else if (code == DOTLESS_i) { /* handle this manually, because it isn't involved in folding */ 714 if (flags & ONIGENC_CASE_UPCASE) { 715 MODIFIED; 716 code = 'I'; 717 } 718 } 719 else if ((folded = onigenc_unicode_fold_lookup(code)) != 0) { /* data about character found in CaseFold_11_Table */ 720 if ((flags & ONIGENC_CASE_TITLECASE) /* Titlecase needed, */ 721 && (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE)) { /* but already Titlecase */ 722 /* already Titlecase, no changes needed */ 723 } 724 else if (flags & OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */ 725 const OnigCodePoint *next; 726 int count; 727 728 MODIFIED; 729 if (flags & OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_SPECIALS) { /* special */ 730 const OnigCodePoint *SpecialsStart = CaseMappingSpecials + OnigSpecialIndexDecode(folded->n); 731 732 if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE) { /* swapCASE available */ 733 if ((flags & (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) 734 == (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) /* swapCASE needed */ 735 goto SpecialsCopy; 736 else /* swapCASE not needed */ 737 SpecialsStart += SpecialsLengthExtract(*SpecialsStart); 738 } 739 if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_TITLECASE) { /* Titlecase available */ 740 if (flags & ONIGENC_CASE_TITLECASE) /* Titlecase needed, but not yet Titlecase */ 741 goto SpecialsCopy; 742 else /* Titlecase not needed */ 743 SpecialsStart += SpecialsLengthExtract(*SpecialsStart); 744 } 745 if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_DOWN_SPECIAL) { 746 if (!(flags & ONIGENC_CASE_DOWN_SPECIAL)) 747 SpecialsStart += SpecialsLengthExtract(*SpecialsStart); 748 } 749 /* here, we know we use ONIGENC_CASE_UP_SPECIAL, and the position is right */ 750 SpecialsCopy: 751 count = SpecialsLengthExtract(*SpecialsStart); 752 next = SpecialsStart; 753 code = SpecialsCodepointExtract(*next++); 754 } 755 else { /* no specials */ 756 count = OnigCodePointCount(folded->n); 757 next = folded->code; 758 code = *next++; 759 } 760 if (count == 1) 761 ; 762 else if (count == 2) { 763 to += ONIGENC_CODE_TO_MBC(enc, code, to); 764 code = *next; 765 } 766 else { /* count == 3 */ 767 to += ONIGENC_CODE_TO_MBC(enc, code, to); 768 to += ONIGENC_CODE_TO_MBC(enc, *next++, to); 769 code = *next; 770 } 771 } 772 } 773 else if ((folded = onigenc_unicode_unfold1_lookup(code)) != 0 /* data about character found in CaseUnfold_11_Table */ 774 && flags & OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */ 775 MODIFIED; 776 code = folded->code[(flags & OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_TITLECASE) ? 1 : 0]; 777 } 778 } 779 to += ONIGENC_CODE_TO_MBC(enc, code, to); 780 /* switch from titlecase to lowercase for capitalize */ 781 if (flags & ONIGENC_CASE_TITLECASE) 782 flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE | 783 ONIGENC_CASE_UP_SPECIAL | ONIGENC_CASE_DOWN_SPECIAL); 784 } 785 *flagP = flags; 786 return (int )(to - to_start); 787 } 788 789 #if 0 790 const char onigenc_unicode_version_string[] = 791 #ifdef ONIG_UNICODE_VERSION_STRING 792 ONIG_UNICODE_VERSION_STRING 793 #endif 794 ""; 795 796 const int onigenc_unicode_version_number[3] = { 797 #ifdef ONIG_UNICODE_VERSION_MAJOR 798 ONIG_UNICODE_VERSION_MAJOR, 799 ONIG_UNICODE_VERSION_MINOR, 800 ONIG_UNICODE_VERSION_TEENY, 801 #else 802 0 803 #endif 804 }; 805 #endif
Note:
See TracChangeset
for help on using the changeset viewer.