- Timestamp:
- Jul 9, 2020, 8:51:43 AM (4 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
EcnlProtoTool/trunk/mrbgems/mruby-onig-regexp/src/mruby_onig_regexp.c
r331 r439 25 25 #include <string.h> 26 26 #include <ctype.h> 27 #include < stdlib.h>27 #include <memory.h> 28 28 #include <mruby.h> 29 29 #include <mruby/class.h> 30 30 #include <mruby/variable.h> 31 31 #include <mruby/array.h> 32 #include <mruby/hash.h> 32 33 #include <mruby/string.h> 33 34 #include <mruby/data.h> … … 36 37 #define ONIG_EXTERN extern 37 38 #endif 38 #include "onigmo.h" 39 #ifdef HAVE_ONIGMO_H 40 #include <onigmo.h> 41 #elif defined(HAVE_ONIGURUMA_H) 42 #include <oniguruma.h> 43 #else 44 #include "oniguruma.h" 45 #endif 39 46 40 47 #ifdef MRUBY_VERSION … … 44 51 #endif 45 52 53 static const char utf8len_codepage[256] = 54 { 55 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 56 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 57 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 58 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 59 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 60 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 61 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 62 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1, 63 }; 64 65 static mrb_int 66 utf8len(const char* p, const char* e) 67 { 68 mrb_int len; 69 mrb_int i; 70 71 len = utf8len_codepage[(unsigned char)*p]; 72 if (p + len > e) return 1; 73 for (i = 1; i < len; ++i) 74 if ((p[i] & 0xc0) != 0x80) 75 return 1; 76 return len; 77 } 78 46 79 static void 47 80 onig_regexp_free(mrb_state *mrb, void *p) { … … 52 85 "PosixRegexp", onig_regexp_free 53 86 }; 87 88 #define ONIG_REGEXP_P(obj) \ 89 ((mrb_type(obj) == MRB_TT_DATA) && (DATA_TYPE(obj) == &mrb_onig_regexp_type)) 54 90 55 91 static void … … 64 100 65 101 static mrb_value 102 str_substr(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len) 103 { 104 #ifdef MRB_UTF8_STRING 105 return mrb_str_new(mrb, RSTRING_PTR(str) + beg, len); 106 #else 107 return mrb_str_substr(mrb, str, beg, len); 108 #endif 109 } 110 111 static mrb_value 66 112 onig_regexp_initialize(mrb_state *mrb, mrb_value self) { 67 113 mrb_value str, flag = mrb_nil_value(), code = mrb_nil_value(); … … 69 115 70 116 int cflag = 0; 71 OnigSyntaxType* syntax = ONIG_SYNTAX_RUBY;72 117 OnigEncoding enc = ONIG_ENCODING_UTF8; 73 118 if(mrb_string_p(code)) { … … 97 142 OnigRegex reg; 98 143 int result = onig_new(®, (OnigUChar*)RSTRING_PTR(str), (OnigUChar*) RSTRING_PTR(str) + RSTRING_LEN(str), 99 cflag, enc, syntax, &einfo);144 cflag, enc, ONIG_SYNTAX_RUBY, &einfo); 100 145 if (result != ONIG_NORMAL) { 101 146 char err[ONIG_MAX_ERROR_MESSAGE_LEN] = ""; 102 onig_error_code_to_str((OnigUChar*)err, result );103 mrb_raisef(mrb, E_ ARGUMENT_ERROR, "'%S' is an invalid regular expression because %S.",147 onig_error_code_to_str((OnigUChar*)err, result, &einfo); 148 mrb_raisef(mrb, E_REGEXP_ERROR, "'%S' is an invalid regular expression because %S.", 104 149 str, mrb_str_new_cstr(mrb, err)); 105 150 } … … 122 167 return c; 123 168 } 169 170 #define MISMATCH_NIL_OR(v) (result == ONIG_MISMATCH ? mrb_nil_value() : (v)) 124 171 125 172 static int … … 138 185 139 186 struct RObject* const cls = (struct RObject*)mrb_class_get(mrb, "OnigRegexp"); 140 mrb_obj_iv_set(mrb, cls, mrb_intern_lit(mrb, "@last_match"), match_value); 141 142 if (result != ONIG_MISMATCH && 143 mrb_class_get(mrb, "Regexp") == (struct RClass*)cls && 144 mrb_bool(mrb_obj_iv_get(mrb, (struct RObject*)cls, mrb_intern_lit(mrb, "@set_global_variables")))) 187 mrb_obj_iv_set(mrb, cls, mrb_intern_lit(mrb, "@last_match"), MISMATCH_NIL_OR(match_value)); 188 189 if (mrb_class_get(mrb, "Regexp") == (struct RClass*)cls && 190 mrb_bool(mrb_obj_iv_get(mrb, (struct RObject*)cls, mrb_intern_lit(mrb, "@set_global_variables")))) 145 191 { 146 mrb_gv_set(mrb, mrb_intern_lit(mrb, "$~"), match_value); 192 mrb_gv_set(mrb, mrb_intern_lit(mrb, "$~"), 193 MISMATCH_NIL_OR(match_value)); 147 194 mrb_gv_set(mrb, mrb_intern_lit(mrb, "$&"), 148 mrb_funcall(mrb, match_value, "[]", 1, mrb_fixnum_value(0))); 149 mrb_gv_set(mrb, mrb_intern_lit(mrb, "$`"), mrb_funcall(mrb, match_value, "pre_match", 0)); 150 mrb_gv_set(mrb, mrb_intern_lit(mrb, "$'"), mrb_funcall(mrb, match_value, "post_match", 0)); 195 MISMATCH_NIL_OR(mrb_funcall(mrb, match_value, "[]", 1, mrb_fixnum_value(0)))); 196 mrb_gv_set(mrb, mrb_intern_lit(mrb, "$`"), 197 MISMATCH_NIL_OR(mrb_funcall(mrb, match_value, "pre_match", 0))); 198 mrb_gv_set(mrb, mrb_intern_lit(mrb, "$'"), 199 MISMATCH_NIL_OR(mrb_funcall(mrb, match_value, "post_match", 0))); 151 200 mrb_gv_set(mrb, mrb_intern_lit(mrb, "$+"), 152 mrb_funcall(mrb, match_value, "[]", 1, mrb_fixnum_value(match->num_regs - 1)));201 MISMATCH_NIL_OR(mrb_funcall(mrb, match_value, "[]", 1, mrb_fixnum_value(match->num_regs - 1)))); 153 202 154 203 // $1 to $9 … … 171 220 172 221 static mrb_value 222 reg_operand(mrb_state *mrb, mrb_value obj) { 223 mrb_value ret; 224 225 if (mrb_symbol_p(obj)) { 226 ret = mrb_sym2str(mrb, mrb_symbol(obj)); 227 if (mrb_undef_p(ret)) { 228 mrb_bug(mrb, "can not intern %S", obj); 229 } 230 } 231 else { 232 ret = mrb_string_type(mrb, obj); 233 } 234 return ret; 235 } 236 237 static mrb_value 173 238 onig_regexp_match(mrb_state *mrb, mrb_value self) { 174 239 mrb_value str = mrb_nil_value(); 175 240 OnigRegex reg; 176 241 mrb_int pos = 0; 177 178 mrb_get_args(mrb, "o|i", &str, &pos); 242 mrb_value block = mrb_nil_value(); 243 244 mrb_get_args(mrb, "o|i&", &str, &pos, &block); 245 if (mrb_nil_p(str)) { 246 return mrb_nil_value(); 247 } 248 str = reg_operand(mrb, str); 179 249 if (pos < 0 || (pos > 0 && pos >= RSTRING_LEN(str))) { 180 250 return mrb_nil_value(); 181 251 } 182 252 253 Data_Get_Struct(mrb, self, &mrb_onig_regexp_type, reg); 254 255 mrb_value const ret = create_onig_region(mrb, str, self); 256 if (onig_match_common(mrb, reg, ret, str, pos) == ONIG_MISMATCH) { 257 return mrb_nil_value(); 258 } 259 260 if (mrb_nil_p(block)) { 261 return ret; 262 } else { 263 return mrb_yield(mrb, block, ret); 264 } 265 } 266 267 static mrb_value 268 onig_regexp_match_p(mrb_state *mrb, mrb_value self) { 269 mrb_value str = mrb_nil_value(); 270 mrb_int pos = 0; 271 OnigRegex reg; 272 OnigUChar const* str_ptr; 273 274 mrb_get_args(mrb, "o|i", &str, &pos); 183 275 if (mrb_nil_p(str)) { 184 276 return mrb_nil_value(); 185 277 } 278 str = reg_operand(mrb, str); 279 if (pos < 0 || (pos > 0 && pos >= RSTRING_LEN(str))) { 280 return mrb_nil_value(); 281 } 282 283 Data_Get_Struct(mrb, self, &mrb_onig_regexp_type, reg); 284 str_ptr = (OnigUChar const*)RSTRING_PTR(str); 285 return mrb_bool_value(onig_search( 286 reg, str_ptr, str_ptr + RSTRING_LEN(str), 287 str_ptr + pos, str_ptr + RSTRING_LEN(str), NULL, 0) != ONIG_MISMATCH); 288 } 289 290 static mrb_value 291 string_match_p(mrb_state *mrb, mrb_value self) { 292 mrb_value str = self; 293 mrb_int pos = 0; 294 OnigRegex reg; 295 OnigUChar const* str_ptr; 296 297 mrb_get_args(mrb, "d|i", ®, &mrb_onig_regexp_type, &pos); 298 if (pos < 0 || (pos > 0 && pos >= RSTRING_LEN(str))) { 299 return mrb_nil_value(); 300 } 301 302 if (mrb_nil_p(str)) { 303 return mrb_nil_value(); 304 } 186 305 str = mrb_string_type(mrb, str); 187 306 188 Data_Get_Struct(mrb, self, &mrb_onig_regexp_type, reg); 189 190 mrb_value const ret = create_onig_region(mrb, str, self); 191 return (onig_match_common(mrb, reg, ret, str, pos) == ONIG_MISMATCH) 192 ? mrb_nil_value() : ret; 307 str_ptr = (OnigUChar const*)RSTRING_PTR(str); 308 return mrb_bool_value(onig_search( 309 reg, str_ptr, str_ptr + RSTRING_LEN(str), 310 str_ptr + pos, str_ptr + RSTRING_LEN(str), NULL, 0) != ONIG_MISMATCH); 193 311 } 194 312 … … 256 374 257 375 c = *p; 258 if (c == '/' || c == '\\') {376 if (c == '/') { 259 377 buf[0] = '\\'; buf[1] = c; 260 378 mrb_str_cat(mrb, str, buf, 2); … … 326 444 again: 327 445 if (len >= 4 && ptr[0] == '(' && ptr[1] == '?') { 328 329 330 446 int err = 1; 447 ptr += 2; 448 if ((len -= 2) > 0) { 331 449 do { 332 450 if(strchr(ptr, 'i')) { options |= ONIG_OPTION_IGNORECASE; } 333 451 if(strchr(ptr, 'x')) { options |= ONIG_OPTION_EXTEND; } 334 452 if(strchr(ptr, 'm')) { options |= ONIG_OPTION_MULTILINE; } 335 453 ++ptr; 336 454 } while (--len > 0); 337 338 455 } 456 if (len > 1 && *ptr == '-') { 339 457 ++ptr; 340 458 --len; … … 343 461 if(strchr(ptr, 'x')) { options &= ~ONIG_OPTION_EXTEND; } 344 462 if(strchr(ptr, 'm')) { options &= ~ONIG_OPTION_MULTILINE; } 345 463 ++ptr; 346 464 } while (--len > 0); 347 348 465 } 466 if (*ptr == ')') { 349 467 --len; 350 468 ++ptr; 351 469 goto again; 352 353 470 } 471 if (*ptr == ':' && ptr[len-1] == ')') { 354 472 OnigRegex rp; 355 473 ++ptr; … … 358 476 ONIG_ENCODING_UTF8, OnigDefaultSyntax, NULL); 359 477 onig_free(rp); 360 361 478 } 479 if (err) { 362 480 options = onig_get_options(reg); 363 481 ptr = RSTRING_PTR(src); 364 482 len = RSTRING_LEN(src); 365 483 } 366 484 } 367 485 … … 369 487 370 488 if ((options & embeddable) != embeddable) { 371 372 373 489 optbuf[0] = '-'; 490 option_to_str(optbuf + 1, ~options); 491 mrb_str_cat_cstr(mrb, str, optbuf); 374 492 } 375 493 … … 526 644 Data_Get_Struct(mrb, self, &mrb_onig_region_type, reg); 527 645 mrb_value str = mrb_iv_get(mrb, self, mrb_intern_lit(mrb, "string")); 528 return mrb_str_substr(mrb, str, reg->end[0], RSTRING_LEN(str) - reg->end[0]);646 return str_substr(mrb, str, reg->end[0], RSTRING_LEN(str) - reg->end[0]); 529 647 } 530 648 … … 535 653 Data_Get_Struct(mrb, self, &mrb_onig_region_type, reg); 536 654 mrb_value str = mrb_iv_get(mrb, self, mrb_intern_lit(mrb, "string")); 537 return mrb_str_substr(mrb, str, 0, reg->beg[0]);655 return str_substr(mrb, str, 0, reg->beg[0]); 538 656 } 539 657 … … 567 685 mrb_ary_push(mrb, ret, mrb_nil_value()); 568 686 } else { 569 mrb_ary_push(mrb, ret, mrb_str_substr(mrb, str, reg->beg[i], reg->end[i] - reg->beg[i]));687 mrb_ary_push(mrb, ret, str_substr(mrb, str, reg->beg[i], reg->end[i] - reg->beg[i])); 570 688 } 571 689 mrb_gc_arena_restore(mrb, ai); … … 580 698 OnigRegion* reg; 581 699 Data_Get_Struct(mrb, self, &mrb_onig_region_type, reg); 582 return mrb_str_substr(mrb, str, reg->beg[0], reg->end[0] - reg->beg[0]);700 return str_substr(mrb, str, reg->beg[0], reg->end[0] - reg->beg[0]); 583 701 } 584 702 … … 587 705 mrb_value src, OnigRegex reg, OnigRegion* match) 588 706 { 707 if (mrb_hash_p(replace)) { 708 mrb_value v = mrb_hash_get(mrb, replace, mrb_str_substr(mrb, src, match->beg[0], match->end[0] - match->beg[0])); 709 v = mrb_str_to_str(mrb, v); 710 mrb_str_cat_str(mrb, result, v); 711 return; 712 } 713 589 714 mrb_assert(mrb_string_p(replace)); 590 715 char const* ch; … … 607 732 if (idx < 0) { 608 733 mrb_raisef(mrb, E_INDEX_ERROR, "undefined group name reference: %S", 609 mrb_str_substr(mrb, replace, name_beg - RSTRING_PTR(replace), ch - name_beg));734 str_substr(mrb, replace, name_beg - RSTRING_PTR(replace), ch - name_beg)); 610 735 } 611 736 mrb_str_cat(mrb, result, RSTRING_PTR(src) + match->beg[idx], match->end[idx] - match->beg[idx]); … … 619 744 if (isdigit(*ch)) { // group number 0-9 620 745 int const idx = *ch - '0'; 621 if (idx >= match->num_regs) { 622 mrb_raisef(mrb, E_INDEX_ERROR, "undefined group number reference: %S (max: %S)", 623 mrb_fixnum_value(idx), mrb_fixnum_value(match->num_regs)); 746 if (idx < match->num_regs) { 747 mrb_str_cat(mrb, result, RSTRING_PTR(src) + match->beg[idx], match->end[idx] - match->beg[idx]); 624 748 } 625 mrb_str_cat(mrb, result, RSTRING_PTR(src) + match->beg[idx], match->end[idx] - match->beg[idx]);626 749 } else { 627 750 char const str[] = { '\\', *ch }; … … 642 765 string_gsub(mrb_state* mrb, mrb_value self) { 643 766 mrb_value blk, match_expr, replace_expr = mrb_nil_value(); 644 int const argc = mrb_get_args(mrb, "&o| S", &blk, &match_expr, &replace_expr);645 646 if( mrb_string_p(match_expr)) {767 int const argc = mrb_get_args(mrb, "&o|o", &blk, &match_expr, &replace_expr); 768 769 if(!ONIG_REGEXP_P(match_expr)) { 647 770 mrb_value argv[] = { match_expr, replace_expr }; 648 771 return mrb_funcall_with_block(mrb, self, mrb_intern_lit(mrb, "string_gsub"), argc, argv, blk); 649 772 } 650 773 774 if(argc == 1 && mrb_nil_p(blk)) { 775 return mrb_funcall(mrb, self, "to_enum", 2, mrb_symbol_value(mrb_intern_lit(mrb, "onig_regexp_gsub")), match_expr); 776 } 777 651 778 if(!mrb_nil_p(blk) && !mrb_nil_p(replace_expr)) { 652 mrb_raise(mrb, E_ARGUMENT_ERROR, "both block and replace expression must not be passed"); 779 blk = mrb_nil_value(); 780 } 781 782 if (mrb_nil_p(blk) && !mrb_hash_p(replace_expr)) { 783 replace_expr = mrb_string_type(mrb, replace_expr); 653 784 } 654 785 … … 668 799 append_replace_str(mrb, result, replace_expr, self, reg, match); 669 800 } else { 670 mrb_value const tmp_str = mrb_str_to_str(mrb, mrb_yield(mrb, blk, mrb_str_substr(801 mrb_value const tmp_str = mrb_str_to_str(mrb, mrb_yield(mrb, blk, str_substr( 671 802 mrb, self, match->beg[0], match->end[0] - match->beg[0]))); 672 803 mrb_assert(mrb_string_p(tmp_str)); … … 675 806 676 807 last_end_pos = match->end[0]; 677 } 678 808 if (match->beg[0] == match->end[0]) { 809 /* 810 * Always consume at least one character of the input string 811 * in order to prevent infinite loops. 812 */ 813 char* p = RSTRING_PTR(self) + last_end_pos; 814 char* e = p + RSTRING_LEN(self); 815 int len = utf8len(p, e); 816 if (RSTRING_LEN(self) < last_end_pos + len) break; 817 mrb_str_cat(mrb, result, p, len); 818 last_end_pos += len; 819 } 820 } 821 822 if (RSTRING_LEN(self) < last_end_pos) { 823 mrb_raise(mrb, E_ARGUMENT_ERROR, "invalid byte sequence in UTF-8"); 824 } 679 825 mrb_str_cat(mrb, result, RSTRING_PTR(self) + last_end_pos, RSTRING_LEN(self) - last_end_pos); 680 826 return result; … … 687 833 mrb_get_args(mrb, "&o", &blk, &match_expr); 688 834 689 if( mrb_string_p(match_expr)) {835 if(!ONIG_REGEXP_P(match_expr)) { 690 836 return mrb_funcall_with_block(mrb, self, mrb_intern_lit(mrb, "string_scan"), 691 837 1, &match_expr, blk); … … 706 852 mrb_assert(mrb_array_p(result)); 707 853 if(m->num_regs == 1) { 708 mrb_ary_push(mrb, result, mrb_str_substr(mrb, self, m->beg[0], m->end[0] - m->beg[0]));854 mrb_ary_push(mrb, result, str_substr(mrb, self, m->beg[0], m->end[0] - m->beg[0])); 709 855 } else { 710 856 mrb_value const elem = mrb_ary_new_capa(mrb, m->num_regs - 1); 711 857 for(i = 1; i < m->num_regs; ++i) { 712 mrb_ary_push(mrb, elem, mrb_str_substr(mrb, self, m->beg[i], m->end[i] - m->beg[i]));858 mrb_ary_push(mrb, elem, str_substr(mrb, self, m->beg[i], m->end[i] - m->beg[i])); 713 859 } 714 860 mrb_ary_push(mrb, result, elem); … … 717 863 mrb_assert(mrb_string_p(result)); 718 864 if(m->num_regs == 1) { 719 mrb_yield(mrb, blk, mrb_str_substr(mrb, self, m->beg[0], m->end[0] - m->beg[0]));865 mrb_yield(mrb, blk, str_substr(mrb, self, m->beg[0], m->end[0] - m->beg[0])); 720 866 } else { 721 867 mrb_value argv = mrb_ary_new_capa(mrb, m->num_regs - 1); 722 868 for(i = 1; i < m->num_regs; ++i) { 723 mrb_ary_push(mrb, argv, mrb_str_substr(mrb, self, m->beg[i], m->end[i] - m->beg[i]));869 mrb_ary_push(mrb, argv, str_substr(mrb, self, m->beg[i], m->end[i] - m->beg[i])); 724 870 } 725 871 mrb_yield(mrb, blk, argv); … … 727 873 } 728 874 729 last_end_pos = m->end[0]; 875 if (m->beg[0] == m->end[0]) { 876 /* 877 * Always consume at least one character of the input string 878 */ 879 if (RSTRING_LEN(self) > m->end[0]) { 880 char* p = RSTRING_PTR(self) + last_end_pos; 881 char* e = p + RSTRING_LEN(self); 882 int len = utf8len(p, e); 883 last_end_pos = m->end[0] + len; 884 } else { 885 last_end_pos = m->end[0] + 1; 886 } 887 } else { 888 last_end_pos = m->end[0]; 889 } 730 890 } 731 891 … … 738 898 mrb_value pattern = mrb_nil_value(); mrb_int limit = 0; 739 899 int argc = mrb_get_args(mrb, "|oi", &pattern, &limit); 740 741 if(argc == 0) { // check $; global variable 900 mrb_value result, tmp; 901 mrb_bool lim_p = !(argc == 2 && 0 < limit); 902 903 if(mrb_nil_p(pattern)) { // check $; global variable 742 904 pattern = mrb_gv_get(mrb, mrb_intern_lit(mrb, "$;")); 743 if(!mrb_nil_p(pattern)) { argc = 1; } 744 } 745 746 if(mrb_nil_p(pattern) || mrb_string_p(pattern)) { 747 return mrb_funcall(mrb, self, "string_split", argc, pattern, mrb_fixnum_value(limit)); 748 } 749 750 mrb_value const result = mrb_ary_new(mrb); 751 if(RSTRING_LEN(self) == 0) { return result; } 905 if (mrb_nil_p(pattern)) { 906 pattern = mrb_str_new_lit(mrb, " "); 907 } else if (!mrb_string_p(pattern) && !ONIG_REGEXP_P(pattern)) { 908 mrb_raise(mrb, E_TYPE_ERROR, "value of $; must be String or Regexp"); 909 } 910 if (argc == 0) { argc = 1; } 911 } 912 913 if (!ONIG_REGEXP_P(pattern)) { 914 if(!mrb_nil_p(pattern)) { pattern = mrb_string_type(mrb, pattern); } 915 if(mrb_string_p(pattern) && RSTRING_LEN(pattern) == 0) { 916 /* Special case - split into chars */ 917 pattern = mrb_funcall(mrb, mrb_obj_value(mrb_class_get(mrb, "OnigRegexp")), "new", 1, pattern); 918 } else { 919 return mrb_funcall(mrb, self, "string_split", argc, pattern, mrb_fixnum_value(limit)); 920 } 921 } 922 923 if(RSTRING_LEN(self) == 0) { return mrb_ary_new(mrb); } 924 if(limit == 1) { return mrb_ary_new_from_values(mrb, 1, &self); } 925 926 result = mrb_ary_new(mrb); 752 927 753 928 OnigRegex reg; … … 755 930 mrb_value const match_value = create_onig_region(mrb, self, pattern); 756 931 OnigRegion* const match = (OnigRegion*)DATA_PTR(match_value); 757 int last_end_pos = 0, next_match_pos = 0; 758 mrb_int num_matches = 0; 759 760 while (limit <= 0 || (limit - 1) > num_matches) { 761 int i; 762 if(next_match_pos >= RSTRING_LEN(self) || 763 onig_match_common(mrb, reg, match_value, self, next_match_pos) == ONIG_MISMATCH) { break; } 764 765 if (last_end_pos == match->end[0]) { 766 ++next_match_pos; 767 // Remove this loop if not using UTF-8 768 for (; next_match_pos < RSTRING_LEN(self) && (RSTRING_PTR(self)[next_match_pos] & 0xC0) == 0x80; 769 ++next_match_pos) {} 770 } else { 771 mrb_ary_push(mrb, result, mrb_str_substr( 772 mrb, self, last_end_pos, match->beg[0] - last_end_pos)); 773 // If there are captures, add them to the array 774 for (i = 1; i < match->num_regs; ++i) { 775 mrb_ary_push(mrb, result, mrb_str_substr( 776 mrb, self, match->beg[i], match->end[i] - match->beg[i])); 932 char *ptr = mrb_str_to_cstr(mrb, self); 933 mrb_int len = RSTRING_LEN(self); 934 mrb_int start = 0, beg = 0, end = 0; 935 mrb_int idx = 0, i = 0; 936 mrb_int last_null = 0; 937 938 if (argc == 2) { i = 1; } 939 while ((end = onig_match_common(mrb, reg, match_value, self, start)) >= 0) { 940 if (start == end && match->beg[0] == match->end[0]) { 941 if (!ptr) { 942 mrb_ary_push(mrb, result, mrb_str_new_lit(mrb, "")); 943 break; 777 944 } 778 last_end_pos = match->end[0]; 779 next_match_pos = last_end_pos; 780 ++num_matches; 781 } 782 } 783 if (last_end_pos <= RSTRING_LEN(self)) { 784 mrb_ary_push(mrb, result, mrb_str_substr( 785 mrb, self, last_end_pos, RSTRING_LEN(self) - last_end_pos)); 786 } 787 788 if (limit == 0) { // remove empty trailing elements 789 int count = 0, i; 790 for (i = RARRAY_LEN(result); i > 0; --i) { 791 mrb_assert(mrb_string_p(RARRAY_PTR(result)[i - 1])); 792 if (RSTRING_LEN(RARRAY_PTR(result)[i - 1]) != 0) { break; } 793 else { ++count; } 794 } 795 if(count > 0) { 796 return mrb_ary_new_from_values(mrb, RARRAY_LEN(result) - count, RARRAY_PTR(result)); 797 } 945 else if (last_null == 1) { 946 mrb_ary_push(mrb, result, str_substr(mrb, self, beg, utf8len(ptr+beg, ptr+len))); 947 beg = start; 948 } 949 else { 950 if (start == len) 951 start++; 952 else 953 start += utf8len(ptr+start, ptr+len); 954 last_null = 1; 955 continue; 956 } 957 } 958 else { 959 mrb_ary_push(mrb, result, str_substr(mrb, self, beg, end-beg)); 960 beg = start = match->end[0]; 961 } 962 last_null = 0; 963 964 for (idx=1; idx < match->num_regs; idx++) { 965 if (match->beg[idx] == -1) continue; 966 if (match->beg[idx] == match->end[idx]) 967 tmp = mrb_str_new_lit(mrb, ""); 968 else 969 tmp = str_substr(mrb, self, match->beg[idx], match->end[idx]-match->beg[idx]); 970 mrb_ary_push(mrb, result, tmp); 971 } 972 if (!lim_p && limit <= ++i) break; 973 } 974 975 if (RSTRING_LEN(self) > 0 && (!lim_p || RSTRING_LEN(self) > beg || limit < 0)) { 976 if (RSTRING_LEN(self) == beg) 977 tmp = mrb_str_new_lit(mrb, ""); 978 else 979 tmp = str_substr(mrb, self, beg, RSTRING_LEN(self)-beg); 980 mrb_ary_push(mrb, result, tmp); 981 } 982 if (lim_p && limit == 0) { 983 while ((len = RARRAY_LEN(result)) > 0 && 984 (tmp = mrb_ary_ref(mrb, result, len-1), RSTRING_LEN(tmp) == 0)) 985 mrb_ary_pop(mrb, result); 798 986 } 799 987 … … 805 993 string_sub(mrb_state* mrb, mrb_value self) { 806 994 mrb_value blk, match_expr, replace_expr = mrb_nil_value(); 807 int const argc = mrb_get_args(mrb, "&o| S", &blk, &match_expr, &replace_expr);808 809 if( mrb_string_p(match_expr)) {995 int const argc = mrb_get_args(mrb, "&o|o", &blk, &match_expr, &replace_expr); 996 997 if(!ONIG_REGEXP_P(match_expr)) { 810 998 mrb_value argv[] = { match_expr, replace_expr }; 811 999 return mrb_funcall_with_block(mrb, self, mrb_intern_lit(mrb, "string_sub"), argc, argv, blk); 812 1000 } 813 1001 1002 if(argc == 1 && mrb_nil_p(blk)) { 1003 mrb_raise(mrb, E_ARGUMENT_ERROR, "wrong number of arguments (given 1, expected 2)"); 1004 } 1005 814 1006 if(!mrb_nil_p(blk) && !mrb_nil_p(replace_expr)) { 815 mrb_raise(mrb, E_ARGUMENT_ERROR, "both block and replace expression must not be passed"); 1007 blk = mrb_nil_value(); 1008 } 1009 1010 if (mrb_nil_p(blk) && !mrb_hash_p(replace_expr)) { 1011 replace_expr = mrb_string_type(mrb, replace_expr); 816 1012 } 817 1013 … … 830 1026 append_replace_str(mrb, result, replace_expr, self, reg, match); 831 1027 } else { 832 mrb_value const tmp_str = mrb_str_to_str(mrb, mrb_yield(mrb, blk, mrb_str_substr(1028 mrb_value const tmp_str = mrb_str_to_str(mrb, mrb_yield(mrb, blk, str_substr( 833 1029 mrb, self, match->beg[0], match->end[0] - match->beg[0]))); 834 1030 mrb_assert(mrb_string_p(tmp_str)); … … 947 1143 mrb_define_const(mrb, clazz, "NOTBOL", mrb_fixnum_value(ONIG_OPTION_NOTBOL)); 948 1144 mrb_define_const(mrb, clazz, "NOTEOL", mrb_fixnum_value(ONIG_OPTION_NOTEOL)); 1145 #ifdef ONIG_OPTION_POSIX_REGION 1146 mrb_define_const(mrb, clazz, "POSIX_REGION", mrb_fixnum_value(ONIG_OPTION_POSIX_REGION)); 1147 #endif 949 1148 #ifdef ONIG_OPTION_ASCII_RANGE 950 1149 mrb_define_const(mrb, clazz, "ASCII_RANGE", mrb_fixnum_value(ONIG_OPTION_ASCII_RANGE)); … … 969 1168 mrb_define_method(mrb, clazz, "==", onig_regexp_equal, MRB_ARGS_REQ(1)); 970 1169 mrb_define_method(mrb, clazz, "match", onig_regexp_match, MRB_ARGS_REQ(1) | MRB_ARGS_OPT(1)); 1170 mrb_define_method(mrb, clazz, "match?", onig_regexp_match_p, MRB_ARGS_REQ(1) | MRB_ARGS_OPT(1)); 971 1171 mrb_define_method(mrb, clazz, "casefold?", onig_regexp_casefold_p, MRB_ARGS_NONE()); 972 1172 … … 1011 1211 mrb_define_method(mrb, mrb->string_class, "onig_regexp_split", &string_split, MRB_ARGS_REQ(1)); 1012 1212 mrb_define_method(mrb, mrb->string_class, "onig_regexp_scan", &string_scan, MRB_ARGS_REQ(1) | MRB_ARGS_BLOCK()); 1213 mrb_define_method(mrb, mrb->string_class, "onig_regexp_match?", &string_match_p, MRB_ARGS_REQ(1) | MRB_ARGS_OPT(1)); 1013 1214 } 1014 1215
Note:
See TracChangeset
for help on using the changeset viewer.