Ignore:
Timestamp:
Jul 9, 2020, 8:51:43 AM (4 years ago)
Author:
coas-nagasima
Message:

mrubyを2.1.1に更新

File:
1 edited

Legend:

Unmodified
Added
Removed
  • EcnlProtoTool/trunk/mrbgems/mruby-onig-regexp/src/mruby_onig_regexp.c

    r331 r439  
    2525#include <string.h>
    2626#include <ctype.h>
    27 #include <stdlib.h>
     27#include <memory.h>
    2828#include <mruby.h>
    2929#include <mruby/class.h>
    3030#include <mruby/variable.h>
    3131#include <mruby/array.h>
     32#include <mruby/hash.h>
    3233#include <mruby/string.h>
    3334#include <mruby/data.h>
     
    3637#define ONIG_EXTERN extern
    3738#endif
    38 #include "onigmo.h"
     39#ifdef HAVE_ONIGMO_H
     40#include <onigmo.h>
     41#elif defined(HAVE_ONIGURUMA_H)
     42#include <oniguruma.h>
     43#else
     44#include "oniguruma.h"
     45#endif
    3946
    4047#ifdef MRUBY_VERSION
     
    4451#endif
    4552
     53static const char utf8len_codepage[256] =
     54{
     55  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     56  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     57  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     58  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     59  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     60  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     61  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
     62  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1,
     63};
     64
     65static mrb_int
     66utf8len(const char* p, const char* e)
     67{
     68  mrb_int len;
     69  mrb_int i;
     70
     71  len = utf8len_codepage[(unsigned char)*p];
     72  if (p + len > e) return 1;
     73  for (i = 1; i < len; ++i)
     74    if ((p[i] & 0xc0) != 0x80)
     75      return 1;
     76  return len;
     77}
     78
    4679static void
    4780onig_regexp_free(mrb_state *mrb, void *p) {
     
    5285  "PosixRegexp", onig_regexp_free
    5386};
     87
     88#define ONIG_REGEXP_P(obj) \
     89  ((mrb_type(obj) == MRB_TT_DATA) && (DATA_TYPE(obj) == &mrb_onig_regexp_type))
    5490
    5591static void
     
    64100
    65101static mrb_value
     102str_substr(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len)
     103{
     104#ifdef MRB_UTF8_STRING
     105  return mrb_str_new(mrb, RSTRING_PTR(str) + beg, len);
     106#else
     107  return mrb_str_substr(mrb, str, beg, len);
     108#endif
     109}
     110
     111static mrb_value
    66112onig_regexp_initialize(mrb_state *mrb, mrb_value self) {
    67113  mrb_value str, flag = mrb_nil_value(), code = mrb_nil_value();
     
    69115
    70116  int cflag = 0;
    71   OnigSyntaxType* syntax = ONIG_SYNTAX_RUBY;
    72117  OnigEncoding enc = ONIG_ENCODING_UTF8;
    73118  if(mrb_string_p(code)) {
     
    97142  OnigRegex reg;
    98143  int result = onig_new(&reg, (OnigUChar*)RSTRING_PTR(str), (OnigUChar*) RSTRING_PTR(str) + RSTRING_LEN(str),
    99                         cflag, enc, syntax, &einfo);
     144                        cflag, enc, ONIG_SYNTAX_RUBY, &einfo);
    100145  if (result != ONIG_NORMAL) {
    101146    char err[ONIG_MAX_ERROR_MESSAGE_LEN] = "";
    102     onig_error_code_to_str((OnigUChar*)err, result);
    103     mrb_raisef(mrb, E_ARGUMENT_ERROR, "'%S' is an invalid regular expression because %S.",
     147    onig_error_code_to_str((OnigUChar*)err, result, &einfo);
     148    mrb_raisef(mrb, E_REGEXP_ERROR, "'%S' is an invalid regular expression because %S.",
    104149               str, mrb_str_new_cstr(mrb, err));
    105150  }
     
    122167  return c;
    123168}
     169
     170#define MISMATCH_NIL_OR(v) (result == ONIG_MISMATCH ? mrb_nil_value() : (v))
    124171
    125172static int
     
    138185
    139186  struct RObject* const cls = (struct RObject*)mrb_class_get(mrb, "OnigRegexp");
    140   mrb_obj_iv_set(mrb, cls, mrb_intern_lit(mrb, "@last_match"), match_value);
    141 
    142   if (result != ONIG_MISMATCH &&
    143       mrb_class_get(mrb, "Regexp") == (struct RClass*)cls &&
    144       mrb_bool(mrb_obj_iv_get(mrb, (struct RObject*)cls, mrb_intern_lit(mrb, "@set_global_variables"))))
     187  mrb_obj_iv_set(mrb, cls, mrb_intern_lit(mrb, "@last_match"), MISMATCH_NIL_OR(match_value));
     188
     189  if (mrb_class_get(mrb, "Regexp") == (struct RClass*)cls &&
     190    mrb_bool(mrb_obj_iv_get(mrb, (struct RObject*)cls, mrb_intern_lit(mrb, "@set_global_variables"))))
    145191  {
    146     mrb_gv_set(mrb, mrb_intern_lit(mrb, "$~"), match_value);
     192    mrb_gv_set(mrb, mrb_intern_lit(mrb, "$~"),
     193               MISMATCH_NIL_OR(match_value));
    147194    mrb_gv_set(mrb, mrb_intern_lit(mrb, "$&"),
    148                mrb_funcall(mrb, match_value, "[]", 1, mrb_fixnum_value(0)));
    149     mrb_gv_set(mrb, mrb_intern_lit(mrb, "$`"), mrb_funcall(mrb, match_value, "pre_match", 0));
    150     mrb_gv_set(mrb, mrb_intern_lit(mrb, "$'"), mrb_funcall(mrb, match_value, "post_match", 0));
     195               MISMATCH_NIL_OR(mrb_funcall(mrb, match_value, "[]", 1, mrb_fixnum_value(0))));
     196    mrb_gv_set(mrb, mrb_intern_lit(mrb, "$`"),
     197               MISMATCH_NIL_OR(mrb_funcall(mrb, match_value, "pre_match", 0)));
     198    mrb_gv_set(mrb, mrb_intern_lit(mrb, "$'"),
     199               MISMATCH_NIL_OR(mrb_funcall(mrb, match_value, "post_match", 0)));
    151200    mrb_gv_set(mrb, mrb_intern_lit(mrb, "$+"),
    152                mrb_funcall(mrb, match_value, "[]", 1, mrb_fixnum_value(match->num_regs - 1)));
     201               MISMATCH_NIL_OR(mrb_funcall(mrb, match_value, "[]", 1, mrb_fixnum_value(match->num_regs - 1))));
    153202
    154203    // $1 to $9
     
    171220
    172221static mrb_value
     222reg_operand(mrb_state *mrb, mrb_value obj) {
     223  mrb_value ret;
     224
     225  if (mrb_symbol_p(obj)) {
     226    ret = mrb_sym2str(mrb, mrb_symbol(obj));
     227    if (mrb_undef_p(ret)) {
     228      mrb_bug(mrb, "can not intern %S", obj);
     229    }
     230  }
     231  else {
     232    ret = mrb_string_type(mrb, obj);
     233  }
     234  return ret;
     235}
     236
     237static mrb_value
    173238onig_regexp_match(mrb_state *mrb, mrb_value self) {
    174239  mrb_value str = mrb_nil_value();
    175240  OnigRegex reg;
    176241  mrb_int pos = 0;
    177 
    178   mrb_get_args(mrb, "o|i", &str, &pos);
     242  mrb_value block = mrb_nil_value();
     243
     244  mrb_get_args(mrb, "o|i&", &str, &pos, &block);
     245  if (mrb_nil_p(str)) {
     246    return mrb_nil_value();
     247  }
     248  str = reg_operand(mrb, str);
    179249  if (pos < 0 || (pos > 0 && pos >= RSTRING_LEN(str))) {
    180250    return mrb_nil_value();
    181251  }
    182252
     253  Data_Get_Struct(mrb, self, &mrb_onig_regexp_type, reg);
     254
     255  mrb_value const ret = create_onig_region(mrb, str, self);
     256  if (onig_match_common(mrb, reg, ret, str, pos) == ONIG_MISMATCH) {
     257    return mrb_nil_value();
     258  }
     259
     260  if (mrb_nil_p(block)) {
     261    return ret;
     262  } else {
     263    return mrb_yield(mrb, block, ret);
     264  }
     265}
     266
     267static mrb_value
     268onig_regexp_match_p(mrb_state *mrb, mrb_value self) {
     269  mrb_value str = mrb_nil_value();
     270  mrb_int pos = 0;
     271  OnigRegex reg;
     272  OnigUChar const* str_ptr;
     273
     274  mrb_get_args(mrb, "o|i", &str, &pos);
    183275  if (mrb_nil_p(str)) {
    184276    return mrb_nil_value();
    185277  }
     278  str = reg_operand(mrb, str);
     279  if (pos < 0 || (pos > 0 && pos >= RSTRING_LEN(str))) {
     280    return mrb_nil_value();
     281  }
     282
     283  Data_Get_Struct(mrb, self, &mrb_onig_regexp_type, reg);
     284  str_ptr = (OnigUChar const*)RSTRING_PTR(str);
     285  return mrb_bool_value(onig_search(
     286      reg, str_ptr, str_ptr + RSTRING_LEN(str),
     287      str_ptr + pos, str_ptr + RSTRING_LEN(str), NULL, 0) != ONIG_MISMATCH);
     288}
     289
     290static mrb_value
     291string_match_p(mrb_state *mrb, mrb_value self) {
     292  mrb_value str = self;
     293  mrb_int pos = 0;
     294  OnigRegex reg;
     295  OnigUChar const* str_ptr;
     296
     297  mrb_get_args(mrb, "d|i", &reg, &mrb_onig_regexp_type, &pos);
     298  if (pos < 0 || (pos > 0 && pos >= RSTRING_LEN(str))) {
     299    return mrb_nil_value();
     300  }
     301
     302  if (mrb_nil_p(str)) {
     303    return mrb_nil_value();
     304  }
    186305  str = mrb_string_type(mrb, str);
    187306
    188   Data_Get_Struct(mrb, self, &mrb_onig_regexp_type, reg);
    189 
    190   mrb_value const ret = create_onig_region(mrb, str, self);
    191   return (onig_match_common(mrb, reg, ret, str, pos) == ONIG_MISMATCH)
    192       ? mrb_nil_value() : ret;
     307  str_ptr = (OnigUChar const*)RSTRING_PTR(str);
     308  return mrb_bool_value(onig_search(
     309      reg, str_ptr, str_ptr + RSTRING_LEN(str),
     310      str_ptr + pos, str_ptr + RSTRING_LEN(str), NULL, 0) != ONIG_MISMATCH);
    193311}
    194312
     
    256374
    257375    c = *p;
    258     if (c == '/'|| c == '\\') {
     376    if (c == '/') {
    259377      buf[0] = '\\'; buf[1] = c;
    260378      mrb_str_cat(mrb, str, buf, 2);
     
    326444 again:
    327445  if (len >= 4 && ptr[0] == '(' && ptr[1] == '?') {
    328         int err = 1;
    329         ptr += 2;
    330         if ((len -= 2) > 0) {
     446    int err = 1;
     447    ptr += 2;
     448    if ((len -= 2) > 0) {
    331449      do {
    332450        if(strchr(ptr, 'i')) { options |= ONIG_OPTION_IGNORECASE; }
    333451        if(strchr(ptr, 'x')) { options |= ONIG_OPTION_EXTEND; }
    334452        if(strchr(ptr, 'm')) { options |= ONIG_OPTION_MULTILINE; }
    335                 ++ptr;
     453        ++ptr;
    336454      } while (--len > 0);
    337         }
    338         if (len > 1 && *ptr == '-') {
     455    }
     456    if (len > 1 && *ptr == '-') {
    339457      ++ptr;
    340458      --len;
     
    343461        if(strchr(ptr, 'x')) { options &= ~ONIG_OPTION_EXTEND; }
    344462        if(strchr(ptr, 'm')) { options &= ~ONIG_OPTION_MULTILINE; }
    345                 ++ptr;
     463        ++ptr;
    346464      } while (--len > 0);
    347         }
    348         if (*ptr == ')') {
     465    }
     466    if (*ptr == ')') {
    349467      --len;
    350468      ++ptr;
    351469      goto again;
    352         }
    353         if (*ptr == ':' && ptr[len-1] == ')') {
     470    }
     471    if (*ptr == ':' && ptr[len-1] == ')') {
    354472      OnigRegex rp;
    355473      ++ptr;
     
    358476                     ONIG_ENCODING_UTF8, OnigDefaultSyntax, NULL);
    359477      onig_free(rp);
    360         }
    361         if (err) {
     478    }
     479    if (err) {
    362480      options = onig_get_options(reg);
    363481      ptr = RSTRING_PTR(src);
    364482      len = RSTRING_LEN(src);
    365         }
     483    }
    366484  }
    367485
     
    369487
    370488  if ((options & embeddable) != embeddable) {
    371         optbuf[0] = '-';
    372         option_to_str(optbuf + 1, ~options);
    373         mrb_str_cat_cstr(mrb, str, optbuf);
     489    optbuf[0] = '-';
     490    option_to_str(optbuf + 1, ~options);
     491    mrb_str_cat_cstr(mrb, str, optbuf);
    374492  }
    375493
     
    526644  Data_Get_Struct(mrb, self, &mrb_onig_region_type, reg);
    527645  mrb_value str = mrb_iv_get(mrb, self, mrb_intern_lit(mrb, "string"));
    528   return mrb_str_substr(mrb, str, reg->end[0], RSTRING_LEN(str) - reg->end[0]);
     646  return str_substr(mrb, str, reg->end[0], RSTRING_LEN(str) - reg->end[0]);
    529647}
    530648
     
    535653  Data_Get_Struct(mrb, self, &mrb_onig_region_type, reg);
    536654  mrb_value str = mrb_iv_get(mrb, self, mrb_intern_lit(mrb, "string"));
    537   return mrb_str_substr(mrb, str, 0, reg->beg[0]);
     655  return str_substr(mrb, str, 0, reg->beg[0]);
    538656}
    539657
     
    567685      mrb_ary_push(mrb, ret, mrb_nil_value());
    568686    } else {
    569       mrb_ary_push(mrb, ret, mrb_str_substr(mrb, str, reg->beg[i], reg->end[i] - reg->beg[i]));
     687      mrb_ary_push(mrb, ret, str_substr(mrb, str, reg->beg[i], reg->end[i] - reg->beg[i]));
    570688    }
    571689    mrb_gc_arena_restore(mrb, ai);
     
    580698  OnigRegion* reg;
    581699  Data_Get_Struct(mrb, self, &mrb_onig_region_type, reg);
    582   return mrb_str_substr(mrb, str, reg->beg[0], reg->end[0] - reg->beg[0]);
     700  return str_substr(mrb, str, reg->beg[0], reg->end[0] - reg->beg[0]);
    583701}
    584702
     
    587705                   mrb_value src, OnigRegex reg, OnigRegion* match)
    588706{
     707  if (mrb_hash_p(replace)) {
     708    mrb_value v = mrb_hash_get(mrb, replace, mrb_str_substr(mrb, src, match->beg[0], match->end[0] - match->beg[0]));
     709    v = mrb_str_to_str(mrb, v);
     710    mrb_str_cat_str(mrb, result, v);
     711    return;
     712  }
     713
    589714  mrb_assert(mrb_string_p(replace));
    590715  char const* ch;
     
    607732        if (idx < 0) {
    608733          mrb_raisef(mrb, E_INDEX_ERROR, "undefined group name reference: %S",
    609                      mrb_str_substr(mrb, replace, name_beg - RSTRING_PTR(replace), ch - name_beg));
     734                     str_substr(mrb, replace, name_beg - RSTRING_PTR(replace), ch - name_beg));
    610735        }
    611736        mrb_str_cat(mrb, result, RSTRING_PTR(src) + match->beg[idx], match->end[idx] - match->beg[idx]);
     
    619744        if (isdigit(*ch)) { // group number 0-9
    620745          int const idx = *ch - '0';
    621           if (idx >= match->num_regs) {
    622             mrb_raisef(mrb, E_INDEX_ERROR, "undefined group number reference: %S (max: %S)",
    623                        mrb_fixnum_value(idx), mrb_fixnum_value(match->num_regs));
     746          if (idx < match->num_regs) {
     747            mrb_str_cat(mrb, result, RSTRING_PTR(src) + match->beg[idx], match->end[idx] - match->beg[idx]);
    624748          }
    625           mrb_str_cat(mrb, result, RSTRING_PTR(src) + match->beg[idx], match->end[idx] - match->beg[idx]);
    626749        } else {
    627750          char const str[] = { '\\', *ch };
     
    642765string_gsub(mrb_state* mrb, mrb_value self) {
    643766  mrb_value blk, match_expr, replace_expr = mrb_nil_value();
    644   int const argc = mrb_get_args(mrb, "&o|S", &blk, &match_expr, &replace_expr);
    645 
    646   if(mrb_string_p(match_expr)) {
     767  int const argc = mrb_get_args(mrb, "&o|o", &blk, &match_expr, &replace_expr);
     768
     769  if(!ONIG_REGEXP_P(match_expr)) {
    647770    mrb_value argv[] = { match_expr, replace_expr };
    648771    return mrb_funcall_with_block(mrb, self, mrb_intern_lit(mrb, "string_gsub"), argc, argv, blk);
    649772  }
    650773
     774  if(argc == 1 && mrb_nil_p(blk)) {
     775    return mrb_funcall(mrb, self, "to_enum", 2, mrb_symbol_value(mrb_intern_lit(mrb, "onig_regexp_gsub")), match_expr);
     776  }
     777
    651778  if(!mrb_nil_p(blk) && !mrb_nil_p(replace_expr)) {
    652     mrb_raise(mrb, E_ARGUMENT_ERROR, "both block and replace expression must not be passed");
     779    blk = mrb_nil_value();
     780  }
     781
     782  if (mrb_nil_p(blk) && !mrb_hash_p(replace_expr)) {
     783    replace_expr = mrb_string_type(mrb, replace_expr);
    653784  }
    654785
     
    668799      append_replace_str(mrb, result, replace_expr, self, reg, match);
    669800    } else {
    670       mrb_value const tmp_str = mrb_str_to_str(mrb, mrb_yield(mrb, blk, mrb_str_substr(
     801      mrb_value const tmp_str = mrb_str_to_str(mrb, mrb_yield(mrb, blk, str_substr(
    671802          mrb, self, match->beg[0], match->end[0] - match->beg[0])));
    672803      mrb_assert(mrb_string_p(tmp_str));
     
    675806
    676807    last_end_pos = match->end[0];
    677   }
    678 
     808    if (match->beg[0] == match->end[0]) {
     809      /*
     810       * Always consume at least one character of the input string
     811       * in order to prevent infinite loops.
     812       */
     813      char* p = RSTRING_PTR(self) + last_end_pos;
     814      char* e = p + RSTRING_LEN(self);
     815      int len = utf8len(p, e);
     816      if (RSTRING_LEN(self) < last_end_pos + len) break;
     817      mrb_str_cat(mrb, result, p, len);
     818      last_end_pos += len;
     819    }
     820  }
     821
     822  if (RSTRING_LEN(self) < last_end_pos) {
     823    mrb_raise(mrb, E_ARGUMENT_ERROR, "invalid byte sequence in UTF-8");
     824  }
    679825  mrb_str_cat(mrb, result, RSTRING_PTR(self) + last_end_pos, RSTRING_LEN(self) - last_end_pos);
    680826  return result;
     
    687833  mrb_get_args(mrb, "&o", &blk, &match_expr);
    688834
    689   if(mrb_string_p(match_expr)) {
     835  if(!ONIG_REGEXP_P(match_expr)) {
    690836    return mrb_funcall_with_block(mrb, self, mrb_intern_lit(mrb, "string_scan"),
    691837                                  1, &match_expr, blk);
     
    706852      mrb_assert(mrb_array_p(result));
    707853      if(m->num_regs == 1) {
    708         mrb_ary_push(mrb, result, mrb_str_substr(mrb, self, m->beg[0], m->end[0] - m->beg[0]));
     854        mrb_ary_push(mrb, result, str_substr(mrb, self, m->beg[0], m->end[0] - m->beg[0]));
    709855      } else {
    710856        mrb_value const elem = mrb_ary_new_capa(mrb, m->num_regs - 1);
    711857        for(i = 1; i < m->num_regs; ++i) {
    712           mrb_ary_push(mrb, elem, mrb_str_substr(mrb, self, m->beg[i], m->end[i] - m->beg[i]));
     858          mrb_ary_push(mrb, elem, str_substr(mrb, self, m->beg[i], m->end[i] - m->beg[i]));
    713859        }
    714860        mrb_ary_push(mrb, result, elem);
     
    717863      mrb_assert(mrb_string_p(result));
    718864      if(m->num_regs == 1) {
    719         mrb_yield(mrb, blk, mrb_str_substr(mrb, self, m->beg[0], m->end[0] - m->beg[0]));
     865        mrb_yield(mrb, blk, str_substr(mrb, self, m->beg[0], m->end[0] - m->beg[0]));
    720866      } else {
    721867        mrb_value argv = mrb_ary_new_capa(mrb, m->num_regs - 1);
    722868        for(i = 1; i < m->num_regs; ++i) {
    723           mrb_ary_push(mrb, argv, mrb_str_substr(mrb, self, m->beg[i], m->end[i] - m->beg[i]));
     869          mrb_ary_push(mrb, argv, str_substr(mrb, self, m->beg[i], m->end[i] - m->beg[i]));
    724870        }
    725871        mrb_yield(mrb, blk, argv);
     
    727873    }
    728874
    729     last_end_pos = m->end[0];
     875    if (m->beg[0] == m->end[0]) {
     876      /*
     877      * Always consume at least one character of the input string
     878      */
     879      if (RSTRING_LEN(self) > m->end[0]) {
     880        char* p = RSTRING_PTR(self) + last_end_pos;
     881        char* e = p + RSTRING_LEN(self);
     882        int len = utf8len(p, e);
     883        last_end_pos = m->end[0] + len;
     884      } else {
     885        last_end_pos = m->end[0] + 1;
     886      }
     887    } else {
     888      last_end_pos = m->end[0];
     889    }
    730890  }
    731891
     
    738898  mrb_value pattern = mrb_nil_value(); mrb_int limit = 0;
    739899  int argc = mrb_get_args(mrb, "|oi", &pattern, &limit);
    740 
    741   if(argc == 0) { // check $; global variable
     900  mrb_value result, tmp;
     901  mrb_bool lim_p = !(argc == 2 && 0 < limit);
     902
     903  if(mrb_nil_p(pattern)) { // check $; global variable
    742904    pattern = mrb_gv_get(mrb, mrb_intern_lit(mrb, "$;"));
    743     if(!mrb_nil_p(pattern)) { argc = 1; }
    744   }
    745 
    746   if(mrb_nil_p(pattern) || mrb_string_p(pattern)) {
    747     return mrb_funcall(mrb, self, "string_split", argc, pattern, mrb_fixnum_value(limit));
    748   }
    749 
    750   mrb_value const result = mrb_ary_new(mrb);
    751   if(RSTRING_LEN(self) == 0) { return result; }
     905    if (mrb_nil_p(pattern)) {
     906      pattern = mrb_str_new_lit(mrb, " ");
     907    } else if (!mrb_string_p(pattern) && !ONIG_REGEXP_P(pattern)) {
     908      mrb_raise(mrb, E_TYPE_ERROR, "value of $; must be String or Regexp");
     909    }
     910    if (argc == 0) { argc = 1; }
     911  }
     912
     913  if (!ONIG_REGEXP_P(pattern)) {
     914    if(!mrb_nil_p(pattern)) { pattern = mrb_string_type(mrb, pattern); }
     915    if(mrb_string_p(pattern) && RSTRING_LEN(pattern) == 0) {
     916      /* Special case - split into chars */
     917      pattern = mrb_funcall(mrb, mrb_obj_value(mrb_class_get(mrb, "OnigRegexp")), "new", 1, pattern);
     918    } else {
     919      return mrb_funcall(mrb, self, "string_split", argc, pattern, mrb_fixnum_value(limit));
     920    }
     921  }
     922
     923  if(RSTRING_LEN(self) == 0) { return mrb_ary_new(mrb); }
     924  if(limit == 1) { return mrb_ary_new_from_values(mrb, 1, &self); }
     925
     926  result = mrb_ary_new(mrb);
    752927
    753928  OnigRegex reg;
     
    755930  mrb_value const match_value = create_onig_region(mrb, self, pattern);
    756931  OnigRegion* const match = (OnigRegion*)DATA_PTR(match_value);
    757   int last_end_pos = 0, next_match_pos = 0;
    758   mrb_int num_matches = 0;
    759 
    760   while (limit <= 0 || (limit - 1) > num_matches) {
    761     int i;
    762     if(next_match_pos >= RSTRING_LEN(self) ||
    763        onig_match_common(mrb, reg, match_value, self, next_match_pos) == ONIG_MISMATCH) { break; }
    764 
    765     if (last_end_pos == match->end[0]) {
    766       ++next_match_pos;
    767       // Remove this loop if not using UTF-8
    768       for (; next_match_pos < RSTRING_LEN(self) && (RSTRING_PTR(self)[next_match_pos] & 0xC0) == 0x80;
    769           ++next_match_pos) {}
    770     } else {
    771       mrb_ary_push(mrb, result, mrb_str_substr(
    772           mrb, self, last_end_pos, match->beg[0] - last_end_pos));
    773       // If there are captures, add them to the array
    774       for (i = 1; i < match->num_regs; ++i) {
    775         mrb_ary_push(mrb, result, mrb_str_substr(
    776             mrb, self, match->beg[i], match->end[i] - match->beg[i]));
     932  char *ptr = mrb_str_to_cstr(mrb, self);
     933  mrb_int len = RSTRING_LEN(self);
     934  mrb_int start = 0, beg = 0, end = 0;
     935  mrb_int idx = 0, i = 0;
     936  mrb_int last_null = 0;
     937
     938  if (argc == 2) { i = 1; }
     939  while ((end = onig_match_common(mrb, reg, match_value, self, start)) >= 0) {
     940    if (start == end && match->beg[0] == match->end[0]) {
     941      if (!ptr) {
     942        mrb_ary_push(mrb, result, mrb_str_new_lit(mrb, ""));
     943        break;
    777944      }
    778       last_end_pos = match->end[0];
    779       next_match_pos = last_end_pos;
    780       ++num_matches;
    781     }
    782   }
    783   if (last_end_pos <= RSTRING_LEN(self)) {
    784     mrb_ary_push(mrb, result, mrb_str_substr(
    785         mrb, self, last_end_pos, RSTRING_LEN(self) - last_end_pos));
    786   }
    787 
    788   if (limit == 0) { // remove empty trailing elements
    789     int count = 0, i;
    790     for (i = RARRAY_LEN(result); i > 0; --i) {
    791       mrb_assert(mrb_string_p(RARRAY_PTR(result)[i - 1]));
    792       if (RSTRING_LEN(RARRAY_PTR(result)[i - 1]) != 0) { break; }
    793       else { ++count; }
    794     }
    795     if(count > 0) {
    796       return mrb_ary_new_from_values(mrb, RARRAY_LEN(result) - count, RARRAY_PTR(result));
    797     }
     945      else if (last_null == 1) {
     946        mrb_ary_push(mrb, result, str_substr(mrb, self, beg, utf8len(ptr+beg, ptr+len)));
     947        beg = start;
     948      }
     949      else {
     950        if (start == len)
     951          start++;
     952        else
     953          start += utf8len(ptr+start, ptr+len);
     954        last_null = 1;
     955        continue;
     956      }
     957    }
     958    else {
     959      mrb_ary_push(mrb, result, str_substr(mrb, self, beg, end-beg));
     960      beg = start = match->end[0];
     961    }
     962    last_null = 0;
     963
     964    for (idx=1; idx < match->num_regs; idx++) {
     965      if (match->beg[idx] == -1) continue;
     966      if (match->beg[idx] == match->end[idx])
     967        tmp = mrb_str_new_lit(mrb, "");
     968      else
     969        tmp = str_substr(mrb, self, match->beg[idx], match->end[idx]-match->beg[idx]);
     970      mrb_ary_push(mrb, result, tmp);
     971    }
     972    if (!lim_p && limit <= ++i) break;
     973  }
     974
     975  if (RSTRING_LEN(self) > 0 && (!lim_p || RSTRING_LEN(self) > beg || limit < 0)) {
     976    if (RSTRING_LEN(self) == beg)
     977      tmp = mrb_str_new_lit(mrb, "");
     978    else
     979      tmp = str_substr(mrb, self, beg, RSTRING_LEN(self)-beg);
     980    mrb_ary_push(mrb, result, tmp);
     981  }
     982  if (lim_p && limit == 0) {
     983    while ((len = RARRAY_LEN(result)) > 0 &&
     984        (tmp = mrb_ary_ref(mrb, result, len-1), RSTRING_LEN(tmp) == 0))
     985      mrb_ary_pop(mrb, result);
    798986  }
    799987
     
    805993string_sub(mrb_state* mrb, mrb_value self) {
    806994  mrb_value blk, match_expr, replace_expr = mrb_nil_value();
    807   int const argc = mrb_get_args(mrb, "&o|S", &blk, &match_expr, &replace_expr);
    808 
    809   if(mrb_string_p(match_expr)) {
     995  int const argc = mrb_get_args(mrb, "&o|o", &blk, &match_expr, &replace_expr);
     996
     997  if(!ONIG_REGEXP_P(match_expr)) {
    810998    mrb_value argv[] = { match_expr, replace_expr };
    811999    return mrb_funcall_with_block(mrb, self, mrb_intern_lit(mrb, "string_sub"), argc, argv, blk);
    8121000  }
    8131001
     1002  if(argc == 1 && mrb_nil_p(blk)) {
     1003    mrb_raise(mrb, E_ARGUMENT_ERROR, "wrong number of arguments (given 1, expected 2)");
     1004  }
     1005
    8141006  if(!mrb_nil_p(blk) && !mrb_nil_p(replace_expr)) {
    815     mrb_raise(mrb, E_ARGUMENT_ERROR, "both block and replace expression must not be passed");
     1007    blk = mrb_nil_value();
     1008  }
     1009
     1010  if (mrb_nil_p(blk) && !mrb_hash_p(replace_expr)) {
     1011    replace_expr = mrb_string_type(mrb, replace_expr);
    8161012  }
    8171013
     
    8301026    append_replace_str(mrb, result, replace_expr, self, reg, match);
    8311027  } else {
    832     mrb_value const tmp_str = mrb_str_to_str(mrb, mrb_yield(mrb, blk, mrb_str_substr(
     1028    mrb_value const tmp_str = mrb_str_to_str(mrb, mrb_yield(mrb, blk, str_substr(
    8331029        mrb, self, match->beg[0], match->end[0] - match->beg[0])));
    8341030    mrb_assert(mrb_string_p(tmp_str));
     
    9471143  mrb_define_const(mrb, clazz, "NOTBOL", mrb_fixnum_value(ONIG_OPTION_NOTBOL));
    9481144  mrb_define_const(mrb, clazz, "NOTEOL", mrb_fixnum_value(ONIG_OPTION_NOTEOL));
     1145#ifdef ONIG_OPTION_POSIX_REGION
     1146  mrb_define_const(mrb, clazz, "POSIX_REGION", mrb_fixnum_value(ONIG_OPTION_POSIX_REGION));
     1147#endif
    9491148#ifdef ONIG_OPTION_ASCII_RANGE
    9501149  mrb_define_const(mrb, clazz, "ASCII_RANGE", mrb_fixnum_value(ONIG_OPTION_ASCII_RANGE));
     
    9691168  mrb_define_method(mrb, clazz, "==", onig_regexp_equal, MRB_ARGS_REQ(1));
    9701169  mrb_define_method(mrb, clazz, "match", onig_regexp_match, MRB_ARGS_REQ(1) | MRB_ARGS_OPT(1));
     1170  mrb_define_method(mrb, clazz, "match?", onig_regexp_match_p, MRB_ARGS_REQ(1) | MRB_ARGS_OPT(1));
    9711171  mrb_define_method(mrb, clazz, "casefold?", onig_regexp_casefold_p, MRB_ARGS_NONE());
    9721172
     
    10111211  mrb_define_method(mrb, mrb->string_class, "onig_regexp_split", &string_split, MRB_ARGS_REQ(1));
    10121212  mrb_define_method(mrb, mrb->string_class, "onig_regexp_scan", &string_scan, MRB_ARGS_REQ(1) | MRB_ARGS_BLOCK());
     1213  mrb_define_method(mrb, mrb->string_class, "onig_regexp_match?", &string_match_p, MRB_ARGS_REQ(1) | MRB_ARGS_OPT(1));
    10131214}
    10141215
Note: See TracChangeset for help on using the changeset viewer.