source: EcnlProtoTool/trunk/onigmo-6.1.3/src/regparse.c@ 331

Last change on this file since 331 was 331, checked in by coas-nagasima, 6 years ago

prototoolに関連するプロジェクトをnewlibからmuslを使うよう変更・更新
ntshellをnewlibの下位の実装から、muslのsyscallの実装に変更・更新
以下のOSSをアップデート
・mruby-1.3.0
・musl-1.1.18
・onigmo-6.1.3
・tcc-0.9.27
以下のOSSを追加
・openssl-1.1.0e
・curl-7.57.0
・zlib-1.2.11
以下のmrbgemsを追加
・iij/mruby-digest
・iij/mruby-env
・iij/mruby-errno
・iij/mruby-iijson
・iij/mruby-ipaddr
・iij/mruby-mock
・iij/mruby-require
・iij/mruby-tls-openssl

  • Property svn:eol-style set to native
  • Property svn:mime-type set to text/x-csrc;charset=UTF-8
File size: 161.2 KB
Line 
1/**********************************************************************
2 regparse.c - Onigmo (Oniguruma-mod) (regular expression library)
3**********************************************************************/
4/*-
5 * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * Copyright (c) 2011-2016 K.Takata <kentkt AT csc DOT jp>
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31#include "regparse.h"
32#include <stdarg.h>
33
34#define WARN_BUFSIZE 256
35
36#define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
37
38
39const OnigSyntaxType OnigSyntaxRuby = {
40 (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
41 ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
42 ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
43 ONIG_SYN_OP_ESC_C_CONTROL )
44 & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
45 , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
46 ONIG_SYN_OP2_OPTION_RUBY |
47 ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
48 ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
49 ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY |
50 ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
51 ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
52 ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
53 ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
54 ONIG_SYN_OP2_ESC_H_XDIGIT |
55#ifndef RUBY
56 ONIG_SYN_OP2_ESC_U_HEX4 |
57#endif
58 ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER |
59 ONIG_SYN_OP2_QMARK_LPAREN_CONDITION |
60 ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK |
61 ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP |
62 ONIG_SYN_OP2_QMARK_TILDE_ABSENT )
63 , ( SYN_GNU_REGEX_BV |
64 ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
65 ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
66 ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
67 ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
68 ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
69 ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
70 ONIG_SYN_WARN_CC_DUP |
71 ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
72 , ( ONIG_OPTION_ASCII_RANGE | ONIG_OPTION_POSIX_BRACKET_ALL_RANGE |
73 ONIG_OPTION_WORD_BOUND_ALL_RANGE )
74 ,
75 {
76 (OnigCodePoint )'\\' /* esc */
77 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
78 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
79 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
80 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
81 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
82 }
83};
84
85const OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY;
86
87extern void onig_null_warn(const char* s ARG_UNUSED) { }
88
89#ifdef DEFAULT_WARN_FUNCTION
90static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
91#else
92static OnigWarnFunc onig_warn = onig_null_warn;
93#endif
94
95#ifdef DEFAULT_VERB_WARN_FUNCTION
96static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
97#else
98static OnigWarnFunc onig_verb_warn = onig_null_warn;
99#endif
100
101extern void onig_set_warn_func(OnigWarnFunc f)
102{
103 onig_warn = f;
104}
105
106extern void onig_set_verb_warn_func(OnigWarnFunc f)
107{
108 onig_verb_warn = f;
109}
110
111static void CC_DUP_WARN(ScanEnv *env);
112
113
114static unsigned int ParseDepthLimit = DEFAULT_PARSE_DEPTH_LIMIT;
115
116extern unsigned int
117onig_get_parse_depth_limit(void)
118{
119 return ParseDepthLimit;
120}
121
122extern int
123onig_set_parse_depth_limit(unsigned int depth)
124{
125 if (depth == 0)
126 ParseDepthLimit = DEFAULT_PARSE_DEPTH_LIMIT;
127 else
128 ParseDepthLimit = depth;
129 return 0;
130}
131
132
133static void
134bbuf_free(BBuf* bbuf)
135{
136 if (IS_NOT_NULL(bbuf)) {
137 if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
138 xfree(bbuf);
139 }
140}
141
142static int
143bbuf_clone(BBuf** rto, BBuf* from)
144{
145 int r;
146 BBuf *to;
147
148 *rto = to = (BBuf* )xmalloc(sizeof(BBuf));
149 CHECK_NULL_RETURN_MEMERR(to);
150 r = BBUF_INIT(to, from->alloc);
151 if (r != 0) return r;
152 to->used = from->used;
153 xmemcpy(to->p, from->p, from->used);
154 return 0;
155}
156
/* Absolute group number for rel_no: (env)->num_mem + 1 + rel_no. */
#define BACKREF_REL_TO_ABS(rel_no, env) \
  ((env)->num_mem + 1 + (rel_no))

/*
 * Clear the bits f in v when `negative` is non-zero, otherwise set them.
 * The whole expansion is parenthesised so that the macro stays a single
 * operand when it is used inside a larger expression.
 */
#define ONOFF(v,f,negative) \
  ((negative) ? ((v) &= ~(f)) : ((v) |= (f)))
161
/* First code point of the multi-byte range: 0 when the minimum character
   length of enc is greater than one byte, 0x80 otherwise. */
#define MBCODE_START_POS(enc) \
  (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80)

/* Add [MBCODE_START_POS(enc), ONIG_LAST_CODE_POINT] to the range buffer.
   Uses a variable named `env` from the expansion site. */
#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
  add_code_range_to_buf(pbuf, env, MBCODE_START_POS(enc), ONIG_LAST_CODE_POINT)

/* For encodings that are not single-byte, add the whole multi-byte range
   to mbuf.  Assigns to a variable `r` at the expansion site and returns
   r from the enclosing function when it is non-zero. */
#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
  if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
    r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
    if (r) return r;\
  }\
} while (0)


/* Set bit pos in bs; calls CC_DUP_WARN(env) first when the bit is already
   set.  Uses a variable named `env` from the expansion site. */
#define BITSET_SET_BIT_CHKDUP(bs, pos) do { \
  if (BITSET_AT(bs, pos)) CC_DUP_WARN(env); \
  BS_ROOM(bs, pos) |= BS_BIT(pos); \
} while (0)

/* Store 1 in `empty` when every element of bs is zero, otherwise 0. */
#define BITSET_IS_EMPTY(bs,empty) do {\
  int i;\
  empty = 1;\
  for (i = 0; i < BITSET_SIZE; i++) {\
    if ((bs)[i] != 0) {\
      empty = 0; break;\
    }\
  }\
} while (0)
190
191static void
192bitset_set_range(ScanEnv *env, BitSetRef bs, int from, int to)
193{
194 int i;
195 for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
196 BITSET_SET_BIT_CHKDUP(bs, i);
197 }
198}
199
#if 0
/* Disabled: set every bit of bs. */
static void
bitset_set_all(BitSetRef bs)
{
  int idx;

  for (idx = 0; idx < BITSET_SIZE; idx++) {
    bs[idx] = ~((Bits )0);
  }
}
#endif
208
209static void
210bitset_invert(BitSetRef bs)
211{
212 int i;
213 for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~(bs[i]); }
214}
215
216static void
217bitset_invert_to(BitSetRef from, BitSetRef to)
218{
219 int i;
220 for (i = 0; i < BITSET_SIZE; i++) { to[i] = ~(from[i]); }
221}
222
223static void
224bitset_and(BitSetRef dest, BitSetRef bs)
225{
226 int i;
227 for (i = 0; i < BITSET_SIZE; i++) { dest[i] &= bs[i]; }
228}
229
230static void
231bitset_or(BitSetRef dest, BitSetRef bs)
232{
233 int i;
234 for (i = 0; i < BITSET_SIZE; i++) { dest[i] |= bs[i]; }
235}
236
237static void
238bitset_copy(BitSetRef dest, BitSetRef bs)
239{
240 int i;
241 for (i = 0; i < BITSET_SIZE; i++) { dest[i] = bs[i]; }
242}
243
#if defined(USE_NAMED_GROUP) && !defined(USE_ST_LIBRARY)
/*
 * Compare the first n bytes of s1 and s2.
 * Returns 0 when they are equal; otherwise the difference s2[i] - s1[i]
 * at the first differing position (note the operand order).
 */
extern int
onig_strncmp(const UChar* s1, const UChar* s2, int n)
{
  int i;

  for (i = 0; i < n; i++) {
    int diff = s2[i] - s1[i];
    if (diff) return diff;
  }
  return 0;
}
#endif
257
258extern void
259onig_strcpy(UChar* dest, const UChar* src, const UChar* end)
260{
261 ptrdiff_t len = end - src;
262 if (len > 0) {
263 xmemcpy(dest, src, len);
264 dest[len] = (UChar )0;
265 }
266}
267
#ifdef USE_NAMED_GROUP
/*
 * Return a freshly allocated copy of [s, end) followed by
 * ONIGENC_MBC_MINLEN(enc) zero bytes, or NULL when allocation fails.
 */
static UChar*
strdup_with_null(OnigEncoding enc, UChar* s, UChar* end)
{
  ptrdiff_t body_len = end - s;
  int pad_len = ONIGENC_MBC_MINLEN(enc);
  int k;
  UChar *copy;

  copy = (UChar* )xmalloc(body_len + pad_len);
  CHECK_NULL_RETURN(copy);
  xmemcpy(copy, s, body_len);

  /* zero terminator, as wide as the encoding's minimum character */
  k = 0;
  while (k < pad_len) {
    copy[body_len + k] = (UChar )0;
    k++;
  }

  return copy;
}
#endif
289
/*
 * scan pattern methods
 *
 * These macros expect the variables `p` (cursor), `end` (end of pattern)
 * and `enc` (encoding) at the expansion site; the non-_S variants also
 * use `pfetch_prev`, which PFETCH_READY declares.
 */

/* Value produced by PPEEK when the cursor is at the end. */
#define PEND_VALUE   0

#ifdef __GNUC__
/* get rid of Wunused-but-set-variable and Wuninitialized */
# define PFETCH_READY  UChar* pfetch_prev = NULL; (void)pfetch_prev
#else
# define PFETCH_READY  UChar* pfetch_prev
#endif

/* 1 when the cursor has reached the end, 0 otherwise. */
#define PEND         (p < end ? 0 : 1)

/* Move the cursor back to where the last PFETCH / PINC started. */
#define PUNFETCH     p = pfetch_prev

/* Skip one character, remembering the previous cursor position. */
#define PINC do { \
  pfetch_prev = p; \
  p += enclen(enc, p, end); \
} while (0)

/* Read one character into c and advance, remembering the previous
   cursor position. */
#define PFETCH(c) do { \
  c = ((enc->max_enc_len == 1) ? *p : ONIGENC_MBC_TO_CODE(enc, p, end)); \
  pfetch_prev = p; \
  p += enclen(enc, p, end); \
} while (0)

/* Same as PINC, but pfetch_prev is left untouched. */
#define PINC_S do { \
  p += enclen(enc, p, end); \
} while (0)

/* Same as PFETCH, but pfetch_prev is left untouched. */
#define PFETCH_S(c) do { \
  c = ((enc->max_enc_len == 1) ? *p : ONIGENC_MBC_TO_CODE(enc, p, end)); \
  p += enclen(enc, p, end); \
} while (0)

/* Code point at the cursor without advancing; PEND_VALUE at the end. */
#define PPEEK        (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)

/* Non-zero when the code point at the cursor equals c. */
#define PPEEK_IS(c)  (PPEEK == (OnigCodePoint )c)
321
322static UChar*
323strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
324 size_t capa)
325{
326 UChar* r;
327
328 if (dest)
329 r = (UChar* )xrealloc(dest, capa + 1);
330 else
331 r = (UChar* )xmalloc(capa + 1);
332
333 CHECK_NULL_RETURN(r);
334 onig_strcpy(r + (dest_end - dest), src, src_end);
335 return r;
336}
337
338/* dest on static area */
339static UChar*
340strcat_capa_from_static(UChar* dest, UChar* dest_end,
341 const UChar* src, const UChar* src_end, size_t capa)
342{
343 UChar* r;
344
345 r = (UChar* )xmalloc(capa + 1);
346 CHECK_NULL_RETURN(r);
347 onig_strcpy(r, dest, dest_end);
348 onig_strcpy(r + (dest_end - dest), src, src_end);
349 return r;
350}
351
352
353#ifdef USE_ST_LIBRARY
354
355# ifdef RUBY
356# include "ruby/st.h"
357# else
358# include "st.h"
359# endif
360
361typedef struct {
362 const UChar* s;
363 const UChar* end;
364} st_str_end_key;
365
366static int
367str_end_cmp(st_data_t xp, st_data_t yp)
368{
369 const st_str_end_key *x, *y;
370 const UChar *p, *q;
371 int c;
372
373 x = (const st_str_end_key *)xp;
374 y = (const st_str_end_key *)yp;
375 if ((x->end - x->s) != (y->end - y->s))
376 return 1;
377
378 p = x->s;
379 q = y->s;
380 while (p < x->end) {
381 c = (int )*p - (int )*q;
382 if (c != 0) return c;
383
384 p++; q++;
385 }
386
387 return 0;
388}
389
390static st_index_t
391str_end_hash(st_data_t xp)
392{
393 const st_str_end_key *x = (const st_str_end_key *)xp;
394 const UChar *p;
395 st_index_t val = 0;
396
397 p = x->s;
398 while (p < x->end) {
399 val = val * 997 + (int )*p++;
400 }
401
402 return val + (val >> 5);
403}
404
405extern hash_table_type*
406onig_st_init_strend_table_with_size(st_index_t size)
407{
408 static const struct st_hash_type hashType = {
409 str_end_cmp,
410 str_end_hash,
411 };
412
413 return (hash_table_type* )
414 onig_st_init_table_with_size(&hashType, size);
415}
416
417extern int
418onig_st_lookup_strend(hash_table_type* table, const UChar* str_key,
419 const UChar* end_key, hash_data_type *value)
420{
421 st_str_end_key key;
422
423 key.s = (UChar* )str_key;
424 key.end = (UChar* )end_key;
425
426 return onig_st_lookup(table, (st_data_t )(&key), value);
427}
428
429extern int
430onig_st_insert_strend(hash_table_type* table, const UChar* str_key,
431 const UChar* end_key, hash_data_type value)
432{
433 st_str_end_key* key;
434 int result;
435
436 key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key));
437 key->s = (UChar* )str_key;
438 key->end = (UChar* )end_key;
439 result = onig_st_insert(table, (st_data_t )key, value);
440 if (result) {
441 xfree(key);
442 }
443 return result;
444}
445
446#endif /* USE_ST_LIBRARY */
447
448
449#ifdef USE_NAMED_GROUP
450
451# define INIT_NAME_BACKREFS_ALLOC_NUM 8
452
453typedef struct {
454 UChar* name;
455 size_t name_len; /* byte length */
456 int back_num; /* number of backrefs */
457 int back_alloc;
458 int back_ref1;
459 int* back_refs;
460} NameEntry;
461
462# ifdef USE_ST_LIBRARY
463
464typedef st_table NameTable;
465typedef st_data_t HashDataType; /* 1.6 st.h doesn't define st_data_t type */
466
467# ifdef ONIG_DEBUG
468static int
469i_print_name_entry(UChar* key, NameEntry* e, void* arg)
470{
471 int i;
472 FILE* fp = (FILE* )arg;
473
474 fprintf(fp, "%s: ", e->name);
475 if (e->back_num == 0)
476 fputs("-", fp);
477 else if (e->back_num == 1)
478 fprintf(fp, "%d", e->back_ref1);
479 else {
480 for (i = 0; i < e->back_num; i++) {
481 if (i > 0) fprintf(fp, ", ");
482 fprintf(fp, "%d", e->back_refs[i]);
483 }
484 }
485 fputs("\n", fp);
486 return ST_CONTINUE;
487}
488
489extern int
490onig_print_names(FILE* fp, regex_t* reg)
491{
492 NameTable* t = (NameTable* )reg->name_table;
493
494 if (IS_NOT_NULL(t)) {
495 fprintf(fp, "name table\n");
496 onig_st_foreach(t, i_print_name_entry, (HashDataType )fp);
497 fputs("\n", fp);
498 }
499 return 0;
500}
501# endif /* ONIG_DEBUG */
502
503static int
504i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED)
505{
506 xfree(e->name);
507 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
508 xfree(key);
509 xfree(e);
510 return ST_DELETE;
511}
512
513static int
514names_clear(regex_t* reg)
515{
516 NameTable* t = (NameTable* )reg->name_table;
517
518 if (IS_NOT_NULL(t)) {
519 onig_st_foreach(t, i_free_name_entry, 0);
520 }
521 return 0;
522}
523
524extern int
525onig_names_free(regex_t* reg)
526{
527 int r;
528 NameTable* t;
529
530 r = names_clear(reg);
531 if (r) return r;
532
533 t = (NameTable* )reg->name_table;
534 if (IS_NOT_NULL(t)) onig_st_free_table(t);
535 reg->name_table = (void* )NULL;
536 return 0;
537}
538
539static NameEntry*
540name_find(regex_t* reg, const UChar* name, const UChar* name_end)
541{
542 NameEntry* e;
543 NameTable* t = (NameTable* )reg->name_table;
544
545 e = (NameEntry* )NULL;
546 if (IS_NOT_NULL(t)) {
547 onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
548 }
549 return e;
550}
551
552typedef struct {
553 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*);
554 regex_t* reg;
555 void* arg;
556 int ret;
557 OnigEncoding enc;
558} INamesArg;
559
560static int
561i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg)
562{
563 int r = (*(arg->func))(e->name,
564 e->name + e->name_len,
565 e->back_num,
566 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
567 arg->reg, arg->arg);
568 if (r != 0) {
569 arg->ret = r;
570 return ST_STOP;
571 }
572 return ST_CONTINUE;
573}
574
575extern int
576onig_foreach_name(regex_t* reg,
577 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
578{
579 INamesArg narg;
580 NameTable* t = (NameTable* )reg->name_table;
581
582 narg.ret = 0;
583 if (IS_NOT_NULL(t)) {
584 narg.func = func;
585 narg.reg = reg;
586 narg.arg = arg;
587 narg.enc = reg->enc; /* should be pattern encoding. */
588 onig_st_foreach(t, i_names, (HashDataType )&narg);
589 }
590 return narg.ret;
591}
592
593static int
594i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map)
595{
596 int i;
597
598 if (e->back_num > 1) {
599 for (i = 0; i < e->back_num; i++) {
600 e->back_refs[i] = map[e->back_refs[i]].new_val;
601 }
602 }
603 else if (e->back_num == 1) {
604 e->back_ref1 = map[e->back_ref1].new_val;
605 }
606
607 return ST_CONTINUE;
608}
609
610extern int
611onig_renumber_name_table(regex_t* reg, GroupNumRemap* map)
612{
613 NameTable* t = (NameTable* )reg->name_table;
614
615 if (IS_NOT_NULL(t)) {
616 onig_st_foreach(t, i_renumber_name, (HashDataType )map);
617 }
618 return 0;
619}
620
621
622extern int
623onig_number_of_names(const regex_t* reg)
624{
625 NameTable* t = (NameTable* )reg->name_table;
626
627 if (IS_NOT_NULL(t))
628 return (int )t->num_entries;
629 else
630 return 0;
631}
632
633# else /* USE_ST_LIBRARY */
634
635# define INIT_NAMES_ALLOC_NUM 8
636
637typedef struct {
638 NameEntry* e;
639 int num;
640 int alloc;
641} NameTable;
642
# ifdef ONIG_DEBUG
/*
 * Print the name table of reg to fp as "name: n1, n2, ..." lines ("-" for
 * an entry without group numbers).  Nothing is printed when the table is
 * missing or empty.  Always returns 0.
 */
extern int
onig_print_names(FILE* fp, regex_t* reg)
{
  int row, col;
  NameEntry* e;
  NameTable* t = (NameTable* )reg->name_table;

  if (IS_NOT_NULL(t) && t->num > 0) {
    fprintf(fp, "name table\n");
    for (row = 0; row < t->num; row++) {
      e = &(t->e[row]);
      fprintf(fp, "%s: ", e->name);
      switch (e->back_num) {
      case 0:
        fputs("-", fp);
        break;
      case 1:
        fprintf(fp, "%d", e->back_ref1);
        break;
      default:
        for (col = 0; col < e->back_num; col++) {
          if (col > 0) fprintf(fp, ", ");
          fprintf(fp, "%d", e->back_refs[col]);
        }
        break;
      }
      fputs("\n", fp);
    }
    fputs("\n", fp);
  }
  return 0;
}
# endif
675
676static int
677names_clear(regex_t* reg)
678{
679 int i;
680 NameEntry* e;
681 NameTable* t = (NameTable* )reg->name_table;
682
683 if (IS_NOT_NULL(t)) {
684 for (i = 0; i < t->num; i++) {
685 e = &(t->e[i]);
686 if (IS_NOT_NULL(e->name)) {
687 xfree(e->name);
688 e->name = NULL;
689 e->name_len = 0;
690 e->back_num = 0;
691 e->back_alloc = 0;
692 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
693 e->back_refs = (int* )NULL;
694 }
695 }
696 if (IS_NOT_NULL(t->e)) {
697 xfree(t->e);
698 t->e = NULL;
699 }
700 t->num = 0;
701 }
702 return 0;
703}
704
705extern int
706onig_names_free(regex_t* reg)
707{
708 int r;
709 NameTable* t;
710
711 r = names_clear(reg);
712 if (r) return r;
713
714 t = (NameTable* )reg->name_table;
715 if (IS_NOT_NULL(t)) xfree(t);
716 reg->name_table = NULL;
717 return 0;
718}
719
720static NameEntry*
721name_find(regex_t* reg, const UChar* name, const UChar* name_end)
722{
723 int i, len;
724 NameEntry* e;
725 NameTable* t = (NameTable* )reg->name_table;
726
727 if (IS_NOT_NULL(t)) {
728 len = name_end - name;
729 for (i = 0; i < t->num; i++) {
730 e = &(t->e[i]);
731 if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
732 return e;
733 }
734 }
735 return (NameEntry* )NULL;
736}
737
738extern int
739onig_foreach_name(regex_t* reg,
740 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
741{
742 int i, r;
743 NameEntry* e;
744 NameTable* t = (NameTable* )reg->name_table;
745
746 if (IS_NOT_NULL(t)) {
747 for (i = 0; i < t->num; i++) {
748 e = &(t->e[i]);
749 r = (*func)(e->name, e->name + e->name_len, e->back_num,
750 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
751 reg, arg);
752 if (r != 0) return r;
753 }
754 }
755 return 0;
756}
757
758extern int
759onig_number_of_names(const regex_t* reg)
760{
761 NameTable* t = (NameTable* )reg->name_table;
762
763 if (IS_NOT_NULL(t))
764 return t->num;
765 else
766 return 0;
767}
768
769# endif /* else USE_ST_LIBRARY */
770
771static int
772name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
773{
774 int alloc;
775 NameEntry* e;
776 NameTable* t = (NameTable* )reg->name_table;
777
778 if (name_end - name <= 0)
779 return ONIGERR_EMPTY_GROUP_NAME;
780
781 e = name_find(reg, name, name_end);
782 if (IS_NULL(e)) {
783# ifdef USE_ST_LIBRARY
784 if (IS_NULL(t)) {
785 t = onig_st_init_strend_table_with_size(5);
786 reg->name_table = (void* )t;
787 }
788 e = (NameEntry* )xmalloc(sizeof(NameEntry));
789 CHECK_NULL_RETURN_MEMERR(e);
790
791 e->name = strdup_with_null(reg->enc, name, name_end);
792 if (IS_NULL(e->name)) {
793 xfree(e);
794 return ONIGERR_MEMORY;
795 }
796 onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
797 (HashDataType )e);
798
799 e->name_len = name_end - name;
800 e->back_num = 0;
801 e->back_alloc = 0;
802 e->back_refs = (int* )NULL;
803
804# else
805
806 if (IS_NULL(t)) {
807 alloc = INIT_NAMES_ALLOC_NUM;
808 t = (NameTable* )xmalloc(sizeof(NameTable));
809 CHECK_NULL_RETURN_MEMERR(t);
810 t->e = NULL;
811 t->alloc = 0;
812 t->num = 0;
813
814 t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
815 if (IS_NULL(t->e)) {
816 xfree(t);
817 return ONIGERR_MEMORY;
818 }
819 t->alloc = alloc;
820 reg->name_table = t;
821 goto clear;
822 }
823 else if (t->num == t->alloc) {
824 int i;
825 NameEntry* p;
826
827 alloc = t->alloc * 2;
828 p = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
829 CHECK_NULL_RETURN_MEMERR(p);
830 t->e = p;
831 t->alloc = alloc;
832
833 clear:
834 for (i = t->num; i < t->alloc; i++) {
835 t->e[i].name = NULL;
836 t->e[i].name_len = 0;
837 t->e[i].back_num = 0;
838 t->e[i].back_alloc = 0;
839 t->e[i].back_refs = (int* )NULL;
840 }
841 }
842 e = &(t->e[t->num]);
843 t->num++;
844 e->name = strdup_with_null(reg->enc, name, name_end);
845 if (IS_NULL(e->name)) return ONIGERR_MEMORY;
846 e->name_len = name_end - name;
847# endif
848 }
849
850 if (e->back_num >= 1 &&
851 ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
852 onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
853 name, name_end);
854 return ONIGERR_MULTIPLEX_DEFINED_NAME;
855 }
856
857 e->back_num++;
858 if (e->back_num == 1) {
859 e->back_ref1 = backref;
860 }
861 else {
862 if (e->back_num == 2) {
863 alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
864 e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
865 CHECK_NULL_RETURN_MEMERR(e->back_refs);
866 e->back_alloc = alloc;
867 e->back_refs[0] = e->back_ref1;
868 e->back_refs[1] = backref;
869 }
870 else {
871 if (e->back_num > e->back_alloc) {
872 int* p;
873 alloc = e->back_alloc * 2;
874 p = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
875 CHECK_NULL_RETURN_MEMERR(p);
876 e->back_refs = p;
877 e->back_alloc = alloc;
878 }
879 e->back_refs[e->back_num - 1] = backref;
880 }
881 }
882
883 return 0;
884}
885
886extern int
887onig_name_to_group_numbers(regex_t* reg, const UChar* name,
888 const UChar* name_end, int** nums)
889{
890 NameEntry* e = name_find(reg, name, name_end);
891
892 if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
893
894 switch (e->back_num) {
895 case 0:
896 *nums = 0;
897 break;
898 case 1:
899 *nums = &(e->back_ref1);
900 break;
901 default:
902 *nums = e->back_refs;
903 break;
904 }
905 return e->back_num;
906}
907
908extern int
909onig_name_to_backref_number(regex_t* reg, const UChar* name,
910 const UChar* name_end, const OnigRegion *region)
911{
912 int i, n, *nums;
913
914 n = onig_name_to_group_numbers(reg, name, name_end, &nums);
915 if (n < 0)
916 return n;
917 else if (n == 0)
918 return ONIGERR_PARSER_BUG;
919 else if (n == 1)
920 return nums[0];
921 else {
922 if (IS_NOT_NULL(region)) {
923 for (i = n - 1; i >= 0; i--) {
924 if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
925 return nums[i];
926 }
927 }
928 return nums[n - 1];
929 }
930}
931
932#else /* USE_NAMED_GROUP */
933
934extern int
935onig_name_to_group_numbers(regex_t* reg, const UChar* name,
936 const UChar* name_end, int** nums)
937{
938 return ONIG_NO_SUPPORT_CONFIG;
939}
940
941extern int
942onig_name_to_backref_number(regex_t* reg, const UChar* name,
943 const UChar* name_end, const OnigRegion* region)
944{
945 return ONIG_NO_SUPPORT_CONFIG;
946}
947
948extern int
949onig_foreach_name(regex_t* reg,
950 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
951{
952 return ONIG_NO_SUPPORT_CONFIG;
953}
954
955extern int
956onig_number_of_names(const regex_t* reg)
957{
958 return 0;
959}
960#endif /* else USE_NAMED_GROUP */
961
962extern int
963onig_noname_group_capture_is_active(const regex_t* reg)
964{
965 if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP))
966 return 0;
967
968#ifdef USE_NAMED_GROUP
969 if (onig_number_of_names(reg) > 0 &&
970 IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
971 !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
972 return 0;
973 }
974#endif
975
976 return 1;
977}
978
979
980#define INIT_SCANENV_MEMNODES_ALLOC_SIZE 16
981
982static void
983scan_env_clear(ScanEnv* env)
984{
985 int i;
986
987 BIT_STATUS_CLEAR(env->capture_history);
988 BIT_STATUS_CLEAR(env->bt_mem_start);
989 BIT_STATUS_CLEAR(env->bt_mem_end);
990 BIT_STATUS_CLEAR(env->backrefed_mem);
991 env->error = (UChar* )NULL;
992 env->error_end = (UChar* )NULL;
993 env->num_call = 0;
994 env->num_mem = 0;
995#ifdef USE_NAMED_GROUP
996 env->num_named = 0;
997#endif
998 env->mem_alloc = 0;
999 env->mem_nodes_dynamic = (Node** )NULL;
1000
1001 for (i = 0; i < SCANENV_MEMNODES_SIZE; i++)
1002 env->mem_nodes_static[i] = NULL_NODE;
1003
1004#ifdef USE_COMBINATION_EXPLOSION_CHECK
1005 env->num_comb_exp_check = 0;
1006 env->comb_exp_max_regnum = 0;
1007 env->curr_max_regnum = 0;
1008 env->has_recursion = 0;
1009#endif
1010 env->parse_depth = 0;
1011 env->warnings_flag = 0;
1012}
1013
1014static int
1015scan_env_add_mem_entry(ScanEnv* env)
1016{
1017 int i, need, alloc;
1018 Node** p;
1019
1020 need = env->num_mem + 1;
1021 if (need > ONIG_MAX_CAPTURE_GROUP_NUM)
1022 return ONIGERR_TOO_MANY_CAPTURE_GROUPS;
1023 if (need >= SCANENV_MEMNODES_SIZE) {
1024 if (env->mem_alloc <= need) {
1025 if (IS_NULL(env->mem_nodes_dynamic)) {
1026 alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE;
1027 p = (Node** )xmalloc(sizeof(Node*) * alloc);
1028 CHECK_NULL_RETURN_MEMERR(p);
1029 xmemcpy(p, env->mem_nodes_static,
1030 sizeof(Node*) * SCANENV_MEMNODES_SIZE);
1031 }
1032 else {
1033 alloc = env->mem_alloc * 2;
1034 p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc);
1035 CHECK_NULL_RETURN_MEMERR(p);
1036 }
1037
1038 for (i = env->num_mem + 1; i < alloc; i++)
1039 p[i] = NULL_NODE;
1040
1041 env->mem_nodes_dynamic = p;
1042 env->mem_alloc = alloc;
1043 }
1044 }
1045
1046 env->num_mem++;
1047 return env->num_mem;
1048}
1049
1050static int
1051scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
1052{
1053 if (env->num_mem >= num)
1054 SCANENV_MEM_NODES(env)[num] = node;
1055 else
1056 return ONIGERR_PARSER_BUG;
1057 return 0;
1058}
1059
1060
1061extern void
1062onig_node_free(Node* node)
1063{
1064 start:
1065 if (IS_NULL(node)) return ;
1066
1067 switch (NTYPE(node)) {
1068 case NT_STR:
1069 if (NSTR(node)->capa != 0 &&
1070 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1071 xfree(NSTR(node)->s);
1072 }
1073 break;
1074
1075 case NT_LIST:
1076 case NT_ALT:
1077 onig_node_free(NCAR(node));
1078 {
1079 Node* next_node = NCDR(node);
1080
1081 xfree(node);
1082 node = next_node;
1083 goto start;
1084 }
1085 break;
1086
1087 case NT_CCLASS:
1088 {
1089 CClassNode* cc = NCCLASS(node);
1090
1091 if (cc->mbuf)
1092 bbuf_free(cc->mbuf);
1093 }
1094 break;
1095
1096 case NT_QTFR:
1097 if (NQTFR(node)->target)
1098 onig_node_free(NQTFR(node)->target);
1099 break;
1100
1101 case NT_ENCLOSE:
1102 if (NENCLOSE(node)->target)
1103 onig_node_free(NENCLOSE(node)->target);
1104 break;
1105
1106 case NT_BREF:
1107 if (IS_NOT_NULL(NBREF(node)->back_dynamic))
1108 xfree(NBREF(node)->back_dynamic);
1109 break;
1110
1111 case NT_ANCHOR:
1112 if (NANCHOR(node)->target)
1113 onig_node_free(NANCHOR(node)->target);
1114 break;
1115 }
1116
1117 xfree(node);
1118}
1119
1120static Node*
1121node_new(void)
1122{
1123 Node* node;
1124
1125 node = (Node* )xmalloc(sizeof(Node));
1126 /* xmemset(node, 0, sizeof(Node)); */
1127 return node;
1128}
1129
1130static void
1131initialize_cclass(CClassNode* cc)
1132{
1133 BITSET_CLEAR(cc->bs);
1134 /* cc->base.flags = 0; */
1135 cc->flags = 0;
1136 cc->mbuf = NULL;
1137}
1138
1139static Node*
1140node_new_cclass(void)
1141{
1142 Node* node = node_new();
1143 CHECK_NULL_RETURN(node);
1144
1145 SET_NTYPE(node, NT_CCLASS);
1146 initialize_cclass(NCCLASS(node));
1147 return node;
1148}
1149
1150static Node*
1151node_new_ctype(int type, int not, int ascii_range)
1152{
1153 Node* node = node_new();
1154 CHECK_NULL_RETURN(node);
1155
1156 SET_NTYPE(node, NT_CTYPE);
1157 NCTYPE(node)->ctype = type;
1158 NCTYPE(node)->not = not;
1159 NCTYPE(node)->ascii_range = ascii_range;
1160 return node;
1161}
1162
1163static Node*
1164node_new_anychar(void)
1165{
1166 Node* node = node_new();
1167 CHECK_NULL_RETURN(node);
1168
1169 SET_NTYPE(node, NT_CANY);
1170 return node;
1171}
1172
1173static Node*
1174node_new_list(Node* left, Node* right)
1175{
1176 Node* node = node_new();
1177 CHECK_NULL_RETURN(node);
1178
1179 SET_NTYPE(node, NT_LIST);
1180 NCAR(node) = left;
1181 NCDR(node) = right;
1182 return node;
1183}
1184
1185extern Node*
1186onig_node_new_list(Node* left, Node* right)
1187{
1188 return node_new_list(left, right);
1189}
1190
1191extern Node*
1192onig_node_list_add(Node* list, Node* x)
1193{
1194 Node *n;
1195
1196 n = onig_node_new_list(x, NULL);
1197 if (IS_NULL(n)) return NULL_NODE;
1198
1199 if (IS_NOT_NULL(list)) {
1200 while (IS_NOT_NULL(NCDR(list)))
1201 list = NCDR(list);
1202
1203 NCDR(list) = n;
1204 }
1205
1206 return n;
1207}
1208
1209extern Node*
1210onig_node_new_alt(Node* left, Node* right)
1211{
1212 Node* node = node_new();
1213 CHECK_NULL_RETURN(node);
1214
1215 SET_NTYPE(node, NT_ALT);
1216 NCAR(node) = left;
1217 NCDR(node) = right;
1218 return node;
1219}
1220
1221extern Node*
1222onig_node_new_anchor(int type)
1223{
1224 Node* node = node_new();
1225 CHECK_NULL_RETURN(node);
1226
1227 SET_NTYPE(node, NT_ANCHOR);
1228 NANCHOR(node)->type = type;
1229 NANCHOR(node)->target = NULL;
1230 NANCHOR(node)->char_len = -1;
1231 NANCHOR(node)->ascii_range = 0;
1232 return node;
1233}
1234
1235static Node*
1236node_new_backref(int back_num, int* backrefs, int by_name,
1237#ifdef USE_BACKREF_WITH_LEVEL
1238 int exist_level, int nest_level,
1239#endif
1240 ScanEnv* env)
1241{
1242 int i;
1243 Node* node = node_new();
1244
1245 CHECK_NULL_RETURN(node);
1246
1247 SET_NTYPE(node, NT_BREF);
1248 NBREF(node)->state = 0;
1249 NBREF(node)->back_num = back_num;
1250 NBREF(node)->back_dynamic = (int* )NULL;
1251 if (by_name != 0)
1252 NBREF(node)->state |= NST_NAME_REF;
1253
1254#ifdef USE_BACKREF_WITH_LEVEL
1255 if (exist_level != 0) {
1256 NBREF(node)->state |= NST_NEST_LEVEL;
1257 NBREF(node)->nest_level = nest_level;
1258 }
1259#endif
1260
1261 for (i = 0; i < back_num; i++) {
1262 if (backrefs[i] <= env->num_mem &&
1263 IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) {
1264 NBREF(node)->state |= NST_RECURSION; /* /...(\1).../ */
1265 break;
1266 }
1267 }
1268
1269 if (back_num <= NODE_BACKREFS_SIZE) {
1270 for (i = 0; i < back_num; i++)
1271 NBREF(node)->back_static[i] = backrefs[i];
1272 }
1273 else {
1274 int* p = (int* )xmalloc(sizeof(int) * back_num);
1275 if (IS_NULL(p)) {
1276 onig_node_free(node);
1277 return NULL;
1278 }
1279 NBREF(node)->back_dynamic = p;
1280 for (i = 0; i < back_num; i++)
1281 p[i] = backrefs[i];
1282 }
1283 return node;
1284}
1285
#ifdef USE_SUBEXP_CALL
/*
 * Create an NT_CALL node for the name [name, name_end) and group number
 * gnum, with no target yet.  The name pointers are stored as given, not
 * copied.  Returns NULL when allocation fails.
 */
static Node*
node_new_call(UChar* name, UChar* name_end, int gnum)
{
  Node* call = node_new();

  CHECK_NULL_RETURN(call);
  SET_NTYPE(call, NT_CALL);
  NCALL(call)->name      = name;
  NCALL(call)->name_end  = name_end;
  NCALL(call)->group_num = gnum;  /* call by number if gnum != 0 */
  NCALL(call)->target    = NULL_NODE;
  NCALL(call)->state     = 0;
  return call;
}
#endif
1302
1303static Node*
1304node_new_quantifier(int lower, int upper, int by_number)
1305{
1306 Node* node = node_new();
1307 CHECK_NULL_RETURN(node);
1308
1309 SET_NTYPE(node, NT_QTFR);
1310 NQTFR(node)->state = 0;
1311 NQTFR(node)->target = NULL;
1312 NQTFR(node)->lower = lower;
1313 NQTFR(node)->upper = upper;
1314 NQTFR(node)->greedy = 1;
1315 NQTFR(node)->target_empty_info = NQ_TARGET_ISNOT_EMPTY;
1316 NQTFR(node)->head_exact = NULL_NODE;
1317 NQTFR(node)->next_head_exact = NULL_NODE;
1318 NQTFR(node)->is_refered = 0;
1319 if (by_number != 0)
1320 NQTFR(node)->state |= NST_BY_NUMBER;
1321
1322#ifdef USE_COMBINATION_EXPLOSION_CHECK
1323 NQTFR(node)->comb_exp_check_num = 0;
1324#endif
1325
1326 return node;
1327}
1328
1329static Node*
1330node_new_enclose(int type)
1331{
1332 Node* node = node_new();
1333 CHECK_NULL_RETURN(node);
1334
1335 SET_NTYPE(node, NT_ENCLOSE);
1336 NENCLOSE(node)->type = type;
1337 NENCLOSE(node)->state = 0;
1338 NENCLOSE(node)->regnum = 0;
1339 NENCLOSE(node)->option = 0;
1340 NENCLOSE(node)->target = NULL;
1341 NENCLOSE(node)->call_addr = -1;
1342 NENCLOSE(node)->opt_count = 0;
1343 return node;
1344}
1345
1346extern Node*
1347onig_node_new_enclose(int type)
1348{
1349 return node_new_enclose(type);
1350}
1351
1352static Node*
1353node_new_enclose_memory(OnigOptionType option, int is_named)
1354{
1355 Node* node = node_new_enclose(ENCLOSE_MEMORY);
1356 CHECK_NULL_RETURN(node);
1357 if (is_named != 0)
1358 SET_ENCLOSE_STATUS(node, NST_NAMED_GROUP);
1359
1360#ifdef USE_SUBEXP_CALL
1361 NENCLOSE(node)->option = option;
1362#endif
1363 return node;
1364}
1365
1366static Node*
1367node_new_option(OnigOptionType option)
1368{
1369 Node* node = node_new_enclose(ENCLOSE_OPTION);
1370 CHECK_NULL_RETURN(node);
1371 NENCLOSE(node)->option = option;
1372 return node;
1373}
1374
1375extern int
1376onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
1377{
1378 ptrdiff_t addlen = end - s;
1379
1380 if (addlen > 0) {
1381 ptrdiff_t len = NSTR(node)->end - NSTR(node)->s;
1382
1383 if (NSTR(node)->capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) {
1384 UChar* p;
1385 ptrdiff_t capa = len + addlen + NODE_STR_MARGIN;
1386
1387 if (capa <= NSTR(node)->capa) {
1388 onig_strcpy(NSTR(node)->s + len, s, end);
1389 }
1390 else {
1391 if (NSTR(node)->s == NSTR(node)->buf)
1392 p = strcat_capa_from_static(NSTR(node)->s, NSTR(node)->end,
1393 s, end, capa);
1394 else
1395 p = strcat_capa(NSTR(node)->s, NSTR(node)->end, s, end, capa);
1396
1397 CHECK_NULL_RETURN_MEMERR(p);
1398 NSTR(node)->s = p;
1399 NSTR(node)->capa = (int )capa;
1400 }
1401 }
1402 else {
1403 onig_strcpy(NSTR(node)->s + len, s, end);
1404 }
1405 NSTR(node)->end = NSTR(node)->s + len + addlen;
1406 }
1407
1408 return 0;
1409}
1410
1411extern int
1412onig_node_str_set(Node* node, const UChar* s, const UChar* end)
1413{
1414 onig_node_str_clear(node);
1415 return onig_node_str_cat(node, s, end);
1416}
1417
1418static int
1419node_str_cat_char(Node* node, UChar c)
1420{
1421 UChar s[1];
1422
1423 s[0] = c;
1424 return onig_node_str_cat(node, s, s + 1);
1425}
1426
1427static int
1428node_str_cat_codepoint(Node* node, OnigEncoding enc, OnigCodePoint c)
1429{
1430 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
1431 int num = ONIGENC_CODE_TO_MBC(enc, c, buf);
1432 if (num < 0) return num;
1433 return onig_node_str_cat(node, buf, buf + num);
1434}
1435
1436#if 0
1437extern void
1438onig_node_conv_to_str_node(Node* node, int flag)
1439{
1440 SET_NTYPE(node, NT_STR);
1441 NSTR(node)->flag = flag;
1442 NSTR(node)->capa = 0;
1443 NSTR(node)->s = NSTR(node)->buf;
1444 NSTR(node)->end = NSTR(node)->buf;
1445}
1446#endif
1447
1448extern void
1449onig_node_str_clear(Node* node)
1450{
1451 if (NSTR(node)->capa != 0 &&
1452 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1453 xfree(NSTR(node)->s);
1454 }
1455
1456 NSTR(node)->capa = 0;
1457 NSTR(node)->flag = 0;
1458 NSTR(node)->s = NSTR(node)->buf;
1459 NSTR(node)->end = NSTR(node)->buf;
1460}
1461
1462static Node*
1463node_new_str(const UChar* s, const UChar* end)
1464{
1465 Node* node = node_new();
1466 CHECK_NULL_RETURN(node);
1467
1468 SET_NTYPE(node, NT_STR);
1469 NSTR(node)->capa = 0;
1470 NSTR(node)->flag = 0;
1471 NSTR(node)->s = NSTR(node)->buf;
1472 NSTR(node)->end = NSTR(node)->buf;
1473 if (onig_node_str_cat(node, s, end)) {
1474 onig_node_free(node);
1475 return NULL;
1476 }
1477 return node;
1478}
1479
1480extern Node*
1481onig_node_new_str(const UChar* s, const UChar* end)
1482{
1483 return node_new_str(s, end);
1484}
1485
1486static Node*
1487node_new_str_raw(UChar* s, UChar* end)
1488{
1489 Node* node = node_new_str(s, end);
1490 if (IS_NOT_NULL(node))
1491 NSTRING_SET_RAW(node);
1492 return node;
1493}
1494
1495static Node*
1496node_new_empty(void)
1497{
1498 return node_new_str(NULL, NULL);
1499}
1500
1501static Node*
1502node_new_str_raw_char(UChar c)
1503{
1504 UChar p[1];
1505
1506 p[0] = c;
1507 return node_new_str_raw(p, p + 1);
1508}
1509
1510static Node*
1511str_node_split_last_char(StrNode* sn, OnigEncoding enc)
1512{
1513 const UChar *p;
1514 Node* n = NULL_NODE;
1515
1516 if (sn->end > sn->s) {
1517 p = onigenc_get_prev_char_head(enc, sn->s, sn->end, sn->end);
1518 if (p && p > sn->s) { /* can be split. */
1519 n = node_new_str(p, sn->end);
1520 if (IS_NOT_NULL(n) && (sn->flag & NSTR_RAW) != 0)
1521 NSTRING_SET_RAW(n);
1522 sn->end = (UChar* )p;
1523 }
1524 }
1525 return n;
1526}
1527
1528static int
1529str_node_can_be_split(StrNode* sn, OnigEncoding enc)
1530{
1531 if (sn->end > sn->s) {
1532 return ((enclen(enc, sn->s, sn->end) < sn->end - sn->s) ? 1 : 0);
1533 }
1534 return 0;
1535}
1536
1537#ifdef USE_PAD_TO_SHORT_BYTE_CHAR
1538static int
1539node_str_head_pad(StrNode* sn, int num, UChar val)
1540{
1541 UChar buf[NODE_STR_BUF_SIZE];
1542 int i, len;
1543
1544 len = sn->end - sn->s;
1545 onig_strcpy(buf, sn->s, sn->end);
1546 onig_strcpy(&(sn->s[num]), buf, buf + len);
1547 sn->end += num;
1548
1549 for (i = 0; i < num; i++) {
1550 sn->s[i] = val;
1551 }
1552}
1553#endif
1554
1555extern int
1556onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc)
1557{
1558 unsigned int num, val;
1559 OnigCodePoint c;
1560 UChar* p = *src;
1561 PFETCH_READY;
1562
1563 num = 0;
1564 while (!PEND) {
1565 PFETCH(c);
1566 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
1567 val = (unsigned int )DIGITVAL(c);
1568 if ((INT_MAX_LIMIT - val) / 10UL < num)
1569 return -1; /* overflow */
1570
1571 num = num * 10 + val;
1572 }
1573 else {
1574 PUNFETCH;
1575 break;
1576 }
1577 }
1578 *src = p;
1579 return num;
1580}
1581
1582static int
1583scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int minlen,
1584 int maxlen, OnigEncoding enc)
1585{
1586 OnigCodePoint c;
1587 unsigned int num, val;
1588 int restlen;
1589 UChar* p = *src;
1590 PFETCH_READY;
1591
1592 restlen = maxlen - minlen;
1593 num = 0;
1594 while (!PEND && maxlen-- != 0) {
1595 PFETCH(c);
1596 if (ONIGENC_IS_CODE_XDIGIT(enc, c)) {
1597 val = (unsigned int )XDIGITVAL(enc,c);
1598 if ((INT_MAX_LIMIT - val) / 16UL < num)
1599 return -1; /* overflow */
1600
1601 num = (num << 4) + XDIGITVAL(enc,c);
1602 }
1603 else {
1604 PUNFETCH;
1605 maxlen++;
1606 break;
1607 }
1608 }
1609 if (maxlen > restlen)
1610 return -2; /* not enough digits */
1611 *src = p;
1612 return num;
1613}
1614
1615static int
1616scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen,
1617 OnigEncoding enc)
1618{
1619 OnigCodePoint c;
1620 unsigned int num, val;
1621 UChar* p = *src;
1622 PFETCH_READY;
1623
1624 num = 0;
1625 while (!PEND && maxlen-- != 0) {
1626 PFETCH(c);
1627 if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') {
1628 val = ODIGITVAL(c);
1629 if ((INT_MAX_LIMIT - val) / 8UL < num)
1630 return -1; /* overflow */
1631
1632 num = (num << 3) + val;
1633 }
1634 else {
1635 PUNFETCH;
1636 break;
1637 }
1638 }
1639 *src = p;
1640 return num;
1641}
1642
1643
/* Write SIZE_CODE_POINT bytes taken from `code` into `bbuf` at position
 * `pos` via BBUF_WRITE().  `code` must be an lvalue: its address is taken. */
#define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \
    BBUF_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT)
1646
1647/* data format:
1648 [n][from-1][to-1][from-2][to-2] ... [from-n][to-n]
1649 (all data size is OnigCodePoint)
1650 */
1651static int
1652new_code_range(BBuf** pbuf)
1653{
1654#define INIT_MULTI_BYTE_RANGE_SIZE (SIZE_CODE_POINT * 5)
1655 int r;
1656 OnigCodePoint n;
1657 BBuf* bbuf;
1658
1659 bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
1660 CHECK_NULL_RETURN_MEMERR(*pbuf);
1661 r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE);
1662 if (r) return r;
1663
1664 n = 0;
1665 BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1666 return 0;
1667}
1668
/*
 * Insert the code point range [from, to] into the range list in *pbuf,
 * merging it with the stored ranges it overlaps or is adjacent to.
 *
 * pbuf     : in/out; a new list is created with new_code_range() when
 *            *pbuf is NULL
 * from, to : range bounds; they are swapped if given in reverse order
 * checkdup : non-zero enables the CC_DUP_WARN() duplicate warning
 *
 * Returns 0 on success, ONIGERR_TOO_MANY_MULTI_BYTE_RANGES when the list
 * would exceed ONIG_MAX_MULTI_BYTE_RANGES_NUM entries, or the error from
 * new_code_range().  (The BBUF_* macros may also leave the function on
 * failure -- their definitions are not visible here.)
 */
static int
add_code_range_to_buf0(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to,
                       int checkdup)
{
  int r, inc_n, pos;
  OnigCodePoint low, high, bound, x;
  OnigCodePoint n, *data;
  BBuf* bbuf;

  /* normalise so that from <= to (n is used as the swap temporary) */
  if (from > to) {
    n = from; from = to; to = n;
  }

  if (IS_NULL(*pbuf)) {
    r = new_code_range(pbuf);
    if (r) return r;
    bbuf = *pbuf;
    n = 0;
  }
  else {
    bbuf = *pbuf;
    GET_CODE_POINT(n, bbuf->p);  /* n: number of ranges currently stored */
  }
  /* data[2*i] / data[2*i+1] are the bounds of range i (skip the count) */
  data = (OnigCodePoint* )(bbuf->p);
  data++;

  /* Binary search for `low`: the first range whose upper bound is not
   * below from - 1.  With from == 0 the search is skipped (low stays 0),
   * so from - 1 is never evaluated on a wrapped value. */
  bound = (from == 0) ? 0 : n;
  for (low = 0; low < bound; ) {
    x = (low + bound) >> 1;
    if (from - 1 > data[x*2 + 1])
      low = x + 1;
    else
      bound = x;
  }

  /* Binary search for `high`: one past the last range whose lower bound
   * is not above to + 1.  With to == ONIG_LAST_CODE_POINT the search is
   * skipped (high == n), so to + 1 is never evaluated on a wrapped value. */
  high = (to == ONIG_LAST_CODE_POINT) ? n : low;
  for (bound = n; high < bound; ) {
    x = (high + bound) >> 1;
    if (to + 1 >= data[x*2])
      high = x + 1;
    else
      bound = x;
  }
  /* data[(low-1)*2+1] << from <= data[low*2]
   * data[(high-1)*2+1] <= to << data[high*2]
   */

  /* Ranges low .. high-1 are absorbed by the new one, so the count
   * changes by 1 - (high - low). */
  inc_n = low + 1 - high;
  if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
    return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;

  /* inc_n != 1: at least one stored range is absorbed; warn about the
   * duplicate if requested and widen [from, to] to cover what it absorbs */
  if (inc_n != 1) {
    if (checkdup && from <= data[low*2+1]
        && (data[low*2] <= from || data[low*2+1] <= to))
      CC_DUP_WARN(env);
    if (from > data[low*2])
      from = data[low*2];
    if (to < data[(high - 1)*2 + 1])
      to = data[(high - 1)*2 + 1];
  }

  /* inc_n != 0: the ranges from index `high` onwards have to move so
   * that they start at index low + 1 */
  if (inc_n != 0) {
    int from_pos = SIZE_CODE_POINT * (1 + high * 2);
    int to_pos   = SIZE_CODE_POINT * (1 + (low + 1) * 2);

    if (inc_n > 0) {
      /* list grows by one: shift the tail, if there is one */
      if (high < n) {
        int size = (n - high) * 2 * SIZE_CODE_POINT;
        BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
      }
    }
    else {
      /* list shrinks */
      BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
    }
  }

  /* store the (possibly widened) range at index `low`, then the new count */
  pos = SIZE_CODE_POINT * (1 + low * 2);
  BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
  BBUF_WRITE_CODE_POINT(bbuf, pos, from);
  BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
  n += inc_n;
  BBUF_WRITE_CODE_POINT(bbuf, 0, n);

  return 0;
}
1754
1755static int
1756add_code_range_to_buf(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
1757{
1758 return add_code_range_to_buf0(pbuf, env, from, to, 1);
1759}
1760
1761static int
1762add_code_range0(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to, int checkdup)
1763{
1764 if (from > to) {
1765 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
1766 return 0;
1767 else
1768 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
1769 }
1770
1771 return add_code_range_to_buf0(pbuf, env, from, to, checkdup);
1772}
1773
1774static int
1775add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
1776{
1777 return add_code_range0(pbuf, env, from, to, 1);
1778}
1779
1780static int
1781not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf, ScanEnv* env)
1782{
1783 int r, i, n;
1784 OnigCodePoint pre, from, *data, to = 0;
1785
1786 *pbuf = (BBuf* )NULL;
1787 if (IS_NULL(bbuf)) {
1788 set_all:
1789 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1790 }
1791
1792 data = (OnigCodePoint* )(bbuf->p);
1793 GET_CODE_POINT(n, data);
1794 data++;
1795 if (n <= 0) goto set_all;
1796
1797 r = 0;
1798 pre = MBCODE_START_POS(enc);
1799 for (i = 0; i < n; i++) {
1800 from = data[i*2];
1801 to = data[i*2+1];
1802 if (pre <= from - 1) {
1803 r = add_code_range_to_buf(pbuf, env, pre, from - 1);
1804 if (r != 0) return r;
1805 }
1806 if (to == ONIG_LAST_CODE_POINT) break;
1807 pre = to + 1;
1808 }
1809 if (to < ONIG_LAST_CODE_POINT) {
1810 r = add_code_range_to_buf(pbuf, env, to + 1, ONIG_LAST_CODE_POINT);
1811 }
1812 return r;
1813}
1814
/* Exchange two (buffer, not-flag) operand pairs: bbuf1 <-> bbuf2 and
 * not1 <-> not2.  All four arguments must be assignable lvalues. */
#define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\
  BBuf *tbuf; \
  int tnot; \
  tnot = not1; not1 = not2; not2 = tnot; \
  tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \
} while (0)
1821
1822static int
1823or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
1824 BBuf* bbuf2, int not2, BBuf** pbuf, ScanEnv* env)
1825{
1826 int r;
1827 OnigCodePoint i, n1, *data1;
1828 OnigCodePoint from, to;
1829
1830 *pbuf = (BBuf* )NULL;
1831 if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
1832 if (not1 != 0 || not2 != 0)
1833 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1834 return 0;
1835 }
1836
1837 r = 0;
1838 if (IS_NULL(bbuf2))
1839 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1840
1841 if (IS_NULL(bbuf1)) {
1842 if (not1 != 0) {
1843 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1844 }
1845 else {
1846 if (not2 == 0) {
1847 return bbuf_clone(pbuf, bbuf2);
1848 }
1849 else {
1850 return not_code_range_buf(enc, bbuf2, pbuf, env);
1851 }
1852 }
1853 }
1854
1855 if (not1 != 0)
1856 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1857
1858 data1 = (OnigCodePoint* )(bbuf1->p);
1859 GET_CODE_POINT(n1, data1);
1860 data1++;
1861
1862 if (not2 == 0 && not1 == 0) { /* 1 OR 2 */
1863 r = bbuf_clone(pbuf, bbuf2);
1864 }
1865 else if (not1 == 0) { /* 1 OR (not 2) */
1866 r = not_code_range_buf(enc, bbuf2, pbuf, env);
1867 }
1868 if (r != 0) return r;
1869
1870 for (i = 0; i < n1; i++) {
1871 from = data1[i*2];
1872 to = data1[i*2+1];
1873 r = add_code_range_to_buf(pbuf, env, from, to);
1874 if (r != 0) return r;
1875 }
1876 return 0;
1877}
1878
1879static int
1880and_code_range1(BBuf** pbuf, ScanEnv* env, OnigCodePoint from1, OnigCodePoint to1,
1881 OnigCodePoint* data, int n)
1882{
1883 int i, r;
1884 OnigCodePoint from2, to2;
1885
1886 for (i = 0; i < n; i++) {
1887 from2 = data[i*2];
1888 to2 = data[i*2+1];
1889 if (from2 < from1) {
1890 if (to2 < from1) continue;
1891 else {
1892 from1 = to2 + 1;
1893 }
1894 }
1895 else if (from2 <= to1) {
1896 if (to2 < to1) {
1897 if (from1 <= from2 - 1) {
1898 r = add_code_range_to_buf(pbuf, env, from1, from2-1);
1899 if (r != 0) return r;
1900 }
1901 from1 = to2 + 1;
1902 }
1903 else {
1904 to1 = from2 - 1;
1905 }
1906 }
1907 else {
1908 from1 = from2;
1909 }
1910 if (from1 > to1) break;
1911 }
1912 if (from1 <= to1) {
1913 r = add_code_range_to_buf(pbuf, env, from1, to1);
1914 if (r != 0) return r;
1915 }
1916 return 0;
1917}
1918
1919static int
1920and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf, ScanEnv* env)
1921{
1922 int r;
1923 OnigCodePoint i, j, n1, n2, *data1, *data2;
1924 OnigCodePoint from, to, from1, to1, from2, to2;
1925
1926 *pbuf = (BBuf* )NULL;
1927 if (IS_NULL(bbuf1)) {
1928 if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */
1929 return bbuf_clone(pbuf, bbuf2);
1930 return 0;
1931 }
1932 else if (IS_NULL(bbuf2)) {
1933 if (not2 != 0)
1934 return bbuf_clone(pbuf, bbuf1);
1935 return 0;
1936 }
1937
1938 if (not1 != 0)
1939 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1940
1941 data1 = (OnigCodePoint* )(bbuf1->p);
1942 data2 = (OnigCodePoint* )(bbuf2->p);
1943 GET_CODE_POINT(n1, data1);
1944 GET_CODE_POINT(n2, data2);
1945 data1++;
1946 data2++;
1947
1948 if (not2 == 0 && not1 == 0) { /* 1 AND 2 */
1949 for (i = 0; i < n1; i++) {
1950 from1 = data1[i*2];
1951 to1 = data1[i*2+1];
1952 for (j = 0; j < n2; j++) {
1953 from2 = data2[j*2];
1954 to2 = data2[j*2+1];
1955 if (from2 > to1) break;
1956 if (to2 < from1) continue;
1957 from = MAX(from1, from2);
1958 to = MIN(to1, to2);
1959 r = add_code_range_to_buf(pbuf, env, from, to);
1960 if (r != 0) return r;
1961 }
1962 }
1963 }
1964 else if (not1 == 0) { /* 1 AND (not 2) */
1965 for (i = 0; i < n1; i++) {
1966 from1 = data1[i*2];
1967 to1 = data1[i*2+1];
1968 r = and_code_range1(pbuf, env, from1, to1, data2, n2);
1969 if (r != 0) return r;
1970 }
1971 }
1972
1973 return 0;
1974}
1975
1976static int
1977and_cclass(CClassNode* dest, CClassNode* cc, ScanEnv* env)
1978{
1979 OnigEncoding enc = env->enc;
1980 int r, not1, not2;
1981 BBuf *buf1, *buf2, *pbuf = 0;
1982 BitSetRef bsr1, bsr2;
1983 BitSet bs1, bs2;
1984
1985 not1 = IS_NCCLASS_NOT(dest);
1986 bsr1 = dest->bs;
1987 buf1 = dest->mbuf;
1988 not2 = IS_NCCLASS_NOT(cc);
1989 bsr2 = cc->bs;
1990 buf2 = cc->mbuf;
1991
1992 if (not1 != 0) {
1993 bitset_invert_to(bsr1, bs1);
1994 bsr1 = bs1;
1995 }
1996 if (not2 != 0) {
1997 bitset_invert_to(bsr2, bs2);
1998 bsr2 = bs2;
1999 }
2000 bitset_and(bsr1, bsr2);
2001 if (bsr1 != dest->bs) {
2002 bitset_copy(dest->bs, bsr1);
2003 bsr1 = dest->bs;
2004 }
2005 if (not1 != 0) {
2006 bitset_invert(dest->bs);
2007 }
2008
2009 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2010 if (not1 != 0 && not2 != 0) {
2011 r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf, env);
2012 }
2013 else {
2014 r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf, env);
2015 if (r == 0 && not1 != 0) {
2016 BBuf *tbuf = 0;
2017 r = not_code_range_buf(enc, pbuf, &tbuf, env);
2018 bbuf_free(pbuf);
2019 pbuf = tbuf;
2020 }
2021 }
2022 if (r != 0) {
2023 bbuf_free(pbuf);
2024 return r;
2025 }
2026
2027 dest->mbuf = pbuf;
2028 bbuf_free(buf1);
2029 return r;
2030 }
2031 return 0;
2032}
2033
2034static int
2035or_cclass(CClassNode* dest, CClassNode* cc, ScanEnv* env)
2036{
2037 OnigEncoding enc = env->enc;
2038 int r, not1, not2;
2039 BBuf *buf1, *buf2, *pbuf = 0;
2040 BitSetRef bsr1, bsr2;
2041 BitSet bs1, bs2;
2042
2043 not1 = IS_NCCLASS_NOT(dest);
2044 bsr1 = dest->bs;
2045 buf1 = dest->mbuf;
2046 not2 = IS_NCCLASS_NOT(cc);
2047 bsr2 = cc->bs;
2048 buf2 = cc->mbuf;
2049
2050 if (not1 != 0) {
2051 bitset_invert_to(bsr1, bs1);
2052 bsr1 = bs1;
2053 }
2054 if (not2 != 0) {
2055 bitset_invert_to(bsr2, bs2);
2056 bsr2 = bs2;
2057 }
2058 bitset_or(bsr1, bsr2);
2059 if (bsr1 != dest->bs) {
2060 bitset_copy(dest->bs, bsr1);
2061 bsr1 = dest->bs;
2062 }
2063 if (not1 != 0) {
2064 bitset_invert(dest->bs);
2065 }
2066
2067 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2068 if (not1 != 0 && not2 != 0) {
2069 r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf, env);
2070 }
2071 else {
2072 r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf, env);
2073 if (r == 0 && not1 != 0) {
2074 BBuf *tbuf = 0;
2075 r = not_code_range_buf(enc, pbuf, &tbuf, env);
2076 bbuf_free(pbuf);
2077 pbuf = tbuf;
2078 }
2079 }
2080 if (r != 0) {
2081 bbuf_free(pbuf);
2082 return r;
2083 }
2084
2085 dest->mbuf = pbuf;
2086 bbuf_free(buf1);
2087 return r;
2088 }
2089 else
2090 return 0;
2091}
2092
2093static void UNKNOWN_ESC_WARN(ScanEnv *env, int c);
2094
2095static OnigCodePoint
2096conv_backslash_value(OnigCodePoint c, ScanEnv* env)
2097{
2098 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
2099 switch (c) {
2100 case 'n': return '\n';
2101 case 't': return '\t';
2102 case 'r': return '\r';
2103 case 'f': return '\f';
2104 case 'a': return '\007';
2105 case 'b': return '\010';
2106 case 'e': return '\033';
2107 case 'v':
2108 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
2109 return '\v';
2110 break;
2111
2112 default:
2113 if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
2114 UNKNOWN_ESC_WARN(env, c);
2115 break;
2116 }
2117 }
2118 return c;
2119}
2120
2121#ifdef USE_NO_INVALID_QUANTIFIER
2122# define is_invalid_quantifier_target(node) 0
2123#else
2124static int
2125is_invalid_quantifier_target(Node* node)
2126{
2127 switch (NTYPE(node)) {
2128 case NT_ANCHOR:
2129 return 1;
2130 break;
2131
2132 case NT_ENCLOSE:
2133 /* allow enclosed elements */
2134 /* return is_invalid_quantifier_target(NENCLOSE(node)->target); */
2135 break;
2136
2137 case NT_LIST:
2138 do {
2139 if (! is_invalid_quantifier_target(NCAR(node))) return 0;
2140 } while (IS_NOT_NULL(node = NCDR(node)));
2141 return 0;
2142 break;
2143
2144 case NT_ALT:
2145 do {
2146 if (is_invalid_quantifier_target(NCAR(node))) return 1;
2147 } while (IS_NOT_NULL(node = NCDR(node)));
2148 break;
2149
2150 default:
2151 break;
2152 }
2153 return 0;
2154}
2155#endif
2156
2157/* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */
2158static int
2159popular_quantifier_num(QtfrNode* q)
2160{
2161 if (q->greedy) {
2162 if (q->lower == 0) {
2163 if (q->upper == 1) return 0;
2164 else if (IS_REPEAT_INFINITE(q->upper)) return 1;
2165 }
2166 else if (q->lower == 1) {
2167 if (IS_REPEAT_INFINITE(q->upper)) return 2;
2168 }
2169 }
2170 else {
2171 if (q->lower == 0) {
2172 if (q->upper == 1) return 3;
2173 else if (IS_REPEAT_INFINITE(q->upper)) return 4;
2174 }
2175 else if (q->lower == 1) {
2176 if (IS_REPEAT_INFINITE(q->upper)) return 5;
2177 }
2178 }
2179 return -1;
2180}
2181
2182
/* Actions onig_reduce_nested_quantifier() can take for a quantifier
 * (parent) applied directly to another quantifier (child). */
enum ReduceType {
  RQ_ASIS = 0, /* as is */
  RQ_DEL  = 1, /* delete parent */
  RQ_A,        /* to '*'    */
  RQ_AQ,       /* to '*?'   */
  RQ_QQ,       /* to '??'   */
  RQ_P_QQ,     /* to '+)??' */
  RQ_PQ_Q      /* to '+?)?' */
};
2192
/* Reduction to apply for each pair of popular quantifiers, numbered as by
 * popular_quantifier_num().  It is looked up as
 * ReduceTypeTable[child][parent]: each row (labelled on the right) is the
 * child quantifier, each column (labelled on top) is the parent. */
static enum ReduceType const ReduceTypeTable[6][6] = {
/* '?',     '*',     '+',    '??',    '*?',    '+?'      p / c   */
  {RQ_DEL,  RQ_A,    RQ_A,   RQ_QQ,   RQ_AQ,   RQ_ASIS}, /* '?'  */
  {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL},  /* '*'  */
  {RQ_A,    RQ_A,    RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL},  /* '+'  */
  {RQ_DEL,  RQ_AQ,   RQ_AQ,  RQ_DEL,  RQ_AQ,   RQ_AQ},   /* '??' */
  {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_DEL,  RQ_DEL,  RQ_DEL},  /* '*?' */
  {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ,   RQ_AQ,   RQ_DEL}   /* '+?' */
};
2202
2203extern void
2204onig_reduce_nested_quantifier(Node* pnode, Node* cnode)
2205{
2206 int pnum, cnum;
2207 QtfrNode *p, *c;
2208
2209 p = NQTFR(pnode);
2210 c = NQTFR(cnode);
2211 pnum = popular_quantifier_num(p);
2212 cnum = popular_quantifier_num(c);
2213 if (pnum < 0 || cnum < 0) return ;
2214
2215 switch (ReduceTypeTable[cnum][pnum]) {
2216 case RQ_DEL:
2217 *pnode = *cnode;
2218 break;
2219 case RQ_A:
2220 p->target = c->target;
2221 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1;
2222 break;
2223 case RQ_AQ:
2224 p->target = c->target;
2225 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0;
2226 break;
2227 case RQ_QQ:
2228 p->target = c->target;
2229 p->lower = 0; p->upper = 1; p->greedy = 0;
2230 break;
2231 case RQ_P_QQ:
2232 p->target = cnode;
2233 p->lower = 0; p->upper = 1; p->greedy = 0;
2234 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1;
2235 return ;
2236 break;
2237 case RQ_PQ_Q:
2238 p->target = cnode;
2239 p->lower = 0; p->upper = 1; p->greedy = 1;
2240 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0;
2241 return ;
2242 break;
2243 case RQ_ASIS:
2244 p->target = cnode;
2245 return ;
2246 break;
2247 }
2248
2249 c->target = NULL_NODE;
2250 onig_node_free(cnode);
2251}
2252
2253
/* Kinds of token; stored in OnigToken.type. */
enum TokenSyms {
  TK_EOT      = 0,   /* end of token */
  TK_RAW_BYTE = 1,
  TK_CHAR,
  TK_STRING,
  TK_CODE_POINT,
  TK_ANYCHAR,
  TK_CHAR_TYPE,
  TK_BACKREF,
  TK_CALL,
  TK_ANCHOR,
  TK_OP_REPEAT,
  TK_INTERVAL,
  TK_ANYCHAR_ANYTIME,  /* SQL '%' == .* */
  TK_ALT,
  TK_SUBEXP_OPEN,
  TK_SUBEXP_CLOSE,
  TK_CC_OPEN,
  TK_QUOTE_OPEN,
  TK_CHAR_PROPERTY,    /* \p{...}, \P{...} */
  TK_LINEBREAK,
  TK_EXTENDED_GRAPHEME_CLUSTER,
  TK_KEEP,
  /* in cc */
  TK_CC_CLOSE,
  TK_CC_RANGE,
  TK_POSIX_BRACKET_OPEN,
  TK_CC_AND,             /* && */
  TK_CC_CC_OPEN          /* [ */
};
2284
/*
 * One scanned token.  `type` says what kind of token it is and `u` holds
 * the data that goes with that kind; for example fetch_range_quantifier()
 * sets type to TK_INTERVAL and fills u.repeat.lower / u.repeat.upper.
 */
typedef struct {
  enum TokenSyms type;
  int escaped;
  int base;   /* is number: 8, 16 (used in [....]) */
  UChar* backp;
  union {
    UChar* s;
    int   c;
    OnigCodePoint code;
    struct {
      int subtype;
      int ascii_range;
    } anchor;
    struct {
      int lower;
      int upper;
      int greedy;
      int possessive;
    } repeat;
    struct {
      int  num;
      int  ref1;
      int* refs;
      int  by_name;
#ifdef USE_BACKREF_WITH_LEVEL
      int  exist_level;
      int  level;   /* \k<name+n> */
#endif
    } backref;
    struct {
      UChar* name;
      UChar* name_end;
      int    gnum;
      int    rel;
    } call;
    struct {
      int ctype;
      int not;
    } prop;
  } u;
} OnigToken;
2326
2327
/*
 * Parse the body of an interval quantifier -- "n}", "n,}", "n,m}" or,
 * where the syntax allows it, ",m}" -- starting at *src (just after the
 * opening brace).
 *
 * On success tok becomes a TK_INTERVAL token with u.repeat.lower/upper
 * filled in, *src is advanced past the closing brace, and the result is
 * 0 for the {n,m} / {n,} forms or 2 for the fixed {n} form.
 *
 * When the text is not a well-formed interval the result is 1 if the
 * syntax has ONIG_SYN_ALLOW_INVALID_INTERVAL (tok and *src are left
 * untouched), otherwise an ONIGERR_* code.  Numbers that are too large
 * and an upper bound below the lower bound are always errors.
 */
static int
fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
{
  int low, up, syn_allow, non_low = 0;
  int r = 0;
  OnigCodePoint c;
  OnigEncoding enc = env->enc;
  UChar* p = *src;
  PFETCH_READY;

  syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);

  if (PEND) {
    if (syn_allow)
      return 1;  /* "....{" : OK! */
    else
      return ONIGERR_END_PATTERN_AT_LEFT_BRACE;  /* "....{" syntax error */
  }

  /* strict syntax: a brace directly followed by ')', '(' or '|' is an error */
  if (! syn_allow) {
    c = PPEEK;
    if (c == ')' || c == '(' || c == '|') {
      return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
    }
  }

  low = onig_scan_unsigned_number(&p, end, env->enc);
  if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
  if (low > ONIG_MAX_REPEAT_NUM)
    return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;

  if (p == *src) {  /* can't read low */
    if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
      /* allow {,n} as {0,n} */
      low = 0;
      non_low = 1;
    }
    else
      goto invalid;
  }

  if (PEND) goto invalid;
  PFETCH(c);
  if (c == ',') {
    UChar* prev = p;
    up = onig_scan_unsigned_number(&p, end, env->enc);
    if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
    if (up > ONIG_MAX_REPEAT_NUM)
      return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;

    if (p == prev) {
      /* no upper bound given; "{,}" (neither bound) is not accepted */
      if (non_low != 0)
        goto invalid;
      up = REPEAT_INFINITE;  /* {n,} : {n,infinite} */
    }
  }
  else {
    /* no comma, so a lower bound must have been read */
    if (non_low != 0)
      goto invalid;

    PUNFETCH;
    up = low;  /* {n} : exact n times */
    r = 2;     /* fixed */
  }

  /* closing brace, preceded by the escape character when the syntax
   * writes intervals as \{ ... \} */
  if (PEND) goto invalid;
  PFETCH(c);
  if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
    if (c != MC_ESC(env->syntax)) goto invalid;
    if (PEND) goto invalid;
    PFETCH(c);
  }
  if (c != '}') goto invalid;

  if (!IS_REPEAT_INFINITE(up) && low > up) {
    return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;
  }

  tok->type = TK_INTERVAL;
  tok->u.repeat.lower = low;
  tok->u.repeat.upper = up;
  *src = p;
  return r; /* 0: normal {n,m}, 2: fixed {n} */

 invalid:
  if (syn_allow)
    return 1;  /* OK */
  else
    return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
}
2418
/* \M-, \C-, \c, or \... */
/*
 * Read the character(s) that follow an escape character at *src and
 * compute the code point they stand for.
 *
 *   M-x      (with ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META):
 *            low 8 bits of x with bit 0x80 set
 *   C-x, cx  (with ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL /
 *            ONIG_SYN_OP_ESC_C_CONTROL):
 *            x masked with 0x9f, or 0177 when x is '?'
 *   other    : conv_backslash_value() of the character
 *
 * In the M- and control forms x may itself be an escape sequence, which
 * is resolved by a recursive call.  When the relevant syntax option is
 * off, 'M', 'C' and 'c' are treated like any other character.
 *
 * On success stores the value in *val, advances *src and returns 0;
 * otherwise returns an ONIGERR_* code (or the error of the nested call)
 * and leaves *src and *val untouched.
 */
static int
fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val)
{
  int v;
  OnigCodePoint c;
  OnigEncoding enc = env->enc;
  UChar* p = *src;

  if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;

  PFETCH_S(c);
  switch (c) {
  case 'M':
    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
      if (PEND) return ONIGERR_END_PATTERN_AT_META;
      PFETCH_S(c);
      if (c != '-') return ONIGERR_META_CODE_SYNTAX;
      if (PEND) return ONIGERR_END_PATTERN_AT_META;
      PFETCH_S(c);
      if (c == MC_ESC(env->syntax)) {
        /* nested escape: resolve it first */
        v = fetch_escaped_value(&p, end, env, &c);
        if (v < 0) return v;
      }
      c = ((c & 0xff) | 0x80);
    }
    else
      goto backslash;
    break;

  case 'C':
    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
      if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
      PFETCH_S(c);
      if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
      goto control;  /* the rest is shared with the 'c' form */
    }
    else
      goto backslash;

  case 'c':
    if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
    control:
      if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
      PFETCH_S(c);
      if (c == '?') {
        c = 0177;
      }
      else {
        if (c == MC_ESC(env->syntax)) {
          /* nested escape: resolve it first */
          v = fetch_escaped_value(&p, end, env, &c);
          if (v < 0) return v;
        }
        c &= 0x9f;
      }
      break;
    }
    /* fall through */

  default:
    {
    backslash:
      c = conv_backslash_value(c, env);
    }
    break;
  }

  *src = p;
  *val = c;
  return 0;
}
2490
2491static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env);
2492
2493static OnigCodePoint
2494get_name_end_code_point(OnigCodePoint start)
2495{
2496 switch (start) {
2497 case '<': return (OnigCodePoint )'>'; break;
2498 case '\'': return (OnigCodePoint )'\''; break;
2499 case '(': return (OnigCodePoint )')'; break;
2500 case '{': return (OnigCodePoint )'}'; break;
2501 default:
2502 break;
2503 }
2504
2505 return (OnigCodePoint )0;
2506}
2507
2508#ifdef USE_NAMED_GROUP
2509# ifdef RUBY
2510# define ONIGENC_IS_CODE_NAME(enc, c) TRUE
2511# else
2512# define ONIGENC_IS_CODE_NAME(enc, c) ONIGENC_IS_CODE_WORD(enc, c)
2513# endif
2514
2515# ifdef USE_BACKREF_WITH_LEVEL
/*
   \k<name+n>, \k<name-n>
   \k<num+n>,  \k<num-n>
   \k<-num+n>, \k<-num-n>
*/
/*
 * Scan a group name or number, optionally followed by a signed nesting
 * level, up to the delimiter that matches start_code.
 *
 * Outputs on success:
 *   *rname_end : end of the name/number part
 *   *rback_num : the signed group number when the name is numeric,
 *                otherwise 0
 *   *rlevel    : the signed level; written only when a level is present
 *   *src       : advanced past the closing delimiter
 *
 * Returns 1 when a level was present, 0 when not.  On failure returns an
 * ONIGERR_* code; for most failures the error string is recorded with
 * onig_scan_env_set_error_string() first (the early returns for an empty
 * name or a too-big number do not record it).
 */
static int
fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
                      UChar** rname_end, ScanEnv* env,
                      int* rback_num, int* rlevel)
{
  int r, sign, is_num, exist_level;
  OnigCodePoint end_code;
  OnigCodePoint c = 0;
  OnigEncoding enc = env->enc;
  UChar *name_end;
  UChar *pnum_head;
  UChar *p = *src;
  PFETCH_READY;

  *rback_num = 0;
  is_num = exist_level = 0;
  sign = 1;
  pnum_head = *src;

  end_code = get_name_end_code_point(start_code);

  name_end = end;
  r = 0;
  /* first character: decides between name, number (is_num == 1) and
   * negative number with no digit seen yet (is_num == 2) */
  if (PEND) {
    return ONIGERR_EMPTY_GROUP_NAME;
  }
  else {
    PFETCH(c);
    if (c == end_code)
      return ONIGERR_EMPTY_GROUP_NAME;

    if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
      is_num = 1;
    }
    else if (c == '-') {
      is_num = 2;
      sign = -1;
      pnum_head = p;  /* digits start after the sign */
    }
    else if (!ONIGENC_IS_CODE_NAME(enc, c)) {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
  }

  /* rest of the name/number; stops at the delimiter, ')' or a level sign */
  while (!PEND) {
    name_end = p;
    PFETCH(c);
    if (c == end_code || c == ')' || c == '+' || c == '-') {
      if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME;  /* '-' without digits */
      break;
    }

    if (is_num != 0) {
      if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
        is_num = 1;
      }
      else {
        r = ONIGERR_INVALID_GROUP_NAME;
        is_num = 0;
      }
    }
    else if (!ONIGENC_IS_CODE_NAME(enc, c)) {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
  }

  if (r == 0 && c != end_code) {
    if (c == '+' || c == '-') {
      /* optional nesting level: sign, digits, then the delimiter */
      int level;
      int flag = (c == '-' ? -1 : 1);

      if (PEND) {
        r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
        goto end;
      }
      PFETCH(c);
      if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err;
      PUNFETCH;
      level = onig_scan_unsigned_number(&p, end, enc);
      if (level < 0) return ONIGERR_TOO_BIG_NUMBER;
      *rlevel = (level * flag);
      exist_level = 1;

      if (!PEND) {
        PFETCH(c);
        if (c == end_code)
          goto end;
      }
    }

  /* reached by falling through (no proper delimiter) and by the gotos
   * above and below */
  err:
    r = ONIGERR_INVALID_GROUP_NAME;
    name_end = end;
  }

 end:
  if (r == 0) {
    if (is_num != 0) {
      *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
      if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
      else if (*rback_num == 0) goto err;  /* group number 0 is rejected */

      *rback_num *= sign;
    }

    *rname_end = name_end;
    *src = p;
    return (exist_level ? 1 : 0);
  }
  else {
    onig_scan_env_set_error_string(env, r, *src, name_end);
    return r;
  }
}
2635# endif /* USE_BACKREF_WITH_LEVEL */
2636
/*
  ref: 0 -> define name    (don't allow number name)
       1 -> reference name (allow number name)
*/
/*
 * Scan a group name (or, when ref == 1, a possibly negative group number)
 * up to the delimiter that matches start_code.
 *
 * Outputs on success (return value 0):
 *   *rname_end : end of the name/number
 *   *rback_num : the signed group number when the name is numeric,
 *                otherwise 0
 *   *src       : advanced past the closing delimiter
 *
 * On failure returns an ONIGERR_* code.  Except for the early returns
 * (empty name, too-big number) the error string is recorded with
 * onig_scan_env_set_error_string(); for that purpose the `teardown` path
 * first skips ahead to the delimiter or ')' to find where the bad name
 * ends.
 */
static int
fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
           UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
{
  int r, is_num, sign;
  OnigCodePoint end_code;
  OnigCodePoint c = 0;
  OnigEncoding enc = env->enc;
  UChar *name_end;
  UChar *pnum_head;
  UChar *p = *src;

  *rback_num = 0;

  end_code = get_name_end_code_point(start_code);

  name_end = end;
  pnum_head = *src;
  r = 0;
  is_num = 0;
  sign = 1;
  /* first character: decides between name, number (is_num == 1) and
   * negative number with no digit seen yet (is_num == 2); numbers are
   * accepted only for references */
  if (PEND) {
    return ONIGERR_EMPTY_GROUP_NAME;
  }
  else {
    PFETCH_S(c);
    if (c == end_code)
      return ONIGERR_EMPTY_GROUP_NAME;

    if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
      if (ref == 1)
        is_num = 1;
      else {
        r = ONIGERR_INVALID_GROUP_NAME;
        is_num = 0;
      }
    }
    else if (c == '-') {
      if (ref == 1) {
        is_num = 2;
        sign = -1;
        pnum_head = p;  /* digits start after the sign */
      }
      else {
        r = ONIGERR_INVALID_GROUP_NAME;
        is_num = 0;
      }
    }
    else if (!ONIGENC_IS_CODE_NAME(enc, c)) {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
  }

  if (r == 0) {
    /* rest of the name/number, up to the delimiter or ')' */
    while (!PEND) {
      name_end = p;
      PFETCH_S(c);
      if (c == end_code || c == ')') {
        if (is_num == 2) {  /* '-' without digits */
          r = ONIGERR_INVALID_GROUP_NAME;
          goto teardown;
        }
        break;
      }

      if (is_num != 0) {
        if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
          is_num = 1;
        }
        else {
          if (!ONIGENC_IS_CODE_WORD(enc, c))
            r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
          else
            r = ONIGERR_INVALID_GROUP_NAME;
          goto teardown;
        }
      }
      else {
        if (!ONIGENC_IS_CODE_NAME(enc, c)) {
          r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
          goto teardown;
        }
      }
    }

    /* the name must be closed by its own delimiter */
    if (c != end_code) {
      r = ONIGERR_INVALID_GROUP_NAME;
      name_end = end;
      goto err;
    }

    if (is_num != 0) {
      *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
      if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
      else if (*rback_num == 0) {  /* group number 0 is rejected */
        r = ONIGERR_INVALID_GROUP_NAME;
        goto err;
      }

      *rback_num *= sign;
    }

    *rname_end = name_end;
    *src = p;
    return 0;
  }
  else {
teardown:
    /* skip to the end of the offending name for the error message */
    while (!PEND) {
      name_end = p;
      PFETCH_S(c);
      if (c == end_code || c == ')')
        break;
    }
    if (PEND)
      name_end = end;

  err:
    onig_scan_env_set_error_string(env, r, *src, name_end);
    return r;
  }
}
2763#else
2764static int
2765fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2766 UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2767{
2768 int r, is_num, sign;
2769 OnigCodePoint end_code;
2770 OnigCodePoint c = 0;
2771 UChar *name_end;
2772 OnigEncoding enc = env->enc;
2773 UChar *pnum_head;
2774 UChar *p = *src;
2775 PFETCH_READY;
2776
2777 *rback_num = 0;
2778
2779 end_code = get_name_end_code_point(start_code);
2780
2781 *rname_end = name_end = end;
2782 r = 0;
2783 pnum_head = *src;
2784 is_num = 0;
2785 sign = 1;
2786
2787 if (PEND) {
2788 return ONIGERR_EMPTY_GROUP_NAME;
2789 }
2790 else {
2791 PFETCH(c);
2792 if (c == end_code)
2793 return ONIGERR_EMPTY_GROUP_NAME;
2794
2795 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2796 is_num = 1;
2797 }
2798 else if (c == '-') {
2799 is_num = 2;
2800 sign = -1;
2801 pnum_head = p;
2802 }
2803 else {
2804 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2805 }
2806 }
2807
2808 while (!PEND) {
2809 name_end = p;
2810
2811 PFETCH(c);
2812 if (c == end_code || c == ')') break;
2813 if (! ONIGENC_IS_CODE_DIGIT(enc, c))
2814 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2815 }
2816 if (r == 0 && c != end_code) {
2817 r = ONIGERR_INVALID_GROUP_NAME;
2818 name_end = end;
2819 }
2820
2821 if (r == 0) {
2822 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2823 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2824 else if (*rback_num == 0) {
2825 r = ONIGERR_INVALID_GROUP_NAME;
2826 goto err;
2827 }
2828 *rback_num *= sign;
2829
2830 *rname_end = name_end;
2831 *src = p;
2832 return 0;
2833 }
2834 else {
2835 err:
2836 onig_scan_env_set_error_string(env, r, *src, name_end);
2837 return r;
2838 }
2839}
2840#endif /* USE_NAMED_GROUP */
2841
2842
2843static void
2844onig_syntax_warn(ScanEnv *env, const char *fmt, ...)
2845{
2846 va_list args;
2847 UChar buf[WARN_BUFSIZE];
2848 va_start(args, fmt);
2849 onig_vsnprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
2850 env->pattern, env->pattern_end,
2851 (const UChar *)fmt, args);
2852 va_end(args);
2853#ifdef RUBY
2854 if (env->sourcefile == NULL)
2855 rb_warn("%s", (char *)buf);
2856 else
2857 rb_compile_warn(env->sourcefile, env->sourceline, "%s", (char *)buf);
2858#else
2859 (*onig_warn)((char* )buf);
2860#endif
2861}
2862
2863static void
2864CC_ESC_WARN(ScanEnv *env, UChar *c)
2865{
2866 if (onig_warn == onig_null_warn) return ;
2867
2868 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
2869 IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
2870 onig_syntax_warn(env, "character class has '%s' without escape", c);
2871 }
2872}
2873
2874static void
2875CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c)
2876{
2877 if (onig_warn == onig_null_warn) return ;
2878
2879 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
2880 onig_syntax_warn(env, "regular expression has '%s' without escape", c);
2881 }
2882}
2883
2884#ifndef RTEST
2885# define RTEST(v) 1
2886#endif
2887
2888static void
2889CC_DUP_WARN(ScanEnv *env)
2890{
2891 if (onig_warn == onig_null_warn || !RTEST(ruby_verbose)) return ;
2892
2893 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_DUP) &&
2894 !(env->warnings_flag & ONIG_SYN_WARN_CC_DUP)) {
2895 env->warnings_flag |= ONIG_SYN_WARN_CC_DUP;
2896 onig_syntax_warn(env, "character class has duplicated range");
2897 }
2898}
2899
2900static void
2901UNKNOWN_ESC_WARN(ScanEnv *env, int c)
2902{
2903 if (onig_warn == onig_null_warn || !RTEST(ruby_verbose)) return ;
2904 onig_syntax_warn(env, "Unknown escape \\%c is ignored", c);
2905}
2906
2907static UChar*
2908find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
2909 UChar **next, OnigEncoding enc)
2910{
2911 int i;
2912 OnigCodePoint x;
2913 UChar *q;
2914 UChar *p = from;
2915
2916 while (p < to) {
2917 x = ONIGENC_MBC_TO_CODE(enc, p, to);
2918 q = p + enclen(enc, p, to);
2919 if (x == s[0]) {
2920 for (i = 1; i < n && q < to; i++) {
2921 x = ONIGENC_MBC_TO_CODE(enc, q, to);
2922 if (x != s[i]) break;
2923 q += enclen(enc, q, to);
2924 }
2925 if (i >= n) {
2926 if (IS_NOT_NULL(next))
2927 *next = q;
2928 return p;
2929 }
2930 }
2931 p = q;
2932 }
2933 return NULL_UCHARP;
2934}
2935
2936static int
2937str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
2938 OnigCodePoint bad, OnigEncoding enc, const OnigSyntaxType* syn)
2939{
2940 int i, in_esc;
2941 OnigCodePoint x;
2942 UChar *q;
2943 UChar *p = from;
2944
2945 in_esc = 0;
2946 while (p < to) {
2947 if (in_esc) {
2948 in_esc = 0;
2949 p += enclen(enc, p, to);
2950 }
2951 else {
2952 x = ONIGENC_MBC_TO_CODE(enc, p, to);
2953 q = p + enclen(enc, p, to);
2954 if (x == s[0]) {
2955 for (i = 1; i < n && q < to; i++) {
2956 x = ONIGENC_MBC_TO_CODE(enc, q, to);
2957 if (x != s[i]) break;
2958 q += enclen(enc, q, to);
2959 }
2960 if (i >= n) return 1;
2961 p += enclen(enc, p, to);
2962 }
2963 else {
2964 x = ONIGENC_MBC_TO_CODE(enc, p, to);
2965 if (x == bad) return 0;
2966 else if (x == MC_ESC(syn)) in_esc = 1;
2967 p = q;
2968 }
2969 }
2970 }
2971 return 0;
2972}
2973
2974static int
2975fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
2976{
2977 int num;
2978 OnigCodePoint c, c2;
2979 const OnigSyntaxType* syn = env->syntax;
2980 OnigEncoding enc = env->enc;
2981 UChar* prev;
2982 UChar* p = *src;
2983 PFETCH_READY;
2984
2985 if (PEND) {
2986 tok->type = TK_EOT;
2987 return tok->type;
2988 }
2989
2990 PFETCH(c);
2991 tok->type = TK_CHAR;
2992 tok->base = 0;
2993 tok->u.c = c;
2994 tok->escaped = 0;
2995
2996 if (c == ']') {
2997 tok->type = TK_CC_CLOSE;
2998 }
2999 else if (c == '-') {
3000 tok->type = TK_CC_RANGE;
3001 }
3002 else if (c == MC_ESC(syn)) {
3003 if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
3004 goto end;
3005
3006 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
3007
3008 PFETCH(c);
3009 tok->escaped = 1;
3010 tok->u.c = c;
3011 switch (c) {
3012 case 'w':
3013 tok->type = TK_CHAR_TYPE;
3014 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3015 tok->u.prop.not = 0;
3016 break;
3017 case 'W':
3018 tok->type = TK_CHAR_TYPE;
3019 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3020 tok->u.prop.not = 1;
3021 break;
3022 case 'd':
3023 tok->type = TK_CHAR_TYPE;
3024 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3025 tok->u.prop.not = 0;
3026 break;
3027 case 'D':
3028 tok->type = TK_CHAR_TYPE;
3029 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3030 tok->u.prop.not = 1;
3031 break;
3032 case 's':
3033 tok->type = TK_CHAR_TYPE;
3034 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3035 tok->u.prop.not = 0;
3036 break;
3037 case 'S':
3038 tok->type = TK_CHAR_TYPE;
3039 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3040 tok->u.prop.not = 1;
3041 break;
3042 case 'h':
3043 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3044 tok->type = TK_CHAR_TYPE;
3045 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3046 tok->u.prop.not = 0;
3047 break;
3048 case 'H':
3049 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3050 tok->type = TK_CHAR_TYPE;
3051 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3052 tok->u.prop.not = 1;
3053 break;
3054
3055 case 'p':
3056 case 'P':
3057 if (PEND) break;
3058
3059 c2 = PPEEK;
3060 if (c2 == '{' &&
3061 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
3062 PINC;
3063 tok->type = TK_CHAR_PROPERTY;
3064 tok->u.prop.not = (c == 'P' ? 1 : 0);
3065
3066 if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
3067 PFETCH(c2);
3068 if (c2 == '^') {
3069 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3070 }
3071 else
3072 PUNFETCH;
3073 }
3074 }
3075 else {
3076 onig_syntax_warn(env, "invalid Unicode Property \\%c", c);
3077 }
3078 break;
3079
3080 case 'x':
3081 if (PEND) break;
3082
3083 prev = p;
3084 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3085 PINC;
3086 num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc);
3087 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3088 if (!PEND) {
3089 c2 = PPEEK;
3090 if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
3091 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3092 }
3093
3094 if (p > prev + enclen(enc, prev, end) && !PEND && (PPEEK_IS('}'))) {
3095 PINC;
3096 tok->type = TK_CODE_POINT;
3097 tok->base = 16;
3098 tok->u.code = (OnigCodePoint )num;
3099 }
3100 else {
3101 /* can't read nothing or invalid format */
3102 p = prev;
3103 }
3104 }
3105 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3106 num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc);
3107 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3108 if (p == prev) { /* can't read nothing. */
3109 num = 0; /* but, it's not error */
3110 }
3111 tok->type = TK_RAW_BYTE;
3112 tok->base = 16;
3113 tok->u.c = num;
3114 }
3115 break;
3116
3117 case 'u':
3118 if (PEND) break;
3119
3120 prev = p;
3121 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3122 num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc);
3123 if (num < -1) return ONIGERR_TOO_SHORT_DIGITS;
3124 else if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3125 if (p == prev) { /* can't read nothing. */
3126 num = 0; /* but, it's not error */
3127 }
3128 tok->type = TK_CODE_POINT;
3129 tok->base = 16;
3130 tok->u.code = (OnigCodePoint )num;
3131 }
3132 break;
3133
3134 case 'o':
3135 if (PEND) break;
3136
3137 prev = p;
3138 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) {
3139 PINC;
3140 num = scan_unsigned_octal_number(&p, end, 11, enc);
3141 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3142 if (!PEND) {
3143 c2 = PPEEK;
3144 if (ONIGENC_IS_CODE_DIGIT(enc, c2) && c2 < '8')
3145 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3146 }
3147
3148 if (p > prev + enclen(enc, prev, end) && !PEND && (PPEEK_IS('}'))) {
3149 PINC;
3150 tok->type = TK_CODE_POINT;
3151 tok->base = 8;
3152 tok->u.code = (OnigCodePoint )num;
3153 }
3154 else {
3155 /* can't read nothing or invalid format */
3156 p = prev;
3157 }
3158 }
3159 break;
3160
3161 case '0':
3162 case '1': case '2': case '3': case '4': case '5': case '6': case '7':
3163 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3164 PUNFETCH;
3165 prev = p;
3166 num = scan_unsigned_octal_number(&p, end, 3, enc);
3167 if (num < 0 || 0xff < num) return ONIGERR_TOO_BIG_NUMBER;
3168 if (p == prev) { /* can't read nothing. */
3169 num = 0; /* but, it's not error */
3170 }
3171 tok->type = TK_RAW_BYTE;
3172 tok->base = 8;
3173 tok->u.c = num;
3174 }
3175 break;
3176
3177 default:
3178 PUNFETCH;
3179 num = fetch_escaped_value(&p, end, env, &c2);
3180 if (num < 0) return num;
3181 if ((OnigCodePoint )tok->u.c != c2) {
3182 tok->u.code = (OnigCodePoint )c2;
3183 tok->type = TK_CODE_POINT;
3184 }
3185 break;
3186 }
3187 }
3188 else if (c == '[') {
3189 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
3190 OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
3191 tok->backp = p; /* point at '[' is read */
3192 PINC;
3193 if (str_exist_check_with_esc(send, 2, p, end,
3194 (OnigCodePoint )']', enc, syn)) {
3195 tok->type = TK_POSIX_BRACKET_OPEN;
3196 }
3197 else {
3198 PUNFETCH;
3199 goto cc_in_cc;
3200 }
3201 }
3202 else {
3203 cc_in_cc:
3204 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
3205 tok->type = TK_CC_CC_OPEN;
3206 }
3207 else {
3208 CC_ESC_WARN(env, (UChar* )"[");
3209 }
3210 }
3211 }
3212 else if (c == '&') {
3213 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
3214 !PEND && (PPEEK_IS('&'))) {
3215 PINC;
3216 tok->type = TK_CC_AND;
3217 }
3218 }
3219
3220 end:
3221 *src = p;
3222 return tok->type;
3223}
3224
3225#ifdef USE_NAMED_GROUP
3226static int
3227fetch_named_backref_token(OnigCodePoint c, OnigToken* tok, UChar** src,
3228 UChar* end, ScanEnv* env)
3229{
3230 int r, num;
3231 const OnigSyntaxType* syn = env->syntax;
3232 UChar* prev;
3233 UChar* p = *src;
3234 UChar* name_end;
3235 int* backs;
3236 int back_num;
3237
3238 prev = p;
3239
3240# ifdef USE_BACKREF_WITH_LEVEL
3241 name_end = NULL_UCHARP; /* no need. escape gcc warning. */
3242 r = fetch_name_with_level(c, &p, end, &name_end,
3243 env, &back_num, &tok->u.backref.level);
3244 if (r == 1) tok->u.backref.exist_level = 1;
3245 else tok->u.backref.exist_level = 0;
3246# else
3247 r = fetch_name(&p, end, &name_end, env, &back_num, 1);
3248# endif
3249 if (r < 0) return r;
3250
3251 if (back_num != 0) {
3252 if (back_num < 0) {
3253 back_num = BACKREF_REL_TO_ABS(back_num, env);
3254 if (back_num <= 0)
3255 return ONIGERR_INVALID_BACKREF;
3256 }
3257
3258 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3259 if (back_num > env->num_mem ||
3260 IS_NULL(SCANENV_MEM_NODES(env)[back_num]))
3261 return ONIGERR_INVALID_BACKREF;
3262 }
3263 tok->type = TK_BACKREF;
3264 tok->u.backref.by_name = 0;
3265 tok->u.backref.num = 1;
3266 tok->u.backref.ref1 = back_num;
3267 }
3268 else {
3269 num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs);
3270 if (num <= 0) {
3271 onig_scan_env_set_error_string(env,
3272 ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);
3273 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3274 }
3275 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3276 int i;
3277 for (i = 0; i < num; i++) {
3278 if (backs[i] > env->num_mem ||
3279 IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
3280 return ONIGERR_INVALID_BACKREF;
3281 }
3282 }
3283
3284 tok->type = TK_BACKREF;
3285 tok->u.backref.by_name = 1;
3286 if (num == 1 || IS_SYNTAX_BV(syn, ONIG_SYN_USE_LEFT_MOST_NAMED_GROUP)) {
3287 tok->u.backref.num = 1;
3288 tok->u.backref.ref1 = backs[0];
3289 }
3290 else {
3291 tok->u.backref.num = num;
3292 tok->u.backref.refs = backs;
3293 }
3294 }
3295 *src = p;
3296 return 0;
3297}
3298#endif
3299
3300static int
3301fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
3302{
3303 int r, num;
3304 OnigCodePoint c;
3305 OnigEncoding enc = env->enc;
3306 const OnigSyntaxType* syn = env->syntax;
3307 UChar* prev;
3308 UChar* p = *src;
3309 PFETCH_READY;
3310
3311 start:
3312 if (PEND) {
3313 tok->type = TK_EOT;
3314 return tok->type;
3315 }
3316
3317 tok->type = TK_STRING;
3318 tok->base = 0;
3319 tok->backp = p;
3320
3321 PFETCH(c);
3322 if (IS_MC_ESC_CODE(c, syn)) {
3323 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
3324
3325 tok->backp = p;
3326 PFETCH(c);
3327
3328 tok->u.c = c;
3329 tok->escaped = 1;
3330 switch (c) {
3331 case '*':
3332 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
3333 tok->type = TK_OP_REPEAT;
3334 tok->u.repeat.lower = 0;
3335 tok->u.repeat.upper = REPEAT_INFINITE;
3336 goto greedy_check;
3337 break;
3338
3339 case '+':
3340 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
3341 tok->type = TK_OP_REPEAT;
3342 tok->u.repeat.lower = 1;
3343 tok->u.repeat.upper = REPEAT_INFINITE;
3344 goto greedy_check;
3345 break;
3346
3347 case '?':
3348 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
3349 tok->type = TK_OP_REPEAT;
3350 tok->u.repeat.lower = 0;
3351 tok->u.repeat.upper = 1;
3352 greedy_check:
3353 if (!PEND && PPEEK_IS('?') &&
3354 IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) {
3355 PFETCH(c);
3356 tok->u.repeat.greedy = 0;
3357 tok->u.repeat.possessive = 0;
3358 }
3359 else {
3360 possessive_check:
3361 if (!PEND && PPEEK_IS('+') &&
3362 ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
3363 tok->type != TK_INTERVAL) ||
3364 (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
3365 tok->type == TK_INTERVAL))) {
3366 PFETCH(c);
3367 tok->u.repeat.greedy = 1;
3368 tok->u.repeat.possessive = 1;
3369 }
3370 else {
3371 tok->u.repeat.greedy = 1;
3372 tok->u.repeat.possessive = 0;
3373 }
3374 }
3375 break;
3376
3377 case '{':
3378 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
3379 r = fetch_range_quantifier(&p, end, tok, env);
3380 if (r < 0) return r; /* error */
3381 if (r == 0) goto greedy_check;
3382 else if (r == 2) { /* {n} */
3383 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3384 goto possessive_check;
3385
3386 goto greedy_check;
3387 }
3388 /* r == 1 : normal char */
3389 break;
3390
3391 case '|':
3392 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
3393 tok->type = TK_ALT;
3394 break;
3395
3396 case '(':
3397 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3398 tok->type = TK_SUBEXP_OPEN;
3399 break;
3400
3401 case ')':
3402 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3403 tok->type = TK_SUBEXP_CLOSE;
3404 break;
3405
3406 case 'w':
3407 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3408 tok->type = TK_CHAR_TYPE;
3409 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3410 tok->u.prop.not = 0;
3411 break;
3412
3413 case 'W':
3414 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3415 tok->type = TK_CHAR_TYPE;
3416 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3417 tok->u.prop.not = 1;
3418 break;
3419
3420 case 'b':
3421 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3422 tok->type = TK_ANCHOR;
3423 tok->u.anchor.subtype = ANCHOR_WORD_BOUND;
3424 tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option)
3425 && ! IS_WORD_BOUND_ALL_RANGE(env->option);
3426 break;
3427
3428 case 'B':
3429 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3430 tok->type = TK_ANCHOR;
3431 tok->u.anchor.subtype = ANCHOR_NOT_WORD_BOUND;
3432 tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option)
3433 && ! IS_WORD_BOUND_ALL_RANGE(env->option);
3434 break;
3435
3436#ifdef USE_WORD_BEGIN_END
3437 case '<':
3438 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3439 tok->type = TK_ANCHOR;
3440 tok->u.anchor.subtype = ANCHOR_WORD_BEGIN;
3441 tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option);
3442 break;
3443
3444 case '>':
3445 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3446 tok->type = TK_ANCHOR;
3447 tok->u.anchor.subtype = ANCHOR_WORD_END;
3448 tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option);
3449 break;
3450#endif
3451
3452 case 's':
3453 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3454 tok->type = TK_CHAR_TYPE;
3455 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3456 tok->u.prop.not = 0;
3457 break;
3458
3459 case 'S':
3460 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3461 tok->type = TK_CHAR_TYPE;
3462 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3463 tok->u.prop.not = 1;
3464 break;
3465
3466 case 'd':
3467 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3468 tok->type = TK_CHAR_TYPE;
3469 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3470 tok->u.prop.not = 0;
3471 break;
3472
3473 case 'D':
3474 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3475 tok->type = TK_CHAR_TYPE;
3476 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3477 tok->u.prop.not = 1;
3478 break;
3479
3480 case 'h':
3481 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3482 tok->type = TK_CHAR_TYPE;
3483 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3484 tok->u.prop.not = 0;
3485 break;
3486
3487 case 'H':
3488 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3489 tok->type = TK_CHAR_TYPE;
3490 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3491 tok->u.prop.not = 1;
3492 break;
3493
3494 case 'A':
3495 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3496 begin_buf:
3497 tok->type = TK_ANCHOR;
3498 tok->u.anchor.subtype = ANCHOR_BEGIN_BUF;
3499 break;
3500
3501 case 'Z':
3502 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3503 tok->type = TK_ANCHOR;
3504 tok->u.anchor.subtype = ANCHOR_SEMI_END_BUF;
3505 break;
3506
3507 case 'z':
3508 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3509 end_buf:
3510 tok->type = TK_ANCHOR;
3511 tok->u.anchor.subtype = ANCHOR_END_BUF;
3512 break;
3513
3514 case 'G':
3515 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
3516 tok->type = TK_ANCHOR;
3517 tok->u.anchor.subtype = ANCHOR_BEGIN_POSITION;
3518 break;
3519
3520 case '`':
3521 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3522 goto begin_buf;
3523 break;
3524
3525 case '\'':
3526 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3527 goto end_buf;
3528 break;
3529
3530 case 'x':
3531 if (PEND) break;
3532
3533 prev = p;
3534 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3535 PINC;
3536 num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc);
3537 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3538 if (!PEND) {
3539 if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK))
3540 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3541 }
3542
3543 if ((p > prev + enclen(enc, prev, end)) && !PEND && PPEEK_IS('}')) {
3544 PINC;
3545 tok->type = TK_CODE_POINT;
3546 tok->u.code = (OnigCodePoint )num;
3547 }
3548 else {
3549 /* can't read nothing or invalid format */
3550 p = prev;
3551 }
3552 }
3553 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3554 num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc);
3555 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3556 if (p == prev) { /* can't read nothing. */
3557 num = 0; /* but, it's not error */
3558 }
3559 tok->type = TK_RAW_BYTE;
3560 tok->base = 16;
3561 tok->u.c = num;
3562 }
3563 break;
3564
3565 case 'u':
3566 if (PEND) break;
3567
3568 prev = p;
3569 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3570 num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc);
3571 if (num < -1) return ONIGERR_TOO_SHORT_DIGITS;
3572 else if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3573 if (p == prev) { /* can't read nothing. */
3574 num = 0; /* but, it's not error */
3575 }
3576 tok->type = TK_CODE_POINT;
3577 tok->base = 16;
3578 tok->u.code = (OnigCodePoint )num;
3579 }
3580 break;
3581
3582 case 'o':
3583 if (PEND) break;
3584
3585 prev = p;
3586 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) {
3587 PINC;
3588 num = scan_unsigned_octal_number(&p, end, 11, enc);
3589 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3590 if (!PEND) {
3591 OnigCodePoint c = PPEEK;
3592 if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8')
3593 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3594 }
3595
3596 if ((p > prev + enclen(enc, prev, end)) && !PEND && PPEEK_IS('}')) {
3597 PINC;
3598 tok->type = TK_CODE_POINT;
3599 tok->u.code = (OnigCodePoint )num;
3600 }
3601 else {
3602 /* can't read nothing or invalid format */
3603 p = prev;
3604 }
3605 }
3606 break;
3607
3608 case '1': case '2': case '3': case '4':
3609 case '5': case '6': case '7': case '8': case '9':
3610 PUNFETCH;
3611 prev = p;
3612 num = onig_scan_unsigned_number(&p, end, enc);
3613 if (num < 0 || num > ONIG_MAX_BACKREF_NUM) {
3614 goto skip_backref;
3615 }
3616
3617 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
3618 (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */
3619 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3620 if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num]))
3621 return ONIGERR_INVALID_BACKREF;
3622 }
3623
3624 tok->type = TK_BACKREF;
3625 tok->u.backref.num = 1;
3626 tok->u.backref.ref1 = num;
3627 tok->u.backref.by_name = 0;
3628#ifdef USE_BACKREF_WITH_LEVEL
3629 tok->u.backref.exist_level = 0;
3630#endif
3631 break;
3632 }
3633
3634 skip_backref:
3635 if (c == '8' || c == '9') {
3636 /* normal char */
3637 p = prev; PINC;
3638 break;
3639 }
3640
3641 p = prev;
3642 /* fall through */
3643 case '0':
3644 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3645 prev = p;
3646 num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc);
3647 if (num < 0 || 0xff < num) return ONIGERR_TOO_BIG_NUMBER;
3648 if (p == prev) { /* can't read nothing. */
3649 num = 0; /* but, it's not error */
3650 }
3651 tok->type = TK_RAW_BYTE;
3652 tok->base = 8;
3653 tok->u.c = num;
3654 }
3655 else if (c != '0') {
3656 PINC;
3657 }
3658 break;
3659
3660#ifdef USE_NAMED_GROUP
3661 case 'k':
3662 if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
3663 PFETCH(c);
3664 if (c == '<' || c == '\'') {
3665 r = fetch_named_backref_token(c, tok, &p, end, env);
3666 if (r < 0) return r;
3667 }
3668 else {
3669 PUNFETCH;
3670 onig_syntax_warn(env, "invalid back reference");
3671 }
3672 }
3673 break;
3674#endif
3675
3676#if defined(USE_SUBEXP_CALL) || defined(USE_NAMED_GROUP)
3677 case 'g':
3678# ifdef USE_NAMED_GROUP
3679 if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_BRACE_BACKREF)) {
3680 PFETCH(c);
3681 if (c == '{') {
3682 r = fetch_named_backref_token(c, tok, &p, end, env);
3683 if (r < 0) return r;
3684 }
3685 else
3686 PUNFETCH;
3687 }
3688# endif
3689# ifdef USE_SUBEXP_CALL
3690 if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
3691 PFETCH(c);
3692 if (c == '<' || c == '\'') {
3693 int gnum = -1, rel = 0;
3694 UChar* name_end;
3695 OnigCodePoint cnext;
3696
3697 cnext = PPEEK;
3698 if (cnext == '0') {
3699 PINC;
3700 if (PPEEK_IS(get_name_end_code_point(c))) { /* \g<0>, \g'0' */
3701 PINC;
3702 name_end = p;
3703 gnum = 0;
3704 }
3705 }
3706 else if (cnext == '+') {
3707 PINC;
3708 rel = 1;
3709 }
3710 prev = p;
3711 if (gnum < 0) {
3712 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1);
3713 if (r < 0) return r;
3714 }
3715
3716 tok->type = TK_CALL;
3717 tok->u.call.name = prev;
3718 tok->u.call.name_end = name_end;
3719 tok->u.call.gnum = gnum;
3720 tok->u.call.rel = rel;
3721 }
3722 else {
3723 onig_syntax_warn(env, "invalid subexp call");
3724 PUNFETCH;
3725 }
3726 }
3727# endif
3728 break;
3729#endif
3730
3731 case 'Q':
3732 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) {
3733 tok->type = TK_QUOTE_OPEN;
3734 }
3735 break;
3736
3737 case 'p':
3738 case 'P':
3739 if (PPEEK_IS('{') &&
3740 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
3741 PINC;
3742 tok->type = TK_CHAR_PROPERTY;
3743 tok->u.prop.not = (c == 'P' ? 1 : 0);
3744
3745 if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
3746 PFETCH(c);
3747 if (c == '^') {
3748 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3749 }
3750 else
3751 PUNFETCH;
3752 }
3753 }
3754 else {
3755 onig_syntax_warn(env, "invalid Unicode Property \\%c", c);
3756 }
3757 break;
3758
3759 case 'R':
3760 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK)) {
3761 tok->type = TK_LINEBREAK;
3762 }
3763 break;
3764
3765 case 'X':
3766 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER)) {
3767 tok->type = TK_EXTENDED_GRAPHEME_CLUSTER;
3768 }
3769 break;
3770
3771 case 'K':
3772 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP)) {
3773 tok->type = TK_KEEP;
3774 }
3775 break;
3776
3777 default:
3778 {
3779 OnigCodePoint c2;
3780
3781 PUNFETCH;
3782 num = fetch_escaped_value(&p, end, env, &c2);
3783 if (num < 0) return num;
3784 /* set_raw: */
3785 if ((OnigCodePoint )tok->u.c != c2) {
3786 tok->type = TK_CODE_POINT;
3787 tok->u.code = (OnigCodePoint )c2;
3788 }
3789 else { /* string */
3790 p = tok->backp + enclen(enc, tok->backp, end);
3791 }
3792 }
3793 break;
3794 }
3795 }
3796 else {
3797 tok->u.c = c;
3798 tok->escaped = 0;
3799
3800#ifdef USE_VARIABLE_META_CHARS
3801 if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
3802 IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
3803 if (c == MC_ANYCHAR(syn))
3804 goto any_char;
3805 else if (c == MC_ANYTIME(syn))
3806 goto anytime;
3807 else if (c == MC_ZERO_OR_ONE_TIME(syn))
3808 goto zero_or_one_time;
3809 else if (c == MC_ONE_OR_MORE_TIME(syn))
3810 goto one_or_more_time;
3811 else if (c == MC_ANYCHAR_ANYTIME(syn)) {
3812 tok->type = TK_ANYCHAR_ANYTIME;
3813 goto out;
3814 }
3815 }
3816#endif
3817
3818 switch (c) {
3819 case '.':
3820 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break;
3821#ifdef USE_VARIABLE_META_CHARS
3822 any_char:
3823#endif
3824 tok->type = TK_ANYCHAR;
3825 break;
3826
3827 case '*':
3828 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break;
3829#ifdef USE_VARIABLE_META_CHARS
3830 anytime:
3831#endif
3832 tok->type = TK_OP_REPEAT;
3833 tok->u.repeat.lower = 0;
3834 tok->u.repeat.upper = REPEAT_INFINITE;
3835 goto greedy_check;
3836 break;
3837
3838 case '+':
3839 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break;
3840#ifdef USE_VARIABLE_META_CHARS
3841 one_or_more_time:
3842#endif
3843 tok->type = TK_OP_REPEAT;
3844 tok->u.repeat.lower = 1;
3845 tok->u.repeat.upper = REPEAT_INFINITE;
3846 goto greedy_check;
3847 break;
3848
3849 case '?':
3850 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break;
3851#ifdef USE_VARIABLE_META_CHARS
3852 zero_or_one_time:
3853#endif
3854 tok->type = TK_OP_REPEAT;
3855 tok->u.repeat.lower = 0;
3856 tok->u.repeat.upper = 1;
3857 goto greedy_check;
3858 break;
3859
3860 case '{':
3861 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
3862 r = fetch_range_quantifier(&p, end, tok, env);
3863 if (r < 0) return r; /* error */
3864 if (r == 0) goto greedy_check;
3865 else if (r == 2) { /* {n} */
3866 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3867 goto possessive_check;
3868
3869 goto greedy_check;
3870 }
3871 /* r == 1 : normal char */
3872 break;
3873
3874 case '|':
3875 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break;
3876 tok->type = TK_ALT;
3877 break;
3878
3879 case '(':
3880 if (PPEEK_IS('?') &&
3881 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
3882 PINC;
3883 if (PPEEK_IS('#')) {
3884 PFETCH(c);
3885 while (1) {
3886 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
3887 PFETCH(c);
3888 if (c == MC_ESC(syn)) {
3889 if (!PEND) PFETCH(c);
3890 }
3891 else {
3892 if (c == ')') break;
3893 }
3894 }
3895 goto start;
3896 }
3897#ifdef USE_PERL_SUBEXP_CALL
3898 /* (?&name), (?n), (?R), (?0), (?+n), (?-n) */
3899 c = PPEEK;
3900 if ((c == '&' || c == 'R' || ONIGENC_IS_CODE_DIGIT(enc, c)) &&
3901 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_SUBEXP_CALL)) {
3902 /* (?&name), (?n), (?R), (?0) */
3903 int gnum;
3904 UChar *name;
3905 UChar *name_end;
3906
3907 if (c == 'R' || c == '0') {
3908 PINC; /* skip 'R' / '0' */
3909 if (!PPEEK_IS(')')) return ONIGERR_INVALID_GROUP_NAME;
3910 PINC; /* skip ')' */
3911 name_end = name = p;
3912 gnum = 0;
3913 }
3914 else {
3915 int numref = 1;
3916 if (c == '&') { /* (?&name) */
3917 PINC;
3918 numref = 0; /* don't allow number name */
3919 }
3920 name = p;
3921 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, numref);
3922 if (r < 0) return r;
3923 }
3924
3925 tok->type = TK_CALL;
3926 tok->u.call.name = name;
3927 tok->u.call.name_end = name_end;
3928 tok->u.call.gnum = gnum;
3929 tok->u.call.rel = 0;
3930 break;
3931 }
3932 else if ((c == '-' || c == '+') &&
3933 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_SUBEXP_CALL)) {
3934 /* (?+n), (?-n) */
3935 int gnum;
3936 UChar *name;
3937 UChar *name_end;
3938 OnigCodePoint cnext;
3939 PFETCH_READY;
3940
3941 PINC; /* skip '-' / '+' */
3942 cnext = PPEEK;
3943 if (ONIGENC_IS_CODE_DIGIT(enc, cnext)) {
3944 if (c == '-') PUNFETCH;
3945 name = p;
3946 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, 1);
3947 if (r < 0) return r;
3948
3949 tok->type = TK_CALL;
3950 tok->u.call.name = name;
3951 tok->u.call.name_end = name_end;
3952 tok->u.call.gnum = gnum;
3953 tok->u.call.rel = 1;
3954 break;
3955 }
3956 }
3957#endif /* USE_PERL_SUBEXP_CALL */
3958#ifdef USE_CAPITAL_P_NAMED_GROUP
3959 if (PPEEK_IS('P') &&
3960 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP)) {
3961 int gnum;
3962 UChar *name;
3963 UChar *name_end;
3964 PFETCH_READY;
3965
3966 PINC; /* skip 'P' */
3967 if (PEND) return ONIGERR_UNDEFINED_GROUP_OPTION;
3968 PFETCH(c);
3969 if (c == '=') { /* (?P=name): backref */
3970 r = fetch_named_backref_token((OnigCodePoint )'(', tok, &p, end, env);
3971 if (r < 0) return r;
3972 break;
3973 }
3974 else if (c == '>') { /* (?P>name): subexp call */
3975 name = p;
3976 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, 0);
3977 if (r < 0) return r;
3978
3979 tok->type = TK_CALL;
3980 tok->u.call.name = name;
3981 tok->u.call.name_end = name_end;
3982 tok->u.call.gnum = gnum;
3983 tok->u.call.rel = 0;
3984 break;
3985 }
3986 }
3987#endif /* USE_CAPITAL_P_NAMED_GROUP */
3988 PUNFETCH;
3989 }
3990
3991 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3992 tok->type = TK_SUBEXP_OPEN;
3993 break;
3994
3995 case ')':
3996 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3997 tok->type = TK_SUBEXP_CLOSE;
3998 break;
3999
4000 case '^':
4001 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
4002 tok->type = TK_ANCHOR;
4003 tok->u.anchor.subtype = (IS_SINGLELINE(env->option)
4004 ? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE);
4005 break;
4006
4007 case '$':
4008 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
4009 tok->type = TK_ANCHOR;
4010 tok->u.anchor.subtype = (IS_SINGLELINE(env->option)
4011 ? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE);
4012 break;
4013
4014 case '[':
4015 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break;
4016 tok->type = TK_CC_OPEN;
4017 break;
4018
4019 case ']':
4020 if (*src > env->pattern) /* /].../ is allowed. */
4021 CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
4022 break;
4023
4024 case '#':
4025 if (IS_EXTEND(env->option)) {
4026 while (!PEND) {
4027 PFETCH(c);
4028 if (ONIGENC_IS_CODE_NEWLINE(enc, c))
4029 break;
4030 }
4031 goto start;
4032 break;
4033 }
4034 break;
4035
4036 case ' ': case '\t': case '\n': case '\r': case '\f':
4037 if (IS_EXTEND(env->option))
4038 goto start;
4039 break;
4040
4041 default:
4042 /* string */
4043 break;
4044 }
4045 }
4046
4047#ifdef USE_VARIABLE_META_CHARS
4048 out:
4049#endif
4050 *src = p;
4051 return tok->type;
4052}
4053
4054static int
4055add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not,
4056 ScanEnv* env,
4057 OnigCodePoint sb_out, const OnigCodePoint mbr[])
4058{
4059 int i, r;
4060 OnigCodePoint j;
4061
4062 int n = ONIGENC_CODE_RANGE_NUM(mbr);
4063
4064 if (not == 0) {
4065 for (i = 0; i < n; i++) {
4066 for (j = ONIGENC_CODE_RANGE_FROM(mbr, i);
4067 j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
4068 if (j >= sb_out) {
4069 if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
4070 r = add_code_range_to_buf(&(cc->mbuf), env, j,
4071 ONIGENC_CODE_RANGE_TO(mbr, i));
4072 if (r != 0) return r;
4073 i++;
4074 }
4075
4076 goto sb_end;
4077 }
4078 BITSET_SET_BIT_CHKDUP(cc->bs, j);
4079 }
4080 }
4081
4082 sb_end:
4083 for ( ; i < n; i++) {
4084 r = add_code_range_to_buf(&(cc->mbuf), env,
4085 ONIGENC_CODE_RANGE_FROM(mbr, i),
4086 ONIGENC_CODE_RANGE_TO(mbr, i));
4087 if (r != 0) return r;
4088 }
4089 }
4090 else {
4091 OnigCodePoint prev = 0;
4092
4093 for (i = 0; i < n; i++) {
4094 for (j = prev;
4095 j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) {
4096 if (j >= sb_out) {
4097 goto sb_end2;
4098 }
4099 BITSET_SET_BIT_CHKDUP(cc->bs, j);
4100 }
4101 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
4102 }
4103 for (j = prev; j < sb_out; j++) {
4104 BITSET_SET_BIT_CHKDUP(cc->bs, j);
4105 }
4106
4107 sb_end2:
4108 prev = sb_out;
4109
4110 for (i = 0; i < n; i++) {
4111 if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
4112 r = add_code_range_to_buf(&(cc->mbuf), env, prev,
4113 ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
4114 if (r != 0) return r;
4115 }
4116 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
4117 }
4118 if (prev < 0x7fffffff) {
4119 r = add_code_range_to_buf(&(cc->mbuf), env, prev, 0x7fffffff);
4120 if (r != 0) return r;
4121 }
4122 }
4123
4124 return 0;
4125}
4126
4127static int
4128add_ctype_to_cc(CClassNode* cc, int ctype, int not, int ascii_range, ScanEnv* env)
4129{
4130 int maxcode;
4131 int c, r;
4132 const OnigCodePoint *ranges;
4133 OnigCodePoint sb_out;
4134 OnigEncoding enc = env->enc;
4135
4136 r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
4137 if (r == 0) {
4138 if (ascii_range) {
4139 CClassNode ccwork;
4140 initialize_cclass(&ccwork);
4141 r = add_ctype_to_cc_by_range(&ccwork, ctype, not, env, sb_out,
4142 ranges);
4143 if (r == 0) {
4144 if (not) {
4145 r = add_code_range_to_buf0(&(ccwork.mbuf), env, 0x80, ONIG_LAST_CODE_POINT, FALSE);
4146 }
4147 else {
4148 CClassNode ccascii;
4149 initialize_cclass(&ccascii);
4150 if (ONIGENC_MBC_MINLEN(env->enc) > 1) {
4151 r = add_code_range(&(ccascii.mbuf), env, 0x00, 0x7F);
4152 }
4153 else {
4154 bitset_set_range(env, ccascii.bs, 0x00, 0x7F);
4155 r = 0;
4156 }
4157 if (r == 0) {
4158 r = and_cclass(&ccwork, &ccascii, env);
4159 }
4160 if (IS_NOT_NULL(ccascii.mbuf)) bbuf_free(ccascii.mbuf);
4161 }
4162 if (r == 0) {
4163 r = or_cclass(cc, &ccwork, env);
4164 }
4165 if (IS_NOT_NULL(ccwork.mbuf)) bbuf_free(ccwork.mbuf);
4166 }
4167 }
4168 else {
4169 r = add_ctype_to_cc_by_range(cc, ctype, not, env, sb_out, ranges);
4170 }
4171 return r;
4172 }
4173 else if (r != ONIG_NO_SUPPORT_CONFIG) {
4174 return r;
4175 }
4176
4177 maxcode = ascii_range ? 0x80 : SINGLE_BYTE_SIZE;
4178 r = 0;
4179 switch (ctype) {
4180 case ONIGENC_CTYPE_ALPHA:
4181 case ONIGENC_CTYPE_BLANK:
4182 case ONIGENC_CTYPE_CNTRL:
4183 case ONIGENC_CTYPE_DIGIT:
4184 case ONIGENC_CTYPE_LOWER:
4185 case ONIGENC_CTYPE_PUNCT:
4186 case ONIGENC_CTYPE_SPACE:
4187 case ONIGENC_CTYPE_UPPER:
4188 case ONIGENC_CTYPE_XDIGIT:
4189 case ONIGENC_CTYPE_ASCII:
4190 case ONIGENC_CTYPE_ALNUM:
4191 if (not != 0) {
4192 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
4193 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
4194 BITSET_SET_BIT_CHKDUP(cc->bs, c);
4195 }
4196 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
4197 }
4198 else {
4199 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
4200 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
4201 BITSET_SET_BIT_CHKDUP(cc->bs, c);
4202 }
4203 }
4204 break;
4205
4206 case ONIGENC_CTYPE_GRAPH:
4207 case ONIGENC_CTYPE_PRINT:
4208 if (not != 0) {
4209 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
4210 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)
4211 || c >= maxcode)
4212 BITSET_SET_BIT_CHKDUP(cc->bs, c);
4213 }
4214 if (ascii_range)
4215 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
4216 }
4217 else {
4218 for (c = 0; c < maxcode; c++) {
4219 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
4220 BITSET_SET_BIT_CHKDUP(cc->bs, c);
4221 }
4222 if (! ascii_range)
4223 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
4224 }
4225 break;
4226
4227 case ONIGENC_CTYPE_WORD:
4228 if (not == 0) {
4229 for (c = 0; c < maxcode; c++) {
4230 if (ONIGENC_IS_CODE_WORD(enc, c)) BITSET_SET_BIT_CHKDUP(cc->bs, c);
4231 }
4232 if (! ascii_range)
4233 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
4234 }
4235 else {
4236 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
4237 if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* check invalid code point */
4238 && (! ONIGENC_IS_CODE_WORD(enc, c) || c >= maxcode))
4239 BITSET_SET_BIT_CHKDUP(cc->bs, c);
4240 }
4241 if (ascii_range)
4242 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
4243 }
4244 break;
4245
4246 default:
4247 return ONIGERR_PARSER_BUG;
4248 break;
4249 }
4250
4251 return r;
4252}
4253
4254static int
4255parse_posix_bracket(CClassNode* cc, CClassNode* asc_cc,
4256 UChar** src, UChar* end, ScanEnv* env)
4257{
4258#define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20
4259#define POSIX_BRACKET_NAME_MIN_LEN 4
4260
4261 static const PosixBracketEntryType PBS[] = {
4262 POSIX_BRACKET_ENTRY_INIT("alnum", ONIGENC_CTYPE_ALNUM),
4263 POSIX_BRACKET_ENTRY_INIT("alpha", ONIGENC_CTYPE_ALPHA),
4264 POSIX_BRACKET_ENTRY_INIT("blank", ONIGENC_CTYPE_BLANK),
4265 POSIX_BRACKET_ENTRY_INIT("cntrl", ONIGENC_CTYPE_CNTRL),
4266 POSIX_BRACKET_ENTRY_INIT("digit", ONIGENC_CTYPE_DIGIT),
4267 POSIX_BRACKET_ENTRY_INIT("graph", ONIGENC_CTYPE_GRAPH),
4268 POSIX_BRACKET_ENTRY_INIT("lower", ONIGENC_CTYPE_LOWER),
4269 POSIX_BRACKET_ENTRY_INIT("print", ONIGENC_CTYPE_PRINT),
4270 POSIX_BRACKET_ENTRY_INIT("punct", ONIGENC_CTYPE_PUNCT),
4271 POSIX_BRACKET_ENTRY_INIT("space", ONIGENC_CTYPE_SPACE),
4272 POSIX_BRACKET_ENTRY_INIT("upper", ONIGENC_CTYPE_UPPER),
4273 POSIX_BRACKET_ENTRY_INIT("xdigit", ONIGENC_CTYPE_XDIGIT),
4274 POSIX_BRACKET_ENTRY_INIT("ascii", ONIGENC_CTYPE_ASCII),
4275 POSIX_BRACKET_ENTRY_INIT("word", ONIGENC_CTYPE_WORD),
4276 };
4277
4278 const PosixBracketEntryType *pb;
4279 int not, i, r;
4280 int ascii_range;
4281 OnigCodePoint c;
4282 OnigEncoding enc = env->enc;
4283 UChar *p = *src;
4284
4285 if (PPEEK_IS('^')) {
4286 PINC_S;
4287 not = 1;
4288 }
4289 else
4290 not = 0;
4291
4292 if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3)
4293 goto not_posix_bracket;
4294
4295 ascii_range = IS_ASCII_RANGE(env->option) &&
4296 ! IS_POSIX_BRACKET_ALL_RANGE(env->option);
4297 for (pb = PBS; pb < PBS + numberof(PBS); pb++) {
4298 if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
4299 p = (UChar* )onigenc_step(enc, p, end, pb->len);
4300 if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
4301 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
4302
4303 r = add_ctype_to_cc(cc, pb->ctype, not, ascii_range, env);
4304 if (r != 0) return r;
4305
4306 if (IS_NOT_NULL(asc_cc)) {
4307 if (pb->ctype != ONIGENC_CTYPE_WORD &&
4308 pb->ctype != ONIGENC_CTYPE_ASCII &&
4309 !ascii_range)
4310 r = add_ctype_to_cc(asc_cc, pb->ctype, not, ascii_range, env);
4311 if (r != 0) return r;
4312 }
4313
4314 PINC_S; PINC_S;
4315 *src = p;
4316 return 0;
4317 }
4318 }
4319
4320 not_posix_bracket:
4321 c = 0;
4322 i = 0;
4323 while (!PEND && ((c = PPEEK) != ':') && c != ']') {
4324 PINC_S;
4325 if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
4326 }
4327 if (c == ':' && ! PEND) {
4328 PINC_S;
4329 if (! PEND) {
4330 PFETCH_S(c);
4331 if (c == ']')
4332 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
4333 }
4334 }
4335
4336 return 1; /* 1: is not POSIX bracket, but no error. */
4337}
4338
4339static int
4340fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
4341{
4342 int r;
4343 OnigCodePoint c;
4344 OnigEncoding enc = env->enc;
4345 UChar *prev, *start, *p = *src;
4346
4347 r = 0;
4348 start = prev = p;
4349
4350 while (!PEND) {
4351 prev = p;
4352 PFETCH_S(c);
4353 if (c == '}') {
4354 r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev);
4355 if (r < 0) break;
4356
4357 *src = p;
4358 return r;
4359 }
4360 else if (c == '(' || c == ')' || c == '{' || c == '|') {
4361 r = ONIGERR_INVALID_CHAR_PROPERTY_NAME;
4362 break;
4363 }
4364 }
4365
4366 onig_scan_env_set_error_string(env, r, *src, prev);
4367 return r;
4368}
4369
4370static int cclass_case_fold(Node** np, CClassNode* cc, CClassNode* asc_cc, ScanEnv* env);
4371
4372static int
4373parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end,
4374 ScanEnv* env)
4375{
4376 int r, ctype;
4377 CClassNode* cc;
4378
4379 ctype = fetch_char_property_to_ctype(src, end, env);
4380 if (ctype < 0) return ctype;
4381
4382 *np = node_new_cclass();
4383 CHECK_NULL_RETURN_MEMERR(*np);
4384 cc = NCCLASS(*np);
4385 r = add_ctype_to_cc(cc, ctype, 0, 0, env);
4386 if (r != 0) return r;
4387 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
4388
4389 if (IS_IGNORECASE(env->option)) {
4390 if (ctype != ONIGENC_CTYPE_ASCII)
4391 r = cclass_case_fold(np, cc, cc, env);
4392 }
4393 return r;
4394}
4395
4396
/* Parser state while reading the members of a character class. */
enum CCSTATE {
  CCS_VALUE,     /* a value or class has been read and is pending */
  CCS_RANGE,     /* a range operator followed a value; waiting for the end */
  CCS_COMPLETE,  /* a range has just been closed */
  CCS_START      /* nothing read yet (class start, or right after "&&") */
};

/* Kind of the pending class member. */
enum CCVALTYPE {
  CCV_SB,          /* single value kept in the bitset */
  CCV_CODE_POINT,  /* single value kept in the code-range buffer */
  CCV_CLASS        /* a whole class (ctype, POSIX bracket or property) */
};
4409
4410static int
4411next_state_class(CClassNode* cc, CClassNode* asc_cc,
4412 OnigCodePoint* vs, enum CCVALTYPE* type,
4413 enum CCSTATE* state, ScanEnv* env)
4414{
4415 int r;
4416
4417 if (*state == CCS_RANGE)
4418 return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;
4419
4420 if (*state == CCS_VALUE && *type != CCV_CLASS) {
4421 if (*type == CCV_SB) {
4422 BITSET_SET_BIT_CHKDUP(cc->bs, (int )(*vs));
4423 if (IS_NOT_NULL(asc_cc))
4424 BITSET_SET_BIT(asc_cc->bs, (int )(*vs));
4425 }
4426 else if (*type == CCV_CODE_POINT) {
4427 r = add_code_range(&(cc->mbuf), env, *vs, *vs);
4428 if (r < 0) return r;
4429 if (IS_NOT_NULL(asc_cc)) {
4430 r = add_code_range0(&(asc_cc->mbuf), env, *vs, *vs, 0);
4431 if (r < 0) return r;
4432 }
4433 }
4434 }
4435
4436 *state = CCS_VALUE;
4437 *type = CCV_CLASS;
4438 return 0;
4439}
4440
4441static int
4442next_state_val(CClassNode* cc, CClassNode* asc_cc,
4443 OnigCodePoint *from, OnigCodePoint to,
4444 int* from_israw, int to_israw,
4445 enum CCVALTYPE intype, enum CCVALTYPE* type,
4446 enum CCSTATE* state, ScanEnv* env)
4447{
4448 int r;
4449
4450 switch (*state) {
4451 case CCS_VALUE:
4452 if (*type == CCV_SB) {
4453 BITSET_SET_BIT_CHKDUP(cc->bs, (int )(*from));
4454 if (IS_NOT_NULL(asc_cc))
4455 BITSET_SET_BIT(asc_cc->bs, (int )(*from));
4456 }
4457 else if (*type == CCV_CODE_POINT) {
4458 r = add_code_range(&(cc->mbuf), env, *from, *from);
4459 if (r < 0) return r;
4460 if (IS_NOT_NULL(asc_cc)) {
4461 r = add_code_range0(&(asc_cc->mbuf), env, *from, *from, 0);
4462 if (r < 0) return r;
4463 }
4464 }
4465 break;
4466
4467 case CCS_RANGE:
4468 if (intype == *type) {
4469 if (intype == CCV_SB) {
4470 if (*from > 0xff || to > 0xff)
4471 return ONIGERR_INVALID_CODE_POINT_VALUE;
4472
4473 if (*from > to) {
4474 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4475 goto ccs_range_end;
4476 else
4477 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4478 }
4479 bitset_set_range(env, cc->bs, (int )*from, (int )to);
4480 if (IS_NOT_NULL(asc_cc))
4481 bitset_set_range(env, asc_cc->bs, (int )*from, (int )to);
4482 }
4483 else {
4484 r = add_code_range(&(cc->mbuf), env, *from, to);
4485 if (r < 0) return r;
4486 if (IS_NOT_NULL(asc_cc)) {
4487 r = add_code_range0(&(asc_cc->mbuf), env, *from, to, 0);
4488 if (r < 0) return r;
4489 }
4490 }
4491 }
4492 else {
4493 if (*from > to) {
4494 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4495 goto ccs_range_end;
4496 else
4497 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4498 }
4499 bitset_set_range(env, cc->bs, (int )*from, (int )(to < 0xff ? to : 0xff));
4500 r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*from, to);
4501 if (r < 0) return r;
4502 if (IS_NOT_NULL(asc_cc)) {
4503 bitset_set_range(env, asc_cc->bs, (int )*from, (int )(to < 0xff ? to : 0xff));
4504 r = add_code_range0(&(asc_cc->mbuf), env, (OnigCodePoint )*from, to, 0);
4505 if (r < 0) return r;
4506 }
4507 }
4508 ccs_range_end:
4509 *state = CCS_COMPLETE;
4510 break;
4511
4512 case CCS_COMPLETE:
4513 case CCS_START:
4514 *state = CCS_VALUE;
4515 break;
4516
4517 default:
4518 break;
4519 }
4520
4521 *from_israw = to_israw;
4522 *from = to;
4523 *type = intype;
4524 return 0;
4525}
4526
4527static int
4528code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
4529 ScanEnv* env)
4530{
4531 int in_esc;
4532 OnigCodePoint code;
4533 OnigEncoding enc = env->enc;
4534 UChar* p = from;
4535
4536 in_esc = 0;
4537 while (! PEND) {
4538 if (ignore_escaped && in_esc) {
4539 in_esc = 0;
4540 }
4541 else {
4542 PFETCH_S(code);
4543 if (code == c) return 1;
4544 if (code == MC_ESC(env->syntax)) in_esc = 1;
4545 }
4546 }
4547 return 0;
4548}
4549
4550static int
4551parse_char_class(Node** np, Node** asc_np, OnigToken* tok, UChar** src, UChar* end,
4552 ScanEnv* env)
4553{
4554 int r, neg, len, fetched, and_start;
4555 OnigCodePoint v, vs;
4556 UChar *p;
4557 Node* node;
4558 Node* asc_node;
4559 CClassNode *cc, *prev_cc;
4560 CClassNode *asc_cc, *asc_prev_cc;
4561 CClassNode work_cc, asc_work_cc;
4562
4563 enum CCSTATE state;
4564 enum CCVALTYPE val_type, in_type;
4565 int val_israw, in_israw;
4566
4567 *np = *asc_np = NULL_NODE;
4568 env->parse_depth++;
4569 if (env->parse_depth > ParseDepthLimit)
4570 return ONIGERR_PARSE_DEPTH_LIMIT_OVER;
4571 prev_cc = asc_prev_cc = (CClassNode* )NULL;
4572 r = fetch_token_in_cc(tok, src, end, env);
4573 if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) {
4574 neg = 1;
4575 r = fetch_token_in_cc(tok, src, end, env);
4576 }
4577 else {
4578 neg = 0;
4579 }
4580
4581 if (r < 0) return r;
4582 if (r == TK_CC_CLOSE) {
4583 if (! code_exist_check((OnigCodePoint )']',
4584 *src, env->pattern_end, 1, env))
4585 return ONIGERR_EMPTY_CHAR_CLASS;
4586
4587 CC_ESC_WARN(env, (UChar* )"]");
4588 r = tok->type = TK_CHAR; /* allow []...] */
4589 }
4590
4591 *np = node = node_new_cclass();
4592 CHECK_NULL_RETURN_MEMERR(node);
4593 cc = NCCLASS(node);
4594
4595 if (IS_IGNORECASE(env->option)) {
4596 *asc_np = asc_node = node_new_cclass();
4597 CHECK_NULL_RETURN_MEMERR(asc_node);
4598 asc_cc = NCCLASS(asc_node);
4599 }
4600 else {
4601 asc_node = NULL_NODE;
4602 asc_cc = NULL;
4603 }
4604
4605 and_start = 0;
4606 state = CCS_START;
4607 p = *src;
4608 while (r != TK_CC_CLOSE) {
4609 fetched = 0;
4610 switch (r) {
4611 case TK_CHAR:
4612 if ((tok->u.code >= SINGLE_BYTE_SIZE) ||
4613 (len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c)) > 1) {
4614 in_type = CCV_CODE_POINT;
4615 }
4616 else if (len < 0) {
4617 r = len;
4618 goto err;
4619 }
4620 else {
4621 sb_char:
4622 in_type = CCV_SB;
4623 }
4624 v = (OnigCodePoint )tok->u.c;
4625 in_israw = 0;
4626 goto val_entry2;
4627 break;
4628
4629 case TK_RAW_BYTE:
4630 /* tok->base != 0 : octal or hexadec. */
4631 if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) {
4632 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4633 UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
4634 UChar* psave = p;
4635 int i, base = tok->base;
4636
4637 buf[0] = (UChar )tok->u.c;
4638 for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
4639 r = fetch_token_in_cc(tok, &p, end, env);
4640 if (r < 0) goto err;
4641 if (r != TK_RAW_BYTE || tok->base != base) {
4642 fetched = 1;
4643 break;
4644 }
4645 buf[i] = (UChar )tok->u.c;
4646 }
4647
4648 if (i < ONIGENC_MBC_MINLEN(env->enc)) {
4649 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4650 goto err;
4651 }
4652
4653 len = enclen(env->enc, buf, buf + i);
4654 if (i < len) {
4655 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4656 goto err;
4657 }
4658 else if (i > len) { /* fetch back */
4659 p = psave;
4660 for (i = 1; i < len; i++) {
4661 (void)fetch_token_in_cc(tok, &p, end, env);
4662 /* no need to check the retun value (already checked above) */
4663 }
4664 fetched = 0;
4665 }
4666
4667 if (i == 1) {
4668 v = (OnigCodePoint )buf[0];
4669 goto raw_single;
4670 }
4671 else {
4672 v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
4673 in_type = CCV_CODE_POINT;
4674 }
4675 }
4676 else {
4677 v = (OnigCodePoint )tok->u.c;
4678 raw_single:
4679 in_type = CCV_SB;
4680 }
4681 in_israw = 1;
4682 goto val_entry2;
4683 break;
4684
4685 case TK_CODE_POINT:
4686 v = tok->u.code;
4687 in_israw = 1;
4688 val_entry:
4689 len = ONIGENC_CODE_TO_MBCLEN(env->enc, v);
4690 if (len < 0) {
4691 r = len;
4692 goto err;
4693 }
4694 in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT);
4695 val_entry2:
4696 r = next_state_val(cc, asc_cc, &vs, v, &val_israw, in_israw, in_type, &val_type,
4697 &state, env);
4698 if (r != 0) goto err;
4699 break;
4700
4701 case TK_POSIX_BRACKET_OPEN:
4702 r = parse_posix_bracket(cc, asc_cc, &p, end, env);
4703 if (r < 0) goto err;
4704 if (r == 1) { /* is not POSIX bracket */
4705 CC_ESC_WARN(env, (UChar* )"[");
4706 p = tok->backp;
4707 v = (OnigCodePoint )tok->u.c;
4708 in_israw = 0;
4709 goto val_entry;
4710 }
4711 goto next_class;
4712 break;
4713
4714 case TK_CHAR_TYPE:
4715 r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not,
4716 IS_ASCII_RANGE(env->option), env);
4717 if (r != 0) return r;
4718 if (IS_NOT_NULL(asc_cc)) {
4719 if (tok->u.prop.ctype != ONIGENC_CTYPE_WORD)
4720 r = add_ctype_to_cc(asc_cc, tok->u.prop.ctype, tok->u.prop.not,
4721 IS_ASCII_RANGE(env->option), env);
4722 if (r != 0) return r;
4723 }
4724
4725 next_class:
4726 r = next_state_class(cc, asc_cc, &vs, &val_type, &state, env);
4727 if (r != 0) goto err;
4728 break;
4729
4730 case TK_CHAR_PROPERTY:
4731 {
4732 int ctype;
4733
4734 ctype = fetch_char_property_to_ctype(&p, end, env);
4735 if (ctype < 0) return ctype;
4736 r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, 0, env);
4737 if (r != 0) return r;
4738 if (IS_NOT_NULL(asc_cc)) {
4739 if (ctype != ONIGENC_CTYPE_ASCII)
4740 r = add_ctype_to_cc(asc_cc, ctype, tok->u.prop.not, 0, env);
4741 if (r != 0) return r;
4742 }
4743 goto next_class;
4744 }
4745 break;
4746
4747 case TK_CC_RANGE:
4748 if (state == CCS_VALUE) {
4749 r = fetch_token_in_cc(tok, &p, end, env);
4750 if (r < 0) goto err;
4751 fetched = 1;
4752 if (r == TK_CC_CLOSE) { /* allow [x-] */
4753 range_end_val:
4754 v = (OnigCodePoint )'-';
4755 in_israw = 0;
4756 goto val_entry;
4757 }
4758 else if (r == TK_CC_AND) {
4759 CC_ESC_WARN(env, (UChar* )"-");
4760 goto range_end_val;
4761 }
4762
4763 if (val_type == CCV_CLASS) {
4764 r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
4765 goto err;
4766 }
4767
4768 state = CCS_RANGE;
4769 }
4770 else if (state == CCS_START) {
4771 /* [-xa] is allowed */
4772 v = (OnigCodePoint )tok->u.c;
4773 in_israw = 0;
4774
4775 r = fetch_token_in_cc(tok, &p, end, env);
4776 if (r < 0) goto err;
4777 fetched = 1;
4778 /* [--x] or [a&&-x] is warned. */
4779 if (r == TK_CC_RANGE || and_start != 0)
4780 CC_ESC_WARN(env, (UChar* )"-");
4781
4782 goto val_entry;
4783 }
4784 else if (state == CCS_RANGE) {
4785 CC_ESC_WARN(env, (UChar* )"-");
4786 goto sb_char; /* [!--x] is allowed */
4787 }
4788 else { /* CCS_COMPLETE */
4789 r = fetch_token_in_cc(tok, &p, end, env);
4790 if (r < 0) goto err;
4791 fetched = 1;
4792 if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */
4793 else if (r == TK_CC_AND) {
4794 CC_ESC_WARN(env, (UChar* )"-");
4795 goto range_end_val;
4796 }
4797
4798 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
4799 CC_ESC_WARN(env, (UChar* )"-");
4800 goto range_end_val; /* [0-9-a] is allowed as [0-9\-a] */
4801 }
4802 r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
4803 goto err;
4804 }
4805 break;
4806
4807 case TK_CC_CC_OPEN: /* [ */
4808 {
4809 Node *anode, *aasc_node;
4810 CClassNode* acc;
4811
4812 r = parse_char_class(&anode, &aasc_node, tok, &p, end, env);
4813 if (r == 0) {
4814 acc = NCCLASS(anode);
4815 r = or_cclass(cc, acc, env);
4816 }
4817 if (r == 0 && IS_NOT_NULL(aasc_node)) {
4818 acc = NCCLASS(aasc_node);
4819 r = or_cclass(asc_cc, acc, env);
4820 }
4821 onig_node_free(anode);
4822 onig_node_free(aasc_node);
4823 if (r != 0) goto err;
4824 }
4825 break;
4826
4827 case TK_CC_AND: /* && */
4828 {
4829 if (state == CCS_VALUE) {
4830 r = next_state_val(cc, asc_cc, &vs, 0, &val_israw, 0, val_type,
4831 &val_type, &state, env);
4832 if (r != 0) goto err;
4833 }
4834 /* initialize local variables */
4835 and_start = 1;
4836 state = CCS_START;
4837
4838 if (IS_NOT_NULL(prev_cc)) {
4839 r = and_cclass(prev_cc, cc, env);
4840 if (r != 0) goto err;
4841 bbuf_free(cc->mbuf);
4842 if (IS_NOT_NULL(asc_cc)) {
4843 r = and_cclass(asc_prev_cc, asc_cc, env);
4844 if (r != 0) goto err;
4845 bbuf_free(asc_cc->mbuf);
4846 }
4847 }
4848 else {
4849 prev_cc = cc;
4850 cc = &work_cc;
4851 if (IS_NOT_NULL(asc_cc)) {
4852 asc_prev_cc = asc_cc;
4853 asc_cc = &asc_work_cc;
4854 }
4855 }
4856 initialize_cclass(cc);
4857 if (IS_NOT_NULL(asc_cc))
4858 initialize_cclass(asc_cc);
4859 }
4860 break;
4861
4862 case TK_EOT:
4863 r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
4864 goto err;
4865 break;
4866 default:
4867 r = ONIGERR_PARSER_BUG;
4868 goto err;
4869 break;
4870 }
4871
4872 if (fetched)
4873 r = tok->type;
4874 else {
4875 r = fetch_token_in_cc(tok, &p, end, env);
4876 if (r < 0) goto err;
4877 }
4878 }
4879
4880 if (state == CCS_VALUE) {
4881 r = next_state_val(cc, asc_cc, &vs, 0, &val_israw, 0, val_type,
4882 &val_type, &state, env);
4883 if (r != 0) goto err;
4884 }
4885
4886 if (IS_NOT_NULL(prev_cc)) {
4887 r = and_cclass(prev_cc, cc, env);
4888 if (r != 0) goto err;
4889 bbuf_free(cc->mbuf);
4890 cc = prev_cc;
4891 if (IS_NOT_NULL(asc_cc)) {
4892 r = and_cclass(asc_prev_cc, asc_cc, env);
4893 if (r != 0) goto err;
4894 bbuf_free(asc_cc->mbuf);
4895 asc_cc = asc_prev_cc;
4896 }
4897 }
4898
4899 if (neg != 0) {
4900 NCCLASS_SET_NOT(cc);
4901 if (IS_NOT_NULL(asc_cc))
4902 NCCLASS_SET_NOT(asc_cc);
4903 }
4904 else {
4905 NCCLASS_CLEAR_NOT(cc);
4906 if (IS_NOT_NULL(asc_cc))
4907 NCCLASS_CLEAR_NOT(asc_cc);
4908 }
4909 if (IS_NCCLASS_NOT(cc) &&
4910 IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
4911 int is_empty;
4912
4913 is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
4914 if (is_empty != 0)
4915 BITSET_IS_EMPTY(cc->bs, is_empty);
4916
4917 if (is_empty == 0) {
4918#define NEWLINE_CODE 0x0a
4919
4920 if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
4921 if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
4922 BITSET_SET_BIT_CHKDUP(cc->bs, NEWLINE_CODE);
4923 else {
4924 r = add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
4925 if (r < 0) goto err;
4926 }
4927 }
4928 }
4929 }
4930 *src = p;
4931 env->parse_depth--;
4932 return 0;
4933
4934 err:
4935 if (cc != NCCLASS(*np))
4936 bbuf_free(cc->mbuf);
4937 if (IS_NOT_NULL(asc_cc) && (asc_cc != NCCLASS(*asc_np)))
4938 bbuf_free(asc_cc->mbuf);
4939 return r;
4940}
4941
4942static int parse_subexp(Node** top, OnigToken* tok, int term,
4943 UChar** src, UChar* end, ScanEnv* env);
4944
4945static int
4946parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
4947 ScanEnv* env)
4948{
4949 int r = 0, num;
4950 Node *target, *work1 = NULL, *work2 = NULL;
4951 OnigOptionType option;
4952 OnigCodePoint c;
4953 OnigEncoding enc = env->enc;
4954
4955#ifdef USE_NAMED_GROUP
4956 int list_capture;
4957#endif
4958
4959 UChar* p = *src;
4960 PFETCH_READY;
4961
4962 *np = NULL;
4963 if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
4964
4965 option = env->option;
4966 if (PPEEK_IS('?') &&
4967 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
4968 PINC;
4969 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4970
4971 PFETCH(c);
4972 switch (c) {
4973 case ':': /* (?:...) grouping only */
4974 group:
4975 r = fetch_token(tok, &p, end, env);
4976 if (r < 0) return r;
4977 r = parse_subexp(np, tok, term, &p, end, env);
4978 if (r < 0) return r;
4979 *src = p;
4980 return 1; /* group */
4981 break;
4982
4983 case '=':
4984 *np = onig_node_new_anchor(ANCHOR_PREC_READ);
4985 break;
4986 case '!': /* preceding read */
4987 *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT);
4988 break;
4989 case '>': /* (?>...) stop backtrack */
4990 *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
4991 break;
4992 case '~': /* (?~...) absent operator */
4993 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_TILDE_ABSENT)) {
4994 *np = node_new_enclose(ENCLOSE_ABSENT);
4995 }
4996 else {
4997 return ONIGERR_UNDEFINED_GROUP_OPTION;
4998 }
4999 break;
5000
5001#ifdef USE_NAMED_GROUP
5002 case '\'':
5003 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
5004 goto named_group1;
5005 }
5006 else
5007 return ONIGERR_UNDEFINED_GROUP_OPTION;
5008 break;
5009
5010# ifdef USE_CAPITAL_P_NAMED_GROUP
5011 case 'P': /* (?P<name>...) */
5012 if (!PEND &&
5013 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP)) {
5014 PFETCH(c);
5015 if (c == '<') goto named_group1;
5016 }
5017 return ONIGERR_UNDEFINED_GROUP_OPTION;
5018 break;
5019# endif
5020#endif
5021
5022 case '<': /* look behind (?<=...), (?<!...) */
5023 if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
5024 PFETCH(c);
5025 if (c == '=')
5026 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND);
5027 else if (c == '!')
5028 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT);
5029#ifdef USE_NAMED_GROUP
5030 else { /* (?<name>...) */
5031 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
5032 UChar *name;
5033 UChar *name_end;
5034
5035 PUNFETCH;
5036 c = '<';
5037
5038 named_group1:
5039 list_capture = 0;
5040
5041# ifdef USE_CAPTURE_HISTORY
5042 named_group2:
5043# endif
5044 name = p;
5045 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0);
5046 if (r < 0) return r;
5047
5048 num = scan_env_add_mem_entry(env);
5049 if (num < 0) return num;
5050 if (list_capture != 0 && num >= (int )BIT_STATUS_BITS_NUM)
5051 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
5052
5053 r = name_add(env->reg, name, name_end, num, env);
5054 if (r != 0) return r;
5055 *np = node_new_enclose_memory(env->option, 1);
5056 CHECK_NULL_RETURN_MEMERR(*np);
5057 NENCLOSE(*np)->regnum = num;
5058 if (list_capture != 0)
5059 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
5060 env->num_named++;
5061 }
5062 else {
5063 return ONIGERR_UNDEFINED_GROUP_OPTION;
5064 }
5065 }
5066#else
5067 else {
5068 return ONIGERR_UNDEFINED_GROUP_OPTION;
5069 }
5070#endif
5071 break;
5072
5073#ifdef USE_CAPTURE_HISTORY
5074 case '@':
5075 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {
5076# ifdef USE_NAMED_GROUP
5077 if (!PEND &&
5078 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
5079 PFETCH(c);
5080 if (c == '<' || c == '\'') {
5081 list_capture = 1;
5082 goto named_group2; /* (?@<name>...) */
5083 }
5084 PUNFETCH;
5085 }
5086# endif
5087 *np = node_new_enclose_memory(env->option, 0);
5088 CHECK_NULL_RETURN_MEMERR(*np);
5089 num = scan_env_add_mem_entry(env);
5090 if (num < 0) return num;
5091 if (num >= (int )BIT_STATUS_BITS_NUM)
5092 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
5093
5094 NENCLOSE(*np)->regnum = num;
5095 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
5096 }
5097 else {
5098 return ONIGERR_UNDEFINED_GROUP_OPTION;
5099 }
5100 break;
5101#endif /* USE_CAPTURE_HISTORY */
5102
5103 case '(': /* conditional expression: (?(cond)yes), (?(cond)yes|no) */
5104 if (!PEND &&
5105 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LPAREN_CONDITION)) {
5106 UChar *name = NULL;
5107 UChar *name_end;
5108 PFETCH(c);
5109 if (ONIGENC_IS_CODE_DIGIT(enc, c)) { /* (n) */
5110 PUNFETCH;
5111 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &num, 1);
5112 if (r < 0) return r;
5113#if 0
5114 /* Relative number is not currently supported. (same as Perl) */
5115 if (num < 0) {
5116 num = BACKREF_REL_TO_ABS(num, env);
5117 if (num <= 0)
5118 return ONIGERR_INVALID_BACKREF;
5119 }
5120#endif
5121 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) {
5122 if (num > env->num_mem ||
5123 IS_NULL(SCANENV_MEM_NODES(env)[num]))
5124 return ONIGERR_INVALID_BACKREF;
5125 }
5126 }
5127#ifdef USE_NAMED_GROUP
5128 else if (c == '<' || c == '\'') { /* (<name>), ('name') */
5129 name = p;
5130 r = fetch_named_backref_token(c, tok, &p, end, env);
5131 if (r < 0) return r;
5132 if (!PPEEK_IS(')')) return ONIGERR_UNDEFINED_GROUP_OPTION;
5133 PINC;
5134
5135 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_USE_LEFT_MOST_NAMED_GROUP)) {
5136 num = tok->u.backref.ref1;
5137 }
5138 else {
5139 /* FIXME:
5140 * Use left most named group for now. This is the same as Perl.
5141 * However this should use the same strategy as normal back-
5142 * references on Ruby syntax; search right to left. */
5143 int len = tok->u.backref.num;
5144 num = len > 1 ? tok->u.backref.refs[0] : tok->u.backref.ref1;
5145 }
5146 }
5147#endif
5148 else
5149 return ONIGERR_INVALID_CONDITION_PATTERN;
5150 *np = node_new_enclose(ENCLOSE_CONDITION);
5151 CHECK_NULL_RETURN_MEMERR(*np);
5152 NENCLOSE(*np)->regnum = num;
5153 if (IS_NOT_NULL(name)) NENCLOSE(*np)->state |= NST_NAME_REF;
5154 }
5155 else
5156 return ONIGERR_UNDEFINED_GROUP_OPTION;
5157 break;
5158
5159#if 0
5160 case '|': /* branch reset: (?|...) */
5161 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_VBAR_BRANCH_RESET)) {
5162 /* TODO */
5163 }
5164 else
5165 return ONIGERR_UNDEFINED_GROUP_OPTION;
5166 break;
5167#endif
5168
5169 case '^': /* loads default options */
5170 if (!PEND && IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
5171 /* d-imsx */
5172 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
5173 ONOFF(option, ONIG_OPTION_IGNORECASE, 1);
5174 ONOFF(option, ONIG_OPTION_SINGLELINE, 0);
5175 ONOFF(option, ONIG_OPTION_MULTILINE, 1);
5176 ONOFF(option, ONIG_OPTION_EXTEND, 1);
5177 PFETCH(c);
5178 }
5179#if 0
5180 else if (!PEND && IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
5181 /* d-imx */
5182 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0);
5183 ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 0);
5184 ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 0);
5185 ONOFF(option, ONIG_OPTION_IGNORECASE, 1);
5186 ONOFF(option, ONIG_OPTION_MULTILINE, 1);
5187 ONOFF(option, ONIG_OPTION_EXTEND, 1);
5188 PFETCH(c);
5189 }
5190#endif
5191 else {
5192 return ONIGERR_UNDEFINED_GROUP_OPTION;
5193 }
5194 /* fall through */
5195#ifdef USE_POSIXLINE_OPTION
5196 case 'p':
5197#endif
5198 case '-': case 'i': case 'm': case 's': case 'x':
5199 case 'a': case 'd': case 'l': case 'u':
5200 {
5201 int neg = 0;
5202
5203 while (1) {
5204 switch (c) {
5205 case ':':
5206 case ')':
5207 break;
5208
5209 case '-': neg = 1; break;
5210 case 'x': ONOFF(option, ONIG_OPTION_EXTEND, neg); break;
5211 case 'i': ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break;
5212 case 's':
5213 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
5214 ONOFF(option, ONIG_OPTION_MULTILINE, neg);
5215 }
5216 else
5217 return ONIGERR_UNDEFINED_GROUP_OPTION;
5218 break;
5219
5220 case 'm':
5221 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
5222 ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0));
5223 }
5224 else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
5225 ONOFF(option, ONIG_OPTION_MULTILINE, neg);
5226 }
5227 else
5228 return ONIGERR_UNDEFINED_GROUP_OPTION;
5229 break;
5230#ifdef USE_POSIXLINE_OPTION
5231 case 'p':
5232 ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
5233 break;
5234#endif
5235
5236 case 'a': /* limits \d, \s, \w and POSIX brackets to ASCII range */
5237 if ((IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) ||
5238 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) &&
5239 (neg == 0)) {
5240 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0);
5241 ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 1);
5242 ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 1);
5243 }
5244 else
5245 return ONIGERR_UNDEFINED_GROUP_OPTION;
5246 break;
5247
5248 case 'u':
5249 if ((IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) ||
5250 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) &&
5251 (neg == 0)) {
5252 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
5253 ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 1);
5254 ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 1);
5255 }
5256 else
5257 return ONIGERR_UNDEFINED_GROUP_OPTION;
5258 break;
5259
5260 case 'd':
5261 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) &&
5262 (neg == 0)) {
5263 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
5264 }
5265 else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY) &&
5266 (neg == 0)) {
5267 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0);
5268 ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 0);
5269 ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 0);
5270 }
5271 else
5272 return ONIGERR_UNDEFINED_GROUP_OPTION;
5273 break;
5274
5275 case 'l':
5276 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) && (neg == 0)) {
5277 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
5278 }
5279 else
5280 return ONIGERR_UNDEFINED_GROUP_OPTION;
5281 break;
5282
5283 default:
5284 return ONIGERR_UNDEFINED_GROUP_OPTION;
5285 }
5286
5287 if (c == ')') {
5288 *np = node_new_option(option);
5289 CHECK_NULL_RETURN_MEMERR(*np);
5290 *src = p;
5291 return 2; /* option only */
5292 }
5293 else if (c == ':') {
5294 OnigOptionType prev = env->option;
5295
5296 env->option = option;
5297 r = fetch_token(tok, &p, end, env);
5298 if (r < 0) {
5299 env->option = prev;
5300 return r;
5301 }
5302 r = parse_subexp(&target, tok, term, &p, end, env);
5303 env->option = prev;
5304 if (r < 0) return r;
5305 *np = node_new_option(option);
5306 CHECK_NULL_RETURN_MEMERR(*np);
5307 NENCLOSE(*np)->target = target;
5308 *src = p;
5309 return 0;
5310 }
5311
5312 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
5313 PFETCH(c);
5314 }
5315 }
5316 break;
5317
5318 default:
5319 return ONIGERR_UNDEFINED_GROUP_OPTION;
5320 }
5321 }
5322 else {
5323 if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP))
5324 goto group;
5325
5326 *np = node_new_enclose_memory(env->option, 0);
5327 CHECK_NULL_RETURN_MEMERR(*np);
5328 num = scan_env_add_mem_entry(env);
5329 if (num < 0) return num;
5330 NENCLOSE(*np)->regnum = num;
5331 }
5332
5333 CHECK_NULL_RETURN_MEMERR(*np);
5334 r = fetch_token(tok, &p, end, env);
5335 if (r < 0) return r;
5336 r = parse_subexp(&target, tok, term, &p, end, env);
5337 if (r < 0) {
5338 onig_node_free(target);
5339 return r;
5340 }
5341
5342 if (NTYPE(*np) == NT_ANCHOR)
5343 NANCHOR(*np)->target = target;
5344 else {
5345 NENCLOSE(*np)->target = target;
5346 if (NENCLOSE(*np)->type == ENCLOSE_MEMORY) {
5347 /* Don't move this to previous of parse_subexp() */
5348 r = scan_env_set_mem_node(env, NENCLOSE(*np)->regnum, *np);
5349 if (r != 0) return r;
5350 }
5351 else if (NENCLOSE(*np)->type == ENCLOSE_CONDITION) {
5352 if (NTYPE(target) != NT_ALT) {
5353 /* convert (?(cond)yes) to (?(cond)yes|empty) */
5354 work1 = node_new_empty();
5355 if (IS_NULL(work1)) goto err;
5356 work2 = onig_node_new_alt(work1, NULL_NODE);
5357 if (IS_NULL(work2)) goto err;
5358 work1 = onig_node_new_alt(target, work2);
5359 if (IS_NULL(work1)) goto err;
5360 NENCLOSE(*np)->target = work1;
5361 }
5362 }
5363 }
5364
5365 *src = p;
5366 return 0;
5367
5368 err:
5369 onig_node_free(work1);
5370 onig_node_free(work2);
5371 onig_node_free(*np);
5372 *np = NULL;
5373 return ONIGERR_MEMORY;
5374}
5375
/* Printable spellings of the "popular" quantifiers.  set_quantifier()
 * indexes this table with the value of popular_quantifier_num() when it
 * builds nested-repeat warnings. */
static const char* const PopularQStr[] = {
  "?",
  "*",
  "+",
  "??",
  "*?",
  "+?"
};
5379
/* Text describing what a pair of nested quantifiers was replaced with.
 * set_quantifier() indexes this table with a ReduceTypeTable[][] entry when
 * it reports the replacement.  The first two slots are empty strings;
 * presumably they correspond to RQ_ASIS and RQ_DEL, which set_quantifier()
 * handles without consulting this table -- TODO confirm against the enum. */
static const char* const ReduceQStr[] = {
  "",
  "",
  "*",
  "*?",
  "??",
  "+ and ??",
  "+? and ?"
};
5383
5384static int
5385set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env)
5386{
5387 QtfrNode* qn;
5388
5389 qn = NQTFR(qnode);
5390 if (qn->lower == 1 && qn->upper == 1) {
5391 return 1;
5392 }
5393
5394 switch (NTYPE(target)) {
5395 case NT_STR:
5396 if (! group) {
5397 StrNode* sn = NSTR(target);
5398 if (str_node_can_be_split(sn, env->enc)) {
5399 Node* n = str_node_split_last_char(sn, env->enc);
5400 if (IS_NOT_NULL(n)) {
5401 qn->target = n;
5402 return 2;
5403 }
5404 }
5405 }
5406 break;
5407
5408 case NT_QTFR:
5409 { /* check redundant double repeat. */
5410 /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */
5411 QtfrNode* qnt = NQTFR(target);
5412 int nestq_num = popular_quantifier_num(qn);
5413 int targetq_num = popular_quantifier_num(qnt);
5414
5415#ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
5416 if (nestq_num >= 0 && targetq_num >= 0 &&
5417 IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) {
5418 switch (ReduceTypeTable[targetq_num][nestq_num]) {
5419 case RQ_ASIS:
5420 break;
5421
5422 case RQ_DEL:
5423 if (onig_warn != onig_null_warn) {
5424 onig_syntax_warn(env, "regular expression has redundant nested repeat operator '%s'",
5425 PopularQStr[targetq_num]);
5426 }
5427 goto warn_exit;
5428 break;
5429
5430 default:
5431 if (onig_warn != onig_null_warn) {
5432 onig_syntax_warn(env, "nested repeat operator '%s' and '%s' was replaced with '%s' in regular expression",
5433 PopularQStr[targetq_num], PopularQStr[nestq_num],
5434 ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
5435 }
5436 goto warn_exit;
5437 break;
5438 }
5439 }
5440
5441 warn_exit:
5442#endif
5443 if (targetq_num >= 0) {
5444 if (nestq_num >= 0) {
5445 onig_reduce_nested_quantifier(qnode, target);
5446 goto q_exit;
5447 }
5448 else if (targetq_num == 1 || targetq_num == 2) { /* * or + */
5449 /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */
5450 if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) {
5451 qn->upper = (qn->lower == 0 ? 1 : qn->lower);
5452 }
5453 }
5454 }
5455 }
5456 break;
5457
5458 default:
5459 break;
5460 }
5461
5462 qn->target = target;
5463 q_exit:
5464 return 0;
5465}
5466
5467
5468#ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
5469static int
5470clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc)
5471{
5472 BBuf *tbuf;
5473 int r;
5474
5475 if (IS_NCCLASS_NOT(cc)) {
5476 bitset_invert(cc->bs);
5477
5478 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
5479 r = not_code_range_buf(enc, cc->mbuf, &tbuf);
5480 if (r != 0) return r;
5481
5482 bbuf_free(cc->mbuf);
5483 cc->mbuf = tbuf;
5484 }
5485
5486 NCCLASS_CLEAR_NOT(cc);
5487 }
5488
5489 return 0;
5490}
5491#endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
5492
5493typedef struct {
5494 ScanEnv* env;
5495 CClassNode* cc;
5496 CClassNode* asc_cc;
5497 Node* alt_root;
5498 Node** ptail;
5499} IApplyCaseFoldArg;
5500
5501static int
5502i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[],
5503 int to_len, void* arg)
5504{
5505 IApplyCaseFoldArg* iarg;
5506 ScanEnv* env;
5507 CClassNode* cc;
5508 CClassNode* asc_cc;
5509 BitSetRef bs;
5510 int add_flag, r;
5511
5512 iarg = (IApplyCaseFoldArg* )arg;
5513 env = iarg->env;
5514 cc = iarg->cc;
5515 asc_cc = iarg->asc_cc;
5516 bs = cc->bs;
5517
5518 if (IS_NULL(asc_cc)) {
5519 add_flag = 0;
5520 }
5521 else if (ONIGENC_IS_ASCII_CODE(from) == ONIGENC_IS_ASCII_CODE(*to)) {
5522 add_flag = 1;
5523 }
5524 else {
5525 add_flag = onig_is_code_in_cc(env->enc, from, asc_cc);
5526 if (IS_NCCLASS_NOT(asc_cc))
5527 add_flag = !add_flag;
5528 }
5529
5530 if (to_len == 1) {
5531 int is_in = onig_is_code_in_cc(env->enc, from, cc);
5532#ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
5533 if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
5534 (is_in == 0 && IS_NCCLASS_NOT(cc))) {
5535 if (add_flag) {
5536 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
5537 r = add_code_range0(&(cc->mbuf), env, *to, *to, 0);
5538 if (r < 0) return r;
5539 }
5540 else {
5541 BITSET_SET_BIT(bs, *to);
5542 }
5543 }
5544 }
5545#else
5546 if (is_in != 0) {
5547 if (add_flag) {
5548 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
5549 if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
5550 r = add_code_range0(&(cc->mbuf), env, *to, *to, 0);
5551 if (r < 0) return r;
5552 }
5553 else {
5554 if (IS_NCCLASS_NOT(cc)) {
5555 BITSET_CLEAR_BIT(bs, *to);
5556 }
5557 else {
5558 BITSET_SET_BIT(bs, *to);
5559 }
5560 }
5561 }
5562 }
5563#endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
5564 }
5565 else {
5566 int r, i, len;
5567 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
5568 Node *snode = NULL_NODE;
5569
5570 if (onig_is_code_in_cc(env->enc, from, cc)
5571#ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
5572 && !IS_NCCLASS_NOT(cc)
5573#endif
5574 ) {
5575 for (i = 0; i < to_len; i++) {
5576 len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf);
5577 if (i == 0) {
5578 snode = onig_node_new_str(buf, buf + len);
5579 CHECK_NULL_RETURN_MEMERR(snode);
5580
5581 /* char-class expanded multi-char only
5582 compare with string folded at match time. */
5583 NSTRING_SET_AMBIG(snode);
5584 }
5585 else {
5586 r = onig_node_str_cat(snode, buf, buf + len);
5587 if (r < 0) {
5588 onig_node_free(snode);
5589 return r;
5590 }
5591 }
5592 }
5593
5594 *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE);
5595 CHECK_NULL_RETURN_MEMERR(*(iarg->ptail));
5596 iarg->ptail = &(NCDR((*(iarg->ptail))));
5597 }
5598 }
5599
5600 return 0;
5601}
5602
5603static int
5604cclass_case_fold(Node** np, CClassNode* cc, CClassNode* asc_cc, ScanEnv* env)
5605{
5606 int r;
5607 IApplyCaseFoldArg iarg;
5608
5609 iarg.env = env;
5610 iarg.cc = cc;
5611 iarg.asc_cc = asc_cc;
5612 iarg.alt_root = NULL_NODE;
5613 iarg.ptail = &(iarg.alt_root);
5614
5615 r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag,
5616 i_apply_case_fold, &iarg);
5617 if (r != 0) {
5618 onig_node_free(iarg.alt_root);
5619 return r;
5620 }
5621 if (IS_NOT_NULL(iarg.alt_root)) {
5622 Node* work = onig_node_new_alt(*np, iarg.alt_root);
5623 if (IS_NULL(work)) {
5624 onig_node_free(iarg.alt_root);
5625 return ONIGERR_MEMORY;
5626 }
5627 *np = work;
5628 }
5629 return r;
5630}
5631
5632static int
5633node_linebreak(Node** np, ScanEnv* env)
5634{
5635 /* same as (?>\x0D\x0A|[\x0A-\x0D\x{85}\x{2028}\x{2029}]) */
5636 Node* left = NULL;
5637 Node* right = NULL;
5638 Node* target1 = NULL;
5639 Node* target2 = NULL;
5640 CClassNode* cc;
5641 int num1, num2, r;
5642 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN * 2];
5643
5644 /* \x0D\x0A */
5645 num1 = ONIGENC_CODE_TO_MBC(env->enc, 0x0D, buf);
5646 if (num1 < 0) return num1;
5647 num2 = ONIGENC_CODE_TO_MBC(env->enc, 0x0A, buf + num1);
5648 if (num2 < 0) return num2;
5649 left = node_new_str_raw(buf, buf + num1 + num2);
5650 if (IS_NULL(left)) goto err;
5651
5652 /* [\x0A-\x0D] or [\x0A-\x0D\x{85}\x{2028}\x{2029}] */
5653 right = node_new_cclass();
5654 if (IS_NULL(right)) goto err;
5655 cc = NCCLASS(right);
5656 if (ONIGENC_MBC_MINLEN(env->enc) > 1) {
5657 r = add_code_range(&(cc->mbuf), env, 0x0A, 0x0D);
5658 if (r != 0) goto err;
5659 }
5660 else {
5661 bitset_set_range(env, cc->bs, 0x0A, 0x0D);
5662 }
5663
5664 /* TODO: move this block to enc/unicode.c */
5665 if (ONIGENC_IS_UNICODE(env->enc)) {
5666 /* UTF-8, UTF-16BE/LE, UTF-32BE/LE */
5667 r = add_code_range(&(cc->mbuf), env, 0x85, 0x85);
5668 if (r != 0) goto err;
5669 r = add_code_range(&(cc->mbuf), env, 0x2028, 0x2029);
5670 if (r != 0) goto err;
5671 }
5672
5673 /* ...|... */
5674 target1 = onig_node_new_alt(right, NULL_NODE);
5675 if (IS_NULL(target1)) goto err;
5676 right = NULL;
5677 target2 = onig_node_new_alt(left, target1);
5678 if (IS_NULL(target2)) goto err;
5679 left = NULL;
5680 target1 = NULL;
5681
5682 /* (?>...) */
5683 *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
5684 if (IS_NULL(*np)) goto err;
5685 NENCLOSE(*np)->target = target2;
5686 return ONIG_NORMAL;
5687
5688 err:
5689 onig_node_free(left);
5690 onig_node_free(right);
5691 onig_node_free(target1);
5692 onig_node_free(target2);
5693 return ONIGERR_MEMORY;
5694}
5695
5696static int
5697propname2ctype(ScanEnv* env, const char* propname)
5698{
5699 UChar* name = (UChar* )propname;
5700 int ctype = env->enc->property_name_to_ctype(ONIG_ENCODING_ASCII,
5701 name, name + strlen(propname));
5702 return ctype;
5703}
5704
5705static int
5706node_extended_grapheme_cluster(Node** np, ScanEnv* env)
5707{
5708 Node* tmp = NULL;
5709 Node* np1 = NULL;
5710 Node* list = NULL;
5711 Node* list2 = NULL;
5712 Node* alt = NULL;
5713 Node* alt2 = NULL;
5714 BBuf *pbuf1 = NULL;
5715 int r = 0;
5716 int num1;
5717 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN * 2];
5718 OnigOptionType option;
5719
5720#ifdef USE_UNICODE_PROPERTIES
5721 if (ONIGENC_IS_UNICODE(env->enc)) {
5722 /* UTF-8, UTF-16BE/LE, UTF-32BE/LE */
5723 CClassNode* cc;
5724 OnigCodePoint sb_out = (ONIGENC_MBC_MINLEN(env->enc) > 1) ? 0x00 : 0x80;
5725 int extend = propname2ctype(env, "Grapheme_Cluster_Break=Extend");
5726
5727 /* Prepend*
5728 * ( RI-sequence | Hangul-Syllable | !Control )
5729 * ( Grapheme_Extend | SpacingMark )* */
5730
5731 /* ( Grapheme_Extend | SpacingMark )* */
5732 np1 = node_new_cclass();
5733 if (IS_NULL(np1)) goto err;
5734 cc = NCCLASS(np1);
5735 r = add_ctype_to_cc(cc, extend, 0, 0, env);
5736 if (r != 0) goto err;
5737 r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=SpacingMark"), 0, 0, env);
5738 if (r != 0) goto err;
5739 r = add_code_range(&(cc->mbuf), env, 0x200D, 0x200D);
5740 if (r != 0) goto err;
5741
5742 tmp = node_new_quantifier(0, REPEAT_INFINITE, 0);
5743 if (IS_NULL(tmp)) goto err;
5744 NQTFR(tmp)->target = np1;
5745 np1 = tmp;
5746
5747 tmp = node_new_list(np1, NULL_NODE);
5748 if (IS_NULL(tmp)) goto err;
5749 list = tmp;
5750 np1 = NULL;
5751
5752 /* ( RI-sequence | Hangul-Syllable | !Control ) */
5753 /* !Control */
5754 np1 = node_new_cclass();
5755 if (IS_NULL(np1)) goto err;
5756 cc = NCCLASS(np1);
5757 r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=Control"), 1, 0, env);
5758 if (r != 0) goto err;
5759 if (ONIGENC_MBC_MINLEN(env->enc) > 1) {
5760 BBuf *pbuf2 = NULL;
5761 r = add_code_range(&pbuf1, env, 0x0a, 0x0a);
5762 if (r != 0) goto err;
5763 r = add_code_range(&pbuf1, env, 0x0d, 0x0d);
5764 if (r != 0) goto err;
5765 r = and_code_range_buf(cc->mbuf, 0, pbuf1, 1, &pbuf2, env);
5766 if (r != 0) {
5767 bbuf_free(pbuf2);
5768 goto err;
5769 }
5770 bbuf_free(pbuf1);
5771 pbuf1 = NULL;
5772 bbuf_free(cc->mbuf);
5773 cc->mbuf = pbuf2;
5774 }
5775 else {
5776 BITSET_CLEAR_BIT(cc->bs, 0x0a);
5777 BITSET_CLEAR_BIT(cc->bs, 0x0d);
5778 }
5779
5780 tmp = onig_node_new_alt(np1, NULL_NODE);
5781 if (IS_NULL(tmp)) goto err;
5782 alt = tmp;
5783 np1 = NULL;
5784
5785 /* Hangul-Syllable
5786 * := L* V+ T*
5787 * | L* LV V* T*
5788 * | L* LVT T*
5789 * | L+
5790 * | T+ */
5791
5792 /* T+ */
5793 np1 = node_new_cclass();
5794 if (IS_NULL(np1)) goto err;
5795 cc = NCCLASS(np1);
5796 r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=T"), 0, 0, env);
5797 if (r != 0) goto err;
5798
5799 tmp = node_new_quantifier(1, REPEAT_INFINITE, 0);
5800 if (IS_NULL(tmp)) goto err;
5801 NQTFR(tmp)->target = np1;
5802 np1 = tmp;
5803
5804 tmp = onig_node_new_alt(np1, alt);
5805 if (IS_NULL(tmp)) goto err;
5806 alt = tmp;
5807 np1 = NULL;
5808
5809 /* L+ */
5810 np1 = node_new_cclass();
5811 if (IS_NULL(np1)) goto err;
5812 cc = NCCLASS(np1);
5813 r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=L"), 0, 0, env);
5814 if (r != 0) goto err;
5815
5816 tmp = node_new_quantifier(1, REPEAT_INFINITE, 0);
5817 if (IS_NULL(tmp)) goto err;
5818 NQTFR(tmp)->target = np1;
5819 np1 = tmp;
5820
5821 tmp = onig_node_new_alt(np1, alt);
5822 if (IS_NULL(tmp)) goto err;
5823 alt = tmp;
5824 np1 = NULL;
5825
5826 /* L* LVT T* */
5827 np1 = node_new_cclass();
5828 if (IS_NULL(np1)) goto err;
5829 cc = NCCLASS(np1);
5830 r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=T"), 0, 0, env);
5831 if (r != 0) goto err;
5832
5833 tmp = node_new_quantifier(0, REPEAT_INFINITE, 0);
5834 if (IS_NULL(tmp)) goto err;
5835 NQTFR(tmp)->target = np1;
5836 np1 = tmp;
5837
5838 tmp = node_new_list(np1, NULL_NODE);
5839 if (IS_NULL(tmp)) goto err;
5840 list2 = tmp;
5841 np1 = NULL;
5842
5843 np1 = node_new_cclass();
5844 if (IS_NULL(np1)) goto err;
5845 cc = NCCLASS(np1);
5846 r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=LVT"), 0, 0, env);
5847 if (r != 0) goto err;
5848
5849 tmp = node_new_list(np1, list2);
5850 if (IS_NULL(tmp)) goto err;
5851 list2 = tmp;
5852 np1 = NULL;
5853
5854 np1 = node_new_cclass();
5855 if (IS_NULL(np1)) goto err;
5856 cc = NCCLASS(np1);
5857 r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=L"), 0, 0, env);
5858 if (r != 0) goto err;
5859
5860 tmp = node_new_quantifier(0, REPEAT_INFINITE, 0);
5861 if (IS_NULL(tmp)) goto err;
5862 NQTFR(tmp)->target = np1;
5863 np1 = tmp;
5864
5865 tmp = node_new_list(np1, list2);
5866 if (IS_NULL(tmp)) goto err;
5867 list2 = tmp;
5868 np1 = NULL;
5869
5870 tmp = onig_node_new_alt(list2, alt);
5871 if (IS_NULL(tmp)) goto err;
5872 alt = tmp;
5873 list2 = NULL;
5874
5875 /* L* LV V* T* */
5876 np1 = node_new_cclass();
5877 if (IS_NULL(np1)) goto err;
5878 cc = NCCLASS(np1);
5879 r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=T"), 0, 0, env);
5880 if (r != 0) goto err;
5881
5882 tmp = node_new_quantifier(0, REPEAT_INFINITE, 0);
5883 if (IS_NULL(tmp)) goto err;
5884 NQTFR(tmp)->target = np1;
5885 np1 = tmp;
5886
5887 tmp = node_new_list(np1, NULL_NODE);
5888 if (IS_NULL(tmp)) goto err;
5889 list2 = tmp;
5890 np1 = NULL;
5891
5892 np1 = node_new_cclass();
5893 if (IS_NULL(np1)) goto err;
5894 cc = NCCLASS(np1);
5895 r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=V"), 0, 0, env);
5896 if (r != 0) goto err;
5897
5898 tmp = node_new_quantifier(0, REPEAT_INFINITE, 0);
5899 if (IS_NULL(tmp)) goto err;
5900 NQTFR(tmp)->target = np1;
5901 np1 = tmp;
5902
5903 tmp = node_new_list(np1, list2);
5904 if (IS_NULL(tmp)) goto err;
5905 list2 = tmp;
5906 np1 = NULL;
5907
5908 np1 = node_new_cclass();
5909 if (IS_NULL(np1)) goto err;
5910 cc = NCCLASS(np1);
5911 r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=LV"), 0, 0, env);
5912 if (r != 0) goto err;
5913
5914 tmp = node_new_list(np1, list2);
5915 if (IS_NULL(tmp)) goto err;
5916 list2 = tmp;
5917 np1 = NULL;
5918
5919 np1 = node_new_cclass();
5920 if (IS_NULL(np1)) goto err;
5921 cc = NCCLASS(np1);
5922 r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=L"), 0, 0, env);
5923 if (r != 0) goto err;
5924
5925 tmp = node_new_quantifier(0, REPEAT_INFINITE, 0);
5926 if (IS_NULL(tmp)) goto err;
5927 NQTFR(tmp)->target = np1;
5928 np1 = tmp;
5929
5930 tmp = node_new_list(np1, list2);
5931 if (IS_NULL(tmp)) goto err;
5932 list2 = tmp;
5933 np1 = NULL;
5934
5935 tmp = onig_node_new_alt(list2, alt);
5936 if (IS_NULL(tmp)) goto err;
5937 alt = tmp;
5938 list2 = NULL;
5939
5940 /* L* V+ T* */
5941 np1 = node_new_cclass();
5942 if (IS_NULL(np1)) goto err;
5943 cc = NCCLASS(np1);
5944 r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=T"), 0, 0, env);
5945 if (r != 0) goto err;
5946
5947 tmp = node_new_quantifier(0, REPEAT_INFINITE, 0);
5948 if (IS_NULL(tmp)) goto err;
5949 NQTFR(tmp)->target = np1;
5950 np1 = tmp;
5951
5952 tmp = node_new_list(np1, NULL_NODE);
5953 if (IS_NULL(tmp)) goto err;
5954 list2 = tmp;
5955 np1 = NULL;
5956
5957 np1 = node_new_cclass();
5958 if (IS_NULL(np1)) goto err;
5959 cc = NCCLASS(np1);
5960 r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=V"), 0, 0, env);
5961 if (r != 0) goto err;
5962
5963 tmp = node_new_quantifier(1, REPEAT_INFINITE, 0);
5964 if (IS_NULL(tmp)) goto err;
5965 NQTFR(tmp)->target = np1;
5966 np1 = tmp;
5967
5968 tmp = node_new_list(np1, list2);
5969 if (IS_NULL(tmp)) goto err;
5970 list2 = tmp;
5971 np1 = NULL;
5972
5973 np1 = node_new_cclass();
5974 if (IS_NULL(np1)) goto err;
5975 cc = NCCLASS(np1);
5976 r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=L"), 0, 0, env);
5977 if (r != 0) goto err;
5978
5979 tmp = node_new_quantifier(0, REPEAT_INFINITE, 0);
5980 if (IS_NULL(tmp)) goto err;
5981 NQTFR(tmp)->target = np1;
5982 np1 = tmp;
5983
5984 tmp = node_new_list(np1, list2);
5985 if (IS_NULL(tmp)) goto err;
5986 list2 = tmp;
5987 np1 = NULL;
5988
5989 tmp = onig_node_new_alt(list2, alt);
5990 if (IS_NULL(tmp)) goto err;
5991 alt = tmp;
5992 list2 = NULL;
5993
5994 /* Emoji sequence := (E_Base | EBG) Extend* E_Modifier?
5995 * (ZWJ (Glue_After_Zwj | EBG Extend* E_Modifier?) )* */
5996
5997 /* ZWJ (Glue_After_Zwj | E_Base_GAZ Extend* E_Modifier?) */
5998 np1 = node_new_cclass();
5999 if (IS_NULL(np1)) goto err;
6000 cc = NCCLASS(np1);
6001 r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=E_Modifier"), 0, 0, env);
6002 if (r != 0) goto err;
6003
6004 tmp = node_new_quantifier(0, 1, 0);
6005 if (IS_NULL(tmp)) goto err;
6006 NQTFR(tmp)->target = np1;
6007 np1 = tmp;
6008
6009 tmp = node_new_list(np1, NULL_NODE);
6010 if (IS_NULL(tmp)) goto err;
6011 list2 = tmp;
6012 np1 = NULL;
6013
6014 np1 = node_new_cclass();
6015 if (IS_NULL(np1)) goto err;
6016 cc = NCCLASS(np1);
6017 r = add_ctype_to_cc(cc, extend, 0, 0, env);
6018 if (r != 0) goto err;
6019
6020 tmp = node_new_quantifier(0, REPEAT_INFINITE, 0);
6021 if (IS_NULL(tmp)) goto err;
6022 NQTFR(tmp)->target = np1;
6023 np1 = tmp;
6024
6025 tmp = node_new_list(np1, list2);
6026 if (IS_NULL(tmp)) goto err;
6027 list2 = tmp;
6028 np1 = NULL;
6029
6030 np1 = node_new_cclass();
6031 if (IS_NULL(np1)) goto err;
6032 cc = NCCLASS(np1);
6033 r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=E_Base_GAZ"), 0, 0, env);
6034 if (r != 0) goto err;
6035
6036 tmp = node_new_list(np1, list2);
6037 if (IS_NULL(tmp)) goto err;
6038 list2 = tmp;
6039 np1 = NULL;
6040
6041 tmp = onig_node_new_alt(list2, NULL_NODE);
6042 if (IS_NULL(tmp)) goto err;
6043 alt2 = tmp;
6044 list2 = NULL;
6045
6046 /* Glue_After_Zwj */
6047 np1 = node_new_cclass();
6048 if (IS_NULL(np1)) goto err;
6049 cc = NCCLASS(np1);
6050 r = add_ctype_to_cc(cc, extend, 0, 0, env);
6051 if (r != 0) goto err;
6052
6053 tmp = node_new_quantifier(0, REPEAT_INFINITE, 0);
6054 if (IS_NULL(tmp)) goto err;
6055 NQTFR(tmp)->target = np1;
6056 np1 = tmp;
6057
6058 tmp = node_new_list(np1, NULL_NODE);
6059 if (IS_NULL(tmp)) goto err;
6060 list2 = tmp;
6061 np1 = NULL;
6062
6063 np1 = node_new_cclass();
6064 if (IS_NULL(np1)) goto err;
6065 cc = NCCLASS(np1);
6066 {
6067 static const OnigCodePoint ranges[] = {
6068 13,
6069 0x1F308, 0x1F308,
6070 0x1F33E, 0x1F33E,
6071 0x1F373, 0x1F373,
6072 0x1F393, 0x1F393,
6073 0x1F3A4, 0x1F3A4,
6074 0x1F3A8, 0x1F3A8,
6075 0x1F3EB, 0x1F3EB,
6076 0x1F3ED, 0x1F3ED,
6077 0x1F4BB, 0x1F4BC,
6078 0x1F527, 0x1F527,
6079 0x1F52C, 0x1F52C,
6080 0x1F680, 0x1F680,
6081 0x1F692, 0x1F692,
6082 };
6083 r = add_ctype_to_cc_by_range(cc, -1, 0, env, sb_out, ranges);
6084 if (r != 0) goto err;
6085 }
6086 r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=Glue_After_Zwj"), 0, 0, env);
6087 if (r != 0) goto err;
6088
6089 tmp = node_new_list(np1, list2);
6090 if (IS_NULL(tmp)) goto err;
6091 list2 = tmp;
6092 np1 = NULL;
6093
6094 tmp = onig_node_new_alt(list2, alt2);
6095 if (IS_NULL(tmp)) goto err;
6096 alt2 = tmp;
6097 list2 = NULL;
6098
6099 /* Emoji variation sequence
6100 * http://unicode.org/Public/emoji/4.0/emoji-zwj-sequences.txt
6101 */
6102 r = ONIGENC_CODE_TO_MBC(env->enc, 0xfe0f, buf);
6103 if (r < 0) goto err;
6104 np1 = node_new_str_raw(buf, buf + r);
6105 if (IS_NULL(np1)) goto err;
6106
6107 tmp = node_new_quantifier(0, 1, 0);
6108 if (IS_NULL(tmp)) goto err;
6109 NQTFR(tmp)->target = np1;
6110 np1 = tmp;
6111
6112 tmp = node_new_list(np1, NULL_NODE);
6113 if (IS_NULL(tmp)) goto err;
6114 list2 = tmp;
6115 np1 = NULL;
6116
6117 np1 = node_new_cclass();
6118 if (IS_NULL(np1)) goto err;
6119 cc = NCCLASS(np1);
6120 {
6121 static const OnigCodePoint ranges[] = {
6122 4,
6123 0x2640, 0x2640,
6124 0x2642, 0x2642,
6125 0x2695, 0x2696,
6126 0x2708, 0x2708,
6127 };
6128 r = add_ctype_to_cc_by_range(cc, -1, 0, env, sb_out, ranges);
6129 if (r != 0) goto err;
6130 }
6131
6132 tmp = node_new_list(np1, list2);
6133 if (IS_NULL(tmp)) goto err;
6134 list2 = tmp;
6135 np1 = NULL;
6136
6137 tmp = onig_node_new_alt(list2, alt2);
6138 if (IS_NULL(tmp)) goto err;
6139 alt2 = tmp;
6140 list2 = NULL;
6141
6142 tmp = node_new_list(alt2, NULL_NODE);
6143 if (IS_NULL(tmp)) goto err;
6144 list2 = tmp;
6145 alt2 = NULL;
6146
6147 /* ZWJ */
6148 r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf);
6149 if (r < 0) goto err;
6150 np1 = node_new_str_raw(buf, buf + r);
6151 if (IS_NULL(np1)) goto err;
6152
6153 tmp = node_new_list(np1, list2);
6154 if (IS_NULL(tmp)) goto err;
6155 list2 = tmp;
6156 np1 = NULL;
6157
6158 tmp = node_new_quantifier(0, REPEAT_INFINITE, 0);
6159 if (IS_NULL(tmp)) goto err;
6160 NQTFR(tmp)->target = list2;
6161 np1 = tmp;
6162 list2 = NULL;
6163
6164 tmp = node_new_list(np1, NULL_NODE);
6165 if (IS_NULL(tmp)) goto err;
6166 list2 = tmp;
6167 np1 = NULL;
6168
6169 /* E_Modifier? */
6170 np1 = node_new_cclass();
6171 if (IS_NULL(np1)) goto err;
6172 cc = NCCLASS(np1);
6173 r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=E_Modifier"), 0, 0, env);
6174 if (r != 0) goto err;
6175
6176 tmp = node_new_quantifier(0, 1, 0);
6177 if (IS_NULL(tmp)) goto err;
6178 NQTFR(tmp)->target = np1;
6179 np1 = tmp;
6180
6181 tmp = node_new_list(np1, list2);
6182 if (IS_NULL(tmp)) goto err;
6183 list2 = tmp;
6184 np1 = NULL;
6185
6186 /* Extend* */
6187 np1 = node_new_cclass();
6188 if (IS_NULL(np1)) goto err;
6189 cc = NCCLASS(np1);
6190 r = add_ctype_to_cc(cc, extend, 0, 0, env);
6191 if (r != 0) goto err;
6192
6193 tmp = node_new_quantifier(0, REPEAT_INFINITE, 0);
6194 if (IS_NULL(tmp)) goto err;
6195 NQTFR(tmp)->target = np1;
6196 np1 = tmp;
6197
6198 tmp = node_new_list(np1, list2);
6199 if (IS_NULL(tmp)) goto err;
6200 list2 = tmp;
6201 np1 = NULL;
6202
6203 /* (E_Base | EBG) */
6204 np1 = node_new_cclass();
6205 if (IS_NULL(np1)) goto err;
6206 cc = NCCLASS(np1);
6207 {
6208 static const OnigCodePoint ranges[] = {
6209 8,
6210 0x1F3C2, 0x1F3C2,
6211 0x1F3C7, 0x1F3C7,
6212 0x1F3CC, 0x1F3CC,
6213 0x1F3F3, 0x1F3F3,
6214 0x1F441, 0x1F441,
6215 0x1F46F, 0x1F46F,
6216 0x1F574, 0x1F574,
6217 0x1F6CC, 0x1F6CC,
6218 };
6219 r = add_ctype_to_cc_by_range(cc, -1, 0, env, sb_out, ranges);
6220 if (r != 0) goto err;
6221 }
6222 r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=E_Base"), 0, 0, env);
6223 if (r != 0) goto err;
6224 r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=E_Base_GAZ"), 0, 0, env);
6225 if (r != 0) goto err;
6226
6227 tmp = node_new_list(np1, list2);
6228 if (IS_NULL(tmp)) goto err;
6229 list2 = tmp;
6230 np1 = NULL;
6231
6232 tmp = onig_node_new_alt(list2, alt);
6233 if (IS_NULL(tmp)) goto err;
6234 alt = tmp;
6235 list2 = NULL;
6236
6237 /* ZWJ (E_Base_GAZ | Glue_After_Zwj) E_Modifier? */
6238 /* a sequence starting with ZWJ seems artificial, but GraphemeBreakTest
6239 * has such examples.
6240 * http://www.unicode.org/Public/9.0.0/ucd/auxiliary/GraphemeBreakTest.html
6241 */
6242 np1 = node_new_cclass();
6243 if (IS_NULL(np1)) goto err;
6244 cc = NCCLASS(np1);
6245 r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=E_Modifier"), 0, 0, env);
6246 if (r != 0) goto err;
6247
6248 tmp = node_new_quantifier(0, 1, 0);
6249 if (IS_NULL(tmp)) goto err;
6250 NQTFR(tmp)->target = np1;
6251 np1 = tmp;
6252
6253 tmp = node_new_list(np1, NULL_NODE);
6254 if (IS_NULL(tmp)) goto err;
6255 list2 = tmp;
6256 np1 = NULL;
6257
6258 np1 = node_new_cclass();
6259 if (IS_NULL(np1)) goto err;
6260 cc = NCCLASS(np1);
6261 r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=Glue_After_Zwj"), 0, 0, env);
6262 if (r != 0) goto err;
6263 r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=E_Base_GAZ"), 0, 0, env);
6264 if (r != 0) goto err;
6265
6266 tmp = node_new_list(np1, list2);
6267 if (IS_NULL(tmp)) goto err;
6268 list2 = tmp;
6269 np1 = NULL;
6270
6271 r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf);
6272 if (r < 0) goto err;
6273 np1 = node_new_str_raw(buf, buf + r);
6274 if (IS_NULL(np1)) goto err;
6275
6276 tmp = node_new_list(np1, list2);
6277 if (IS_NULL(tmp)) goto err;
6278 list2 = tmp;
6279 np1 = NULL;
6280
6281 tmp = onig_node_new_alt(list2, alt);
6282 if (IS_NULL(tmp)) goto err;
6283 alt = tmp;
6284 list2 = NULL;
6285
6286 /* RI-Sequence := Regional_Indicator{2} */
6287 np1 = node_new_cclass();
6288 if (IS_NULL(np1)) goto err;
6289 cc = NCCLASS(np1);
6290 r = add_code_range(&(cc->mbuf), env, 0x1F1E6, 0x1F1FF);
6291 if (r != 0) goto err;
6292
6293 tmp = node_new_quantifier(2, 2, 0);
6294 if (IS_NULL(tmp)) goto err;
6295 NQTFR(tmp)->target = np1;
6296 np1 = tmp;
6297
6298 tmp = node_new_list(np1, list2);
6299 if (IS_NULL(tmp)) goto err;
6300 list2 = tmp;
6301 np1 = NULL;
6302
6303 tmp = onig_node_new_alt(list2, alt);
6304 if (IS_NULL(tmp)) goto err;
6305 alt = tmp;
6306 list2 = NULL;
6307
6308 tmp = node_new_list(alt, list);
6309 if (IS_NULL(tmp)) goto err;
6310 list = tmp;
6311 alt = NULL;
6312
6313 /* Prepend* */
6314 np1 = node_new_cclass();
6315 if (IS_NULL(np1)) goto err;
6316 cc = NCCLASS(np1);
6317 r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=Prepend"), 0, 0, env);
6318 if (r != 0) goto err;
6319
6320 tmp = node_new_quantifier(0, REPEAT_INFINITE, 0);
6321 if (IS_NULL(tmp)) goto err;
6322 NQTFR(tmp)->target = np1;
6323 np1 = tmp;
6324
6325 tmp = node_new_list(np1, list);
6326 if (IS_NULL(tmp)) goto err;
6327 list = tmp;
6328 np1 = NULL;
6329
6330 /* PerlSyntax: (?s:.), RubySyntax: (?m:.) */
6331 np1 = node_new_anychar();
6332 if (IS_NULL(np1)) goto err;
6333
6334 option = env->option;
6335 ONOFF(option, ONIG_OPTION_MULTILINE, 0);
6336 tmp = node_new_option(option);
6337 if (IS_NULL(tmp)) goto err;
6338 NENCLOSE(tmp)->target = np1;
6339 np1 = tmp;
6340
6341 tmp = onig_node_new_alt(np1, NULL_NODE);
6342 if (IS_NULL(tmp)) goto err;
6343 alt = tmp;
6344 np1 = NULL;
6345
6346 /* Prepend+ */
6347 r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf);
6348 if (r < 0) goto err;
6349 np1 = node_new_str_raw(buf, buf + r);
6350 if (IS_NULL(np1)) goto err;
6351
6352 tmp = node_new_quantifier(0, 1, 0);
6353 if (IS_NULL(tmp)) goto err;
6354 NQTFR(tmp)->target = np1;
6355 np1 = tmp;
6356
6357 tmp = node_new_list(np1, NULL_NODE);
6358 if (IS_NULL(tmp)) goto err;
6359 list2 = tmp;
6360 np1 = NULL;
6361
6362 np1 = node_new_cclass();
6363 if (IS_NULL(np1)) goto err;
6364 cc = NCCLASS(np1);
6365 r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=Prepend"), 0, 0, env);
6366 if (r != 0) goto err;
6367
6368 tmp = node_new_quantifier(1, REPEAT_INFINITE, 0);
6369 if (IS_NULL(tmp)) goto err;
6370 NQTFR(tmp)->target = np1;
6371 np1 = tmp;
6372
6373 tmp = node_new_list(np1, list2);
6374 if (IS_NULL(tmp)) goto err;
6375 list2 = tmp;
6376 np1 = NULL;
6377
6378 tmp = onig_node_new_alt(list2, alt);
6379 if (IS_NULL(tmp)) goto err;
6380 alt = tmp;
6381 list2 = NULL;
6382
6383 tmp = onig_node_new_alt(list, alt);
6384 if (IS_NULL(tmp)) goto err;
6385 alt = tmp;
6386 list = NULL;
6387 }
6388 else
6389#endif /* USE_UNICODE_PROPERTIES */
6390 {
6391 /* PerlSyntax: (?s:.), RubySyntax: (?m:.) */
6392 np1 = node_new_anychar();
6393 if (IS_NULL(np1)) goto err;
6394
6395 option = env->option;
6396 ONOFF(option, ONIG_OPTION_MULTILINE, 0);
6397 tmp = node_new_option(option);
6398 if (IS_NULL(tmp)) goto err;
6399 NENCLOSE(tmp)->target = np1;
6400 np1 = tmp;
6401
6402 alt = onig_node_new_alt(np1, NULL_NODE);
6403 if (IS_NULL(alt)) goto err;
6404 np1 = NULL;
6405 }
6406
6407 /* \x0D\x0A */
6408 r = ONIGENC_CODE_TO_MBC(env->enc, 0x0D, buf);
6409 if (r < 0) goto err;
6410 num1 = r;
6411 r = ONIGENC_CODE_TO_MBC(env->enc, 0x0A, buf + num1);
6412 if (r < 0) goto err;
6413 np1 = node_new_str_raw(buf, buf + num1 + r);
6414 if (IS_NULL(np1)) goto err;
6415
6416 tmp = onig_node_new_alt(np1, alt);
6417 if (IS_NULL(tmp)) goto err;
6418 alt = tmp;
6419 np1 = NULL;
6420
6421 /* (?>\x0D\x0A|...) */
6422 tmp = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
6423 if (IS_NULL(tmp)) goto err;
6424 NENCLOSE(tmp)->target = alt;
6425 np1 = tmp;
6426
6427#ifdef USE_UNICODE_PROPERTIES
6428 if (ONIGENC_IS_UNICODE(env->enc)) {
6429 /* Don't ignore case. */
6430 option = env->option;
6431 ONOFF(option, ONIG_OPTION_IGNORECASE, 1);
6432 *np = node_new_option(option);
6433 if (IS_NULL(*np)) goto err;
6434 NENCLOSE(*np)->target = np1;
6435 }
6436 else
6437#endif
6438 {
6439 *np = np1;
6440 }
6441 return ONIG_NORMAL;
6442
6443 err:
6444 onig_node_free(np1);
6445 onig_node_free(list);
6446 onig_node_free(list2);
6447 onig_node_free(alt);
6448 onig_node_free(alt2);
6449 bbuf_free(pbuf1);
6450 return (r == 0) ? ONIGERR_MEMORY : r;
6451}
6452
/*
 * Count the bits that are set in `bits` (treated as a 32-bit quantity).
 *
 * Adjacent bit fields are summed in parallel: first 1-bit fields into 2-bit
 * sums, then 2-bit into 4-bit, and so on, doubling the field width each
 * round until a single sum remains.
 */
static int
countbits(unsigned int bits)
{
  /* One mask per round; each selects the low half of every field pair. */
  static const unsigned int field_mask[5] = {
    0x55555555, 0x33333333, 0x0f0f0f0f, 0x00ff00ff, 0x0000ffff
  };
  unsigned int width = 1;
  int round;

  for (round = 0; round < 5; round++) {
    bits = (bits & field_mask[round]) + ((bits >> width) & field_mask[round]);
    width <<= 1;
  }
  return bits;
}
6462
6463static int
6464is_onechar_cclass(CClassNode* cc, OnigCodePoint* code)
6465{
6466 const OnigCodePoint not_found = ONIG_LAST_CODE_POINT;
6467 OnigCodePoint c = not_found;
6468 int i;
6469 BBuf *bbuf = cc->mbuf;
6470
6471 if (IS_NCCLASS_NOT(cc)) return 0;
6472
6473 /* check bbuf */
6474 if (IS_NOT_NULL(bbuf)) {
6475 OnigCodePoint n, *data;
6476 GET_CODE_POINT(n, bbuf->p);
6477 data = (OnigCodePoint* )(bbuf->p) + 1;
6478 if ((n == 1) && (data[0] == data[1])) {
6479 /* only one char found in the bbuf, save the code point. */
6480 c = data[0];
6481 if (((c < SINGLE_BYTE_SIZE) && BITSET_AT(cc->bs, c))) {
6482 /* skip if c is included in the bitset */
6483 c = not_found;
6484 }
6485 }
6486 else {
6487 return 0; /* the bbuf contains multiple chars */
6488 }
6489 }
6490
6491 /* check bitset */
6492 for (i = 0; i < BITSET_SIZE; i++) {
6493 Bits b1 = cc->bs[i];
6494 if (b1 != 0) {
6495 if (((b1 & (b1 - 1)) == 0) && (c == not_found)) {
6496 c = BITS_IN_ROOM * i + countbits(b1 - 1);
6497 } else {
6498 return 0; /* the character class contains multiple chars */
6499 }
6500 }
6501 }
6502
6503 if (c != not_found) {
6504 *code = c;
6505 return 1;
6506 }
6507
6508 /* the character class contains no char. */
6509 return 0;
6510}
6511
6512
6513static int
6514parse_exp(Node** np, OnigToken* tok, int term,
6515 UChar** src, UChar* end, ScanEnv* env)
6516{
6517 int r, len, group = 0;
6518 Node* qn;
6519 Node** targetp;
6520
6521 *np = NULL;
6522 if (tok->type == (enum TokenSyms )term)
6523 goto end_of_token;
6524
6525 switch (tok->type) {
6526 case TK_ALT:
6527 case TK_EOT:
6528 end_of_token:
6529 *np = node_new_empty();
6530 return tok->type;
6531 break;
6532
6533 case TK_SUBEXP_OPEN:
6534 r = parse_enclose(np, tok, TK_SUBEXP_CLOSE, src, end, env);
6535 if (r < 0) return r;
6536 if (r == 1) group = 1;
6537 else if (r == 2) { /* option only */
6538 Node* target;
6539 OnigOptionType prev = env->option;
6540
6541 env->option = NENCLOSE(*np)->option;
6542 r = fetch_token(tok, src, end, env);
6543 if (r < 0) {
6544 env->option = prev;
6545 return r;
6546 }
6547 r = parse_subexp(&target, tok, term, src, end, env);
6548 env->option = prev;
6549 if (r < 0) {
6550 onig_node_free(target);
6551 return r;
6552 }
6553 NENCLOSE(*np)->target = target;
6554 return tok->type;
6555 }
6556 break;
6557
6558 case TK_SUBEXP_CLOSE:
6559 if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP))
6560 return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS;
6561
6562 if (tok->escaped) goto tk_raw_byte;
6563 else goto tk_byte;
6564 break;
6565
6566 case TK_LINEBREAK:
6567 r = node_linebreak(np, env);
6568 if (r < 0) return r;
6569 break;
6570
6571 case TK_EXTENDED_GRAPHEME_CLUSTER:
6572 r = node_extended_grapheme_cluster(np, env);
6573 if (r < 0) return r;
6574 break;
6575
6576 case TK_KEEP:
6577 *np = onig_node_new_anchor(ANCHOR_KEEP);
6578 CHECK_NULL_RETURN_MEMERR(*np);
6579 break;
6580
6581 case TK_STRING:
6582 tk_byte:
6583 {
6584 *np = node_new_str(tok->backp, *src);
6585 CHECK_NULL_RETURN_MEMERR(*np);
6586
6587 string_loop:
6588 while (1) {
6589 r = fetch_token(tok, src, end, env);
6590 if (r < 0) return r;
6591 if (r == TK_STRING) {
6592 r = onig_node_str_cat(*np, tok->backp, *src);
6593 }
6594#ifndef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
6595 else if (r == TK_CODE_POINT) {
6596 r = node_str_cat_codepoint(*np, env->enc, tok->u.code);
6597 }
6598#endif
6599 else {
6600 break;
6601 }
6602 if (r < 0) return r;
6603 }
6604
6605 string_end:
6606 targetp = np;
6607 goto repeat;
6608 }
6609 break;
6610
6611 case TK_RAW_BYTE:
6612 tk_raw_byte:
6613 {
6614 *np = node_new_str_raw_char((UChar )tok->u.c);
6615 CHECK_NULL_RETURN_MEMERR(*np);
6616 len = 1;
6617 while (1) {
6618 if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
6619 if (len == enclen(env->enc, NSTR(*np)->s, NSTR(*np)->end)) {
6620 r = fetch_token(tok, src, end, env);
6621 NSTRING_CLEAR_RAW(*np);
6622 goto string_end;
6623 }
6624 }
6625
6626 r = fetch_token(tok, src, end, env);
6627 if (r < 0) return r;
6628 if (r != TK_RAW_BYTE) {
6629 /* Don't use this, it is wrong for little endian encodings. */
6630#ifdef USE_PAD_TO_SHORT_BYTE_CHAR
6631 int rem;
6632 if (len < ONIGENC_MBC_MINLEN(env->enc)) {
6633 rem = ONIGENC_MBC_MINLEN(env->enc) - len;
6634 (void )node_str_head_pad(NSTR(*np), rem, (UChar )0);
6635 if (len + rem == enclen(env->enc, NSTR(*np)->s)) {
6636 NSTRING_CLEAR_RAW(*np);
6637 goto string_end;
6638 }
6639 }
6640#endif
6641 return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
6642 }
6643
6644 r = node_str_cat_char(*np, (UChar )tok->u.c);
6645 if (r < 0) return r;
6646
6647 len++;
6648 }
6649 }
6650 break;
6651
6652 case TK_CODE_POINT:
6653 {
6654 *np = node_new_empty();
6655 CHECK_NULL_RETURN_MEMERR(*np);
6656 r = node_str_cat_codepoint(*np, env->enc, tok->u.code);
6657 if (r != 0) return r;
6658#ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
6659 NSTRING_SET_RAW(*np);
6660#else
6661 goto string_loop;
6662#endif
6663 }
6664 break;
6665
6666 case TK_QUOTE_OPEN:
6667 {
6668 OnigCodePoint end_op[2];
6669 UChar *qstart, *qend, *nextp;
6670
6671 end_op[0] = (OnigCodePoint )MC_ESC(env->syntax);
6672 end_op[1] = (OnigCodePoint )'E';
6673 qstart = *src;
6674 qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
6675 if (IS_NULL(qend)) {
6676 nextp = qend = end;
6677 }
6678 *np = node_new_str(qstart, qend);
6679 CHECK_NULL_RETURN_MEMERR(*np);
6680 *src = nextp;
6681 }
6682 break;
6683
6684 case TK_CHAR_TYPE:
6685 {
6686 switch (tok->u.prop.ctype) {
6687 case ONIGENC_CTYPE_WORD:
6688 *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not,
6689 IS_ASCII_RANGE(env->option));
6690 CHECK_NULL_RETURN_MEMERR(*np);
6691 break;
6692
6693 case ONIGENC_CTYPE_SPACE:
6694 case ONIGENC_CTYPE_DIGIT:
6695 case ONIGENC_CTYPE_XDIGIT:
6696 {
6697 CClassNode* cc;
6698
6699 *np = node_new_cclass();
6700 CHECK_NULL_RETURN_MEMERR(*np);
6701 cc = NCCLASS(*np);
6702 r = add_ctype_to_cc(cc, tok->u.prop.ctype, 0,
6703 IS_ASCII_RANGE(env->option), env);
6704 if (r != 0) return r;
6705 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
6706 }
6707 break;
6708
6709 default:
6710 return ONIGERR_PARSER_BUG;
6711 break;
6712 }
6713 }
6714 break;
6715
6716 case TK_CHAR_PROPERTY:
6717 r = parse_char_property(np, tok, src, end, env);
6718 if (r != 0) return r;
6719 break;
6720
6721 case TK_CC_OPEN:
6722 {
6723 Node *asc_node;
6724 CClassNode* cc;
6725 OnigCodePoint code;
6726
6727 r = parse_char_class(np, &asc_node, tok, src, end, env);
6728 if (r != 0) {
6729 onig_node_free(asc_node);
6730 return r;
6731 }
6732
6733 cc = NCCLASS(*np);
6734 if (is_onechar_cclass(cc, &code)) {
6735 onig_node_free(*np);
6736 onig_node_free(asc_node);
6737 *np = node_new_empty();
6738 CHECK_NULL_RETURN_MEMERR(*np);
6739 r = node_str_cat_codepoint(*np, env->enc, code);
6740 if (r != 0) return r;
6741 goto string_loop;
6742 }
6743 if (IS_IGNORECASE(env->option)) {
6744 r = cclass_case_fold(np, cc, NCCLASS(asc_node), env);
6745 if (r != 0) {
6746 onig_node_free(asc_node);
6747 return r;
6748 }
6749 }
6750 onig_node_free(asc_node);
6751 }
6752 break;
6753
6754 case TK_ANYCHAR:
6755 *np = node_new_anychar();
6756 CHECK_NULL_RETURN_MEMERR(*np);
6757 break;
6758
6759 case TK_ANYCHAR_ANYTIME:
6760 *np = node_new_anychar();
6761 CHECK_NULL_RETURN_MEMERR(*np);
6762 qn = node_new_quantifier(0, REPEAT_INFINITE, 0);
6763 CHECK_NULL_RETURN_MEMERR(qn);
6764 NQTFR(qn)->target = *np;
6765 *np = qn;
6766 break;
6767
6768 case TK_BACKREF:
6769 len = tok->u.backref.num;
6770 *np = node_new_backref(len,
6771 (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)),
6772 tok->u.backref.by_name,
6773#ifdef USE_BACKREF_WITH_LEVEL
6774 tok->u.backref.exist_level,
6775 tok->u.backref.level,
6776#endif
6777 env);
6778 CHECK_NULL_RETURN_MEMERR(*np);
6779 break;
6780
6781#ifdef USE_SUBEXP_CALL
6782 case TK_CALL:
6783 {
6784 int gnum = tok->u.call.gnum;
6785
6786 if (gnum < 0 || tok->u.call.rel != 0) {
6787 if (gnum > 0) gnum--;
6788 gnum = BACKREF_REL_TO_ABS(gnum, env);
6789 if (gnum <= 0)
6790 return ONIGERR_INVALID_BACKREF;
6791 }
6792 *np = node_new_call(tok->u.call.name, tok->u.call.name_end, gnum);
6793 CHECK_NULL_RETURN_MEMERR(*np);
6794 env->num_call++;
6795 }
6796 break;
6797#endif
6798
6799 case TK_ANCHOR:
6800 *np = onig_node_new_anchor(tok->u.anchor.subtype);
6801 CHECK_NULL_RETURN_MEMERR(*np);
6802 NANCHOR(*np)->ascii_range = tok->u.anchor.ascii_range;
6803 break;
6804
6805 case TK_OP_REPEAT:
6806 case TK_INTERVAL:
6807 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) {
6808 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS))
6809 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED;
6810 else
6811 *np = node_new_empty();
6812 }
6813 else {
6814 goto tk_byte;
6815 }
6816 break;
6817
6818 default:
6819 return ONIGERR_PARSER_BUG;
6820 break;
6821 }
6822
6823 {
6824 targetp = np;
6825
6826 re_entry:
6827 r = fetch_token(tok, src, end, env);
6828 if (r < 0) return r;
6829
6830 repeat:
6831 if (r == TK_OP_REPEAT || r == TK_INTERVAL) {
6832 if (is_invalid_quantifier_target(*targetp))
6833 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID;
6834
6835 qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,
6836 (r == TK_INTERVAL ? 1 : 0));
6837 CHECK_NULL_RETURN_MEMERR(qn);
6838 NQTFR(qn)->greedy = tok->u.repeat.greedy;
6839 r = set_quantifier(qn, *targetp, group, env);
6840 if (r < 0) {
6841 onig_node_free(qn);
6842 return r;
6843 }
6844
6845 if (tok->u.repeat.possessive != 0) {
6846 Node* en;
6847 en = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
6848 if (IS_NULL(en)) {
6849 onig_node_free(qn);
6850 return ONIGERR_MEMORY;
6851 }
6852 NENCLOSE(en)->target = qn;
6853 qn = en;
6854 }
6855
6856 if (r == 0) {
6857 *targetp = qn;
6858 }
6859 else if (r == 1) {
6860 onig_node_free(qn);
6861 }
6862 else if (r == 2) { /* split case: /abc+/ */
6863 Node *tmp;
6864
6865 *targetp = node_new_list(*targetp, NULL);
6866 if (IS_NULL(*targetp)) {
6867 onig_node_free(qn);
6868 return ONIGERR_MEMORY;
6869 }
6870 tmp = NCDR(*targetp) = node_new_list(qn, NULL);
6871 if (IS_NULL(tmp)) {
6872 onig_node_free(qn);
6873 return ONIGERR_MEMORY;
6874 }
6875 targetp = &(NCAR(tmp));
6876 }
6877 goto re_entry;
6878 }
6879 }
6880
6881 return r;
6882}
6883
6884static int
6885parse_branch(Node** top, OnigToken* tok, int term,
6886 UChar** src, UChar* end, ScanEnv* env)
6887{
6888 int r;
6889 Node *node, **headp;
6890
6891 *top = NULL;
6892 r = parse_exp(&node, tok, term, src, end, env);
6893 if (r < 0) {
6894 onig_node_free(node);
6895 return r;
6896 }
6897
6898 if (r == TK_EOT || r == term || r == TK_ALT) {
6899 *top = node;
6900 }
6901 else {
6902 *top = node_new_list(node, NULL);
6903 headp = &(NCDR(*top));
6904 while (r != TK_EOT && r != term && r != TK_ALT) {
6905 r = parse_exp(&node, tok, term, src, end, env);
6906 if (r < 0) {
6907 onig_node_free(node);
6908 return r;
6909 }
6910
6911 if (NTYPE(node) == NT_LIST) {
6912 *headp = node;
6913 while (IS_NOT_NULL(NCDR(node))) node = NCDR(node);
6914 headp = &(NCDR(node));
6915 }
6916 else {
6917 *headp = node_new_list(node, NULL);
6918 headp = &(NCDR(*headp));
6919 }
6920 }
6921 }
6922
6923 return r;
6924}
6925
6926/* term_tok: TK_EOT or TK_SUBEXP_CLOSE */
6927static int
6928parse_subexp(Node** top, OnigToken* tok, int term,
6929 UChar** src, UChar* end, ScanEnv* env)
6930{
6931 int r;
6932 Node *node, **headp;
6933
6934 *top = NULL;
6935 env->parse_depth++;
6936 if (env->parse_depth > ParseDepthLimit)
6937 return ONIGERR_PARSE_DEPTH_LIMIT_OVER;
6938 r = parse_branch(&node, tok, term, src, end, env);
6939 if (r < 0) {
6940 onig_node_free(node);
6941 return r;
6942 }
6943
6944 if (r == term) {
6945 *top = node;
6946 }
6947 else if (r == TK_ALT) {
6948 *top = onig_node_new_alt(node, NULL);
6949 headp = &(NCDR(*top));
6950 while (r == TK_ALT) {
6951 r = fetch_token(tok, src, end, env);
6952 if (r < 0) return r;
6953 r = parse_branch(&node, tok, term, src, end, env);
6954 if (r < 0) {
6955 onig_node_free(node);
6956 return r;
6957 }
6958
6959 *headp = onig_node_new_alt(node, NULL);
6960 headp = &(NCDR(*headp));
6961 }
6962
6963 if (tok->type != (enum TokenSyms )term)
6964 goto err;
6965 }
6966 else {
6967 onig_node_free(node);
6968 err:
6969 if (term == TK_SUBEXP_CLOSE)
6970 return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
6971 else
6972 return ONIGERR_PARSER_BUG;
6973 }
6974
6975 env->parse_depth--;
6976 return r;
6977}
6978
6979static int
6980parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
6981{
6982 int r;
6983 OnigToken tok;
6984
6985 r = fetch_token(&tok, src, end, env);
6986 if (r < 0) return r;
6987 r = parse_subexp(top, &tok, TK_EOT, src, end, env);
6988 if (r < 0) return r;
6989
6990#ifdef USE_SUBEXP_CALL
6991 if (env->num_call > 0) {
6992 /* Capture the pattern itself. It is used for (?R), (?0) and \g<0>. */
6993 const int num = 0;
6994 Node* np;
6995 np = node_new_enclose_memory(env->option, 0);
6996 CHECK_NULL_RETURN_MEMERR(np);
6997 NENCLOSE(np)->regnum = num;
6998 NENCLOSE(np)->target = *top;
6999 r = scan_env_set_mem_node(env, num, np);
7000 if (r != 0) {
7001 onig_node_free(np);
7002 return r;
7003 }
7004 *top = np;
7005 }
7006#endif
7007 return 0;
7008}
7009
7010extern int
7011onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end,
7012 regex_t* reg, ScanEnv* env)
7013{
7014 int r;
7015 UChar* p;
7016
7017#ifdef USE_NAMED_GROUP
7018 names_clear(reg);
7019#endif
7020
7021 scan_env_clear(env);
7022 env->option = reg->options;
7023 env->case_fold_flag = reg->case_fold_flag;
7024 env->enc = reg->enc;
7025 env->syntax = reg->syntax;
7026 env->pattern = (UChar* )pattern;
7027 env->pattern_end = (UChar* )end;
7028 env->reg = reg;
7029
7030 *root = NULL;
7031 p = (UChar* )pattern;
7032 r = parse_regexp(root, &p, (UChar* )end, env);
7033 reg->num_mem = env->num_mem;
7034 return r;
7035}
7036
7037extern void
7038onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED,
7039 UChar* arg, UChar* arg_end)
7040{
7041 env->error = arg;
7042 env->error_end = arg_end;
7043}
Note: See TracBrowser for help on using the repository browser.