source: EcnlProtoTool/trunk/onigmo-5.15.0/src/regparse.c@ 279

Last change on this file since 279 was 279, checked in by coas-nagasima, 7 years ago

ファイルを追加、更新。

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
  • Property svn:mime-type set to text/x-csrc
File size: 145.3 KB
Line 
1/**********************************************************************
2 regparse.c - Onigmo (Oniguruma-mod) (regular expression library)
3**********************************************************************/
4/*-
5 * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * Copyright (c) 2011-2014 K.Takata <kentkt AT csc DOT jp>
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31#include "regparse.h"
32#include "st.h"
33
34#define WARN_BUFSIZE 256
35
36#define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
37
38
/*
 * Ruby-flavoured syntax description.
 *
 * The initializer supplies, in order: two operator flag sets
 * (ONIG_SYN_OP_* and ONIG_SYN_OP2_*), a set of ONIG_SYN_* behavior flags,
 * a set of ONIG_OPTION_* flags, and a table of six meta characters in
 * which only the escape character ('\\') is active; every other slot is
 * ONIG_INEFFECTIVE_META_CHAR.
 * NOTE(review): the member names of OnigSyntaxType are not visible here;
 * confirm the field order against its declaration.
 */
39OnigSyntaxType OnigSyntaxRuby = {
40 (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
41 ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
42 ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
43 ONIG_SYN_OP_ESC_C_CONTROL )
44 & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
45 , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
46 ONIG_SYN_OP2_OPTION_RUBY |
47 ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
48 ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
49 ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY |
50 ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
51 ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
52 ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
53 ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
54 ONIG_SYN_OP2_ESC_H_XDIGIT |
55 ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER |
56 ONIG_SYN_OP2_QMARK_LPAREN_CONDITION |
57 ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK |
58 ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP )
59 , ( SYN_GNU_REGEX_BV |
60 ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
61 ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
62 ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
63 ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
64 ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
65 ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
66 ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
67 , ( ONIG_OPTION_ASCII_RANGE | ONIG_OPTION_POSIX_BRACKET_ALL_RANGE |
68 ONIG_OPTION_WORD_BOUND_ALL_RANGE )
69 ,
70 {
71 (OnigCodePoint )'\\' /* esc */
72 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
73 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
74 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
75 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
76 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
77 }
78};
79
80OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY;
81
82extern void onig_null_warn(const char* s ARG_UNUSED) { }
83
84#ifdef DEFAULT_WARN_FUNCTION
85static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
86#else
87static OnigWarnFunc onig_warn = onig_null_warn;
88#endif
89
90#ifdef DEFAULT_VERB_WARN_FUNCTION
91static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
92#else
93static OnigWarnFunc onig_verb_warn = onig_null_warn;
94#endif
95
96extern void onig_set_warn_func(OnigWarnFunc f)
97{
98 onig_warn = f;
99}
100
101extern void onig_set_verb_warn_func(OnigWarnFunc f)
102{
103 onig_verb_warn = f;
104}
105
106static void
107bbuf_free(BBuf* bbuf)
108{
109 if (IS_NOT_NULL(bbuf)) {
110 if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
111 xfree(bbuf);
112 }
113}
114
115static int
116bbuf_clone(BBuf** rto, BBuf* from)
117{
118 int r;
119 BBuf *to;
120
121 *rto = to = (BBuf* )xmalloc(sizeof(BBuf));
122 CHECK_NULL_RETURN_MEMERR(to);
123 r = BBUF_INIT(to, from->alloc);
124 if (r != 0) return r;
125 to->used = from->used;
126 xmemcpy(to->p, from->p, from->used);
127 return 0;
128}
129
/* Maps a relative back-reference number to (env)->num_mem + 1 + rel_no. */
130#define BACKREF_REL_TO_ABS(rel_no, env) \
131 ((env)->num_mem + 1 + (rel_no))
132
/*
 * Clears the bits f in v when `negative` is true, otherwise sets them.
 * NOTE(review): the expansion as a whole is not parenthesized.
 */
133#define ONOFF(v,f,negative) (negative) ? ((v) &= ~(f)) : ((v) |= (f))
134
/* 0 when the encoding's minimum character length exceeds 1, else 0x80. */
135#define MBCODE_START_POS(enc) \
136 (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80)
137
/* Adds the range MBCODE_START_POS(enc)..ONIG_LAST_CODE_POINT to pbuf. */
138#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
139 add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ONIG_LAST_CODE_POINT)
140
/*
 * For encodings that are not single-byte, applies SET_ALL_MULTI_BYTE_RANGE
 * to &mbuf.  Assigns to a variable named `r` in the expanding scope and
 * returns r from the enclosing function when it is non-zero.
 */
141#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
142 if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
143 r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
144 if (r) return r;\
145 }\
146} while (0)
147
148
/* Sets `empty` to 1 if all BITSET_SIZE words of bs are zero, else to 0. */
149#define BITSET_IS_EMPTY(bs,empty) do {\
150 int i;\
151 empty = 1;\
152 for (i = 0; i < BITSET_SIZE; i++) {\
153 if ((bs)[i] != 0) {\
154 empty = 0; break;\
155 }\
156 }\
157} while (0)
158
159static void
160bitset_set_range(BitSetRef bs, int from, int to)
161{
162 int i;
163 for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
164 BITSET_SET_BIT(bs, i);
165 }
166}
167
168#if 0
169static void
170bitset_set_all(BitSetRef bs)
171{
172 int i;
173 for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~((Bits )0); }
174}
175#endif
176
177static void
178bitset_invert(BitSetRef bs)
179{
180 int i;
181 for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~(bs[i]); }
182}
183
184static void
185bitset_invert_to(BitSetRef from, BitSetRef to)
186{
187 int i;
188 for (i = 0; i < BITSET_SIZE; i++) { to[i] = ~(from[i]); }
189}
190
191static void
192bitset_and(BitSetRef dest, BitSetRef bs)
193{
194 int i;
195 for (i = 0; i < BITSET_SIZE; i++) { dest[i] &= bs[i]; }
196}
197
198static void
199bitset_or(BitSetRef dest, BitSetRef bs)
200{
201 int i;
202 for (i = 0; i < BITSET_SIZE; i++) { dest[i] |= bs[i]; }
203}
204
205static void
206bitset_copy(BitSetRef dest, BitSetRef bs)
207{
208 int i;
209 for (i = 0; i < BITSET_SIZE; i++) { dest[i] = bs[i]; }
210}
211
212extern int
213onig_strncmp(const UChar* s1, const UChar* s2, int n)
214{
215 int x;
216
217 while (n-- > 0) {
218 x = *s2++ - *s1++;
219 if (x) return x;
220 }
221 return 0;
222}
223
224extern void
225onig_strcpy(UChar* dest, const UChar* src, const UChar* end)
226{
227 ptrdiff_t len = end - src;
228 if (len > 0) {
229 xmemcpy(dest, src, len);
230 dest[len] = (UChar )0;
231 }
232}
233
234#ifdef USE_NAMED_GROUP
235static UChar*
236strdup_with_null(OnigEncoding enc, UChar* s, UChar* end)
237{
238 ptrdiff_t slen;
239 int term_len, i;
240 UChar *r;
241
242 slen = end - s;
243 term_len = ONIGENC_MBC_MINLEN(enc);
244
245 r = (UChar* )xmalloc(slen + term_len);
246 CHECK_NULL_RETURN(r);
247 xmemcpy(r, s, slen);
248
249 for (i = 0; i < term_len; i++)
250 r[slen + i] = (UChar )0;
251
252 return r;
253}
254#endif
255
256/* scan pattern methods */
/*
 * Pattern-scanning helpers.  Apart from PEND_VALUE, these macros refer to
 * variables named `p`, `end`, `enc` and (for the non-_S forms)
 * `pfetch_prev` in the expanding scope; PFETCH_READY declares
 * `pfetch_prev`.
 *
 *   PEND        1 when p has reached end, else 0
 *   PINC        remember p in pfetch_prev, then advance one character
 *   PFETCH(c)   decode the character at p into c, then do what PINC does
 *   PUNFETCH    move p back to pfetch_prev
 *   PINC_S / PFETCH_S   same as PINC / PFETCH without updating pfetch_prev
 *   PPEEK       character at p without advancing, or PEND_VALUE at end
 *   PPEEK_IS(c) true when PPEEK equals c
 */
257#define PEND_VALUE 0
258
259#ifdef __GNUC__
260/* get rid of Wunused-but-set-variable and Wuninitialized */
261#define PFETCH_READY UChar* pfetch_prev = NULL; (void)pfetch_prev
262#else
263#define PFETCH_READY UChar* pfetch_prev
264#endif
265#define PEND (p < end ? 0 : 1)
266#define PUNFETCH p = pfetch_prev
267#define PINC do { \
268 pfetch_prev = p; \
269 p += ONIGENC_MBC_ENC_LEN(enc, p); \
270} while (0)
271#define PFETCH(c) do { \
272 c = ONIGENC_MBC_TO_CODE(enc, p, end); \
273 pfetch_prev = p; \
274 p += ONIGENC_MBC_ENC_LEN(enc, p); \
275} while (0)
276
277#define PINC_S do { \
278 p += ONIGENC_MBC_ENC_LEN(enc, p); \
279} while (0)
280#define PFETCH_S(c) do { \
281 c = ONIGENC_MBC_TO_CODE(enc, p, end); \
282 p += ONIGENC_MBC_ENC_LEN(enc, p); \
283} while (0)
284
285#define PPEEK (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
286#define PPEEK_IS(c) (PPEEK == (OnigCodePoint )c)
287
288static UChar*
289strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
290 size_t capa)
291{
292 UChar* r;
293
294 if (dest)
295 r = (UChar* )xrealloc(dest, capa + 1);
296 else
297 r = (UChar* )xmalloc(capa + 1);
298
299 CHECK_NULL_RETURN(r);
300 onig_strcpy(r + (dest_end - dest), src, src_end);
301 return r;
302}
303
304/* dest on static area */
305static UChar*
306strcat_capa_from_static(UChar* dest, UChar* dest_end,
307 const UChar* src, const UChar* src_end, size_t capa)
308{
309 UChar* r;
310
311 r = (UChar* )xmalloc(capa + 1);
312 CHECK_NULL_RETURN(r);
313 onig_strcpy(r, dest, dest_end);
314 onig_strcpy(r + (dest_end - dest), src, src_end);
315 return r;
316}
317
318
319#ifdef USE_ST_LIBRARY
320
/* Hash key describing the byte range [s, end); the bytes are not copied. */
321typedef struct {
322 UChar* s;
323 UChar* end;
324} st_str_end_key;
325
326static int
327str_end_cmp(st_str_end_key* x, st_str_end_key* y)
328{
329 UChar *p, *q;
330 int c;
331
332 if ((x->end - x->s) != (y->end - y->s))
333 return 1;
334
335 p = x->s;
336 q = y->s;
337 while (p < x->end) {
338 c = (int )*p - (int )*q;
339 if (c != 0) return c;
340
341 p++; q++;
342 }
343
344 return 0;
345}
346
347static int
348str_end_hash(st_str_end_key* x)
349{
350 UChar *p;
351 int val = 0;
352
353 p = x->s;
354 while (p < x->end) {
355 val = val * 997 + (int )*p++;
356 }
357
358 return val + (val >> 5);
359}
360
361extern hash_table_type*
362onig_st_init_strend_table_with_size(int size)
363{
364 static struct st_hash_type hashType = {
365 str_end_cmp,
366 str_end_hash,
367 };
368
369 return (hash_table_type* )
370 onig_st_init_table_with_size(&hashType, size);
371}
372
373extern int
374onig_st_lookup_strend(hash_table_type* table, const UChar* str_key,
375 const UChar* end_key, hash_data_type *value)
376{
377 st_str_end_key key;
378
379 key.s = (UChar* )str_key;
380 key.end = (UChar* )end_key;
381
382 return onig_st_lookup(table, (st_data_t )(&key), value);
383}
384
385extern int
386onig_st_insert_strend(hash_table_type* table, const UChar* str_key,
387 const UChar* end_key, hash_data_type value)
388{
389 st_str_end_key* key;
390 int result;
391
392 key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key));
393 key->s = (UChar* )str_key;
394 key->end = (UChar* )end_key;
395 result = onig_st_insert(table, (st_data_t )key, value);
396 if (result) {
397 xfree(key);
398 }
399 return result;
400}
401
402#endif /* USE_ST_LIBRARY */
403
404
405#ifdef USE_NAMED_GROUP
406
407#define INIT_NAME_BACKREFS_ALLOC_NUM 8
408
/*
 * One named group.  While back_num is 1 the single group number lives in
 * back_ref1; from the second definition on, the numbers are kept in the
 * heap array back_refs (capacity back_alloc).
 */
409typedef struct {
410 UChar* name;
411 size_t name_len; /* byte length */
412 int back_num; /* number of backrefs */
413 int back_alloc;
414 int back_ref1;
415 int* back_refs;
416} NameEntry;
417
418#ifdef USE_ST_LIBRARY
419
420typedef st_table NameTable;
421typedef st_data_t HashDataType; /* 1.6 st.h doesn't define st_data_t type */
422
423#ifdef ONIG_DEBUG
424static int
425i_print_name_entry(UChar* key, NameEntry* e, void* arg)
426{
427 int i;
428 FILE* fp = (FILE* )arg;
429
430 fprintf(fp, "%s: ", e->name);
431 if (e->back_num == 0)
432 fputs("-", fp);
433 else if (e->back_num == 1)
434 fprintf(fp, "%d", e->back_ref1);
435 else {
436 for (i = 0; i < e->back_num; i++) {
437 if (i > 0) fprintf(fp, ", ");
438 fprintf(fp, "%d", e->back_refs[i]);
439 }
440 }
441 fputs("\n", fp);
442 return ST_CONTINUE;
443}
444
445extern int
446onig_print_names(FILE* fp, regex_t* reg)
447{
448 NameTable* t = (NameTable* )reg->name_table;
449
450 if (IS_NOT_NULL(t)) {
451 fprintf(fp, "name table\n");
452 onig_st_foreach(t, i_print_name_entry, (HashDataType )fp);
453 fputs("\n", fp);
454 }
455 return 0;
456}
457#endif /* ONIG_DEBUG */
458
459static int
460i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED)
461{
462 xfree(e->name);
463 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
464 xfree(key);
465 xfree(e);
466 return ST_DELETE;
467}
468
469static int
470names_clear(regex_t* reg)
471{
472 NameTable* t = (NameTable* )reg->name_table;
473
474 if (IS_NOT_NULL(t)) {
475 onig_st_foreach(t, i_free_name_entry, 0);
476 }
477 return 0;
478}
479
480extern int
481onig_names_free(regex_t* reg)
482{
483 int r;
484 NameTable* t;
485
486 r = names_clear(reg);
487 if (r) return r;
488
489 t = (NameTable* )reg->name_table;
490 if (IS_NOT_NULL(t)) onig_st_free_table(t);
491 reg->name_table = (void* )NULL;
492 return 0;
493}
494
495static NameEntry*
496name_find(regex_t* reg, const UChar* name, const UChar* name_end)
497{
498 NameEntry* e;
499 NameTable* t = (NameTable* )reg->name_table;
500
501 e = (NameEntry* )NULL;
502 if (IS_NOT_NULL(t)) {
503 onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
504 }
505 return e;
506}
507
/*
 * Context passed through the table iterator by onig_foreach_name: the user
 * callback and its argument, the regex being walked, the encoding, and
 * `ret`, which records the first non-zero value the callback returned.
 */
508typedef struct {
509 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*);
510 regex_t* reg;
511 void* arg;
512 int ret;
513 OnigEncoding enc;
514} INamesArg;
515
516static int
517i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg)
518{
519 int r = (*(arg->func))(e->name,
520 e->name + e->name_len,
521 e->back_num,
522 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
523 arg->reg, arg->arg);
524 if (r != 0) {
525 arg->ret = r;
526 return ST_STOP;
527 }
528 return ST_CONTINUE;
529}
530
531extern int
532onig_foreach_name(regex_t* reg,
533 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
534{
535 INamesArg narg;
536 NameTable* t = (NameTable* )reg->name_table;
537
538 narg.ret = 0;
539 if (IS_NOT_NULL(t)) {
540 narg.func = func;
541 narg.reg = reg;
542 narg.arg = arg;
543 narg.enc = reg->enc; /* should be pattern encoding. */
544 onig_st_foreach(t, i_names, (HashDataType )&narg);
545 }
546 return narg.ret;
547}
548
549static int
550i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map)
551{
552 int i;
553
554 if (e->back_num > 1) {
555 for (i = 0; i < e->back_num; i++) {
556 e->back_refs[i] = map[e->back_refs[i]].new_val;
557 }
558 }
559 else if (e->back_num == 1) {
560 e->back_ref1 = map[e->back_ref1].new_val;
561 }
562
563 return ST_CONTINUE;
564}
565
566extern int
567onig_renumber_name_table(regex_t* reg, GroupNumRemap* map)
568{
569 NameTable* t = (NameTable* )reg->name_table;
570
571 if (IS_NOT_NULL(t)) {
572 onig_st_foreach(t, i_renumber_name, (HashDataType )map);
573 }
574 return 0;
575}
576
577
578extern int
579onig_number_of_names(regex_t* reg)
580{
581 NameTable* t = (NameTable* )reg->name_table;
582
583 if (IS_NOT_NULL(t))
584 return t->num_entries;
585 else
586 return 0;
587}
588
589#else /* USE_ST_LIBRARY */
590
591#define INIT_NAMES_ALLOC_NUM 8
592
/*
 * Array-based name table (used when USE_ST_LIBRARY is not defined):
 * `e` holds `alloc` slots of which the first `num` are in use.
 */
593typedef struct {
594 NameEntry* e;
595 int num;
596 int alloc;
597} NameTable;
598
599#ifdef ONIG_DEBUG
600extern int
601onig_print_names(FILE* fp, regex_t* reg)
602{
603 int i, j;
604 NameEntry* e;
605 NameTable* t = (NameTable* )reg->name_table;
606
607 if (IS_NOT_NULL(t) && t->num > 0) {
608 fprintf(fp, "name table\n");
609 for (i = 0; i < t->num; i++) {
610 e = &(t->e[i]);
611 fprintf(fp, "%s: ", e->name);
612 if (e->back_num == 0) {
613 fputs("-", fp);
614 }
615 else if (e->back_num == 1) {
616 fprintf(fp, "%d", e->back_ref1);
617 }
618 else {
619 for (j = 0; j < e->back_num; j++) {
620 if (j > 0) fprintf(fp, ", ");
621 fprintf(fp, "%d", e->back_refs[j]);
622 }
623 }
624 fputs("\n", fp);
625 }
626 fputs("\n", fp);
627 }
628 return 0;
629}
630#endif
631
632static int
633names_clear(regex_t* reg)
634{
635 int i;
636 NameEntry* e;
637 NameTable* t = (NameTable* )reg->name_table;
638
639 if (IS_NOT_NULL(t)) {
640 for (i = 0; i < t->num; i++) {
641 e = &(t->e[i]);
642 if (IS_NOT_NULL(e->name)) {
643 xfree(e->name);
644 e->name = NULL;
645 e->name_len = 0;
646 e->back_num = 0;
647 e->back_alloc = 0;
648 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
649 e->back_refs = (int* )NULL;
650 }
651 }
652 if (IS_NOT_NULL(t->e)) {
653 xfree(t->e);
654 t->e = NULL;
655 }
656 t->num = 0;
657 }
658 return 0;
659}
660
661extern int
662onig_names_free(regex_t* reg)
663{
664 int r;
665 NameTable* t;
666
667 r = names_clear(reg);
668 if (r) return r;
669
670 t = (NameTable* )reg->name_table;
671 if (IS_NOT_NULL(t)) xfree(t);
672 reg->name_table = NULL;
673 return 0;
674}
675
676static NameEntry*
677name_find(regex_t* reg, const UChar* name, const UChar* name_end)
678{
679 int i, len;
680 NameEntry* e;
681 NameTable* t = (NameTable* )reg->name_table;
682
683 if (IS_NOT_NULL(t)) {
684 len = name_end - name;
685 for (i = 0; i < t->num; i++) {
686 e = &(t->e[i]);
687 if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
688 return e;
689 }
690 }
691 return (NameEntry* )NULL;
692}
693
694extern int
695onig_foreach_name(regex_t* reg,
696 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
697{
698 int i, r;
699 NameEntry* e;
700 NameTable* t = (NameTable* )reg->name_table;
701
702 if (IS_NOT_NULL(t)) {
703 for (i = 0; i < t->num; i++) {
704 e = &(t->e[i]);
705 r = (*func)(e->name, e->name + e->name_len, e->back_num,
706 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
707 reg, arg);
708 if (r != 0) return r;
709 }
710 }
711 return 0;
712}
713
714extern int
715onig_number_of_names(regex_t* reg)
716{
717 NameTable* t = (NameTable* )reg->name_table;
718
719 if (IS_NOT_NULL(t))
720 return t->num;
721 else
722 return 0;
723}
724
725#endif /* else USE_ST_LIBRARY */
726
727static int
728name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
729{
730 int alloc;
731 NameEntry* e;
732 NameTable* t = (NameTable* )reg->name_table;
733
734 if (name_end - name <= 0)
735 return ONIGERR_EMPTY_GROUP_NAME;
736
737 e = name_find(reg, name, name_end);
738 if (IS_NULL(e)) {
739#ifdef USE_ST_LIBRARY
740 if (IS_NULL(t)) {
741 t = onig_st_init_strend_table_with_size(5);
742 reg->name_table = (void* )t;
743 }
744 e = (NameEntry* )xmalloc(sizeof(NameEntry));
745 CHECK_NULL_RETURN_MEMERR(e);
746
747 e->name = strdup_with_null(reg->enc, name, name_end);
748 if (IS_NULL(e->name)) {
749 xfree(e);
750 return ONIGERR_MEMORY;
751 }
752 onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
753 (HashDataType )e);
754
755 e->name_len = name_end - name;
756 e->back_num = 0;
757 e->back_alloc = 0;
758 e->back_refs = (int* )NULL;
759
760#else
761
762 if (IS_NULL(t)) {
763 alloc = INIT_NAMES_ALLOC_NUM;
764 t = (NameTable* )xmalloc(sizeof(NameTable));
765 CHECK_NULL_RETURN_MEMERR(t);
766 t->e = NULL;
767 t->alloc = 0;
768 t->num = 0;
769
770 t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
771 if (IS_NULL(t->e)) {
772 xfree(t);
773 return ONIGERR_MEMORY;
774 }
775 t->alloc = alloc;
776 reg->name_table = t;
777 goto clear;
778 }
779 else if (t->num == t->alloc) {
780 int i;
781 NameEntry* p;
782
783 alloc = t->alloc * 2;
784 p = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
785 CHECK_NULL_RETURN_MEMERR(p);
786 t->e = p;
787 t->alloc = alloc;
788
789 clear:
790 for (i = t->num; i < t->alloc; i++) {
791 t->e[i].name = NULL;
792 t->e[i].name_len = 0;
793 t->e[i].back_num = 0;
794 t->e[i].back_alloc = 0;
795 t->e[i].back_refs = (int* )NULL;
796 }
797 }
798 e = &(t->e[t->num]);
799 t->num++;
800 e->name = strdup_with_null(reg->enc, name, name_end);
801 if (IS_NULL(e->name)) return ONIGERR_MEMORY;
802 e->name_len = name_end - name;
803#endif
804 }
805
806 if (e->back_num >= 1 &&
807 ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
808 onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
809 name, name_end);
810 return ONIGERR_MULTIPLEX_DEFINED_NAME;
811 }
812
813 e->back_num++;
814 if (e->back_num == 1) {
815 e->back_ref1 = backref;
816 }
817 else {
818 if (e->back_num == 2) {
819 alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
820 e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
821 CHECK_NULL_RETURN_MEMERR(e->back_refs);
822 e->back_alloc = alloc;
823 e->back_refs[0] = e->back_ref1;
824 e->back_refs[1] = backref;
825 }
826 else {
827 if (e->back_num > e->back_alloc) {
828 int* p;
829 alloc = e->back_alloc * 2;
830 p = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
831 CHECK_NULL_RETURN_MEMERR(p);
832 e->back_refs = p;
833 e->back_alloc = alloc;
834 }
835 e->back_refs[e->back_num - 1] = backref;
836 }
837 }
838
839 return 0;
840}
841
842extern int
843onig_name_to_group_numbers(regex_t* reg, const UChar* name,
844 const UChar* name_end, int** nums)
845{
846 NameEntry* e = name_find(reg, name, name_end);
847
848 if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
849
850 switch (e->back_num) {
851 case 0:
852 *nums = 0;
853 break;
854 case 1:
855 *nums = &(e->back_ref1);
856 break;
857 default:
858 *nums = e->back_refs;
859 break;
860 }
861 return e->back_num;
862}
863
864extern int
865onig_name_to_backref_number(regex_t* reg, const UChar* name,
866 const UChar* name_end, OnigRegion *region)
867{
868 int i, n, *nums;
869
870 n = onig_name_to_group_numbers(reg, name, name_end, &nums);
871 if (n < 0)
872 return n;
873 else if (n == 0)
874 return ONIGERR_PARSER_BUG;
875 else if (n == 1)
876 return nums[0];
877 else {
878 if (IS_NOT_NULL(region)) {
879 for (i = n - 1; i >= 0; i--) {
880 if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
881 return nums[i];
882 }
883 }
884 return nums[n - 1];
885 }
886}
887
888#else /* USE_NAMED_GROUP */
889
890extern int
891onig_name_to_group_numbers(regex_t* reg, const UChar* name,
892 const UChar* name_end, int** nums)
893{
894 return ONIG_NO_SUPPORT_CONFIG;
895}
896
897extern int
898onig_name_to_backref_number(regex_t* reg, const UChar* name,
899 const UChar* name_end, OnigRegion* region)
900{
901 return ONIG_NO_SUPPORT_CONFIG;
902}
903
904extern int
905onig_foreach_name(regex_t* reg,
906 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
907{
908 return ONIG_NO_SUPPORT_CONFIG;
909}
910
911extern int
912onig_number_of_names(regex_t* reg)
913{
914 return 0;
915}
916#endif /* else USE_NAMED_GROUP */
917
918extern int
919onig_noname_group_capture_is_active(regex_t* reg)
920{
921 if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP))
922 return 0;
923
924#ifdef USE_NAMED_GROUP
925 if (onig_number_of_names(reg) > 0 &&
926 IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
927 !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
928 return 0;
929 }
930#endif
931
932 return 1;
933}
934
935
936#define INIT_SCANENV_MEMNODES_ALLOC_SIZE 16
937
938static void
939scan_env_clear(ScanEnv* env)
940{
941 int i;
942
943 BIT_STATUS_CLEAR(env->capture_history);
944 BIT_STATUS_CLEAR(env->bt_mem_start);
945 BIT_STATUS_CLEAR(env->bt_mem_end);
946 BIT_STATUS_CLEAR(env->backrefed_mem);
947 env->error = (UChar* )NULL;
948 env->error_end = (UChar* )NULL;
949 env->num_call = 0;
950 env->num_mem = 0;
951#ifdef USE_NAMED_GROUP
952 env->num_named = 0;
953#endif
954 env->mem_alloc = 0;
955 env->mem_nodes_dynamic = (Node** )NULL;
956
957 for (i = 0; i < SCANENV_MEMNODES_SIZE; i++)
958 env->mem_nodes_static[i] = NULL_NODE;
959
960#ifdef USE_COMBINATION_EXPLOSION_CHECK
961 env->num_comb_exp_check = 0;
962 env->comb_exp_max_regnum = 0;
963 env->curr_max_regnum = 0;
964 env->has_recursion = 0;
965#endif
966}
967
968static int
969scan_env_add_mem_entry(ScanEnv* env)
970{
971 int i, need, alloc;
972 Node** p;
973
974 need = env->num_mem + 1;
975 if (need > ONIG_MAX_CAPTURE_GROUP_NUM)
976 return ONIGERR_TOO_MANY_CAPTURE_GROUPS;
977 if (need >= SCANENV_MEMNODES_SIZE) {
978 if (env->mem_alloc <= need) {
979 if (IS_NULL(env->mem_nodes_dynamic)) {
980 alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE;
981 p = (Node** )xmalloc(sizeof(Node*) * alloc);
982 xmemcpy(p, env->mem_nodes_static,
983 sizeof(Node*) * SCANENV_MEMNODES_SIZE);
984 }
985 else {
986 alloc = env->mem_alloc * 2;
987 p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc);
988 }
989 CHECK_NULL_RETURN_MEMERR(p);
990
991 for (i = env->num_mem + 1; i < alloc; i++)
992 p[i] = NULL_NODE;
993
994 env->mem_nodes_dynamic = p;
995 env->mem_alloc = alloc;
996 }
997 }
998
999 env->num_mem++;
1000 return env->num_mem;
1001}
1002
1003static int
1004scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
1005{
1006 if (env->num_mem >= num)
1007 SCANENV_MEM_NODES(env)[num] = node;
1008 else
1009 return ONIGERR_PARSER_BUG;
1010 return 0;
1011}
1012
1013
1014#ifdef USE_PARSE_TREE_NODE_RECYCLE
/*
 * Link cell overlaid on a released Node so that released nodes can be
 * chained into a singly linked free list.
 */
1015typedef struct _FreeNode {
1016 struct _FreeNode* next;
1017} FreeNode;
1018
1019static FreeNode* FreeNodeList = (FreeNode* )NULL;
1020#endif
1021
1022extern void
1023onig_node_free(Node* node)
1024{
1025 start:
1026 if (IS_NULL(node)) return ;
1027
1028 switch (NTYPE(node)) {
1029 case NT_STR:
1030 if (NSTR(node)->capa != 0 &&
1031 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1032 xfree(NSTR(node)->s);
1033 }
1034 break;
1035
1036 case NT_LIST:
1037 case NT_ALT:
1038 onig_node_free(NCAR(node));
1039 {
1040 Node* next_node = NCDR(node);
1041
1042#ifdef USE_PARSE_TREE_NODE_RECYCLE
1043 {
1044 FreeNode* n = (FreeNode* )node;
1045
1046 THREAD_ATOMIC_START;
1047 n->next = FreeNodeList;
1048 FreeNodeList = n;
1049 THREAD_ATOMIC_END;
1050 }
1051#else
1052 xfree(node);
1053#endif
1054 node = next_node;
1055 goto start;
1056 }
1057 break;
1058
1059 case NT_CCLASS:
1060 {
1061 CClassNode* cc = NCCLASS(node);
1062
1063 if (IS_NCCLASS_SHARE(cc)) return ;
1064 if (cc->mbuf)
1065 bbuf_free(cc->mbuf);
1066 }
1067 break;
1068
1069 case NT_QTFR:
1070 if (NQTFR(node)->target)
1071 onig_node_free(NQTFR(node)->target);
1072 break;
1073
1074 case NT_ENCLOSE:
1075 if (NENCLOSE(node)->target)
1076 onig_node_free(NENCLOSE(node)->target);
1077 break;
1078
1079 case NT_BREF:
1080 if (IS_NOT_NULL(NBREF(node)->back_dynamic))
1081 xfree(NBREF(node)->back_dynamic);
1082 break;
1083
1084 case NT_ANCHOR:
1085 if (NANCHOR(node)->target)
1086 onig_node_free(NANCHOR(node)->target);
1087 break;
1088 }
1089
1090#ifdef USE_PARSE_TREE_NODE_RECYCLE
1091 {
1092 FreeNode* n = (FreeNode* )node;
1093
1094 THREAD_ATOMIC_START;
1095 n->next = FreeNodeList;
1096 FreeNodeList = n;
1097 THREAD_ATOMIC_END;
1098 }
1099#else
1100 xfree(node);
1101#endif
1102}
1103
1104#ifdef USE_PARSE_TREE_NODE_RECYCLE
1105extern int
1106onig_free_node_list(void)
1107{
1108 FreeNode* n;
1109
1110 /* THREAD_ATOMIC_START; */
1111 while (IS_NOT_NULL(FreeNodeList)) {
1112 n = FreeNodeList;
1113 FreeNodeList = FreeNodeList->next;
1114 xfree(n);
1115 }
1116 /* THREAD_ATOMIC_END; */
1117 return 0;
1118}
1119#endif
1120
1121static Node*
1122node_new(void)
1123{
1124 Node* node;
1125
1126#ifdef USE_PARSE_TREE_NODE_RECYCLE
1127 THREAD_ATOMIC_START;
1128 if (IS_NOT_NULL(FreeNodeList)) {
1129 node = (Node* )FreeNodeList;
1130 FreeNodeList = FreeNodeList->next;
1131 THREAD_ATOMIC_END;
1132 return node;
1133 }
1134 THREAD_ATOMIC_END;
1135#endif
1136
1137 node = (Node* )xmalloc(sizeof(Node));
1138 /* xmemset(node, 0, sizeof(Node)); */
1139 return node;
1140}
1141
1142#if defined(USE_MULTI_THREAD_SYSTEM) && \
1143 defined(USE_SHARED_CCLASS_TABLE) && \
1144 defined(USE_PARSE_TREE_NODE_RECYCLE)
1145static Node*
1146node_new_locked(void)
1147{
1148 Node* node;
1149
1150 if (IS_NOT_NULL(FreeNodeList)) {
1151 node = (Node* )FreeNodeList;
1152 FreeNodeList = FreeNodeList->next;
1153 return node;
1154 }
1155
1156 node = (Node* )xmalloc(sizeof(Node));
1157 /* xmemset(node, 0, sizeof(Node)); */
1158 return node;
1159}
1160#endif
1161
1162static void
1163initialize_cclass(CClassNode* cc)
1164{
1165 BITSET_CLEAR(cc->bs);
1166 /* cc->base.flags = 0; */
1167 cc->flags = 0;
1168 cc->mbuf = NULL;
1169}
1170
1171static Node*
1172node_new_cclass(void)
1173{
1174 Node* node = node_new();
1175 CHECK_NULL_RETURN(node);
1176
1177 SET_NTYPE(node, NT_CCLASS);
1178 initialize_cclass(NCCLASS(node));
1179 return node;
1180}
1181
1182#if defined(USE_MULTI_THREAD_SYSTEM) && \
1183 defined(USE_SHARED_CCLASS_TABLE) && \
1184 defined(USE_PARSE_TREE_NODE_RECYCLE)
1185static Node*
1186node_new_cclass_locked(void)
1187{
1188 Node* node = node_new_locked();
1189 CHECK_NULL_RETURN(node);
1190
1191 SET_NTYPE(node, NT_CCLASS);
1192 initialize_cclass(NCCLASS(node));
1193 return node;
1194}
1195#else
1196#define node_new_cclass_locked() node_new_cclass()
1197#endif
1198
1199#ifdef USE_SHARED_CCLASS_TABLE
1200static Node*
1201node_new_cclass_by_codepoint_range(int not, OnigCodePoint sb_out,
1202 const OnigCodePoint ranges[])
1203{
1204 int n, i;
1205 CClassNode* cc;
1206 OnigCodePoint j;
1207
1208 Node* node = node_new_cclass_locked();
1209 CHECK_NULL_RETURN(node);
1210
1211 cc = NCCLASS(node);
1212 if (not != 0) NCCLASS_SET_NOT(cc);
1213
1214 BITSET_CLEAR(cc->bs);
1215 if (sb_out > 0 && IS_NOT_NULL(ranges)) {
1216 n = ONIGENC_CODE_RANGE_NUM(ranges);
1217 for (i = 0; i < n; i++) {
1218 for (j = ONIGENC_CODE_RANGE_FROM(ranges, i);
1219 j <= (OnigCodePoint )ONIGENC_CODE_RANGE_TO(ranges, i); j++) {
1220 if (j >= sb_out) goto sb_end;
1221
1222 BITSET_SET_BIT(cc->bs, j);
1223 }
1224 }
1225 }
1226
1227 sb_end:
1228 if (IS_NULL(ranges)) {
1229 is_null:
1230 cc->mbuf = NULL;
1231 }
1232 else {
1233 BBuf* bbuf;
1234
1235 n = ONIGENC_CODE_RANGE_NUM(ranges);
1236 if (n == 0) goto is_null;
1237
1238 bbuf = (BBuf* )xmalloc(sizeof(BBuf));
1239 CHECK_NULL_RETURN(bbuf);
1240 bbuf->alloc = n + 1;
1241 bbuf->used = n + 1;
1242 bbuf->p = (UChar* )((void* )ranges);
1243
1244 cc->mbuf = bbuf;
1245 }
1246
1247 return node;
1248}
1249#endif /* USE_SHARED_CCLASS_TABLE */
1250
1251static Node*
1252node_new_ctype(int type, int not, int ascii_range)
1253{
1254 Node* node = node_new();
1255 CHECK_NULL_RETURN(node);
1256
1257 SET_NTYPE(node, NT_CTYPE);
1258 NCTYPE(node)->ctype = type;
1259 NCTYPE(node)->not = not;
1260 NCTYPE(node)->ascii_range = ascii_range;
1261 return node;
1262}
1263
1264static Node*
1265node_new_anychar(void)
1266{
1267 Node* node = node_new();
1268 CHECK_NULL_RETURN(node);
1269
1270 SET_NTYPE(node, NT_CANY);
1271 return node;
1272}
1273
1274static Node*
1275node_new_list(Node* left, Node* right)
1276{
1277 Node* node = node_new();
1278 CHECK_NULL_RETURN(node);
1279
1280 SET_NTYPE(node, NT_LIST);
1281 NCAR(node) = left;
1282 NCDR(node) = right;
1283 return node;
1284}
1285
1286extern Node*
1287onig_node_new_list(Node* left, Node* right)
1288{
1289 return node_new_list(left, right);
1290}
1291
1292extern Node*
1293onig_node_list_add(Node* list, Node* x)
1294{
1295 Node *n;
1296
1297 n = onig_node_new_list(x, NULL);
1298 if (IS_NULL(n)) return NULL_NODE;
1299
1300 if (IS_NOT_NULL(list)) {
1301 while (IS_NOT_NULL(NCDR(list)))
1302 list = NCDR(list);
1303
1304 NCDR(list) = n;
1305 }
1306
1307 return n;
1308}
1309
1310extern Node*
1311onig_node_new_alt(Node* left, Node* right)
1312{
1313 Node* node = node_new();
1314 CHECK_NULL_RETURN(node);
1315
1316 SET_NTYPE(node, NT_ALT);
1317 NCAR(node) = left;
1318 NCDR(node) = right;
1319 return node;
1320}
1321
1322extern Node*
1323onig_node_new_anchor(int type)
1324{
1325 Node* node = node_new();
1326 CHECK_NULL_RETURN(node);
1327
1328 SET_NTYPE(node, NT_ANCHOR);
1329 NANCHOR(node)->type = type;
1330 NANCHOR(node)->target = NULL;
1331 NANCHOR(node)->char_len = -1;
1332 NANCHOR(node)->ascii_range = 0;
1333 return node;
1334}
1335
1336static Node*
1337node_new_backref(int back_num, int* backrefs, int by_name,
1338#ifdef USE_BACKREF_WITH_LEVEL
1339 int exist_level, int nest_level,
1340#endif
1341 ScanEnv* env)
1342{
1343 int i;
1344 Node* node = node_new();
1345
1346 CHECK_NULL_RETURN(node);
1347
1348 SET_NTYPE(node, NT_BREF);
1349 NBREF(node)->state = 0;
1350 NBREF(node)->back_num = back_num;
1351 NBREF(node)->back_dynamic = (int* )NULL;
1352 if (by_name != 0)
1353 NBREF(node)->state |= NST_NAME_REF;
1354
1355#ifdef USE_BACKREF_WITH_LEVEL
1356 if (exist_level != 0) {
1357 NBREF(node)->state |= NST_NEST_LEVEL;
1358 NBREF(node)->nest_level = nest_level;
1359 }
1360#endif
1361
1362 for (i = 0; i < back_num; i++) {
1363 if (backrefs[i] <= env->num_mem &&
1364 IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) {
1365 NBREF(node)->state |= NST_RECURSION; /* /...(\1).../ */
1366 break;
1367 }
1368 }
1369
1370 if (back_num <= NODE_BACKREFS_SIZE) {
1371 for (i = 0; i < back_num; i++)
1372 NBREF(node)->back_static[i] = backrefs[i];
1373 }
1374 else {
1375 int* p = (int* )xmalloc(sizeof(int) * back_num);
1376 if (IS_NULL(p)) {
1377 onig_node_free(node);
1378 return NULL;
1379 }
1380 NBREF(node)->back_dynamic = p;
1381 for (i = 0; i < back_num; i++)
1382 p[i] = backrefs[i];
1383 }
1384 return node;
1385}
1386
#ifdef USE_SUBEXP_CALL
/* Create an NT_CALL (subexpression call) node.
   `name`..`name_end` are stored as given (the pointers, not a copy);
   `gnum` is the group number, where a non-zero value means call-by-number.
   The call target is left unresolved (NULL_NODE). */
static Node*
node_new_call(UChar* name, UChar* name_end, int gnum)
{
  Node* call = node_new();

  CHECK_NULL_RETURN(call);
  SET_NTYPE(call, NT_CALL);
  NCALL(call)->state     = 0;
  NCALL(call)->target    = NULL_NODE;
  NCALL(call)->group_num = gnum;
  NCALL(call)->name_end  = name_end;
  NCALL(call)->name      = name;
  return call;
}
#endif
1403
1404static Node*
1405node_new_quantifier(int lower, int upper, int by_number)
1406{
1407 Node* node = node_new();
1408 CHECK_NULL_RETURN(node);
1409
1410 SET_NTYPE(node, NT_QTFR);
1411 NQTFR(node)->state = 0;
1412 NQTFR(node)->target = NULL;
1413 NQTFR(node)->lower = lower;
1414 NQTFR(node)->upper = upper;
1415 NQTFR(node)->greedy = 1;
1416 NQTFR(node)->target_empty_info = NQ_TARGET_ISNOT_EMPTY;
1417 NQTFR(node)->head_exact = NULL_NODE;
1418 NQTFR(node)->next_head_exact = NULL_NODE;
1419 NQTFR(node)->is_refered = 0;
1420 if (by_number != 0)
1421 NQTFR(node)->state |= NST_BY_NUMBER;
1422
1423#ifdef USE_COMBINATION_EXPLOSION_CHECK
1424 NQTFR(node)->comb_exp_check_num = 0;
1425#endif
1426
1427 return node;
1428}
1429
1430static Node*
1431node_new_enclose(int type)
1432{
1433 Node* node = node_new();
1434 CHECK_NULL_RETURN(node);
1435
1436 SET_NTYPE(node, NT_ENCLOSE);
1437 NENCLOSE(node)->type = type;
1438 NENCLOSE(node)->state = 0;
1439 NENCLOSE(node)->regnum = 0;
1440 NENCLOSE(node)->option = 0;
1441 NENCLOSE(node)->target = NULL;
1442 NENCLOSE(node)->call_addr = -1;
1443 NENCLOSE(node)->opt_count = 0;
1444 return node;
1445}
1446
1447extern Node*
1448onig_node_new_enclose(int type)
1449{
1450 return node_new_enclose(type);
1451}
1452
1453static Node*
1454node_new_enclose_memory(OnigOptionType option, int is_named)
1455{
1456 Node* node = node_new_enclose(ENCLOSE_MEMORY);
1457 CHECK_NULL_RETURN(node);
1458 if (is_named != 0)
1459 SET_ENCLOSE_STATUS(node, NST_NAMED_GROUP);
1460
1461#ifdef USE_SUBEXP_CALL
1462 NENCLOSE(node)->option = option;
1463#endif
1464 return node;
1465}
1466
1467static Node*
1468node_new_option(OnigOptionType option)
1469{
1470 Node* node = node_new_enclose(ENCLOSE_OPTION);
1471 CHECK_NULL_RETURN(node);
1472 NENCLOSE(node)->option = option;
1473 return node;
1474}
1475
1476extern int
1477onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
1478{
1479 ptrdiff_t addlen = end - s;
1480
1481 if (addlen > 0) {
1482 ptrdiff_t len = NSTR(node)->end - NSTR(node)->s;
1483
1484 if (NSTR(node)->capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) {
1485 UChar* p;
1486 ptrdiff_t capa = len + addlen + NODE_STR_MARGIN;
1487
1488 if (capa <= NSTR(node)->capa) {
1489 onig_strcpy(NSTR(node)->s + len, s, end);
1490 }
1491 else {
1492 if (NSTR(node)->s == NSTR(node)->buf)
1493 p = strcat_capa_from_static(NSTR(node)->s, NSTR(node)->end,
1494 s, end, capa);
1495 else
1496 p = strcat_capa(NSTR(node)->s, NSTR(node)->end, s, end, capa);
1497
1498 CHECK_NULL_RETURN_MEMERR(p);
1499 NSTR(node)->s = p;
1500 NSTR(node)->capa = (int )capa;
1501 }
1502 }
1503 else {
1504 onig_strcpy(NSTR(node)->s + len, s, end);
1505 }
1506 NSTR(node)->end = NSTR(node)->s + len + addlen;
1507 }
1508
1509 return 0;
1510}
1511
1512extern int
1513onig_node_str_set(Node* node, const UChar* s, const UChar* end)
1514{
1515 onig_node_str_clear(node);
1516 return onig_node_str_cat(node, s, end);
1517}
1518
1519static int
1520node_str_cat_char(Node* node, UChar c)
1521{
1522 UChar s[1];
1523
1524 s[0] = c;
1525 return onig_node_str_cat(node, s, s + 1);
1526}
1527
1528static int
1529node_str_cat_codepoint(Node* node, OnigEncoding enc, OnigCodePoint c)
1530{
1531 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
1532 int num = ONIGENC_CODE_TO_MBC(enc, c, buf);
1533 if (num < 0) return num;
1534 return onig_node_str_cat(node, buf, buf + num);
1535}
1536
1537extern void
1538onig_node_conv_to_str_node(Node* node, int flag)
1539{
1540 SET_NTYPE(node, NT_STR);
1541 NSTR(node)->flag = flag;
1542 NSTR(node)->capa = 0;
1543 NSTR(node)->s = NSTR(node)->buf;
1544 NSTR(node)->end = NSTR(node)->buf;
1545}
1546
1547extern void
1548onig_node_str_clear(Node* node)
1549{
1550 if (NSTR(node)->capa != 0 &&
1551 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1552 xfree(NSTR(node)->s);
1553 }
1554
1555 NSTR(node)->capa = 0;
1556 NSTR(node)->flag = 0;
1557 NSTR(node)->s = NSTR(node)->buf;
1558 NSTR(node)->end = NSTR(node)->buf;
1559}
1560
1561static Node*
1562node_new_str(const UChar* s, const UChar* end)
1563{
1564 Node* node = node_new();
1565 CHECK_NULL_RETURN(node);
1566
1567 SET_NTYPE(node, NT_STR);
1568 NSTR(node)->capa = 0;
1569 NSTR(node)->flag = 0;
1570 NSTR(node)->s = NSTR(node)->buf;
1571 NSTR(node)->end = NSTR(node)->buf;
1572 if (onig_node_str_cat(node, s, end)) {
1573 onig_node_free(node);
1574 return NULL;
1575 }
1576 return node;
1577}
1578
1579extern Node*
1580onig_node_new_str(const UChar* s, const UChar* end)
1581{
1582 return node_new_str(s, end);
1583}
1584
1585static Node*
1586node_new_str_raw(UChar* s, UChar* end)
1587{
1588 Node* node = node_new_str(s, end);
1589 if (IS_NOT_NULL(node))
1590 NSTRING_SET_RAW(node);
1591 return node;
1592}
1593
1594static Node*
1595node_new_empty(void)
1596{
1597 return node_new_str(NULL, NULL);
1598}
1599
1600static Node*
1601node_new_str_raw_char(UChar c)
1602{
1603 UChar p[1];
1604
1605 p[0] = c;
1606 return node_new_str_raw(p, p + 1);
1607}
1608
1609static Node*
1610str_node_split_last_char(StrNode* sn, OnigEncoding enc)
1611{
1612 const UChar *p;
1613 Node* n = NULL_NODE;
1614
1615 if (sn->end > sn->s) {
1616 p = onigenc_get_prev_char_head(enc, sn->s, sn->end);
1617 if (p && p > sn->s) { /* can be split. */
1618 n = node_new_str(p, sn->end);
1619 if (IS_NOT_NULL(n) && (sn->flag & NSTR_RAW) != 0)
1620 NSTRING_SET_RAW(n);
1621 sn->end = (UChar* )p;
1622 }
1623 }
1624 return n;
1625}
1626
1627static int
1628str_node_can_be_split(StrNode* sn, OnigEncoding enc)
1629{
1630 if (sn->end > sn->s) {
1631 return ((enclen(enc, sn->s) < sn->end - sn->s) ? 1 : 0);
1632 }
1633 return 0;
1634}
1635
#ifdef USE_PAD_TO_SHORT_BYTE_CHAR
/* Shift the contents of `sn` right by `num` bytes and fill the freed head
   with `num` copies of `val`.
   Returns 0.  The function used to fall off its end without a return
   statement although it is declared to return int; any caller reading the
   result would have hit undefined behaviour.
   NOTE(review): no capacity check is made here; the code assumes the current
   contents fit in NODE_STR_BUF_SIZE bytes and that sn->s has room for `num`
   extra bytes -- confirm against the callers. */
static int
node_str_head_pad(StrNode* sn, int num, UChar val)
{
  UChar buf[NODE_STR_BUF_SIZE];
  int i, len;

  len = (int )(sn->end - sn->s);
  /* go through a temporary copy so the shifted write cannot overlap its
     own source */
  onig_strcpy(buf, sn->s, sn->end);
  onig_strcpy(&(sn->s[num]), buf, buf + len);
  sn->end += num;

  for (i = 0; i < num; i++) {
    sn->s[i] = val;
  }
  return 0;
}
#endif
1653
1654extern int
1655onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc)
1656{
1657 unsigned int num, val;
1658 OnigCodePoint c;
1659 UChar* p = *src;
1660 PFETCH_READY;
1661
1662 num = 0;
1663 while (!PEND) {
1664 PFETCH(c);
1665 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
1666 val = (unsigned int )DIGITVAL(c);
1667 if ((INT_MAX_LIMIT - val) / 10UL < num)
1668 return -1; /* overflow */
1669
1670 num = num * 10 + val;
1671 }
1672 else {
1673 PUNFETCH;
1674 break;
1675 }
1676 }
1677 *src = p;
1678 return num;
1679}
1680
1681static int
1682scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int minlen,
1683 int maxlen, OnigEncoding enc)
1684{
1685 OnigCodePoint c;
1686 unsigned int num, val;
1687 int restlen;
1688 UChar* p = *src;
1689 PFETCH_READY;
1690
1691 restlen = maxlen - minlen;
1692 num = 0;
1693 while (!PEND && maxlen-- != 0) {
1694 PFETCH(c);
1695 if (ONIGENC_IS_CODE_XDIGIT(enc, c)) {
1696 val = (unsigned int )XDIGITVAL(enc,c);
1697 if ((INT_MAX_LIMIT - val) / 16UL < num)
1698 return -1; /* overflow */
1699
1700 num = (num << 4) + XDIGITVAL(enc,c);
1701 }
1702 else {
1703 PUNFETCH;
1704 break;
1705 }
1706 }
1707 if (maxlen > restlen)
1708 return -2; /* not enough digits */
1709 *src = p;
1710 return num;
1711}
1712
1713static int
1714scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen,
1715 OnigEncoding enc)
1716{
1717 OnigCodePoint c;
1718 unsigned int num, val;
1719 UChar* p = *src;
1720 PFETCH_READY;
1721
1722 num = 0;
1723 while (!PEND && maxlen-- != 0) {
1724 PFETCH(c);
1725 if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') {
1726 val = ODIGITVAL(c);
1727 if ((INT_MAX_LIMIT - val) / 8UL < num)
1728 return -1; /* overflow */
1729
1730 num = (num << 3) + val;
1731 }
1732 else {
1733 PUNFETCH;
1734 break;
1735 }
1736 }
1737 *src = p;
1738 return num;
1739}
1740
1741
/* Write SIZE_CODE_POINT bytes taken from `code` into `bbuf` at byte offset
   `pos`.  `code` must be an lvalue because its address is taken. */
#define BBUF_WRITE_CODE_POINT(bbuf, pos, code) \
    BBUF_WRITE((bbuf), (pos), &(code), SIZE_CODE_POINT)
1744
1745/* data format:
1746 [n][from-1][to-1][from-2][to-2] ... [from-n][to-n]
1747 (all data size is OnigCodePoint)
1748 */
1749static int
1750new_code_range(BBuf** pbuf)
1751{
1752#define INIT_MULTI_BYTE_RANGE_SIZE (SIZE_CODE_POINT * 5)
1753 int r;
1754 OnigCodePoint n;
1755 BBuf* bbuf;
1756
1757 bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
1758 CHECK_NULL_RETURN_MEMERR(*pbuf);
1759 r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE);
1760 if (r) return r;
1761
1762 n = 0;
1763 BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1764 return 0;
1765}
1766
1767static int
1768add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to)
1769{
1770 int r, inc_n, pos;
1771 OnigCodePoint low, high, bound, x;
1772 OnigCodePoint n, *data;
1773 BBuf* bbuf;
1774
1775 if (from > to) {
1776 n = from; from = to; to = n;
1777 }
1778
1779 if (IS_NULL(*pbuf)) {
1780 r = new_code_range(pbuf);
1781 if (r) return r;
1782 bbuf = *pbuf;
1783 n = 0;
1784 }
1785 else {
1786 bbuf = *pbuf;
1787 GET_CODE_POINT(n, bbuf->p);
1788 }
1789 data = (OnigCodePoint* )(bbuf->p);
1790 data++;
1791
1792 bound = (from == 0) ? 0 : n;
1793 for (low = 0; low < bound; ) {
1794 x = (low + bound) >> 1;
1795 if (from - 1 > data[x*2 + 1])
1796 low = x + 1;
1797 else
1798 bound = x;
1799 }
1800
1801 high = (to == ONIG_LAST_CODE_POINT) ? n : low;
1802 for (bound = n; high < bound; ) {
1803 x = (high + bound) >> 1;
1804 if (to + 1 >= data[x*2])
1805 high = x + 1;
1806 else
1807 bound = x;
1808 }
1809
1810 inc_n = low + 1 - high;
1811 if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
1812 return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;
1813
1814 if (inc_n != 1) {
1815 if (from > data[low*2])
1816 from = data[low*2];
1817 if (to < data[(high - 1)*2 + 1])
1818 to = data[(high - 1)*2 + 1];
1819 }
1820
1821 if (inc_n != 0) {
1822 int from_pos = SIZE_CODE_POINT * (1 + high * 2);
1823 int to_pos = SIZE_CODE_POINT * (1 + (low + 1) * 2);
1824
1825 if (inc_n > 0) {
1826 if (high < n) {
1827 int size = (n - high) * 2 * SIZE_CODE_POINT;
1828 BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
1829 }
1830 }
1831 else {
1832 BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
1833 }
1834 }
1835
1836 pos = SIZE_CODE_POINT * (1 + low * 2);
1837 BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
1838 BBUF_WRITE_CODE_POINT(bbuf, pos, from);
1839 BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
1840 n += inc_n;
1841 BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1842
1843 return 0;
1844}
1845
1846static int
1847add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
1848{
1849 if (from > to) {
1850 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
1851 return 0;
1852 else
1853 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
1854 }
1855
1856 return add_code_range_to_buf(pbuf, from, to);
1857}
1858
1859static int
1860not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf)
1861{
1862 int r, i, n;
1863 OnigCodePoint pre, from, *data, to = 0;
1864
1865 *pbuf = (BBuf* )NULL;
1866 if (IS_NULL(bbuf)) {
1867 set_all:
1868 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1869 }
1870
1871 data = (OnigCodePoint* )(bbuf->p);
1872 GET_CODE_POINT(n, data);
1873 data++;
1874 if (n <= 0) goto set_all;
1875
1876 r = 0;
1877 pre = MBCODE_START_POS(enc);
1878 for (i = 0; i < n; i++) {
1879 from = data[i*2];
1880 to = data[i*2+1];
1881 if (pre <= from - 1) {
1882 r = add_code_range_to_buf(pbuf, pre, from - 1);
1883 if (r != 0) return r;
1884 }
1885 if (to == ONIG_LAST_CODE_POINT) break;
1886 pre = to + 1;
1887 }
1888 if (to < ONIG_LAST_CODE_POINT) {
1889 r = add_code_range_to_buf(pbuf, to + 1, ONIG_LAST_CODE_POINT);
1890 }
1891 return r;
1892}
1893
/* Exchange two (buffer, negation flag) pairs in place.  All four arguments
   must be assignable lvalues. */
#define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\
  BBuf *swap_buf = bbuf1; \
  int   swap_not = not1; \
  bbuf1 = bbuf2; bbuf2 = swap_buf; \
  not1  = not2;  not2  = swap_not; \
} while (0)
1900
1901static int
1902or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
1903 BBuf* bbuf2, int not2, BBuf** pbuf)
1904{
1905 int r;
1906 OnigCodePoint i, n1, *data1;
1907 OnigCodePoint from, to;
1908
1909 *pbuf = (BBuf* )NULL;
1910 if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
1911 if (not1 != 0 || not2 != 0)
1912 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1913 return 0;
1914 }
1915
1916 r = 0;
1917 if (IS_NULL(bbuf2))
1918 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1919
1920 if (IS_NULL(bbuf1)) {
1921 if (not1 != 0) {
1922 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1923 }
1924 else {
1925 if (not2 == 0) {
1926 return bbuf_clone(pbuf, bbuf2);
1927 }
1928 else {
1929 return not_code_range_buf(enc, bbuf2, pbuf);
1930 }
1931 }
1932 }
1933
1934 if (not1 != 0)
1935 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1936
1937 data1 = (OnigCodePoint* )(bbuf1->p);
1938 GET_CODE_POINT(n1, data1);
1939 data1++;
1940
1941 if (not2 == 0 && not1 == 0) { /* 1 OR 2 */
1942 r = bbuf_clone(pbuf, bbuf2);
1943 }
1944 else if (not1 == 0) { /* 1 OR (not 2) */
1945 r = not_code_range_buf(enc, bbuf2, pbuf);
1946 }
1947 if (r != 0) return r;
1948
1949 for (i = 0; i < n1; i++) {
1950 from = data1[i*2];
1951 to = data1[i*2+1];
1952 r = add_code_range_to_buf(pbuf, from, to);
1953 if (r != 0) return r;
1954 }
1955 return 0;
1956}
1957
1958static int
1959and_code_range1(BBuf** pbuf, OnigCodePoint from1, OnigCodePoint to1,
1960 OnigCodePoint* data, int n)
1961{
1962 int i, r;
1963 OnigCodePoint from2, to2;
1964
1965 for (i = 0; i < n; i++) {
1966 from2 = data[i*2];
1967 to2 = data[i*2+1];
1968 if (from2 < from1) {
1969 if (to2 < from1) continue;
1970 else {
1971 from1 = to2 + 1;
1972 }
1973 }
1974 else if (from2 <= to1) {
1975 if (to2 < to1) {
1976 if (from1 <= from2 - 1) {
1977 r = add_code_range_to_buf(pbuf, from1, from2-1);
1978 if (r != 0) return r;
1979 }
1980 from1 = to2 + 1;
1981 }
1982 else {
1983 to1 = from2 - 1;
1984 }
1985 }
1986 else {
1987 from1 = from2;
1988 }
1989 if (from1 > to1) break;
1990 }
1991 if (from1 <= to1) {
1992 r = add_code_range_to_buf(pbuf, from1, to1);
1993 if (r != 0) return r;
1994 }
1995 return 0;
1996}
1997
1998static int
1999and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
2000{
2001 int r;
2002 OnigCodePoint i, j, n1, n2, *data1, *data2;
2003 OnigCodePoint from, to, from1, to1, from2, to2;
2004
2005 *pbuf = (BBuf* )NULL;
2006 if (IS_NULL(bbuf1)) {
2007 if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */
2008 return bbuf_clone(pbuf, bbuf2);
2009 return 0;
2010 }
2011 else if (IS_NULL(bbuf2)) {
2012 if (not2 != 0)
2013 return bbuf_clone(pbuf, bbuf1);
2014 return 0;
2015 }
2016
2017 if (not1 != 0)
2018 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
2019
2020 data1 = (OnigCodePoint* )(bbuf1->p);
2021 data2 = (OnigCodePoint* )(bbuf2->p);
2022 GET_CODE_POINT(n1, data1);
2023 GET_CODE_POINT(n2, data2);
2024 data1++;
2025 data2++;
2026
2027 if (not2 == 0 && not1 == 0) { /* 1 AND 2 */
2028 for (i = 0; i < n1; i++) {
2029 from1 = data1[i*2];
2030 to1 = data1[i*2+1];
2031 for (j = 0; j < n2; j++) {
2032 from2 = data2[j*2];
2033 to2 = data2[j*2+1];
2034 if (from2 > to1) break;
2035 if (to2 < from1) continue;
2036 from = MAX(from1, from2);
2037 to = MIN(to1, to2);
2038 r = add_code_range_to_buf(pbuf, from, to);
2039 if (r != 0) return r;
2040 }
2041 }
2042 }
2043 else if (not1 == 0) { /* 1 AND (not 2) */
2044 for (i = 0; i < n1; i++) {
2045 from1 = data1[i*2];
2046 to1 = data1[i*2+1];
2047 r = and_code_range1(pbuf, from1, to1, data2, n2);
2048 if (r != 0) return r;
2049 }
2050 }
2051
2052 return 0;
2053}
2054
2055static int
2056and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
2057{
2058 int r, not1, not2;
2059 BBuf *buf1, *buf2, *pbuf = 0;
2060 BitSetRef bsr1, bsr2;
2061 BitSet bs1, bs2;
2062
2063 not1 = IS_NCCLASS_NOT(dest);
2064 bsr1 = dest->bs;
2065 buf1 = dest->mbuf;
2066 not2 = IS_NCCLASS_NOT(cc);
2067 bsr2 = cc->bs;
2068 buf2 = cc->mbuf;
2069
2070 if (not1 != 0) {
2071 bitset_invert_to(bsr1, bs1);
2072 bsr1 = bs1;
2073 }
2074 if (not2 != 0) {
2075 bitset_invert_to(bsr2, bs2);
2076 bsr2 = bs2;
2077 }
2078 bitset_and(bsr1, bsr2);
2079 if (bsr1 != dest->bs) {
2080 bitset_copy(dest->bs, bsr1);
2081 bsr1 = dest->bs;
2082 }
2083 if (not1 != 0) {
2084 bitset_invert(dest->bs);
2085 }
2086
2087 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2088 if (not1 != 0 && not2 != 0) {
2089 r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf);
2090 }
2091 else {
2092 r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf);
2093 if (r == 0 && not1 != 0) {
2094 BBuf *tbuf = 0;
2095 r = not_code_range_buf(enc, pbuf, &tbuf);
2096 bbuf_free(pbuf);
2097 pbuf = tbuf;
2098 }
2099 }
2100 if (r != 0) {
2101 bbuf_free(pbuf);
2102 return r;
2103 }
2104
2105 dest->mbuf = pbuf;
2106 bbuf_free(buf1);
2107 return r;
2108 }
2109 return 0;
2110}
2111
2112static int
2113or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
2114{
2115 int r, not1, not2;
2116 BBuf *buf1, *buf2, *pbuf = 0;
2117 BitSetRef bsr1, bsr2;
2118 BitSet bs1, bs2;
2119
2120 not1 = IS_NCCLASS_NOT(dest);
2121 bsr1 = dest->bs;
2122 buf1 = dest->mbuf;
2123 not2 = IS_NCCLASS_NOT(cc);
2124 bsr2 = cc->bs;
2125 buf2 = cc->mbuf;
2126
2127 if (not1 != 0) {
2128 bitset_invert_to(bsr1, bs1);
2129 bsr1 = bs1;
2130 }
2131 if (not2 != 0) {
2132 bitset_invert_to(bsr2, bs2);
2133 bsr2 = bs2;
2134 }
2135 bitset_or(bsr1, bsr2);
2136 if (bsr1 != dest->bs) {
2137 bitset_copy(dest->bs, bsr1);
2138 bsr1 = dest->bs;
2139 }
2140 if (not1 != 0) {
2141 bitset_invert(dest->bs);
2142 }
2143
2144 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2145 if (not1 != 0 && not2 != 0) {
2146 r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf);
2147 }
2148 else {
2149 r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf);
2150 if (r == 0 && not1 != 0) {
2151 BBuf *tbuf = 0;
2152 r = not_code_range_buf(enc, pbuf, &tbuf);
2153 bbuf_free(pbuf);
2154 pbuf = tbuf;
2155 }
2156 }
2157 if (r != 0) {
2158 bbuf_free(pbuf);
2159 return r;
2160 }
2161
2162 dest->mbuf = pbuf;
2163 bbuf_free(buf1);
2164 return r;
2165 }
2166 else
2167 return 0;
2168}
2169
2170static int
2171conv_backslash_value(int c, ScanEnv* env)
2172{
2173 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
2174 switch (c) {
2175 case 'n': return '\n';
2176 case 't': return '\t';
2177 case 'r': return '\r';
2178 case 'f': return '\f';
2179 case 'a': return '\007';
2180 case 'b': return '\010';
2181 case 'e': return '\033';
2182 case 'v':
2183 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
2184 return '\v';
2185 break;
2186
2187 default:
2188 break;
2189 }
2190 }
2191 return c;
2192}
2193
2194#ifdef USE_NO_INVALID_QUANTIFIER
2195#define is_invalid_quantifier_target(node) 0
2196#else
2197static int
2198is_invalid_quantifier_target(Node* node)
2199{
2200 switch (NTYPE(node)) {
2201 case NT_ANCHOR:
2202 return 1;
2203 break;
2204
2205 case NT_ENCLOSE:
2206 /* allow enclosed elements */
2207 /* return is_invalid_quantifier_target(NENCLOSE(node)->target); */
2208 break;
2209
2210 case NT_LIST:
2211 do {
2212 if (! is_invalid_quantifier_target(NCAR(node))) return 0;
2213 } while (IS_NOT_NULL(node = NCDR(node)));
2214 return 0;
2215 break;
2216
2217 case NT_ALT:
2218 do {
2219 if (is_invalid_quantifier_target(NCAR(node))) return 1;
2220 } while (IS_NOT_NULL(node = NCDR(node)));
2221 break;
2222
2223 default:
2224 break;
2225 }
2226 return 0;
2227}
2228#endif
2229
2230/* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */
2231static int
2232popular_quantifier_num(QtfrNode* q)
2233{
2234 if (q->greedy) {
2235 if (q->lower == 0) {
2236 if (q->upper == 1) return 0;
2237 else if (IS_REPEAT_INFINITE(q->upper)) return 1;
2238 }
2239 else if (q->lower == 1) {
2240 if (IS_REPEAT_INFINITE(q->upper)) return 2;
2241 }
2242 }
2243 else {
2244 if (q->lower == 0) {
2245 if (q->upper == 1) return 3;
2246 else if (IS_REPEAT_INFINITE(q->upper)) return 4;
2247 }
2248 else if (q->lower == 1) {
2249 if (IS_REPEAT_INFINITE(q->upper)) return 5;
2250 }
2251 }
2252 return -1;
2253}
2254
2255
/* What to do with a quantifier directly nested in another quantifier. */
enum ReduceType {
  RQ_ASIS  = 0, /* as is */
  RQ_DEL   = 1, /* delete parent */
  RQ_A     = 2, /* to '*'    */
  RQ_AQ    = 3, /* to '*?'   */
  RQ_QQ    = 4, /* to '??'   */
  RQ_P_QQ  = 5, /* to '+)??' */
  RQ_PQ_Q  = 6  /* to '+?)?' */
};

/* Reduction table, indexed [child][parent] with the indices produced by
   popular_quantifier_num(); columns are in the order ?, *, +, ??, *?, +?. */
static enum ReduceType const ReduceTypeTable[6][6] = {
  /* '?'  */ { RQ_DEL,  RQ_A,    RQ_A,   RQ_QQ,   RQ_AQ,   RQ_ASIS },
  /* '*'  */ { RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL  },
  /* '+'  */ { RQ_A,    RQ_A,    RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL  },
  /* '??' */ { RQ_DEL,  RQ_AQ,   RQ_AQ,  RQ_DEL,  RQ_AQ,   RQ_AQ   },
  /* '*?' */ { RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_DEL,  RQ_DEL,  RQ_DEL  },
  /* '+?' */ { RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ,   RQ_AQ,   RQ_DEL  }
};
2274
2275extern void
2276onig_reduce_nested_quantifier(Node* pnode, Node* cnode)
2277{
2278 int pnum, cnum;
2279 QtfrNode *p, *c;
2280
2281 p = NQTFR(pnode);
2282 c = NQTFR(cnode);
2283 pnum = popular_quantifier_num(p);
2284 cnum = popular_quantifier_num(c);
2285 if (pnum < 0 || cnum < 0) return ;
2286
2287 switch (ReduceTypeTable[cnum][pnum]) {
2288 case RQ_DEL:
2289 *pnode = *cnode;
2290 break;
2291 case RQ_A:
2292 p->target = c->target;
2293 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1;
2294 break;
2295 case RQ_AQ:
2296 p->target = c->target;
2297 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0;
2298 break;
2299 case RQ_QQ:
2300 p->target = c->target;
2301 p->lower = 0; p->upper = 1; p->greedy = 0;
2302 break;
2303 case RQ_P_QQ:
2304 p->target = cnode;
2305 p->lower = 0; p->upper = 1; p->greedy = 0;
2306 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1;
2307 return ;
2308 break;
2309 case RQ_PQ_Q:
2310 p->target = cnode;
2311 p->lower = 0; p->upper = 1; p->greedy = 1;
2312 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0;
2313 return ;
2314 break;
2315 case RQ_ASIS:
2316 p->target = cnode;
2317 return ;
2318 break;
2319 }
2320
2321 c->target = NULL_NODE;
2322 onig_node_free(cnode);
2323}
2324
2325
/* Token kinds stored in OnigToken.type. */
enum TokenSyms {
  TK_EOT                       = 0,  /* end of token */
  TK_RAW_BYTE                  = 1,
  TK_CHAR                      = 2,
  TK_STRING                    = 3,
  TK_CODE_POINT                = 4,
  TK_ANYCHAR                   = 5,
  TK_CHAR_TYPE                 = 6,
  TK_BACKREF                   = 7,
  TK_CALL                      = 8,
  TK_ANCHOR                    = 9,
  TK_OP_REPEAT                 = 10,
  TK_INTERVAL                  = 11,
  TK_ANYCHAR_ANYTIME           = 12, /* SQL '%' == .* */
  TK_ALT                       = 13,
  TK_SUBEXP_OPEN               = 14,
  TK_SUBEXP_CLOSE              = 15,
  TK_CC_OPEN                   = 16,
  TK_QUOTE_OPEN                = 17,
  TK_CHAR_PROPERTY             = 18, /* \p{...}, \P{...} */
  TK_LINEBREAK                 = 19,
  TK_EXTENDED_GRAPHEME_CLUSTER = 20,
  TK_KEEP                      = 21,
  /* in cc */
  TK_CC_CLOSE                  = 22,
  TK_CC_RANGE                  = 23,
  TK_POSIX_BRACKET_OPEN        = 24,
  TK_CC_AND                    = 25, /* && */
  TK_CC_CC_OPEN                = 26  /* [ */
};
2356
2357typedef struct {
2358 enum TokenSyms type;
2359 int escaped;
2360 int base; /* is number: 8, 16 (used in [....]) */
2361 UChar* backp;
2362 union {
2363 UChar* s;
2364 int c;
2365 OnigCodePoint code;
2366 struct {
2367 int subtype;
2368 int ascii_range;
2369 } anchor;
2370 struct {
2371 int lower;
2372 int upper;
2373 int greedy;
2374 int possessive;
2375 } repeat;
2376 struct {
2377 int num;
2378 int ref1;
2379 int* refs;
2380 int by_name;
2381#ifdef USE_BACKREF_WITH_LEVEL
2382 int exist_level;
2383 int level; /* \k<name+n> */
2384#endif
2385 } backref;
2386 struct {
2387 UChar* name;
2388 UChar* name_end;
2389 int gnum;
2390 int rel;
2391 } call;
2392 struct {
2393 int ctype;
2394 int not;
2395 } prop;
2396 } u;
2397} OnigToken;
2398
2399
2400static int
2401fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
2402{
2403 int low, up, syn_allow, non_low = 0;
2404 int r = 0;
2405 OnigCodePoint c;
2406 OnigEncoding enc = env->enc;
2407 UChar* p = *src;
2408 PFETCH_READY;
2409
2410 syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);
2411
2412 if (PEND) {
2413 if (syn_allow)
2414 return 1; /* "....{" : OK! */
2415 else
2416 return ONIGERR_END_PATTERN_AT_LEFT_BRACE; /* "....{" syntax error */
2417 }
2418
2419 if (! syn_allow) {
2420 c = PPEEK;
2421 if (c == ')' || c == '(' || c == '|') {
2422 return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
2423 }
2424 }
2425
2426 low = onig_scan_unsigned_number(&p, end, env->enc);
2427 if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2428 if (low > ONIG_MAX_REPEAT_NUM)
2429 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2430
2431 if (p == *src) { /* can't read low */
2432 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
2433 /* allow {,n} as {0,n} */
2434 low = 0;
2435 non_low = 1;
2436 }
2437 else
2438 goto invalid;
2439 }
2440
2441 if (PEND) goto invalid;
2442 PFETCH(c);
2443 if (c == ',') {
2444 UChar* prev = p;
2445 up = onig_scan_unsigned_number(&p, end, env->enc);
2446 if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2447 if (up > ONIG_MAX_REPEAT_NUM)
2448 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2449
2450 if (p == prev) {
2451 if (non_low != 0)
2452 goto invalid;
2453 up = REPEAT_INFINITE; /* {n,} : {n,infinite} */
2454 }
2455 }
2456 else {
2457 if (non_low != 0)
2458 goto invalid;
2459
2460 PUNFETCH;
2461 up = low; /* {n} : exact n times */
2462 r = 2; /* fixed */
2463 }
2464
2465 if (PEND) goto invalid;
2466 PFETCH(c);
2467 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
2468 if (c != MC_ESC(env->syntax)) goto invalid;
2469 PFETCH(c);
2470 }
2471 if (c != '}') goto invalid;
2472
2473 if (!IS_REPEAT_INFINITE(up) && low > up) {
2474 return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;
2475 }
2476
2477 tok->type = TK_INTERVAL;
2478 tok->u.repeat.lower = low;
2479 tok->u.repeat.upper = up;
2480 *src = p;
2481 return r; /* 0: normal {n,m}, 2: fixed {n} */
2482
2483 invalid:
2484 if (syn_allow)
2485 return 1; /* OK */
2486 else
2487 return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
2488}
2489
2490/* \M-, \C-, \c, or \... */
2491static int
2492fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
2493{
2494 int v;
2495 OnigCodePoint c;
2496 OnigEncoding enc = env->enc;
2497 UChar* p = *src;
2498
2499 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2500
2501 PFETCH_S(c);
2502 switch (c) {
2503 case 'M':
2504 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
2505 if (PEND) return ONIGERR_END_PATTERN_AT_META;
2506 PFETCH_S(c);
2507 if (c != '-') return ONIGERR_META_CODE_SYNTAX;
2508 if (PEND) return ONIGERR_END_PATTERN_AT_META;
2509 PFETCH_S(c);
2510 if (c == MC_ESC(env->syntax)) {
2511 v = fetch_escaped_value(&p, end, env);
2512 if (v < 0) return v;
2513 c = (OnigCodePoint )v;
2514 }
2515 c = ((c & 0xff) | 0x80);
2516 }
2517 else
2518 goto backslash;
2519 break;
2520
2521 case 'C':
2522 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
2523 if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2524 PFETCH_S(c);
2525 if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
2526 goto control;
2527 }
2528 else
2529 goto backslash;
2530
2531 case 'c':
2532 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
2533 control:
2534 if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2535 PFETCH_S(c);
2536 if (c == '?') {
2537 c = 0177;
2538 }
2539 else {
2540 if (c == MC_ESC(env->syntax)) {
2541 v = fetch_escaped_value(&p, end, env);
2542 if (v < 0) return v;
2543 c = (OnigCodePoint )v;
2544 }
2545 c &= 0x9f;
2546 }
2547 break;
2548 }
2549 /* fall through */
2550
2551 default:
2552 {
2553 backslash:
2554 c = conv_backslash_value(c, env);
2555 }
2556 break;
2557 }
2558
2559 *src = p;
2560 return c;
2561}
2562
2563static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env);
2564
2565static OnigCodePoint
2566get_name_end_code_point(OnigCodePoint start)
2567{
2568 switch (start) {
2569 case '<': return (OnigCodePoint )'>'; break;
2570 case '\'': return (OnigCodePoint )'\''; break;
2571 case '(': return (OnigCodePoint )')'; break;
2572 case '{': return (OnigCodePoint )'}'; break;
2573 default:
2574 break;
2575 }
2576
2577 return (OnigCodePoint )0;
2578}
2579
2580#ifdef USE_NAMED_GROUP
2581#ifdef USE_BACKREF_WITH_LEVEL
2582/*
2583 \k<name+n>, \k<name-n>
2584 \k<num+n>, \k<num-n>
2585 \k<-num+n>, \k<-num-n>
2586*/
2587static int
2588fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
2589 UChar** rname_end, ScanEnv* env,
2590 int* rback_num, int* rlevel)
2591{
2592 int r, sign, is_num, exist_level;
2593 OnigCodePoint end_code;
2594 OnigCodePoint c = 0;
2595 OnigEncoding enc = env->enc;
2596 UChar *name_end;
2597 UChar *pnum_head;
2598 UChar *p = *src;
2599 PFETCH_READY;
2600
2601 *rback_num = 0;
2602 is_num = exist_level = 0;
2603 sign = 1;
2604 pnum_head = *src;
2605
2606 end_code = get_name_end_code_point(start_code);
2607
2608 name_end = end;
2609 r = 0;
2610 if (PEND) {
2611 return ONIGERR_EMPTY_GROUP_NAME;
2612 }
2613 else {
2614 PFETCH(c);
2615 if (c == end_code)
2616 return ONIGERR_EMPTY_GROUP_NAME;
2617
2618 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2619 is_num = 1;
2620 }
2621 else if (c == '-') {
2622 is_num = 2;
2623 sign = -1;
2624 pnum_head = p;
2625 }
2626 else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2627 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2628 }
2629 }
2630
2631 while (!PEND) {
2632 name_end = p;
2633 PFETCH(c);
2634 if (c == end_code || c == ')' || c == '+' || c == '-') {
2635 if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME;
2636 break;
2637 }
2638
2639 if (is_num != 0) {
2640 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2641 is_num = 1;
2642 }
2643 else {
2644 r = ONIGERR_INVALID_GROUP_NAME;
2645 is_num = 0;
2646 }
2647 }
2648 else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2649 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2650 }
2651 }
2652
2653 if (r == 0 && c != end_code) {
2654 if (c == '+' || c == '-') {
2655 int level;
2656 int flag = (c == '-' ? -1 : 1);
2657
2658 PFETCH(c);
2659 if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err;
2660 PUNFETCH;
2661 level = onig_scan_unsigned_number(&p, end, enc);
2662 if (level < 0) return ONIGERR_TOO_BIG_NUMBER;
2663 *rlevel = (level * flag);
2664 exist_level = 1;
2665
2666 PFETCH(c);
2667 if (c == end_code)
2668 goto end;
2669 }
2670
2671 err:
2672 r = ONIGERR_INVALID_GROUP_NAME;
2673 name_end = end;
2674 }
2675
2676 end:
2677 if (r == 0) {
2678 if (is_num != 0) {
2679 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2680 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2681 else if (*rback_num == 0) goto err;
2682
2683 *rback_num *= sign;
2684 }
2685
2686 *rname_end = name_end;
2687 *src = p;
2688 return (exist_level ? 1 : 0);
2689 }
2690 else {
2691 onig_scan_env_set_error_string(env, r, *src, name_end);
2692 return r;
2693 }
2694}
2695#endif /* USE_BACKREF_WITH_LEVEL */
2696
2697/*
2698 ref: 0 -> define name (don't allow number name)
2699 1 -> reference name (allow number name)
2700*/
2701static int
2702fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2703 UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2704{
2705 int r, is_num, sign;
2706 OnigCodePoint end_code;
2707 OnigCodePoint c = 0;
2708 OnigEncoding enc = env->enc;
2709 UChar *name_end;
2710 UChar *pnum_head;
2711 UChar *p = *src;
2712
2713 *rback_num = 0;
2714
2715 end_code = get_name_end_code_point(start_code);
2716
2717 name_end = end;
2718 pnum_head = *src;
2719 r = 0;
2720 is_num = 0;
2721 sign = 1;
2722 if (PEND) {
2723 return ONIGERR_EMPTY_GROUP_NAME;
2724 }
2725 else {
2726 PFETCH_S(c);
2727 if (c == end_code)
2728 return ONIGERR_EMPTY_GROUP_NAME;
2729
2730 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2731 if (ref == 1)
2732 is_num = 1;
2733 else {
2734 r = ONIGERR_INVALID_GROUP_NAME;
2735 is_num = 0;
2736 }
2737 }
2738 else if (c == '-') {
2739 if (ref == 1) {
2740 is_num = 2;
2741 sign = -1;
2742 pnum_head = p;
2743 }
2744 else {
2745 r = ONIGERR_INVALID_GROUP_NAME;
2746 is_num = 0;
2747 }
2748 }
2749 else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2750 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2751 }
2752 }
2753
2754 if (r == 0) {
2755 while (!PEND) {
2756 name_end = p;
2757 PFETCH_S(c);
2758 if (c == end_code || c == ')') {
2759 if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME;
2760 break;
2761 }
2762
2763 if (is_num != 0) {
2764 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2765 is_num = 1;
2766 }
2767 else {
2768 if (!ONIGENC_IS_CODE_WORD(enc, c))
2769 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2770 else
2771 r = ONIGERR_INVALID_GROUP_NAME;
2772 is_num = 0;
2773 }
2774 }
2775 else {
2776 if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2777 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2778 }
2779 }
2780 }
2781
2782 if (c != end_code) {
2783 r = ONIGERR_INVALID_GROUP_NAME;
2784 name_end = end;
2785 }
2786
2787 if (is_num != 0) {
2788 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2789 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2790 else if (*rback_num == 0) {
2791 r = ONIGERR_INVALID_GROUP_NAME;
2792 goto err;
2793 }
2794
2795 *rback_num *= sign;
2796 }
2797
2798 *rname_end = name_end;
2799 *src = p;
2800 return 0;
2801 }
2802 else {
2803 while (!PEND) {
2804 name_end = p;
2805 PFETCH_S(c);
2806 if (c == end_code || c == ')')
2807 break;
2808 }
2809 if (PEND)
2810 name_end = end;
2811
2812 err:
2813 onig_scan_env_set_error_string(env, r, *src, name_end);
2814 return r;
2815 }
2816}
2817#else
2818static int
2819fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2820 UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2821{
2822 int r, is_num, sign;
2823 OnigCodePoint end_code;
2824 OnigCodePoint c = 0;
2825 UChar *name_end;
2826 OnigEncoding enc = env->enc;
2827 UChar *pnum_head;
2828 UChar *p = *src;
2829 PFETCH_READY;
2830
2831 *rback_num = 0;
2832
2833 end_code = get_name_end_code_point(start_code);
2834
2835 *rname_end = name_end = end;
2836 r = 0;
2837 pnum_head = *src;
2838 is_num = 0;
2839 sign = 1;
2840
2841 if (PEND) {
2842 return ONIGERR_EMPTY_GROUP_NAME;
2843 }
2844 else {
2845 PFETCH(c);
2846 if (c == end_code)
2847 return ONIGERR_EMPTY_GROUP_NAME;
2848
2849 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2850 is_num = 1;
2851 }
2852 else if (c == '-') {
2853 is_num = 2;
2854 sign = -1;
2855 pnum_head = p;
2856 }
2857 else {
2858 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2859 }
2860 }
2861
2862 while (!PEND) {
2863 name_end = p;
2864
2865 PFETCH(c);
2866 if (c == end_code || c == ')') break;
2867 if (! ONIGENC_IS_CODE_DIGIT(enc, c))
2868 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2869 }
2870 if (r == 0 && c != end_code) {
2871 r = ONIGERR_INVALID_GROUP_NAME;
2872 name_end = end;
2873 }
2874
2875 if (r == 0) {
2876 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2877 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2878 else if (*rback_num == 0) {
2879 r = ONIGERR_INVALID_GROUP_NAME;
2880 goto err;
2881 }
2882 *rback_num *= sign;
2883
2884 *rname_end = name_end;
2885 *src = p;
2886 return 0;
2887 }
2888 else {
2889 err:
2890 onig_scan_env_set_error_string(env, r, *src, name_end);
2891 return r;
2892 }
2893}
2894#endif /* USE_NAMED_GROUP */
2895
2896static void
2897CC_ESC_WARN(ScanEnv* env, UChar *c)
2898{
2899 if (onig_warn == onig_null_warn) return ;
2900
2901 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
2902 IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
2903 UChar buf[WARN_BUFSIZE];
2904 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
2905 env->pattern, env->pattern_end,
2906 (UChar* )"character class has '%s' without escape", c);
2907 (*onig_warn)((char* )buf);
2908 }
2909}
2910
2911static void
2912CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c)
2913{
2914 if (onig_warn == onig_null_warn) return ;
2915
2916 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
2917 UChar buf[WARN_BUFSIZE];
2918 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
2919 env->pattern, env->pattern_end,
2920 (UChar* )"regular expression has '%s' without escape", c);
2921 (*onig_warn)((char* )buf);
2922 }
2923}
2924
2925static UChar*
2926find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
2927 UChar **next, OnigEncoding enc)
2928{
2929 int i;
2930 OnigCodePoint x;
2931 UChar *q;
2932 UChar *p = from;
2933
2934 while (p < to) {
2935 x = ONIGENC_MBC_TO_CODE(enc, p, to);
2936 q = p + enclen(enc, p);
2937 if (x == s[0]) {
2938 for (i = 1; i < n && q < to; i++) {
2939 x = ONIGENC_MBC_TO_CODE(enc, q, to);
2940 if (x != s[i]) break;
2941 q += enclen(enc, q);
2942 }
2943 if (i >= n) {
2944 if (IS_NOT_NULL(next))
2945 *next = q;
2946 return p;
2947 }
2948 }
2949 p = q;
2950 }
2951 return NULL_UCHARP;
2952}
2953
2954static int
2955str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
2956 OnigCodePoint bad, OnigEncoding enc, OnigSyntaxType* syn)
2957{
2958 int i, in_esc;
2959 OnigCodePoint x;
2960 UChar *q;
2961 UChar *p = from;
2962
2963 in_esc = 0;
2964 while (p < to) {
2965 if (in_esc) {
2966 in_esc = 0;
2967 p += enclen(enc, p);
2968 }
2969 else {
2970 x = ONIGENC_MBC_TO_CODE(enc, p, to);
2971 q = p + enclen(enc, p);
2972 if (x == s[0]) {
2973 for (i = 1; i < n && q < to; i++) {
2974 x = ONIGENC_MBC_TO_CODE(enc, q, to);
2975 if (x != s[i]) break;
2976 q += enclen(enc, q);
2977 }
2978 if (i >= n) return 1;
2979 p += enclen(enc, p);
2980 }
2981 else {
2982 x = ONIGENC_MBC_TO_CODE(enc, p, to);
2983 if (x == bad) return 0;
2984 else if (x == MC_ESC(syn)) in_esc = 1;
2985 p = q;
2986 }
2987 }
2988 }
2989 return 0;
2990}
2991
2992static int
2993fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
2994{
2995 int num;
2996 OnigCodePoint c, c2;
2997 OnigSyntaxType* syn = env->syntax;
2998 OnigEncoding enc = env->enc;
2999 UChar* prev;
3000 UChar* p = *src;
3001 PFETCH_READY;
3002
3003 if (PEND) {
3004 tok->type = TK_EOT;
3005 return tok->type;
3006 }
3007
3008 PFETCH(c);
3009 tok->type = TK_CHAR;
3010 tok->base = 0;
3011 tok->u.c = c;
3012 tok->escaped = 0;
3013
3014 if (c == ']') {
3015 tok->type = TK_CC_CLOSE;
3016 }
3017 else if (c == '-') {
3018 tok->type = TK_CC_RANGE;
3019 }
3020 else if (c == MC_ESC(syn)) {
3021 if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
3022 goto end;
3023
3024 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
3025
3026 PFETCH(c);
3027 tok->escaped = 1;
3028 tok->u.c = c;
3029 switch (c) {
3030 case 'w':
3031 tok->type = TK_CHAR_TYPE;
3032 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3033 tok->u.prop.not = 0;
3034 break;
3035 case 'W':
3036 tok->type = TK_CHAR_TYPE;
3037 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3038 tok->u.prop.not = 1;
3039 break;
3040 case 'd':
3041 tok->type = TK_CHAR_TYPE;
3042 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3043 tok->u.prop.not = 0;
3044 break;
3045 case 'D':
3046 tok->type = TK_CHAR_TYPE;
3047 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3048 tok->u.prop.not = 1;
3049 break;
3050 case 's':
3051 tok->type = TK_CHAR_TYPE;
3052 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3053 tok->u.prop.not = 0;
3054 break;
3055 case 'S':
3056 tok->type = TK_CHAR_TYPE;
3057 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3058 tok->u.prop.not = 1;
3059 break;
3060 case 'h':
3061 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3062 tok->type = TK_CHAR_TYPE;
3063 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3064 tok->u.prop.not = 0;
3065 break;
3066 case 'H':
3067 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3068 tok->type = TK_CHAR_TYPE;
3069 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3070 tok->u.prop.not = 1;
3071 break;
3072
3073 case 'p':
3074 case 'P':
3075 c2 = PPEEK;
3076 if (c2 == '{' &&
3077 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
3078 PINC;
3079 tok->type = TK_CHAR_PROPERTY;
3080 tok->u.prop.not = (c == 'P' ? 1 : 0);
3081
3082 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
3083 PFETCH(c2);
3084 if (c2 == '^') {
3085 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3086 }
3087 else
3088 PUNFETCH;
3089 }
3090 }
3091 break;
3092
3093 case 'x':
3094 if (PEND) break;
3095
3096 prev = p;
3097 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3098 PINC;
3099 num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc);
3100 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3101 if (!PEND) {
3102 c2 = PPEEK;
3103 if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
3104 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3105 }
3106
3107 if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) {
3108 PINC;
3109 tok->type = TK_CODE_POINT;
3110 tok->base = 16;
3111 tok->u.code = (OnigCodePoint )num;
3112 }
3113 else {
3114 /* can't read nothing or invalid format */
3115 p = prev;
3116 }
3117 }
3118 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3119 num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc);
3120 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3121 if (p == prev) { /* can't read nothing. */
3122 num = 0; /* but, it's not error */
3123 }
3124 tok->type = TK_RAW_BYTE;
3125 tok->base = 16;
3126 tok->u.c = num;
3127 }
3128 break;
3129
3130 case 'u':
3131 if (PEND) break;
3132
3133 prev = p;
3134 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3135 num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc);
3136 if (num < -1) return ONIGERR_TOO_SHORT_DIGITS;
3137 else if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3138 if (p == prev) { /* can't read nothing. */
3139 num = 0; /* but, it's not error */
3140 }
3141 tok->type = TK_CODE_POINT;
3142 tok->base = 16;
3143 tok->u.code = (OnigCodePoint )num;
3144 }
3145 break;
3146
3147 case '0':
3148 case '1': case '2': case '3': case '4': case '5': case '6': case '7':
3149 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3150 PUNFETCH;
3151 prev = p;
3152 num = scan_unsigned_octal_number(&p, end, 3, enc);
3153 if (num < 0 || 0xff < num) return ONIGERR_TOO_BIG_NUMBER;
3154 if (p == prev) { /* can't read nothing. */
3155 num = 0; /* but, it's not error */
3156 }
3157 tok->type = TK_RAW_BYTE;
3158 tok->base = 8;
3159 tok->u.c = num;
3160 }
3161 break;
3162
3163 default:
3164 PUNFETCH;
3165 num = fetch_escaped_value(&p, end, env);
3166 if (num < 0) return num;
3167 if (tok->u.c != num) {
3168 tok->u.code = (OnigCodePoint )num;
3169 tok->type = TK_CODE_POINT;
3170 }
3171 break;
3172 }
3173 }
3174 else if (c == '[') {
3175 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
3176 OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
3177 tok->backp = p; /* point at '[' is read */
3178 PINC;
3179 if (str_exist_check_with_esc(send, 2, p, end,
3180 (OnigCodePoint )']', enc, syn)) {
3181 tok->type = TK_POSIX_BRACKET_OPEN;
3182 }
3183 else {
3184 PUNFETCH;
3185 goto cc_in_cc;
3186 }
3187 }
3188 else {
3189 cc_in_cc:
3190 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
3191 tok->type = TK_CC_CC_OPEN;
3192 }
3193 else {
3194 CC_ESC_WARN(env, (UChar* )"[");
3195 }
3196 }
3197 }
3198 else if (c == '&') {
3199 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
3200 !PEND && (PPEEK_IS('&'))) {
3201 PINC;
3202 tok->type = TK_CC_AND;
3203 }
3204 }
3205
3206 end:
3207 *src = p;
3208 return tok->type;
3209}
3210
3211#ifdef USE_NAMED_GROUP
3212static int
3213fetch_named_backref_token(OnigCodePoint c, OnigToken* tok, UChar** src,
3214 UChar* end, ScanEnv* env)
3215{
3216 int r, num;
3217 OnigSyntaxType* syn = env->syntax;
3218 UChar* prev;
3219 UChar* p = *src;
3220 UChar* name_end;
3221 int* backs;
3222 int back_num;
3223
3224 prev = p;
3225
3226#ifdef USE_BACKREF_WITH_LEVEL
3227 name_end = NULL_UCHARP; /* no need. escape gcc warning. */
3228 r = fetch_name_with_level(c, &p, end, &name_end,
3229 env, &back_num, &tok->u.backref.level);
3230 if (r == 1) tok->u.backref.exist_level = 1;
3231 else tok->u.backref.exist_level = 0;
3232#else
3233 r = fetch_name(&p, end, &name_end, env, &back_num, 1);
3234#endif
3235 if (r < 0) return r;
3236
3237 if (back_num != 0) {
3238 if (back_num < 0) {
3239 back_num = BACKREF_REL_TO_ABS(back_num, env);
3240 if (back_num <= 0)
3241 return ONIGERR_INVALID_BACKREF;
3242 }
3243
3244 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3245 if (back_num > env->num_mem ||
3246 IS_NULL(SCANENV_MEM_NODES(env)[back_num]))
3247 return ONIGERR_INVALID_BACKREF;
3248 }
3249 tok->type = TK_BACKREF;
3250 tok->u.backref.by_name = 0;
3251 tok->u.backref.num = 1;
3252 tok->u.backref.ref1 = back_num;
3253 }
3254 else {
3255 num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs);
3256 if (num <= 0) {
3257 onig_scan_env_set_error_string(env,
3258 ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);
3259 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3260 }
3261 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3262 int i;
3263 for (i = 0; i < num; i++) {
3264 if (backs[i] > env->num_mem ||
3265 IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
3266 return ONIGERR_INVALID_BACKREF;
3267 }
3268 }
3269
3270 tok->type = TK_BACKREF;
3271 tok->u.backref.by_name = 1;
3272 if (num == 1) {
3273 tok->u.backref.num = 1;
3274 tok->u.backref.ref1 = backs[0];
3275 }
3276 else {
3277 tok->u.backref.num = num;
3278 tok->u.backref.refs = backs;
3279 }
3280 }
3281 *src = p;
3282 return 0;
3283}
3284#endif
3285
3286static int
3287fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
3288{
3289 int r, num;
3290 OnigCodePoint c;
3291 OnigEncoding enc = env->enc;
3292 OnigSyntaxType* syn = env->syntax;
3293 UChar* prev;
3294 UChar* p = *src;
3295 PFETCH_READY;
3296
3297 start:
3298 if (PEND) {
3299 tok->type = TK_EOT;
3300 return tok->type;
3301 }
3302
3303 tok->type = TK_STRING;
3304 tok->base = 0;
3305 tok->backp = p;
3306
3307 PFETCH(c);
3308 if (IS_MC_ESC_CODE(c, syn)) {
3309 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
3310
3311 tok->backp = p;
3312 PFETCH(c);
3313
3314 tok->u.c = c;
3315 tok->escaped = 1;
3316 switch (c) {
3317 case '*':
3318 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
3319 tok->type = TK_OP_REPEAT;
3320 tok->u.repeat.lower = 0;
3321 tok->u.repeat.upper = REPEAT_INFINITE;
3322 goto greedy_check;
3323 break;
3324
3325 case '+':
3326 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
3327 tok->type = TK_OP_REPEAT;
3328 tok->u.repeat.lower = 1;
3329 tok->u.repeat.upper = REPEAT_INFINITE;
3330 goto greedy_check;
3331 break;
3332
3333 case '?':
3334 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
3335 tok->type = TK_OP_REPEAT;
3336 tok->u.repeat.lower = 0;
3337 tok->u.repeat.upper = 1;
3338 greedy_check:
3339 if (!PEND && PPEEK_IS('?') &&
3340 IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) {
3341 PFETCH(c);
3342 tok->u.repeat.greedy = 0;
3343 tok->u.repeat.possessive = 0;
3344 }
3345 else {
3346 possessive_check:
3347 if (!PEND && PPEEK_IS('+') &&
3348 ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
3349 tok->type != TK_INTERVAL) ||
3350 (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
3351 tok->type == TK_INTERVAL))) {
3352 PFETCH(c);
3353 tok->u.repeat.greedy = 1;
3354 tok->u.repeat.possessive = 1;
3355 }
3356 else {
3357 tok->u.repeat.greedy = 1;
3358 tok->u.repeat.possessive = 0;
3359 }
3360 }
3361 break;
3362
3363 case '{':
3364 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
3365 r = fetch_range_quantifier(&p, end, tok, env);
3366 if (r < 0) return r; /* error */
3367 if (r == 0) goto greedy_check;
3368 else if (r == 2) { /* {n} */
3369 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3370 goto possessive_check;
3371
3372 goto greedy_check;
3373 }
3374 /* r == 1 : normal char */
3375 break;
3376
3377 case '|':
3378 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
3379 tok->type = TK_ALT;
3380 break;
3381
3382 case '(':
3383 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3384 tok->type = TK_SUBEXP_OPEN;
3385 break;
3386
3387 case ')':
3388 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3389 tok->type = TK_SUBEXP_CLOSE;
3390 break;
3391
3392 case 'w':
3393 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3394 tok->type = TK_CHAR_TYPE;
3395 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3396 tok->u.prop.not = 0;
3397 break;
3398
3399 case 'W':
3400 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3401 tok->type = TK_CHAR_TYPE;
3402 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3403 tok->u.prop.not = 1;
3404 break;
3405
3406 case 'b':
3407 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3408 tok->type = TK_ANCHOR;
3409 tok->u.anchor.subtype = ANCHOR_WORD_BOUND;
3410 tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option)
3411 && ! IS_WORD_BOUND_ALL_RANGE(env->option);
3412 break;
3413
3414 case 'B':
3415 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3416 tok->type = TK_ANCHOR;
3417 tok->u.anchor.subtype = ANCHOR_NOT_WORD_BOUND;
3418 tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option)
3419 && ! IS_WORD_BOUND_ALL_RANGE(env->option);
3420 break;
3421
3422#ifdef USE_WORD_BEGIN_END
3423 case '<':
3424 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3425 tok->type = TK_ANCHOR;
3426 tok->u.anchor.subtype = ANCHOR_WORD_BEGIN;
3427 tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option);
3428 break;
3429
3430 case '>':
3431 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3432 tok->type = TK_ANCHOR;
3433 tok->u.anchor.subtype = ANCHOR_WORD_END;
3434 tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option);
3435 break;
3436#endif
3437
3438 case 's':
3439 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3440 tok->type = TK_CHAR_TYPE;
3441 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3442 tok->u.prop.not = 0;
3443 break;
3444
3445 case 'S':
3446 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3447 tok->type = TK_CHAR_TYPE;
3448 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3449 tok->u.prop.not = 1;
3450 break;
3451
3452 case 'd':
3453 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3454 tok->type = TK_CHAR_TYPE;
3455 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3456 tok->u.prop.not = 0;
3457 break;
3458
3459 case 'D':
3460 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3461 tok->type = TK_CHAR_TYPE;
3462 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3463 tok->u.prop.not = 1;
3464 break;
3465
3466 case 'h':
3467 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3468 tok->type = TK_CHAR_TYPE;
3469 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3470 tok->u.prop.not = 0;
3471 break;
3472
3473 case 'H':
3474 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3475 tok->type = TK_CHAR_TYPE;
3476 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3477 tok->u.prop.not = 1;
3478 break;
3479
3480 case 'A':
3481 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3482 begin_buf:
3483 tok->type = TK_ANCHOR;
3484 tok->u.anchor.subtype = ANCHOR_BEGIN_BUF;
3485 break;
3486
3487 case 'Z':
3488 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3489 tok->type = TK_ANCHOR;
3490 tok->u.anchor.subtype = ANCHOR_SEMI_END_BUF;
3491 break;
3492
3493 case 'z':
3494 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3495 end_buf:
3496 tok->type = TK_ANCHOR;
3497 tok->u.anchor.subtype = ANCHOR_END_BUF;
3498 break;
3499
3500 case 'G':
3501 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
3502 tok->type = TK_ANCHOR;
3503 tok->u.anchor.subtype = ANCHOR_BEGIN_POSITION;
3504 break;
3505
3506 case '`':
3507 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3508 goto begin_buf;
3509 break;
3510
3511 case '\'':
3512 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3513 goto end_buf;
3514 break;
3515
3516 case 'x':
3517 if (PEND) break;
3518
3519 prev = p;
3520 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3521 PINC;
3522 num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc);
3523 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3524 if (!PEND) {
3525 if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK))
3526 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3527 }
3528
3529 if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) {
3530 PINC;
3531 tok->type = TK_CODE_POINT;
3532 tok->u.code = (OnigCodePoint )num;
3533 }
3534 else {
3535 /* can't read nothing or invalid format */
3536 p = prev;
3537 }
3538 }
3539 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3540 num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc);
3541 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3542 if (p == prev) { /* can't read nothing. */
3543 num = 0; /* but, it's not error */
3544 }
3545 tok->type = TK_RAW_BYTE;
3546 tok->base = 16;
3547 tok->u.c = num;
3548 }
3549 break;
3550
3551 case 'u':
3552 if (PEND) break;
3553
3554 prev = p;
3555 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3556 num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc);
3557 if (num < -1) return ONIGERR_TOO_SHORT_DIGITS;
3558 else if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3559 if (p == prev) { /* can't read nothing. */
3560 num = 0; /* but, it's not error */
3561 }
3562 tok->type = TK_CODE_POINT;
3563 tok->base = 16;
3564 tok->u.code = (OnigCodePoint )num;
3565 }
3566 break;
3567
3568 case '1': case '2': case '3': case '4':
3569 case '5': case '6': case '7': case '8': case '9':
3570 PUNFETCH;
3571 prev = p;
3572 num = onig_scan_unsigned_number(&p, end, enc);
3573 if (num < 0 || num > ONIG_MAX_BACKREF_NUM) {
3574 goto skip_backref;
3575 }
3576
3577 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
3578 (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */
3579 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3580 if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num]))
3581 return ONIGERR_INVALID_BACKREF;
3582 }
3583
3584 tok->type = TK_BACKREF;
3585 tok->u.backref.num = 1;
3586 tok->u.backref.ref1 = num;
3587 tok->u.backref.by_name = 0;
3588#ifdef USE_BACKREF_WITH_LEVEL
3589 tok->u.backref.exist_level = 0;
3590#endif
3591 break;
3592 }
3593
3594 skip_backref:
3595 if (c == '8' || c == '9') {
3596 /* normal char */
3597 p = prev; PINC;
3598 break;
3599 }
3600
3601 p = prev;
3602 /* fall through */
3603 case '0':
3604 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3605 prev = p;
3606 num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc);
3607 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3608 if (p == prev) { /* can't read nothing. */
3609 num = 0; /* but, it's not error */
3610 }
3611 tok->type = TK_RAW_BYTE;
3612 tok->base = 8;
3613 tok->u.c = num;
3614 }
3615 else if (c != '0') {
3616 PINC;
3617 }
3618 break;
3619
3620#ifdef USE_NAMED_GROUP
3621 case 'k':
3622 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
3623 PFETCH(c);
3624 if (c == '<' || c == '\'') {
3625 r = fetch_named_backref_token(c, tok, &p, end, env);
3626 if (r < 0) return r;
3627 }
3628 else
3629 PUNFETCH;
3630 }
3631 break;
3632#endif
3633
3634#if defined(USE_SUBEXP_CALL) || defined(USE_NAMED_GROUP)
3635 case 'g':
3636#ifdef USE_NAMED_GROUP
3637 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_BRACE_BACKREF)) {
3638 PFETCH(c);
3639 if (c == '{') {
3640 r = fetch_named_backref_token(c, tok, &p, end, env);
3641 if (r < 0) return r;
3642 }
3643 else
3644 PUNFETCH;
3645 }
3646#endif
3647#ifdef USE_SUBEXP_CALL
3648 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
3649 PFETCH(c);
3650 if (c == '<' || c == '\'') {
3651 int gnum = -1, rel = 0;
3652 UChar* name_end;
3653 OnigCodePoint cnext;
3654
3655 cnext = PPEEK;
3656 if (cnext == '0') {
3657 PINC;
3658 if (PPEEK_IS(get_name_end_code_point(c))) { /* \g<0>, \g'0' */
3659 PINC;
3660 name_end = p;
3661 gnum = 0;
3662 }
3663 }
3664 else if (cnext == '+') {
3665 PINC;
3666 rel = 1;
3667 }
3668 prev = p;
3669 if (gnum < 0) {
3670 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1);
3671 if (r < 0) return r;
3672 }
3673
3674 tok->type = TK_CALL;
3675 tok->u.call.name = prev;
3676 tok->u.call.name_end = name_end;
3677 tok->u.call.gnum = gnum;
3678 tok->u.call.rel = rel;
3679 }
3680 else
3681 PUNFETCH;
3682 }
3683#endif
3684 break;
3685#endif
3686
3687 case 'Q':
3688 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) {
3689 tok->type = TK_QUOTE_OPEN;
3690 }
3691 break;
3692
3693 case 'p':
3694 case 'P':
3695 if (PPEEK_IS('{') &&
3696 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
3697 PINC;
3698 tok->type = TK_CHAR_PROPERTY;
3699 tok->u.prop.not = (c == 'P' ? 1 : 0);
3700
3701 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
3702 PFETCH(c);
3703 if (c == '^') {
3704 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3705 }
3706 else
3707 PUNFETCH;
3708 }
3709 }
3710 break;
3711
3712 case 'R':
3713 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK)) {
3714 tok->type = TK_LINEBREAK;
3715 }
3716 break;
3717
3718 case 'X':
3719 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER)) {
3720 tok->type = TK_EXTENDED_GRAPHEME_CLUSTER;
3721 }
3722 break;
3723
3724 case 'K':
3725 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP)) {
3726 tok->type = TK_KEEP;
3727 }
3728 break;
3729
3730 default:
3731 PUNFETCH;
3732 num = fetch_escaped_value(&p, end, env);
3733 if (num < 0) return num;
3734 /* set_raw: */
3735 if (tok->u.c != num) {
3736 tok->type = TK_CODE_POINT;
3737 tok->u.code = (OnigCodePoint )num;
3738 }
3739 else { /* string */
3740 p = tok->backp + enclen(enc, tok->backp);
3741 }
3742 break;
3743 }
3744 }
3745 else {
3746 tok->u.c = c;
3747 tok->escaped = 0;
3748
3749#ifdef USE_VARIABLE_META_CHARS
3750 if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
3751 IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
3752 if (c == MC_ANYCHAR(syn))
3753 goto any_char;
3754 else if (c == MC_ANYTIME(syn))
3755 goto anytime;
3756 else if (c == MC_ZERO_OR_ONE_TIME(syn))
3757 goto zero_or_one_time;
3758 else if (c == MC_ONE_OR_MORE_TIME(syn))
3759 goto one_or_more_time;
3760 else if (c == MC_ANYCHAR_ANYTIME(syn)) {
3761 tok->type = TK_ANYCHAR_ANYTIME;
3762 goto out;
3763 }
3764 }
3765#endif
3766
3767 switch (c) {
3768 case '.':
3769 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break;
3770#ifdef USE_VARIABLE_META_CHARS
3771 any_char:
3772#endif
3773 tok->type = TK_ANYCHAR;
3774 break;
3775
3776 case '*':
3777 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break;
3778#ifdef USE_VARIABLE_META_CHARS
3779 anytime:
3780#endif
3781 tok->type = TK_OP_REPEAT;
3782 tok->u.repeat.lower = 0;
3783 tok->u.repeat.upper = REPEAT_INFINITE;
3784 goto greedy_check;
3785 break;
3786
3787 case '+':
3788 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break;
3789#ifdef USE_VARIABLE_META_CHARS
3790 one_or_more_time:
3791#endif
3792 tok->type = TK_OP_REPEAT;
3793 tok->u.repeat.lower = 1;
3794 tok->u.repeat.upper = REPEAT_INFINITE;
3795 goto greedy_check;
3796 break;
3797
3798 case '?':
3799 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break;
3800#ifdef USE_VARIABLE_META_CHARS
3801 zero_or_one_time:
3802#endif
3803 tok->type = TK_OP_REPEAT;
3804 tok->u.repeat.lower = 0;
3805 tok->u.repeat.upper = 1;
3806 goto greedy_check;
3807 break;
3808
3809 case '{':
3810 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
3811 r = fetch_range_quantifier(&p, end, tok, env);
3812 if (r < 0) return r; /* error */
3813 if (r == 0) goto greedy_check;
3814 else if (r == 2) { /* {n} */
3815 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3816 goto possessive_check;
3817
3818 goto greedy_check;
3819 }
3820 /* r == 1 : normal char */
3821 break;
3822
3823 case '|':
3824 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break;
3825 tok->type = TK_ALT;
3826 break;
3827
3828 case '(':
3829 if (PPEEK_IS('?') &&
3830 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
3831 PINC;
3832 if (PPEEK_IS('#')) {
3833 PFETCH(c);
3834 while (1) {
3835 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
3836 PFETCH(c);
3837 if (c == MC_ESC(syn)) {
3838 if (!PEND) PFETCH(c);
3839 }
3840 else {
3841 if (c == ')') break;
3842 }
3843 }
3844 goto start;
3845 }
3846#ifdef USE_PERL_SUBEXP_CALL
3847 /* (?&name), (?n), (?R), (?0), (?+n), (?-n) */
3848 c = PPEEK;
3849 if ((c == '&' || c == 'R' || ONIGENC_IS_CODE_DIGIT(enc, c)) &&
3850 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_SUBEXP_CALL)) {
3851 /* (?&name), (?n), (?R), (?0) */
3852 int gnum;
3853 UChar *name;
3854 UChar *name_end;
3855
3856 if (c == 'R' || c == '0') {
3857 PINC; /* skip 'R' / '0' */
3858 if (!PPEEK_IS(')')) return ONIGERR_INVALID_GROUP_NAME;
3859 PINC; /* skip ')' */
3860 name_end = name = p;
3861 gnum = 0;
3862 }
3863 else {
3864 int numref = 1;
3865 if (c == '&') { /* (?&name) */
3866 PINC;
3867 numref = 0; /* don't allow number name */
3868 }
3869 name = p;
3870 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, numref);
3871 if (r < 0) return r;
3872 }
3873
3874 tok->type = TK_CALL;
3875 tok->u.call.name = name;
3876 tok->u.call.name_end = name_end;
3877 tok->u.call.gnum = gnum;
3878 tok->u.call.rel = 0;
3879 break;
3880 }
3881 else if ((c == '-' || c == '+') &&
3882 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_SUBEXP_CALL)) {
3883 /* (?+n), (?-n) */
3884 int gnum;
3885 UChar *name;
3886 UChar *name_end;
3887 OnigCodePoint cnext;
3888 PFETCH_READY;
3889
3890 PINC; /* skip '-' / '+' */
3891 cnext = PPEEK;
3892 if (ONIGENC_IS_CODE_DIGIT(enc, cnext)) {
3893 if (c == '-') PUNFETCH;
3894 name = p;
3895 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, 1);
3896 if (r < 0) return r;
3897
3898 tok->type = TK_CALL;
3899 tok->u.call.name = name;
3900 tok->u.call.name_end = name_end;
3901 tok->u.call.gnum = gnum;
3902 tok->u.call.rel = 1;
3903 break;
3904 }
3905 }
3906#endif /* USE_PERL_SUBEXP_CALL */
3907#ifdef USE_CAPITAL_P_NAMED_GROUP
3908 if (PPEEK_IS('P') &&
3909 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP)) {
3910 int gnum;
3911 UChar *name;
3912 UChar *name_end;
3913 PFETCH_READY;
3914
3915 PINC; /* skip 'P' */
3916 PFETCH(c);
3917 if (c == '=') { /* (?P=name): backref */
3918 r = fetch_named_backref_token((OnigCodePoint )'(', tok, &p, end, env);
3919 if (r < 0) return r;
3920 break;
3921 }
3922 else if (c == '>') { /* (?P>name): subexp call */
3923 name = p;
3924 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, 0);
3925 if (r < 0) return r;
3926
3927 tok->type = TK_CALL;
3928 tok->u.call.name = name;
3929 tok->u.call.name_end = name_end;
3930 tok->u.call.gnum = gnum;
3931 tok->u.call.rel = 0;
3932 break;
3933 }
3934 PUNFETCH;
3935 }
3936#endif /* USE_CAPITAL_P_NAMED_GROUP */
3937 PUNFETCH;
3938 }
3939
3940 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3941 tok->type = TK_SUBEXP_OPEN;
3942 break;
3943
3944 case ')':
3945 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3946 tok->type = TK_SUBEXP_CLOSE;
3947 break;
3948
3949 case '^':
3950 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3951 tok->type = TK_ANCHOR;
3952 tok->u.anchor.subtype = (IS_SINGLELINE(env->option)
3953 ? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE);
3954 break;
3955
3956 case '$':
3957 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3958 tok->type = TK_ANCHOR;
3959 tok->u.anchor.subtype = (IS_SINGLELINE(env->option)
3960 ? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE);
3961 break;
3962
3963 case '[':
3964 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break;
3965 tok->type = TK_CC_OPEN;
3966 break;
3967
3968 case ']':
3969 if (*src > env->pattern) /* /].../ is allowed. */
3970 CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
3971 break;
3972
3973 case '#':
3974 if (IS_EXTEND(env->option)) {
3975 while (!PEND) {
3976 PFETCH(c);
3977 if (ONIGENC_IS_CODE_NEWLINE(enc, c))
3978 break;
3979 }
3980 goto start;
3981 break;
3982 }
3983 break;
3984
3985 case ' ': case '\t': case '\n': case '\r': case '\f':
3986 if (IS_EXTEND(env->option))
3987 goto start;
3988 break;
3989
3990 default:
3991 /* string */
3992 break;
3993 }
3994 }
3995
3996#ifdef USE_VARIABLE_META_CHARS
3997 out:
3998#endif
3999 *src = p;
4000 return tok->type;
4001}
4002
4003static int
4004add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not,
4005 OnigEncoding enc ARG_UNUSED,
4006 OnigCodePoint sb_out, const OnigCodePoint mbr[])
4007{
4008 int i, r;
4009 OnigCodePoint j;
4010
4011 int n = ONIGENC_CODE_RANGE_NUM(mbr);
4012
4013 if (not == 0) {
4014 for (i = 0; i < n; i++) {
4015 for (j = ONIGENC_CODE_RANGE_FROM(mbr, i);
4016 j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
4017 if (j >= sb_out) {
4018 if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
4019 r = add_code_range_to_buf(&(cc->mbuf), j,
4020 ONIGENC_CODE_RANGE_TO(mbr, i));
4021 if (r != 0) return r;
4022 i++;
4023 }
4024
4025 goto sb_end;
4026 }
4027 BITSET_SET_BIT(cc->bs, j);
4028 }
4029 }
4030
4031 sb_end:
4032 for ( ; i < n; i++) {
4033 r = add_code_range_to_buf(&(cc->mbuf),
4034 ONIGENC_CODE_RANGE_FROM(mbr, i),
4035 ONIGENC_CODE_RANGE_TO(mbr, i));
4036 if (r != 0) return r;
4037 }
4038 }
4039 else {
4040 OnigCodePoint prev = 0;
4041
4042 for (i = 0; i < n; i++) {
4043 for (j = prev;
4044 j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) {
4045 if (j >= sb_out) {
4046 goto sb_end2;
4047 }
4048 BITSET_SET_BIT(cc->bs, j);
4049 }
4050 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
4051 }
4052 for (j = prev; j < sb_out; j++) {
4053 BITSET_SET_BIT(cc->bs, j);
4054 }
4055
4056 sb_end2:
4057 prev = sb_out;
4058
4059 for (i = 0; i < n; i++) {
4060 if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
4061 r = add_code_range_to_buf(&(cc->mbuf), prev,
4062 ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
4063 if (r != 0) return r;
4064 }
4065 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
4066 }
4067 if (prev < 0x7fffffff) {
4068 r = add_code_range_to_buf(&(cc->mbuf), prev, 0x7fffffff);
4069 if (r != 0) return r;
4070 }
4071 }
4072
4073 return 0;
4074}
4075
4076static int
4077add_ctype_to_cc(CClassNode* cc, int ctype, int not, int ascii_range, ScanEnv* env)
4078{
4079 int maxcode;
4080 int c, r;
4081 const OnigCodePoint *ranges;
4082 OnigCodePoint sb_out;
4083 OnigEncoding enc = env->enc;
4084
4085 r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
4086 if (r == 0) {
4087 if (ascii_range) {
4088 CClassNode ccwork;
4089 initialize_cclass(&ccwork);
4090 r = add_ctype_to_cc_by_range(&ccwork, ctype, not, env->enc, sb_out,
4091 ranges);
4092 if (r == 0) {
4093 if (not) {
4094 r = add_code_range_to_buf(&(ccwork.mbuf), 0x80, ONIG_LAST_CODE_POINT);
4095 }
4096 else {
4097 CClassNode ccascii;
4098 initialize_cclass(&ccascii);
4099 if (ONIGENC_MBC_MINLEN(env->enc) > 1) {
4100 add_code_range(&(ccascii.mbuf), env, 0x00, 0x7F);
4101 }
4102 else {
4103 bitset_set_range(ccascii.bs, 0x00, 0x7F);
4104 }
4105 r = and_cclass(&ccwork, &ccascii, enc);
4106 if (IS_NOT_NULL(ccascii.mbuf)) bbuf_free(ccascii.mbuf);
4107 }
4108 if (r == 0) {
4109 r = or_cclass(cc, &ccwork, enc);
4110 }
4111 if (IS_NOT_NULL(ccwork.mbuf)) bbuf_free(ccwork.mbuf);
4112 }
4113 }
4114 else {
4115 r = add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sb_out, ranges);
4116 }
4117 return r;
4118 }
4119 else if (r != ONIG_NO_SUPPORT_CONFIG) {
4120 return r;
4121 }
4122
4123 maxcode = ascii_range ? 0x80 : SINGLE_BYTE_SIZE;
4124 r = 0;
4125 switch (ctype) {
4126 case ONIGENC_CTYPE_ALPHA:
4127 case ONIGENC_CTYPE_BLANK:
4128 case ONIGENC_CTYPE_CNTRL:
4129 case ONIGENC_CTYPE_DIGIT:
4130 case ONIGENC_CTYPE_LOWER:
4131 case ONIGENC_CTYPE_PUNCT:
4132 case ONIGENC_CTYPE_SPACE:
4133 case ONIGENC_CTYPE_UPPER:
4134 case ONIGENC_CTYPE_XDIGIT:
4135 case ONIGENC_CTYPE_ASCII:
4136 case ONIGENC_CTYPE_ALNUM:
4137 if (not != 0) {
4138 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
4139 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
4140 BITSET_SET_BIT(cc->bs, c);
4141 }
4142 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
4143 }
4144 else {
4145 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
4146 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
4147 BITSET_SET_BIT(cc->bs, c);
4148 }
4149 }
4150 break;
4151
4152 case ONIGENC_CTYPE_GRAPH:
4153 case ONIGENC_CTYPE_PRINT:
4154 if (not != 0) {
4155 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
4156 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)
4157 || c >= maxcode)
4158 BITSET_SET_BIT(cc->bs, c);
4159 }
4160 if (ascii_range)
4161 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
4162 }
4163 else {
4164 for (c = 0; c < maxcode; c++) {
4165 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
4166 BITSET_SET_BIT(cc->bs, c);
4167 }
4168 if (! ascii_range)
4169 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
4170 }
4171 break;
4172
4173 case ONIGENC_CTYPE_WORD:
4174 if (not == 0) {
4175 for (c = 0; c < maxcode; c++) {
4176 if (ONIGENC_IS_CODE_WORD(enc, c)) BITSET_SET_BIT(cc->bs, c);
4177 }
4178 if (! ascii_range)
4179 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
4180 }
4181 else {
4182 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
4183 if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* check invalid code point */
4184 && (! ONIGENC_IS_CODE_WORD(enc, c) || c >= maxcode))
4185 BITSET_SET_BIT(cc->bs, c);
4186 }
4187 if (ascii_range)
4188 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
4189 }
4190 break;
4191
4192 default:
4193 return ONIGERR_PARSER_BUG;
4194 break;
4195 }
4196
4197 return r;
4198}
4199
4200static int
4201parse_posix_bracket(CClassNode* cc, CClassNode* asc_cc,
4202 UChar** src, UChar* end, ScanEnv* env)
4203{
4204#define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20
4205#define POSIX_BRACKET_NAME_MIN_LEN 4
4206
4207 static const PosixBracketEntryType PBS[] = {
4208 POSIX_BRACKET_ENTRY_INIT("alnum", ONIGENC_CTYPE_ALNUM),
4209 POSIX_BRACKET_ENTRY_INIT("alpha", ONIGENC_CTYPE_ALPHA),
4210 POSIX_BRACKET_ENTRY_INIT("blank", ONIGENC_CTYPE_BLANK),
4211 POSIX_BRACKET_ENTRY_INIT("cntrl", ONIGENC_CTYPE_CNTRL),
4212 POSIX_BRACKET_ENTRY_INIT("digit", ONIGENC_CTYPE_DIGIT),
4213 POSIX_BRACKET_ENTRY_INIT("graph", ONIGENC_CTYPE_GRAPH),
4214 POSIX_BRACKET_ENTRY_INIT("lower", ONIGENC_CTYPE_LOWER),
4215 POSIX_BRACKET_ENTRY_INIT("print", ONIGENC_CTYPE_PRINT),
4216 POSIX_BRACKET_ENTRY_INIT("punct", ONIGENC_CTYPE_PUNCT),
4217 POSIX_BRACKET_ENTRY_INIT("space", ONIGENC_CTYPE_SPACE),
4218 POSIX_BRACKET_ENTRY_INIT("upper", ONIGENC_CTYPE_UPPER),
4219 POSIX_BRACKET_ENTRY_INIT("xdigit", ONIGENC_CTYPE_XDIGIT),
4220 POSIX_BRACKET_ENTRY_INIT("ascii", ONIGENC_CTYPE_ASCII),
4221 POSIX_BRACKET_ENTRY_INIT("word", ONIGENC_CTYPE_WORD),
4222 };
4223
4224 const PosixBracketEntryType *pb;
4225 int not, i, r;
4226 int ascii_range;
4227 OnigCodePoint c;
4228 OnigEncoding enc = env->enc;
4229 UChar *p = *src;
4230
4231 if (PPEEK_IS('^')) {
4232 PINC_S;
4233 not = 1;
4234 }
4235 else
4236 not = 0;
4237
4238 if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3)
4239 goto not_posix_bracket;
4240
4241 ascii_range = IS_ASCII_RANGE(env->option) &&
4242 ! IS_POSIX_BRACKET_ALL_RANGE(env->option);
4243 for (pb = PBS; pb < PBS + numberof(PBS); pb++) {
4244 if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
4245 p = (UChar* )onigenc_step(enc, p, end, pb->len);
4246 if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
4247 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
4248
4249 r = add_ctype_to_cc(cc, pb->ctype, not, ascii_range, env);
4250 if (r != 0) return r;
4251
4252 if (IS_NOT_NULL(asc_cc)) {
4253 if (pb->ctype != ONIGENC_CTYPE_WORD &&
4254 pb->ctype != ONIGENC_CTYPE_ASCII &&
4255 !ascii_range)
4256 r = add_ctype_to_cc(asc_cc, pb->ctype, not, ascii_range, env);
4257 if (r != 0) return r;
4258 }
4259
4260 PINC_S; PINC_S;
4261 *src = p;
4262 return 0;
4263 }
4264 }
4265
4266 not_posix_bracket:
4267 c = 0;
4268 i = 0;
4269 while (!PEND && ((c = PPEEK) != ':') && c != ']') {
4270 PINC_S;
4271 if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
4272 }
4273 if (c == ':' && ! PEND) {
4274 PINC_S;
4275 if (! PEND) {
4276 PFETCH_S(c);
4277 if (c == ']')
4278 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
4279 }
4280 }
4281
4282 return 1; /* 1: is not POSIX bracket, but no error. */
4283}
4284
4285static int
4286fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
4287{
4288 int r;
4289 OnigCodePoint c;
4290 OnigEncoding enc = env->enc;
4291 UChar *prev, *start, *p = *src;
4292
4293 r = 0;
4294 start = prev = p;
4295
4296 while (!PEND) {
4297 prev = p;
4298 PFETCH_S(c);
4299 if (c == '}') {
4300 r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev);
4301 if (r < 0) break;
4302
4303 *src = p;
4304 return r;
4305 }
4306 else if (c == '(' || c == ')' || c == '{' || c == '|') {
4307 r = ONIGERR_INVALID_CHAR_PROPERTY_NAME;
4308 break;
4309 }
4310 }
4311
4312 onig_scan_env_set_error_string(env, r, *src, prev);
4313 return r;
4314}
4315
4316static int cclass_case_fold(Node** np, CClassNode* cc, CClassNode* asc_cc, ScanEnv* env);
4317
4318static int
4319parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end,
4320 ScanEnv* env)
4321{
4322 int r, ctype;
4323 CClassNode* cc;
4324
4325 ctype = fetch_char_property_to_ctype(src, end, env);
4326 if (ctype < 0) return ctype;
4327
4328 *np = node_new_cclass();
4329 CHECK_NULL_RETURN_MEMERR(*np);
4330 cc = NCCLASS(*np);
4331 r = add_ctype_to_cc(cc, ctype, 0, 0, env);
4332 if (r != 0) return r;
4333 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
4334
4335 if (IS_IGNORECASE(env->option)) {
4336 if (ctype != ONIGENC_CTYPE_ASCII)
4337 r = cclass_case_fold(np, cc, cc, env);
4338 }
4339 return r;
4340}
4341
4342
/* Progress of the character class parser between two class members. */
enum CCSTATE {
  CCS_VALUE    = 0,  /* a value or class was read; a value may still be pending */
  CCS_RANGE    = 1,  /* a range operator followed a value; end value expected */
  CCS_COMPLETE = 2,  /* a range has just been completed */
  CCS_START    = 3   /* nothing read yet (class start or right after "&&") */
};
4349
/* Kind of the most recently read character class member. */
enum CCVALTYPE {
  CCV_SB         = 0,  /* single value recorded in the bitset */
  CCV_CODE_POINT = 1,  /* single value recorded as a code range in mbuf */
  CCV_CLASS      = 2   /* a whole class / ctype; no single value is pending */
};
4355
4356static int
4357next_state_class(CClassNode* cc, CClassNode* asc_cc,
4358 OnigCodePoint* vs, enum CCVALTYPE* type,
4359 enum CCSTATE* state, ScanEnv* env)
4360{
4361 int r;
4362
4363 if (*state == CCS_RANGE)
4364 return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;
4365
4366 if (*state == CCS_VALUE && *type != CCV_CLASS) {
4367 if (*type == CCV_SB) {
4368 BITSET_SET_BIT(cc->bs, (int )(*vs));
4369 if (IS_NOT_NULL(asc_cc))
4370 BITSET_SET_BIT(asc_cc->bs, (int )(*vs));
4371 }
4372 else if (*type == CCV_CODE_POINT) {
4373 r = add_code_range(&(cc->mbuf), env, *vs, *vs);
4374 if (r < 0) return r;
4375 if (IS_NOT_NULL(asc_cc)) {
4376 r = add_code_range(&(asc_cc->mbuf), env, *vs, *vs);
4377 if (r < 0) return r;
4378 }
4379 }
4380 }
4381
4382 *state = CCS_VALUE;
4383 *type = CCV_CLASS;
4384 return 0;
4385}
4386
4387static int
4388next_state_val(CClassNode* cc, CClassNode* asc_cc,
4389 OnigCodePoint *vs, OnigCodePoint v,
4390 int* vs_israw, int v_israw,
4391 enum CCVALTYPE intype, enum CCVALTYPE* type,
4392 enum CCSTATE* state, ScanEnv* env)
4393{
4394 int r;
4395
4396 switch (*state) {
4397 case CCS_VALUE:
4398 if (*type == CCV_SB) {
4399 BITSET_SET_BIT(cc->bs, (int )(*vs));
4400 if (IS_NOT_NULL(asc_cc))
4401 BITSET_SET_BIT(asc_cc->bs, (int )(*vs));
4402 }
4403 else if (*type == CCV_CODE_POINT) {
4404 r = add_code_range(&(cc->mbuf), env, *vs, *vs);
4405 if (r < 0) return r;
4406 if (IS_NOT_NULL(asc_cc)) {
4407 r = add_code_range(&(asc_cc->mbuf), env, *vs, *vs);
4408 if (r < 0) return r;
4409 }
4410 }
4411 break;
4412
4413 case CCS_RANGE:
4414 if (intype == *type) {
4415 if (intype == CCV_SB) {
4416 if (*vs > 0xff || v > 0xff)
4417 return ONIGERR_INVALID_CODE_POINT_VALUE;
4418
4419 if (*vs > v) {
4420 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4421 goto ccs_range_end;
4422 else
4423 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4424 }
4425 bitset_set_range(cc->bs, (int )*vs, (int )v);
4426 if (IS_NOT_NULL(asc_cc))
4427 bitset_set_range(asc_cc->bs, (int )*vs, (int )v);
4428 }
4429 else {
4430 r = add_code_range(&(cc->mbuf), env, *vs, v);
4431 if (r < 0) return r;
4432 if (IS_NOT_NULL(asc_cc)) {
4433 r = add_code_range(&(asc_cc->mbuf), env, *vs, v);
4434 if (r < 0) return r;
4435 }
4436 }
4437 }
4438 else {
4439#if 0
4440 if (intype == CCV_CODE_POINT && *type == CCV_SB) {
4441#endif
4442 if (*vs > v) {
4443 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4444 goto ccs_range_end;
4445 else
4446 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4447 }
4448 bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff));
4449 r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v);
4450 if (r < 0) return r;
4451 if (IS_NOT_NULL(asc_cc)) {
4452 bitset_set_range(asc_cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff));
4453 r = add_code_range(&(asc_cc->mbuf), env, (OnigCodePoint )*vs, v);
4454 if (r < 0) return r;
4455 }
4456#if 0
4457 }
4458 else
4459 return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE;
4460#endif
4461 }
4462 ccs_range_end:
4463 *state = CCS_COMPLETE;
4464 break;
4465
4466 case CCS_COMPLETE:
4467 case CCS_START:
4468 *state = CCS_VALUE;
4469 break;
4470
4471 default:
4472 break;
4473 }
4474
4475 *vs_israw = v_israw;
4476 *vs = v;
4477 *type = intype;
4478 return 0;
4479}
4480
4481static int
4482code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
4483 ScanEnv* env)
4484{
4485 int in_esc;
4486 OnigCodePoint code;
4487 OnigEncoding enc = env->enc;
4488 UChar* p = from;
4489
4490 in_esc = 0;
4491 while (! PEND) {
4492 if (ignore_escaped && in_esc) {
4493 in_esc = 0;
4494 }
4495 else {
4496 PFETCH_S(code);
4497 if (code == c) return 1;
4498 if (code == MC_ESC(env->syntax)) in_esc = 1;
4499 }
4500 }
4501 return 0;
4502}
4503
4504static int
4505parse_char_class(Node** np, Node** asc_np, OnigToken* tok, UChar** src, UChar* end,
4506 ScanEnv* env)
4507{
4508 int r, neg, len, fetched, and_start;
4509 OnigCodePoint v, vs;
4510 UChar *p;
4511 Node* node;
4512 Node* asc_node;
4513 CClassNode *cc, *prev_cc;
4514 CClassNode *asc_cc, *asc_prev_cc;
4515 CClassNode work_cc, asc_work_cc;
4516
4517 enum CCSTATE state;
4518 enum CCVALTYPE val_type, in_type;
4519 int val_israw, in_israw;
4520
4521 prev_cc = asc_prev_cc = (CClassNode* )NULL;
4522 *np = *asc_np = NULL_NODE;
4523 r = fetch_token_in_cc(tok, src, end, env);
4524 if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) {
4525 neg = 1;
4526 r = fetch_token_in_cc(tok, src, end, env);
4527 }
4528 else {
4529 neg = 0;
4530 }
4531
4532 if (r < 0) return r;
4533 if (r == TK_CC_CLOSE) {
4534 if (! code_exist_check((OnigCodePoint )']',
4535 *src, env->pattern_end, 1, env))
4536 return ONIGERR_EMPTY_CHAR_CLASS;
4537
4538 CC_ESC_WARN(env, (UChar* )"]");
4539 r = tok->type = TK_CHAR; /* allow []...] */
4540 }
4541
4542 *np = node = node_new_cclass();
4543 CHECK_NULL_RETURN_MEMERR(node);
4544 cc = NCCLASS(node);
4545
4546 if (IS_IGNORECASE(env->option)) {
4547 *asc_np = asc_node = node_new_cclass();
4548 CHECK_NULL_RETURN_MEMERR(asc_node);
4549 asc_cc = NCCLASS(asc_node);
4550 }
4551 else {
4552 asc_node = NULL_NODE;
4553 asc_cc = NULL;
4554 }
4555
4556 and_start = 0;
4557 state = CCS_START;
4558 p = *src;
4559 while (r != TK_CC_CLOSE) {
4560 fetched = 0;
4561 switch (r) {
4562 case TK_CHAR:
4563 if ((tok->u.code >= SINGLE_BYTE_SIZE) ||
4564 (len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c)) > 1) {
4565 in_type = CCV_CODE_POINT;
4566 }
4567 else if (len < 0) {
4568 r = len;
4569 goto err;
4570 }
4571 else {
4572 sb_char:
4573 in_type = CCV_SB;
4574 }
4575 v = (OnigCodePoint )tok->u.c;
4576 in_israw = 0;
4577 goto val_entry2;
4578 break;
4579
4580 case TK_RAW_BYTE:
4581 /* tok->base != 0 : octal or hexadec. */
4582 if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) {
4583 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4584 UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
4585 UChar* psave = p;
4586 int i, base = tok->base;
4587
4588 buf[0] = (UChar )tok->u.c;
4589 for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
4590 r = fetch_token_in_cc(tok, &p, end, env);
4591 if (r < 0) goto err;
4592 if (r != TK_RAW_BYTE || tok->base != base) {
4593 fetched = 1;
4594 break;
4595 }
4596 buf[i] = (UChar )tok->u.c;
4597 }
4598
4599 if (i < ONIGENC_MBC_MINLEN(env->enc)) {
4600 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4601 goto err;
4602 }
4603
4604 len = enclen(env->enc, buf);
4605 if (i < len) {
4606 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4607 goto err;
4608 }
4609 else if (i > len) { /* fetch back */
4610 p = psave;
4611 for (i = 1; i < len; i++) {
4612 r = fetch_token_in_cc(tok, &p, end, env);
4613 }
4614 fetched = 0;
4615 }
4616
4617 if (i == 1) {
4618 v = (OnigCodePoint )buf[0];
4619 goto raw_single;
4620 }
4621 else {
4622 v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
4623 in_type = CCV_CODE_POINT;
4624 }
4625 }
4626 else {
4627 v = (OnigCodePoint )tok->u.c;
4628 raw_single:
4629 in_type = CCV_SB;
4630 }
4631 in_israw = 1;
4632 goto val_entry2;
4633 break;
4634
4635 case TK_CODE_POINT:
4636 v = tok->u.code;
4637 in_israw = 1;
4638 val_entry:
4639 len = ONIGENC_CODE_TO_MBCLEN(env->enc, v);
4640 if (len < 0) {
4641 r = len;
4642 goto err;
4643 }
4644 in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT);
4645 val_entry2:
4646 r = next_state_val(cc, asc_cc, &vs, v, &val_israw, in_israw, in_type, &val_type,
4647 &state, env);
4648 if (r != 0) goto err;
4649 break;
4650
4651 case TK_POSIX_BRACKET_OPEN:
4652 r = parse_posix_bracket(cc, asc_cc, &p, end, env);
4653 if (r < 0) goto err;
4654 if (r == 1) { /* is not POSIX bracket */
4655 CC_ESC_WARN(env, (UChar* )"[");
4656 p = tok->backp;
4657 v = (OnigCodePoint )tok->u.c;
4658 in_israw = 0;
4659 goto val_entry;
4660 }
4661 goto next_class;
4662 break;
4663
4664 case TK_CHAR_TYPE:
4665 r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not,
4666 IS_ASCII_RANGE(env->option), env);
4667 if (r != 0) return r;
4668 if (IS_NOT_NULL(asc_cc)) {
4669 if (tok->u.prop.ctype != ONIGENC_CTYPE_WORD)
4670 r = add_ctype_to_cc(asc_cc, tok->u.prop.ctype, tok->u.prop.not,
4671 IS_ASCII_RANGE(env->option), env);
4672 if (r != 0) return r;
4673 }
4674
4675 next_class:
4676 r = next_state_class(cc, asc_cc, &vs, &val_type, &state, env);
4677 if (r != 0) goto err;
4678 break;
4679
4680 case TK_CHAR_PROPERTY:
4681 {
4682 int ctype;
4683
4684 ctype = fetch_char_property_to_ctype(&p, end, env);
4685 if (ctype < 0) return ctype;
4686 r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, 0, env);
4687 if (r != 0) return r;
4688 if (IS_NOT_NULL(asc_cc)) {
4689 if (ctype != ONIGENC_CTYPE_ASCII)
4690 r = add_ctype_to_cc(asc_cc, ctype, tok->u.prop.not, 0, env);
4691 if (r != 0) return r;
4692 }
4693 goto next_class;
4694 }
4695 break;
4696
4697 case TK_CC_RANGE:
4698 if (state == CCS_VALUE) {
4699 r = fetch_token_in_cc(tok, &p, end, env);
4700 if (r < 0) goto err;
4701 fetched = 1;
4702 if (r == TK_CC_CLOSE) { /* allow [x-] */
4703 range_end_val:
4704 v = (OnigCodePoint )'-';
4705 in_israw = 0;
4706 goto val_entry;
4707 }
4708 else if (r == TK_CC_AND) {
4709 CC_ESC_WARN(env, (UChar* )"-");
4710 goto range_end_val;
4711 }
4712 state = CCS_RANGE;
4713 }
4714 else if (state == CCS_START) {
4715 /* [-xa] is allowed */
4716 v = (OnigCodePoint )tok->u.c;
4717 in_israw = 0;
4718
4719 r = fetch_token_in_cc(tok, &p, end, env);
4720 if (r < 0) goto err;
4721 fetched = 1;
4722 /* [--x] or [a&&-x] is warned. */
4723 if (r == TK_CC_RANGE || and_start != 0)
4724 CC_ESC_WARN(env, (UChar* )"-");
4725
4726 goto val_entry;
4727 }
4728 else if (state == CCS_RANGE) {
4729 CC_ESC_WARN(env, (UChar* )"-");
4730 goto sb_char; /* [!--x] is allowed */
4731 }
4732 else { /* CCS_COMPLETE */
4733 r = fetch_token_in_cc(tok, &p, end, env);
4734 if (r < 0) goto err;
4735 fetched = 1;
4736 if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */
4737 else if (r == TK_CC_AND) {
4738 CC_ESC_WARN(env, (UChar* )"-");
4739 goto range_end_val;
4740 }
4741
4742 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
4743 CC_ESC_WARN(env, (UChar* )"-");
4744 goto range_end_val; /* [0-9-a] is allowed as [0-9\-a] */
4745 }
4746 r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
4747 goto err;
4748 }
4749 break;
4750
4751 case TK_CC_CC_OPEN: /* [ */
4752 {
4753 Node *anode, *aasc_node;
4754 CClassNode* acc;
4755
4756 r = parse_char_class(&anode, &aasc_node, tok, &p, end, env);
4757 if (r == 0) {
4758 acc = NCCLASS(anode);
4759 r = or_cclass(cc, acc, env->enc);
4760 }
4761 if (r == 0 && IS_NOT_NULL(aasc_node)) {
4762 acc = NCCLASS(aasc_node);
4763 r = or_cclass(asc_cc, acc, env->enc);
4764 }
4765 onig_node_free(anode);
4766 onig_node_free(aasc_node);
4767 if (r != 0) goto err;
4768 }
4769 break;
4770
4771 case TK_CC_AND: /* && */
4772 {
4773 if (state == CCS_VALUE) {
4774 r = next_state_val(cc, asc_cc, &vs, 0, &val_israw, 0, val_type,
4775 &val_type, &state, env);
4776 if (r != 0) goto err;
4777 }
4778 /* initialize local variables */
4779 and_start = 1;
4780 state = CCS_START;
4781
4782 if (IS_NOT_NULL(prev_cc)) {
4783 r = and_cclass(prev_cc, cc, env->enc);
4784 if (r != 0) goto err;
4785 bbuf_free(cc->mbuf);
4786 if (IS_NOT_NULL(asc_cc)) {
4787 r = and_cclass(asc_prev_cc, asc_cc, env->enc);
4788 if (r != 0) goto err;
4789 bbuf_free(asc_cc->mbuf);
4790 }
4791 }
4792 else {
4793 prev_cc = cc;
4794 cc = &work_cc;
4795 if (IS_NOT_NULL(asc_cc)) {
4796 asc_prev_cc = asc_cc;
4797 asc_cc = &asc_work_cc;
4798 }
4799 }
4800 initialize_cclass(cc);
4801 if (IS_NOT_NULL(asc_cc))
4802 initialize_cclass(asc_cc);
4803 }
4804 break;
4805
4806 case TK_EOT:
4807 r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
4808 goto err;
4809 break;
4810 default:
4811 r = ONIGERR_PARSER_BUG;
4812 goto err;
4813 break;
4814 }
4815
4816 if (fetched)
4817 r = tok->type;
4818 else {
4819 r = fetch_token_in_cc(tok, &p, end, env);
4820 if (r < 0) goto err;
4821 }
4822 }
4823
4824 if (state == CCS_VALUE) {
4825 r = next_state_val(cc, asc_cc, &vs, 0, &val_israw, 0, val_type,
4826 &val_type, &state, env);
4827 if (r != 0) goto err;
4828 }
4829
4830 if (IS_NOT_NULL(prev_cc)) {
4831 r = and_cclass(prev_cc, cc, env->enc);
4832 if (r != 0) goto err;
4833 bbuf_free(cc->mbuf);
4834 cc = prev_cc;
4835 if (IS_NOT_NULL(asc_cc)) {
4836 r = and_cclass(asc_prev_cc, asc_cc, env->enc);
4837 if (r != 0) goto err;
4838 bbuf_free(asc_cc->mbuf);
4839 asc_cc = asc_prev_cc;
4840 }
4841 }
4842
4843 if (neg != 0) {
4844 NCCLASS_SET_NOT(cc);
4845 if (IS_NOT_NULL(asc_cc))
4846 NCCLASS_SET_NOT(asc_cc);
4847 }
4848 else {
4849 NCCLASS_CLEAR_NOT(cc);
4850 if (IS_NOT_NULL(asc_cc))
4851 NCCLASS_CLEAR_NOT(asc_cc);
4852 }
4853 if (IS_NCCLASS_NOT(cc) &&
4854 IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
4855 int is_empty;
4856
4857 is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
4858 if (is_empty != 0)
4859 BITSET_IS_EMPTY(cc->bs, is_empty);
4860
4861 if (is_empty == 0) {
4862#define NEWLINE_CODE 0x0a
4863
4864 if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
4865 if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
4866 BITSET_SET_BIT(cc->bs, NEWLINE_CODE);
4867 else {
4868 r = add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
4869 if (r < 0) goto err;
4870 }
4871 }
4872 }
4873 }
4874 *src = p;
4875 return 0;
4876
4877 err:
4878 if (cc != NCCLASS(*np))
4879 bbuf_free(cc->mbuf);
4880 if (IS_NOT_NULL(asc_cc) && (asc_cc != NCCLASS(*asc_np)))
4881 bbuf_free(asc_cc->mbuf);
4882 return r;
4883}
4884
4885static int parse_subexp(Node** top, OnigToken* tok, int term,
4886 UChar** src, UChar* end, ScanEnv* env);
4887
4888static int
4889parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
4890 ScanEnv* env)
4891{
4892 int r = 0, num;
4893 Node *target, *work1 = NULL, *work2 = NULL;
4894 OnigOptionType option;
4895 OnigCodePoint c;
4896 OnigEncoding enc = env->enc;
4897
4898#ifdef USE_NAMED_GROUP
4899 int list_capture;
4900#endif
4901
4902 UChar* p = *src;
4903 PFETCH_READY;
4904
4905 *np = NULL;
4906 if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
4907
4908 option = env->option;
4909 if (PPEEK_IS('?') &&
4910 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
4911 PINC;
4912 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4913
4914 PFETCH(c);
4915 switch (c) {
4916 case ':': /* (?:...) grouping only */
4917 group:
4918 r = fetch_token(tok, &p, end, env);
4919 if (r < 0) return r;
4920 r = parse_subexp(np, tok, term, &p, end, env);
4921 if (r < 0) return r;
4922 *src = p;
4923 return 1; /* group */
4924 break;
4925
4926 case '=':
4927 *np = onig_node_new_anchor(ANCHOR_PREC_READ);
4928 break;
4929 case '!': /* preceding read */
4930 *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT);
4931 break;
4932 case '>': /* (?>...) stop backtrack */
4933 *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
4934 break;
4935
4936#ifdef USE_NAMED_GROUP
4937 case '\'':
4938 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4939 goto named_group1;
4940 }
4941 else
4942 return ONIGERR_UNDEFINED_GROUP_OPTION;
4943 break;
4944
4945#ifdef USE_CAPITAL_P_NAMED_GROUP
4946 case 'P': /* (?P<name>...) */
4947 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP)) {
4948 PFETCH(c);
4949 if (c == '<') goto named_group1;
4950 }
4951 return ONIGERR_UNDEFINED_GROUP_OPTION;
4952 break;
4953#endif
4954#endif
4955
4956 case '<': /* look behind (?<=...), (?<!...) */
4957 PFETCH(c);
4958 if (c == '=')
4959 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND);
4960 else if (c == '!')
4961 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT);
4962#ifdef USE_NAMED_GROUP
4963 else { /* (?<name>...) */
4964 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4965 UChar *name;
4966 UChar *name_end;
4967
4968 PUNFETCH;
4969 c = '<';
4970
4971 named_group1:
4972 list_capture = 0;
4973
4974 named_group2:
4975 name = p;
4976 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0);
4977 if (r < 0) return r;
4978
4979 num = scan_env_add_mem_entry(env);
4980 if (num < 0) return num;
4981 if (list_capture != 0 && num >= (int )BIT_STATUS_BITS_NUM)
4982 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4983
4984 r = name_add(env->reg, name, name_end, num, env);
4985 if (r != 0) return r;
4986 *np = node_new_enclose_memory(env->option, 1);
4987 CHECK_NULL_RETURN_MEMERR(*np);
4988 NENCLOSE(*np)->regnum = num;
4989 if (list_capture != 0)
4990 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4991 env->num_named++;
4992 }
4993 else {
4994 return ONIGERR_UNDEFINED_GROUP_OPTION;
4995 }
4996 }
4997#else
4998 else {
4999 return ONIGERR_UNDEFINED_GROUP_OPTION;
5000 }
5001#endif
5002 break;
5003
5004 case '@':
5005 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {
5006#ifdef USE_NAMED_GROUP
5007 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
5008 PFETCH(c);
5009 if (c == '<' || c == '\'') {
5010 list_capture = 1;
5011 goto named_group2; /* (?@<name>...) */
5012 }
5013 PUNFETCH;
5014 }
5015#endif
5016 *np = node_new_enclose_memory(env->option, 0);
5017 CHECK_NULL_RETURN_MEMERR(*np);
5018 num = scan_env_add_mem_entry(env);
5019 if (num < 0) return num;
5020 if (num >= (int )BIT_STATUS_BITS_NUM)
5021 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
5022
5023 NENCLOSE(*np)->regnum = num;
5024 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
5025 }
5026 else {
5027 return ONIGERR_UNDEFINED_GROUP_OPTION;
5028 }
5029 break;
5030
5031 case '(': /* conditional expression: (?(cond)yes), (?(cond)yes|no) */
5032 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LPAREN_CONDITION)) {
5033 UChar *name = NULL;
5034 UChar *name_end;
5035 PFETCH(c);
5036 if (ONIGENC_IS_CODE_DIGIT(enc, c)) { /* (n) */
5037 PUNFETCH;
5038 r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &num, 1);
5039 if (r < 0) return r;
5040#if 0
5041 /* Relative number is not currently supported. (same as Perl) */
5042 if (num < 0) {
5043 num = BACKREF_REL_TO_ABS(num, env);
5044 if (num <= 0)
5045 return ONIGERR_INVALID_BACKREF;
5046 }
5047#endif
5048 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) {
5049 if (num > env->num_mem ||
5050 IS_NULL(SCANENV_MEM_NODES(env)[num]))
5051 return ONIGERR_INVALID_BACKREF;
5052 }
5053 }
5054#ifdef USE_NAMED_GROUP
5055 else if (c == '<' || c == '\'') { /* (<name>), ('name') */
5056 int nums;
5057 int *backs;
5058
5059 name = p;
5060 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0);
5061 if (r < 0) return r;
5062 PFETCH(c);
5063 if (c != ')') return ONIGERR_UNDEFINED_GROUP_OPTION;
5064
5065 nums = onig_name_to_group_numbers(env->reg, name, name_end, &backs);
5066 if (nums <= 0) {
5067 onig_scan_env_set_error_string(env,
5068 ONIGERR_UNDEFINED_NAME_REFERENCE, name, name_end);
5069 return ONIGERR_UNDEFINED_NAME_REFERENCE;
5070 }
5071 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) {
5072 int i;
5073 for (i = 0; i < nums; i++) {
5074 if (backs[i] > env->num_mem ||
5075 IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
5076 return ONIGERR_INVALID_BACKREF;
5077 }
5078 }
5079 num = backs[0]; /* XXX: use left most named group as Perl */
5080 }
5081#endif
5082 else
5083 return ONIGERR_INVALID_CONDITION_PATTERN;
5084 *np = node_new_enclose(ENCLOSE_CONDITION);
5085 CHECK_NULL_RETURN_MEMERR(*np);
5086 NENCLOSE(*np)->regnum = num;
5087 if (IS_NOT_NULL(name)) NENCLOSE(*np)->state |= NST_NAME_REF;
5088 }
5089 else
5090 return ONIGERR_UNDEFINED_GROUP_OPTION;
5091 break;
5092
5093#if 0
5094 case '|': /* branch reset: (?|...) */
5095 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_VBAR_BRANCH_RESET)) {
5096 /* TODO */
5097 }
5098 else
5099 return ONIGERR_UNDEFINED_GROUP_OPTION;
5100 break;
5101#endif
5102
5103 case '^': /* loads default options */
5104 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
5105 /* d-imsx */
5106 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
5107 ONOFF(option, ONIG_OPTION_IGNORECASE, 1);
5108 ONOFF(option, ONIG_OPTION_SINGLELINE, 0);
5109 ONOFF(option, ONIG_OPTION_MULTILINE, 1);
5110 ONOFF(option, ONIG_OPTION_EXTEND, 1);
5111 PFETCH(c);
5112 }
5113#if 0
5114 else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
5115 /* d-imx */
5116 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0);
5117 ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 0);
5118 ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 0);
5119 ONOFF(option, ONIG_OPTION_IGNORECASE, 1);
5120 ONOFF(option, ONIG_OPTION_MULTILINE, 1);
5121 ONOFF(option, ONIG_OPTION_EXTEND, 1);
5122 PFETCH(c);
5123 }
5124#endif
5125 else {
5126 return ONIGERR_UNDEFINED_GROUP_OPTION;
5127 }
5128 /* fall through */
5129#ifdef USE_POSIXLINE_OPTION
5130 case 'p':
5131#endif
5132 case '-': case 'i': case 'm': case 's': case 'x':
5133 case 'a': case 'd': case 'l': case 'u':
5134 {
5135 int neg = 0;
5136
5137 while (1) {
5138 switch (c) {
5139 case ':':
5140 case ')':
5141 break;
5142
5143 case '-': neg = 1; break;
5144 case 'x': ONOFF(option, ONIG_OPTION_EXTEND, neg); break;
5145 case 'i': ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break;
5146 case 's':
5147 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
5148 ONOFF(option, ONIG_OPTION_MULTILINE, neg);
5149 }
5150 else
5151 return ONIGERR_UNDEFINED_GROUP_OPTION;
5152 break;
5153
5154 case 'm':
5155 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
5156 ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0));
5157 }
5158 else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
5159 ONOFF(option, ONIG_OPTION_MULTILINE, neg);
5160 }
5161 else
5162 return ONIGERR_UNDEFINED_GROUP_OPTION;
5163 break;
5164#ifdef USE_POSIXLINE_OPTION
5165 case 'p':
5166 ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
5167 break;
5168#endif
5169
5170 case 'a': /* limits \d, \s, \w and POSIX brackets to ASCII range */
5171 if ((IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) ||
5172 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) &&
5173 (neg == 0)) {
5174 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0);
5175 ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 1);
5176 ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 1);
5177 }
5178 else
5179 return ONIGERR_UNDEFINED_GROUP_OPTION;
5180 break;
5181
5182 case 'u':
5183 if ((IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) ||
5184 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) &&
5185 (neg == 0)) {
5186 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
5187 ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 1);
5188 ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 1);
5189 }
5190 else
5191 return ONIGERR_UNDEFINED_GROUP_OPTION;
5192 break;
5193
5194 case 'd':
5195 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) &&
5196 (neg == 0)) {
5197 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
5198 }
5199 else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY) &&
5200 (neg == 0)) {
5201 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0);
5202 ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 0);
5203 ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 0);
5204 }
5205 else
5206 return ONIGERR_UNDEFINED_GROUP_OPTION;
5207 break;
5208
5209 case 'l':
5210 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) && (neg == 0)) {
5211 ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
5212 }
5213 else
5214 return ONIGERR_UNDEFINED_GROUP_OPTION;
5215 break;
5216
5217 default:
5218 return ONIGERR_UNDEFINED_GROUP_OPTION;
5219 }
5220
5221 if (c == ')') {
5222 *np = node_new_option(option);
5223 CHECK_NULL_RETURN_MEMERR(*np);
5224 *src = p;
5225 return 2; /* option only */
5226 }
5227 else if (c == ':') {
5228 OnigOptionType prev = env->option;
5229
5230 env->option = option;
5231 r = fetch_token(tok, &p, end, env);
5232 if (r < 0) return r;
5233 r = parse_subexp(&target, tok, term, &p, end, env);
5234 env->option = prev;
5235 if (r < 0) return r;
5236 *np = node_new_option(option);
5237 CHECK_NULL_RETURN_MEMERR(*np);
5238 NENCLOSE(*np)->target = target;
5239 *src = p;
5240 return 0;
5241 }
5242
5243 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
5244 PFETCH(c);
5245 }
5246 }
5247 break;
5248
5249 default:
5250 return ONIGERR_UNDEFINED_GROUP_OPTION;
5251 }
5252 }
5253 else {
5254 if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP))
5255 goto group;
5256
5257 *np = node_new_enclose_memory(env->option, 0);
5258 CHECK_NULL_RETURN_MEMERR(*np);
5259 num = scan_env_add_mem_entry(env);
5260 if (num < 0) return num;
5261 NENCLOSE(*np)->regnum = num;
5262 }
5263
5264 CHECK_NULL_RETURN_MEMERR(*np);
5265 r = fetch_token(tok, &p, end, env);
5266 if (r < 0) return r;
5267 r = parse_subexp(&target, tok, term, &p, end, env);
5268 if (r < 0) {
5269 onig_node_free(target);
5270 return r;
5271 }
5272
5273 if (NTYPE(*np) == NT_ANCHOR)
5274 NANCHOR(*np)->target = target;
5275 else {
5276 NENCLOSE(*np)->target = target;
5277 if (NENCLOSE(*np)->type == ENCLOSE_MEMORY) {
5278 /* Don't move this to previous of parse_subexp() */
5279 r = scan_env_set_mem_node(env, NENCLOSE(*np)->regnum, *np);
5280 if (r != 0) return r;
5281 }
5282 else if (NENCLOSE(*np)->type == ENCLOSE_CONDITION) {
5283 if (NTYPE(target) != NT_ALT) {
5284 /* convert (?(cond)yes) to (?(cond)yes|empty) */
5285 work1 = node_new_empty();
5286 if (IS_NULL(work1)) goto err;
5287 work2 = onig_node_new_alt(work1, NULL_NODE);
5288 if (IS_NULL(work2)) goto err;
5289 work1 = onig_node_new_alt(target, work2);
5290 if (IS_NULL(work1)) goto err;
5291 NENCLOSE(*np)->target = work1;
5292 }
5293 }
5294 }
5295
5296 *src = p;
5297 return 0;
5298
5299 err:
5300 onig_node_free(work1);
5301 onig_node_free(work2);
5302 onig_node_free(*np);
5303 *np = NULL;
5304 return ONIGERR_MEMORY;
5305}
5306
/* Printable spellings of the "popular" quantifiers.  set_quantifier() indexes
   this table with the values returned by popular_quantifier_num(). */
static const char* const PopularQStr[] = {
  "?",
  "*",
  "+",
  "??",
  "*?",
  "+?"
};
5310
/* Replacement text used in the "nested repeat operator ... was replaced with"
   warning.  set_quantifier() indexes this table with a ReduceTypeTable entry. */
static const char* const ReduceQStr[] = {
  "",
  "",
  "*",
  "*?",
  "??",
  "+ and ??",
  "+? and ?"
};
5314
5315static int
5316set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env)
5317{
5318 QtfrNode* qn;
5319
5320 qn = NQTFR(qnode);
5321 if (qn->lower == 1 && qn->upper == 1) {
5322 return 1;
5323 }
5324
5325 switch (NTYPE(target)) {
5326 case NT_STR:
5327 if (! group) {
5328 StrNode* sn = NSTR(target);
5329 if (str_node_can_be_split(sn, env->enc)) {
5330 Node* n = str_node_split_last_char(sn, env->enc);
5331 if (IS_NOT_NULL(n)) {
5332 qn->target = n;
5333 return 2;
5334 }
5335 }
5336 }
5337 break;
5338
5339 case NT_QTFR:
5340 { /* check redundant double repeat. */
5341 /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */
5342 QtfrNode* qnt = NQTFR(target);
5343 int nestq_num = popular_quantifier_num(qn);
5344 int targetq_num = popular_quantifier_num(qnt);
5345
5346#ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
5347 if (!IS_QUANTIFIER_BY_NUMBER(qn) && !IS_QUANTIFIER_BY_NUMBER(qnt) &&
5348 IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) {
5349 UChar buf[WARN_BUFSIZE];
5350
5351 switch (ReduceTypeTable[targetq_num][nestq_num]) {
5352 case RQ_ASIS:
5353 break;
5354
5355 case RQ_DEL:
5356 if (onig_verb_warn != onig_null_warn) {
5357 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
5358 env->pattern, env->pattern_end,
5359 (UChar* )"redundant nested repeat operator");
5360 (*onig_verb_warn)((char* )buf);
5361 }
5362 goto warn_exit;
5363 break;
5364
5365 default:
5366 if (onig_verb_warn != onig_null_warn) {
5367 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
5368 env->pattern, env->pattern_end,
5369 (UChar* )"nested repeat operator %s and %s was replaced with '%s'",
5370 PopularQStr[targetq_num], PopularQStr[nestq_num],
5371 ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
5372 (*onig_verb_warn)((char* )buf);
5373 }
5374 goto warn_exit;
5375 break;
5376 }
5377 }
5378
5379 warn_exit:
5380#endif
5381 if (targetq_num >= 0) {
5382 if (nestq_num >= 0) {
5383 onig_reduce_nested_quantifier(qnode, target);
5384 goto q_exit;
5385 }
5386 else if (targetq_num == 1 || targetq_num == 2) { /* * or + */
5387 /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */
5388 if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) {
5389 qn->upper = (qn->lower == 0 ? 1 : qn->lower);
5390 }
5391 }
5392 }
5393 }
5394 break;
5395
5396 default:
5397 break;
5398 }
5399
5400 qn->target = target;
5401 q_exit:
5402 return 0;
5403}
5404
5405
5406#ifdef USE_SHARED_CCLASS_TABLE
5407
5408#define THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS 8
5409
5410/* for ctype node hash table */
5411
5412typedef struct {
5413 OnigEncoding enc;
5414 int not;
5415 int type;
5416} type_cclass_key;
5417
5418static int type_cclass_cmp(type_cclass_key* x, type_cclass_key* y)
5419{
5420 if (x->type != y->type) return 1;
5421 if (x->enc != y->enc) return 1;
5422 if (x->not != y->not) return 1;
5423 return 0;
5424}
5425
5426static int type_cclass_hash(type_cclass_key* key)
5427{
5428 int i, val;
5429 UChar *p;
5430
5431 val = 0;
5432
5433 p = (UChar* )&(key->enc);
5434 for (i = 0; i < (int )sizeof(key->enc); i++) {
5435 val = val * 997 + (int )*p++;
5436 }
5437
5438 p = (UChar* )(&key->type);
5439 for (i = 0; i < (int )sizeof(key->type); i++) {
5440 val = val * 997 + (int )*p++;
5441 }
5442
5443 val += key->not;
5444 return val + (val >> 5);
5445}
5446
5447static struct st_hash_type type_type_cclass_hash = {
5448 type_cclass_cmp,
5449 type_cclass_hash,
5450};
5451
5452static st_table* OnigTypeCClassTable;
5453
5454
5455static int
5456i_free_shared_class(type_cclass_key* key, Node* node, void* arg ARG_UNUSED)
5457{
5458 if (IS_NOT_NULL(node)) {
5459 CClassNode* cc = NCCLASS(node);
5460 if (IS_NOT_NULL(cc->mbuf)) xfree(cc->mbuf);
5461 xfree(node);
5462 }
5463
5464 if (IS_NOT_NULL(key)) xfree(key);
5465 return ST_DELETE;
5466}
5467
5468extern int
5469onig_free_shared_cclass_table(void)
5470{
5471 /* THREAD_ATOMIC_START; */
5472 if (IS_NOT_NULL(OnigTypeCClassTable)) {
5473 onig_st_foreach(OnigTypeCClassTable, i_free_shared_class, 0);
5474 onig_st_free_table(OnigTypeCClassTable);
5475 OnigTypeCClassTable = NULL;
5476 }
5477 /* THREAD_ATOMIC_END; */
5478
5479 return 0;
5480}
5481
5482#endif /* USE_SHARED_CCLASS_TABLE */
5483
5484
5485#ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
5486static int
5487clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc)
5488{
5489 BBuf *tbuf;
5490 int r;
5491
5492 if (IS_NCCLASS_NOT(cc)) {
5493 bitset_invert(cc->bs);
5494
5495 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
5496 r = not_code_range_buf(enc, cc->mbuf, &tbuf);
5497 if (r != 0) return r;
5498
5499 bbuf_free(cc->mbuf);
5500 cc->mbuf = tbuf;
5501 }
5502
5503 NCCLASS_CLEAR_NOT(cc);
5504 }
5505
5506 return 0;
5507}
5508#endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
5509
5510typedef struct {
5511 ScanEnv* env;
5512 CClassNode* cc;
5513 CClassNode* asc_cc;
5514 Node* alt_root;
5515 Node** ptail;
5516} IApplyCaseFoldArg;
5517
5518static int
5519i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[],
5520 int to_len, void* arg)
5521{
5522 IApplyCaseFoldArg* iarg;
5523 ScanEnv* env;
5524 CClassNode* cc;
5525 CClassNode* asc_cc;
5526 BitSetRef bs;
5527 int add_flag;
5528
5529 iarg = (IApplyCaseFoldArg* )arg;
5530 env = iarg->env;
5531 cc = iarg->cc;
5532 asc_cc = iarg->asc_cc;
5533 bs = cc->bs;
5534
5535 if (IS_NULL(asc_cc)) {
5536 add_flag = 0;
5537 }
5538 else if (ONIGENC_IS_ASCII_CODE(from) == ONIGENC_IS_ASCII_CODE(*to)) {
5539 add_flag = 1;
5540 }
5541 else {
5542 add_flag = onig_is_code_in_cc(env->enc, from, asc_cc);
5543 if (IS_NCCLASS_NOT(asc_cc))
5544 add_flag = !add_flag;
5545 }
5546
5547 if (to_len == 1) {
5548 int is_in = onig_is_code_in_cc(env->enc, from, cc);
5549#ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
5550 if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
5551 (is_in == 0 && IS_NCCLASS_NOT(cc))) {
5552 if (add_flag) {
5553 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
5554 add_code_range(&(cc->mbuf), env, *to, *to);
5555 }
5556 else {
5557 BITSET_SET_BIT(bs, *to);
5558 }
5559 }
5560 }
5561#else
5562 if (is_in != 0) {
5563 if (add_flag) {
5564 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
5565 if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
5566 add_code_range(&(cc->mbuf), env, *to, *to);
5567 }
5568 else {
5569 if (IS_NCCLASS_NOT(cc)) {
5570 BITSET_CLEAR_BIT(bs, *to);
5571 }
5572 else {
5573 BITSET_SET_BIT(bs, *to);
5574 }
5575 }
5576 }
5577 }
5578#endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
5579 }
5580 else {
5581 int r, i, len;
5582 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
5583 Node *snode = NULL_NODE;
5584
5585 if (onig_is_code_in_cc(env->enc, from, cc)
5586#ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
5587 && !IS_NCCLASS_NOT(cc)
5588#endif
5589 ) {
5590 for (i = 0; i < to_len; i++) {
5591 len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf);
5592 if (i == 0) {
5593 snode = onig_node_new_str(buf, buf + len);
5594 CHECK_NULL_RETURN_MEMERR(snode);
5595
5596 /* char-class expanded multi-char only
5597 compare with string folded at match time. */
5598 NSTRING_SET_AMBIG(snode);
5599 }
5600 else {
5601 r = onig_node_str_cat(snode, buf, buf + len);
5602 if (r < 0) {
5603 onig_node_free(snode);
5604 return r;
5605 }
5606 }
5607 }
5608
5609 *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE);
5610 CHECK_NULL_RETURN_MEMERR(*(iarg->ptail));
5611 iarg->ptail = &(NCDR((*(iarg->ptail))));
5612 }
5613 }
5614
5615 return 0;
5616}
5617
5618static int
5619cclass_case_fold(Node** np, CClassNode* cc, CClassNode* asc_cc, ScanEnv* env)
5620{
5621 int r;
5622 IApplyCaseFoldArg iarg;
5623
5624 iarg.env = env;
5625 iarg.cc = cc;
5626 iarg.asc_cc = asc_cc;
5627 iarg.alt_root = NULL_NODE;
5628 iarg.ptail = &(iarg.alt_root);
5629
5630 r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag,
5631 i_apply_case_fold, &iarg);
5632 if (r != 0) {
5633 onig_node_free(iarg.alt_root);
5634 return r;
5635 }
5636 if (IS_NOT_NULL(iarg.alt_root)) {
5637 Node* work = onig_node_new_alt(*np, iarg.alt_root);
5638 if (IS_NULL(work)) {
5639 onig_node_free(iarg.alt_root);
5640 return ONIGERR_MEMORY;
5641 }
5642 *np = work;
5643 }
5644 return r;
5645}
5646
5647static int
5648node_linebreak(Node** np, ScanEnv* env)
5649{
5650 /* same as (?>\x0D\x0A|[\x0A-\x0D\x{85}\x{2028}\x{2029}]) */
5651 Node* left = NULL;
5652 Node* right = NULL;
5653 Node* target1 = NULL;
5654 Node* target2 = NULL;
5655 CClassNode* cc;
5656 int num1, num2;
5657 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN * 2];
5658
5659 /* \x0D\x0A */
5660 num1 = ONIGENC_CODE_TO_MBC(env->enc, 0x0D, buf);
5661 if (num1 < 0) return num1;
5662 num2 = ONIGENC_CODE_TO_MBC(env->enc, 0x0A, buf + num1);
5663 if (num2 < 0) return num2;
5664 left = node_new_str_raw(buf, buf + num1 + num2);
5665 if (IS_NULL(left)) goto err;
5666
5667 /* [\x0A-\x0D] or [\x0A-\x0D\x{85}\x{2028}\x{2029}] */
5668 right = node_new_cclass();
5669 if (IS_NULL(right)) goto err;
5670 cc = NCCLASS(right);
5671 if (ONIGENC_MBC_MINLEN(env->enc) > 1) {
5672 add_code_range(&(cc->mbuf), env, 0x0A, 0x0D);
5673 }
5674 else {
5675 bitset_set_range(cc->bs, 0x0A, 0x0D);
5676 }
5677
5678 /* TODO: move this block to enc/unicode.c */
5679 if (ONIGENC_IS_UNICODE(env->enc)) {
5680 /* UTF-8, UTF-16BE/LE, UTF-32BE/LE */
5681 add_code_range(&(cc->mbuf), env, 0x85, 0x85);
5682 add_code_range(&(cc->mbuf), env, 0x2028, 0x2029);
5683 }
5684
5685 /* ...|... */
5686 target1 = onig_node_new_alt(right, NULL_NODE);
5687 if (IS_NULL(target1)) goto err;
5688 right = NULL;
5689 target2 = onig_node_new_alt(left, target1);
5690 if (IS_NULL(target2)) goto err;
5691 left = NULL;
5692 target1 = NULL;
5693
5694 /* (?>...) */
5695 *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
5696 if (IS_NULL(*np)) goto err;
5697 NENCLOSE(*np)->target = target2;
5698 return ONIG_NORMAL;
5699
5700 err:
5701 onig_node_free(left);
5702 onig_node_free(right);
5703 onig_node_free(target1);
5704 onig_node_free(target2);
5705 return ONIGERR_MEMORY;
5706}
5707
5708static int
5709node_extended_grapheme_cluster(Node** np, ScanEnv* env)
5710{
5711 /* same as (?>\P{M}\p{M}*) */
5712 Node* np1 = NULL;
5713 Node* np2 = NULL;
5714 Node* qn = NULL;
5715 Node* list1 = NULL;
5716 Node* list2 = NULL;
5717 int r = 0;
5718
5719#ifdef USE_UNICODE_PROPERTIES
5720 if (ONIGENC_IS_UNICODE(env->enc)) {
5721 /* UTF-8, UTF-16BE/LE, UTF-32BE/LE */
5722 CClassNode* cc1;
5723 CClassNode* cc2;
5724 UChar* propname = (UChar* )"M";
5725 int ctype = env->enc->property_name_to_ctype(ONIG_ENCODING_ASCII,
5726 propname, propname + 1);
5727 if (ctype >= 0) {
5728 /* \P{M} */
5729 np1 = node_new_cclass();
5730 if (IS_NULL(np1)) goto err;
5731 cc1 = NCCLASS(np1);
5732 r = add_ctype_to_cc(cc1, ctype, 0, 0, env);
5733 if (r != 0) goto err;
5734 NCCLASS_SET_NOT(cc1);
5735
5736 /* \p{M}* */
5737 np2 = node_new_cclass();
5738 if (IS_NULL(np2)) goto err;
5739 cc2 = NCCLASS(np2);
5740 r = add_ctype_to_cc(cc2, ctype, 0, 0, env);
5741 if (r != 0) goto err;
5742
5743 qn = node_new_quantifier(0, REPEAT_INFINITE, 0);
5744 if (IS_NULL(qn)) goto err;
5745 NQTFR(qn)->target = np2;
5746 np2 = NULL;
5747
5748 /* \P{M}\p{M}* */
5749 list2 = node_new_list(qn, NULL_NODE);
5750 if (IS_NULL(list2)) goto err;
5751 qn = NULL;
5752 list1 = node_new_list(np1, list2);
5753 if (IS_NULL(list1)) goto err;
5754 np1 = NULL;
5755 list2 = NULL;
5756
5757 /* (?>...) */
5758 *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
5759 if (IS_NULL(*np)) goto err;
5760 NENCLOSE(*np)->target = list1;
5761 return ONIG_NORMAL;
5762 }
5763 }
5764#endif /* USE_UNICODE_PROPERTIES */
5765 if (IS_NULL(*np)) {
5766 /* PerlSyntax: (?s:.), RubySyntax: (?m:.) */
5767 OnigOptionType option;
5768 np1 = node_new_anychar();
5769 if (IS_NULL(np1)) goto err;
5770
5771 option = env->option;
5772 ONOFF(option, ONIG_OPTION_MULTILINE, 0);
5773 *np = node_new_option(option);
5774 if (IS_NULL(*np)) goto err;
5775 NENCLOSE(*np)->target = np1;
5776 }
5777 return ONIG_NORMAL;
5778
5779 err:
5780 onig_node_free(np1);
5781 onig_node_free(np2);
5782 onig_node_free(qn);
5783 onig_node_free(list1);
5784 onig_node_free(list2);
5785 return (r == 0) ? ONIGERR_MEMORY : r;
5786}
5787
/* Count the set bits in the low 32 bits of `bits` by summing neighbouring
   fields whose width doubles on every step (1, 2, 4, 8, 16 bits). */
static int
countbits(unsigned int bits)
{
  static const unsigned int field_mask[5] = {
    0x55555555, 0x33333333, 0x0f0f0f0f, 0x00ff00ff, 0x0000ffff
  };
  int step;

  for (step = 0; step < 5; step++) {
    unsigned int m = field_mask[step];
    int width = 1 << step;

    bits = (bits & m) + ((bits >> width) & m);
  }
  return bits;
}
5797
5798static int
5799is_onechar_cclass(CClassNode* cc, OnigCodePoint* code)
5800{
5801 const OnigCodePoint not_found = ONIG_LAST_CODE_POINT;
5802 OnigCodePoint c = not_found;
5803 int i;
5804 BBuf *bbuf = cc->mbuf;
5805
5806 if (IS_NCCLASS_NOT(cc)) return 0;
5807
5808 /* check bbuf */
5809 if (IS_NOT_NULL(bbuf)) {
5810 OnigCodePoint n, *data;
5811 GET_CODE_POINT(n, bbuf->p);
5812 data = (OnigCodePoint* )(bbuf->p) + 1;
5813 if ((n == 1) && (data[0] == data[1])) {
5814 /* only one char found in the bbuf, save the code point. */
5815 c = data[0];
5816 if (((c < SINGLE_BYTE_SIZE) && BITSET_AT(cc->bs, c))) {
5817 /* skip if c is included in the bitset */
5818 c = not_found;
5819 }
5820 }
5821 else {
5822 return 0; /* the bbuf contains multiple chars */
5823 }
5824 }
5825
5826 /* check bitset */
5827 for (i = 0; i < BITSET_SIZE; i++) {
5828 Bits b1 = cc->bs[i];
5829 if (b1 != 0) {
5830 if (((b1 & (b1 - 1)) == 0) && (c == not_found)) {
5831 c = BITS_IN_ROOM * i + countbits(b1 - 1);
5832 } else {
5833 return 0; /* the character class contains multiple chars */
5834 }
5835 }
5836 }
5837
5838 if (c != not_found) {
5839 *code = c;
5840 return 1;
5841 }
5842
5843 /* the character class contains no char. */
5844 return 0;
5845}
5846
5847
5848static int
5849parse_exp(Node** np, OnigToken* tok, int term,
5850 UChar** src, UChar* end, ScanEnv* env)
5851{
5852 int r, len, group = 0;
5853 Node* qn;
5854 Node** targetp;
5855
5856 *np = NULL;
5857 if (tok->type == (enum TokenSyms )term)
5858 goto end_of_token;
5859
5860 switch (tok->type) {
5861 case TK_ALT:
5862 case TK_EOT:
5863 end_of_token:
5864 *np = node_new_empty();
5865 return tok->type;
5866 break;
5867
5868 case TK_SUBEXP_OPEN:
5869 r = parse_enclose(np, tok, TK_SUBEXP_CLOSE, src, end, env);
5870 if (r < 0) return r;
5871 if (r == 1) group = 1;
5872 else if (r == 2) { /* option only */
5873 Node* target;
5874 OnigOptionType prev = env->option;
5875
5876 env->option = NENCLOSE(*np)->option;
5877 r = fetch_token(tok, src, end, env);
5878 if (r < 0) return r;
5879 r = parse_subexp(&target, tok, term, src, end, env);
5880 env->option = prev;
5881 if (r < 0) {
5882 onig_node_free(target);
5883 return r;
5884 }
5885 NENCLOSE(*np)->target = target;
5886 return tok->type;
5887 }
5888 break;
5889
5890 case TK_SUBEXP_CLOSE:
5891 if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP))
5892 return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS;
5893
5894 if (tok->escaped) goto tk_raw_byte;
5895 else goto tk_byte;
5896 break;
5897
5898 case TK_LINEBREAK:
5899 r = node_linebreak(np, env);
5900 if (r < 0) return r;
5901 break;
5902
5903 case TK_EXTENDED_GRAPHEME_CLUSTER:
5904 r = node_extended_grapheme_cluster(np, env);
5905 if (r < 0) return r;
5906 break;
5907
5908 case TK_KEEP:
5909 *np = onig_node_new_anchor(ANCHOR_KEEP);
5910 CHECK_NULL_RETURN_MEMERR(*np);
5911 break;
5912
5913 case TK_STRING:
5914 tk_byte:
5915 {
5916 *np = node_new_str(tok->backp, *src);
5917 CHECK_NULL_RETURN_MEMERR(*np);
5918
5919 string_loop:
5920 while (1) {
5921 r = fetch_token(tok, src, end, env);
5922 if (r < 0) return r;
5923 if (r == TK_STRING) {
5924 r = onig_node_str_cat(*np, tok->backp, *src);
5925 }
5926#ifndef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
5927 else if (r == TK_CODE_POINT) {
5928 r = node_str_cat_codepoint(*np, env->enc, tok->u.code);
5929 }
5930#endif
5931 else {
5932 break;
5933 }
5934 if (r < 0) return r;
5935 }
5936
5937 string_end:
5938 targetp = np;
5939 goto repeat;
5940 }
5941 break;
5942
5943 case TK_RAW_BYTE:
5944 tk_raw_byte:
5945 {
5946 *np = node_new_str_raw_char((UChar )tok->u.c);
5947 CHECK_NULL_RETURN_MEMERR(*np);
5948 len = 1;
5949 while (1) {
5950 if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
5951 if (len == enclen(env->enc, NSTR(*np)->s)) {
5952 r = fetch_token(tok, src, end, env);
5953 NSTRING_CLEAR_RAW(*np);
5954 goto string_end;
5955 }
5956 }
5957
5958 r = fetch_token(tok, src, end, env);
5959 if (r < 0) return r;
5960 if (r != TK_RAW_BYTE) {
5961 /* Don't use this, it is wrong for little endian encodings. */
5962#ifdef USE_PAD_TO_SHORT_BYTE_CHAR
5963 int rem;
5964 if (len < ONIGENC_MBC_MINLEN(env->enc)) {
5965 rem = ONIGENC_MBC_MINLEN(env->enc) - len;
5966 (void )node_str_head_pad(NSTR(*np), rem, (UChar )0);
5967 if (len + rem == enclen(env->enc, NSTR(*np)->s)) {
5968 NSTRING_CLEAR_RAW(*np);
5969 goto string_end;
5970 }
5971 }
5972#endif
5973 return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
5974 }
5975
5976 r = node_str_cat_char(*np, (UChar )tok->u.c);
5977 if (r < 0) return r;
5978
5979 len++;
5980 }
5981 }
5982 break;
5983
5984 case TK_CODE_POINT:
5985 {
5986 *np = node_new_empty();
5987 CHECK_NULL_RETURN_MEMERR(*np);
5988 r = node_str_cat_codepoint(*np, env->enc, tok->u.code);
5989 if (r != 0) return r;
5990#ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
5991 NSTRING_SET_RAW(*np);
5992#else
5993 goto string_loop;
5994#endif
5995 }
5996 break;
5997
5998 case TK_QUOTE_OPEN:
5999 {
6000 OnigCodePoint end_op[2];
6001 UChar *qstart, *qend, *nextp;
6002
6003 end_op[0] = (OnigCodePoint )MC_ESC(env->syntax);
6004 end_op[1] = (OnigCodePoint )'E';
6005 qstart = *src;
6006 qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
6007 if (IS_NULL(qend)) {
6008 nextp = qend = end;
6009 }
6010 *np = node_new_str(qstart, qend);
6011 CHECK_NULL_RETURN_MEMERR(*np);
6012 *src = nextp;
6013 }
6014 break;
6015
6016 case TK_CHAR_TYPE:
6017 {
6018 switch (tok->u.prop.ctype) {
6019 case ONIGENC_CTYPE_WORD:
6020 *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not,
6021 IS_ASCII_RANGE(env->option));
6022 CHECK_NULL_RETURN_MEMERR(*np);
6023 break;
6024
6025 case ONIGENC_CTYPE_SPACE:
6026 case ONIGENC_CTYPE_DIGIT:
6027 case ONIGENC_CTYPE_XDIGIT:
6028 {
6029 CClassNode* cc;
6030
6031#ifdef USE_SHARED_CCLASS_TABLE
6032 const OnigCodePoint *mbr;
6033 OnigCodePoint sb_out;
6034
6035 r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, tok->u.prop.ctype,
6036 &sb_out, &mbr);
6037 if (r == 0 &&
6038 ! IS_ASCII_RANGE(env->option) &&
6039 ONIGENC_CODE_RANGE_NUM(mbr)
6040 >= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) {
6041 type_cclass_key key;
6042 type_cclass_key* new_key;
6043
6044 key.enc = env->enc;
6045 key.not = tok->u.prop.not;
6046 key.type = tok->u.prop.ctype;
6047
6048 THREAD_ATOMIC_START;
6049
6050 if (IS_NULL(OnigTypeCClassTable)) {
6051 OnigTypeCClassTable
6052 = onig_st_init_table_with_size(&type_type_cclass_hash, 10);
6053 if (IS_NULL(OnigTypeCClassTable)) {
6054 THREAD_ATOMIC_END;
6055 return ONIGERR_MEMORY;
6056 }
6057 }
6058 else {
6059 if (onig_st_lookup(OnigTypeCClassTable, (st_data_t )&key,
6060 (st_data_t* )np)) {
6061 THREAD_ATOMIC_END;
6062 break;
6063 }
6064 }
6065
6066 *np = node_new_cclass_by_codepoint_range(tok->u.prop.not,
6067 sb_out, mbr);
6068 if (IS_NULL(*np)) {
6069 THREAD_ATOMIC_END;
6070 return ONIGERR_MEMORY;
6071 }
6072
6073 cc = NCCLASS(*np);
6074 NCCLASS_SET_SHARE(cc);
6075 new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key));
6076 xmemcpy(new_key, &key, sizeof(type_cclass_key));
6077 onig_st_add_direct(OnigTypeCClassTable, (st_data_t )new_key,
6078 (st_data_t )*np);
6079
6080 THREAD_ATOMIC_END;
6081 }
6082 else {
6083#endif
6084 *np = node_new_cclass();
6085 CHECK_NULL_RETURN_MEMERR(*np);
6086 cc = NCCLASS(*np);
6087 r = add_ctype_to_cc(cc, tok->u.prop.ctype, 0,
6088 IS_ASCII_RANGE(env->option), env);
6089 if (r != 0) return r;
6090 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
6091#ifdef USE_SHARED_CCLASS_TABLE
6092 }
6093#endif
6094 }
6095 break;
6096
6097 default:
6098 return ONIGERR_PARSER_BUG;
6099 break;
6100 }
6101 }
6102 break;
6103
6104 case TK_CHAR_PROPERTY:
6105 r = parse_char_property(np, tok, src, end, env);
6106 if (r != 0) return r;
6107 break;
6108
6109 case TK_CC_OPEN:
6110 {
6111 Node *asc_node;
6112 CClassNode* cc;
6113 OnigCodePoint code;
6114
6115 r = parse_char_class(np, &asc_node, tok, src, end, env);
6116 if (r != 0) {
6117 onig_node_free(asc_node);
6118 return r;
6119 }
6120
6121 cc = NCCLASS(*np);
6122 if (is_onechar_cclass(cc, &code)) {
6123 onig_node_free(*np);
6124 onig_node_free(asc_node);
6125 *np = node_new_empty();
6126 CHECK_NULL_RETURN_MEMERR(*np);
6127 r = node_str_cat_codepoint(*np, env->enc, code);
6128 if (r != 0) return r;
6129 goto string_loop;
6130 }
6131 if (IS_IGNORECASE(env->option)) {
6132 r = cclass_case_fold(np, cc, NCCLASS(asc_node), env);
6133 if (r != 0) {
6134 onig_node_free(asc_node);
6135 return r;
6136 }
6137 }
6138 onig_node_free(asc_node);
6139 }
6140 break;
6141
6142 case TK_ANYCHAR:
6143 *np = node_new_anychar();
6144 CHECK_NULL_RETURN_MEMERR(*np);
6145 break;
6146
6147 case TK_ANYCHAR_ANYTIME:
6148 *np = node_new_anychar();
6149 CHECK_NULL_RETURN_MEMERR(*np);
6150 qn = node_new_quantifier(0, REPEAT_INFINITE, 0);
6151 CHECK_NULL_RETURN_MEMERR(qn);
6152 NQTFR(qn)->target = *np;
6153 *np = qn;
6154 break;
6155
6156 case TK_BACKREF:
6157 len = tok->u.backref.num;
6158 *np = node_new_backref(len,
6159 (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)),
6160 tok->u.backref.by_name,
6161#ifdef USE_BACKREF_WITH_LEVEL
6162 tok->u.backref.exist_level,
6163 tok->u.backref.level,
6164#endif
6165 env);
6166 CHECK_NULL_RETURN_MEMERR(*np);
6167 break;
6168
6169#ifdef USE_SUBEXP_CALL
6170 case TK_CALL:
6171 {
6172 int gnum = tok->u.call.gnum;
6173
6174 if (gnum < 0 || tok->u.call.rel != 0) {
6175 if (gnum > 0) gnum--;
6176 gnum = BACKREF_REL_TO_ABS(gnum, env);
6177 if (gnum <= 0)
6178 return ONIGERR_INVALID_BACKREF;
6179 }
6180 *np = node_new_call(tok->u.call.name, tok->u.call.name_end, gnum);
6181 CHECK_NULL_RETURN_MEMERR(*np);
6182 env->num_call++;
6183 }
6184 break;
6185#endif
6186
6187 case TK_ANCHOR:
6188 *np = onig_node_new_anchor(tok->u.anchor.subtype);
6189 CHECK_NULL_RETURN_MEMERR(*np);
6190 NANCHOR(*np)->ascii_range = tok->u.anchor.ascii_range;
6191 break;
6192
6193 case TK_OP_REPEAT:
6194 case TK_INTERVAL:
6195 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) {
6196 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS))
6197 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED;
6198 else
6199 *np = node_new_empty();
6200 }
6201 else {
6202 goto tk_byte;
6203 }
6204 break;
6205
6206 default:
6207 return ONIGERR_PARSER_BUG;
6208 break;
6209 }
6210
6211 {
6212 targetp = np;
6213
6214 re_entry:
6215 r = fetch_token(tok, src, end, env);
6216 if (r < 0) return r;
6217
6218 repeat:
6219 if (r == TK_OP_REPEAT || r == TK_INTERVAL) {
6220 if (is_invalid_quantifier_target(*targetp))
6221 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID;
6222
6223 qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,
6224 (r == TK_INTERVAL ? 1 : 0));
6225 CHECK_NULL_RETURN_MEMERR(qn);
6226 NQTFR(qn)->greedy = tok->u.repeat.greedy;
6227 r = set_quantifier(qn, *targetp, group, env);
6228 if (r < 0) {
6229 onig_node_free(qn);
6230 return r;
6231 }
6232
6233 if (tok->u.repeat.possessive != 0) {
6234 Node* en;
6235 en = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
6236 if (IS_NULL(en)) {
6237 onig_node_free(qn);
6238 return ONIGERR_MEMORY;
6239 }
6240 NENCLOSE(en)->target = qn;
6241 qn = en;
6242 }
6243
6244 if (r == 0) {
6245 *targetp = qn;
6246 }
6247 else if (r == 1) {
6248 onig_node_free(qn);
6249 }
6250 else if (r == 2) { /* split case: /abc+/ */
6251 Node *tmp;
6252
6253 *targetp = node_new_list(*targetp, NULL);
6254 if (IS_NULL(*targetp)) {
6255 onig_node_free(qn);
6256 return ONIGERR_MEMORY;
6257 }
6258 tmp = NCDR(*targetp) = node_new_list(qn, NULL);
6259 if (IS_NULL(tmp)) {
6260 onig_node_free(qn);
6261 return ONIGERR_MEMORY;
6262 }
6263 targetp = &(NCAR(tmp));
6264 }
6265 goto re_entry;
6266 }
6267 }
6268
6269 return r;
6270}
6271
6272static int
6273parse_branch(Node** top, OnigToken* tok, int term,
6274 UChar** src, UChar* end, ScanEnv* env)
6275{
6276 int r;
6277 Node *node, **headp;
6278
6279 *top = NULL;
6280 r = parse_exp(&node, tok, term, src, end, env);
6281 if (r < 0) {
6282 onig_node_free(node);
6283 return r;
6284 }
6285
6286 if (r == TK_EOT || r == term || r == TK_ALT) {
6287 *top = node;
6288 }
6289 else {
6290 *top = node_new_list(node, NULL);
6291 headp = &(NCDR(*top));
6292 while (r != TK_EOT && r != term && r != TK_ALT) {
6293 r = parse_exp(&node, tok, term, src, end, env);
6294 if (r < 0) {
6295 onig_node_free(node);
6296 return r;
6297 }
6298
6299 if (NTYPE(node) == NT_LIST) {
6300 *headp = node;
6301 while (IS_NOT_NULL(NCDR(node))) node = NCDR(node);
6302 headp = &(NCDR(node));
6303 }
6304 else {
6305 *headp = node_new_list(node, NULL);
6306 headp = &(NCDR(*headp));
6307 }
6308 }
6309 }
6310
6311 return r;
6312}
6313
6314/* term_tok: TK_EOT or TK_SUBEXP_CLOSE */
6315static int
6316parse_subexp(Node** top, OnigToken* tok, int term,
6317 UChar** src, UChar* end, ScanEnv* env)
6318{
6319 int r;
6320 Node *node, **headp;
6321
6322 *top = NULL;
6323 r = parse_branch(&node, tok, term, src, end, env);
6324 if (r < 0) {
6325 onig_node_free(node);
6326 return r;
6327 }
6328
6329 if (r == term) {
6330 *top = node;
6331 }
6332 else if (r == TK_ALT) {
6333 *top = onig_node_new_alt(node, NULL);
6334 headp = &(NCDR(*top));
6335 while (r == TK_ALT) {
6336 r = fetch_token(tok, src, end, env);
6337 if (r < 0) return r;
6338 r = parse_branch(&node, tok, term, src, end, env);
6339 if (r < 0) {
6340 onig_node_free(node);
6341 return r;
6342 }
6343
6344 *headp = onig_node_new_alt(node, NULL);
6345 headp = &(NCDR(*headp));
6346 }
6347
6348 if (tok->type != (enum TokenSyms )term)
6349 goto err;
6350 }
6351 else {
6352 onig_node_free(node);
6353 err:
6354 if (term == TK_SUBEXP_CLOSE)
6355 return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
6356 else
6357 return ONIGERR_PARSER_BUG;
6358 }
6359
6360 return r;
6361}
6362
6363static int
6364parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
6365{
6366 int r;
6367 OnigToken tok;
6368
6369 r = fetch_token(&tok, src, end, env);
6370 if (r < 0) return r;
6371 r = parse_subexp(top, &tok, TK_EOT, src, end, env);
6372 if (r < 0) return r;
6373
6374#ifdef USE_SUBEXP_CALL
6375 if (env->num_call > 0) {
6376 /* Capture the pattern itself. It is used for (?R), (?0) and \g<0>. */
6377 const int num = 0;
6378 Node* np;
6379 np = node_new_enclose_memory(env->option, 0);
6380 CHECK_NULL_RETURN_MEMERR(np);
6381 NENCLOSE(np)->regnum = num;
6382 NENCLOSE(np)->target = *top;
6383 r = scan_env_set_mem_node(env, num, np);
6384 if (r != 0) {
6385 onig_node_free(np);
6386 return r;
6387 }
6388 *top = np;
6389 }
6390#endif
6391 return 0;
6392}
6393
6394extern int
6395onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end,
6396 regex_t* reg, ScanEnv* env)
6397{
6398 int r;
6399 UChar* p;
6400
6401#ifdef USE_NAMED_GROUP
6402 names_clear(reg);
6403#endif
6404
6405 scan_env_clear(env);
6406 env->option = reg->options;
6407 env->case_fold_flag = reg->case_fold_flag;
6408 env->enc = reg->enc;
6409 env->syntax = reg->syntax;
6410 env->pattern = (UChar* )pattern;
6411 env->pattern_end = (UChar* )end;
6412 env->reg = reg;
6413
6414 *root = NULL;
6415 p = (UChar* )pattern;
6416 r = parse_regexp(root, &p, (UChar* )end, env);
6417 reg->num_mem = env->num_mem;
6418 return r;
6419}
6420
6421extern void
6422onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED,
6423 UChar* arg, UChar* arg_end)
6424{
6425 env->error = arg;
6426 env->error_end = arg_end;
6427}
Note: See TracBrowser for help on using the repository browser.