Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

regcomp.c@ 403

Last change on this file since 403 was 337, checked in by coas-nagasima, 6 years ago
ASP3版ECNLを追加
Property svn:eol-style set to `native` Property svn:mime-type set to `text/x-csrc;charset=UTF-8`
File size: 74.4 KB

Line
1	/*
2	regcomp.c - TRE POSIX compatible regex compilation functions.
3
4	Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
5	All rights reserved.
6
7	Redistribution and use in source and binary forms, with or without
8	modification, are permitted provided that the following conditions
9	are met:
10
11	1. Redistributions of source code must retain the above copyright
12	notice, this list of conditions and the following disclaimer.
13
14	2. Redistributions in binary form must reproduce the above copyright
15	notice, this list of conditions and the following disclaimer in the
16	documentation and/or other materials provided with the distribution.
17
18	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
19	``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20	LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21	A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22	HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23	SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24	LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25	DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26	THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27	(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28	OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30	*/
31
32	#include <string.h>
33	#include <stdlib.h>
34	#include <regex.h>
35	#include <limits.h>
36	#include <stdint.h>
37	#include <ctype.h>
38
39	#include "tre.h"
40
41	#include <assert.h>
42
43	/***********************************************************************
44	from tre-compile.h
45	***********************************************************************/
46
47	typedef struct {
48	int position;
49	int code_min;
50	int code_max;
51	int *tags;
52	int assertions;
53	tre_ctype_t class;
54	tre_ctype_t *neg_classes;
55	int backref;
56	} tre_pos_and_tags_t;
57
58
59	/***********************************************************************
60	from tre-ast.c and tre-ast.h
61	***********************************************************************/
62
63	/* The different AST node types. */
/* Kind of an AST node; stored in tre_ast_node_t.type and passed to
   tre_ast_new_node() together with the matching payload object. */
typedef enum {
    LITERAL    = 0,  /* payload is a tre_literal_t */
    CATENATION = 1,  /* payload is a tre_catenation_t */
    ITERATION  = 2,  /* payload is a tre_iteration_t */
    UNION      = 3   /* payload is a tre_union_t */
} tre_ast_type_t;
70
/* Special subtypes of a LITERAL node.  They are stored in the `code_min'
   member of the literal; every special value is negative, which is what
   IS_SPECIAL() tests.  The values are parenthesized so the macros expand
   safely inside larger expressions. */
#define EMPTY     (-1)  /* Empty leaf (denotes empty string). */
#define ASSERTION (-2)  /* Assertion leaf. */
#define TAG       (-3)  /* Tag leaf. */
#define BACKREF   (-4)  /* Back reference leaf. */

/* Predicates on a pointer `x' to an object that has a `code_min' member. */
#define IS_SPECIAL(x)   ((x)->code_min < 0)
#define IS_EMPTY(x)     ((x)->code_min == EMPTY)
#define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
#define IS_TAG(x)       ((x)->code_min == TAG)
#define IS_BACKREF(x)   ((x)->code_min == BACKREF)
82
83
84	/* A generic AST node. All AST nodes consist of this node on the top
85	level with `obj' pointing to the actual content. */
86	typedef struct {
87	tre_ast_type_t type; /* Type of the node. */
88	void obj; / Pointer to actual node. */
89	int nullable;
90	int submatch_id;
91	int num_submatches;
92	int num_tags;
93	tre_pos_and_tags_t *firstpos;
94	tre_pos_and_tags_t *lastpos;
95	} tre_ast_node_t;
96
97
98	/* A "literal" node. These are created for assertions, back references,
99	tags, matching parameter settings, and all expressions that match one
100	character. */
101	typedef struct {
102	long code_min;
103	long code_max;
104	int position;
105	tre_ctype_t class;
106	tre_ctype_t *neg_classes;
107	} tre_literal_t;
108
109	/* A "catenation" node. These are created when two regexps are concatenated.
110	If there are more than one subexpressions in sequence, the `left' part
111	holds all but the last, and `right' part holds the last subexpression
112	(catenation is left associative). */
113	typedef struct {
114	tre_ast_node_t *left;
115	tre_ast_node_t *right;
116	} tre_catenation_t;
117
118	/* An "iteration" node. These are created for the "*", "+", "?", and "{m,n}"
119	operators. */
120	typedef struct {
121	/* Subexpression to match. */
122	tre_ast_node_t *arg;
123	/* Minimum number of consecutive matches. */
124	int min;
125	/* Maximum number of consecutive matches. */
126	int max;
127	/* If 0, match as many characters as possible, if 1 match as few as
128	possible. Note that this does not always mean the same thing as
129	matching as many/few repetitions as possible. */
130	unsigned int minimal:1;
131	} tre_iteration_t;
132
133	/* An "union" node. These are created for the "\|" operator. */
134	typedef struct {
135	tre_ast_node_t *left;
136	tre_ast_node_t *right;
137	} tre_union_t;
138
139
140	static tre_ast_node_t *
141	tre_ast_new_node(tre_mem_t mem, int type, void *obj)
142	{
143	tre_ast_node_t node = tre_mem_calloc(mem, sizeof node);
144	if (!node \|\| !obj)
145	return 0;
146	node->obj = obj;
147	node->type = type;
148	node->nullable = -1;
149	node->submatch_id = -1;
150	return node;
151	}
152
153	static tre_ast_node_t *
154	tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
155	{
156	tre_ast_node_t *node;
157	tre_literal_t *lit;
158
159	lit = tre_mem_calloc(mem, sizeof *lit);
160	node = tre_ast_new_node(mem, LITERAL, lit);
161	if (!node)
162	return 0;
163	lit->code_min = code_min;
164	lit->code_max = code_max;
165	lit->position = position;
166	return node;
167	}
168
169	static tre_ast_node_t *
170	tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, int minimal)
171	{
172	tre_ast_node_t *node;
173	tre_iteration_t *iter;
174
175	iter = tre_mem_calloc(mem, sizeof *iter);
176	node = tre_ast_new_node(mem, ITERATION, iter);
177	if (!node)
178	return 0;
179	iter->arg = arg;
180	iter->min = min;
181	iter->max = max;
182	iter->minimal = minimal;
183	node->num_submatches = arg->num_submatches;
184	return node;
185	}
186
187	static tre_ast_node_t *
188	tre_ast_new_union(tre_mem_t mem, tre_ast_node_t left, tre_ast_node_t right)
189	{
190	tre_ast_node_t *node;
191	tre_union_t *un;
192
193	if (!left)
194	return right;
195	un = tre_mem_calloc(mem, sizeof *un);
196	node = tre_ast_new_node(mem, UNION, un);
197	if (!node \|\| !right)
198	return 0;
199	un->left = left;
200	un->right = right;
201	node->num_submatches = left->num_submatches + right->num_submatches;
202	return node;
203	}
204
205	static tre_ast_node_t *
206	tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t left, tre_ast_node_t right)
207	{
208	tre_ast_node_t *node;
209	tre_catenation_t *cat;
210
211	if (!left)
212	return right;
213	cat = tre_mem_calloc(mem, sizeof *cat);
214	node = tre_ast_new_node(mem, CATENATION, cat);
215	if (!node)
216	return 0;
217	cat->left = left;
218	cat->right = right;
219	node->num_submatches = left->num_submatches + right->num_submatches;
220	return node;
221	}
222
223
224	/***********************************************************************
225	from tre-stack.c and tre-stack.h
226	***********************************************************************/
227
228	typedef struct tre_stack_rec tre_stack_t;
229
230	/* Creates a new stack object. `size' is initial size in bytes, `max_size'
231	is maximum size, and `increment' specifies how much more space will be
232	allocated with realloc() if all space gets used up. Returns the stack
233	object or NULL if out of memory. */
234	static tre_stack_t *
235	tre_stack_new(int size, int max_size, int increment);
236
237	/* Frees the stack object. */
238	static void
239	tre_stack_destroy(tre_stack_t *s);
240
241	/* Returns the current number of objects in the stack. */
242	static int
243	tre_stack_num_objects(tre_stack_t *s);
244
245	/* Each tre_stack_push_(tre_stack_t s, <type> value) function pushes
246	`value' on top of stack `s'. Returns REG_ESPACE if out of memory.
247	This tries to realloc() more space before failing if maximum size
248	has not yet been reached. Returns REG_OK if successful. */
249	#define declare_pushf(typetag, type) \
250	static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)
251
252	declare_pushf(voidptr, void *);
253	declare_pushf(int, int);
254
255	/* Each tre_stack_pop_(tre_stack_t s) function pops the topmost
256	element off of stack `s' and returns it. The stack must not be
257	empty. */
258	#define declare_popf(typetag, type) \
259	static type tre_stack_pop_ ## typetag(tre_stack_t *s)
260
261	declare_popf(voidptr, void *);
262	declare_popf(int, int);
263
/* Just to save some typing.  All three macros push `value' with
   tre_stack_push_<typetag>(); they differ in how a failure is reported. */

/* Stores the result in a variable named `status' that must be in scope
   at the point of use. */
#define STACK_PUSH(s, typetag, value) \
    do \
    { \
        status = tre_stack_push_ ## typetag(s, value); \
    } \
    while (/*CONSTCOND*/0)

/* Like STACK_PUSH, but executes `break' on failure.  This is a plain
   brace block on purpose: wrapping it in do/while would make the `break'
   leave the wrapper instead of the caller's enclosing loop or switch. */
#define STACK_PUSHX(s, typetag, value) \
    { \
        status = tre_stack_push_ ## typetag(s, value); \
        if (status != REG_OK) \
            break; \
    }

/* Returns the error code from the calling function on failure; needs no
   `status' variable in the caller. */
#define STACK_PUSHR(s, typetag, value) \
    { \
        reg_errcode_t _status; \
        _status = tre_stack_push_ ## typetag(s, value); \
        if (_status != REG_OK) \
            return _status; \
    }
286
/* One stack slot: holds either a pointer or an int, depending on which
   typed push function stored it. */
union tre_stack_item {
    void *voidptr_value;
    int int_value;
};

/* The stack object behind tre_stack_t. */
struct tre_stack_rec {
    int size;        /* current capacity of `stack', in items */
    int max_size;    /* capacity limit, in items */
    int increment;   /* items added each time the stack grows */
    int ptr;         /* number of items currently stored */
    union tre_stack_item *stack;
};
299
300
301	static tre_stack_t *
302	tre_stack_new(int size, int max_size, int increment)
303	{
304	tre_stack_t *s;
305
306	s = xmalloc(sizeof(*s));
307	if (s != NULL)
308	{
309	s->stack = xmalloc(sizeof(s->stack) size);
310	if (s->stack == NULL)
311	{
312	xfree(s);
313	return NULL;
314	}
315	s->size = size;
316	s->max_size = max_size;
317	s->increment = increment;
318	s->ptr = 0;
319	}
320	return s;
321	}
322
323	static void
324	tre_stack_destroy(tre_stack_t *s)
325	{
326	xfree(s->stack);
327	xfree(s);
328	}
329
330	static int
331	tre_stack_num_objects(tre_stack_t *s)
332	{
333	return s->ptr;
334	}
335
336	static reg_errcode_t
337	tre_stack_push(tre_stack_t *s, union tre_stack_item value)
338	{
339	if (s->ptr < s->size)
340	{
341	s->stack[s->ptr] = value;
342	s->ptr++;
343	}
344	else
345	{
346	if (s->size >= s->max_size)
347	{
348	return REG_ESPACE;
349	}
350	else
351	{
352	union tre_stack_item *new_buffer;
353	int new_size;
354	new_size = s->size + s->increment;
355	if (new_size > s->max_size)
356	new_size = s->max_size;
357	new_buffer = xrealloc(s->stack, sizeof(new_buffer) new_size);
358	if (new_buffer == NULL)
359	{
360	return REG_ESPACE;
361	}
362	assert(new_size > s->size);
363	s->size = new_size;
364	s->stack = new_buffer;
365	tre_stack_push(s, value);
366	}
367	}
368	return REG_OK;
369	}
370
371	#define define_pushf(typetag, type) \
372	declare_pushf(typetag, type) { \
373	union tre_stack_item item; \
374	item.typetag ## _value = value; \
375	return tre_stack_push(s, item); \
376	}
377
378	define_pushf(int, int)
379	define_pushf(voidptr, void *)
380
381	#define define_popf(typetag, type) \
382	declare_popf(typetag, type) { \
383	return s->stack[--s->ptr].typetag ## _value; \
384	}
385
386	define_popf(int, int)
387	define_popf(voidptr, void *)
388
389
390	/***********************************************************************
391	from tre-parse.c and tre-parse.h
392	***********************************************************************/
393
394	/* Parse context. */
395	typedef struct {
396	/* Memory allocator. The AST is allocated using this. */
397	tre_mem_t mem;
398	/* Stack used for keeping track of regexp syntax. */
399	tre_stack_t *stack;
400	/* The parsed node after a parse function returns. */
401	tre_ast_node_t *n;
402	/* Position in the regexp pattern after a parse function returns. */
403	const char *s;
404	/* The first character of the last subexpression parsed. */
405	const char *start;
406	/* Current submatch ID. */
407	int submatch_id;
408	/* Current position (number of literal). */
409	int position;
410	/* The highest back reference or -1 if none seen so far. */
411	int max_backref;
412	/* Compilation flags. */
413	int cflags;
414	} tre_parse_ctx_t;
415
/* Some macros for expanding \w, \s, etc.  The table is terminated by an
   entry whose `c' is 0 and whose `expansion' is a null pointer. */
static const struct {
    char c;
    const char *expansion;
} tre_macros[] = {
    {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
    {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
    {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
    {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
    { 0, 0 }
};

/* Looks up the macro named by the single character `*s' (the character
   following the backslash in the pattern).  Returns the replacement
   pattern text from `tre_macros', or a null pointer if `*s' is not a
   macro character; only the first character of `s' is examined. */
static const char *tre_expand_macro(const char *s)
{
    int i;

    for (i = 0; tre_macros[i].c; i++) {
        if (tre_macros[i].c == *s)
            break;
    }
    /* On a miss `i' indexes the terminator, whose expansion is null. */
    return tre_macros[i].expansion;
}
437
438	static int
439	tre_compare_lit(const void a, const void b)
440	{
441	const tre_literal_t const la = a;
442	const tre_literal_t const lb = b;
443	/* assumes the range of valid code_min is < INT_MAX */
444	return la[0]->code_min - lb[0]->code_min;
445	}
446
447	struct literals {
448	tre_mem_t mem;
449	tre_literal_t **a;
450	int len;
451	int cap;
452	};
453
454	static tre_literal_t tre_new_lit(struct literals p)
455	{
456	tre_literal_t **a;
457	if (p->len >= p->cap) {
458	if (p->cap >= 1<<15)
459	return 0;
460	p->cap *= 2;
461	a = xrealloc(p->a, p->cap * sizeof *p->a);
462	if (!a)
463	return 0;
464	p->a = a;
465	}
466	a = p->a + p->len++;
467	a = tre_mem_calloc(p->mem, sizeof *a);
468	return *a;
469	}
470
471	static int add_icase_literals(struct literals *ls, int min, int max)
472	{
473	tre_literal_t *lit;
474	int b, e, c;
475	for (c=min; c<=max; ) {
476	/* assumes islower(c) and isupper(c) are exclusive
477	and toupper(c)!=c if islower(c).
478	multiple opposite case characters are not supported */
479	if (tre_islower(c)) {
480	b = e = tre_toupper(c);
481	for (c++, e++; c<=max; c++, e++)
482	if (tre_toupper(c) != e) break;
483	} else if (tre_isupper(c)) {
484	b = e = tre_tolower(c);
485	for (c++, e++; c<=max; c++, e++)
486	if (tre_tolower(c) != e) break;
487	} else {
488	c++;
489	continue;
490	}
491	lit = tre_new_lit(ls);
492	if (!lit)
493	return -1;
494	lit->code_min = b;
495	lit->code_max = e-1;
496	lit->position = -1;
497	}
498	return 0;
499	}
500
501
502	/* Maximum number of character classes in a negated bracket expression. */
503	#define MAX_NEG_CLASSES 64
504
505	struct neg {
506	int negate;
507	int len;
508	tre_ctype_t a[MAX_NEG_CLASSES];
509	};
510
511	// TODO: parse bracket into a set of non-overlapping [lo,hi] ranges
512
/*
bracket grammar:
Bracket  =  '[' List ']'  |  '[^' List ']'
List     =  Term  |  List Term
Term     =  Char  |  Range  |  Chclass  |  Eqclass
Range    =  Char '-' Char  |  Char '-' '-'
Char     =  Coll  |  coll_single
Meta     =  ']'  |  '-'
Coll     =  '[.' coll_single '.]'  |  '[.' coll_multi '.]'  |  '[.' Meta '.]'
Eqclass  =  '[=' coll_single '=]'  |  '[=' coll_multi '=]'
Chclass  =  '[:' class ':]'

coll_single is a single char collating element but it can be
 '-' only at the beginning or end of a List and
 ']' only at the beginning of a List and
 '^' anywhere except after the opening '['
*/
530
531	static reg_errcode_t parse_bracket_terms(tre_parse_ctx_t ctx, const char s, struct literals ls, struct neg neg)
532	{
533	const char *start = s;
534	tre_ctype_t class;
535	int min, max;
536	wchar_t wc;
537	int len;
538
539	for (;;) {
540	class = 0;
541	len = mbtowc(&wc, s, -1);
542	if (len <= 0)
543	return *s ? REG_BADPAT : REG_EBRACK;
544	if (*s == ']' && s != start) {
545	ctx->s = s+1;
546	return REG_OK;
547	}
548	if (*s == '-' && s != start && s[1] != ']' &&
549	/* extension: [a-z--@] is accepted as [a-z]\|[--@] */
550	(s[1] != '-' \|\| s[2] == ']'))
551	return REG_ERANGE;
552	if (*s == '[' && (s[1] == '.' \|\| s[1] == '='))
553	/* collating symbols and equivalence classes are not supported */
554	return REG_ECOLLATE;
555	if (*s == '[' && s[1] == ':') {
556	char tmp[CHARCLASS_NAME_MAX+1];
557	s += 2;
558	for (len=0; len < CHARCLASS_NAME_MAX && s[len]; len++) {
559	if (s[len] == ':') {
560	memcpy(tmp, s, len);
561	tmp[len] = 0;
562	class = tre_ctype(tmp);
563	break;
564	}
565	}
566	if (!class \|\| s[len+1] != ']')
567	return REG_ECTYPE;
568	min = 0;
569	max = TRE_CHAR_MAX;
570	s += len+2;
571	} else {
572	min = max = wc;
573	s += len;
574	if (*s == '-' && s[1] != ']') {
575	s++;
576	len = mbtowc(&wc, s, -1);
577	max = wc;
578	/* XXX - Should use collation order instead of
579	encoding values in character ranges. */
580	if (len <= 0 \|\| min > max)
581	return REG_ERANGE;
582	s += len;
583	}
584	}
585
586	if (class && neg->negate) {
587	if (neg->len >= MAX_NEG_CLASSES)
588	return REG_ESPACE;
589	neg->a[neg->len++] = class;
590	} else {
591	tre_literal_t *lit = tre_new_lit(ls);
592	if (!lit)
593	return REG_ESPACE;
594	lit->code_min = min;
595	lit->code_max = max;
596	lit->class = class;
597	lit->position = -1;
598
599	/* Add opposite-case codepoints if REG_ICASE is present.
600	It seems that POSIX requires that bracket negation
601	should happen before case-folding, but most practical
602	implementations do it the other way around. Changing
603	the order would need efficient representation of
604	case-fold ranges and bracket range sets even with
605	simple patterns so this is ok for now. */
606	if (ctx->cflags & REG_ICASE && !class)
607	if (add_icase_literals(ls, min, max))
608	return REG_ESPACE;
609	}
610	}
611	}
612
613	static reg_errcode_t parse_bracket(tre_parse_ctx_t ctx, const char s)
614	{
615	int i, max, min, negmax, negmin;
616	tre_ast_node_t node = 0, n;
617	tre_ctype_t *nc = 0;
618	tre_literal_t *lit;
619	struct literals ls;
620	struct neg neg;
621	reg_errcode_t err;
622
623	ls.mem = ctx->mem;
624	ls.len = 0;
625	ls.cap = 32;
626	ls.a = xmalloc(ls.cap * sizeof *ls.a);
627	if (!ls.a)
628	return REG_ESPACE;
629	neg.len = 0;
630	neg.negate = *s == '^';
631	if (neg.negate)
632	s++;
633
634	err = parse_bracket_terms(ctx, s, &ls, &neg);
635	if (err != REG_OK)
636	goto parse_bracket_done;
637
638	if (neg.negate) {
639	/*
640	* With REG_NEWLINE, POSIX requires that newlines are not matched by
641	* any form of a non-matching list.
642	*/
643	if (ctx->cflags & REG_NEWLINE) {
644	lit = tre_new_lit(&ls);
645	if (!lit) {
646	err = REG_ESPACE;
647	goto parse_bracket_done;
648	}
649	lit->code_min = '\n';
650	lit->code_max = '\n';
651	lit->position = -1;
652	}
653	/* Sort the array if we need to negate it. */
654	qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit);
655	/* extra lit for the last negated range */
656	lit = tre_new_lit(&ls);
657	if (!lit) {
658	err = REG_ESPACE;
659	goto parse_bracket_done;
660	}
661	lit->code_min = TRE_CHAR_MAX+1;
662	lit->code_max = TRE_CHAR_MAX+1;
663	lit->position = -1;
664	/* negated classes */
665	if (neg.len) {
666	nc = tre_mem_alloc(ctx->mem, (neg.len+1)sizeof neg.a);
667	if (!nc) {
668	err = REG_ESPACE;
669	goto parse_bracket_done;
670	}
671	memcpy(nc, neg.a, neg.lensizeof neg.a);
672	nc[neg.len] = 0;
673	}
674	}
675
676	/* Build a union of the items in the array, negated if necessary. */
677	negmax = negmin = 0;
678	for (i = 0; i < ls.len; i++) {
679	lit = ls.a[i];
680	min = lit->code_min;
681	max = lit->code_max;
682	if (neg.negate) {
683	if (min <= negmin) {
684	/* Overlap. */
685	negmin = MAX(max + 1, negmin);
686	continue;
687	}
688	negmax = min - 1;
689	lit->code_min = negmin;
690	lit->code_max = negmax;
691	negmin = max + 1;
692	}
693	lit->position = ctx->position;
694	lit->neg_classes = nc;
695	n = tre_ast_new_node(ctx->mem, LITERAL, lit);
696	node = tre_ast_new_union(ctx->mem, node, n);
697	if (!node) {
698	err = REG_ESPACE;
699	break;
700	}
701	}
702
703	parse_bracket_done:
704	xfree(ls.a);
705	ctx->position++;
706	ctx->n = node;
707	return err;
708	}
709
/* Parses an optional decimal count at `s'.

   Stores -1 in `*n' if `s' does not start with a digit.  Otherwise
   stores the value, except that parsing stops as soon as the value
   exceeds RE_DUP_MAX; this keeps the accumulation from overflowing and
   leaves a value the caller rejects.  Returns a pointer to the first
   character that was not consumed.

   The characters are converted to unsigned char before being passed to
   isdigit(), as the <ctype.h> functions require. */
static const char *parse_dup_count(const char *s, int *n)
{
    *n = -1;
    if (!isdigit((unsigned char)*s))
        return s;
    *n = 0;
    for (;;) {
        *n = 10 * *n + (*s - '0');
        s++;
        if (!isdigit((unsigned char)*s) || *n > RE_DUP_MAX)
            break;
    }
    return s;
}
724
/* Parses the inside of an interval expression; `s' points just past the
   opening brace.  `ere' selects the closing delimiter: "}" when nonzero,
   "\}" when zero.

   Accepted forms are "m", "m," and "m,n".  On success stores the bounds
   in `*pmin' and `*pmax' (`*pmax' is -1 for "m,") and returns a pointer
   past the closing delimiter.  Returns 0, leaving the outputs untouched,
   if the minimum is missing, a bound exceeds RE_DUP_MAX, the maximum is
   smaller than the minimum, or the closing delimiter is wrong. */
static const char *parse_dup(const char *s, int ere, int *pmin, int *pmax)
{
    int min, max;

    s = parse_dup_count(s, &min);
    if (*s == ',')
        s = parse_dup_count(s+1, &max);
    else
        max = min;

    /* the last two tests also step over the closing delimiter */
    if (
        (max < min && max >= 0) ||
        max > RE_DUP_MAX ||
        min > RE_DUP_MAX ||
        min < 0 ||
        (!ere && *s++ != '\\') ||
        *s++ != '}'
    )
        return 0;
    *pmin = min;
    *pmax = max;
    return s;
}
748
/* Returns the value (0..15) of the hexadecimal digit `c', accepting both
   letter cases, or -1 if `c' is not a hexadecimal digit.  The comparisons
   rely on unsigned wrap-around: anything below the start of a range
   becomes a huge value and fails the `<' test. */
static int hexval(unsigned c)
{
    if (c-'0'<10) return c-'0';
    c |= 32;  /* fold ASCII upper case onto lower case */
    if (c-'a'<6) return c-'a'+10;
    return -1;
}
756
757	static reg_errcode_t marksub(tre_parse_ctx_t ctx, tre_ast_node_t node, int subid)
758	{
759	if (node->submatch_id >= 0) {
760	tre_ast_node_t *n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
761	if (!n)
762	return REG_ESPACE;
763	n = tre_ast_new_catenation(ctx->mem, n, node);
764	if (!n)
765	return REG_ESPACE;
766	n->num_submatches = node->num_submatches;
767	node = n;
768	}
769	node->submatch_id = subid;
770	node->num_submatches++;
771	ctx->n = node;
772	return REG_OK;
773	}
774
/*
BRE grammar:
Regex  =  Branch  |  '^'  |  '$'  |  '^$'  |  '^' Branch  |  Branch '$'  |  '^' Branch '$'
Branch =  Atom  |  Branch Atom
Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '\(' Branch '\)'  |  back_ref
Dup    =  '*'  |  '\{' Count '\}'  |  '\{' Count ',\}'  |  '\{' Count ',' Count '\}'

(leading ^ and trailing $ in a sub expr may be an anchor or literal as well)

ERE grammar:
Regex  =  Branch  |  Regex '|' Branch
Branch =  Atom  |  Branch Atom
Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '(' Regex ')'  |  '^'  |  '$'
Dup    =  '*'  |  '+'  |  '?'  |  '{' Count '}'  |  '{' Count ',}'  |  '{' Count ',' Count '}'

(a*+?, ^*, $+, \X, {, (|a) are unspecified)
*/
792
793	static reg_errcode_t parse_atom(tre_parse_ctx_t ctx, const char s)
794	{
795	int len, ere = ctx->cflags & REG_EXTENDED;
796	const char *p;
797	tre_ast_node_t *node;
798	wchar_t wc;
799	switch (*s) {
800	case '[':
801	return parse_bracket(ctx, s+1);
802	case '\\':
803	p = tre_expand_macro(s+1);
804	if (p) {
805	/* assume \X expansion is a single atom */
806	reg_errcode_t err = parse_atom(ctx, p);
807	ctx->s = s+2;
808	return err;
809	}
810	/* extensions: \b, \B, \<, \>, \xHH \x{HHHH} */
811	switch (*++s) {
812	case 0:
813	return REG_EESCAPE;
814	case 'b':
815	node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB, -1);
816	break;
817	case 'B':
818	node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB_NEG, -1);
819	break;
820	case '<':
821	node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOW, -1);
822	break;
823	case '>':
824	node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOW, -1);
825	break;
826	case 'x':
827	s++;
828	int i, v = 0, c;
829	len = 2;
830	if (*s == '{') {
831	len = 8;
832	s++;
833	}
834	for (i=0; i<len && v<0x110000; i++) {
835	c = hexval(s[i]);
836	if (c < 0) break;
837	v = 16*v + c;
838	}
839	s += i;
840	if (len == 8) {
841	if (*s != '}')
842	return REG_EBRACE;
843	s++;
844	}
845	node = tre_ast_new_literal(ctx->mem, v, v, ctx->position++);
846	s--;
847	break;
848	case '{':
849	case '+':
850	case '?':
851	/* extension: treat \+, \? as repetitions in BRE */
852	/* reject repetitions after empty expression in BRE */
853	if (!ere)
854	return REG_BADRPT;
855	case '\|':
856	/* extension: treat \\| as alternation in BRE */
857	if (!ere) {
858	node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
859	s--;
860	goto end;
861	}
862	/* fallthrough */
863	default:
864	if (!ere && (unsigned)*s-'1' < 9) {
865	/* back reference */
866	int val = *s - '0';
867	node = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position++);
868	ctx->max_backref = MAX(val, ctx->max_backref);
869	} else {
870	/* extension: accept unknown escaped char
871	as a literal */
872	goto parse_literal;
873	}
874	}
875	s++;
876	break;
877	case '.':
878	if (ctx->cflags & REG_NEWLINE) {
879	tre_ast_node_t tmp1, tmp2;
880	tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n'-1, ctx->position++);
881	tmp2 = tre_ast_new_literal(ctx->mem, '\n'+1, TRE_CHAR_MAX, ctx->position++);
882	if (tmp1 && tmp2)
883	node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
884	else
885	node = 0;
886	} else {
887	node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position++);
888	}
889	s++;
890	break;
891	case '^':
892	/* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */
893	if (!ere && s != ctx->start)
894	goto parse_literal;
895	node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1);
896	s++;
897	break;
898	case '$':
899	/* '$' is special everywhere in EREs, and at the end of a BRE subexpression. */
900	if (!ere && s[1] && (s[1]!='\\'\|\| (s[2]!=')' && s[2]!='\|')))
901	goto parse_literal;
902	node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1);
903	s++;
904	break;
905	case '*':
906	case '{':
907	case '+':
908	case '?':
909	/* reject repetitions after empty expression in ERE */
910	if (ere)
911	return REG_BADRPT;
912	case '\|':
913	if (!ere)
914	goto parse_literal;
915	case 0:
916	node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
917	break;
918	default:
919	parse_literal:
920	len = mbtowc(&wc, s, -1);
921	if (len < 0)
922	return REG_BADPAT;
923	if (ctx->cflags & REG_ICASE && (tre_isupper(wc) \|\| tre_islower(wc))) {
924	tre_ast_node_t tmp1, tmp2;
925	/* multiple opposite case characters are not supported */
926	tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tre_toupper(wc), ctx->position);
927	tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tre_tolower(wc), ctx->position);
928	if (tmp1 && tmp2)
929	node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
930	else
931	node = 0;
932	} else {
933	node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position);
934	}
935	ctx->position++;
936	s += len;
937	break;
938	}
939	end:
940	if (!node)
941	return REG_ESPACE;
942	ctx->n = node;
943	ctx->s = s;
944	return REG_OK;
945	}
946
/* Push a pointer on stack `s'.  The result is stored in the lvalue `err';
   on failure the enclosing function returns it. */
#define PUSHPTR(err, s, v) \
    do { \
        err = tre_stack_push_voidptr(s, v); \
        if (err != REG_OK) \
            return err; \
    } while(0)

/* Same as PUSHPTR, for an int value. */
#define PUSHINT(err, s, v) \
    do { \
        err = tre_stack_push_int(s, v); \
        if (err != REG_OK) \
            return err; \
    } while(0)
956
957	static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx)
958	{
959	tre_ast_node_t nbranch=0, nunion=0;
960	int ere = ctx->cflags & REG_EXTENDED;
961	const char *s = ctx->start;
962	int subid = 0;
963	int depth = 0;
964	reg_errcode_t err;
965	tre_stack_t *stack = ctx->stack;
966
967	PUSHINT(err, stack, subid++);
968	for (;;) {
969	if ((!ere && *s == '\\' && s[1] == '(') \|\|
970	(ere && *s == '(')) {
971	PUSHPTR(err, stack, nunion);
972	PUSHPTR(err, stack, nbranch);
973	PUSHINT(err, stack, subid++);
974	s++;
975	if (!ere)
976	s++;
977	depth++;
978	nbranch = nunion = 0;
979	ctx->start = s;
980	continue;
981	}
982	if ((!ere && *s == '\\' && s[1] == ')') \|\|
983	(ere && *s == ')' && depth)) {
984	ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
985	if (!ctx->n)
986	return REG_ESPACE;
987	} else {
988	err = parse_atom(ctx, s);
989	if (err != REG_OK)
990	return err;
991	s = ctx->s;
992	}
993
994	parse_iter:
995	for (;;) {
996	int min, max;
997
998	if (s!='\\' && s!='*') {
999	if (!ere)
1000	break;
1001	if (s!='+' && s!='?' && *s!='{')
1002	break;
1003	}
1004	if (*s=='\\' && ere)
1005	break;
1006	/* extension: treat \+, \? as repetitions in BRE */
1007	if (*s=='\\' && s[1]!='+' && s[1]!='?' && s[1]!='{')
1008	break;
1009	if (*s=='\\')
1010	s++;
1011
1012	/* handle ^* at the start of a BRE. */
1013	if (!ere && s==ctx->start+1 && s[-1]=='^')
1014	break;
1015
1016	/* extension: multiple consecutive *+?{,} is unspecified,
1017	but (a+)+ has to be supported so accepting a++ makes
1018	sense, note however that the RE_DUP_MAX limit can be
1019	circumvented: (a{255}){255} uses a lot of memory.. */
1020	if (*s=='{') {
1021	s = parse_dup(s+1, ere, &min, &max);
1022	if (!s)
1023	return REG_BADBR;
1024	} else {
1025	min=0;
1026	max=-1;
1027	if (*s == '+')
1028	min = 1;
1029	if (*s == '?')
1030	max = 1;
1031	s++;
1032	}
1033	if (max == 0)
1034	ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1035	else
1036	ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
1037	if (!ctx->n)
1038	return REG_ESPACE;
1039	}
1040
1041	nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n);
1042	if ((ere && *s == '\|') \|\|
1043	(ere && *s == ')' && depth) \|\|
1044	(!ere && *s == '\\' && s[1] == ')') \|\|
1045	/* extension: treat \\| as alternation in BRE */
1046	(!ere && *s == '\\' && s[1] == '\|') \|\|
1047	!*s) {
1048	/* extension: empty branch is unspecified (), (\|a), (a\|)
1049	here they are not rejected but match on empty string */
1050	int c = *s;
1051	nunion = tre_ast_new_union(ctx->mem, nunion, nbranch);
1052	nbranch = 0;
1053
1054	if (c == '\\' && s[1] == '\|') {
1055	s+=2;
1056	ctx->start = s;
1057	} else if (c == '\|') {
1058	s++;
1059	ctx->start = s;
1060	} else {
1061	if (c == '\\') {
1062	if (!depth) return REG_EPAREN;
1063	s+=2;
1064	} else if (c == ')')
1065	s++;
1066	depth--;
1067	err = marksub(ctx, nunion, tre_stack_pop_int(stack));
1068	if (err != REG_OK)
1069	return err;
1070	if (!c && depth<0) {
1071	ctx->submatch_id = subid;
1072	return REG_OK;
1073	}
1074	if (!c \|\| depth<0)
1075	return REG_EPAREN;
1076	nbranch = tre_stack_pop_voidptr(stack);
1077	nunion = tre_stack_pop_voidptr(stack);
1078	goto parse_iter;
1079	}
1080	}
1081	}
1082	}
1083
1084
1085	/***********************************************************************
1086	from tre-compile.c
1087	***********************************************************************/
1088
1089
1090	/*
1091	TODO:
1092	- Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
1093	function calls.
1094	*/
1095
1096	/*
1097	Algorithms to setup tags so that submatch addressing can be done.
1098	*/
1099
1100
1101	/* Inserts a catenation node to the root of the tree given in `node'.
1102	As the left child a new tag with number `tag_id' to `node' is added,
1103	and the right child is the old root. */
1104	static reg_errcode_t
1105	tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1106	{
1107	tre_catenation_t *c;
1108
1109	c = tre_mem_alloc(mem, sizeof(*c));
1110	if (c == NULL)
1111	return REG_ESPACE;
1112	c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
1113	if (c->left == NULL)
1114	return REG_ESPACE;
1115	c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1116	if (c->right == NULL)
1117	return REG_ESPACE;
1118
1119	c->right->obj = node->obj;
1120	c->right->type = node->type;
1121	c->right->nullable = -1;
1122	c->right->submatch_id = -1;
1123	c->right->firstpos = NULL;
1124	c->right->lastpos = NULL;
1125	c->right->num_tags = 0;
1126	c->right->num_submatches = 0;
1127	node->obj = c;
1128	node->type = CATENATION;
1129	return REG_OK;
1130	}
1131
1132	/* Inserts a catenation node to the root of the tree given in `node'.
1133	As the right child a new tag with number `tag_id' to `node' is added,
1134	and the left child is the old root. */
1135	static reg_errcode_t
1136	tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1137	{
1138	tre_catenation_t *c;
1139
1140	c = tre_mem_alloc(mem, sizeof(*c));
1141	if (c == NULL)
1142	return REG_ESPACE;
1143	c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
1144	if (c->right == NULL)
1145	return REG_ESPACE;
1146	c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1147	if (c->left == NULL)
1148	return REG_ESPACE;
1149
1150	c->left->obj = node->obj;
1151	c->left->type = node->type;
1152	c->left->nullable = -1;
1153	c->left->submatch_id = -1;
1154	c->left->firstpos = NULL;
1155	c->left->lastpos = NULL;
1156	c->left->num_tags = 0;
1157	c->left->num_submatches = 0;
1158	node->obj = c;
1159	node->type = CATENATION;
1160	return REG_OK;
1161	}
1162
/* Work items pushed on the explicit stack used by tre_add_tags(). */
typedef enum {
  ADDTAGS_RECURSE = 0,           /* Visit a node. */
  ADDTAGS_AFTER_ITERATION = 1,   /* Argument of an iteration is done. */
  ADDTAGS_AFTER_UNION_LEFT = 2,  /* Left operand of a union is done. */
  ADDTAGS_AFTER_UNION_RIGHT = 3, /* Right operand of a union is done. */
  ADDTAGS_AFTER_CAT_LEFT = 4,    /* Left operand of a catenation is done. */
  ADDTAGS_AFTER_CAT_RIGHT = 5,   /* Right operand of a catenation is done. */
  ADDTAGS_SET_SUBMATCH_END = 6   /* Record the end of a submatch. */
} tre_addtags_symbol_t;
1172
1173
/* A saved (`tag', `next_tag') pair. */
typedef struct {
  int tag;      /* Tag number. */
  int next_tag; /* Tag number to use after `tag'. */
} tre_tag_states_t;
1178
1179
1180	/* Go through `regset' and set submatch data for submatches that are
1181	using this tag. */
1182	static void
1183	tre_purge_regset(int regset, tre_tnfa_t tnfa, int tag)
1184	{
1185	int i;
1186
1187	for (i = 0; regset[i] >= 0; i++)
1188	{
1189	int id = regset[i] / 2;
1190	int start = !(regset[i] % 2);
1191	if (start)
1192	tnfa->submatch_data[id].so_tag = tag;
1193	else
1194	tnfa->submatch_data[id].eo_tag = tag;
1195	}
1196	regset[0] = -1;
1197	}
1198
1199
1200	/* Adds tags to appropriate locations in the parse tree in `tree', so that
1201	subexpressions marked for submatch addressing can be traced. */
1202	static reg_errcode_t
1203	tre_add_tags(tre_mem_t mem, tre_stack_t stack, tre_ast_node_t tree,
1204	tre_tnfa_t *tnfa)
1205	{
1206	reg_errcode_t status = REG_OK;
1207	tre_addtags_symbol_t symbol;
1208	tre_ast_node_t node = tree; / Tree node we are currently looking at. */
1209	int bottom = tre_stack_num_objects(stack);
1210	/* True for first pass (counting number of needed tags) */
1211	int first_pass = (mem == NULL \|\| tnfa == NULL);
1212	int regset, orig_regset;
1213	int num_tags = 0; /* Total number of tags. */
1214	int num_minimals = 0; /* Number of special minimal tags. */
1215	int tag = 0; /* The tag that is to be added next. */
1216	int next_tag = 1; /* Next tag to use after this one. */
1217	int parents; / Stack of submatches the current submatch is
1218	contained in. */
1219	int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
1220	tre_tag_states_t *saved_states;
1221
1222	tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
1223	if (!first_pass)
1224	{
1225	tnfa->end_tag = 0;
1226	tnfa->minimal_tags[0] = -1;
1227	}
1228
1229	regset = xmalloc(sizeof(regset) ((tnfa->num_submatches + 1) * 2));
1230	if (regset == NULL)
1231	return REG_ESPACE;
1232	regset[0] = -1;
1233	orig_regset = regset;
1234
1235	parents = xmalloc(sizeof(parents) (tnfa->num_submatches + 1));
1236	if (parents == NULL)
1237	{
1238	xfree(regset);
1239	return REG_ESPACE;
1240	}
1241	parents[0] = -1;
1242
1243	saved_states = xmalloc(sizeof(saved_states) (tnfa->num_submatches + 1));
1244	if (saved_states == NULL)
1245	{
1246	xfree(regset);
1247	xfree(parents);
1248	return REG_ESPACE;
1249	}
1250	else
1251	{
1252	unsigned int i;
1253	for (i = 0; i <= tnfa->num_submatches; i++)
1254	saved_states[i].tag = -1;
1255	}
1256
1257	STACK_PUSH(stack, voidptr, node);
1258	STACK_PUSH(stack, int, ADDTAGS_RECURSE);
1259
1260	while (tre_stack_num_objects(stack) > bottom)
1261	{
1262	if (status != REG_OK)
1263	break;
1264
1265	symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
1266	switch (symbol)
1267	{
1268
1269	case ADDTAGS_SET_SUBMATCH_END:
1270	{
1271	int id = tre_stack_pop_int(stack);
1272	int i;
1273
1274	/* Add end of this submatch to regset. */
1275	for (i = 0; regset[i] >= 0; i++);
1276	regset[i] = id * 2 + 1;
1277	regset[i + 1] = -1;
1278
1279	/* Pop this submatch from the parents stack. */
1280	for (i = 0; parents[i] >= 0; i++);
1281	parents[i - 1] = -1;
1282	break;
1283	}
1284
1285	case ADDTAGS_RECURSE:
1286	node = tre_stack_pop_voidptr(stack);
1287
1288	if (node->submatch_id >= 0)
1289	{
1290	int id = node->submatch_id;
1291	int i;
1292
1293
1294	/* Add start of this submatch to regset. */
1295	for (i = 0; regset[i] >= 0; i++);
1296	regset[i] = id * 2;
1297	regset[i + 1] = -1;
1298
1299	if (!first_pass)
1300	{
1301	for (i = 0; parents[i] >= 0; i++);
1302	tnfa->submatch_data[id].parents = NULL;
1303	if (i > 0)
1304	{
1305	int p = xmalloc(sizeof(p) * (i + 1));
1306	if (p == NULL)
1307	{
1308	status = REG_ESPACE;
1309	break;
1310	}
1311	assert(tnfa->submatch_data[id].parents == NULL);
1312	tnfa->submatch_data[id].parents = p;
1313	for (i = 0; parents[i] >= 0; i++)
1314	p[i] = parents[i];
1315	p[i] = -1;
1316	}
1317	}
1318
1319	/* Add end of this submatch to regset after processing this
1320	node. */
1321	STACK_PUSHX(stack, int, node->submatch_id);
1322	STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
1323	}
1324
1325	switch (node->type)
1326	{
1327	case LITERAL:
1328	{
1329	tre_literal_t *lit = node->obj;
1330
1331	if (!IS_SPECIAL(lit) \|\| IS_BACKREF(lit))
1332	{
1333	int i;
1334	if (regset[0] >= 0)
1335	{
1336	/* Regset is not empty, so add a tag before the
1337	literal or backref. */
1338	if (!first_pass)
1339	{
1340	status = tre_add_tag_left(mem, node, tag);
1341	tnfa->tag_directions[tag] = direction;
1342	if (minimal_tag >= 0)
1343	{
1344	for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1345	tnfa->minimal_tags[i] = tag;
1346	tnfa->minimal_tags[i + 1] = minimal_tag;
1347	tnfa->minimal_tags[i + 2] = -1;
1348	minimal_tag = -1;
1349	num_minimals++;
1350	}
1351	tre_purge_regset(regset, tnfa, tag);
1352	}
1353	else
1354	{
1355	node->num_tags = 1;
1356	}
1357
1358	regset[0] = -1;
1359	tag = next_tag;
1360	num_tags++;
1361	next_tag++;
1362	}
1363	}
1364	else
1365	{
1366	assert(!IS_TAG(lit));
1367	}
1368	break;
1369	}
1370	case CATENATION:
1371	{
1372	tre_catenation_t *cat = node->obj;
1373	tre_ast_node_t *left = cat->left;
1374	tre_ast_node_t *right = cat->right;
1375	int reserved_tag = -1;
1376
1377
1378	/* After processing right child. */
1379	STACK_PUSHX(stack, voidptr, node);
1380	STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
1381
1382	/* Process right child. */
1383	STACK_PUSHX(stack, voidptr, right);
1384	STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1385
1386	/* After processing left child. */
1387	STACK_PUSHX(stack, int, next_tag + left->num_tags);
1388	if (left->num_tags > 0 && right->num_tags > 0)
1389	{
1390	/* Reserve the next tag to the right child. */
1391	reserved_tag = next_tag;
1392	next_tag++;
1393	}
1394	STACK_PUSHX(stack, int, reserved_tag);
1395	STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
1396
1397	/* Process left child. */
1398	STACK_PUSHX(stack, voidptr, left);
1399	STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1400
1401	}
1402	break;
1403	case ITERATION:
1404	{
1405	tre_iteration_t *iter = node->obj;
1406
1407	if (first_pass)
1408	{
1409	STACK_PUSHX(stack, int, regset[0] >= 0 \|\| iter->minimal);
1410	}
1411	else
1412	{
1413	STACK_PUSHX(stack, int, tag);
1414	STACK_PUSHX(stack, int, iter->minimal);
1415	}
1416	STACK_PUSHX(stack, voidptr, node);
1417	STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
1418
1419	STACK_PUSHX(stack, voidptr, iter->arg);
1420	STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1421
1422	/* Regset is not empty, so add a tag here. */
1423	if (regset[0] >= 0 \|\| iter->minimal)
1424	{
1425	if (!first_pass)
1426	{
1427	int i;
1428	status = tre_add_tag_left(mem, node, tag);
1429	if (iter->minimal)
1430	tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
1431	else
1432	tnfa->tag_directions[tag] = direction;
1433	if (minimal_tag >= 0)
1434	{
1435	for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1436	tnfa->minimal_tags[i] = tag;
1437	tnfa->minimal_tags[i + 1] = minimal_tag;
1438	tnfa->minimal_tags[i + 2] = -1;
1439	minimal_tag = -1;
1440	num_minimals++;
1441	}
1442	tre_purge_regset(regset, tnfa, tag);
1443	}
1444
1445	regset[0] = -1;
1446	tag = next_tag;
1447	num_tags++;
1448	next_tag++;
1449	}
1450	direction = TRE_TAG_MINIMIZE;
1451	}
1452	break;
1453	case UNION:
1454	{
1455	tre_union_t *uni = node->obj;
1456	tre_ast_node_t *left = uni->left;
1457	tre_ast_node_t *right = uni->right;
1458	int left_tag;
1459	int right_tag;
1460
1461	if (regset[0] >= 0)
1462	{
1463	left_tag = next_tag;
1464	right_tag = next_tag + 1;
1465	}
1466	else
1467	{
1468	left_tag = tag;
1469	right_tag = next_tag;
1470	}
1471
1472	/* After processing right child. */
1473	STACK_PUSHX(stack, int, right_tag);
1474	STACK_PUSHX(stack, int, left_tag);
1475	STACK_PUSHX(stack, voidptr, regset);
1476	STACK_PUSHX(stack, int, regset[0] >= 0);
1477	STACK_PUSHX(stack, voidptr, node);
1478	STACK_PUSHX(stack, voidptr, right);
1479	STACK_PUSHX(stack, voidptr, left);
1480	STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
1481
1482	/* Process right child. */
1483	STACK_PUSHX(stack, voidptr, right);
1484	STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1485
1486	/* After processing left child. */
1487	STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
1488
1489	/* Process left child. */
1490	STACK_PUSHX(stack, voidptr, left);
1491	STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1492
1493	/* Regset is not empty, so add a tag here. */
1494	if (regset[0] >= 0)
1495	{
1496	if (!first_pass)
1497	{
1498	int i;
1499	status = tre_add_tag_left(mem, node, tag);
1500	tnfa->tag_directions[tag] = direction;
1501	if (minimal_tag >= 0)
1502	{
1503	for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1504	tnfa->minimal_tags[i] = tag;
1505	tnfa->minimal_tags[i + 1] = minimal_tag;
1506	tnfa->minimal_tags[i + 2] = -1;
1507	minimal_tag = -1;
1508	num_minimals++;
1509	}
1510	tre_purge_regset(regset, tnfa, tag);
1511	}
1512
1513	regset[0] = -1;
1514	tag = next_tag;
1515	num_tags++;
1516	next_tag++;
1517	}
1518
1519	if (node->num_submatches > 0)
1520	{
1521	/* The next two tags are reserved for markers. */
1522	next_tag++;
1523	tag = next_tag;
1524	next_tag++;
1525	}
1526
1527	break;
1528	}
1529	}
1530
1531	if (node->submatch_id >= 0)
1532	{
1533	int i;
1534	/* Push this submatch on the parents stack. */
1535	for (i = 0; parents[i] >= 0; i++);
1536	parents[i] = node->submatch_id;
1537	parents[i + 1] = -1;
1538	}
1539
1540	break; /* end case: ADDTAGS_RECURSE */
1541
1542	case ADDTAGS_AFTER_ITERATION:
1543	{
1544	int minimal = 0;
1545	int enter_tag;
1546	node = tre_stack_pop_voidptr(stack);
1547	if (first_pass)
1548	{
1549	node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
1550	+ tre_stack_pop_int(stack);
1551	minimal_tag = -1;
1552	}
1553	else
1554	{
1555	minimal = tre_stack_pop_int(stack);
1556	enter_tag = tre_stack_pop_int(stack);
1557	if (minimal)
1558	minimal_tag = enter_tag;
1559	}
1560
1561	if (!first_pass)
1562	{
1563	if (minimal)
1564	direction = TRE_TAG_MINIMIZE;
1565	else
1566	direction = TRE_TAG_MAXIMIZE;
1567	}
1568	break;
1569	}
1570
1571	case ADDTAGS_AFTER_CAT_LEFT:
1572	{
1573	int new_tag = tre_stack_pop_int(stack);
1574	next_tag = tre_stack_pop_int(stack);
1575	if (new_tag >= 0)
1576	{
1577	tag = new_tag;
1578	}
1579	break;
1580	}
1581
1582	case ADDTAGS_AFTER_CAT_RIGHT:
1583	node = tre_stack_pop_voidptr(stack);
1584	if (first_pass)
1585	node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
1586	+ ((tre_catenation_t *)node->obj)->right->num_tags;
1587	break;
1588
1589	case ADDTAGS_AFTER_UNION_LEFT:
1590	/* Lift the bottom of the `regset' array so that when processing
1591	the right operand the items currently in the array are
1592	invisible. The original bottom was saved at ADDTAGS_UNION and
1593	will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
1594	while (*regset >= 0)
1595	regset++;
1596	break;
1597
1598	case ADDTAGS_AFTER_UNION_RIGHT:
1599	{
1600	int added_tags, tag_left, tag_right;
1601	tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
1602	tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
1603	node = tre_stack_pop_voidptr(stack);
1604	added_tags = tre_stack_pop_int(stack);
1605	if (first_pass)
1606	{
1607	node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
1608	+ ((tre_union_t *)node->obj)->right->num_tags + added_tags
1609	+ ((node->num_submatches > 0) ? 2 : 0);
1610	}
1611	regset = tre_stack_pop_voidptr(stack);
1612	tag_left = tre_stack_pop_int(stack);
1613	tag_right = tre_stack_pop_int(stack);
1614
1615	/* Add tags after both children, the left child gets a smaller
1616	tag than the right child. This guarantees that we prefer
1617	the left child over the right child. */
1618	/* XXX - This is not always necessary (if the children have
1619	tags which must be seen for every match of that child). */
1620	/* XXX - Check if this is the only place where tre_add_tag_right
1621	is used. If so, use tre_add_tag_left (putting the tag before
1622	the child as opposed after the child) and throw away
1623	tre_add_tag_right. */
1624	if (node->num_submatches > 0)
1625	{
1626	if (!first_pass)
1627	{
1628	status = tre_add_tag_right(mem, left, tag_left);
1629	tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
1630	if (status == REG_OK)
1631	status = tre_add_tag_right(mem, right, tag_right);
1632	tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
1633	}
1634	num_tags += 2;
1635	}
1636	direction = TRE_TAG_MAXIMIZE;
1637	break;
1638	}
1639
1640	default:
1641	assert(0);
1642	break;
1643
1644	} /* end switch(symbol) */
1645	} /* end while(tre_stack_num_objects(stack) > bottom) */
1646
1647	if (!first_pass)
1648	tre_purge_regset(regset, tnfa, tag);
1649
1650	if (!first_pass && minimal_tag >= 0)
1651	{
1652	int i;
1653	for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1654	tnfa->minimal_tags[i] = tag;
1655	tnfa->minimal_tags[i + 1] = minimal_tag;
1656	tnfa->minimal_tags[i + 2] = -1;
1657	minimal_tag = -1;
1658	num_minimals++;
1659	}
1660
1661	assert(tree->num_tags == num_tags);
1662	tnfa->end_tag = num_tags;
1663	tnfa->num_tags = num_tags;
1664	tnfa->num_minimals = num_minimals;
1665	xfree(orig_regset);
1666	xfree(parents);
1667	xfree(saved_states);
1668	return status;
1669	}
1670
1671
1672
1673	/*
1674	AST to TNFA compilation routines.
1675	*/
1676
/* Work items pushed on the explicit stack used by tre_copy_ast(). */
typedef enum {
  COPY_RECURSE = 0,       /* Copy the node popped next from the stack. */
  COPY_SET_RESULT_PTR = 1 /* Redirect where the next copy is stored. */
} tre_copyast_symbol_t;
1681
/* Bit flags accepted by tre_copy_ast() in its `flags' argument. */
#define COPY_REMOVE_TAGS        (1 << 0) /* Replace copied tags by EMPTY. */
#define COPY_MAXIMIZE_FIRST_TAG (1 << 1) /* Maximize the first tag copied. */
1685
1686	static reg_errcode_t
1687	tre_copy_ast(tre_mem_t mem, tre_stack_t stack, tre_ast_node_t ast,
1688	int flags, int pos_add, tre_tag_direction_t tag_directions,
1689	tre_ast_node_t *copy, int max_pos)
1690	{
1691	reg_errcode_t status = REG_OK;
1692	int bottom = tre_stack_num_objects(stack);
1693	int num_copied = 0;
1694	int first_tag = 1;
1695	tre_ast_node_t **result = copy;
1696	tre_copyast_symbol_t symbol;
1697
1698	STACK_PUSH(stack, voidptr, ast);
1699	STACK_PUSH(stack, int, COPY_RECURSE);
1700
1701	while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1702	{
1703	tre_ast_node_t *node;
1704	if (status != REG_OK)
1705	break;
1706
1707	symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
1708	switch (symbol)
1709	{
1710	case COPY_SET_RESULT_PTR:
1711	result = tre_stack_pop_voidptr(stack);
1712	break;
1713	case COPY_RECURSE:
1714	node = tre_stack_pop_voidptr(stack);
1715	switch (node->type)
1716	{
1717	case LITERAL:
1718	{
1719	tre_literal_t *lit = node->obj;
1720	int pos = lit->position;
1721	int min = lit->code_min;
1722	int max = lit->code_max;
1723	if (!IS_SPECIAL(lit) \|\| IS_BACKREF(lit))
1724	{
1725	/* XXX - e.g. [ab] has only one position but two
1726	nodes, so we are creating holes in the state space
1727	here. Not fatal, just wastes memory. */
1728	pos += *pos_add;
1729	num_copied++;
1730	}
1731	else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
1732	{
1733	/* Change this tag to empty. */
1734	min = EMPTY;
1735	max = pos = -1;
1736	}
1737	else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
1738	&& first_tag)
1739	{
1740	/* Maximize the first tag. */
1741	tag_directions[max] = TRE_TAG_MAXIMIZE;
1742	first_tag = 0;
1743	}
1744	*result = tre_ast_new_literal(mem, min, max, pos);
1745	if (*result == NULL)
1746	status = REG_ESPACE;
1747	else {
1748	tre_literal_t p = (result)->obj;
1749	p->class = lit->class;
1750	p->neg_classes = lit->neg_classes;
1751	}
1752
1753	if (pos > *max_pos)
1754	*max_pos = pos;
1755	break;
1756	}
1757	case UNION:
1758	{
1759	tre_union_t *uni = node->obj;
1760	tre_union_t *tmp;
1761	*result = tre_ast_new_union(mem, uni->left, uni->right);
1762	if (*result == NULL)
1763	{
1764	status = REG_ESPACE;
1765	break;
1766	}
1767	tmp = (*result)->obj;
1768	result = &tmp->left;
1769	STACK_PUSHX(stack, voidptr, uni->right);
1770	STACK_PUSHX(stack, int, COPY_RECURSE);
1771	STACK_PUSHX(stack, voidptr, &tmp->right);
1772	STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1773	STACK_PUSHX(stack, voidptr, uni->left);
1774	STACK_PUSHX(stack, int, COPY_RECURSE);
1775	break;
1776	}
1777	case CATENATION:
1778	{
1779	tre_catenation_t *cat = node->obj;
1780	tre_catenation_t *tmp;
1781	*result = tre_ast_new_catenation(mem, cat->left, cat->right);
1782	if (*result == NULL)
1783	{
1784	status = REG_ESPACE;
1785	break;
1786	}
1787	tmp = (*result)->obj;
1788	tmp->left = NULL;
1789	tmp->right = NULL;
1790	result = &tmp->left;
1791
1792	STACK_PUSHX(stack, voidptr, cat->right);
1793	STACK_PUSHX(stack, int, COPY_RECURSE);
1794	STACK_PUSHX(stack, voidptr, &tmp->right);
1795	STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1796	STACK_PUSHX(stack, voidptr, cat->left);
1797	STACK_PUSHX(stack, int, COPY_RECURSE);
1798	break;
1799	}
1800	case ITERATION:
1801	{
1802	tre_iteration_t *iter = node->obj;
1803	STACK_PUSHX(stack, voidptr, iter->arg);
1804	STACK_PUSHX(stack, int, COPY_RECURSE);
1805	*result = tre_ast_new_iter(mem, iter->arg, iter->min,
1806	iter->max, iter->minimal);
1807	if (*result == NULL)
1808	{
1809	status = REG_ESPACE;
1810	break;
1811	}
1812	iter = (*result)->obj;
1813	result = &iter->arg;
1814	break;
1815	}
1816	default:
1817	assert(0);
1818	break;
1819	}
1820	break;
1821	}
1822	}
1823	*pos_add += num_copied;
1824	return status;
1825	}
1826
/* Work items pushed on the explicit stack used by tre_expand_ast(). */
typedef enum {
  EXPAND_RECURSE = 0,   /* Visit a node. */
  EXPAND_AFTER_ITER = 1 /* Argument of an iteration node has been visited. */
} tre_expand_ast_symbol_t;
1831
1832	/* Expands each iteration node that has a finite nonzero minimum or maximum
1833	iteration count to a catenated sequence of copies of the node. */
1834	static reg_errcode_t
1835	tre_expand_ast(tre_mem_t mem, tre_stack_t stack, tre_ast_node_t ast,
1836	int position, tre_tag_direction_t tag_directions)
1837	{
1838	reg_errcode_t status = REG_OK;
1839	int bottom = tre_stack_num_objects(stack);
1840	int pos_add = 0;
1841	int pos_add_total = 0;
1842	int max_pos = 0;
1843	int iter_depth = 0;
1844
1845	STACK_PUSHR(stack, voidptr, ast);
1846	STACK_PUSHR(stack, int, EXPAND_RECURSE);
1847	while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1848	{
1849	tre_ast_node_t *node;
1850	tre_expand_ast_symbol_t symbol;
1851
1852	if (status != REG_OK)
1853	break;
1854
1855	symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
1856	node = tre_stack_pop_voidptr(stack);
1857	switch (symbol)
1858	{
1859	case EXPAND_RECURSE:
1860	switch (node->type)
1861	{
1862	case LITERAL:
1863	{
1864	tre_literal_t *lit= node->obj;
1865	if (!IS_SPECIAL(lit) \|\| IS_BACKREF(lit))
1866	{
1867	lit->position += pos_add;
1868	if (lit->position > max_pos)
1869	max_pos = lit->position;
1870	}
1871	break;
1872	}
1873	case UNION:
1874	{
1875	tre_union_t *uni = node->obj;
1876	STACK_PUSHX(stack, voidptr, uni->right);
1877	STACK_PUSHX(stack, int, EXPAND_RECURSE);
1878	STACK_PUSHX(stack, voidptr, uni->left);
1879	STACK_PUSHX(stack, int, EXPAND_RECURSE);
1880	break;
1881	}
1882	case CATENATION:
1883	{
1884	tre_catenation_t *cat = node->obj;
1885	STACK_PUSHX(stack, voidptr, cat->right);
1886	STACK_PUSHX(stack, int, EXPAND_RECURSE);
1887	STACK_PUSHX(stack, voidptr, cat->left);
1888	STACK_PUSHX(stack, int, EXPAND_RECURSE);
1889	break;
1890	}
1891	case ITERATION:
1892	{
1893	tre_iteration_t *iter = node->obj;
1894	STACK_PUSHX(stack, int, pos_add);
1895	STACK_PUSHX(stack, voidptr, node);
1896	STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
1897	STACK_PUSHX(stack, voidptr, iter->arg);
1898	STACK_PUSHX(stack, int, EXPAND_RECURSE);
1899	/* If we are going to expand this node at EXPAND_AFTER_ITER
1900	then don't increase the `pos' fields of the nodes now, it
1901	will get done when expanding. */
1902	if (iter->min > 1 \|\| iter->max > 1)
1903	pos_add = 0;
1904	iter_depth++;
1905	break;
1906	}
1907	default:
1908	assert(0);
1909	break;
1910	}
1911	break;
1912	case EXPAND_AFTER_ITER:
1913	{
1914	tre_iteration_t *iter = node->obj;
1915	int pos_add_last;
1916	pos_add = tre_stack_pop_int(stack);
1917	pos_add_last = pos_add;
1918	if (iter->min > 1 \|\| iter->max > 1)
1919	{
1920	tre_ast_node_t seq1 = NULL, seq2 = NULL;
1921	int j;
1922	int pos_add_save = pos_add;
1923
1924	/* Create a catenated sequence of copies of the node. */
1925	for (j = 0; j < iter->min; j++)
1926	{
1927	tre_ast_node_t *copy;
1928	/* Remove tags from all but the last copy. */
1929	int flags = ((j + 1 < iter->min)
1930	? COPY_REMOVE_TAGS
1931	: COPY_MAXIMIZE_FIRST_TAG);
1932	pos_add_save = pos_add;
1933	status = tre_copy_ast(mem, stack, iter->arg, flags,
1934	&pos_add, tag_directions, &copy,
1935	&max_pos);
1936	if (status != REG_OK)
1937	return status;
1938	if (seq1 != NULL)
1939	seq1 = tre_ast_new_catenation(mem, seq1, copy);
1940	else
1941	seq1 = copy;
1942	if (seq1 == NULL)
1943	return REG_ESPACE;
1944	}
1945
1946	if (iter->max == -1)
1947	{
1948	/* No upper limit. */
1949	pos_add_save = pos_add;
1950	status = tre_copy_ast(mem, stack, iter->arg, 0,
1951	&pos_add, NULL, &seq2, &max_pos);
1952	if (status != REG_OK)
1953	return status;
1954	seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
1955	if (seq2 == NULL)
1956	return REG_ESPACE;
1957	}
1958	else
1959	{
1960	for (j = iter->min; j < iter->max; j++)
1961	{
1962	tre_ast_node_t tmp, copy;
1963	pos_add_save = pos_add;
1964	status = tre_copy_ast(mem, stack, iter->arg, 0,
1965	&pos_add, NULL, &copy, &max_pos);
1966	if (status != REG_OK)
1967	return status;
1968	if (seq2 != NULL)
1969	seq2 = tre_ast_new_catenation(mem, copy, seq2);
1970	else
1971	seq2 = copy;
1972	if (seq2 == NULL)
1973	return REG_ESPACE;
1974	tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
1975	if (tmp == NULL)
1976	return REG_ESPACE;
1977	seq2 = tre_ast_new_union(mem, tmp, seq2);
1978	if (seq2 == NULL)
1979	return REG_ESPACE;
1980	}
1981	}
1982
1983	pos_add = pos_add_save;
1984	if (seq1 == NULL)
1985	seq1 = seq2;
1986	else if (seq2 != NULL)
1987	seq1 = tre_ast_new_catenation(mem, seq1, seq2);
1988	if (seq1 == NULL)
1989	return REG_ESPACE;
1990	node->obj = seq1->obj;
1991	node->type = seq1->type;
1992	}
1993
1994	iter_depth--;
1995	pos_add_total += pos_add - pos_add_last;
1996	if (iter_depth == 0)
1997	pos_add = pos_add_total;
1998
1999	break;
2000	}
2001	default:
2002	assert(0);
2003	break;
2004	}
2005	}
2006
2007	*position += pos_add_total;
2008
2009	/* `max_pos' should never be larger than `*position' if the above
2010	code works, but just an extra safeguard let's make sure
2011	`*position' is set large enough so enough memory will be
2012	allocated for the transition table. */
2013	if (max_pos > *position)
2014	*position = max_pos;
2015
2016	return status;
2017	}
2018
2019	static tre_pos_and_tags_t *
2020	tre_set_empty(tre_mem_t mem)
2021	{
2022	tre_pos_and_tags_t *new_set;
2023
2024	new_set = tre_mem_calloc(mem, sizeof(*new_set));
2025	if (new_set == NULL)
2026	return NULL;
2027
2028	new_set[0].position = -1;
2029	new_set[0].code_min = -1;
2030	new_set[0].code_max = -1;
2031
2032	return new_set;
2033	}
2034
2035	static tre_pos_and_tags_t *
2036	tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
2037	tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
2038	{
2039	tre_pos_and_tags_t *new_set;
2040
2041	new_set = tre_mem_calloc(mem, sizeof(new_set) 2);
2042	if (new_set == NULL)
2043	return NULL;
2044
2045	new_set[0].position = position;
2046	new_set[0].code_min = code_min;
2047	new_set[0].code_max = code_max;
2048	new_set[0].class = class;
2049	new_set[0].neg_classes = neg_classes;
2050	new_set[0].backref = backref;
2051	new_set[1].position = -1;
2052	new_set[1].code_min = -1;
2053	new_set[1].code_max = -1;
2054
2055	return new_set;
2056	}
2057
2058	static tre_pos_and_tags_t *
2059	tre_set_union(tre_mem_t mem, tre_pos_and_tags_t set1, tre_pos_and_tags_t set2,
2060	int *tags, int assertions)
2061	{
2062	int s1, s2, i, j;
2063	tre_pos_and_tags_t *new_set;
2064	int *new_tags;
2065	int num_tags;
2066
2067	for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
2068	for (s1 = 0; set1[s1].position >= 0; s1++);
2069	for (s2 = 0; set2[s2].position >= 0; s2++);
2070	new_set = tre_mem_calloc(mem, sizeof(new_set) (s1 + s2 + 1));
2071	if (!new_set )
2072	return NULL;
2073
2074	for (s1 = 0; set1[s1].position >= 0; s1++)
2075	{
2076	new_set[s1].position = set1[s1].position;
2077	new_set[s1].code_min = set1[s1].code_min;
2078	new_set[s1].code_max = set1[s1].code_max;
2079	new_set[s1].assertions = set1[s1].assertions \| assertions;
2080	new_set[s1].class = set1[s1].class;
2081	new_set[s1].neg_classes = set1[s1].neg_classes;
2082	new_set[s1].backref = set1[s1].backref;
2083	if (set1[s1].tags == NULL && tags == NULL)
2084	new_set[s1].tags = NULL;
2085	else
2086	{
2087	for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
2088	new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
2089	* (i + num_tags + 1)));
2090	if (new_tags == NULL)
2091	return NULL;
2092	for (j = 0; j < i; j++)
2093	new_tags[j] = set1[s1].tags[j];
2094	for (i = 0; i < num_tags; i++)
2095	new_tags[j + i] = tags[i];
2096	new_tags[j + i] = -1;
2097	new_set[s1].tags = new_tags;
2098	}
2099	}
2100
2101	for (s2 = 0; set2[s2].position >= 0; s2++)
2102	{
2103	new_set[s1 + s2].position = set2[s2].position;
2104	new_set[s1 + s2].code_min = set2[s2].code_min;
2105	new_set[s1 + s2].code_max = set2[s2].code_max;
2106	/* XXX - why not \| assertions here as well? */
2107	new_set[s1 + s2].assertions = set2[s2].assertions;
2108	new_set[s1 + s2].class = set2[s2].class;
2109	new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
2110	new_set[s1 + s2].backref = set2[s2].backref;
2111	if (set2[s2].tags == NULL)
2112	new_set[s1 + s2].tags = NULL;
2113	else
2114	{
2115	for (i = 0; set2[s2].tags[i] >= 0; i++);
2116	new_tags = tre_mem_alloc(mem, sizeof(new_tags) (i + 1));
2117	if (new_tags == NULL)
2118	return NULL;
2119	for (j = 0; j < i; j++)
2120	new_tags[j] = set2[s2].tags[j];
2121	new_tags[j] = -1;
2122	new_set[s1 + s2].tags = new_tags;
2123	}
2124	}
2125	new_set[s1 + s2].position = -1;
2126	return new_set;
2127	}
2128
2129	/* Finds the empty path through `node' which is the one that should be
2130	taken according to POSIX.2 rules, and adds the tags on that path to
2131	`tags'. `tags' may be NULL. If `num_tags_seen' is not NULL, it is
2132	set to the number of tags seen on the path. */
2133	static reg_errcode_t
2134	tre_match_empty(tre_stack_t stack, tre_ast_node_t node, int *tags,
2135	int assertions, int num_tags_seen)
2136	{
2137	tre_literal_t *lit;
2138	tre_union_t *uni;
2139	tre_catenation_t *cat;
2140	tre_iteration_t *iter;
2141	int i;
2142	int bottom = tre_stack_num_objects(stack);
2143	reg_errcode_t status = REG_OK;
2144	if (num_tags_seen)
2145	*num_tags_seen = 0;
2146
2147	status = tre_stack_push_voidptr(stack, node);
2148
2149	/* Walk through the tree recursively. */
2150	while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2151	{
2152	node = tre_stack_pop_voidptr(stack);
2153
2154	switch (node->type)
2155	{
2156	case LITERAL:
2157	lit = (tre_literal_t *)node->obj;
2158	switch (lit->code_min)
2159	{
2160	case TAG:
2161	if (lit->code_max >= 0)
2162	{
2163	if (tags != NULL)
2164	{
2165	/* Add the tag to `tags'. */
2166	for (i = 0; tags[i] >= 0; i++)
2167	if (tags[i] == lit->code_max)
2168	break;
2169	if (tags[i] < 0)
2170	{
2171	tags[i] = lit->code_max;
2172	tags[i + 1] = -1;
2173	}
2174	}
2175	if (num_tags_seen)
2176	(*num_tags_seen)++;
2177	}
2178	break;
2179	case ASSERTION:
2180	assert(lit->code_max >= 1
2181	\|\| lit->code_max <= ASSERT_LAST);
2182	if (assertions != NULL)
2183	*assertions \|= lit->code_max;
2184	break;
2185	case EMPTY:
2186	break;
2187	default:
2188	assert(0);
2189	break;
2190	}
2191	break;
2192
2193	case UNION:
2194	/* Subexpressions starting earlier take priority over ones
2195	starting later, so we prefer the left subexpression over the
2196	right subexpression. */
2197	uni = (tre_union_t *)node->obj;
2198	if (uni->left->nullable)
2199	STACK_PUSHX(stack, voidptr, uni->left)
2200	else if (uni->right->nullable)
2201	STACK_PUSHX(stack, voidptr, uni->right)
2202	else
2203	assert(0);
2204	break;
2205
2206	case CATENATION:
2207	/* The path must go through both children. */
2208	cat = (tre_catenation_t *)node->obj;
2209	assert(cat->left->nullable);
2210	assert(cat->right->nullable);
2211	STACK_PUSHX(stack, voidptr, cat->left);
2212	STACK_PUSHX(stack, voidptr, cat->right);
2213	break;
2214
2215	case ITERATION:
2216	/* A match with an empty string is preferred over no match at
2217	all, so we go through the argument if possible. */
2218	iter = (tre_iteration_t *)node->obj;
2219	if (iter->arg->nullable)
2220	STACK_PUSHX(stack, voidptr, iter->arg);
2221	break;
2222
2223	default:
2224	assert(0);
2225	break;
2226	}
2227	}
2228
2229	return status;
2230	}
2231
2232
/* Work items pushed on the explicit stack used by tre_compute_nfl(). */
typedef enum {
  NFL_RECURSE = 0,         /* Visit a node. */
  NFL_POST_UNION = 1,      /* Both operands of a union are done. */
  NFL_POST_CATENATION = 2, /* Both operands of a catenation are done. */
  NFL_POST_ITERATION = 3   /* Argument of an iteration is done. */
} tre_nfl_stack_symbol_t;
2239
2240
2241	/* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
2242	the nodes of the AST `tree'. */
2243	static reg_errcode_t
2244	tre_compute_nfl(tre_mem_t mem, tre_stack_t stack, tre_ast_node_t tree)
2245	{
2246	int bottom = tre_stack_num_objects(stack);
2247
2248	STACK_PUSHR(stack, voidptr, tree);
2249	STACK_PUSHR(stack, int, NFL_RECURSE);
2250
2251	while (tre_stack_num_objects(stack) > bottom)
2252	{
2253	tre_nfl_stack_symbol_t symbol;
2254	tre_ast_node_t *node;
2255
2256	symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
2257	node = tre_stack_pop_voidptr(stack);
2258	switch (symbol)
2259	{
2260	case NFL_RECURSE:
2261	switch (node->type)
2262	{
2263	case LITERAL:
2264	{
2265	tre_literal_t lit = (tre_literal_t )node->obj;
2266	if (IS_BACKREF(lit))
2267	{
2268	/* Back references: nullable = false, firstpos = {i},
2269	lastpos = {i}. */
2270	node->nullable = 0;
2271	node->firstpos = tre_set_one(mem, lit->position, 0,
2272	TRE_CHAR_MAX, 0, NULL, -1);
2273	if (!node->firstpos)
2274	return REG_ESPACE;
2275	node->lastpos = tre_set_one(mem, lit->position, 0,
2276	TRE_CHAR_MAX, 0, NULL,
2277	(int)lit->code_max);
2278	if (!node->lastpos)
2279	return REG_ESPACE;
2280	}
2281	else if (lit->code_min < 0)
2282	{
2283	/* Tags, empty strings, params, and zero width assertions:
2284	nullable = true, firstpos = {}, and lastpos = {}. */
2285	node->nullable = 1;
2286	node->firstpos = tre_set_empty(mem);
2287	if (!node->firstpos)
2288	return REG_ESPACE;
2289	node->lastpos = tre_set_empty(mem);
2290	if (!node->lastpos)
2291	return REG_ESPACE;
2292	}
2293	else
2294	{
2295	/* Literal at position i: nullable = false, firstpos = {i},
2296	lastpos = {i}. */
2297	node->nullable = 0;
2298	node->firstpos =
2299	tre_set_one(mem, lit->position, (int)lit->code_min,
2300	(int)lit->code_max, 0, NULL, -1);
2301	if (!node->firstpos)
2302	return REG_ESPACE;
2303	node->lastpos = tre_set_one(mem, lit->position,
2304	(int)lit->code_min,
2305	(int)lit->code_max,
2306	lit->class, lit->neg_classes,
2307	-1);
2308	if (!node->lastpos)
2309	return REG_ESPACE;
2310	}
2311	break;
2312	}
2313
2314	case UNION:
2315	/* Compute the attributes for the two subtrees, and after that
2316	for this node. */
2317	STACK_PUSHR(stack, voidptr, node);
2318	STACK_PUSHR(stack, int, NFL_POST_UNION);
2319	STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
2320	STACK_PUSHR(stack, int, NFL_RECURSE);
2321	STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
2322	STACK_PUSHR(stack, int, NFL_RECURSE);
2323	break;
2324
2325	case CATENATION:
2326	/* Compute the attributes for the two subtrees, and after that
2327	for this node. */
2328	STACK_PUSHR(stack, voidptr, node);
2329	STACK_PUSHR(stack, int, NFL_POST_CATENATION);
2330	STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
2331	STACK_PUSHR(stack, int, NFL_RECURSE);
2332	STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
2333	STACK_PUSHR(stack, int, NFL_RECURSE);
2334	break;
2335
2336	case ITERATION:
2337	/* Compute the attributes for the subtree, and after that for
2338	this node. */
2339	STACK_PUSHR(stack, voidptr, node);
2340	STACK_PUSHR(stack, int, NFL_POST_ITERATION);
2341	STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
2342	STACK_PUSHR(stack, int, NFL_RECURSE);
2343	break;
2344	}
2345	break; /* end case: NFL_RECURSE */
2346
2347	case NFL_POST_UNION:
2348	{
2349	tre_union_t uni = (tre_union_t )node->obj;
2350	node->nullable = uni->left->nullable \|\| uni->right->nullable;
2351	node->firstpos = tre_set_union(mem, uni->left->firstpos,
2352	uni->right->firstpos, NULL, 0);
2353	if (!node->firstpos)
2354	return REG_ESPACE;
2355	node->lastpos = tre_set_union(mem, uni->left->lastpos,
2356	uni->right->lastpos, NULL, 0);
2357	if (!node->lastpos)
2358	return REG_ESPACE;
2359	break;
2360	}
2361
2362	case NFL_POST_ITERATION:
2363	{
2364	tre_iteration_t iter = (tre_iteration_t )node->obj;
2365
2366	if (iter->min == 0 \|\| iter->arg->nullable)
2367	node->nullable = 1;
2368	else
2369	node->nullable = 0;
2370	node->firstpos = iter->arg->firstpos;
2371	node->lastpos = iter->arg->lastpos;
2372	break;
2373	}
2374
2375	case NFL_POST_CATENATION:
2376	{
2377	int num_tags, *tags, assertions;
2378	reg_errcode_t status;
2379	tre_catenation_t *cat = node->obj;
2380	node->nullable = cat->left->nullable && cat->right->nullable;
2381
2382	/* Compute firstpos. */
2383	if (cat->left->nullable)
2384	{
2385	/* The left side matches the empty string. Make a first pass
2386	with tre_match_empty() to get the number of tags and
2387	parameters. */
2388	status = tre_match_empty(stack, cat->left,
2389	NULL, NULL, &num_tags);
2390	if (status != REG_OK)
2391	return status;
2392	/* Allocate arrays for the tags and parameters. */
2393	tags = xmalloc(sizeof(tags) (num_tags + 1));
2394	if (!tags)
2395	return REG_ESPACE;
2396	tags[0] = -1;
2397	assertions = 0;
2398	/* Second pass with tre_mach_empty() to get the list of
2399	tags and parameters. */
2400	status = tre_match_empty(stack, cat->left, tags,
2401	&assertions, NULL);
2402	if (status != REG_OK)
2403	{
2404	xfree(tags);
2405	return status;
2406	}
2407	node->firstpos =
2408	tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
2409	tags, assertions);
2410	xfree(tags);
2411	if (!node->firstpos)
2412	return REG_ESPACE;
2413	}
2414	else
2415	{
2416	node->firstpos = cat->left->firstpos;
2417	}
2418
2419	/* Compute lastpos. */
2420	if (cat->right->nullable)
2421	{
2422	/* The right side matches the empty string. Make a first pass
2423	with tre_match_empty() to get the number of tags and
2424	parameters. */
2425	status = tre_match_empty(stack, cat->right,
2426	NULL, NULL, &num_tags);
2427	if (status != REG_OK)
2428	return status;
2429	/* Allocate arrays for the tags and parameters. */
2430	tags = xmalloc(sizeof(int) * (num_tags + 1));
2431	if (!tags)
2432	return REG_ESPACE;
2433	tags[0] = -1;
2434	assertions = 0;
2435	/* Second pass with tre_mach_empty() to get the list of
2436	tags and parameters. */
2437	status = tre_match_empty(stack, cat->right, tags,
2438	&assertions, NULL);
2439	if (status != REG_OK)
2440	{
2441	xfree(tags);
2442	return status;
2443	}
2444	node->lastpos =
2445	tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
2446	tags, assertions);
2447	xfree(tags);
2448	if (!node->lastpos)
2449	return REG_ESPACE;
2450	}
2451	else
2452	{
2453	node->lastpos = cat->right->lastpos;
2454	}
2455	break;
2456	}
2457
2458	default:
2459	assert(0);
2460	break;
2461	}
2462	}
2463
2464	return REG_OK;
2465	}
2466
2467
2468	/* Adds a transition from each position in `p1' to each position in `p2'. */
2469	static reg_errcode_t
2470	tre_make_trans(tre_pos_and_tags_t p1, tre_pos_and_tags_t p2,
2471	tre_tnfa_transition_t *transitions,
2472	int counts, int offs)
2473	{
2474	tre_pos_and_tags_t *orig_p2 = p2;
2475	tre_tnfa_transition_t *trans;
2476	int i, j, k, l, dup, prev_p2_pos;
2477
2478	if (transitions != NULL)
2479	while (p1->position >= 0)
2480	{
2481	p2 = orig_p2;
2482	prev_p2_pos = -1;
2483	while (p2->position >= 0)
2484	{
2485	/* Optimization: if this position was already handled, skip it. */
2486	if (p2->position == prev_p2_pos)
2487	{
2488	p2++;
2489	continue;
2490	}
2491	prev_p2_pos = p2->position;
2492	/* Set `trans' to point to the next unused transition from
2493	position `p1->position'. */
2494	trans = transitions + offs[p1->position];
2495	while (trans->state != NULL)
2496	{
2497	#if 0
2498	/* If we find a previous transition from `p1->position' to
2499	`p2->position', it is overwritten. This can happen only
2500	if there are nested loops in the regexp, like in "((a))".
2501	In POSIX.2 repetition using the outer loop is always
2502	preferred over using the inner loop. Therefore the
2503	transition for the inner loop is useless and can be thrown
2504	away. */
2505	/* XXX - The same position is used for all nodes in a bracket
2506	expression, so this optimization cannot be used (it will
2507	break bracket expressions) unless I figure out a way to
2508	detect it here. */
2509	if (trans->state_id == p2->position)
2510	{
2511	break;
2512	}
2513	#endif
2514	trans++;
2515	}
2516
2517	if (trans->state == NULL)
2518	(trans + 1)->state = NULL;
2519	/* Use the character ranges, assertions, etc. from `p1' for
2520	the transition from `p1' to `p2'. */
2521	trans->code_min = p1->code_min;
2522	trans->code_max = p1->code_max;
2523	trans->state = transitions + offs[p2->position];
2524	trans->state_id = p2->position;
2525	trans->assertions = p1->assertions \| p2->assertions
2526	\| (p1->class ? ASSERT_CHAR_CLASS : 0)
2527	\| (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
2528	if (p1->backref >= 0)
2529	{
2530	assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
2531	assert(p2->backref < 0);
2532	trans->u.backref = p1->backref;
2533	trans->assertions \|= ASSERT_BACKREF;
2534	}
2535	else
2536	trans->u.class = p1->class;
2537	if (p1->neg_classes != NULL)
2538	{
2539	for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
2540	trans->neg_classes =
2541	xmalloc(sizeof(trans->neg_classes) (i + 1));
2542	if (trans->neg_classes == NULL)
2543	return REG_ESPACE;
2544	for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
2545	trans->neg_classes[i] = p1->neg_classes[i];
2546	trans->neg_classes[i] = (tre_ctype_t)0;
2547	}
2548	else
2549	trans->neg_classes = NULL;
2550
2551	/* Find out how many tags this transition has. */
2552	i = 0;
2553	if (p1->tags != NULL)
2554	while(p1->tags[i] >= 0)
2555	i++;
2556	j = 0;
2557	if (p2->tags != NULL)
2558	while(p2->tags[j] >= 0)
2559	j++;
2560
2561	/* If we are overwriting a transition, free the old tag array. */
2562	if (trans->tags != NULL)
2563	xfree(trans->tags);
2564	trans->tags = NULL;
2565
2566	/* If there were any tags, allocate an array and fill it. */
2567	if (i + j > 0)
2568	{
2569	trans->tags = xmalloc(sizeof(trans->tags) (i + j + 1));
2570	if (!trans->tags)
2571	return REG_ESPACE;
2572	i = 0;
2573	if (p1->tags != NULL)
2574	while(p1->tags[i] >= 0)
2575	{
2576	trans->tags[i] = p1->tags[i];
2577	i++;
2578	}
2579	l = i;
2580	j = 0;
2581	if (p2->tags != NULL)
2582	while (p2->tags[j] >= 0)
2583	{
2584	/* Don't add duplicates. */
2585	dup = 0;
2586	for (k = 0; k < i; k++)
2587	if (trans->tags[k] == p2->tags[j])
2588	{
2589	dup = 1;
2590	break;
2591	}
2592	if (!dup)
2593	trans->tags[l++] = p2->tags[j];
2594	j++;
2595	}
2596	trans->tags[l] = -1;
2597	}
2598
2599	p2++;
2600	}
2601	p1++;
2602	}
2603	else
2604	/* Compute a maximum limit for the number of transitions leaving
2605	from each state. */
2606	while (p1->position >= 0)
2607	{
2608	p2 = orig_p2;
2609	while (p2->position >= 0)
2610	{
2611	counts[p1->position]++;
2612	p2++;
2613	}
2614	p1++;
2615	}
2616	return REG_OK;
2617	}
2618
2619	/* Converts the syntax tree to a TNFA. All the transitions in the TNFA are
2620	labelled with one character range (there are no transitions on empty
2621	strings). The TNFA takes O(n^2) space in the worst case, `n' is size of
2622	the regexp. */
2623	static reg_errcode_t
2624	tre_ast_to_tnfa(tre_ast_node_t node, tre_tnfa_transition_t transitions,
2625	int counts, int offs)
2626	{
2627	tre_union_t *uni;
2628	tre_catenation_t *cat;
2629	tre_iteration_t *iter;
2630	reg_errcode_t errcode = REG_OK;
2631
2632	/* XXX - recurse using a stack!. */
2633	switch (node->type)
2634	{
2635	case LITERAL:
2636	break;
2637	case UNION:
2638	uni = (tre_union_t *)node->obj;
2639	errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
2640	if (errcode != REG_OK)
2641	return errcode;
2642	errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
2643	break;
2644
2645	case CATENATION:
2646	cat = (tre_catenation_t *)node->obj;
2647	/* Add a transition from each position in cat->left->lastpos
2648	to each position in cat->right->firstpos. */
2649	errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
2650	transitions, counts, offs);
2651	if (errcode != REG_OK)
2652	return errcode;
2653	errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
2654	if (errcode != REG_OK)
2655	return errcode;
2656	errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
2657	break;
2658
2659	case ITERATION:
2660	iter = (tre_iteration_t *)node->obj;
2661	assert(iter->max == -1 \|\| iter->max == 1);
2662
2663	if (iter->max == -1)
2664	{
2665	assert(iter->min == 0 \|\| iter->min == 1);
2666	/* Add a transition from each last position in the iterated
2667	expression to each first position. */
2668	errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
2669	transitions, counts, offs);
2670	if (errcode != REG_OK)
2671	return errcode;
2672	}
2673	errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
2674	break;
2675	}
2676	return errcode;
2677	}
2678
2679
/* Store `err' in the enclosing function's `errcode' variable and jump to
   its `error_exit' label.  Both names must exist in the function that uses
   the macro.  The constant conditions are marked with CONSTCOND comments;
   the do/while (0) wrapper makes the macro behave as a single statement. */
#define ERROR_EXIT(err)           \
  do                              \
    {                             \
      errcode = (err);            \
      if (/*CONSTCOND*/1)         \
        goto error_exit;          \
    }                             \
  while (/*CONSTCOND*/0)
2688
2689
2690	int
2691	regcomp(regex_t restrict preg, const char restrict regex, int cflags)
2692	{
2693	tre_stack_t *stack;
2694	tre_ast_node_t tree, tmp_ast_l, *tmp_ast_r;
2695	tre_pos_and_tags_t *p;
2696	int counts = NULL, offs = NULL;
2697	int i, add = 0;
2698	tre_tnfa_transition_t transitions, initial;
2699	tre_tnfa_t *tnfa = NULL;
2700	tre_submatch_data_t *submatch_data;
2701	tre_tag_direction_t *tag_directions = NULL;
2702	reg_errcode_t errcode;
2703	tre_mem_t mem;
2704
2705	/* Parse context. */
2706	tre_parse_ctx_t parse_ctx;
2707
2708	/* Allocate a stack used throughout the compilation process for various
2709	purposes. */
2710	stack = tre_stack_new(512, 1024000, 128);
2711	if (!stack)
2712	return REG_ESPACE;
2713	/* Allocate a fast memory allocator. */
2714	mem = tre_mem_new();
2715	if (!mem)
2716	{
2717	tre_stack_destroy(stack);
2718	return REG_ESPACE;
2719	}
2720
2721	/* Parse the regexp. */
2722	memset(&parse_ctx, 0, sizeof(parse_ctx));
2723	parse_ctx.mem = mem;
2724	parse_ctx.stack = stack;
2725	parse_ctx.start = regex;
2726	parse_ctx.cflags = cflags;
2727	parse_ctx.max_backref = -1;
2728	errcode = tre_parse(&parse_ctx);
2729	if (errcode != REG_OK)
2730	ERROR_EXIT(errcode);
2731	preg->re_nsub = parse_ctx.submatch_id - 1;
2732	tree = parse_ctx.n;
2733
2734	#ifdef TRE_DEBUG
2735	tre_ast_print(tree);
2736	#endif /* TRE_DEBUG */
2737
2738	/* Referring to nonexistent subexpressions is illegal. */
2739	if (parse_ctx.max_backref > (int)preg->re_nsub)
2740	ERROR_EXIT(REG_ESUBREG);
2741
2742	/* Allocate the TNFA struct. */
2743	tnfa = xcalloc(1, sizeof(tre_tnfa_t));
2744	if (tnfa == NULL)
2745	ERROR_EXIT(REG_ESPACE);
2746	tnfa->have_backrefs = parse_ctx.max_backref >= 0;
2747	tnfa->have_approx = 0;
2748	tnfa->num_submatches = parse_ctx.submatch_id;
2749
2750	/* Set up tags for submatch addressing. If REG_NOSUB is set and the
2751	regexp does not have back references, this can be skipped. */
2752	if (tnfa->have_backrefs \|\| !(cflags & REG_NOSUB))
2753	{
2754
2755	/* Figure out how many tags we will need. */
2756	errcode = tre_add_tags(NULL, stack, tree, tnfa);
2757	if (errcode != REG_OK)
2758	ERROR_EXIT(errcode);
2759
2760	if (tnfa->num_tags > 0)
2761	{
2762	tag_directions = xmalloc(sizeof(*tag_directions)
2763	* (tnfa->num_tags + 1));
2764	if (tag_directions == NULL)
2765	ERROR_EXIT(REG_ESPACE);
2766	tnfa->tag_directions = tag_directions;
2767	memset(tag_directions, -1,
2768	sizeof(tag_directions) (tnfa->num_tags + 1));
2769	}
2770	tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
2771	sizeof(*tnfa->minimal_tags));
2772	if (tnfa->minimal_tags == NULL)
2773	ERROR_EXIT(REG_ESPACE);
2774
2775	submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
2776	sizeof(*submatch_data));
2777	if (submatch_data == NULL)
2778	ERROR_EXIT(REG_ESPACE);
2779	tnfa->submatch_data = submatch_data;
2780
2781	errcode = tre_add_tags(mem, stack, tree, tnfa);
2782	if (errcode != REG_OK)
2783	ERROR_EXIT(errcode);
2784
2785	}
2786
2787	/* Expand iteration nodes. */
2788	errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
2789	tag_directions);
2790	if (errcode != REG_OK)
2791	ERROR_EXIT(errcode);
2792
2793	/* Add a dummy node for the final state.
2794	XXX - For certain patterns this dummy node can be optimized away,
2795	for example "a" or "ab". Figure out a simple way to detect
2796	this possibility. */
2797	tmp_ast_l = tree;
2798	tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
2799	if (tmp_ast_r == NULL)
2800	ERROR_EXIT(REG_ESPACE);
2801
2802	tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
2803	if (tree == NULL)
2804	ERROR_EXIT(REG_ESPACE);
2805
2806	errcode = tre_compute_nfl(mem, stack, tree);
2807	if (errcode != REG_OK)
2808	ERROR_EXIT(errcode);
2809
2810	counts = xmalloc(sizeof(int) * parse_ctx.position);
2811	if (counts == NULL)
2812	ERROR_EXIT(REG_ESPACE);
2813
2814	offs = xmalloc(sizeof(int) * parse_ctx.position);
2815	if (offs == NULL)
2816	ERROR_EXIT(REG_ESPACE);
2817
2818	for (i = 0; i < parse_ctx.position; i++)
2819	counts[i] = 0;
2820	tre_ast_to_tnfa(tree, NULL, counts, NULL);
2821
2822	add = 0;
2823	for (i = 0; i < parse_ctx.position; i++)
2824	{
2825	offs[i] = add;
2826	add += counts[i] + 1;
2827	counts[i] = 0;
2828	}
2829	transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
2830	if (transitions == NULL)
2831	ERROR_EXIT(REG_ESPACE);
2832	tnfa->transitions = transitions;
2833	tnfa->num_transitions = add;
2834
2835	errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
2836	if (errcode != REG_OK)
2837	ERROR_EXIT(errcode);
2838
2839	tnfa->firstpos_chars = NULL;
2840
2841	p = tree->firstpos;
2842	i = 0;
2843	while (p->position >= 0)
2844	{
2845	i++;
2846	p++;
2847	}
2848
2849	initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
2850	if (initial == NULL)
2851	ERROR_EXIT(REG_ESPACE);
2852	tnfa->initial = initial;
2853
2854	i = 0;
2855	for (p = tree->firstpos; p->position >= 0; p++)
2856	{
2857	initial[i].state = transitions + offs[p->position];
2858	initial[i].state_id = p->position;
2859	initial[i].tags = NULL;
2860	/* Copy the arrays p->tags, and p->params, they are allocated
2861	from a tre_mem object. */
2862	if (p->tags)
2863	{
2864	int j;
2865	for (j = 0; p->tags[j] >= 0; j++);
2866	initial[i].tags = xmalloc(sizeof(p->tags) (j + 1));
2867	if (!initial[i].tags)
2868	ERROR_EXIT(REG_ESPACE);
2869	memcpy(initial[i].tags, p->tags, sizeof(p->tags) (j + 1));
2870	}
2871	initial[i].assertions = p->assertions;
2872	i++;
2873	}
2874	initial[i].state = NULL;
2875
2876	tnfa->num_transitions = add;
2877	tnfa->final = transitions + offs[tree->lastpos[0].position];
2878	tnfa->num_states = parse_ctx.position;
2879	tnfa->cflags = cflags;
2880
2881	tre_mem_destroy(mem);
2882	tre_stack_destroy(stack);
2883	xfree(counts);
2884	xfree(offs);
2885
2886	preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2887	return REG_OK;
2888
2889	error_exit:
2890	/* Free everything that was allocated and return the error code. */
2891	tre_mem_destroy(mem);
2892	if (stack != NULL)
2893	tre_stack_destroy(stack);
2894	if (counts != NULL)
2895	xfree(counts);
2896	if (offs != NULL)
2897	xfree(offs);
2898	preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2899	regfree(preg);
2900	return errcode;
2901	}
2902
2903
2904
2905
2906	void
2907	regfree(regex_t *preg)
2908	{
2909	tre_tnfa_t *tnfa;
2910	unsigned int i;
2911	tre_tnfa_transition_t *trans;
2912
2913	tnfa = (void *)preg->TRE_REGEX_T_FIELD;
2914	if (!tnfa)
2915	return;
2916
2917	for (i = 0; i < tnfa->num_transitions; i++)
2918	if (tnfa->transitions[i].state)
2919	{
2920	if (tnfa->transitions[i].tags)
2921	xfree(tnfa->transitions[i].tags);
2922	if (tnfa->transitions[i].neg_classes)
2923	xfree(tnfa->transitions[i].neg_classes);
2924	}
2925	if (tnfa->transitions)
2926	xfree(tnfa->transitions);
2927
2928	if (tnfa->initial)
2929	{
2930	for (trans = tnfa->initial; trans->state; trans++)
2931	{
2932	if (trans->tags)
2933	xfree(trans->tags);
2934	}
2935	xfree(tnfa->initial);
2936	}
2937
2938	if (tnfa->submatch_data)
2939	{
2940	for (i = 0; i < tnfa->num_submatches; i++)
2941	if (tnfa->submatch_data[i].parents)
2942	xfree(tnfa->submatch_data[i].parents);
2943	xfree(tnfa->submatch_data);
2944	}
2945
2946	if (tnfa->tag_directions)
2947	xfree(tnfa->tag_directions);
2948	if (tnfa->firstpos_chars)
2949	xfree(tnfa->firstpos_chars);
2950	if (tnfa->minimal_tags)
2951	xfree(tnfa->minimal_tags);
2952	xfree(tnfa);
2953	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: asp3_tinet_ecnl_rx/trunk/musl-1.1.18/src/regex/regcomp.c@ 403

Download in other formats: