From 616539252c0a0df3e80af993c80f2cb5f4785366 Mon Sep 17 00:00:00 2001 From: hyung-hwan Date: Wed, 2 Dec 2020 16:07:06 +0000 Subject: [PATCH] made tre_ast_to_tnfa() iterative fixed a bug in tre_expand_macro() fixed a bug in tre_parse() --- hawk/lib/tre-compile.c | 80 ++++++++++++++++++++++++++++++++++++++---- hawk/lib/tre-mem.c | 8 +++-- hawk/lib/tre-parse.c | 45 ++++++++++++++++++------ hawk/lib/tre-parse.h | 5 ++- 4 files changed, 116 insertions(+), 22 deletions(-) diff --git a/hawk/lib/tre-compile.c b/hawk/lib/tre-compile.c index 83647d12..7758bc71 100644 --- a/hawk/lib/tre-compile.c +++ b/hawk/lib/tre-compile.c @@ -58,13 +58,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -/* - TODO: - - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive - function calls. -*/ - - #include #include "tre-stack.h" #include "tre-ast.h" @@ -758,6 +751,9 @@ tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, *result = tre_ast_new_literal(mem, min, max, pos); if (*result == NULL) status = REG_ESPACE; + /* HAWK: carry the class of the source literal over to the copy; skipped when the allocation just above failed, because *result is NULL then */ + if (*result != NULL) ((tre_literal_t*)(*result)->obj)->u.class = lit->u.class; + /* END HAWK */ if (pos > *max_pos) *max_pos = pos; break; @@ -1812,6 +1808,8 @@ tre_make_trans(hawk_gem_t* gem, tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2, labelled with one character range (there are no transitions on empty strings). The TNFA takes O(n^2) space in the worst case, `n' is size of the regexp. 
*/ +/* HAWK */ +#if 0 static reg_errcode_t tre_ast_to_tnfa(hawk_gem_t* gem, tre_ast_node_t *node, tre_tnfa_transition_t *transitions, int *counts, int *offs) @@ -1867,7 +1865,75 @@ tre_ast_to_tnfa(hawk_gem_t* gem, tre_ast_node_t *node, tre_tnfa_transition_t *tr } return errcode; } +#endif +static reg_errcode_t +__tre_ast_to_tnfa(hawk_gem_t *gem, tre_stack_t* stack, tre_ast_node_t *node, tre_tnfa_transition_t *transitions, int *counts, int *offs) +{ /* Iterative counterpart of the recursive tre_ast_to_tnfa() that is now disabled with #if 0: pending AST nodes are kept on the caller-supplied stack instead of the C call stack. Returns REG_OK when no pending node is left, the first non-REG_OK code reported by tre_make_trans(), or REG_BADBR for an iteration whose bounds are not handled here. NOTE(review): identifiers beginning with two underscores are reserved in C - consider renaming this helper. */ + tre_union_t *uni; + tre_catenation_t *cat; + tre_iteration_t *iter; + reg_errcode_t errcode = REG_OK; + STACK_PUSHR(stack, voidptr, node); /* seed the work list with the root; STACK_PUSHR is defined outside this patch - presumably it returns from this function when the push fails, TODO confirm */ + /* handle pending nodes until the stack holds no more objects */ + while (tre_stack_num_objects(stack)) + { + node = (tre_ast_node_t*)tre_stack_pop_voidptr(stack); + + switch (node->type) + { + case LITERAL: + break; /* nothing is added for a literal */ + + case UNION: + uni = (tre_union_t *)node->obj; + STACK_PUSHR(stack, voidptr, uni->right); /* right is pushed before left - presumably so that left is popped first (assumes a LIFO stack, TODO confirm) */ + STACK_PUSHR(stack, voidptr, uni->left); + break; + + case CATENATION: + cat = (tre_catenation_t *)node->obj; + /* Add a transition from each position in cat->left->lastpos to each position in cat->right->firstpos. */ + errcode = tre_make_trans(gem, cat->left->lastpos, cat->right->firstpos, transitions, counts, offs); + if (errcode != REG_OK) return errcode; + + STACK_PUSHR(stack, voidptr, cat->right); + STACK_PUSHR(stack, voidptr, cat->left); + break; + + case ITERATION: + iter = (tre_iteration_t *)node->obj; + if(!(iter->max == -1 || iter->max == 1)) return REG_BADBR; /* only a max of -1 or 1 is accepted here */ + + if (iter->max == -1) + { + if(!(iter->min == 0 || iter->min == 1)) return REG_BADBR; /* with max == -1 only a min of 0 or 1 is accepted */ + /* Add a transition from each last position in the iterated expression to each first position. 
*/ + errcode = tre_make_trans(gem, iter->arg->lastpos, iter->arg->firstpos, transitions, counts, offs); + if (errcode != REG_OK) return errcode; + } + STACK_PUSHR(stack, voidptr, iter->arg); + break; + } + } + return REG_OK; /* reached only when the loop ended without an early return */ +} + /* Entry point with the same signature as the disabled recursive version: creates a temporary stack, runs __tre_ast_to_tnfa() with it, destroys the stack and returns the result of that call; returns REG_ESPACE when the stack cannot be created. */ +static reg_errcode_t +tre_ast_to_tnfa(hawk_gem_t* gem, tre_ast_node_t *node, tre_tnfa_transition_t *transitions, int *counts, int *offs) +{ + reg_errcode_t x; + tre_stack_t* stack; + + stack = tre_stack_new(gem, 1024, -1, 4096); /* NOTE(review): the meaning of 1024, -1 and 4096 is decided by tre_stack_new(), which is not visible here - confirm that -1 does not cap the stack at its initial size */ + if (HAWK_UNLIKELY(!stack)) return REG_ESPACE; + + x = __tre_ast_to_tnfa(gem, stack, node, transitions, counts, offs); + + tre_stack_destroy(stack); /* destroyed whether or not the call above succeeded */ + return x; +} +/* END HAWK */ #define ERROR_EXIT(err) \ do \ diff --git a/hawk/lib/tre-mem.c b/hawk/lib/tre-mem.c index f1dc43d8..25ceb86f 100644 --- a/hawk/lib/tre-mem.c +++ b/hawk/lib/tre-mem.c @@ -114,13 +114,15 @@ hawk_tre_mem_alloc_impl(hawk_tre_mem_t mem, int provided, void *provided_block, } else { - int block_size; + /* HAWK */ + /* int block_size;*/ + hawk_oow_t block_size; + /* END HAWK */ if (size * 8 > TRE_MEM_BLOCK_SIZE) block_size = size * 8; else block_size = TRE_MEM_BLOCK_SIZE; - DPRINT(("tre_mem_alloc: allocating new %d byte block\n", - block_size)); + DPRINT(("tre_mem_alloc: allocating new %lu byte block\n", (unsigned long int)block_size)); /* the cast makes the argument match the %lu conversion */ l = xmalloc(mem->gem, sizeof(*l)); if (l == NULL) diff --git a/hawk/lib/tre-parse.c b/hawk/lib/tre-parse.c index d39095dd..d670aa46 100644 --- a/hawk/lib/tre-parse.c +++ b/hawk/lib/tre-parse.c @@ -134,8 +134,11 @@ tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end, unsigned int j; DPRINT(("Expanding macro '%c' => '%s'\n", tre_macros[i].c, tre_macros[i].expansion)); - for (j = 0; tre_macros[i].expansion[j] && j < buf_len; j++) + /* HAWK */ + /*for (j = 0; tre_macros[i].expansion[j] && j < buf_len; j++)*/ + for (j = 0; tre_macros[i].expansion[j] && j < buf_len - 1; j++) /* stops one element short of buf_len so that the terminating buf[j] = 0 written after the loop stays inside buf; NOTE(review): the type of buf_len is not visible here - if it is unsigned, confirm that callers never pass 0 */ buf[j] = tre_macros[i].expansion[j]; + /* END HAWK */ buf[j] = 0; break; } @@ -214,7 +217,10 @@ tre_compare_items(const void 
*a, const void *b, void* ctx) const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a; const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b; tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj; - int a_min = l_a->code_min, b_min = l_b->code_min; + /* HAWK: changed int to long */ + /*int a_min = l_a->code_min, b_min = l_b->code_min;*/ + long a_min = l_a->code_min, b_min = l_b->code_min; + /* END HAWK */ if (a_min < b_min) return -1; @@ -295,7 +301,10 @@ tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate, && *re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON) { const tre_char_t *endptr = re + 2; - int len; + /* HAWK: changed int to hawk_oow_t */ + /*int len;*/ + hawk_oow_t len; + /* END HAWK */ DPRINT(("tre_parse_bracket: class: '%.*" STRF "'\n", REST(re))); while (endptr < ctx->re_end && *endptr != CHAR_COLON) endptr++; if (endptr != ctx->re_end) @@ -557,14 +566,24 @@ parse_bracket_done: static int tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end) { + /* HAWK : added overflow check with other code optimizations. Reads the run of decimal digits at *regex (never past regex_end), moves *regex behind that run and returns its value; returns -1 when there is no digit or when the overflow check below fires. */ int num = -1; const tre_char_t *r = *regex; - while (r < regex_end && *r >= HAWK_T('0') && *r <= HAWK_T('9')) + /* the first digit is tested here, so the do-while below always starts on a digit */ + if (r < regex_end && *r >= HAWK_T('0') && *r <= HAWK_T('9')) { - if (num < 0) - num = 0; - num = num * 10 + *r - HAWK_T('0'); - r++; + int ever_overflowed = 0; + /* once the flag is set num is left untouched, so the check keeps firing and no signed overflow (undefined behavior in C) can happen */ + num = 0; + do + { + if (num > (HAWK_TYPE_MAX(int) - 9) / 10) ever_overflowed = 1; /* num * 10 + 9 would not fit in an int */ + else num = num * 10 + (*r - HAWK_T('0')); /* the digit value is computed first so that the intermediate sum cannot exceed the int range */ + r++; /* the digit is consumed even after an overflow was detected */ + } + while (r < regex_end && *r >= HAWK_T('0') && *r <= HAWK_T('9')); + + if (ever_overflowed) num = -1; /* same value as the one returned when no digit is present */ } *regex = r; return num; @@ -1355,9 +1374,13 @@ reg_errcode_t tre_parse(tre_parse_ctx_t *ctx) break; case CHAR_RPAREN: /* end of current subexpression */ - if ((ctx->cflags & REG_EXTENDED && depth > 0) + /* HAWK: fixed the condition */ + /* if ((ctx->cflags & REG_EXTENDED && depth > 0) || (ctx->re > ctx->re_start - && *(ctx->re - 1) == CHAR_BACKSLASH)) + && *(ctx->re - 1) == CHAR_BACKSLASH)) */ + if (((ctx->cflags & 
REG_EXTENDED) && depth > 0) || + (!(ctx->cflags & REG_EXTENDED) && ctx->re > ctx->re_start && *(ctx->re - 1) == CHAR_BACKSLASH)) /* when REG_EXTENDED is not set, this branch is taken only if a character precedes the current one and that character is a backslash */ + /* END HAWK */ { DPRINT(("tre_parse: empty: '%.*" STRF "'\n", REST(ctx->re))); /* We were expecting an atom, but instead the current @@ -1649,7 +1672,7 @@ reg_errcode_t tre_parse(tre_parse_ctx_t *ctx) /* We are expecting an atom. If the subexpression (or the whole - regexp ends here, we interpret it as an empty expression + regexp) ends here, we interpret it as an empty expression (which matches an empty string). */ if ( #ifdef REG_LITERAL diff --git a/hawk/lib/tre-parse.h b/hawk/lib/tre-parse.h index c55ef33a..c84dfe41 100644 --- a/hawk/lib/tre-parse.h +++ b/hawk/lib/tre-parse.h @@ -76,7 +76,10 @@ typedef struct const tre_char_t *re_start; /* The first character after the end of the regexp. */ const tre_char_t *re_end; - int len; + /* HAWK: changed int to hawk_oow_t */ + /*int len;*/ + hawk_oow_t len; + /* END HAWK */ /* Current submatch ID. */ int submatch_id; /* Current position (number of literal). */