/*
 * Review preamble (text ahead of the first "diff --git" header; not part of
 * any hunk).
 *
 * This is a unified diff touching ase/awk/rex.c, ase/awk/rex.h and
 * ase/test/awk/rex.c.  What the visible hunks do:
 *
 *  - Command enum: CMD_BOL_CHAR/CMD_EOL_CHAR become CMD_BOL/CMD_EOL,
 *    CMD_ORD_CHAR moves after CMD_ANY_CHAR, and CMD_GROUP is added.
 *  - struct __code: cmd changes from xp_byte_t to int and the cc member is
 *    removed; an ordinary character is now appended after the struct as a
 *    separate ADD_CODE of rex->ptn.curc.value.
 *  - __compile_expression: el is now written once at the end as
 *    (rex->code.size - old_size) instead of being summed per branch.
 *  - __compile_branch: a new atom counter (na) is emitted in front of bl,
 *    bl is written as (rex->code.size - old_size), and the function returns
 *    0 when no atom was compiled, 1 otherwise.
 *  - __compile_atom: '(' emits a CMD_GROUP code and then compiles a nested
 *    expression; a special character that is not handled now yields 0
 *    ("no atom") where it used to yield -1.
 *  - __compile_bound / __compile_range: take the struct __code of the atom
 *    just compiled and store lbound/ubound into it for '+', '*', '?' and
 *    '{...}'; the '{' case requires a closing '}' and returns -1 otherwise.
 *  - xp_awk_rex_print and the static __print_expression / __print_branch /
 *    __print_atom helpers are added to dump the compiled code; the function
 *    is declared in rex.h and called from the test program, whose pattern
 *    is extended to exercise groups, alternation and bounds.
 *
 * NOTE(review): nb, el, na, bl and cmd are pointers taken into
 *   rex->code.buf before later ADD_CODE calls.  The body of __add_code is
 *   not shown in this patch; if it can move or reallocate code.buf these
 *   pointers would dangle - confirm, or keep offsets instead of pointers.
 * NOTE(review): old_size is captured before the header fields are added in
 *   both __compile_expression and __compile_branch, so the stored el and bl
 *   include the size of nb/el and na/bl respectively, while the layout
 *   comment in rex.h describes them as excluding those fields - confirm
 *   which is intended.
 * NOTE(review): in __compile_range a range without a comma ("{n}") sets
 *   ubound to BOUND_MAX, and a comma followed by no digits ("{n,}") sets
 *   ubound to 0.  This looks swapped relative to the usual meaning of
 *   interval expressions - confirm.  The digit accumulation has no
 *   overflow check and the function always returns 0.
 * NOTE(review): __print_atom has no branch for CMD_CHAR_RANGE or
 *   CMD_CHAR_CLASS, and its fallback branch prints a debug message without
 *   advancing p.
 */
diff --git a/ase/awk/rex.c b/ase/awk/rex.c index 3b5e9f5a..cc8da1b8 100644 --- a/ase/awk/rex.c +++ b/ase/awk/rex.c @@ -1,5 +1,5 @@ /* - * $Id: rex.c,v 1.6 2006-07-20 03:41:00 bacon Exp $ + * $Id: rex.c,v 1.7 2006-07-20 16:21:54 bacon Exp $ */ #include @@ -26,12 +26,13 @@ enum enum { - CMD_BOL_CHAR, - CMD_EOL_CHAR, - CMD_ORD_CHAR, + CMD_BOL, + CMD_EOL, CMD_ANY_CHAR, + CMD_ORD_CHAR, CMD_CHAR_RANGE, - CMD_CHAR_CLASS + CMD_CHAR_CLASS, + CMD_GROUP }; enum @@ -53,10 +54,10 @@ enum struct __code { - xp_byte_t cmd; + //xp_byte_t cmd; + int cmd; xp_size_t lbound; xp_size_t ubound; - xp_char_t cc; /* optional */ }; #define NEXT_CHAR(rex,level) \ @@ -69,11 +70,15 @@ static int __compile_expression (xp_awk_rex_t* rex); static int __compile_branch (xp_awk_rex_t* rex); static int __compile_atom (xp_awk_rex_t* rex); static int __compile_charset (xp_awk_rex_t* rex); -static int __compile_bound (xp_awk_rex_t* rex); -static int __compile_range (xp_awk_rex_t* rex); +static int __compile_bound (xp_awk_rex_t* rex, struct __code* cmd); +static int __compile_range (xp_awk_rex_t* rex, struct __code* cmd); static int __next_char (xp_awk_rex_t* rex, int level); static int __add_code (xp_awk_rex_t* rex, void* data, xp_size_t len); +static const xp_byte_t* __print_expression (const xp_byte_t* p); +static const xp_byte_t* __print_branch (const xp_byte_t* p); +static const xp_byte_t* __print_atom (const xp_byte_t* p); + xp_awk_rex_t* xp_awk_rex_open (xp_awk_rex_t* rex) { if (rex == XP_NULL) @@ -104,7 +109,6 @@ void xp_awk_rex_close (xp_awk_rex_t* rex) int xp_awk_rex_compile (xp_awk_rex_t* rex, const xp_char_t* ptn, xp_size_t len) { - rex->ptn.ptr = ptn; rex->ptn.end = rex->ptn.ptr + len; rex->ptn.curp = rex->ptn.ptr; @@ -136,9 +140,12 @@ xp_printf (XP_T("garbage after expression\n")); static int __compile_expression (xp_awk_rex_t* rex) { xp_size_t zero = 0; - xp_size_t* nb, * el, * bl; + xp_size_t* nb, * el; + xp_size_t old_size; int n; + old_size = rex->code.size; + /* secure space for header 
and set the header fields to zero */ nb = (xp_size_t*)&rex->code.buf[rex->code.size]; ADD_CODE (rex, &zero, xp_sizeof(zero)); @@ -147,7 +154,6 @@ static int __compile_expression (xp_awk_rex_t* rex) ADD_CODE (rex, &zero, xp_sizeof(zero)); /* handle the first branch */ - bl = (xp_size_t*)&rex->code.buf[rex->code.size]; n = __compile_branch (rex); if (n == -1) return -1; if (n == 0) @@ -157,7 +163,6 @@ static int __compile_expression (xp_awk_rex_t* rex) } (*nb) += 1; - (*el) += *bl + xp_sizeof(*bl); /* handle subsequent branches if any */ while (rex->ptn.curc.type == __SPECIAL && @@ -165,7 +170,6 @@ static int __compile_expression (xp_awk_rex_t* rex) { NEXT_CHAR (rex, __TOP); - bl = (xp_size_t*)&rex->code.buf[rex->code.size]; n = __compile_branch(rex); if (n == -1) return -1; if (n == 0) @@ -179,26 +183,32 @@ static int __compile_expression (xp_awk_rex_t* rex) } (*nb) += 1; - (*el) += *bl + xp_sizeof(*bl); } + *el = rex->code.size - old_size; return 1; } static int __compile_branch (xp_awk_rex_t* rex) { int n; - xp_size_t* bl; + xp_size_t* na, * bl; xp_size_t old_size; xp_size_t zero = 0; + struct __code* cmd; old_size = rex->code.size; + na = (xp_size_t*)&rex->code.buf[rex->code.size]; + ADD_CODE (rex, &zero, xp_sizeof(zero)); + bl = (xp_size_t*)&rex->code.buf[rex->code.size]; ADD_CODE (rex, &zero, xp_sizeof(zero)); while (1) { + cmd = (struct __code*)&rex->code.buf[rex->code.size]; + n = __compile_atom (rex); if (n == -1) { @@ -208,7 +218,7 @@ static int __compile_branch (xp_awk_rex_t* rex) if (n == 0) break; /* no atom */ - n = __compile_bound (rex); + n = __compile_bound (rex, cmd); if (n == -1) { rex->code.size = old_size; @@ -217,27 +227,33 @@ static int __compile_branch (xp_awk_rex_t* rex) /* n == 0 no bound character. just continue */ /* n == 1 bound has been applied by compile_bound */ + + (*na) += 1; } - return 0; + *bl = rex->code.size - old_size; + return ((*na) == 0)? 
0: 1; } static int __compile_atom (xp_awk_rex_t* rex) { - int n = 0; + int n; - if (rex->ptn.curc.type == __EOF) - { - /* no atom */ - return 0; - } - else if (rex->ptn.curc.type == __SPECIAL) + if (rex->ptn.curc.type == __EOF) return 0; + + if (rex->ptn.curc.type == __SPECIAL) { if (rex->ptn.curc.value == XP_T('(')) { - // GROUP + struct __code tmp; + + tmp.cmd = CMD_GROUP; + tmp.lbound = 1; + tmp.ubound = 1; + + ADD_CODE (rex, &tmp, xp_sizeof(tmp)); NEXT_CHAR (rex, __TOP); - + n = __compile_expression (rex); if (n == -1) return -1; @@ -254,7 +270,7 @@ static int __compile_atom (xp_awk_rex_t* rex) { struct __code tmp; - tmp.cmd = CMD_BOL_CHAR; + tmp.cmd = CMD_BOL; tmp.lbound = 1; tmp.ubound = 1; @@ -265,7 +281,7 @@ static int __compile_atom (xp_awk_rex_t* rex) { struct __code tmp; - tmp.cmd = CMD_EOL_CHAR; + tmp.cmd = CMD_EOL; tmp.lbound = 1; tmp.ubound = 1; @@ -287,30 +303,26 @@ static int __compile_atom (xp_awk_rex_t* rex) { if (__compile_charset (rex) == -1) return -1; } - else - { - /*invalid special character....*/ - return -1; - } + else return 0; return 1; } else { - /* normal characters */ struct __code tmp; + xp_assert (rex->ptn.curc.type == __NORMAL); + tmp.cmd = CMD_ORD_CHAR; tmp.lbound = 1; tmp.ubound = 1; ADD_CODE (rex, &tmp, xp_sizeof(tmp)); - ADD_CODE (rex, &rex->ptn.curc, xp_sizeof(rex->ptn.curc)); + ADD_CODE (rex, &rex->ptn.curc.value, xp_sizeof(rex->ptn.curc.value)); NEXT_CHAR (rex, __TOP); return 1; } - } static int __compile_charset (xp_awk_rex_t* rex) @@ -318,7 +330,7 @@ static int __compile_charset (xp_awk_rex_t* rex) return -1; } -static int __compile_bound (xp_awk_rex_t* rex) +static int __compile_bound (xp_awk_rex_t* rex, struct __code* cmd) { if (rex->ptn.curc.type != __SPECIAL) return 0; @@ -326,28 +338,42 @@ static int __compile_bound (xp_awk_rex_t* rex) { case XP_T('+'): { - //__apply_bound (1, MAX); + cmd->lbound = 1; + cmd->ubound = BOUND_MAX; NEXT_CHAR(rex, __TOP); return 1; } case XP_T('*'): { - //__apply_bound (0, MAX); + 
cmd->lbound = 0; + cmd->ubound = BOUND_MAX; NEXT_CHAR(rex, __TOP); return 1; } case XP_T('?'): { - //__apply_bound (0, 1); + cmd->lbound = 0; + cmd->ubound = 1; NEXT_CHAR(rex, __TOP); return 1; } case XP_T('{'): { - if (__compile_range(rex) == -1) return -1; + NEXT_CHAR (rex, __IN_RANGE); + + if (__compile_range(rex, cmd) == -1) return -1; + + if (rex->ptn.curc.type != __SPECIAL || + rex->ptn.curc.value != XP_T('}')) + { + // rex->errnum = XP_AWK_REX_ERBRACE + return -1; + } + + NEXT_CHAR (rex, __TOP); return 1; } } @@ -355,9 +381,40 @@ static int __compile_bound (xp_awk_rex_t* rex) return 0; } -static int __compile_range (xp_awk_rex_t* rex) +static int __compile_range (xp_awk_rex_t* rex, struct __code* cmd) { - return -1; + xp_size_t bound; + +// TODO: should allow white spaces in the range??? +// what if it is not in the right format? convert it to ordinary characters?? + bound = 0; + while (rex->ptn.curc.type == __NORMAL && + xp_isdigit(rex->ptn.curc.value)) + { + bound = bound * 10 + rex->ptn.curc.value - XP_T('0'); + NEXT_CHAR (rex, __IN_RANGE); + } + + cmd->lbound = bound; + + if (rex->ptn.curc.type == __SPECIAL && + rex->ptn.curc.value == XP_T(',')) + { + NEXT_CHAR (rex, __IN_RANGE); + + bound = 0; + while (rex->ptn.curc.type == __NORMAL && + xp_isdigit(rex->ptn.curc.value)) + { + bound = bound * 10 + rex->ptn.curc.value - XP_T('0'); + NEXT_CHAR (rex, __IN_RANGE); + } + + cmd->ubound = bound; + } + else cmd->ubound = BOUND_MAX; + + return 0; } static int __next_char (xp_awk_rex_t* rex, int level) @@ -461,3 +518,101 @@ static int __add_code (xp_awk_rex_t* rex, void* data, xp_size_t len) return 0; } + +void xp_awk_rex_print (xp_awk_rex_t* rex) +{ + const xp_byte_t* p; + p = __print_expression (rex->code.buf); + xp_printf (XP_T("\n")); + xp_assert (p == rex->code.buf + rex->code.size); +} + +static const xp_byte_t* __print_expression (const xp_byte_t* p) +{ + xp_size_t nb, el, i; + + nb = *(xp_size_t*)p; p += xp_sizeof(nb); + el = *(xp_size_t*)p; p += 
xp_sizeof(el); +//xp_printf (XP_T("NA = %u, EL = %u\n"), +// (unsigned int)nb, (unsigned int)el); + + for (i = 0; i < nb; i++) + { + if (i != 0) xp_printf (XP_T("|")); + p = __print_branch (p); + } + + return p; +} + +static const xp_byte_t* __print_branch (const xp_byte_t* p) +{ + xp_size_t na, bl, i; + + na = *(xp_size_t*)p; p += xp_sizeof(na); + bl = *(xp_size_t*)p; p += xp_sizeof(bl); +//xp_printf (XP_T("NA = %u, BL = %u\n"), +// (unsigned int) na, (unsigned int)bl); + + for (i = 0; i < na; i++) + { + p = __print_atom (p); + } + + return p; +} + +static const xp_byte_t* __print_atom (const xp_byte_t* p) +{ + struct __code* cp = (struct __code*)p; + + if (cp->cmd == CMD_BOL) + { + xp_printf (XP_T("^")); + p += xp_sizeof(*cp); + } + else if (cp->cmd == CMD_EOL) + { + xp_printf (XP_T("$")); + p += xp_sizeof(*cp); + } + else if (cp->cmd == CMD_ANY_CHAR) + { + xp_printf (XP_T(".")); + p += xp_sizeof(*cp); + } + else if (cp->cmd == CMD_ORD_CHAR) + { + p += xp_sizeof(*cp); + xp_printf (XP_T("%c"), *(xp_char_t*)p); + p += xp_sizeof(xp_char_t); + } + else if (cp->cmd == CMD_GROUP) + { + p += xp_sizeof(*cp); + xp_printf (XP_T("(")); + p = __print_expression (p); + xp_printf (XP_T(")")); + } + else + { +xp_printf (XP_T("FUCK FUCK FUCK\n")); + } + + if (cp->lbound == 0 && cp->ubound == BOUND_MAX) + xp_printf (XP_T("*")); + else if (cp->lbound == 1 && cp->ubound == BOUND_MAX) + xp_printf (XP_T("+")); + else if (cp->lbound == 0 && cp->ubound == 1) + xp_printf (XP_T("?")); + else if (cp->lbound != 1 || cp->ubound != 1) + { + xp_printf (XP_T("{%lu,%lu}"), + (unsigned long)cp->lbound, (unsigned long)cp->ubound); + } + + + return p; +} + + diff --git a/ase/awk/rex.h b/ase/awk/rex.h index a09e9a65..69cc4486 100644 --- a/ase/awk/rex.h +++ b/ase/awk/rex.h @@ -1,5 +1,5 @@ /* - * $Id: rex.h,v 1.4 2006-07-20 03:41:00 bacon Exp $ + * $Id: rex.h,v 1.5 2006-07-20 16:21:54 bacon Exp $ **/ #ifndef _XP_AWK_REX_H_ @@ -17,12 +17,13 @@ * * Compiled form of a regular expression: * - * | 
expression | - * | header | branch | branch | branch | - * | nb | el | bl | cmd | arg | cmd | arg | bl | cmd | arg | bl | cmd | + * | expression | + * | header | branch | branch | branch | + * | nb | el | na | bl | cmd | arg | cmd | arg | na | bl | cmd | arg | na | bl | cmd | * * nb: the number of branches * el: the length of a expression excluding the length of nb and el + * na: the number of atoms * bl: the length of a branch excluding the length of bl * cmd: The command and repetition info encoded together. * Some commands require an argument to follow them but some other don't. @@ -67,6 +68,7 @@ extern "C" { xp_awk_rex_t* xp_awk_rex_open (xp_awk_rex_t* rex); void xp_awk_rex_close (xp_awk_rex_t* rex); int xp_awk_rex_compile (xp_awk_rex_t* rex, const xp_char_t* ptn, xp_size_t len); +void xp_awk_rex_print (xp_awk_rex_t* rex); #ifdef __cplusplus } diff --git a/ase/test/awk/rex.c b/ase/test/awk/rex.c index 822797f2..07523b3e 100644 --- a/ase/test/awk/rex.c +++ b/ase/test/awk/rex.c @@ -23,7 +23,7 @@ int xp_main (int argc, const xp_char_t* argv[]) } - ptn = XP_T("^he.llo"); + ptn = XP_T("^he.llo(jo(in|kk)s|com)+h*e{1,40}abc"); if (xp_awk_rex_compile (rex, ptn, xp_strlen(ptn)) == -1) { xp_printf (XP_T("cannot compile pattern...\n")); @@ -31,6 +31,7 @@ int xp_main (int argc, const xp_char_t* argv[]) return -1; } + xp_awk_rex_print (rex); xp_awk_rex_close (rex); return 0; }