*** empty log message ***

This commit is contained in:
hyung-hwan 2006-07-20 16:21:54 +00:00
parent 75a3eaeac0
commit 40b4aecca7
3 changed files with 207 additions and 49 deletions

View File

@ -1,5 +1,5 @@
/* /*
* $Id: rex.c,v 1.6 2006-07-20 03:41:00 bacon Exp $ * $Id: rex.c,v 1.7 2006-07-20 16:21:54 bacon Exp $
*/ */
#include <xp/awk/awk_i.h> #include <xp/awk/awk_i.h>
@ -26,12 +26,13 @@ enum
enum enum
{ {
CMD_BOL_CHAR, CMD_BOL,
CMD_EOL_CHAR, CMD_EOL,
CMD_ORD_CHAR,
CMD_ANY_CHAR, CMD_ANY_CHAR,
CMD_ORD_CHAR,
CMD_CHAR_RANGE, CMD_CHAR_RANGE,
CMD_CHAR_CLASS CMD_CHAR_CLASS,
CMD_GROUP
}; };
enum enum
@ -53,10 +54,10 @@ enum
struct __code struct __code
{ {
xp_byte_t cmd; //xp_byte_t cmd;
int cmd;
xp_size_t lbound; xp_size_t lbound;
xp_size_t ubound; xp_size_t ubound;
xp_char_t cc; /* optional */
}; };
#define NEXT_CHAR(rex,level) \ #define NEXT_CHAR(rex,level) \
@ -69,11 +70,15 @@ static int __compile_expression (xp_awk_rex_t* rex);
static int __compile_branch (xp_awk_rex_t* rex); static int __compile_branch (xp_awk_rex_t* rex);
static int __compile_atom (xp_awk_rex_t* rex); static int __compile_atom (xp_awk_rex_t* rex);
static int __compile_charset (xp_awk_rex_t* rex); static int __compile_charset (xp_awk_rex_t* rex);
static int __compile_bound (xp_awk_rex_t* rex); static int __compile_bound (xp_awk_rex_t* rex, struct __code* cmd);
static int __compile_range (xp_awk_rex_t* rex); static int __compile_range (xp_awk_rex_t* rex, struct __code* cmd);
static int __next_char (xp_awk_rex_t* rex, int level); static int __next_char (xp_awk_rex_t* rex, int level);
static int __add_code (xp_awk_rex_t* rex, void* data, xp_size_t len); static int __add_code (xp_awk_rex_t* rex, void* data, xp_size_t len);
static const xp_byte_t* __print_expression (const xp_byte_t* p);
static const xp_byte_t* __print_branch (const xp_byte_t* p);
static const xp_byte_t* __print_atom (const xp_byte_t* p);
xp_awk_rex_t* xp_awk_rex_open (xp_awk_rex_t* rex) xp_awk_rex_t* xp_awk_rex_open (xp_awk_rex_t* rex)
{ {
if (rex == XP_NULL) if (rex == XP_NULL)
@ -104,7 +109,6 @@ void xp_awk_rex_close (xp_awk_rex_t* rex)
int xp_awk_rex_compile (xp_awk_rex_t* rex, const xp_char_t* ptn, xp_size_t len) int xp_awk_rex_compile (xp_awk_rex_t* rex, const xp_char_t* ptn, xp_size_t len)
{ {
rex->ptn.ptr = ptn; rex->ptn.ptr = ptn;
rex->ptn.end = rex->ptn.ptr + len; rex->ptn.end = rex->ptn.ptr + len;
rex->ptn.curp = rex->ptn.ptr; rex->ptn.curp = rex->ptn.ptr;
@ -136,9 +140,12 @@ xp_printf (XP_T("garbage after expression\n"));
static int __compile_expression (xp_awk_rex_t* rex) static int __compile_expression (xp_awk_rex_t* rex)
{ {
xp_size_t zero = 0; xp_size_t zero = 0;
xp_size_t* nb, * el, * bl; xp_size_t* nb, * el;
xp_size_t old_size;
int n; int n;
old_size = rex->code.size;
/* secure space for header and set the header fields to zero */ /* secure space for header and set the header fields to zero */
nb = (xp_size_t*)&rex->code.buf[rex->code.size]; nb = (xp_size_t*)&rex->code.buf[rex->code.size];
ADD_CODE (rex, &zero, xp_sizeof(zero)); ADD_CODE (rex, &zero, xp_sizeof(zero));
@ -147,7 +154,6 @@ static int __compile_expression (xp_awk_rex_t* rex)
ADD_CODE (rex, &zero, xp_sizeof(zero)); ADD_CODE (rex, &zero, xp_sizeof(zero));
/* handle the first branch */ /* handle the first branch */
bl = (xp_size_t*)&rex->code.buf[rex->code.size];
n = __compile_branch (rex); n = __compile_branch (rex);
if (n == -1) return -1; if (n == -1) return -1;
if (n == 0) if (n == 0)
@ -157,7 +163,6 @@ static int __compile_expression (xp_awk_rex_t* rex)
} }
(*nb) += 1; (*nb) += 1;
(*el) += *bl + xp_sizeof(*bl);
/* handle subsequent branches if any */ /* handle subsequent branches if any */
while (rex->ptn.curc.type == __SPECIAL && while (rex->ptn.curc.type == __SPECIAL &&
@ -165,7 +170,6 @@ static int __compile_expression (xp_awk_rex_t* rex)
{ {
NEXT_CHAR (rex, __TOP); NEXT_CHAR (rex, __TOP);
bl = (xp_size_t*)&rex->code.buf[rex->code.size];
n = __compile_branch(rex); n = __compile_branch(rex);
if (n == -1) return -1; if (n == -1) return -1;
if (n == 0) if (n == 0)
@ -179,26 +183,32 @@ static int __compile_expression (xp_awk_rex_t* rex)
} }
(*nb) += 1; (*nb) += 1;
(*el) += *bl + xp_sizeof(*bl);
} }
*el = rex->code.size - old_size;
return 1; return 1;
} }
static int __compile_branch (xp_awk_rex_t* rex) static int __compile_branch (xp_awk_rex_t* rex)
{ {
int n; int n;
xp_size_t* bl; xp_size_t* na, * bl;
xp_size_t old_size; xp_size_t old_size;
xp_size_t zero = 0; xp_size_t zero = 0;
struct __code* cmd;
old_size = rex->code.size; old_size = rex->code.size;
na = (xp_size_t*)&rex->code.buf[rex->code.size];
ADD_CODE (rex, &zero, xp_sizeof(zero));
bl = (xp_size_t*)&rex->code.buf[rex->code.size]; bl = (xp_size_t*)&rex->code.buf[rex->code.size];
ADD_CODE (rex, &zero, xp_sizeof(zero)); ADD_CODE (rex, &zero, xp_sizeof(zero));
while (1) while (1)
{ {
cmd = (struct __code*)&rex->code.buf[rex->code.size];
n = __compile_atom (rex); n = __compile_atom (rex);
if (n == -1) if (n == -1)
{ {
@ -208,7 +218,7 @@ static int __compile_branch (xp_awk_rex_t* rex)
if (n == 0) break; /* no atom */ if (n == 0) break; /* no atom */
n = __compile_bound (rex); n = __compile_bound (rex, cmd);
if (n == -1) if (n == -1)
{ {
rex->code.size = old_size; rex->code.size = old_size;
@ -217,27 +227,33 @@ static int __compile_branch (xp_awk_rex_t* rex)
/* n == 0 no bound character. just continue */ /* n == 0 no bound character. just continue */
/* n == 1 bound has been applied by compile_bound */ /* n == 1 bound has been applied by compile_bound */
(*na) += 1;
} }
return 0; *bl = rex->code.size - old_size;
return ((*na) == 0)? 0: 1;
} }
static int __compile_atom (xp_awk_rex_t* rex) static int __compile_atom (xp_awk_rex_t* rex)
{ {
int n = 0; int n;
if (rex->ptn.curc.type == __EOF) if (rex->ptn.curc.type == __EOF) return 0;
{
/* no atom */ if (rex->ptn.curc.type == __SPECIAL)
return 0;
}
else if (rex->ptn.curc.type == __SPECIAL)
{ {
if (rex->ptn.curc.value == XP_T('(')) if (rex->ptn.curc.value == XP_T('('))
{ {
// GROUP struct __code tmp;
tmp.cmd = CMD_GROUP;
tmp.lbound = 1;
tmp.ubound = 1;
ADD_CODE (rex, &tmp, xp_sizeof(tmp));
NEXT_CHAR (rex, __TOP); NEXT_CHAR (rex, __TOP);
n = __compile_expression (rex); n = __compile_expression (rex);
if (n == -1) return -1; if (n == -1) return -1;
@ -254,7 +270,7 @@ static int __compile_atom (xp_awk_rex_t* rex)
{ {
struct __code tmp; struct __code tmp;
tmp.cmd = CMD_BOL_CHAR; tmp.cmd = CMD_BOL;
tmp.lbound = 1; tmp.lbound = 1;
tmp.ubound = 1; tmp.ubound = 1;
@ -265,7 +281,7 @@ static int __compile_atom (xp_awk_rex_t* rex)
{ {
struct __code tmp; struct __code tmp;
tmp.cmd = CMD_EOL_CHAR; tmp.cmd = CMD_EOL;
tmp.lbound = 1; tmp.lbound = 1;
tmp.ubound = 1; tmp.ubound = 1;
@ -287,30 +303,26 @@ static int __compile_atom (xp_awk_rex_t* rex)
{ {
if (__compile_charset (rex) == -1) return -1; if (__compile_charset (rex) == -1) return -1;
} }
else else return 0;
{
/*invalid special character....*/
return -1;
}
return 1; return 1;
} }
else else
{ {
/* normal characters */
struct __code tmp; struct __code tmp;
xp_assert (rex->ptn.curc.type == __NORMAL);
tmp.cmd = CMD_ORD_CHAR; tmp.cmd = CMD_ORD_CHAR;
tmp.lbound = 1; tmp.lbound = 1;
tmp.ubound = 1; tmp.ubound = 1;
ADD_CODE (rex, &tmp, xp_sizeof(tmp)); ADD_CODE (rex, &tmp, xp_sizeof(tmp));
ADD_CODE (rex, &rex->ptn.curc, xp_sizeof(rex->ptn.curc)); ADD_CODE (rex, &rex->ptn.curc.value, xp_sizeof(rex->ptn.curc.value));
NEXT_CHAR (rex, __TOP); NEXT_CHAR (rex, __TOP);
return 1; return 1;
} }
} }
static int __compile_charset (xp_awk_rex_t* rex) static int __compile_charset (xp_awk_rex_t* rex)
@ -318,7 +330,7 @@ static int __compile_charset (xp_awk_rex_t* rex)
return -1; return -1;
} }
static int __compile_bound (xp_awk_rex_t* rex) static int __compile_bound (xp_awk_rex_t* rex, struct __code* cmd)
{ {
if (rex->ptn.curc.type != __SPECIAL) return 0; if (rex->ptn.curc.type != __SPECIAL) return 0;
@ -326,28 +338,42 @@ static int __compile_bound (xp_awk_rex_t* rex)
{ {
case XP_T('+'): case XP_T('+'):
{ {
//__apply_bound (1, MAX); cmd->lbound = 1;
cmd->ubound = BOUND_MAX;
NEXT_CHAR(rex, __TOP); NEXT_CHAR(rex, __TOP);
return 1; return 1;
} }
case XP_T('*'): case XP_T('*'):
{ {
//__apply_bound (0, MAX); cmd->lbound = 0;
cmd->ubound = BOUND_MAX;
NEXT_CHAR(rex, __TOP); NEXT_CHAR(rex, __TOP);
return 1; return 1;
} }
case XP_T('?'): case XP_T('?'):
{ {
//__apply_bound (0, 1); cmd->lbound = 0;
cmd->ubound = 1;
NEXT_CHAR(rex, __TOP); NEXT_CHAR(rex, __TOP);
return 1; return 1;
} }
case XP_T('{'): case XP_T('{'):
{ {
if (__compile_range(rex) == -1) return -1; NEXT_CHAR (rex, __IN_RANGE);
if (__compile_range(rex, cmd) == -1) return -1;
if (rex->ptn.curc.type != __SPECIAL ||
rex->ptn.curc.value != XP_T('}'))
{
// rex->errnum = XP_AWK_REX_ERBRACE
return -1;
}
NEXT_CHAR (rex, __TOP);
return 1; return 1;
} }
} }
@ -355,9 +381,40 @@ static int __compile_bound (xp_awk_rex_t* rex)
return 0; return 0;
} }
/*
 * Parses the inside of an interval bound, i.e. the part between '{' and '}',
 * and stores the result into cmd->lbound and cmd->ubound.
 *
 * The caller has already consumed the opening '{' and checks for the
 * closing '}' itself after this function returns.
 *
 *   {m}    lbound = m, ubound = m
 *   {m,}   lbound = m, ubound = BOUND_MAX
 *   {m,n}  lbound = m, ubound = n
 *
 * Returns 0 on success and -1 if the lower bound exceeds the upper bound.
 * NOTE(review): NEXT_CHAR is a macro defined elsewhere in this file; it is
 * presumed to return -1 from this function when reading fails - confirm.
 */
static int __compile_range (xp_awk_rex_t* rex, struct __code* cmd)
{
	xp_size_t bound;
	int has_upper;

	/* TODO: should white spaces be allowed in the range?
	 *       what if it is not in the right format? convert it to
	 *       ordinary characters? */

	/* lower bound: a run of decimal digits (0 if there is none) */
	bound = 0;
	while (rex->ptn.curc.type == __NORMAL &&
	       xp_isdigit(rex->ptn.curc.value))
	{
		bound = bound * 10 + rex->ptn.curc.value - XP_T('0');
		NEXT_CHAR (rex, __IN_RANGE);
	}

	cmd->lbound = bound;

	if (rex->ptn.curc.type == __SPECIAL &&
	    rex->ptn.curc.value == XP_T(','))
	{
		NEXT_CHAR (rex, __IN_RANGE);

		/* upper bound: remember whether any digit was given so that
		 * "{m,}" can be told apart from "{m,0}" */
		bound = 0;
		has_upper = 0;
		while (rex->ptn.curc.type == __NORMAL &&
		       xp_isdigit(rex->ptn.curc.value))
		{
			bound = bound * 10 + rex->ptn.curc.value - XP_T('0');
			has_upper = 1;
			NEXT_CHAR (rex, __IN_RANGE);
		}

		/* a comma with no number after it means "no upper limit" */
		cmd->ubound = (has_upper)? bound: BOUND_MAX;
	}
	else
	{
		/* no comma: "{m}" means exactly m repetitions */
		cmd->ubound = cmd->lbound;
	}

	/* an inverted range such as {5,2} can never match; reject it */
	if (cmd->lbound > cmd->ubound) return -1;

	return 0;
}
static int __next_char (xp_awk_rex_t* rex, int level) static int __next_char (xp_awk_rex_t* rex, int level)
@ -461,3 +518,101 @@ static int __add_code (xp_awk_rex_t* rex, void* data, xp_size_t len)
return 0; return 0;
} }
/*
 * Prints the compiled code held in rex->code in a regex-like notation,
 * followed by a newline.
 */
void xp_awk_rex_print (xp_awk_rex_t* rex)
{
	const xp_byte_t* code_end;

	code_end = __print_expression (rex->code.buf);
	xp_printf (XP_T("\n"));

	/* the walk over the code must stop exactly at the end of the buffer */
	xp_assert (code_end == rex->code.buf + rex->code.size);
}
/*
 * Prints the expression whose header starts at p. The header holds two
 * xp_size_t fields: the number of branches followed by the expression
 * length. The branches are printed in order, separated by '|'.
 *
 * Returns the position reached after the last branch has been printed.
 */
static const xp_byte_t* __print_expression (const xp_byte_t* p)
{
	xp_size_t branch_count, k;

	/* first header field: the number of branches */
	branch_count = *(xp_size_t*)p;
	p += xp_sizeof(xp_size_t);

	/* second header field: the expression length, not needed for printing */
	p += xp_sizeof(xp_size_t);

	k = 0;
	while (k < branch_count)
	{
		if (k > 0) xp_printf (XP_T("|"));
		p = __print_branch (p);
		k++;
	}

	return p;
}
/*
 * Prints the branch whose header starts at p. The header holds two
 * xp_size_t fields: the number of atoms followed by the branch length.
 * The atoms are printed one after another.
 *
 * Returns the position reached after the last atom has been printed.
 */
static const xp_byte_t* __print_branch (const xp_byte_t* p)
{
	xp_size_t atom_count, k;

	/* first header field: the number of atoms */
	atom_count = *(xp_size_t*)p;
	p += xp_sizeof(xp_size_t);

	/* second header field: the branch length, not needed for printing */
	p += xp_sizeof(xp_size_t);

	k = 0;
	while (k < atom_count)
	{
		p = __print_atom (p);
		k++;
	}

	return p;
}
static const xp_byte_t* __print_atom (const xp_byte_t* p)
{
struct __code* cp = (struct __code*)p;
if (cp->cmd == CMD_BOL)
{
xp_printf (XP_T("^"));
p += xp_sizeof(*cp);
}
else if (cp->cmd == CMD_EOL)
{
xp_printf (XP_T("$"));
p += xp_sizeof(*cp);
}
else if (cp->cmd == CMD_ANY_CHAR)
{
xp_printf (XP_T("."));
p += xp_sizeof(*cp);
}
else if (cp->cmd == CMD_ORD_CHAR)
{
p += xp_sizeof(*cp);
xp_printf (XP_T("%c"), *(xp_char_t*)p);
p += xp_sizeof(xp_char_t);
}
else if (cp->cmd == CMD_GROUP)
{
p += xp_sizeof(*cp);
xp_printf (XP_T("("));
p = __print_expression (p);
xp_printf (XP_T(")"));
}
else
{
xp_printf (XP_T("FUCK FUCK FUCK\n"));
}
if (cp->lbound == 0 && cp->ubound == BOUND_MAX)
xp_printf (XP_T("*"));
else if (cp->lbound == 1 && cp->ubound == BOUND_MAX)
xp_printf (XP_T("+"));
else if (cp->lbound == 0 && cp->ubound == 1)
xp_printf (XP_T("?"));
else if (cp->lbound != 1 || cp->ubound != 1)
{
xp_printf (XP_T("{%lu,%lu}"),
(unsigned long)cp->lbound, (unsigned long)cp->ubound);
}
return p;
}

View File

@ -1,5 +1,5 @@
/* /*
* $Id: rex.h,v 1.4 2006-07-20 03:41:00 bacon Exp $ * $Id: rex.h,v 1.5 2006-07-20 16:21:54 bacon Exp $
**/ **/
#ifndef _XP_AWK_REX_H_ #ifndef _XP_AWK_REX_H_
@ -17,12 +17,13 @@
* *
* Compiled form of a regular expression: * Compiled form of a regular expression:
* *
* | expression | * | expression |
* | header | branch | branch | branch | * | header | branch | branch | branch |
* | nb | el | bl | cmd | arg | cmd | arg | bl | cmd | arg | bl | cmd | * | nb | el | na | bl | cmd | arg | cmd | arg | na | bl | cmd | arg | na | bl | cmd |
* *
* nb: the number of branches * nb: the number of branches
* el: the length of an expression excluding the length of nb and el * el: the length of an expression excluding the length of nb and el
* na: the number of atoms
* bl: the length of a branch excluding the length of bl * bl: the length of a branch excluding the length of bl
* cmd: The command and repetition info encoded together. * cmd: The command and repetition info encoded together.
* Some commands require an argument to follow them but some other don't. * Some commands require an argument to follow them but some other don't.
@ -67,6 +68,7 @@ extern "C" {
xp_awk_rex_t* xp_awk_rex_open (xp_awk_rex_t* rex); xp_awk_rex_t* xp_awk_rex_open (xp_awk_rex_t* rex);
void xp_awk_rex_close (xp_awk_rex_t* rex); void xp_awk_rex_close (xp_awk_rex_t* rex);
int xp_awk_rex_compile (xp_awk_rex_t* rex, const xp_char_t* ptn, xp_size_t len); int xp_awk_rex_compile (xp_awk_rex_t* rex, const xp_char_t* ptn, xp_size_t len);
void xp_awk_rex_print (xp_awk_rex_t* rex);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -23,7 +23,7 @@ int xp_main (int argc, const xp_char_t* argv[])
} }
ptn = XP_T("^he.llo"); ptn = XP_T("^he.llo(jo(in|kk)s|com)+h*e{1,40}abc");
if (xp_awk_rex_compile (rex, ptn, xp_strlen(ptn)) == -1) if (xp_awk_rex_compile (rex, ptn, xp_strlen(ptn)) == -1)
{ {
xp_printf (XP_T("cannot compile pattern...\n")); xp_printf (XP_T("cannot compile pattern...\n"));
@ -31,6 +31,7 @@ int xp_main (int argc, const xp_char_t* argv[])
return -1; return -1;
} }
xp_awk_rex_print (rex);
xp_awk_rex_close (rex); xp_awk_rex_close (rex);
return 0; return 0;
} }