*** empty log message ***
This commit is contained in:
parent
75a3eaeac0
commit
40b4aecca7
243
ase/awk/rex.c
243
ase/awk/rex.c
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* $Id: rex.c,v 1.6 2006-07-20 03:41:00 bacon Exp $
|
||||
* $Id: rex.c,v 1.7 2006-07-20 16:21:54 bacon Exp $
|
||||
*/
|
||||
|
||||
#include <xp/awk/awk_i.h>
|
||||
@ -26,12 +26,13 @@ enum
|
||||
|
||||
enum
|
||||
{
|
||||
CMD_BOL_CHAR,
|
||||
CMD_EOL_CHAR,
|
||||
CMD_ORD_CHAR,
|
||||
CMD_BOL,
|
||||
CMD_EOL,
|
||||
CMD_ANY_CHAR,
|
||||
CMD_ORD_CHAR,
|
||||
CMD_CHAR_RANGE,
|
||||
CMD_CHAR_CLASS
|
||||
CMD_CHAR_CLASS,
|
||||
CMD_GROUP
|
||||
};
|
||||
|
||||
enum
|
||||
@ -53,10 +54,10 @@ enum
|
||||
|
||||
struct __code
|
||||
{
|
||||
xp_byte_t cmd;
|
||||
//xp_byte_t cmd;
|
||||
int cmd;
|
||||
xp_size_t lbound;
|
||||
xp_size_t ubound;
|
||||
xp_char_t cc; /* optional */
|
||||
};
|
||||
|
||||
#define NEXT_CHAR(rex,level) \
|
||||
@ -69,11 +70,15 @@ static int __compile_expression (xp_awk_rex_t* rex);
|
||||
static int __compile_branch (xp_awk_rex_t* rex);
|
||||
static int __compile_atom (xp_awk_rex_t* rex);
|
||||
static int __compile_charset (xp_awk_rex_t* rex);
|
||||
static int __compile_bound (xp_awk_rex_t* rex);
|
||||
static int __compile_range (xp_awk_rex_t* rex);
|
||||
static int __compile_bound (xp_awk_rex_t* rex, struct __code* cmd);
|
||||
static int __compile_range (xp_awk_rex_t* rex, struct __code* cmd);
|
||||
static int __next_char (xp_awk_rex_t* rex, int level);
|
||||
static int __add_code (xp_awk_rex_t* rex, void* data, xp_size_t len);
|
||||
|
||||
static const xp_byte_t* __print_expression (const xp_byte_t* p);
|
||||
static const xp_byte_t* __print_branch (const xp_byte_t* p);
|
||||
static const xp_byte_t* __print_atom (const xp_byte_t* p);
|
||||
|
||||
xp_awk_rex_t* xp_awk_rex_open (xp_awk_rex_t* rex)
|
||||
{
|
||||
if (rex == XP_NULL)
|
||||
@ -104,7 +109,6 @@ void xp_awk_rex_close (xp_awk_rex_t* rex)
|
||||
|
||||
int xp_awk_rex_compile (xp_awk_rex_t* rex, const xp_char_t* ptn, xp_size_t len)
|
||||
{
|
||||
|
||||
rex->ptn.ptr = ptn;
|
||||
rex->ptn.end = rex->ptn.ptr + len;
|
||||
rex->ptn.curp = rex->ptn.ptr;
|
||||
@ -136,9 +140,12 @@ xp_printf (XP_T("garbage after expression\n"));
|
||||
static int __compile_expression (xp_awk_rex_t* rex)
|
||||
{
|
||||
xp_size_t zero = 0;
|
||||
xp_size_t* nb, * el, * bl;
|
||||
xp_size_t* nb, * el;
|
||||
xp_size_t old_size;
|
||||
int n;
|
||||
|
||||
old_size = rex->code.size;
|
||||
|
||||
/* secure space for header and set the header fields to zero */
|
||||
nb = (xp_size_t*)&rex->code.buf[rex->code.size];
|
||||
ADD_CODE (rex, &zero, xp_sizeof(zero));
|
||||
@ -147,7 +154,6 @@ static int __compile_expression (xp_awk_rex_t* rex)
|
||||
ADD_CODE (rex, &zero, xp_sizeof(zero));
|
||||
|
||||
/* handle the first branch */
|
||||
bl = (xp_size_t*)&rex->code.buf[rex->code.size];
|
||||
n = __compile_branch (rex);
|
||||
if (n == -1) return -1;
|
||||
if (n == 0)
|
||||
@ -157,7 +163,6 @@ static int __compile_expression (xp_awk_rex_t* rex)
|
||||
}
|
||||
|
||||
(*nb) += 1;
|
||||
(*el) += *bl + xp_sizeof(*bl);
|
||||
|
||||
/* handle subsequent branches if any */
|
||||
while (rex->ptn.curc.type == __SPECIAL &&
|
||||
@ -165,7 +170,6 @@ static int __compile_expression (xp_awk_rex_t* rex)
|
||||
{
|
||||
NEXT_CHAR (rex, __TOP);
|
||||
|
||||
bl = (xp_size_t*)&rex->code.buf[rex->code.size];
|
||||
n = __compile_branch(rex);
|
||||
if (n == -1) return -1;
|
||||
if (n == 0)
|
||||
@ -179,26 +183,32 @@ static int __compile_expression (xp_awk_rex_t* rex)
|
||||
}
|
||||
|
||||
(*nb) += 1;
|
||||
(*el) += *bl + xp_sizeof(*bl);
|
||||
}
|
||||
|
||||
*el = rex->code.size - old_size;
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int __compile_branch (xp_awk_rex_t* rex)
|
||||
{
|
||||
int n;
|
||||
xp_size_t* bl;
|
||||
xp_size_t* na, * bl;
|
||||
xp_size_t old_size;
|
||||
xp_size_t zero = 0;
|
||||
struct __code* cmd;
|
||||
|
||||
old_size = rex->code.size;
|
||||
|
||||
na = (xp_size_t*)&rex->code.buf[rex->code.size];
|
||||
ADD_CODE (rex, &zero, xp_sizeof(zero));
|
||||
|
||||
bl = (xp_size_t*)&rex->code.buf[rex->code.size];
|
||||
ADD_CODE (rex, &zero, xp_sizeof(zero));
|
||||
|
||||
while (1)
|
||||
{
|
||||
cmd = (struct __code*)&rex->code.buf[rex->code.size];
|
||||
|
||||
n = __compile_atom (rex);
|
||||
if (n == -1)
|
||||
{
|
||||
@ -208,7 +218,7 @@ static int __compile_branch (xp_awk_rex_t* rex)
|
||||
|
||||
if (n == 0) break; /* no atom */
|
||||
|
||||
n = __compile_bound (rex);
|
||||
n = __compile_bound (rex, cmd);
|
||||
if (n == -1)
|
||||
{
|
||||
rex->code.size = old_size;
|
||||
@ -217,27 +227,33 @@ static int __compile_branch (xp_awk_rex_t* rex)
|
||||
|
||||
/* n == 0 no bound character. just continue */
|
||||
/* n == 1 bound has been applied by compile_bound */
|
||||
|
||||
(*na) += 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
*bl = rex->code.size - old_size;
|
||||
return ((*na) == 0)? 0: 1;
|
||||
}
|
||||
|
||||
static int __compile_atom (xp_awk_rex_t* rex)
|
||||
{
|
||||
int n = 0;
|
||||
int n;
|
||||
|
||||
if (rex->ptn.curc.type == __EOF)
|
||||
{
|
||||
/* no atom */
|
||||
return 0;
|
||||
}
|
||||
else if (rex->ptn.curc.type == __SPECIAL)
|
||||
if (rex->ptn.curc.type == __EOF) return 0;
|
||||
|
||||
if (rex->ptn.curc.type == __SPECIAL)
|
||||
{
|
||||
if (rex->ptn.curc.value == XP_T('('))
|
||||
{
|
||||
// GROUP
|
||||
struct __code tmp;
|
||||
|
||||
tmp.cmd = CMD_GROUP;
|
||||
tmp.lbound = 1;
|
||||
tmp.ubound = 1;
|
||||
|
||||
ADD_CODE (rex, &tmp, xp_sizeof(tmp));
|
||||
NEXT_CHAR (rex, __TOP);
|
||||
|
||||
|
||||
n = __compile_expression (rex);
|
||||
if (n == -1) return -1;
|
||||
|
||||
@ -254,7 +270,7 @@ static int __compile_atom (xp_awk_rex_t* rex)
|
||||
{
|
||||
struct __code tmp;
|
||||
|
||||
tmp.cmd = CMD_BOL_CHAR;
|
||||
tmp.cmd = CMD_BOL;
|
||||
tmp.lbound = 1;
|
||||
tmp.ubound = 1;
|
||||
|
||||
@ -265,7 +281,7 @@ static int __compile_atom (xp_awk_rex_t* rex)
|
||||
{
|
||||
struct __code tmp;
|
||||
|
||||
tmp.cmd = CMD_EOL_CHAR;
|
||||
tmp.cmd = CMD_EOL;
|
||||
tmp.lbound = 1;
|
||||
tmp.ubound = 1;
|
||||
|
||||
@ -287,30 +303,26 @@ static int __compile_atom (xp_awk_rex_t* rex)
|
||||
{
|
||||
if (__compile_charset (rex) == -1) return -1;
|
||||
}
|
||||
else
|
||||
{
|
||||
/*invalid special character....*/
|
||||
return -1;
|
||||
}
|
||||
else return 0;
|
||||
|
||||
return 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* normal characters */
|
||||
struct __code tmp;
|
||||
|
||||
xp_assert (rex->ptn.curc.type == __NORMAL);
|
||||
|
||||
tmp.cmd = CMD_ORD_CHAR;
|
||||
tmp.lbound = 1;
|
||||
tmp.ubound = 1;
|
||||
|
||||
ADD_CODE (rex, &tmp, xp_sizeof(tmp));
|
||||
ADD_CODE (rex, &rex->ptn.curc, xp_sizeof(rex->ptn.curc));
|
||||
ADD_CODE (rex, &rex->ptn.curc.value, xp_sizeof(rex->ptn.curc.value));
|
||||
NEXT_CHAR (rex, __TOP);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static int __compile_charset (xp_awk_rex_t* rex)
|
||||
@ -318,7 +330,7 @@ static int __compile_charset (xp_awk_rex_t* rex)
|
||||
return -1;
|
||||
}
|
||||
|
||||
static int __compile_bound (xp_awk_rex_t* rex)
|
||||
static int __compile_bound (xp_awk_rex_t* rex, struct __code* cmd)
|
||||
{
|
||||
if (rex->ptn.curc.type != __SPECIAL) return 0;
|
||||
|
||||
@ -326,28 +338,42 @@ static int __compile_bound (xp_awk_rex_t* rex)
|
||||
{
|
||||
case XP_T('+'):
|
||||
{
|
||||
//__apply_bound (1, MAX);
|
||||
cmd->lbound = 1;
|
||||
cmd->ubound = BOUND_MAX;
|
||||
NEXT_CHAR(rex, __TOP);
|
||||
return 1;
|
||||
}
|
||||
|
||||
case XP_T('*'):
|
||||
{
|
||||
//__apply_bound (0, MAX);
|
||||
cmd->lbound = 0;
|
||||
cmd->ubound = BOUND_MAX;
|
||||
NEXT_CHAR(rex, __TOP);
|
||||
return 1;
|
||||
}
|
||||
|
||||
case XP_T('?'):
|
||||
{
|
||||
//__apply_bound (0, 1);
|
||||
cmd->lbound = 0;
|
||||
cmd->ubound = 1;
|
||||
NEXT_CHAR(rex, __TOP);
|
||||
return 1;
|
||||
}
|
||||
|
||||
case XP_T('{'):
|
||||
{
|
||||
if (__compile_range(rex) == -1) return -1;
|
||||
NEXT_CHAR (rex, __IN_RANGE);
|
||||
|
||||
if (__compile_range(rex, cmd) == -1) return -1;
|
||||
|
||||
if (rex->ptn.curc.type != __SPECIAL ||
|
||||
rex->ptn.curc.value != XP_T('}'))
|
||||
{
|
||||
// rex->errnum = XP_AWK_REX_ERBRACE
|
||||
return -1;
|
||||
}
|
||||
|
||||
NEXT_CHAR (rex, __TOP);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
@ -355,9 +381,40 @@ static int __compile_bound (xp_awk_rex_t* rex)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __compile_range (xp_awk_rex_t* rex)
|
||||
static int __compile_range (xp_awk_rex_t* rex, struct __code* cmd)
|
||||
{
|
||||
return -1;
|
||||
xp_size_t bound;
|
||||
|
||||
// TODO: should allow white spaces in the range???
|
||||
// what if it is not in the right format? convert it to ordinary characters??
|
||||
bound = 0;
|
||||
while (rex->ptn.curc.type == __NORMAL &&
|
||||
xp_isdigit(rex->ptn.curc.value))
|
||||
{
|
||||
bound = bound * 10 + rex->ptn.curc.value - XP_T('0');
|
||||
NEXT_CHAR (rex, __IN_RANGE);
|
||||
}
|
||||
|
||||
cmd->lbound = bound;
|
||||
|
||||
if (rex->ptn.curc.type == __SPECIAL &&
|
||||
rex->ptn.curc.value == XP_T(','))
|
||||
{
|
||||
NEXT_CHAR (rex, __IN_RANGE);
|
||||
|
||||
bound = 0;
|
||||
while (rex->ptn.curc.type == __NORMAL &&
|
||||
xp_isdigit(rex->ptn.curc.value))
|
||||
{
|
||||
bound = bound * 10 + rex->ptn.curc.value - XP_T('0');
|
||||
NEXT_CHAR (rex, __IN_RANGE);
|
||||
}
|
||||
|
||||
cmd->ubound = bound;
|
||||
}
|
||||
else cmd->ubound = BOUND_MAX;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __next_char (xp_awk_rex_t* rex, int level)
|
||||
@ -461,3 +518,101 @@ static int __add_code (xp_awk_rex_t* rex, void* data, xp_size_t len)
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void xp_awk_rex_print (xp_awk_rex_t* rex)
|
||||
{
|
||||
const xp_byte_t* p;
|
||||
p = __print_expression (rex->code.buf);
|
||||
xp_printf (XP_T("\n"));
|
||||
xp_assert (p == rex->code.buf + rex->code.size);
|
||||
}
|
||||
|
||||
static const xp_byte_t* __print_expression (const xp_byte_t* p)
|
||||
{
|
||||
xp_size_t nb, el, i;
|
||||
|
||||
nb = *(xp_size_t*)p; p += xp_sizeof(nb);
|
||||
el = *(xp_size_t*)p; p += xp_sizeof(el);
|
||||
//xp_printf (XP_T("NA = %u, EL = %u\n"),
|
||||
// (unsigned int)nb, (unsigned int)el);
|
||||
|
||||
for (i = 0; i < nb; i++)
|
||||
{
|
||||
if (i != 0) xp_printf (XP_T("|"));
|
||||
p = __print_branch (p);
|
||||
}
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
static const xp_byte_t* __print_branch (const xp_byte_t* p)
|
||||
{
|
||||
xp_size_t na, bl, i;
|
||||
|
||||
na = *(xp_size_t*)p; p += xp_sizeof(na);
|
||||
bl = *(xp_size_t*)p; p += xp_sizeof(bl);
|
||||
//xp_printf (XP_T("NA = %u, BL = %u\n"),
|
||||
// (unsigned int) na, (unsigned int)bl);
|
||||
|
||||
for (i = 0; i < na; i++)
|
||||
{
|
||||
p = __print_atom (p);
|
||||
}
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
static const xp_byte_t* __print_atom (const xp_byte_t* p)
|
||||
{
|
||||
struct __code* cp = (struct __code*)p;
|
||||
|
||||
if (cp->cmd == CMD_BOL)
|
||||
{
|
||||
xp_printf (XP_T("^"));
|
||||
p += xp_sizeof(*cp);
|
||||
}
|
||||
else if (cp->cmd == CMD_EOL)
|
||||
{
|
||||
xp_printf (XP_T("$"));
|
||||
p += xp_sizeof(*cp);
|
||||
}
|
||||
else if (cp->cmd == CMD_ANY_CHAR)
|
||||
{
|
||||
xp_printf (XP_T("."));
|
||||
p += xp_sizeof(*cp);
|
||||
}
|
||||
else if (cp->cmd == CMD_ORD_CHAR)
|
||||
{
|
||||
p += xp_sizeof(*cp);
|
||||
xp_printf (XP_T("%c"), *(xp_char_t*)p);
|
||||
p += xp_sizeof(xp_char_t);
|
||||
}
|
||||
else if (cp->cmd == CMD_GROUP)
|
||||
{
|
||||
p += xp_sizeof(*cp);
|
||||
xp_printf (XP_T("("));
|
||||
p = __print_expression (p);
|
||||
xp_printf (XP_T(")"));
|
||||
}
|
||||
else
|
||||
{
|
||||
xp_printf (XP_T("FUCK FUCK FUCK\n"));
|
||||
}
|
||||
|
||||
if (cp->lbound == 0 && cp->ubound == BOUND_MAX)
|
||||
xp_printf (XP_T("*"));
|
||||
else if (cp->lbound == 1 && cp->ubound == BOUND_MAX)
|
||||
xp_printf (XP_T("+"));
|
||||
else if (cp->lbound == 0 && cp->ubound == 1)
|
||||
xp_printf (XP_T("?"));
|
||||
else if (cp->lbound != 1 || cp->ubound != 1)
|
||||
{
|
||||
xp_printf (XP_T("{%lu,%lu}"),
|
||||
(unsigned long)cp->lbound, (unsigned long)cp->ubound);
|
||||
}
|
||||
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* $Id: rex.h,v 1.4 2006-07-20 03:41:00 bacon Exp $
|
||||
* $Id: rex.h,v 1.5 2006-07-20 16:21:54 bacon Exp $
|
||||
**/
|
||||
|
||||
#ifndef _XP_AWK_REX_H_
|
||||
@ -17,12 +17,13 @@
|
||||
*
|
||||
* Compiled form of a regular expression:
|
||||
*
|
||||
* | expression |
|
||||
* | header | branch | branch | branch |
|
||||
* | nb | el | bl | cmd | arg | cmd | arg | bl | cmd | arg | bl | cmd |
|
||||
* | expression |
|
||||
* | header | branch | branch | branch |
|
||||
* | nb | el | na | bl | cmd | arg | cmd | arg | na | bl | cmd | arg | na | bl | cmd |
|
||||
*
|
||||
* nb: the number of branches
|
||||
* el: the length of an expression excluding the length of nb and el
|
||||
* na: the number of atoms
|
||||
* bl: the length of a branch excluding the length of bl
|
||||
* cmd: The command and repetition info encoded together.
|
||||
* Some commands require an argument to follow them, but others don't.
|
||||
@ -67,6 +68,7 @@ extern "C" {
|
||||
xp_awk_rex_t* xp_awk_rex_open (xp_awk_rex_t* rex);
|
||||
void xp_awk_rex_close (xp_awk_rex_t* rex);
|
||||
int xp_awk_rex_compile (xp_awk_rex_t* rex, const xp_char_t* ptn, xp_size_t len);
|
||||
void xp_awk_rex_print (xp_awk_rex_t* rex);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
@ -23,7 +23,7 @@ int xp_main (int argc, const xp_char_t* argv[])
|
||||
}
|
||||
|
||||
|
||||
ptn = XP_T("^he.llo");
|
||||
ptn = XP_T("^he.llo(jo(in|kk)s|com)+h*e{1,40}abc");
|
||||
if (xp_awk_rex_compile (rex, ptn, xp_strlen(ptn)) == -1)
|
||||
{
|
||||
xp_printf (XP_T("cannot compile pattern...\n"));
|
||||
@ -31,6 +31,7 @@ int xp_main (int argc, const xp_char_t* argv[])
|
||||
return -1;
|
||||
}
|
||||
|
||||
xp_awk_rex_print (rex);
|
||||
xp_awk_rex_close (rex);
|
||||
return 0;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user