fixed a parser bug in awk.

- handling of idiv operators
- tokenization of /=/ as a regular expression
- calling QSE_AWK_FREEREX to free compiled regular expressions.
switched to a new regular expression handler.
This commit is contained in:
2009-12-11 07:03:54 +00:00
parent 93adbf9244
commit faea2475ac
17 changed files with 2089 additions and 3870 deletions

View File

@ -1,5 +1,5 @@
/*
* $Id: rex.h 311 2009-12-09 11:35:54Z hyunghwan.chung $
* $Id: rex.h 312 2009-12-10 13:03:54Z hyunghwan.chung $
*
Copyright 2006-2009 Chung, Hyung-Hwan.
This file is part of QSE.
@ -27,62 +27,59 @@
/** @file
*
* Regular Expression Syntax
* A regular expression is zero or more branches, separated by '|'.
* ......
* ......
* Regular Expression Syntax
*
* Compiled form of a regular expression:
*
* | expression |
* | header | branch | branch | branch |
* | nb | el | na | bl | cmd | arg | cmd | arg | na | bl | cmd | arg | na | bl | cmd |
*
* - nb: the number of branches
* - el: the length of an expression including the length of nb and el
* - na: the number of atoms
* - bl: the length of a branch including the length of na and bl
* - cmd: The command and repetition info encoded together.
*
* Some commands require an argument to follow them, but others don't.
* It is encoded as follows:
* .................
* regular expression := branch_set
* branch_set := branch [ '|' branch ]*
* atom := char | '.' | '^' | '$' | subgroup
* subgroup := '(' branch_set ')'
* bound := '?' | '*' | '+' | {n,m}
* bounded_atom := atom bound*
* branch := bounded_atom bounded_atom*
*
* Subexpressions can be nested by having the command "GROUP"
* and a subexpression as its argument.
* Special escape sequences include:
* \uXXXX, \XX, \000, \t, \n, \r, \v, ...
*
* Examples:
* a.c -> |1|6|5|ORD_CHAR(no bound)|a|ANY_CHAR(no bound)|ORD_CHAR(no bound)|c|
* ab|xy -> |2|10|4|ORD_CHAR(no bound)|a|ORD_CHAR(no bound)|b|4|ORD_CHAR(no bound)|x|ORD_CHAR(no bound)|y|
* A special character preceded by a backslash loses its special role and
* matches the character itself literally.
*
* Some examples of compiled regular expressions are shown below.
*
* @code
* ab
* START --@ NOP --@ CHAR(a) --@ CHAR(b) --@ END
*
* a(bc)d
* START --@ NOP --@ CHAR(a) --@ GROUP --@ CHAR(d) --@ END
* u.g.head => | @
* | | <= u.g.end
* | +---------------------------+
* | | <= u.ge.group
* @ @
* NOP -@ CHAR(b) -@ CHAR(c) -@ GROUPEND
*
* ab|cd
* +--@ CHAR(a) --@ CHAR(b) --+
* | <= u.b.left |
* START --@ NOP --@ BRANCH --+ +--@ END
* | <= u.b.right |
* +--@ CHAR(c) --@ CHAR(d) --+
* @endcode
*
* @todo
* - support \\n to refer to the nth matching substring
*/
/* Reads the qse_size_t value stored at the very start of the buffer `code`.
 * NOTE(review): judging by the name and the compiled-form layout described
 * in the file comment, this is presumably the `na` (number of atoms) field
 * of a branch - confirm against the compiler/matcher. */
#define QSE_REX_NA(code) (*(qse_size_t*)(code))
/* Reads the qse_size_t value stored QSE_SIZEOF(qse_size_t) bytes past the
 * start of `code`; assuming QSE_SIZEOF is sizeof, that is the field right
 * after the one QSE_REX_NA() reads.
 * NOTE(review): presumably the length field (`bl`/`el`) of the compiled
 * form - confirm. */
#define QSE_REX_LEN(code) \
(*(qse_size_t*)((qse_byte_t*)(code)+QSE_SIZEOF(qse_size_t)))
/**
 * Option values for the regular expression processor. Every enumerator is
 * written as a single-bit value of the form (1 << n); presumably they are
 * meant to be combined with bitwise OR - confirm against the option setter.
 *
 * NOTE(review): as listed here QSE_REX_IGNORECASE is defined twice
 * ((1 << 8) and (1 << 2)) and its first definition has no trailing comma;
 * QSE_REX_BUILD_NOBOUND and QSE_REX_NOBOUND both use (1 << 1). This looks
 * like two revisions of the list interleaved - reconcile before compiling.
 */
enum qse_rex_option_t
{
QSE_REX_BUILD_NOBOUND = (1 << 1),
QSE_REX_MATCH_IGNORECASE = (1 << 8),
/**< do not allow a special character at normal character position */
/**< do not allow a special character at normal character position. */
QSE_REX_STRICT = (1 << 0),
/**< do not support the {n,m} style occurrence specifier */
/**< do not support the {n,m} style occurrence specifier. */
QSE_REX_NOBOUND = (1 << 1),
/* The escape-sequence options below are compiled out. */
#if 0
QSE_REX_ESQ_HEX = (1 << 1), /* \xhh and \uhhhh */
QSE_REX_ESQ_OCTAL = (1 << 2), /* \000 */
QSE_REX_ESQ_CNTRL = (1 << 3), /* \cX where X is A to Z */
#endif
/**< perform case-insensitive match */
QSE_REX_IGNORECASE = (1 << 8)
QSE_REX_IGNORECASE = (1 << 2)
};
enum qse_rex_errnum_t
@ -125,7 +122,8 @@ struct qse_rex_node_t
/* for internal management. not used for startnode */
qse_rex_node_t* link;
/* connect to the next node in the graph */
/* connect to the next node in the graph.
* it is always NULL for a branch node. */
qse_rex_node_t* next;
qse_rex_node_id_t id;
@ -206,6 +204,29 @@ void qse_rex_close (
qse_rex_t* rex
);
/**
 * The qse_rex_init() function takes a regular expression object @a rex,
 * a memory manager @a mmgr, and a node pointer @a code, and returns a
 * pointer to a regular expression object.
 *
 * NOTE(review): presumably this prepares a caller-allocated object and is
 * the counterpart of qse_rex_fini(); what a null @a code means and what is
 * returned on failure are not visible here - confirm in the implementation.
 */
qse_rex_t* qse_rex_init (
qse_rex_t* rex,
qse_mmgr_t* mmgr,
qse_rex_node_t* code
);
/**
 * The qse_rex_fini() function finalizes a statically initialized
 * regular expression object @a rex. It returns no value.
 */
void qse_rex_fini (
qse_rex_t* rex /**< regular expression object to finalize */
);
/**
 * The qse_rex_yield() function gives up the ownership of a compiled regular
 * expression. Once yielded, the compiled regular expression is not destroyed
 * when @a rex is closed or finalized.
 *
 * NOTE(review): the caller presumably becomes responsible for releasing the
 * returned nodes; the function to release them is not visible here - confirm.
 *
 * @return start node of a compiled regular expression
 */
qse_rex_node_t* qse_rex_yield (
qse_rex_t* rex /**< regular expression processor */
);
/**
* The qse_rex_getoption() function returns the current options.
@ -243,6 +264,7 @@ int qse_rex_exec (
qse_cstr_t* matstr
);
void* qse_buildrex (
qse_mmgr_t* mmgr,
qse_size_t depth,
@ -270,14 +292,6 @@ void qse_freerex (
void* code
);
/**
 * The qse_isemptyrex() function takes an untyped pointer @a code and returns
 * a qse_bool_t.
 *
 * NOTE(review): presumably @a code is compiled code as returned by
 * qse_buildrex() and the result tells whether it is empty - confirm against
 * the implementation.
 */
qse_bool_t qse_isemptyrex (
void* code
);
#if 0
/* Disabled declaration. NOTE(review): both parameters are named `rex`,
 * so this would not compile if the block were enabled as is. */
void qse_dprintrex (qse_rex_t* rex, void* rex);
#endif
#ifdef __cplusplus
}
#endif