qse/qse/lib/cmn/rex.c

2096 lines
45 KiB
C

/*
* $Id$
*
Copyright (c) 2006-2019 Chung, Hyung-Hwan. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <qse/cmn/rex.h>
#include <qse/cmn/chr.h>
#include <qse/cmn/str.h>
#include <qse/cmn/arr.h>
#include "mem-prv.h"
#define OCC_MAX QSE_TYPE_MAX(qse_size_t)
/*#define XTRA_DEBUG*/
/* comp_t is the compilation context. it carries the scanner state
 * while a pattern is being translated into a graph of nodes. */
typedef struct comp_t comp_t;
struct comp_t
{
/* the rex object compiled for. its mmgr, option and errnum fields
 * are used throughout the compiler */
qse_rex_t* rex;
/* the pattern text as handed over to qse_rex_comp() */
qse_cstr_t re;
/* the next pattern character to read */
const qse_char_t* ptr;
/* one character past the last pattern character */
const qse_char_t* end;
/* the character most recently read by getc() */
struct
{
/* the character value or QSE_CHAR_EOF when the pattern is exhausted */
qse_cint_t value;
/* non-zero if the value resulted from a backslash escape sequence */
int escaped;
} c;
qse_size_t gdepth; /* group depth */
/* the START node. newnode() chains every node allocated onto
 * its u.s.link list so that freeallnodes() can release them */
qse_rex_node_t* start;
};
/* exec_t is the execution context used while matching a compiled
 * expression against a string. */
typedef struct exec_t exec_t;
struct exec_t
{
/* the rex object executed. provides mmgr, option, and errnum */
qse_rex_t* rex;
/* the whole subject string. BOL is tested against str.ptr and
 * EOL against str.end in walk_cands_for_match() */
struct
{
const qse_char_t* ptr;
const qse_char_t* end;
} str;
/* NOTE(review): presumably the segment of the subject string that
 * is actually searched. its use is not visible in this part of
 * the file - confirm against the execution routines */
struct
{
const qse_char_t* ptr;
const qse_char_t* end;
} sub;
struct
{
/* NOTE(review): looks like the index to the candidate array
 * being walked - confirm */
int active;
/* the index to the candidate array that addsimplecand()
 * appends new candidates to */
int pending;
qse_arr_t set[2]; /* candidate arrays */
} cand;
/* incremented by addcands() whenever the END node is reached */
qse_size_t nmatches;
const qse_char_t* matchend; /* 1 character past the match end */
};
/* pair_t holds the first and the last node of a node chain.
 * comp_branch() fills it in so that comp_branches() can connect
 * the tail of a branch to the node following the branch. */
typedef struct pair_t pair_t;
struct pair_t
{
qse_rex_node_t* head; /* the first node of the chain */
qse_rex_node_t* tail; /* the last node of the chain */
};
/* The group_t type defines a structure to maintain the nested
 * traces of subgroups. The actual traces are maintained in a stack
 * of singly linked group_t elements. The head element acts
 * as a management element where the occ field is a reference count
 * and the node field is QSE_NULL always
 */
typedef struct group_t group_t;
struct group_t
{
/* the GROUP node traced. QSE_NULL in the head element */
qse_rex_node_t* node;
/* the number of occurrences of the group so far in a data element.
 * the reference count of the whole stack in the head element */
qse_size_t occ;
/* the element below this one. the head links to the top element */
group_t* next;
};
/* cand_t describes a match candidate, that is, a simple node to
 * test against the subject string at a particular position. */
typedef struct cand_t cand_t;
struct cand_t
{
/* the node to test. see addsimplecand() for the node types allowed */
qse_rex_node_t* node;
/* occurrence */
qse_size_t occ;
/* the stack of groups that this candidate belongs to.
 * it is in the singly linked list form */
group_t* group;
/* match pointer. the number of character advancement
 * differs across various node types. BOL and EOL don't advance to
 * the next character on match while ANY and CHAR do on match.
 * therefore, the match pointer is managed per candidate basis. */
const qse_char_t* mptr;
};
/* qse_rex_init() initializes a rex object in place.
 * every field is cleared before the memory manager and the optional
 * compiled expression 'code' are recorded. passing a compiled
 * expression delegates it to the rex object: it is destroyed
 * when the rex object is finalized or closed.
 * it always returns 0. */
int qse_rex_init (qse_rex_t* rex, qse_mmgr_t* mmgr, qse_rex_node_t* code)
{
	/* a delegated expression must begin with a START node */
	QSE_ASSERT (code == QSE_NULL || code->id == QSE_REX_NODE_START);

	QSE_MEMSET (rex, 0, QSE_SIZEOF(*rex));
	rex->mmgr = mmgr;
	rex->code = code;

	return 0;
}
/* qse_rex_open() allocates a rex object followed by 'xtnsize' bytes
 * of zero-filled extension area and initializes it with 'code'.
 * it returns QSE_NULL if memory allocation or initialization fails. */
qse_rex_t* qse_rex_open (qse_mmgr_t* mmgr, qse_size_t xtnsize, qse_rex_node_t* code)
{
	qse_rex_t* self;

	self = (qse_rex_t*) QSE_MMGR_ALLOC (mmgr, QSE_SIZEOF(qse_rex_t) + xtnsize);
	if (self != QSE_NULL)
	{
		if (qse_rex_init (self, mmgr, code) >= 0)
		{
			/* clear the extension area only on success */
			QSE_MEMSET (QSE_XTN(self), 0, xtnsize);
		}
		else
		{
			QSE_MMGR_FREE (mmgr, self);
			self = QSE_NULL;
		}
	}

	return self;
}
/* freenode() releases a single node. the member string
 * of a CSET node, if any, is closed before the node is freed. */
static void freenode (qse_rex_node_t* node, qse_mmgr_t* mmgr)
{
	if (node->id == QSE_REX_NODE_CSET && node->u.cset.member != QSE_NULL)
	{
		qse_str_close (node->u.cset.member);
	}

	QSE_MMGR_FREE (mmgr, node);
}
/* freeallnodes() destroys a compiled expression. it walks the
 * allocation chain anchored at the START node, frees each node on it
 * and finally frees the START node itself. the memory manager
 * remembered in the START node is used for all releases. */
static void freeallnodes (qse_rex_node_t* start)
{
	qse_mmgr_t* mmgr;
	qse_rex_node_t* cur, * nxt;

	QSE_ASSERT (start->id == QSE_REX_NODE_START);
	mmgr = start->u.s.mmgr;

	for (cur = start->u.s.link; cur != QSE_NULL; cur = nxt)
	{
		/* remember the successor before the node is gone */
		nxt = cur->link;
		freenode (cur, mmgr);
	}

	QSE_MMGR_FREE (mmgr, start);
}
/* qse_rex_fini() destroys the compiled expression held by a
 * rex object, if any, without freeing the rex object itself. */
void qse_rex_fini (qse_rex_t* rex)
{
	qse_rex_node_t* code = rex->code;

	if (code == QSE_NULL) return;

	freeallnodes (code);
	rex->code = QSE_NULL;
}
/* qse_rex_close() finalizes a rex object and then releases the
 * memory of the object through its own memory manager. */
void qse_rex_close (qse_rex_t* rex)
{
	qse_mmgr_t* mmgr;

	qse_rex_fini (rex);

	mmgr = rex->mmgr;
	QSE_MMGR_FREE (mmgr, rex);
}
/* returns the memory manager recorded in a rex object */
qse_mmgr_t* qse_rex_getmmgr (qse_rex_t* rex)
{
	return rex->mmgr;
}

/* returns the pointer to the extension area of a rex object */
void* qse_rex_getxtn (qse_rex_t* rex)
{
	return QSE_XTN (rex);
}

/* qse_rex_yield() detaches the compiled expression from a rex object
 * and returns it. the rex object holds no expression afterwards,
 * so qse_rex_fini() no longer destroys the expression returned. */
qse_rex_node_t* qse_rex_yield (qse_rex_t* rex)
{
	qse_rex_node_t* yielded;

	yielded = rex->code;
	rex->code = QSE_NULL;

	return yielded;
}

/* returns the option bits currently set */
int qse_rex_getopt (const qse_rex_t* rex)
{
	return rex->option;
}

/* replaces the option bits with 'opts' */
void qse_rex_setopt (qse_rex_t* rex, int opts)
{
	rex->option = opts;
}

/* returns the error number recorded in a rex object */
qse_rex_errnum_t qse_rex_geterrnum (const qse_rex_t* rex)
{
	return rex->errnum;
}
/* qse_rex_geterrmsg() maps the error number recorded in a rex
 * object to a message. an error number that falls outside the
 * message table yields "unknown error". */
const qse_char_t* qse_rex_geterrmsg (const qse_rex_t* rex)
{
	/* the table is indexed by the error number */
	static const qse_char_t* errstr[] =
	{
		QSE_T("no error"),
		QSE_T("other error"),
		QSE_T("not implemented"),
		QSE_T("subsystem error"),
		QSE_T("internal error that should never have happened"),
		QSE_T("no sufficient memory available"),
		QSE_T("no expression compiled"),
		QSE_T("recursion too deep"),
		QSE_T("right parenthesis expected"),
		QSE_T("right bracket expected"),
		QSE_T("right brace expected"),
		QSE_T("colon expected"),
		QSE_T("invalid character range"),
		QSE_T("invalid character class"),
		QSE_T("invalid occurrence bound"),
		QSE_T("special character at wrong position"),
		QSE_T("premature expression end")
	};

	if (rex->errnum < 0 || rex->errnum >= QSE_COUNTOF(errstr))
	{
		return QSE_T("unknown error");
	}

	return errstr[rex->errnum];
}
/* newnode() allocates a zero-filled node of the type 'id'.
 * if the START node is available in the compilation context,
 * the new node is prepended to its allocation chain so that
 * freeallnodes() can find it later. upon allocation failure,
 * it sets the error number to QSE_REX_ENOMEM and returns QSE_NULL. */
static qse_rex_node_t* newnode (comp_t* c, qse_rex_node_id_t id)
{
	qse_rex_node_t* start = c->start;
	qse_rex_node_t* fresh;

	/* TODO: performance optimization.
	 * preallocate a large chunk of memory and allocate a node
	 * from the chunk. increase the chunk if it has been used up.
	 */
	fresh = (qse_rex_node_t*)
		QSE_MMGR_ALLOC (c->rex->mmgr, QSE_SIZEOF(qse_rex_node_t));
	if (fresh == QSE_NULL)
	{
		c->rex->errnum = QSE_REX_ENOMEM;
		return QSE_NULL;
	}

	QSE_MEMSET (fresh, 0, QSE_SIZEOF(*fresh));
	fresh->id = id;

	/* the START node itself is created while c->start is
	 * still QSE_NULL. it is not chained to anything. */
	if (start == QSE_NULL) return fresh;

	QSE_ASSERT (start->id == QSE_REX_NODE_START);
	fresh->link = start->u.s.link;
	start->u.s.link = fresh;

	return fresh;
}
/* creates the START node which remembers the memory manager
 * and anchors the allocation chain of all other nodes */
static qse_rex_node_t* newstartnode (comp_t* c)
{
	qse_rex_node_t* start = newnode (c, QSE_REX_NODE_START);
	if (start == QSE_NULL) return QSE_NULL;

	start->u.s.mmgr = c->rex->mmgr;
	start->u.s.link = QSE_NULL;
	return start;
}

/* creates the END node */
static qse_rex_node_t* newendnode (comp_t* c)
{
	return newnode (c, QSE_REX_NODE_END);
}

/* creates a NOP node */
static qse_rex_node_t* newnopnode (comp_t* c)
{
	return newnode (c, QSE_REX_NODE_NOP);
}

/* creates a GROUP node. the caller fills in the group details */
static qse_rex_node_t* newgroupnode (comp_t* c)
{
	return newnode (c, QSE_REX_NODE_GROUP);
}

/* creates a GROUPEND node pointing back to its GROUP node */
static qse_rex_node_t* newgroupendnode (comp_t* c, qse_rex_node_t* group)
{
	qse_rex_node_t* ge = newnode (c, QSE_REX_NODE_GROUPEND);
	if (ge == QSE_NULL) return QSE_NULL;

	ge->u.ge.group = group;
	return ge;
}

/* creates a CHAR node for the character 'ch' */
static qse_rex_node_t* newcharnode (comp_t* c, qse_char_t ch)
{
	qse_rex_node_t* cn = newnode (c, QSE_REX_NODE_CHAR);
	if (cn == QSE_NULL) return QSE_NULL;

	cn->u.c = ch;
	return cn;
}

/* creates a BRANCH node. the 'left' path is stored in the next
 * field and the alternative path 'alter' in u.b.alter. */
static qse_rex_node_t* newbranchnode (
	comp_t* c, qse_rex_node_t* left, qse_rex_node_t* alter)
{
	qse_rex_node_t* br = newnode (c, QSE_REX_NODE_BRANCH);
	if (br == QSE_NULL) return QSE_NULL;

	/*br->u.b.left = left; */
	br->next = left;
	br->u.b.alter = alter;
	return br;
}
/* CHECK_END() makes the enclosing function return -1 with the error
 * number set to QSE_REX_EPREEND if the pattern has been exhausted.
 * it can be used only in a function returning an int. */
#define CHECK_END(com) \
	do { \
		if ((com)->ptr >= (com)->end) \
		{ \
			(com)->rex->errnum = QSE_REX_EPREEND; \
			return -1; \
		} \
	} while(0)

/* IS_HEX() tells if 'c' is a hexadecimal digit */
#define IS_HEX(c) \
	(((c) >= QSE_T('0') && (c) <= QSE_T('9')) || \
	 ((c) >= QSE_T('A') && (c) <= QSE_T('F')) || \
	 ((c) >= QSE_T('a') && (c) <= QSE_T('f')))

/* HEX_TO_NUM() converts a hexadecimal digit to its value.
 * the whole expansion is parenthesized on purpose. without the outer
 * parentheses, 'v * 16 + HEX_TO_NUM(c)' is parsed as
 * '(v * 16 + cond)? a: b' because the conditional operator binds
 * weaker than '+', which corrupts multi-digit escape sequences. */
#define HEX_TO_NUM(c) \
	(((c) >= QSE_T('0') && (c) <= QSE_T('9'))? ((c)-QSE_T('0')): \
	 ((c) >= QSE_T('A') && (c) <= QSE_T('F'))? ((c)-QSE_T('A')+10): \
	 ((c)-QSE_T('a')+10))

/* IS_SPE() tells if the current character is the unescaped 'ch' */
#define IS_SPE(com,ch) ((com)->c.value == (ch) && !(com)->c.escaped)
/* IS_ESC() tells if the current character has been escaped */
#define IS_ESC(com) ((com)->c.escaped)
/* IS_EOF() tells if the pattern has been exhausted */
#define IS_EOF(com) ((com)->c.value == QSE_CHAR_EOF)

/* getc_noesc() reads the next character literally while
 * getc_esc() interprets a backslash escape sequence */
#define getc_noesc(c) getc(c,1)
#define getc_esc(c) getc(c,0)
static int getc (comp_t* com, int noesc)
{
qse_char_t c;
if (com->ptr >= com->end)
{
com->c.value = QSE_CHAR_EOF;
com->c.escaped = 0;
return 0;
}
com->c.value = *com->ptr++;
com->c.escaped = 0;
if (noesc || com->c.value != QSE_T('\\')) return 0;
CHECK_END (com);
c = *com->ptr++;
if (c == QSE_T('n')) c = QSE_T('\n');
else if (c == QSE_T('r')) c = QSE_T('\r');
else if (c == QSE_T('t')) c = QSE_T('\t');
else if (c == QSE_T('f')) c = QSE_T('\f');
else if (c == QSE_T('b')) c = QSE_T('\b');
else if (c == QSE_T('v')) c = QSE_T('\v');
else if (c == QSE_T('a')) c = QSE_T('\a');
#if 0
/* backrefernce conflicts with octal notation */
else if (c >= QSE_T('0') && c <= QSE_T('7'))
{
qse_char_t cx;
c = c - QSE_T('0');
CHECK_END (com);
cx = *com->ptr++;
if (cx >= QSE_T('0') && cx <= QSE_T('7'))
{
c = c * 8 + cx - QSE_T('0');
CHECK_END (com);
cx = *com->ptr++;
if (cx >= QSE_T('0') && cx <= QSE_T('7'))
{
c = c * 8 + cx - QSE_T('0');
}
}
}
#endif
else if (c == QSE_T('x'))
{
qse_char_t cx;
CHECK_END (com);
cx = *com->ptr++;
if (IS_HEX(cx))
{
c = HEX_TO_NUM(cx);
CHECK_END (com);
cx = *com->ptr++;
if (IS_HEX(cx))
{
c = c * 16 + HEX_TO_NUM(cx);
}
}
}
#if defined(QSE_CHAR_IS_WCHAR)
else if (c == QSE_T('u') && QSE_SIZEOF(qse_char_t) >= 2)
{
qse_char_t cx;
CHECK_END (com);
cx = *com->ptr++;
if (IS_HEX(cx))
{
qse_size_t i;
c = HEX_TO_NUM(cx);
for (i = 0; i < 3; i++)
{
CHECK_END (com);
cx = *com->ptr++;
if (!IS_HEX(cx)) break;
c = c * 16 + HEX_TO_NUM(cx);
}
}
}
else if (c == QSE_T('U') && QSE_SIZEOF(qse_char_t) >= 4)
{
qse_char_t cx;
CHECK_END (com);
cx = *com->ptr++;
if (IS_HEX(cx))
{
qse_size_t i;
c = HEX_TO_NUM(cx);
for (i = 0; i < 7; i++)
{
CHECK_END (com);
cx = *com->ptr++;
if (!IS_HEX(cx)) break;
c = c * 16 + HEX_TO_NUM(cx);
}
}
}
#endif
com->c.value = c;
com->c.escaped = QSE_TRUE;
#if 0
com->c = (com->ptr < com->end)? *com->ptr++: QSE_CHAR_EOF;
if (com->c == QSE_CHAR_EOF)
qse_printf (QSE_T("getc => <EOF>\n"));
else qse_printf (QSE_T("getc => %c\n"), com->c);
#endif
return 0;
}
/* ccinfo_t describes a named character class usable in a bracket
 * expression like [[:alpha:]]. */
struct ccinfo_t
{
/* the class name and its length */
qse_cstr_t name;
/* the predicate returning non-zero if 'c' belongs to the class */
int (*func) (exec_t* e, qse_char_t c);
};
#define ISBLANK(c) ((c) == QSE_T(' ') || (c) == QSE_T('\t'))

/* the predicates below implement the named character classes listed
 * in the ccinfo table. each returns non-zero if 'c' belongs to the
 * class. the execution context 'e' is consulted only by the classes
 * affected by the QSE_REX_IGNORECASE option. */
static int cc_isalnum (exec_t* e, qse_char_t c) { return QSE_ISALNUM (c); }
static int cc_isalpha (exec_t* e, qse_char_t c) { return QSE_ISALPHA (c); }
static int cc_isblank (exec_t* e, qse_char_t c) { return QSE_ISBLANK(c); }
static int cc_iscntrl (exec_t* e, qse_char_t c) { return QSE_ISCNTRL (c); }
static int cc_isdigit (exec_t* e, qse_char_t c) { return QSE_ISDIGIT (c); }
static int cc_isgraph (exec_t* e, qse_char_t c) { return QSE_ISGRAPH (c); }

static int cc_islower (exec_t* e, qse_char_t c)
{
	/* when case is ignored, a letter of either case belongs to the
	 * class. non-letters must still be rejected. otherwise
	 * [[:lower:]] would match digits and punctuations as well. */
	if (e->rex->option & QSE_REX_IGNORECASE)
		return QSE_ISLOWER (c) || QSE_ISUPPER (c);
	return QSE_ISLOWER (c);
}

static int cc_isprint (exec_t* e, qse_char_t c) { return QSE_ISPRINT (c); }
static int cc_ispunct (exec_t* e, qse_char_t c) { return QSE_ISPUNCT (c); }
static int cc_isspace (exec_t* e, qse_char_t c) { return QSE_ISSPACE (c); }

static int cc_isupper (exec_t* e, qse_char_t c)
{
	/* see cc_islower() for the handling of QSE_REX_IGNORECASE */
	if (e->rex->option & QSE_REX_IGNORECASE)
		return QSE_ISUPPER (c) || QSE_ISLOWER (c);
	return QSE_ISUPPER (c);
}

static int cc_isxdigit (exec_t* e, qse_char_t c) { return QSE_ISXDIGIT (c); }

/* a word character is an alphanumeric character or an underscore */
static int cc_isword (exec_t* e, qse_char_t c)
{
	return QSE_ISALNUM (c) || c == QSE_T('_');
}
/* the table of the character classes supported. charclass() looks up
 * a class name here and returns the index of the entry found. charset()
 * stores the index in the member string of a CSET node and
 * charset_matched() uses it to call the predicate. the entry order
 * therefore must stay stable between compilation and execution.
 * the last entry with a QSE_NULL name terminates the table. */
static struct ccinfo_t ccinfo[] =
{
{ { QSE_T("alnum"), 5 }, cc_isalnum },
{ { QSE_T("alpha"), 5 }, cc_isalpha },
{ { QSE_T("blank"), 5 }, cc_isblank },
{ { QSE_T("cntrl"), 5 }, cc_iscntrl },
{ { QSE_T("digit"), 5 }, cc_isdigit },
{ { QSE_T("graph"), 5 }, cc_isgraph },
{ { QSE_T("lower"), 5 }, cc_islower },
{ { QSE_T("print"), 5 }, cc_isprint },
{ { QSE_T("punct"), 5 }, cc_ispunct },
{ { QSE_T("space"), 5 }, cc_isspace },
{ { QSE_T("upper"), 5 }, cc_isupper },
{ { QSE_T("xdigit"), 6 }, cc_isxdigit },
{ { QSE_T("word"), 4 }, cc_isword },
/*
{ { QSE_T("arabic"), 6 }, cc_isarabic },
{ { QSE_T("chinese"), 7 }, cc_ischinese },
{ { QSE_T("english"), 7 }, cc_isenglish },
{ { QSE_T("japanese"), 8 }, cc_isjapanese },
{ { QSE_T("korean"), 6 }, cc_iskorean },
{ { QSE_T("thai"), 4 }, cc_isthai },
*/
{ { QSE_NULL, 0 }, QSE_NULL }
};
/* charclass() parses the remainder of a character class notation.
 * it is called right after '[:' has been read, with com->ptr
 * pointing at the first character of the class name.
 * it looks up the name in the ccinfo table, requires ':]' to
 * follow the name, and reads in the character after ']'.
 *
 * it returns the index of the class in the ccinfo table on success.
 * on failure, it returns -1 with the error number set to
 * QSE_REX_ECCLASS for an unknown name or a missing colon, or to
 * QSE_REX_ERBRACK for a missing right bracket. */
static int charclass (comp_t* com)
{
	const struct ccinfo_t* ccp;
	qse_size_t len = com->end - com->ptr;

	/* find the first class whose name the remaining text begins with */
	for (ccp = ccinfo; ccp->name.ptr != QSE_NULL; ccp++)
	{
		if (qse_strxbeg(com->ptr,len,ccp->name.ptr) != QSE_NULL) break;
	}

	if (ccp->name.ptr == QSE_NULL)
	{
		/* wrong class name */
		com->rex->errnum = QSE_REX_ECCLASS;
		return -1;
	}

	/* skip the class name */
	com->ptr += ccp->name.len;

	if (getc_noesc(com) <= -1) return -1;
	if (com->c.value != QSE_T(':'))
	{
		com->rex->errnum = QSE_REX_ECCLASS;
		return -1;
	}

	if (getc_noesc(com) <= -1) return -1;
	if (com->c.value != QSE_T(']'))
	{
		com->rex->errnum = QSE_REX_ERBRACK;
		return -1;
	}

	/* read in the character following the closing bracket */
	if (getc_esc(com) <= -1) return -1;

	return (int)(ccp - ccinfo);
}
/* ADD_CSET_CODE() makes the enclosing function return -1
 * if add_cset_code() fails */
#define ADD_CSET_CODE(com,node,code,len) \
	do { if (add_cset_code(com,node,code,len) <= -1) return -1; } while(0)

/* add_cset_code() appends 'l' characters at 'c' to the member
 * string of a CSET node. it returns 0 on success, and -1 with
 * the error number set to QSE_REX_ENOMEM on failure. */
static int add_cset_code (
	comp_t* com, qse_rex_node_t* node, const qse_char_t* c, qse_size_t l)
{
	if (qse_str_ncat(node->u.cset.member,c,l) != (qse_size_t)-1) return 0;

	com->rex->errnum = QSE_REX_ENOMEM;
	return -1;
}
/* charset() compiles a bracket expression into the member string of
 * the CSET node given. it is entered with the current character being
 * the one that follows '[' and returns with the current character
 * being the one that follows the closing ']'.
 *
 * the member string is a sequence of codes:
 *   QSE_REX_CSET_CHAR c        - a single character
 *   QSE_REX_CSET_RANGE c1 c2   - a character range
 *   QSE_REX_CSET_CLASS n       - the index to the ccinfo table
 *
 * it returns 0 on success and -1 on failure with the error number set. */
static int charset (comp_t* com, qse_rex_node_t* node)
{
QSE_ASSERT (node->id == QSE_REX_NODE_CSET);
QSE_ASSERT (node->u.cset.negated == 0);
QSE_ASSERT (node->u.cset.member == QSE_NULL);
if (IS_SPE(com,QSE_T('^')))
{
/* negate an expression */
node->u.cset.negated = 1;
if (getc_noesc(com) <= -1) return -1;
}
/* initialize the member array */
node->u.cset.member = qse_str_open (com->rex->mmgr, 0, 64);
if (node->u.cset.member == QSE_NULL)
{
com->rex->errnum = QSE_REX_ENOMEM;
return -1;
}
/* if ] is the first character or the second character following ^,
 * it is treated literally */
do
{
/* c1/x1 hold the character under inspection and its escape state.
 * c2/x2 hold the lookahead character read right below */
int x1, x2;
qse_char_t c1, c2;
x1 = com->c.escaped;
c1 = com->c.value;
if (c1 == QSE_CHAR_EOF)
{
/* the closing bracket is missing */
com->rex->errnum = QSE_REX_EPREEND;
return -1;
}
if (getc_esc(com) <= -1) return -1;
x2 = com->c.escaped;
c2 = com->c.value;
if (!x1 && c1 == QSE_T('[') &&
!x2 && c2 == QSE_T(':'))
{
int n;
qse_char_t tmp[2];
/* begins with [:
 * don't read in the next character as charclass()
 * matches a class name differently from other routines.
 * if (getc_noesc(com) <= -1) return -1;
 */
if ((n = charclass(com)) <= -1) return -1;
/* the class index is stored as a character */
QSE_ASSERT (n < QSE_TYPE_MAX(qse_char_t));
tmp[0] = QSE_REX_CSET_CLASS;
tmp[1] = n;
ADD_CSET_CODE (com, node, tmp, QSE_COUNTOF(tmp));
}
else if (!x2 && c2 == QSE_T('-'))
{
/* read in the character that may end a range */
if (getc_esc(com) <= -1) return -1;
if (IS_SPE(com, QSE_T(']')))
{
qse_char_t tmp[4];
/* '-' is the last character in the set.
 * treat it literally */
tmp[0] = QSE_REX_CSET_CHAR;
tmp[1] = c1;
tmp[2] = QSE_REX_CSET_CHAR;
tmp[3] = c2;
ADD_CSET_CODE (com, node, tmp, QSE_COUNTOF(tmp));
break;
}
if (c1 > com->c.value)
{
/* range end must be >= range start */
com->rex->errnum = QSE_REX_ECRANGE;
return -1;
}
else if (c1 == com->c.value)
{
/* if two chars in the range are the same,
 * treat it as a single character */
qse_char_t tmp[2];
tmp[0] = QSE_REX_CSET_CHAR;
tmp[1] = c1;
ADD_CSET_CODE (com, node, tmp, QSE_COUNTOF(tmp));
}
else
{
qse_char_t tmp[3];
tmp[0] = QSE_REX_CSET_RANGE;
tmp[1] = c1;
tmp[2] = com->c.value;
ADD_CSET_CODE (com, node, tmp, QSE_COUNTOF(tmp));
}
/* move past the range end */
if (getc_esc(com) <= -1) return -1;
}
else
{
/* a plain member. the lookahead character already read
 * is inspected in the next iteration */
qse_char_t tmp[2];
tmp[0] = QSE_REX_CSET_CHAR;
tmp[1] = c1;
ADD_CSET_CODE (com, node, tmp, QSE_COUNTOF(tmp));
}
}
while (!IS_SPE(com,QSE_T(']')));
/* skip ']' */
if (getc_esc(com) <= -1) return -1;
return 0;
}
static int occbound (comp_t* com, qse_rex_node_t* n)
{
qse_size_t bound;
bound = 0;
while (com->c.value >= QSE_T('0') && com->c.value <= QSE_T('9'))
{
bound = bound * 10 + com->c.value - QSE_T('0');
if (getc_noesc(com) <= -1) return -1;
}
n->occ.min = bound;
if (com->c.value == QSE_T(','))
{
if (getc_noesc(com) <= -1) return -1;
if (com->c.value >= QSE_T('0') && com->c.value <= QSE_T('9'))
{
bound = 0;
do
{
bound = bound * 10 + com->c.value - QSE_T('0');
if (getc_noesc(com) <= -1) return -1;
}
while (com->c.value >= QSE_T('0') && com->c.value <= QSE_T('9'));
n->occ.max = bound;
}
else n->occ.max = OCC_MAX;
}
else n->occ.max = n->occ.min;
if (n->occ.min > n->occ.max)
{
/* invalid occurrences range */
com->rex->errnum = QSE_REX_EBOUND;
return -1;
}
if (com->c.value != QSE_T('}'))
{
com->rex->errnum = QSE_REX_ERBRACE;
return -1;
}
if (getc_esc(com) <= -1) return -1;
return 0;
}
static qse_rex_node_t* comp_branches (comp_t* com, qse_rex_node_t* ge);

/* comp_group() compiles a parenthesized subgroup. it is entered with
 * the current character being '(' and returns with the current
 * character being the one that follows the matching ')'.
 * it returns the GROUP node whose u.g.head points to the compiled
 * body and whose u.g.end points to the paired GROUPEND node.
 * it returns QSE_NULL on failure with the error number set. */
static qse_rex_node_t* comp_group (comp_t* com)
{
	qse_rex_node_t* grp, * grpend, * inner;

	grp = newgroupnode (com);
	if (grp == QSE_NULL) return QSE_NULL;

	grpend = newgroupendnode (com, grp);
	if (grpend == QSE_NULL) return QSE_NULL;

	/* move past the opening parenthesis and enter the subgroup */
	if (getc_esc(com) <= -1) return QSE_NULL;
	com->gdepth++;

	/* every branch in the subgroup gets its last node
	 * linked to the GROUPEND node passed here */
	inner = comp_branches (com, grpend);
	if (inner == QSE_NULL) return QSE_NULL;

	if (!IS_SPE(com,QSE_T(')')))
	{
		com->rex->errnum = QSE_REX_ERPAREN;
		return QSE_NULL;
	}

	/* leave the subgroup and move past the closing parenthesis */
	com->gdepth--;
	if (getc_esc(com) <= -1) return QSE_NULL;

	grp->u.g.head = inner;
	grp->u.g.end = grpend;
	return grp;
}
/* comp_occ() applies an occurrence specifier - ?, *, + or a brace
 * bound - found at the current character to 'atom'. if the current
 * character is none of them, the atom is left untouched. a brace is
 * not taken as a specifier when QSE_REX_NOBOUND is set.
 * it returns the atom on success and QSE_NULL on failure. */
static qse_rex_node_t* comp_occ (comp_t* com, qse_rex_node_t* atom)
{
	qse_cint_t ch = com->c.value;

	if (ch == QSE_T('{'))
	{
		/* leave '{' alone if the bound notation is disabled */
		if (com->rex->option & QSE_REX_NOBOUND) return atom;

		if (getc_noesc(com) <= -1) return QSE_NULL;
		if (occbound(com,atom) <= -1) return QSE_NULL;
		return atom;
	}

	if (ch == QSE_T('?'))
	{
		atom->occ.min = 0;
		atom->occ.max = 1;
	}
	else if (ch == QSE_T('*'))
	{
		atom->occ.min = 0;
		atom->occ.max = OCC_MAX;
	}
	else if (ch == QSE_T('+'))
	{
		atom->occ.min = 1;
		atom->occ.max = OCC_MAX;
	}
	else
	{
		/* no occurrence specifier */
		return atom;
	}

	/* skip the specifier */
	if (getc_esc(com) <= -1) return QSE_NULL;
	return atom;
}
/* comp_atom() compiles a single atom at the current character: a
 * subgroup, a bracket expression, one of the special nodes for
 * '.', '^', '$', or an ordinary character. an escaped character is
 * always ordinary. the atom gets the occurrence of exactly 1 unless
 * an unescaped occurrence specifier follows it.
 *
 * with QSE_REX_STRICT set, an unescaped ')', '?', '*', '+' and - unless
 * QSE_REX_NOBOUND is set - '{' are rejected with QSE_REX_ESPCAWP when
 * found where an ordinary character is expected.
 *
 * it returns the atom on success and QSE_NULL on failure. */
static qse_rex_node_t* comp_atom (comp_t* com)
{
	qse_rex_node_t* atom;
	qse_cint_t ch = com->c.value;
	int unescaped = !IS_ESC(com);

	if (unescaped && ch == QSE_T('('))
	{
		/* comp_group() consumes the parentheses by itself */
		atom = comp_group (com);
		if (atom == QSE_NULL) return QSE_NULL;
	}
	else if (unescaped && ch == QSE_T('['))
	{
		atom = newnode (com, QSE_REX_NODE_CSET);
		if (atom == QSE_NULL) return QSE_NULL;
		if (getc_esc(com) <= -1) return QSE_NULL;
		if (charset(com, atom) <= -1) return QSE_NULL;
	}
	else
	{
		if (unescaped && ch == QSE_T('.'))
		{
			atom = newnode (com, QSE_REX_NODE_ANY);
		}
		else if (unescaped && ch == QSE_T('^'))
		{
			atom = newnode (com, QSE_REX_NODE_BOL);
		}
		else if (unescaped && ch == QSE_T('$'))
		{
			atom = newnode (com, QSE_REX_NODE_EOL);
		}
		else
		{
			if (unescaped && (com->rex->option & QSE_REX_STRICT))
			{
				/* check if a special character is at the
				 * position that requires a normal character.
				 * { is a normal character when bound is disabled */
				int misplaced =
					ch == QSE_T(')') || ch == QSE_T('?') ||
					ch == QSE_T('*') || ch == QSE_T('+') ||
					(ch == QSE_T('{') &&
					 !(com->rex->option & QSE_REX_NOBOUND));

				if (misplaced)
				{
					/* it's at the wrong position */
					com->rex->errnum = QSE_REX_ESPCAWP;
					return QSE_NULL;
				}
			}

			/* normal character */
			atom = newcharnode (com, com->c.value);
		}

		if (atom == QSE_NULL) return QSE_NULL;

		/* move past the character just compiled */
		if (getc_esc(com) <= -1) return QSE_NULL;
	}

	atom->occ.min = 1;
	atom->occ.max = 1;

	/* handle the occurrence specifier, if any.
	 * an escaped character is never a specifier */
	if (!IS_ESC(com) && comp_occ (com, atom) == QSE_NULL) return QSE_NULL;

	return atom;
}
/* the two helpers below are disabled and never compiled. they sketch
 * an alternative way to build a loop with a BRANCH node for an atom
 * repeated without an upper bound. one_or_more() is unfinished as its
 * own TODO remark states. pseudo_group() is used instead. */
#if 0
static qse_rex_node_t* zero_or_more (comp_t* c, qse_rex_node_t* atom)
{
qse_rex_node_t* b;
b = newbranchnode (c, QSE_NULL, atom);
if (b == QSE_NULL) return QSE_NULL;
atom->occ.min = 1;
atom->occ.max = 1;
atom->next = b;
return b;
}
static qse_rex_node_t* one_or_more (comp_t* c, qse_rex_node_t* atom)
{
qse_rex_node_t* b;
b = newbranchnode (c, atom, QSE_NULL);
atom->occ.min = 1;
atom->occ.max = 1;
atom->next = b;
TODO: return b as the tail....
return atom;
}
#endif
/* pseudo_group() wraps an atom whose minimum occurrence is 0 in a
 * pseudo group so that the atom can be bypassed altogether:
 *
 *   GROUP -> BRANCH -> atom (minimum raised to 1) -> GROUPEND
 *                   -> GROUPEND (the bypass)
 *
 * both the GROUP node and the GROUPEND node are flagged pseudo.
 * it returns the GROUP node, or QSE_NULL upon allocation failure.
 *
 * NOTE(review): a bound like {0} or {0,0} leaves atom->occ.max at 0,
 * which trips the second assertion below once the minimum has been
 * raised to 1 - confirm how such a bound is meant to be handled. */
static qse_rex_node_t* pseudo_group (comp_t* c, qse_rex_node_t* atom)
{
	qse_rex_node_t* head, * tail, * fork;

	QSE_ASSERT (atom->occ.min <= 0);

	head = newgroupnode (c);
	if (head == QSE_NULL) return QSE_NULL;

	tail = newgroupendnode (c, head);
	if (tail == QSE_NULL) return QSE_NULL;

	/* the branch goes to the atom or straight to the tail */
	fork = newbranchnode (c, atom, tail);
	if (fork == QSE_NULL) return QSE_NULL;

	/* being optional is now expressed by the bypass path */
	atom->occ.min = 1;
	atom->next = tail;
	QSE_ASSERT (atom->occ.max >= atom->occ.min);

	/* the pseudo group itself occurs exactly once */
	head->occ.min = 1;
	head->occ.max = 1;
	head->u.g.head = fork;
	head->u.g.end = tail;
	head->u.g.pseudo = 1;
	tail->u.ge.pseudo = 1;

	return head;
}
/* comp_branch() compiles a list of atoms at the outermost level
 * and/or within a subgroup. compilation stops at the end of the
 * pattern, at an unescaped '|', or at an unescaped ')' inside a
 * subgroup. an empty branch yields a single NOP node.
 *
 * the first and the last node of the chain built are stored into
 * 'pair'. it returns the first node, or QSE_NULL on failure. */
static qse_rex_node_t* comp_branch (comp_t* c, pair_t* pair)
{
#define REACHED_END(c) \
	(IS_EOF(c) || IS_SPE(c,QSE_T('|')) || \
	 (c->gdepth > 0 && IS_SPE(c,QSE_T(')'))))

	qse_rex_node_t* atom;

	if (REACHED_END(c))
	{
		/* nothing to compile. a NOP stands for the empty branch */
		qse_rex_node_t* nop = newnopnode (c);
		if (nop == QSE_NULL) return QSE_NULL;

		nop->occ.min = 1;
		nop->occ.max = 1;
		pair->head = nop;
		pair->tail = nop;
		return pair->head;
	}

	pair->head = QSE_NULL;
	pair->tail = QSE_NULL;

	do
	{
		atom = comp_atom (c);
		if (atom == QSE_NULL) return QSE_NULL;

		if (atom->occ.min <= 0)
		{
			/*
			 * Given an atom, enclose it with a
			 * pseudogroup head and a pseudogroup
			 * tail. the head is followed by a
			 * branch that connects to the tail
			 * and the atom given. The atom given
			 * gets connected to the tail.
			 *   Head -> BR -> Tail
			 *              -> ORG(atom) -> Tail
			 */
			atom = pseudo_group (c, atom);
			if (atom == QSE_NULL) return QSE_NULL;
		}

		/* append the atom to the chain */
		if (pair->tail == QSE_NULL)
		{
			QSE_ASSERT (pair->head == QSE_NULL);
			pair->head = atom;
		}
		else
		{
			pair->tail->next = atom;
		}
		pair->tail = atom;
	}
	while (!REACHED_END(c));

	return pair->head;

#undef REACHED_END
}
/* comp_branches() compiles one or more branches separated by an
 * unescaped '|'. the last node of each branch is linked to 'ge',
 * the node that follows the whole alternation. each additional
 * branch is combined with what has been compiled so far by a
 * BRANCH node. it returns the entry node of the alternation, or
 * QSE_NULL on failure. */
static qse_rex_node_t* comp_branches (comp_t* c, qse_rex_node_t* ge)
{
	pair_t seg;
	qse_rex_node_t* entry, * alt, * fork;

	entry = comp_branch (c, &seg);
	if (entry == QSE_NULL) return QSE_NULL;
	seg.tail->next = ge;

	for (;;)
	{
		if (!IS_SPE(c,QSE_T('|'))) break;

		/* skip '|' */
		if (getc_esc(c) <= -1) return QSE_NULL;

		alt = comp_branch (c, &seg);
		if (alt == QSE_NULL) return QSE_NULL;
		seg.tail->next = ge;

		fork = newbranchnode (c, entry, alt);
		if (fork == QSE_NULL) return QSE_NULL;
		entry = fork;
	}

	return entry;
}
/* qse_rex_comp() compiles the pattern of 'len' characters at 'ptr'.
 * on success, the expression previously held by the rex object, if
 * any, is destroyed and replaced with the new one, whose START node
 * is returned. on failure, it returns QSE_NULL with the error number
 * set and leaves the previous expression intact. */
qse_rex_node_t* qse_rex_comp (qse_rex_t* rex, const qse_char_t* ptr, qse_size_t len)
{
	comp_t ctx;
	qse_rex_node_t* tail, * prog;

	ctx.rex = rex;
	ctx.re.ptr = ptr;
	ctx.re.len = len;
	ctx.ptr = ptr;
	ctx.end = ptr + len;
	ctx.c.value = QSE_CHAR_EOF;
	ctx.gdepth = 0;
	ctx.start = QSE_NULL;

	/* read the first character */
	if (getc_esc(&ctx) <= -1) return QSE_NULL;

	/* the START node is created with ctx.start being QSE_NULL.
	 * all nodes created afterwards are chained to it */
	ctx.start = newstartnode (&ctx);
	if (ctx.start == QSE_NULL) return QSE_NULL;

	tail = newendnode (&ctx);
	if (tail == QSE_NULL)
	{
		/* nothing has been chained to the START node yet */
		freenode (ctx.start, ctx.rex->mmgr);
		return QSE_NULL;
	}

	prog = comp_branches (&ctx, tail);
	if (prog == QSE_NULL)
	{
		freeallnodes (ctx.start);
		return QSE_NULL;
	}
	ctx.start->next = prog;

	/* replace the old expression with the new one */
	if (rex->code != QSE_NULL) freeallnodes (rex->code);
	rex->code = ctx.start;

	return rex->code;
}
/* freegroupstackmembers() frees every element of the list
 * starting at 'gs'. no check is made on what the first element is. */
static void freegroupstackmembers (group_t* gs, qse_mmgr_t* mmgr)
{
	group_t* cur, * nxt;

	for (cur = gs; cur != QSE_NULL; cur = nxt)
	{
		nxt = cur->next;
		QSE_MMGR_FREE (mmgr, cur);
	}
}

/* freegroupstack() frees a whole group stack including its head.
 * 'gs' must be the management head of the stack. */
static void freegroupstack (group_t* gs, qse_mmgr_t* mmgr)
{
	QSE_ASSERT (gs != QSE_NULL);
	QSE_ASSERTX (gs->node == QSE_NULL,
		"The head of a group stack must point to QSE_NULL for management purpose.");

	freegroupstackmembers (gs, mmgr);
}
/* refupgroupstack() increments the reference count kept in the
 * head of a group stack. a QSE_NULL stack is ignored. */
static void refupgroupstack (group_t* gs)
{
	if (gs == QSE_NULL) return;

	QSE_ASSERTX (gs->node == QSE_NULL,
		"The head of a group stack must point to QSE_NULL for management purpose.");
	gs->occ++;
}

/* refdowngroupstack() decrements the reference count kept in the
 * head of a group stack and frees the stack when the count drops
 * to 0. a QSE_NULL stack is ignored. */
static void refdowngroupstack (group_t* gs, qse_mmgr_t* mmgr)
{
	if (gs == QSE_NULL) return;

	QSE_ASSERTX (gs->node == QSE_NULL,
		"The head of a group stack must point to QSE_NULL for management purpose.");

	gs->occ--;
	if (gs->occ <= 0) freegroupstack (gs, mmgr);
}
/* dupgroupstackmembers() duplicates the list of group stack elements
 * starting at 'g', preserving the order. 'g' may be the head of a
 * stack or any data element in it.
 * it returns the first element of the duplicate. upon allocation
 * failure, it frees whatever has been duplicated so far, sets the
 * error number to QSE_REX_ENOMEM and returns QSE_NULL. */
static group_t* dupgroupstackmembers (exec_t* e, group_t* g)
{
	group_t* yg, * xg = QSE_NULL;

	QSE_ASSERT (g != QSE_NULL);

	if (g->next != QSE_NULL)
	{
		/* TODO: make it non recursive or
		 * implement stack overflow protection */
		xg = dupgroupstackmembers (e, g->next);
		if (xg == QSE_NULL) return QSE_NULL;
	}

	yg = (group_t*) QSE_MMGR_ALLOC (e->rex->mmgr, QSE_SIZEOF(*yg));
	if (yg == QSE_NULL)
	{
		/* xg is a partial chain that begins with a duplicated data
		 * element, not a management head. freegroupstack() asserts
		 * on a head and must not be used here. */
		if (xg != QSE_NULL) freegroupstackmembers (xg, e->rex->mmgr);
		e->rex->errnum = QSE_REX_ENOMEM;
		return QSE_NULL;
	}

	QSE_MEMCPY (yg, g, QSE_SIZEOF(*yg));
	yg->next = xg;
	return yg;
}
/* dupgroupstack() duplicates a whole group stack including its head.
 * the duplicate starts with the reference count of 0.
 * it returns the head of the duplicate, or QSE_NULL on failure. */
static group_t* dupgroupstack (exec_t* e, group_t* gs)
{
	group_t* copy;

	QSE_ASSERT (gs != QSE_NULL);
	QSE_ASSERTX (gs->node == QSE_NULL,
		"The head of a group stack must point to QSE_NULL for management purpose.");

	copy = dupgroupstackmembers (e, gs);
	if (copy != QSE_NULL)
	{
		QSE_ASSERTX (
			copy->node == QSE_NULL &&
			copy->node == gs->node &&
			copy->occ == gs->occ,
			"The duplicated stack head must not be corrupted"
		);

		/* reset the reference count of a duplicated stack */
		copy->occ = 0;
	}

	return copy;
}
/* push 'gn' to the group stack 'gs'.
 * if 'gs' is QSE_NULL, a new stack is created with a management head.
 * otherwise, if dup is non-zero, the group stack is duplicated and
 * 'gn' is pushed to the top of the duplicate, leaving 'gs' intact.
 * if dup is zero, 'gn' is pushed to 'gs' itself.
 * it returns the head of the stack pushed to, or QSE_NULL with the
 * error number set upon allocation failure. */
static group_t* __groupstackpush (
	exec_t* e, group_t* gs, qse_rex_node_t* gn, int dup)
{
	group_t* head, * elem;

	QSE_ASSERT (gn->id == QSE_REX_NODE_GROUP);

	if (gs == QSE_NULL)
	{
		/* gn is the first group pushed. no stack yet.
		 * create the head to store management info. */
		head = (group_t*) QSE_MMGR_ALLOC (e->rex->mmgr, QSE_SIZEOF(*head));
		if (head == QSE_NULL)
		{
			e->rex->errnum = QSE_REX_ENOMEM;
			return QSE_NULL;
		}

		/* the head points to no group node. its occ field is the
		 * reference count updated by refupgroupstack() and
		 * refdowngroupstack(). its next field links to the top. */
		head->node = QSE_NULL;
		head->occ = 0;
		head->next = QSE_NULL;
	}
	else if (dup)
	{
		/* duplicate existing stack */
		head = dupgroupstack (e, gs);
		if (head == QSE_NULL) return QSE_NULL;
	}
	else
	{
		head = gs;
	}

	/* create a new stack element */
	elem = (group_t*) QSE_MMGR_ALLOC (e->rex->mmgr, QSE_SIZEOF(*elem));
	if (elem == QSE_NULL)
	{
		/* rollback. release only what was created in this call */
		if (gs == QSE_NULL)
		{
			QSE_MMGR_FREE (e->rex->mmgr, head);
		}
		else if (dup)
		{
			freegroupstack (head, e->rex->mmgr);
		}

		e->rex->errnum = QSE_REX_ENOMEM;
		return QSE_NULL;
	}

	/* initialize the element and make it the top */
	elem->node = gn;
	elem->occ = 0;
	elem->next = head->next;
	head->next = elem;

	return head;
}

#define dupgroupstackpush(e,gs,gn) __groupstackpush(e,gs,gn,1)
#define groupstackpush(e,gs,gn) __groupstackpush(e,gs,gn,0)
/* duplicate a group stack excluding the top data element.
 * the original stack is left intact. the duplicate gets a new
 * head whose reference count is 0. the stack must hold at least
 * two data elements. it returns the head of the duplicate, or
 * QSE_NULL with the error number set upon allocation failure. */
static group_t* dupgroupstackpop (exec_t* e, group_t* gs)
{
	group_t* below, * head;

	QSE_ASSERT (gs != QSE_NULL);
	QSE_ASSERTX (gs->node == QSE_NULL,
		"The head of a group stack must point to QSE_NULL for management purpose.");
	QSE_ASSERTX (gs->next != QSE_NULL && gs->next->next != QSE_NULL,
		"dupgroupstackpop() needs at least two data elements");

	/* copy everything below the top data element */
	below = dupgroupstackmembers (e, gs->next->next);
	if (below == QSE_NULL) return QSE_NULL;

	head = (group_t*) QSE_MMGR_ALLOC (e->rex->mmgr, QSE_SIZEOF(*head));
	if (head == QSE_NULL)
	{
		freegroupstackmembers (below, e->rex->mmgr);
		e->rex->errnum = QSE_REX_ENOMEM;
		return QSE_NULL;
	}

	head->node = QSE_NULL;
	head->occ = 0;
	head->next = below;

	return head;
}
/* groupstackpop() removes and frees the top data element of a
 * group stack in place. the stack must hold at least two data
 * elements. it always returns the head given. */
static group_t* groupstackpop (exec_t* e, group_t* gs)
{
	group_t* popped;

	QSE_ASSERT (gs != QSE_NULL);
	QSE_ASSERTX (gs->node == QSE_NULL,
		"The head of a group stack must point to QSE_NULL for management purpose.");
	QSE_ASSERTX (gs->next != QSE_NULL && gs->next->next != QSE_NULL,
		"groupstackpop() needs at least two data elements");

	/* unlink the top before releasing it */
	popped = gs->next;
	gs->next = popped->next;
	QSE_MMGR_FREE (e->rex->mmgr, popped);

	return gs;
}
/* addsimplecand() appends a candidate for a simple node to the
 * pending candidate array unless an equal candidate is in the array
 * already. the group stack gets its reference count incremented only
 * when the candidate is actually appended.
 * it returns 0 on success, including the case of a duplicate, and -1
 * with the error number set to QSE_REX_ENOMEM on insertion failure. */
static int addsimplecand (
	exec_t* e, group_t* group, qse_rex_node_t* node,
	qse_size_t occ, const qse_char_t* mptr)
{
	qse_arr_t* pending = &e->cand.set[e->cand.pending];
	cand_t cand;

	QSE_ASSERT (
		node->id == QSE_REX_NODE_NOP ||
		node->id == QSE_REX_NODE_BOL ||
		node->id == QSE_REX_NODE_EOL ||
		node->id == QSE_REX_NODE_ANY ||
		node->id == QSE_REX_NODE_CHAR ||
		node->id == QSE_REX_NODE_CSET
	);

	cand.node = node;
	cand.occ = occ;
	cand.group = group;
	cand.mptr = mptr;

	/* exclude any existing entries in the array.
	 * see comp_cand() for the equality test used.
	 * note this linear search may be a performance bottleneck
	 * if the array grows large. not so sure if it should be
	 * switched to a different data structure such as a hash table.
	 * the problem is that most practical regular expressions
	 * won't have many candidates for a particular match point.
	 * so i'm a bit skeptical about data struct switching.
	 */
	if (qse_arr_search (pending, 0, &cand, 1) != QSE_ARR_NIL) return 0;

	/* append the candidate to the end of the array */
	if (qse_arr_insert (pending, QSE_ARR_SIZE(pending), &cand, 1) == QSE_ARR_NIL)
	{
		e->rex->errnum = QSE_REX_ENOMEM;
		return -1;
	}

	/* the reference must be decremented by the freeer */
	refupgroupstack (group);
	return 0;
}
/* addcands() function adds a candidate from candnode.
 * if candnode is not a simple node, it traverses further
 * until it reaches a simple node. prevnode is the last
 * GROUPEND node visited during traversal. If no GROUPEND
 * is visited yet, it can be any starting node.
 *
 * 'group' is the group stack in effect for candnode and 'mptr' is
 * the match pointer to record in the candidates added. reaching
 * the END node records a match instead of adding a candidate.
 *
 * it returns 0 on success and -1 on failure. */
static int addcands (
exec_t* e, group_t* group, qse_rex_node_t* prevnode,
qse_rex_node_t* candnode, const qse_char_t* mptr)
{
qse_rex_node_t* curcand = candnode;
/* the traversal jumps back here instead of recursing whenever
 * only a single path is left to follow */
warpback:
/* skip all NOP nodes */
while (curcand != QSE_NULL && curcand->id == QSE_REX_NODE_NOP)
curcand = curcand->next;
/* nothing to add */
if (curcand == QSE_NULL) return 0;
switch (curcand->id)
{
case QSE_REX_NODE_END:
{
/* the whole expression has been matched up to mptr.
 * keep the farthest match end seen so far */
if (e->matchend == QSE_NULL || mptr >= e->matchend)
e->matchend = mptr;
e->nmatches++;
break;
}
case QSE_REX_NODE_BRANCH:
{
group_t* gx = group;
int n;
/* the alternative path is explored recursively on a duplicate
 * of the group stack while the other path continues on the
 * original one */
if (group != QSE_NULL)
{
gx = dupgroupstack (e, group);
if (gx == QSE_NULL) return -1;
}
/* hold a reference during the call so that the duplicate is
 * freed on return unless a candidate has taken it over */
refupgroupstack (gx);
n = addcands (e, gx,
prevnode, curcand->u.b.alter, mptr);
refdowngroupstack (gx, e->rex->mmgr);
if (n <= -1) return -1;
curcand = curcand->next;
goto warpback;
}
case QSE_REX_NODE_GROUP:
{
qse_rex_node_t* front;
group_t* gx;
#ifdef XTRA_DEBUG
qse_printf (QSE_T("DEBUG: GROUP %p(pseudo=%d) PREV %p\n"),
curcand, curcand->u.g.pseudo, prevnode);
#endif
if (curcand->u.g.pseudo)
{
/* a pseudo group is not traced in the group stack.
 * just step into it */
curcand = curcand->u.g.head;
goto warpback;
}
/* skip all NOP nodes */
front = curcand->u.g.head;
while (front->id == QSE_REX_NODE_NOP)
front = front->next;
if (front->id == QSE_REX_NODE_GROUPEND)
{
/* if GROUPEND is reached, the group
 * is empty. jump to the next node
 * regardless of its occurrence.
 * however, this will never be reached
 * as it has been removed in comp() */
curcand = curcand->next;
goto warpback;
}
/* trace the group entered in the group stack */
gx = groupstackpush (e, group, curcand);
if (gx == QSE_NULL) return -1;
/* add the first node in the group to
 * the candidate array */
group = gx;
curcand = front;
goto warpback;
}
case QSE_REX_NODE_GROUPEND:
{
int n;
group_t* top;
qse_rex_node_t* node;
qse_size_t occ;
#ifdef XTRA_DEBUG
qse_printf (QSE_T("DEBUG: GROUPEND %p(pseudo=%d) PREV %p\n"),
curcand, curcand->u.ge.pseudo, prevnode);
#endif
if (curcand->u.ge.pseudo)
{
/* continue with the node following the pseudo group */
curcand = curcand->u.ge.group->next;
goto warpback;
}
QSE_ASSERTX (
group != QSE_NULL && group->next != QSE_NULL,
"GROUPEND must be paired up with GROUP");
if (prevnode == curcand)
{
/* consider a pattern like (x*)*.
 * when GROUPEND is reached, an 'if' block
 * below tries to add the first node
 * (node->u.g.head) in the group again.
 * however, it('x') is optional, a possible
 * path reach GROUPEND directly without
 * adding a candidate. this check is needed to
 * avoid the infinite loop, which otherwise is
 * not avoidable. */
break;
}
/* the group on the top of the stack has occurred once more */
top = group->next;
top->occ++;
occ = top->occ;
node = top->node;
QSE_ASSERTX (node == curcand->u.ge.group,
"The GROUP node in the group stack must be the one pairing up with the GROUPEND node."
);
if (occ >= node->occ.min)
{
group_t* gx;
/* the lower bound has been met.
 * for a pattern (abc){3,4}, 'abc' has been
 * repeated 3 times. in this case, the next
 * node can be added to the candidate array.
 * it is actually a branch case. move on. */
if (top->next == QSE_NULL)
{
/* only one element in the stack.
 * falls back to QSE_NULL regardless
 * of the need to reuse it */
gx = QSE_NULL;
}
else if (occ < node->occ.max)
{
/* check if the group will be repeated.
 * if so, duplicate the group stack
 * excluding the top. it goes along a
 * different path and hence requires
 * duplication. */
gx = dupgroupstackpop (e, group);
if (gx == QSE_NULL) return -1;
}
else
{
/* reuse the group stack. pop the top
 * data element off the stack */
gx = groupstackpop (e, group);
/* this function always succeeds and
 * returns the same head */
QSE_ASSERT (gx == group);
}
/* follow the node after the group with the stack chosen above.
 * keep the GROUPEND node visited earlier as prevnode if there
 * is one, or pass this GROUPEND node otherwise */
refupgroupstack (gx);
if (prevnode != QSE_NULL &&
prevnode->id == QSE_REX_NODE_GROUPEND)
n = addcands (e, gx, prevnode, node->next, mptr);
else
n = addcands (e, gx, curcand, node->next, mptr);
refdowngroupstack (gx, e->rex->mmgr);
if (n <= -1) return -1;
}
if (occ < node->occ.max)
{
/* repeat itself. */
prevnode = curcand;
curcand = node->u.g.head;
goto warpback;
}
break;
}
default:
{
int n;
/* a simple node. hold a reference across the call so that a
 * stack left unreferenced, for example when the candidate
 * turns out to be a duplicate, is freed here */
if (group) refupgroupstack (group);
n = addsimplecand (e, group, curcand, 1, mptr);
if (group) refdowngroupstack (group, e->rex->mmgr);
if (n <= -1) return -1;
break;
}
}
return 0;
}
/*
 * Tests whether the character 'c' is a member of the character set
 * stored in 'node'.
 *
 * The member string (node->u.cset.member) is scanned as a sequence of
 * records, each introduced by a membership code:
 *   QSE_REX_CSET_CHAR  - followed by a single character
 *   QSE_REX_CSET_RANGE - followed by the lower and the upper bound
 *   QSE_REX_CSET_CLASS - followed by an index into ccinfo
 *
 * When QSE_REX_IGNORECASE is set, CHAR and RANGE records are compared
 * after folding both sides with QSE_TOUPPER. Scanning stops at the first
 * record that matches. The result is inverted for a negated set.
 *
 * Returns non-zero if 'c' is accepted by the set, 0 otherwise. An unknown
 * membership code yields 0 without applying the negation.
 */
static int charset_matched (exec_t* e, qse_rex_node_t* node, qse_char_t c)
{
	const qse_char_t* cur, * stop;
	int found = 0;

	QSE_ASSERT (node->u.cset.member != QSE_NULL);

	cur = QSE_STR_PTR (node->u.cset.member);
	stop = cur + QSE_STR_LEN (node->u.cset.member);

	/* each case leaves 'cur' on the last item of the record it
	 * consumed; the loop step then moves on to the next record. */
	for (; cur < stop && !found; cur++)
	{
		switch (*cur)
		{
			case QSE_REX_CSET_CHAR:
			{
				/* step onto the member character */
				cur++;
				if (e->rex->option & QSE_REX_IGNORECASE)
				{
					if (QSE_TOUPPER(c) == QSE_TOUPPER(*cur)) found = !0;
				}
				else if (c == *cur) found = !0;
				break;
			}

			case QSE_REX_CSET_RANGE:
			{
				qse_char_t lo, hi, probe;

				/* fetch the two bounds following the code */
				cur++; lo = *cur;
				cur++; hi = *cur;
				probe = c;

				if (e->rex->option & QSE_REX_IGNORECASE)
				{
					lo = QSE_TOUPPER(lo);
					hi = QSE_TOUPPER(hi);
					probe = QSE_TOUPPER(probe);
				}

				if (probe >= lo && probe <= hi) found = !0;
				break;
			}

			case QSE_REX_CSET_CLASS:
			{
				/* the item after the code selects a ccinfo entry */
				qse_char_t cls;

				cur++;
				cls = *cur;
				QSE_ASSERT (cls < QSE_COUNTOF(ccinfo));
				if (ccinfo[cls].func(e,c)) found = !0;
				break;
			}

			default:
			{
				QSE_ASSERTX (0,
					"SHOUL NEVER HAPPEN - membership code for a character set must be one of QSE_REX_CSET_CHAR, QSE_REX_CSET_RANGE, QSE_REX_CSET_CLASS");
				/* reaching here means the member string is
				 * corrupt. report no match. */
				return 0;
			}
		}
	}

	return node->u.cset.negated? !found: found;
}
/*
 * Array walker applied to every candidate in the active set; 'ctx' is
 * the exec_t in use.
 *
 * The candidate's node is tested at the candidate's match pointer:
 *   BOL  matches only at e->str.ptr and consumes nothing,
 *   EOL  matches at or beyond e->str.end and consumes nothing,
 *   ANY, CHAR and CSET need a character before e->sub.end and consume it.
 *
 * On a match, the successor node is submitted through addcands() once the
 * occurrence count has reached node->occ.min, and the same node is queued
 * again with an incremented count while it is below node->occ.max.
 *
 * Returns QSE_ARR_WALK_FORWARD to continue with the next candidate, or
 * QSE_ARR_WALK_STOP if duplicating the group stack or adding a candidate
 * fails.
 */
static qse_arr_walk_t walk_cands_for_match (
	qse_arr_t* arr, qse_size_t index, void* ctx)
{
	exec_t* e = (exec_t*)ctx;
	cand_t* cand = QSE_ARR_DPTR(arr,index);
	qse_rex_node_t* node = cand->node;
	/* match pointer to continue from; QSE_NULL while unmatched */
	const qse_char_t* next = QSE_NULL;
	int rc;

	switch (node->id)
	{
		case QSE_REX_NODE_BOL:
			if (cand->mptr != e->str.ptr) break;
			/* ^ matches a position, not a character.
			 * the match pointer stays put. */
			next = cand->mptr;
#ifdef XTRA_DEBUG
			qse_printf (QSE_T("DEBUG: matched <^>\n"));
#endif
			break;

		case QSE_REX_NODE_EOL:
			if (cand->mptr < e->str.end) break;
			/* $ matches a position, not a character.
			 * the match pointer stays put. */
			next = cand->mptr;
#ifdef XTRA_DEBUG
			qse_printf (QSE_T("DEBUG: matched <$>\n"));
#endif
			break;

		case QSE_REX_NODE_ANY:
			if (cand->mptr >= e->sub.end) break;
			/* consume one character */
			next = cand->mptr + 1;
#ifdef XTRA_DEBUG
			qse_printf (QSE_T("DEBUG: matched <.>\n"));
#endif
			break;

		case QSE_REX_NODE_CHAR:
		{
			int same;

			if (cand->mptr >= e->sub.end) break;

			if (e->rex->option & QSE_REX_IGNORECASE)
				same = (QSE_TOUPPER(node->u.c) == QSE_TOUPPER(*cand->mptr));
			else
				same = (node->u.c == *cand->mptr);

			/* consume one character on equality */
			if (same) next = cand->mptr + 1;
#ifdef XTRA_DEBUG
			qse_printf (QSE_T("DEBUG: matched %c\n"), node->u.c);
#endif
			break;
		}

		case QSE_REX_NODE_CSET:
		{
			if (cand->mptr >= e->sub.end) break;
			/* consume one character if the set accepts it */
			if (charset_matched(e, node, *cand->mptr)) next = cand->mptr + 1;
			break;
		}

		default:
		{
			QSE_ASSERTX (0,
				"SHOULD NEVER HAPPEN - node ID must be one of QSE_REX_NODE_BOL, QSE_REX_NODE_EOL, QSE_REX_NODE_ANY, QSE_REX_NODE_CHAR, QSE_REX_NODE_CSET, QSE_REX_NODE_NOP");
			break;
		}
	}

	/* this candidate is dead if the node didn't match */
	if (next == QSE_NULL) return QSE_ARR_WALK_FORWARD;

	if (cand->occ >= node->occ.min)
	{
		/* the lower bound is satisfied. proceed to the next node.
		 * when the node may also repeat and a group stack exists,
		 * the path forward gets its own copy of the stack. */
		group_t* gx = cand->group;

		if (cand->occ < node->occ.max && cand->group != QSE_NULL)
		{
			gx = dupgroupstack (e, cand->group);
			if (gx == QSE_NULL) return QSE_ARR_WALK_STOP;
		}

		refupgroupstack (gx);
		rc = addcands (e, gx, node, node->next, next);
		refdowngroupstack (gx, e->rex->mmgr);
		if (rc <= -1) return QSE_ARR_WALK_STOP;
	}

	if (cand->occ < node->occ.max)
	{
		/* the upper bound is not reached. queue the same node
		 * again with the occurrence count bumped. */
		refupgroupstack (cand->group);
		rc = addsimplecand (
			e, cand->group,
			node, cand->occ + 1, next);
		refdowngroupstack (cand->group, e->rex->mmgr);
		if (rc <= -1) return QSE_ARR_WALK_STOP;
	}

	return QSE_ARR_WALK_FORWARD;
}
/*
 * Runs one matching attempt that starts at e->sub.ptr.
 *
 * The initial candidates are collected into the pending set from the
 * node following the START node. Then, round after round, the pending
 * and active sets exchange roles and every active candidate is handed to
 * walk_cands_for_match(), which feeds new candidates into the pending
 * set. The attempt ends when a round begins with an empty active set.
 *
 * e->nmatches and e->matchend are reset here; they are presumably
 * updated by the candidate-adding routines when the end of the pattern
 * is reached (not visible in this function).
 *
 * Returns 1 if e->nmatches is positive at the end, 0 if not, and -1 if
 * adding candidates fails or the walk is cut short.
 */
static int exec (exec_t* e)
{
	int n;

	e->nmatches = 0;
	e->matchend = QSE_NULL;

	e->cand.pending = 0;
	e->cand.active = 1;

	/* start with an empty pending set */
	qse_arr_clear (&e->cand.set[e->cand.pending]);

	/* the compiled code must begin with the START node */
	QSE_ASSERT (e->rex->code->id == QSE_REX_NODE_START);

	/* seed the pending set. no group is involved yet, the START node
	 * serves as a dummy previous node, and the node after it is
	 * examined at the current match pointer. */
	n = addcands (e, QSE_NULL, e->rex->code, e->rex->code->next, e->sub.ptr);
	if (n <= -1) return -1;

	for (;;)
	{
		qse_size_t ncands_active;
		int swapped;

		/* exchange the roles of the two sets. what has been
		 * collected becomes active, and the other set is to
		 * receive candidates for the following round. */
		swapped = e->cand.active;
		e->cand.active = e->cand.pending;
		e->cand.pending = swapped;

		ncands_active = QSE_ARR_SIZE(&e->cand.set[e->cand.active]);
		/* nothing left to try */
		if (ncands_active <= 0) break;

		/* make room for the candidates of the next round */
		qse_arr_clear (&e->cand.set[e->cand.pending]);

#ifdef XTRA_DEBUG
		{
			int i;
			qse_printf (QSE_T("SET="));
			for (i = 0; i < ncands_active; i++)
			{
				cand_t* cand = QSE_ARR_DPTR(&e->cand.set[e->cand.active],i);
				qse_rex_node_t* node = cand->node;

				switch (node->id)
				{
					case QSE_REX_NODE_CHAR:
						qse_printf (QSE_T("%c "), node->u.c);
						break;
					case QSE_REX_NODE_ANY:
						qse_printf (QSE_T(". "), node->u.c);
						break;
					case QSE_REX_NODE_BOL:
						qse_printf (QSE_T("^ "));
						break;
					case QSE_REX_NODE_EOL:
						qse_printf (QSE_T("$ "));
						break;
					default:
						break;
				}
			}
			qse_printf (QSE_T("\n"));
		}
#endif

		/* a walk count short of the set size indicates that the
		 * walker stopped the traversal upon an error. */
		if (qse_arr_walk (
			&e->cand.set[e->cand.active],
			walk_cands_for_match, e) != ncands_active) return -1;
	}

#ifdef XTRA_DEBUG
	if (e->nmatches > 0)
	{
		qse_printf (QSE_T("MATCH: %d [%.*s]\n"),
			(int)(e->matchend - e->sub.ptr),
			(int)(e->matchend - e->sub.ptr), e->sub.ptr);
	}

	qse_printf (QSE_T("TOTAL MATCHES FOUND... %d\n"), e->nmatches);
#endif

	if (e->nmatches > 0) return 1;
	return 0;
}
/*
 * Element freeer installed on the candidate arrays by init_exec_dds().
 * It drops the reference that the candidate at 'dptr' holds on its
 * group stack, using the memory manager of the array.
 */
static void refdowngroupstack_incand (qse_arr_t* arr, void* dptr, qse_size_t dlen)
{
	cand_t* dying = (cand_t*)dptr;

	QSE_ASSERT (dlen == 1);
	refdowngroupstack (dying->group, arr->mmgr);
}
/*
 * Comparator installed on the candidate arrays by init_exec_dds().
 * Two candidates are equal when they refer to the same node at the same
 * match pointer with the same occurrence count. The group stack is not
 * part of the comparison.
 *
 * Returns 0 if the candidates are equal, 1 otherwise.
 */
static int comp_cand (qse_arr_t* arr,
	const void* dptr1, qse_size_t dlen1,
	const void* dptr2, qse_size_t dlen2)
{
	const cand_t* lhs = (const cand_t*)dptr1;
	const cand_t* rhs = (const cand_t*)dptr2;

	if (lhs->node != rhs->node) return 1;
	if (lhs->mptr != rhs->mptr) return 1;
	if (lhs->occ != rhs->occ) return 1;

	return 0;
}
/*
 * Initializes the dynamic data structures of an execution context,
 * namely the two candidate arrays. Both are created with the capacity
 * argument 100 and set up to hold cand_t elements copied inline, freed
 * with refdowngroupstack_incand() and compared with comp_cand().
 *
 * Returns 0 on success. On failure, any array initialized so far is
 * finalized, e->rex->errnum is set to QSE_REX_ENOMEM and -1 is returned.
 */
static int init_exec_dds (exec_t* e, qse_mmgr_t* mmgr)
{
	const int nsets = (int)(QSE_SIZEOF(e->cand.set) / QSE_SIZEOF(e->cand.set[0]));
	int i;

	for (i = 0; i < nsets; i++)
	{
		if (qse_arr_init (&e->cand.set[i], mmgr, 100) <= -1)
		{
			e->rex->errnum = QSE_REX_ENOMEM;
			/* roll back the arrays set up before this one */
			while (i > 0) qse_arr_fini (&e->cand.set[--i]);
			return -1;
		}
	}

	for (i = 0; i < nsets; i++)
	{
		qse_arr_setscale (&e->cand.set[i], QSE_SIZEOF(cand_t));
		qse_arr_setcopier (&e->cand.set[i], QSE_ARR_COPIER_INLINE);
		qse_arr_setfreeer (&e->cand.set[i], refdowngroupstack_incand);
		qse_arr_setcomper (&e->cand.set[i], comp_cand);
	}

	return 0;
}
/*
 * Finalizes the two candidate arrays set up by init_exec_dds(),
 * in the reverse order of their initialization.
 */
static void fini_exec_dds (exec_t* e)
{
	int i;

	for (i = 1; i >= 0; i--)
	{
		qse_arr_fini (&e->cand.set[i]);
	}
}
/*
 * Searches 'substr' for the leftmost position at which the compiled
 * pattern in 'rex' matches.
 *
 * rex    - regular expression object holding compiled code
 * str    - the whole subject string; its bounds are what the BOL and EOL
 *          nodes are checked against
 * substr - the part of the subject to search. every start position from
 *          substr->ptr up to and including its end is tried in order.
 * matstr - optional. if not QSE_NULL, it receives the start and the
 *          length of the match when one is found. it is left untouched
 *          otherwise.
 *
 * Returns 1 if a match is found, 0 if none is found, and -1 on error.
 * rex->errnum is set to QSE_REX_ENOCOMP if no compiled code is present;
 * for the other failures it is left to the callees to set it.
 */
int qse_rex_exec (
	qse_rex_t* rex, const qse_cstr_t* str,
	const qse_cstr_t* substr, qse_cstr_t* matstr)
{
	exec_t e;
	int n = 0;

	if (rex->code == QSE_NULL)
	{
		rex->errnum = QSE_REX_ENOCOMP;
		return -1;
	}

	QSE_MEMSET (&e, 0, QSE_SIZEOF(e));
	e.rex = rex;
	e.str.ptr = str->ptr;
	e.str.end = str->ptr + str->len;
	e.sub.ptr = substr->ptr;
	e.sub.end = substr->ptr + substr->len;

	if (init_exec_dds (&e, rex->mmgr) <= -1) return -1;

	/* the position at sub.end is tried as well so that a pattern
	 * matching an empty string can succeed at the very end. */
	while (e.sub.ptr <= e.sub.end)
	{
		n = exec (&e);
		if (n <= -1)
		{
			n = -1;
			break;
		}

		if (n >= 1)
		{
			QSE_ASSERT (e.nmatches > 0);
			QSE_ASSERT (e.matchend != QSE_NULL);
			if (matstr)
			{
				matstr->ptr = e.sub.ptr;
				matstr->len = e.matchend - e.sub.ptr;
			}
			break;
		}

		/* the last start position has just been tried. stop here
		 * rather than advance the pointer past sub.end. sub.end may
		 * already be one past the end of the underlying array, in
		 * which case forming a pointer beyond it is undefined. */
		if (e.sub.ptr >= e.sub.end) break;

		e.sub.ptr++;
	}

	fini_exec_dds (&e);
	return n;
}
/*
 * Compiles the pattern 'ptn' of the length 'len' with the given option
 * bits using a temporary rex object, and hands the compiled code over to
 * the caller.
 *
 * Returns the code obtained with qse_rex_yield() on success. On a
 * compilation failure, the error number of the temporary object is
 * stored into '*errnum' and QSE_NULL is returned. '*errnum' is not
 * written on success. The 'depth' parameter is not used here.
 */
void* qse_buildrex (
	qse_mmgr_t* mmgr, qse_size_t depth, int option,
	const qse_char_t* ptn, qse_size_t len, qse_rex_errnum_t* errnum)
{
	qse_rex_t builder;
	qse_rex_node_t* compiled = QSE_NULL;

	qse_rex_init (&builder, mmgr, QSE_NULL);
	qse_rex_setopt (&builder, option);

	if (qse_rex_comp (&builder, ptn, len) != QSE_NULL)
	{
		/* detach the code before the temporary object goes away */
		compiled = qse_rex_yield (&builder);
	}
	else
	{
		*errnum = builder.errnum;
	}

	qse_rex_fini (&builder);
	return compiled;
}
/*
 * Matches previously compiled 'code' against 'substr' of 'str' by way of
 * a temporary rex object carrying the given option bits.
 *
 * Returns what qse_rex_exec() returns when it is not negative. If it
 * fails, the error number of the temporary object is stored into
 * '*errnum' and -1 is returned. In either case, the code is yielded from
 * the temporary object before the object is finalized. The 'depth'
 * parameter is not used here.
 */
int qse_matchrex (
	qse_mmgr_t* mmgr, qse_size_t depth,
	void* code, int option,
	const qse_cstr_t* str, const qse_cstr_t* substr,
	qse_cstr_t* match, qse_rex_errnum_t* errnum)
{
	qse_rex_t matcher;
	int n;

	qse_rex_init (&matcher, mmgr, code);
	qse_rex_setopt (&matcher, option);

	n = qse_rex_exec (&matcher, str, substr, match);
	if (n <= -1)
	{
		*errnum = matcher.errnum;
		n = -1;
	}

	/* take the code back out of the temporary object
	 * ahead of its finalization */
	qse_rex_yield (&matcher);
	qse_rex_fini (&matcher);

	return n;
}
/*
 * Disposes of compiled 'code'. The code is attached to a temporary rex
 * object which is then finalized right away; the code is presumably
 * released by qse_rex_fini() as part of that.
 */
void qse_freerex (qse_mmgr_t* mmgr, void* code)
{
	qse_rex_t owner;

	qse_rex_init (&owner, mmgr, code);
	qse_rex_fini (&owner);
}