From 9d3084f455f533dac5cdfbc3d172971ca3752218 Mon Sep 17 00:00:00 2001 From: hyung-hwan Date: Thu, 19 Nov 2009 07:47:12 +0000 Subject: [PATCH] interim commit. updating rex1.c --- qse/include/qse/cmn/rex.h | 6 +- qse/lib/cmn/rex1.c | 701 ++++++++++++++++++++++++++++++-------- qse/samples/cmn/rex.cpp | 20 +- 3 files changed, 579 insertions(+), 148 deletions(-) diff --git a/qse/include/qse/cmn/rex.h b/qse/include/qse/cmn/rex.h index 8ae3630e..474a8bf7 100644 --- a/qse/include/qse/cmn/rex.h +++ b/qse/include/qse/cmn/rex.h @@ -1,5 +1,5 @@ /* - * $Id: rex.h 300 2009-11-13 14:01:57Z hyunghwan.chung $ + * $Id: rex.h 302 2009-11-18 13:47:12Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. This file is part of QSE. @@ -113,6 +113,8 @@ enum qse_rex_node_id_t QSE_REX_NODE_START, QSE_REX_NODE_END, QSE_REX_NODE_NOP, + QSE_REX_NODE_BOL, /* beginning of line */ + QSE_REX_NODE_EOL, /* end of line */ QSE_REX_NODE_ANYCHAR, /* dot */ QSE_REX_NODE_CHAR, /* single character */ QSE_REX_NODE_CHARSET, /* character set */ @@ -125,7 +127,7 @@ typedef enum qse_rex_node_id_t qse_rex_node_id_t; typedef struct qse_rex_node_t qse_rex_node_t; struct qse_rex_node_t { - qse_rex_node_t* link; /* link for management. not used for startnode */ + qse_rex_node_t* link; /* for internal management. not used for startnode */ qse_rex_node_t* next; qse_rex_node_id_t id; diff --git a/qse/lib/cmn/rex1.c b/qse/lib/cmn/rex1.c index 3beff122..ba770484 100644 --- a/qse/lib/cmn/rex1.c +++ b/qse/lib/cmn/rex1.c @@ -1,8 +1,21 @@ /* * $Id$ + * + Copyright 2006-2009 Chung, Hyung-Hwan. + This file is part of QSE. -{LICENSE HERE} + QSE is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as + published by the Free Software Foundation, either version 3 of + the License, or (at your option) any later version. + QSE is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with QSE. If not, see . */ #include @@ -10,7 +23,6 @@ #include #include "mem.h" -#define GETC(c) do { if getc(c) <= -1) return -1; } while (0) #define OCC_MAX QSE_TYPE_MAX(qse_size_t) struct qse_rex_t @@ -27,11 +39,22 @@ struct comp_t qse_rex_t* rex; qse_cstr_t re; + const qse_char_t* ptr; const qse_char_t* end; - qse_cint_t c; - qse_size_t grouplvl; + struct + { + enum + { + CT_NORMAL, + CT_SPECIAL + } type; + qse_cint_t value; + int escaped; + } c; + + qse_size_t gdepth; /* group depth */ qse_rex_node_t* start; }; @@ -40,12 +63,27 @@ struct exec_t { qse_rex_t* rex; - qse_cstr_t str; - qse_cstr_t sub; + struct + { + const qse_char_t* ptr; + const qse_char_t* end; + } str; - qse_lda_t cand[2]; /* candidate arrays */ - int xxx, yyy; - qse_size_t matched; + struct + { + const qse_char_t* ptr; + const qse_char_t* end; + } sub; + + struct + { + int active; + int pending; + qse_lda_t set[2]; /* candidate arrays */ + } cand; + + qse_size_t nmatches; + const qse_char_t* matchend; /* 1 character past the match end */ }; typedef struct pair_t pair_t; @@ -66,9 +104,18 @@ struct group_t typedef struct cand_t cand_t; struct cand_t { - qse_rex_node_t* node; - qse_size_t occ; - group_t* group; + qse_rex_node_t* node; + qse_size_t occ; + + /* the stack of groups that this candidate belongs to. + * it is in the singliy linked list form */ + group_t* group; + + /* match pointer. the number of character advancement + * differs across various node types. BOL and EOL don't advance to + * the next character on match while ANYCHAR and CHAR do on match. + * therefore, the match pointer is managed per candidate basis. */ + const qse_char_t* mptr; }; qse_rex_t* qse_rex_open (qse_mmgr_t* mmgr, qse_size_t xtn, void* code) @@ -211,12 +258,185 @@ static qse_rex_node_t* newbranchnode ( return n; } -static int getc (comp_t* c) +#define CHECK_END(builder) \ + do { \ + if (builder->ptr >= builder->ptn.end) \ + { \ + builder->errnum = QSE_REX_EEND; \ + return -1; \ + } \ + } while(0) + +#define IS_HEX(c) \ + ((c >= QSE_T('0') && c <= QSE_T('9')) || \ + (c >= QSE_T('A') && c <= QSE_T('F')) || \ + (c >= QSE_T('a') && c <= QSE_T('f'))) + +#define HEX_TO_NUM(c) \ + ((c >= QSE_T('0') && c <= QSE_T('9'))? c-QSE_T('0'): \ + (c >= QSE_T('A') && c <= QSE_T('F'))? c-QSE_T('A')+10: \ + c-QSE_T('a')+10) + +static int getc (comp_t* com) { - c->c = (c->ptr < c->end)? *c->ptr++: QSE_CHAR_EOF; -if (c->c == QSE_CHAR_EOF) + if (com->ptr >= com->end) + { + com->c.type = CT_NORMAL; + com->c.value = QSE_CHAR_EOF; + com->c.escaped = 0; + return 0; + } + + com->c.type = CT_NORMAL; + com->c.value = *com->ptr++; + com->c.escaped = QSE_FALSE; + + if (com->c.value == QSE_T('\\')) + { + qse_char_t c; + + CHECK_END (builder); + c = *com->ptr++; + + if (c == QSE_T('n')) c = QSE_T('\n'); + else if (c == QSE_T('r')) c = QSE_T('\r'); + else if (c == QSE_T('t')) c = QSE_T('\t'); + else if (c == QSE_T('f')) c = QSE_T('\f'); + else if (c == QSE_T('b')) c = QSE_T('\b'); + else if (c == QSE_T('v')) c = QSE_T('\v'); + else if (c == QSE_T('a')) c = QSE_T('\a'); + else if (c >= QSE_T('0') && c <= QSE_T('7')) + { + qse_char_t cx; + + c = c - QSE_T('0'); + + CHECK_END (builder); + cx = *com->ptr++; + if (cx >= QSE_T('0') && cx <= QSE_T('7')) + { + c = c * 8 + cx - QSE_T('0'); + + CHECK_END (builder); + cx = *com->ptr++; + if (cx >= QSE_T('0') && cx <= QSE_T('7')) + { + c = c * 8 + cx - QSE_T('0'); + } + } + } + else if (c == QSE_T('x')) + { + qse_char_t cx; + + CHECK_END (builder); + cx = *com->ptr++; + if (IS_HEX(cx)) + { + c = HEX_TO_NUM(cx); + + CHECK_END (builder); + cx = *com->ptr++; + if (IS_HEX(cx)) + { + c = c * 16 + HEX_TO_NUM(cx); + } + } + } + #ifdef QSE_CHAR_IS_WCHAR + else if (c == QSE_T('u') && QSE_SIZEOF(qse_char_t) >= 2) + { + qse_char_t cx; + + CHECK_END (builder); + cx = *com->ptr++; + if (IS_HEX(cx)) + { + qse_size_t i; + + c = HEX_TO_NUM(cx); + + for (i = 0; i < 3; i++) + { + CHECK_END (builder); + cx = *com->ptr++; + + if (!IS_HEX(cx)) break; + c = c * 16 + HEX_TO_NUM(cx); + } + } + } + else if (c == QSE_T('U') && QSE_SIZEOF(qse_char_t) >= 4) + { + qse_char_t cx; + + CHECK_END (builder); + cx = *com->ptr++; + if (IS_HEX(cx)) + { + qse_size_t i; + + c = HEX_TO_NUM(cx); + + for (i = 0; i < 7; i++) + { + CHECK_END (builder); + cx = *com->ptr++; + + if (!IS_HEX(cx)) break; + c = c * 16 + HEX_TO_NUM(cx); + } + } + } + #endif + + com->c.value = c; + com->c.escaped = QSE_TRUE; + + return 0; + } + else + { + if (level == LEVEL_TOP) + { + if (com->c.value == QSE_T('[') || + com->c.value == QSE_T('|') || + com->c.value == QSE_T('^') || + com->c.value == QSE_T('$') || + (!(com->option & QSE_REX_BUILD_NOBOUND) && + com->c.value == QSE_T('{')) || + com->c.value == QSE_T('+') || + com->c.value == QSE_T('?') || + com->c.value == QSE_T('*') || + com->c.value == QSE_T('.') || + com->c.value == QSE_T('(') || + com->c.value == QSE_T(')')) + { + com->c.type = CT_SPECIAL; + } + } + else if (level == LEVEL_CHARSET) + { + if (com->c.value == QSE_T(']')) + { + com->c.type = CT_SPECIAL; + } + } + else if (level == LEVEL_RANGE) + { + if (com->c.value == QSE_T(',') || + com->c.value == QSE_T('}')) + { + com->c.type = CT_SPECIAL; + } + } + } +#if 0 + com->c = (com->ptr < com->end)? *com->ptr++: QSE_CHAR_EOF; +if (com->c == QSE_CHAR_EOF) qse_printf (QSE_T("getc => \n")); -else qse_printf (QSE_T("getc => %c\n"), c->c); +else qse_printf (QSE_T("getc => %c\n"), com->c); +#endif return 0; } @@ -226,7 +446,7 @@ static qse_rex_node_t* comp2 (comp_t* c) { qse_rex_node_t* n; - switch (c->c) + switch (c->c.value) { case QSE_T('('): { @@ -249,7 +469,7 @@ static qse_rex_node_t* comp2 (comp_t* c) return QSE_NULL; } - c->grouplvl++; + c->gdepth++; x = comp0 (c, ge); if (x == QSE_NULL) { @@ -258,7 +478,7 @@ static qse_rex_node_t* comp2 (comp_t* c) return QSE_NULL; } - if (c->c != QSE_T(')')) + if (c->c.value != QSE_T(')')) { qse_printf (QSE_T("expecting )\n")); // UNBALANCED PAREN. @@ -267,7 +487,7 @@ qse_printf (QSE_T("expecting )\n")); return QSE_NULL; } - c->grouplvl--; + c->gdepth--; if (getc(c) <= -1) { // freere (x); @@ -289,6 +509,27 @@ qse_printf (QSE_T("expecting )\n")); } break; + case QSE_T('^'): + n = newnode (c, QSE_REX_NODE_BOL); + if (n == QSE_NULL) return QSE_NULL; + if (getc(c) <= -1) + { + // TODO: error handling.. + return QSE_NULL; + } + break; + + case QSE_T('$'): + n = newnode (c, QSE_REX_NODE_EOL); + if (n == QSE_NULL) return QSE_NULL; + if (getc(c) <= -1) + { + // TODO: error handling.. + return QSE_NULL; + } + break; + + /* case QSE_T('['): .... @@ -296,7 +537,7 @@ qse_printf (QSE_T("expecting )\n")); default: /* normal character */ - n = newcharnode (c, c->c); + n = newcharnode (c, c->c.value); if (n == QSE_NULL) return QSE_NULL; if (getc(c) <= -1) { @@ -363,8 +604,8 @@ static qse_rex_node_t* comp1 (comp_t* c, pair_t* pair) pair->tail = pair->head; - while (c->c != QSE_T('|') && c->c != QSE_CHAR_EOF && - !(c->grouplvl >= 0 && c->c == QSE_T(')'))) + while (c->c.value != QSE_T('|') && c->c.value != QSE_CHAR_EOF && + !(c->gdepth >= 0 && c->c.value == QSE_T(')'))) { qse_rex_node_t* tmp = comp2 (c); if (tmp == QSE_NULL) @@ -389,7 +630,7 @@ static qse_rex_node_t* comp0 (comp_t* c, qse_rex_node_t* ge) if (left == QSE_NULL) return QSE_NULL; xpair.tail->next = ge; - while (c->c == QSE_T('|')) + while (c->c.value == QSE_T('|')) { if (getc (c) <= -1) { @@ -435,8 +676,10 @@ qse_rex_node_t* qse_rex_comp ( c.ptr = ptr; c.end = ptr + len; - c.c = QSE_CHAR_EOF; - c.grouplvl = 0; + + c.c.value = QSE_CHAR_EOF; + + c.gdepth = 0; c.start = QSE_NULL; if (getc(&c) <= -1) return QSE_NULL; @@ -454,11 +697,11 @@ qse_rex_node_t* qse_rex_comp ( else { qse_rex_node_t* tmp; - //tmp = comp0 (&c, QSE_NULL); + /*tmp = comp0 (&c, QSE_NULL);*/ tmp = comp0 (&c, end); if (tmp == QSE_NULL) { - //freenode (c.start, c.rex->mmgr); + /*freenode (c.start, c.rex->mmgr);*/ freeallnodes (c.start); c.start = QSE_NULL; } @@ -474,29 +717,95 @@ qse_printf (QSE_T("start has tmp...\n")); return rex->code; } -static group_t* pushgroup (exec_t* e, group_t* pg, qse_rex_node_t* gn) +static group_t* dupgroups (exec_t* e, group_t* g) { - group_t* g; - QSE_ASSERT (gn->id == QSE_REX_NODE_GROUP); + group_t* yg, * xg = QSE_NULL; - g = (group_t*) QSE_MMGR_ALLOC (e->rex->mmgr, QSE_SIZEOF(*g)); - if (g == QSE_NULL) + QSE_ASSERT (g != QSE_NULL); + + if (g->next != QSE_NULL) + { + /* TODO: make it non recursive or + * implement stack overflow protection */ + xg = dupgroups (e, g->next); + if (xg == QSE_NULL) return QSE_NULL; + } + + yg = (group_t*) QSE_MMGR_ALLOC (e->rex->mmgr, QSE_SIZEOF(*g)); + if (yg == QSE_NULL) + { + /* TODO: freegroups (xg); */ + /* TODO: set error info */ + return QSE_NULL; + } + + QSE_MEMCPY (yg, g, QSE_SIZEOF(*yg)); + yg->next = xg; + + return yg; +} + +static void freegroup (exec_t* e, group_t* group) +{ + QSE_ASSERT (group != QSE_NULL); + QSE_MMGR_FREE (e->rex->mmgr, group); +} + +static void freegroups (exec_t* e, group_t* group) +{ + group_t* next; + + while (group != QSE_NULL) + { + next = group->next; + freegroup (e, group); + group = next; + } +} + +static group_t* pushgroup (exec_t* e, group_t* group, qse_rex_node_t* newgn) +{ + group_t* newg; + + QSE_ASSERT (newgn->id == QSE_REX_NODE_GROUP); + + newg = (group_t*) QSE_MMGR_ALLOC (e->rex->mmgr, QSE_SIZEOF(*newg)); + if (newg == QSE_NULL) { /* TODO: set error info */ return QSE_NULL; } - g->node = gn; - g->occ = 0; - g->next = pg; + newg->node = newgn; + newg->occ = 0; + newg->next = group; - return g; + return newg; +} + +static group_t* pushgroupdup (exec_t* e, group_t* pg, qse_rex_node_t* gn) +{ + group_t* gs = QSE_NULL; + + /* duplicate the group stack if necessary */ + if (pg != QSE_NULL) + { + gs = dupgroups (e, pg); + if (gs == QSE_NULL) return QSE_NULL; + } + + /* and push a new group to the stack */ + return pushgroup (e, gs, gn); } static int addsimplecand ( - exec_t* e, cand_t* pcand, qse_rex_node_t* node, qse_size_t occ) + exec_t* e, group_t* group, qse_rex_node_t* node, + qse_size_t occ, const qse_char_t* mptr) { QSE_ASSERT ( + node->id == QSE_REX_NODE_BOL || + node->id == QSE_REX_NODE_EOL || + node->id == QSE_REX_NODE_ANYCHAR || node->id == QSE_REX_NODE_CHAR || node->id == QSE_REX_NODE_CHARSET ); @@ -505,16 +814,17 @@ static int addsimplecand ( cand.node = node; cand.occ = occ; - cand.group = pcand->group; + cand.group = group; + cand.mptr = mptr; -if (node->id == QSE_REX_NODE_CHAR) +/*if (node->id == QSE_REX_NODE_CHAR) qse_printf (QSE_T("adding %d %c\n"), node->id, node->u.c); else -qse_printf (QSE_T("adding %d NA\n"), node->id); +qse_printf (QSE_T("adding %d NA\n"), node->id);*/ if (qse_lda_insert ( - &e->cand[e->xxx], - QSE_LDA_SIZE(&e->cand[e->xxx]), + &e->cand.set[e->cand.pending], + QSE_LDA_SIZE(&e->cand.set[e->cand.pending]), &cand, 1) == (qse_size_t)-1) { /* TODO: set error code: ENOERR */ @@ -524,116 +834,188 @@ qse_printf (QSE_T("adding %d NA\n"), node->id); return 0; } -static int addnextcands (exec_t* e, group_t* group, qse_rex_node_t* cur) +static int addcands ( + exec_t* e, group_t* group, qse_rex_node_t* prevnode, + qse_rex_node_t* candnode, const qse_char_t* mptr) { /* skip all NOP nodes */ - while (cur && cur->id == QSE_REX_NODE_NOP) cur = cur->next; + while (candnode != QSE_NULL && candnode->id == QSE_REX_NODE_NOP) + candnode = candnode->next; /* nothing to add */ - if (cur == QSE_NULL) return 0; + if (candnode == QSE_NULL) return 0; - if (cur->id == QSE_REX_NODE_END) + if (candnode->id == QSE_REX_NODE_END) { qse_printf (QSE_T("== ADDING THE END(MATCH) NODE MEANING MATCH FOUND == \n")); - e->matched++; + if (e->matchend == QSE_NULL || mptr >= e->matchend) + e->matchend = mptr; + e->nmatches++; } - else if (cur->id == QSE_REX_NODE_BRANCH) + else if (candnode->id == QSE_REX_NODE_BRANCH) { - #if 0 - QSE_ASSERT (cur->next == QSE_NULL); - if (addnextcands (e, group, cur->u.b.left) <= -1) return -1; - if (addnextcands (e, group, cur->u.b.right) <= -1) return -1; - #endif + group_t* groupdup; + + QSE_ASSERT (candnode->next == QSE_NULL); + + groupdup = dupgroups (e, group); + if (groupdup == QSE_NULL) return -1; + + if (addcands (e, group, prevnode, candnode->u.b.left, mptr) <= -1) return -1; + if (addcands (e, groupdup, prevnode, candnode->u.b.right, mptr) <= -1) return -1; } - else if (cur->id == QSE_REX_NODE_GROUP) + else if (candnode->id == QSE_REX_NODE_GROUP) { - group_t* g = pushgroup (e, group, cur); - if (g == QSE_NULL) return -1; + group_t* groupdup; - /* add the first node in the group */ - if (addnextcands (e, g, cur->u.g.head) <= -1) return -1; - - if (cur->occ.min <= 0) + if (candnode->occ.min <= 0) { /* if the group node is optional, - * add the next node to the candidate array. - * branch case => dup group */ - if (addnextcands (e, group, cur->next) <= -1) return -1; + * add the next node to the candidate array. */ + if (addcands (e, group, prevnode, candnode->next, mptr) <= -1) return -1; } + + /* push the candnoderent group node (candnode) to the group + * stack duplicated. */ + groupdup = pushgroupdup (e, group, candnode); + if (groupdup == QSE_NULL) return -1; + + /* add the first node in the group */ + if (addcands (e, groupdup, candnode, candnode->u.g.head, mptr) <= -1) return -1; + } - else if (cur->id == QSE_REX_NODE_GROUPEND) + else if (candnode->id == QSE_REX_NODE_GROUPEND) { - group_t* group; qse_rex_node_t* node; + qse_size_t occ; - group = cand->group; - QSE_ASSERT (group != QSE_NULL); + QSE_ASSERTX (group != QSE_NULL, + "GROUPEND reached must be paired up with a GROUP"); - node = group->node; - QSE_ASSERT (node == cur->u.ge.group); - - if (group->occ < node->occ.max) + if (prevnode != candnode) + /*if (prevnode == QSE_NULL || prevnode->id != QSE_REX_NODE_GROUPEND)*/ { - /* need to repeat itself */ group->occ++; - if (addnextcands (e, cand, node->u.g.head) <= -1) return -1; - } - if (group->occ >= node->occ.min) - { - /* take the next atom as a candidate. - * it is actually a branch case. */ - - cand = dupgrouppoppingtop (cand); - - if (addnextcands (e, pg, node->next) <= -1) return -1; + occ = group->occ; + node = group->node; + QSE_ASSERT (node == candnode->u.ge.group); + + if (occ >= node->occ.min) + { + group_t* gx = group->next; + + /* take the next atom as a candidate. + * it is actually a branch case. move on. */ + + if (occ < node->occ.max) + { + /* check if the group will be repeated. + * if so, duplicate the group stack excluding + * the top. it goes along a different path and + * hence requires a duplicated group stack. */ + if (group->next != QSE_NULL) + { + gx = dupgroups (e, group->next); + if (gx == QSE_NULL) return -1; + } + } + + if (addcands (e, gx, candnode, node->next, mptr) <= -1) return -1; + } + + if (occ < node->occ.max) + { + /* need to repeat itself. */ + if (addcands (e, group, candnode, node->u.g.head, mptr) <= -1) return -1; + } } } else { - if (addsimplecand (e, cand, cur, 1) <= -1) return -1; - if (cur->occ.min <= 0) + group_t* gx = group; + + if (candnode->occ.min <= 0) { /* if the node is optional, - * add the next node to the candidate array */ - if (addnextcands (e, pg, cur->next) <= -1) return -1; + * add the next node to the candidate array */ + if (addcands (e, group, prevnode, candnode->next, mptr) <= -1) return -1; + + if (group != QSE_NULL) + { + gx = dupgroups (e, group); + if (gx == QSE_NULL) return -1; + } } + + if (addsimplecand (e, gx, candnode, 1, mptr) <= -1) return -1; } return 0; } -static int match (exec_t* e, const qse_char_t* curp) +static int match (exec_t* e) { qse_size_t i; - qse_char_t curc = *curp; - for (i = 0; i < QSE_LDA_SIZE(&e->cand[e->yyy]); i++) + QSE_ASSERT (QSE_LDA_SIZE(&e->cand.set[e->cand.active]) > 0); + + for (i = 0; i < QSE_LDA_SIZE(&e->cand.set[e->cand.active]); i++) { - cand_t* cand = QSE_LDA_DPTR(&e->cand[e->yyy],i); + cand_t* cand = QSE_LDA_DPTR(&e->cand.set[e->cand.active],i); qse_rex_node_t* node = cand->node; + const qse_char_t* nmptr = QSE_NULL; - if (node->id == QSE_REX_NODE_CHAR) + switch (node->id) { - if (node->u.c == curc) - { - qse_printf (QSE_T("matched %c\n"), node->u.c); + case QSE_REX_NODE_BOL: + if (cand->mptr == e->str.ptr) nmptr = cand->mptr; + break; + case QSE_REX_NODE_EOL: + if (cand->mptr >= e->str.end) nmptr = cand->mptr; + break; + + case QSE_REX_NODE_ANYCHAR: + if (cand->mptr < e->sub.end) nmptr = cand->mptr + 1; + break; + + case QSE_REX_NODE_CHAR: + if (cand->mptr < e->sub.end && node->u.c == *cand->mptr) nmptr = cand->mptr + 1; + //qse_printf (QSE_T("matched %c\n"), node->u.c); + break; + + case QSE_REX_NODE_CHARSET: + qse_printf (QSE_T("charset not implemented...\n")); + break; + + default: + // TODO: set error code -> internal error. this should not happen + return -1; + } + + if (nmptr != QSE_NULL) + { + if (cand->occ >= node->occ.min) + { + group_t* gx = cand->group; if (cand->occ < node->occ.max) { - if (addsimplecand (e, cand, node, cand->occ+1) <= -1) return -1; - } - if (cand->occ >= node->occ.min) - { - - if (addnextcands (e, cand, node->next) <= -1) return -1; + if (cand->group != QSE_NULL) + { + gx = dupgroups (e, cand->group); + if (gx == QSE_NULL) return -1; + } } + + /* move on to the next candidate */ + if (addcands (e, gx, node, node->next, nmptr) <= -1) return -1; + } + if (cand->occ < node->occ.max) + { + /* repeat itself more */ + if (addsimplecand (e, cand->group, node, cand->occ+1, nmptr) <= -1) return -1; } - } - else - { - QSE_ASSERT (node->id == QSE_REX_NODE_CHARSET); - qse_printf (QSE_T("charset not implemented...\n")); } } @@ -642,73 +1024,110 @@ static int match (exec_t* e, const qse_char_t* curp) static int exec (exec_t* e) { - const qse_char_t* ptr = e->sub.ptr; - const qse_char_t* end = e->sub.ptr + e->sub.len; + int n; - e->matched = 0; - e->xxx = 0; - e->yyy = 1; + e->nmatches = 0; + e->matchend = QSE_NULL; - /* collect the initial candidates to cand[xxx] */ - qse_lda_clear (&e->cand[e->xxx]); + e->cand.pending = 0; + e->cand.active = 1; - if (addnextcands (e, QSE_NULL, e->rex->code->next) <= -1) return -1; + /* empty the pending set to collect the initial candidates */ + qse_lda_clear (&e->cand.set[e->cand.pending]); - while (ptr < end) + /* the first node must be the START node */ + QSE_ASSERT (e->rex->code->id == QSE_REX_NODE_START); + + /* addcands() collects a set of candidates into the pending set */ + n = addcands ( + e, /* execution structure */ + QSE_NULL, /* doesn't belong to any groups yet */ + e->rex->code, /* dummy previous node, the start node */ + e->rex->code->next, /* start from the second node */ + e->sub.ptr /* current match pointer */ + ); + if (n <= -1) return -1; + + do { - /* kind of swap cand[xxx] and cand[yyy] by swapping indices */ - int tmp = e->xxx; - e->xxx = e->yyy; - e->yyy = tmp; + /* kind of swap the next set and the current set by swapping indices */ + int tmp = e->cand.pending; + e->cand.pending = e->cand.active; + e->cand.active = tmp; /* check if there are any next candidates */ - if (QSE_LDA_SIZE(&e->cand[e->yyy]) <= 0) + if (QSE_LDA_SIZE(&e->cand.set[e->cand.active]) <= 0) { - /* if none, break */ + /* if no more candidates, break */ break; } +{ +int i; +qse_printf (QSE_T("SET=")); +for (i = 0; i < QSE_LDA_SIZE(&e->cand.set[e->cand.active]); i++) +{ + cand_t* cand = QSE_LDA_DPTR(&e->cand.set[e->cand.active],i); + qse_rex_node_t* node = cand->node; + + if (node->id == QSE_REX_NODE_CHAR) + qse_printf (QSE_T("%c "), node->u.c); + else if (node->id == QSE_REX_NODE_ANYCHAR) + qse_printf (QSE_T(". "), node->u.c); + else if (node->id == QSE_REX_NODE_BOL) + qse_printf (QSE_T("^ ")); + else if (node->id == QSE_REX_NODE_EOL) + qse_printf (QSE_T("$ ")); +} +qse_printf (QSE_T("\n")); +} + /* clear the array to hold the next candidates */ - qse_lda_clear (&e->cand[e->xxx]); + qse_lda_clear (&e->cand.set[e->cand.pending]); -qse_printf (QSE_T("MATCHING %c\n"), *ptr); - if (match (e, ptr) <= -1) return -1; - - ptr++; + if (match (e) <= -1) return -1; } + while (1); - qse_printf (QSE_T("TOTAL MATCHES FOUND... %d\n"), e->matched); +if (e->nmatches > 0) +{ + qse_printf (QSE_T("MATCH: %d [%.*s]\n"), + (int)(e->matchend - e->sub.ptr), + (int)(e->matchend - e->sub.ptr), e->sub.ptr); +} + + qse_printf (QSE_T("TOTAL MATCHES FOUND... %d\n"), e->nmatches); return 0; } static int init_exec_dds (exec_t* e, qse_mmgr_t* mmgr) { /* initializes dynamic data structures */ - if (qse_lda_init (&e->cand[0], mmgr, 100) == QSE_NULL) + if (qse_lda_init (&e->cand.set[0], mmgr, 100) == QSE_NULL) { /* TOOD: set error */ return -1; } - if (qse_lda_init (&e->cand[1], mmgr, 100) == QSE_NULL) + if (qse_lda_init (&e->cand.set[1], mmgr, 100) == QSE_NULL) { /* TOOD: set error */ - qse_lda_fini (&e->cand[0]); + qse_lda_fini (&e->cand.set[0]); return -1; } - qse_lda_setscale (&e->cand[0], QSE_SIZEOF(cand_t)); - qse_lda_setscale (&e->cand[1], QSE_SIZEOF(cand_t)); + qse_lda_setscale (&e->cand.set[0], QSE_SIZEOF(cand_t)); + qse_lda_setscale (&e->cand.set[1], QSE_SIZEOF(cand_t)); - qse_lda_setcopier (&e->cand[0], QSE_LDA_COPIER_INLINE); - qse_lda_setcopier (&e->cand[1], QSE_LDA_COPIER_INLINE); + qse_lda_setcopier (&e->cand.set[0], QSE_LDA_COPIER_INLINE); + qse_lda_setcopier (&e->cand.set[1], QSE_LDA_COPIER_INLINE); return 0; } static void fini_exec_dds (exec_t* e) { - qse_lda_fini (&e->cand[1]); - qse_lda_fini (&e->cand[0]); + qse_lda_fini (&e->cand.set[1]); + qse_lda_fini (&e->cand.set[0]); } int qse_rex_exec (qse_rex_t* rex, @@ -727,14 +1146,13 @@ int qse_rex_exec (qse_rex_t* rex, QSE_MEMSET (&e, 0, QSE_SIZEOF(e)); e.rex = rex; e.str.ptr = str; - e.str.len = len; + e.str.end = str + len; e.sub.ptr = substr; - e.sub.len = sublen; + e.sub.end = substr + sublen; if (init_exec_dds (&e, rex->mmgr) <= -1) return -1; -// TOOD: may have to execute exec in case sublen is 0. - while (e.sub.len > 0) + while (e.sub.ptr <= e.sub.end) { n = exec (&e); if (n <= -1) @@ -743,10 +1161,9 @@ int qse_rex_exec (qse_rex_t* rex, break; } - if (e.matched > 0) break; + if (e.nmatches > 0) break; e.sub.ptr++; - e.sub.len--; } fini_exec_dds (&e); diff --git a/qse/samples/cmn/rex.cpp b/qse/samples/cmn/rex.cpp index 00aa7dc5..eb89ff34 100644 --- a/qse/samples/cmn/rex.cpp +++ b/qse/samples/cmn/rex.cpp @@ -248,6 +248,18 @@ void MyFrame::drawNode (wxDC& dc, qse_rex_node_t* n) { dc.DrawText (_T("
"), nodex, nodey); } + else if (n->id == QSE_REX_NODE_BOL) + { + dc.DrawText (_T("<^>"), nodex, nodey); + } + else if (n->id == QSE_REX_NODE_EOL) + { + dc.DrawText (_T("<$>"), nodex, nodey); + } + else if (n->id == QSE_REX_NODE_ANYCHAR) + { + dc.DrawText (_T(""), nodex, nodey); + } else if (n->id == QSE_REX_NODE_CHAR) { qse_char_t x[2]; @@ -274,7 +286,7 @@ void MyFrame::drawNode (wxDC& dc, qse_rex_node_t* n) } else if (n->id == QSE_REX_NODE_NOP) { - dc.DrawText (_T(""), nodex, nodey); + dc.DrawText (_T(""), nodex, nodey); } } @@ -287,19 +299,19 @@ void MyFrame::drawChain (wxDC& dc, qse_rex_node_t* n) if (t->id == QSE_REX_NODE_BRANCH) { drawNode (dc, t); - nodex += 50; + nodex += 40; int oldx = nodex; drawChain (dc, t->u.b.left); nodex = oldx; - nodey += 50; + nodey += 40; drawChain (dc, t->u.b.right); } else { drawNode (dc, t); - nodex += 50; + nodex += 40; } if (t->id == QSE_REX_NODE_GROUP)