From f9d4ccc50a47e3d7c7795d6955bdaf956e1b1779 Mon Sep 17 00:00:00 2001 From: hyung-hwan Date: Mon, 7 Sep 2009 07:18:08 +0000 Subject: [PATCH] fixed a bug in merging consecutive codes with occurrences --- qse/lib/cmn/rex.c | 76 +++++++++++++++++++++++++++-------------------- 1 file changed, 44 insertions(+), 32 deletions(-) diff --git a/qse/lib/cmn/rex.c b/qse/lib/cmn/rex.c index b04eef51..4987222d 100644 --- a/qse/lib/cmn/rex.c +++ b/qse/lib/cmn/rex.c @@ -1,5 +1,5 @@ /* - * $Id: rex.c 207 2009-06-22 13:01:28Z hyunghwan.chung $ + * $Id: rex.c 279 2009-09-06 13:18:08Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. @@ -1483,16 +1483,25 @@ static const qse_byte_t* match_any_char ( while (p < mat->branch_end && cp->cmd == ((const atom_t*)p)->cmd) { - lbound += ((const atom_t*)p)->lbound; - ubound += ((const atom_t*)p)->ubound; + qse_size_t lb, ub; + + lb = ((const atom_t*)p)->lbound; + ub = ((const atom_t*)p)->ubound; + + /* perform minimal overflow check as this implementation + * uses the maximum value to mean infinite. + * consider the upper bound of '+' and '*'. */ + lbound = (BOUND_MAX-lb >= lbound)? (lbound + lb): BOUND_MAX; + lbound = (BOUND_MAX-lb >= lbound)? (lbound + lb): BOUND_MAX; + ubound = (BOUND_MAX-ub >= ubound)? (ubound + ub): BOUND_MAX; p += QSE_SIZEOF(*cp); } #ifdef DEBUG_REX qse_dprintf ( - QSE_T("match_any_char: lbound = %u, ubound = %u\n"), - (unsigned int)lbound, (unsigned int)ubound); + QSE_T("match_any_char: lbound = %lu, ubound = %lu\n"), + (unsigned long)lbound, (unsigned long)ubound); #endif /* find the longest match */ @@ -1504,7 +1513,7 @@ static const qse_byte_t* match_any_char ( #ifdef DEBUG_REX qse_dprintf ( - QSE_T("match_any_char: max si = %u\n"), (unsigned)si); + QSE_T("match_any_char: max si = %lu\n"), (unsigned long)si); #endif if (si >= lbound && si <= ubound) @@ -1529,42 +1538,45 @@ static const qse_byte_t* match_ord_char ( lbound = cp->lbound; ubound = cp->ubound; +#ifdef DEBUG_REX + qse_dprintf ( + QSE_T("match_ord_char: cc=%c, lbound=%lu, ubound=%lu\n"), + cc, (unsigned long)lbound, (unsigned long)ubound); +#endif + cc = *(qse_char_t*)p; p += QSE_SIZEOF(cc); if (matcher->option & QSE_REX_MATCH_IGNORECASE) cc = QSE_TOUPPER(cc); /* merge the same consecutive codes * for example, a{1,10}a{0,10} is shortened to a{1,20} */ - if (matcher->option & QSE_REX_MATCH_IGNORECASE) + while (p < mat->branch_end && + cp->cmd == ((const atom_t*)p)->cmd) { - while (p < mat->branch_end && - cp->cmd == ((const atom_t*)p)->cmd) - { - if (QSE_TOUPPER (*(qse_char_t*)(p+QSE_SIZEOF(*cp))) != cc) break; + qse_size_t lb, ub; + qse_char_t xc; - lbound += ((const atom_t*)p)->lbound; - ubound += ((const atom_t*)p)->ubound; + xc = *(qse_char_t*)(p+QSE_SIZEOF(*cp)); + if (matcher->option & QSE_REX_MATCH_IGNORECASE) + xc = QSE_TOUPPER(xc); - p += QSE_SIZEOF(*cp) + QSE_SIZEOF(cc); - } - } - else - { - while (p < mat->branch_end && - cp->cmd == ((const atom_t*)p)->cmd) - { - if (*(qse_char_t*)(p+QSE_SIZEOF(*cp)) != cc) break; + if (xc != cc) break; - lbound += ((const atom_t*)p)->lbound; - ubound += ((const atom_t*)p)->ubound; + lb = ((const atom_t*)p)->lbound; + ub = ((const atom_t*)p)->ubound; - p += QSE_SIZEOF(*cp) + QSE_SIZEOF(cc); - } + /* perform minimal overflow check as this implementation + * uses the maximum value to mean infinite. + * consider the upper bound of '+' and '*'. */ + lbound = (BOUND_MAX-lb >= lbound)? (lbound + lb): BOUND_MAX; + ubound = (BOUND_MAX-ub >= ubound)? (ubound + ub): BOUND_MAX; + + p += QSE_SIZEOF(*cp) + QSE_SIZEOF(cc); } #ifdef DEBUG_REX qse_dprintf ( - QSE_T("match_ord_char: cc = %c, lbound = %u, ubound = %u\n"), - cc, (unsigned int)lbound, (unsigned int)ubound); + QSE_T("match_ord_char(after merging): cc=%c, lbound=%lu, ubound=%lu\n"), + cc, (unsigned long)lbound, (unsigned long)ubound); #endif mat->matched = QSE_FALSE; @@ -1602,8 +1614,8 @@ static const qse_byte_t* match_ord_char ( #ifdef DEBUG_REX qse_dprintf ( - QSE_T("match_ord_char: max occurrences=%u, lbound=%u, ubound=%u\n"), - (unsigned)si, (unsigned)lbound, (unsigned)ubound); + QSE_T("match_ord_char: cc=%c, max occ=%lu, lbound=%lu, ubound=%lu\n"), + cc, (unsigned long)si, (unsigned long)lbound, (unsigned long)ubound); #endif if (si >= lbound && si <= ubound) @@ -1632,8 +1644,8 @@ static const qse_byte_t* match_charset ( #ifdef DEBUG_REX qse_dprintf ( - QSE_T("match_charset: lbound = %u, ubound = %u\n"), - (unsigned int)cp->lbound, (unsigned int)cp->ubound); + QSE_T("match_charset: lbound=%lu, ubound=%lu\n"), + (unsigned long)cp->lbound, (unsigned long)cp->ubound); #endif mat->matched = QSE_FALSE;