From 166c18c7d08899af63e0062611072ffd46af622c Mon Sep 17 00:00:00 2001 From: hyung-hwan Date: Fri, 13 Nov 2020 14:56:15 +0000 Subject: [PATCH] enhanced str::split() to handle byte strings better --- hawk/lib/fnc.c | 351 ++++++++++------------------------------ hawk/lib/misc-imp.h | 228 ++++++++++++++++++-------- hawk/lib/misc-prv.h | 33 ++-- hawk/lib/misc.c | 215 +++--------------------- hawk/lib/parse.c | 56 +++---- hawk/lib/run.c | 4 +- hawk/lib/tre-compile.c | 91 ++++------- hawk/lib/tre-match-ut.h | 181 +++++++++++---------- hawk/lib/tre-prv.h | 6 +- hawk/t/h-002.hawk | 27 ++++ 10 files changed, 471 insertions(+), 721 deletions(-) diff --git a/hawk/lib/fnc.c b/hawk/lib/fnc.c index 938860aa..06b7c41f 100644 --- a/hawk/lib/fnc.c +++ b/hawk/lib/fnc.c @@ -787,216 +787,23 @@ int hawk_fnc_substr (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi) return 0; } -#if 0 -static int split_mbs (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi) -{ - hawk_oow_t nargs; - hawk_val_t* a0, * a2, * t1, * t2; - hawk_val_type_t a2_vtype, t1_vtype; - - hawk_bcs_t str; - hawk_bcs_t fs; - hawk_bch_t* fs_free = HAWK_NULL; - const hawk_bch_t* p; - hawk_oow_t str_left, org_len; - hawk_tre_t* fs_rex = HAWK_NULL; - hawk_tre_t* fs_rex_free = HAWK_NULL; - - hawk_bcs_t tok; - hawk_int_t nflds; - int x; - - str.ptr = HAWK_NULL; - str.len = 0; - - nargs = hawk_rtx_getnargs(rtx); - HAWK_ASSERT (nargs >= 2 && nargs <= 3); - - a0 = hawk_rtx_getarg(rtx, 0); - a2 = (nargs >= 3)? hawk_rtx_getarg(rtx, 2): HAWK_NULL; - - str.ptr = hawk_rtx_getvalbcstr(rtx, a0, &str.len); - if (HAWK_UNLIKELY(!str.ptr)) goto oops; - - if (!a2) - { - /* get the value from FS */ - t1 = hawk_rtx_getgbl(rtx, HAWK_GBL_FS); - t1_vtype = HAWK_RTX_GETVALTYPE(rtx, t1); - if (t1_vtype == HAWK_VAL_NIL) - { - fs.ptr = " "; - fs.len = 1; - } - else if (t1_vtype == HAWK_VAL_MBS) - { - fs.ptr = ((hawk_val_mbs_t*)t1)->val.ptr; - fs.len = ((hawk_val_mbs_t*)t1)->val.len; - } - else - { - fs.ptr = hawk_rtx_valtobcstrdup(rtx, t1, &fs.len); - if (HAWK_UNLIKELY(!fs.ptr)) goto oops; - fs_free = (hawk_bch_t*)fs.ptr; - } - - if (fs.len > 1) fs_rex = rtx->gbl.fs[rtx->gbl.ignorecase]; - } - else - { - a2_vtype = HAWK_RTX_GETVALTYPE(rtx, a2); - - if (a2_vtype == HAWK_VAL_REX) - { - /* the third parameter is a regular expression */ - fs_rex = ((hawk_val_rex_t*)a2)->code[rtx->gbl.ignorecase]; - - /* make the loop below to take fs_rex by - * setting fs_len greater than 1*/ - fs.ptr = HAWK_NULL; - fs.len = 2; - } - else - { - if (a2_vtype == HAWK_VAL_MBS) - { - fs.ptr = ((hawk_val_mbs_t*)a2)->val.ptr; - fs.len = ((hawk_val_mbs_t*)a2)->val.len; - } - else - { - fs.ptr = hawk_rtx_valtobcstrdup(rtx, a2, &fs.len); - if (fs.ptr == HAWK_NULL) goto oops; - fs_free = (hawk_bch_t*)fs.ptr; - } - - if (fs.len > 1) - { - int x; - - x = rtx->gbl.ignorecase? - hawk_rtx_buildrex(rtx, fs.ptr, fs.len, HAWK_NULL, &fs_rex): - hawk_rtx_buildrex(rtx, fs.ptr, fs.len, &fs_rex, HAWK_NULL); - if (x <= -1) goto oops; - - fs_rex_free = fs_rex; - } - } - } - - t1 = hawk_rtx_makearrval(rtx); - if (HAWK_UNLIKELY(!t1)) goto oops; - - hawk_rtx_refupval (rtx, t1); - x = hawk_rtx_setrefval(rtx, (hawk_val_ref_t*)hawk_rtx_getarg(rtx, 1), t1); - hawk_rtx_refdownval (rtx, t1); - if (HAWK_UNLIKELY(x <= -1)) goto oops; - - /* fill the map with actual values */ - p = str.ptr; str_left = str.len; org_len = str.len; - nflds = 0; - - while (p) - { - hawk_bch_t key_buf[HAWK_SIZEOF(hawk_int_t)*8+2]; - hawk_oow_t key_len; - - if (fs.len <= 1) - { - p = hawk_rtx_tokoocharswithoochars(rtx, p, str.len, fs.ptr, fs.len, &tok); - } - else - { - p = hawk_rtx_tokoocharsbyrex(rtx, str.ptr, org_len, p, str.len, fs_rex, &tok); - if (p == HAWK_NULL && hawk_rtx_geterrnum(rtx) != HAWK_ENOERR) - { - goto oops; - } - } - - if (nflds == 0 && p == HAWK_NULL && tok.len == 0) - { - /* no field at all*/ - break; - } - - HAWK_ASSERT ((tok.ptr != HAWK_NULL && tok.len > 0) || tok.len == 0); - - /* create the field string - however, the split function must - * create a numeric value if the string is a number */ - /*t2 = hawk_rtx_makembsvalwithbcs (rtx, &tok);*/ - /*t2 = hawk_rtx_makenmbsvalwithbcs(rtx, &tok); */ - t2 = hawk_rtx_makenumormbsvalwithbchars(rtx, tok.ptr, tok.len); - if (HAWK_UNLIKELY(!t2)) goto oops; - - /* put it into the map */ - key_len = hawk_int_to_oocstr(++nflds, 10, HAWK_NULL, key_buf, HAWK_COUNTOF(key_buf)); - HAWK_ASSERT (key_len != (hawk_oow_t)-1); - - if (hawk_rtx_setarrvalfld(rtx, t1, key_buf, key_len, t2) == HAWK_NULL) - { - hawk_rtx_refupval (rtx, t2); - hawk_rtx_refdownval (rtx, t2); - goto oops; - } - - str.len = str_left - (p - str.ptr); - } - - /*if (str_free) hawk_rtx_freemem (rtx, str_free);*/ - hawk_rtx_freevalbcstr (rtx, a0, str.ptr); - - if (fs_free) hawk_rtx_freemem (rtx, fs_free); - - if (fs_rex_free) - { - if (rtx->gbl.ignorecase) - hawk_rtx_freerex (rtx, HAWK_NULL, fs_rex_free); - else - hawk_rtx_freerex (rtx, fs_rex_free, HAWK_NULL); - } - - /*nflds--;*/ - - t1 = hawk_rtx_makeintval(rtx, nflds); - if (HAWK_UNLIKELY(!t1)) return -1; - - hawk_rtx_setretval (rtx, t1); - return 0; - -oops: - if (str.ptr) hawk_rtx_freevalbcstr (rtx, a0, str.ptr); - - if (fs_free) hawk_rtx_freemem (rtx, fs_free); - - if (fs_rex_free) - { - if (rtx->gbl.ignorecase) - hawk_rtx_freerex (rtx, HAWK_NULL, fs_rex_free); - else - hawk_rtx_freerex (rtx, fs_rex_free, HAWK_NULL); - } - return -1; -} -#endif - static int fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi, int use_array) { hawk_oow_t nargs; - hawk_val_t* a0, * a2, * t1, * t2; - hawk_val_type_t a2_vtype, t1_vtype; + hawk_val_t* a0, * a2, * t0, * t1, * t2; hawk_oocs_t str; hawk_oocs_t fs; hawk_ooch_t* fs_free = HAWK_NULL; - const hawk_ooch_t* p; + hawk_ooch_t* p; + hawk_oow_t str_left, org_len; hawk_tre_t* fs_rex = HAWK_NULL; hawk_tre_t* fs_rex_free = HAWK_NULL; hawk_oocs_t tok; hawk_int_t nflds; - int x; + int x, byte_str, do_fld = 0; str.ptr = HAWK_NULL; str.len = 0; @@ -1007,65 +814,41 @@ static int fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi, int use_array) a0 = hawk_rtx_getarg(rtx, 0); a2 = (nargs >= 3)? hawk_rtx_getarg (rtx, 2): HAWK_NULL; - str.ptr = hawk_rtx_getvaloocstr(rtx, a0, &str.len); - if (HAWK_UNLIKELY(!str.ptr)) goto oops; + str.ptr = HAWK_NULL; + str.len = 0; - if (!a2) + /* field seperator */ + t0 = a2? a2: hawk_rtx_getgbl(rtx, HAWK_GBL_FS); /* if a2 is not available, get the value from FS */ + + if (HAWK_RTX_GETVALTYPE(rtx, t0) == HAWK_VAL_NIL) { - /* get the value from FS */ - t1 = hawk_rtx_getgbl(rtx, HAWK_GBL_FS); - t1_vtype = HAWK_RTX_GETVALTYPE(rtx, t1); - if (t1_vtype == HAWK_VAL_NIL) - { - fs.ptr = HAWK_T(" "); - fs.len = 1; - } - else if (t1_vtype == HAWK_VAL_STR) - { - fs.ptr = ((hawk_val_str_t*)t1)->val.ptr; - fs.len = ((hawk_val_str_t*)t1)->val.len; - } - else - { - fs.ptr = hawk_rtx_valtooocstrdup(rtx, t1, &fs.len); - if (HAWK_UNLIKELY(!fs.ptr)) goto oops; - fs_free = (hawk_ooch_t*)fs.ptr; - } - - if (fs.len > 1) fs_rex = rtx->gbl.fs[rtx->gbl.ignorecase]; + fs.ptr = HAWK_T(" "); + fs.len = 1; } - else + else if (HAWK_RTX_GETVALTYPE(rtx, t0) == HAWK_VAL_REX) { - a2_vtype = HAWK_RTX_GETVALTYPE (rtx, a2); + /* regular expression */ + fs_rex = ((hawk_val_rex_t*)t0)->code[rtx->gbl.ignorecase]; - if (a2_vtype == HAWK_VAL_REX) + /* make the tokenizing loop below to take fs_rex by setting fs_len greater than 1*/ + fs.ptr = HAWK_NULL; + fs.len = 2; + } + else + { + fs.ptr = hawk_rtx_getvaloocstr(rtx, t0, &fs.len); + if (HAWK_UNLIKELY(!fs.ptr)) goto oops; + + fs_free = fs.ptr; + + if (fs.len == 5 && fs.ptr[0] == '?') { - /* the third parameter is a regular expression */ - fs_rex = ((hawk_val_rex_t*)a2)->code[rtx->gbl.ignorecase]; - - /* make the loop below to take fs_rex by - * setting fs_len greater than 1*/ - fs.ptr = HAWK_NULL; - fs.len = 2; + do_fld = 1; } - else + else if (fs.len > 1) { - if (a2_vtype == HAWK_VAL_STR) + if (a2) { - fs.ptr = ((hawk_val_str_t*)a2)->val.ptr; - fs.len = ((hawk_val_str_t*)a2)->val.len; - } - else - { - fs.ptr = hawk_rtx_valtooocstrdup(rtx, a2, &fs.len); - if (fs.ptr == HAWK_NULL) goto oops; - fs_free = (hawk_ooch_t*)fs.ptr; - } - - if (fs.len > 1) - { - int x; - x = rtx->gbl.ignorecase? hawk_rtx_buildrex(rtx, fs.ptr, fs.len, HAWK_NULL, &fs_rex): hawk_rtx_buildrex(rtx, fs.ptr, fs.len, &fs_rex, HAWK_NULL); @@ -1073,9 +856,28 @@ static int fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi, int use_array) fs_rex_free = fs_rex; } + else + { + fs_rex = rtx->gbl.fs[rtx->gbl.ignorecase]; + } } } + /* the first parameter - string to split */ + if (HAWK_RTX_GETVALTYPE(rtx, a0) == HAWK_VAL_MBS) + { + byte_str = 1; + str.ptr = do_fld? hawk_rtx_valtobcstrdup(rtx, a0, &str.len): + hawk_rtx_getvalbcstr(rtx, a0, &str.len); + } + else + { + byte_str = 0; + str.ptr = do_fld? hawk_rtx_valtooocstrdup(rtx, a0, &str.len): + hawk_rtx_getvaloocstr(rtx, a0, &str.len); + } + if (HAWK_UNLIKELY(!str.ptr)) goto oops; + t1 = use_array? hawk_rtx_makearrval(rtx, 16): hawk_rtx_makemapval(rtx); if (HAWK_UNLIKELY(!t1)) goto oops; @@ -1090,20 +892,23 @@ static int fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi, int use_array) while (p) { - hawk_ooch_t key_buf[HAWK_SIZEOF(hawk_int_t)*8+2]; - hawk_oow_t key_len; - - if (fs.len <= 1) + if (fs_rex) { - p = hawk_rtx_tokoocharswithoochars(rtx, p, str.len, fs.ptr, fs.len, &tok); + p = byte_str? hawk_rtx_tokbcharsbyrex(rtx, str.ptr, org_len, p, str.len, fs_rex, &tok): + hawk_rtx_tokoocharsbyrex(rtx, str.ptr, org_len, p, str.len, fs_rex, &tok); + if (p && hawk_rtx_geterrnum(rtx) != HAWK_ENOERR) goto oops; + } + else if (do_fld) + { + /* [NOTE] even if byte_str is true, the field seperator is of the ooch type. + * there may be some data truncation and related issues */ + p = byte_str? hawk_rtx_fldbchars(rtx, p, str.len, fs.ptr[1], fs.ptr[2], fs.ptr[3], fs.ptr[4], &tok): + hawk_rtx_fldoochars(rtx, p, str.len, fs.ptr[1], fs.ptr[2], fs.ptr[3], fs.ptr[4], &tok); } else { - p = hawk_rtx_tokoocharsbyrex(rtx, str.ptr, org_len, p, str.len, fs_rex, &tok); - if (p == HAWK_NULL && hawk_rtx_geterrnum(rtx) != HAWK_ENOERR) - { - goto oops; - } + p = byte_str? hawk_rtx_tokbcharswithbchars(rtx, p, str.len, fs.ptr, fs.len, &tok): + hawk_rtx_tokoocharswithoochars(rtx, p, str.len, fs.ptr, fs.len, &tok); } if (nflds == 0 && p == HAWK_NULL && tok.len == 0) @@ -1118,7 +923,8 @@ static int fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi, int use_array) * create a numeric value if the string is a number */ /*t2 = hawk_rtx_makestrvalwithoocs (rtx, &tok);*/ /*t2 = hawk_rtx_makenstrvalwithoocs(rtx, &tok); */ - t2 = hawk_rtx_makenumorstrvalwithoochars(rtx, tok.ptr, tok.len); + t2 = byte_str? hawk_rtx_makenumormbsvalwithbchars(rtx, tok.ptr, tok.len): + hawk_rtx_makenumorstrvalwithoochars(rtx, tok.ptr, tok.len); if (HAWK_UNLIKELY(!t2)) goto oops; if (use_array) @@ -1133,6 +939,9 @@ static int fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi, int use_array) else { /* put it into the map */ + hawk_ooch_t key_buf[HAWK_SIZEOF(hawk_int_t)*8+2]; + hawk_oow_t key_len; + key_len = hawk_int_to_oocstr(++nflds, 10, HAWK_NULL, key_buf, HAWK_COUNTOF(key_buf)); HAWK_ASSERT (key_len != (hawk_oow_t)-1); @@ -1144,13 +953,17 @@ static int fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi, int use_array) } } - str.len = str_left - (p - str.ptr); + if (byte_str) + str.len = str_left - ((p - str.ptr) * HAWK_SIZEOF_OOCH_T); + else + str.len = str_left - (p - str.ptr); } - /*if (str_free) hawk_rtx_freemem (rtx, str_free);*/ - hawk_rtx_freevaloocstr (rtx, a0, str.ptr); + if (do_fld) hawk_rtx_freemem (rtx, str.ptr); + else if (byte_str) hawk_rtx_freevalbcstr (rtx, a0, str.ptr); + else hawk_rtx_freevaloocstr (rtx, a0, str.ptr); - if (fs_free) hawk_rtx_freemem (rtx, fs_free); + if (fs_free) hawk_rtx_freevaloocstr (rtx, t0, fs_free); if (fs_rex_free) { @@ -1160,16 +973,19 @@ static int fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi, int use_array) hawk_rtx_freerex (rtx, fs_rex_free, HAWK_NULL); } - /*nflds--;*/ - - t1 = hawk_rtx_makeintval (rtx, nflds); + t1 = hawk_rtx_makeintval(rtx, nflds); if (HAWK_UNLIKELY(!t1)) return -1; hawk_rtx_setretval (rtx, t1); return 0; oops: - if (str.ptr) hawk_rtx_freevaloocstr (rtx, a0, str.ptr); + if (str.ptr) + { + if (do_fld) hawk_rtx_freemem (rtx, str.ptr); + else if (byte_str) hawk_rtx_freevalbcstr (rtx, a0, str.ptr); + else hawk_rtx_freevaloocstr (rtx, a0, str.ptr); + } if (fs_free) hawk_rtx_freemem (rtx, fs_free); @@ -1185,7 +1001,8 @@ oops: int hawk_fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi) { - return fnc_split(rtx, fi, 1); + /*return fnc_split(rtx, fi, 1);*/ + return fnc_split(rtx, fi, 0); } int hawk_fnc_tolower (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi) diff --git a/hawk/lib/misc-imp.h b/hawk/lib/misc-imp.h index f6644688..45986399 100644 --- a/hawk/lib/misc-imp.h +++ b/hawk/lib/misc-imp.h @@ -24,6 +24,92 @@ THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +char_t* split_xchars_to_fields (hawk_rtx_t* rtx, char_t* str, hawk_oow_t len, char_t fs, char_t ec, char_t lq, char_t rq, xcs_t* tok) +{ + char_t* p = str; + char_t* end = str + len; + int escaped = 0, quoted = 0; + char_t* ts; /* token start */ + char_t* tp; /* points to one char past the last token char */ + char_t* xp; /* points to one char past the last effective char */ + + /* skip leading spaces */ + while (p < end && is_xch_space(*p)) p++; + + /* initialize token pointers */ + ts = tp = xp = p; + + while (p < end) + { + char c = *p; + + if (escaped) + { + *tp++ = c; xp = tp; p++; + escaped = 0; + } + else + { + if (c == ec) + { + escaped = 1; + p++; + } + else if (quoted) + { + if (c == rq) + { + quoted = 0; + p++; + } + else + { + *tp++ = c; xp = tp; p++; + } + } + else + { + if (c == fs) + { + tok->ptr = ts; + tok->len = xp - ts; + p++; + + if (is_xch_space(fs)) + { + while (p < end && *p == fs) p++; + if (p >= end) return HAWK_NULL; + } + + return p; + } + + if (c == lq) + { + quoted = 1; + p++; + } + else + { + *tp++ = c; p++; + if (!is_xch_space(c)) xp = tp; + } + } + } + } + + if (escaped) + { + /* if it is still escaped, the last character must be + * the escaper itself. treat it as a normal character */ + *xp++ = ec; + } + + tok->ptr = ts; + tok->len = xp - ts; + return HAWK_NULL; +} + char_t* tokenize_xchars (hawk_rtx_t* rtx, const char_t* s, hawk_oow_t len, const char_t* delim, hawk_oow_t delim_len, xcs_t* tok) { const char_t* p = s, *d; @@ -214,88 +300,102 @@ exit_loop: return (char_t*)++p; } -char_t* split_xchars_to_fields (hawk_rtx_t* rtx, char_t* str, hawk_oow_t len, char_t fs, char_t ec, char_t lq, char_t rq, xcs_t* tok) + +char_t* tokenize_xchars_by_rex (hawk_rtx_t* rtx, const char_t* str, hawk_oow_t len, const char_t* substr, hawk_oow_t sublen, hawk_tre_t* rex, xcs_t* tok) { - char_t* p = str; - char_t* end = str + len; - int escaped = 0, quoted = 0; - char_t* ts; /* token start */ - char_t* tp; /* points to one char past the last token char */ - char_t* xp; /* points to one char past the last effective char */ + int n; + hawk_oow_t i; + xcs_t match, s, cursub, realsub; - /* skip leading spaces */ - while (p < end && is_xch_space(*p)) p++; + s.ptr = (char_t*)str; + s.len = len; - /* initialize token pointers */ - ts = tp = xp = p; + cursub.ptr = (char_t*)substr; + cursub.len = sublen; - while (p < end) + realsub.ptr = (char_t*)substr; + realsub.len = sublen; + + while (cursub.len > 0) { - char c = *p; + n = match_rex_with_xcs(rtx, rex, &s, &cursub, &match, HAWK_NULL); + if (n <= -1) return HAWK_NULL; - if (escaped) + if (n == 0) { - *tp++ = c; xp = tp; p++; - escaped = 0; + /* no match has been found. return the entire string as a token */ + hawk_rtx_seterrnum (rtx, HAWK_NULL, HAWK_ENOERR); /* reset HAWK_EREXNOMAT to no error */ + tok->ptr = realsub.ptr; + tok->len = realsub.len; + return HAWK_NULL; } - else + + HAWK_ASSERT (n == 1); + + if (match.len == 0) { - if (c == ec) + /* the match length is zero. */ + cursub.ptr++; + cursub.len--; + } + else if (HAWK_RTX_IS_STRIPRECSPC_ON(rtx)) + { + /* match at the beginning of the input string */ + if (match.ptr == substr) { - escaped = 1; - p++; - } - else if (quoted) - { - if (c == rq) + for (i = 0; i < match.len; i++) { - quoted = 0; - p++; + if (!is_xch_space(match.ptr[i])) goto exit_loop; } - else - { - *tp++ = c; xp = tp; p++; - } - } - else - { - if (c == fs) - { - tok->ptr = ts; - tok->len = xp - ts; - p++; - if (is_xch_space(fs)) - { - while (p < end && *p == fs) p++; - if (p >= end) return HAWK_NULL; - } + /* the match that is all spaces at the + * beginning of the input string is skipped */ + cursub.ptr += match.len; + cursub.len -= match.len; - return p; - } - - if (c == lq) - { - quoted = 1; - p++; - } - else - { - *tp++ = c; p++; - if (!is_xch_space(c)) xp = tp; - } + /* adjust the substring by skipping the leading + * spaces and retry matching */ + realsub.ptr = (char_t*)substr + match.len; + realsub.len -= match.len; } + else break; + } + else break; + } + +exit_loop: + hawk_rtx_seterrnum (rtx, HAWK_NULL, HAWK_ENOERR); + + if (cursub.len <= 0) + { + tok->ptr = realsub.ptr; + tok->len = realsub.len; + return HAWK_NULL; + } + + tok->ptr = realsub.ptr; + tok->len = match.ptr - realsub.ptr; + + for (i = 0; i < match.len; i++) + { + if (!is_xch_space(match.ptr[i])) + { + /* the match contains a non-space character. */ + return (char_t*)match.ptr+match.len; } } - if (escaped) + /* the match is all spaces */ + if (HAWK_RTX_IS_STRIPRECSPC_ON(rtx)) { - /* if it is still escaped, the last character must be - * the escaper itself. treat it as a normal character */ - *xp++ = ec; + /* if the match reached the last character in the input string, + * it returns HAWK_NULL to terminate tokenization. */ + return (match.ptr+match.len >= substr+sublen)? HAWK_NULL: ((char_t*)match.ptr+match.len); + } + else + { + /* if the match went beyond the the last character in the input + * string, it returns HAWK_NULL to terminate tokenization. */ + return (match.ptr+match.len > substr+sublen)? HAWK_NULL: ((char_t*)match.ptr+match.len); } - - tok->ptr = ts; - tok->len = xp - ts; - return HAWK_NULL; } diff --git a/hawk/lib/misc-prv.h b/hawk/lib/misc-prv.h index 8dc93d06..081cb153 100644 --- a/hawk/lib/misc-prv.h +++ b/hawk/lib/misc-prv.h @@ -64,24 +64,37 @@ hawk_bch_t* hawk_rtx_tokbcharswithbchars ( const hawk_bch_t* delim, hawk_oow_t delim_len, hawk_bcs_t* tok); +hawk_uch_t* hawk_rtx_tokucharsbyrex ( + hawk_rtx_t* rtx, + const hawk_uch_t* str, + hawk_oow_t len, + const hawk_uch_t* substr, + hawk_oow_t sublen, + hawk_tre_t* rex, + hawk_ucs_t* tok +); + +hawk_bch_t* hawk_rtx_tokbcharsbyrex ( + hawk_rtx_t* rtx, + const hawk_bch_t* str, + hawk_oow_t len, + const hawk_bch_t* substr, + hawk_oow_t sublen, + hawk_tre_t* rex, + hawk_bcs_t* tok +); + + #if defined(HAWK_OOCH_IS_UCH) # define hawk_rtx_fldoochars hawk_rtx_flduchars # define hawk_rtx_tokoocharswithoochars hawk_rtx_tokucharswithuchars +# define hawk_rtx_tokoocharsbyrex hawk_rtx_tokucharsbyrex #else # define hawk_rtx_fldoochars hawk_rtx_fldbchars # define hawk_rtx_tokoocharswithoochars hawk_rtx_tokbcharswithbchars +# define hawk_rtx_tokoocharsbyrex hawk_rtx_tokbcharsbyrex #endif -hawk_ooch_t* hawk_rtx_tokoocharsbyrex ( - hawk_rtx_t* rtx, - const hawk_ooch_t* str, - hawk_oow_t len, - const hawk_ooch_t* substr, - hawk_oow_t sublen, - hawk_tre_t* rex, - hawk_oocs_t* tok -); - int hawk_rtx_matchvalwithucs ( hawk_rtx_t* rtx, hawk_val_t* val, diff --git a/hawk/lib/misc.c b/hawk/lib/misc.c index 42688e66..691d43e6 100644 --- a/hawk/lib/misc.c +++ b/hawk/lib/misc.c @@ -30,220 +30,41 @@ #undef char_t #undef xcs_t #undef is_xch_space -#undef tokenize_xchars +#undef match_rex_with_xcs #undef split_xchars_to_fields +#undef tokenize_xchars +#undef tokenize_xchars_by_rex + #define char_t hawk_bch_t #define xcs_t hawk_bcs_t #define is_xch_space hawk_is_bch_space -#define tokenize_xchars hawk_rtx_tokbcharswithbchars +#define match_rex_with_xcs hawk_rtx_matchrexwithbcs + #define split_xchars_to_fields hawk_rtx_fldbchars +#define tokenize_xchars hawk_rtx_tokbcharswithbchars +#define tokenize_xchars_by_rex hawk_rtx_tokbcharsbyrex + #include "misc-imp.h" #undef char_t #undef xcs_t #undef is_xch_space -#undef tokenize_xchars +#undef match_rex_with_xcs #undef split_xchars_to_fields +#undef tokenize_xchars +#undef tokenize_xchars_by_rex + #define char_t hawk_uch_t #define xcs_t hawk_ucs_t #define is_xch_space hawk_is_uch_space -#define tokenize_xchars hawk_rtx_tokucharswithuchars +#define match_rex_with_xcs hawk_rtx_matchrexwithucs + #define split_xchars_to_fields hawk_rtx_flduchars +#define tokenize_xchars hawk_rtx_tokucharswithuchars +#define tokenize_xchars_by_rex hawk_rtx_tokucharsbyrex + #include "misc-imp.h" -hawk_ooch_t* hawk_rtx_tokoocharsbyrex ( - hawk_rtx_t* rtx, - const hawk_ooch_t* str, hawk_oow_t len, - const hawk_ooch_t* substr, hawk_oow_t sublen, - hawk_tre_t* rex, hawk_oocs_t* tok) -{ - int n; - hawk_oow_t i; - hawk_oocs_t match, s, cursub, realsub; - - s.ptr = (hawk_ooch_t*)str; - s.len = len; - - cursub.ptr = (hawk_ooch_t*)substr; - cursub.len = sublen; - - realsub.ptr = (hawk_ooch_t*)substr; - realsub.len = sublen; - - while (cursub.len > 0) - { - n = hawk_rtx_matchrexwithoocs(rtx, rex, &s, &cursub, &match, HAWK_NULL); - if (n <= -1) return HAWK_NULL; - - if (n == 0) - { - /* no match has been found. return the entire string as a token */ - hawk_rtx_seterrnum (rtx, HAWK_NULL, HAWK_ENOERR); /* reset HAWK_EREXNOMAT to no error */ - tok->ptr = realsub.ptr; - tok->len = realsub.len; - return HAWK_NULL; - } - - HAWK_ASSERT (n == 1); - - if (match.len == 0) - { - /* the match length is zero. */ - cursub.ptr++; - cursub.len--; - } - else if (HAWK_RTX_IS_STRIPRECSPC_ON(rtx)) - { - /* match at the beginning of the input string */ - if (match.ptr == substr) - { - for (i = 0; i < match.len; i++) - { - if (!hawk_is_ooch_space(match.ptr[i])) goto exit_loop; - } - - /* the match that is all spaces at the - * beginning of the input string is skipped */ - cursub.ptr += match.len; - cursub.len -= match.len; - - /* adjust the substring by skipping the leading - * spaces and retry matching */ - realsub.ptr = (hawk_ooch_t*)substr + match.len; - realsub.len -= match.len; - } - else break; - } - else break; - } - -exit_loop: - hawk_rtx_seterrnum (rtx, HAWK_NULL, HAWK_ENOERR); - - if (cursub.len <= 0) - { - tok->ptr = realsub.ptr; - tok->len = realsub.len; - return HAWK_NULL; - } - - tok->ptr = realsub.ptr; - tok->len = match.ptr - realsub.ptr; - - for (i = 0; i < match.len; i++) - { - if (!hawk_is_ooch_space(match.ptr[i])) - { - /* the match contains a non-space character. */ - return (hawk_ooch_t*)match.ptr+match.len; - } - } - - /* the match is all spaces */ - if (HAWK_RTX_IS_STRIPRECSPC_ON(rtx)) - { - /* if the match reached the last character in the input string, - * it returns HAWK_NULL to terminate tokenization. */ - return (match.ptr+match.len >= substr+sublen)? HAWK_NULL: ((hawk_ooch_t*)match.ptr+match.len); - } - else - { - /* if the match went beyond the the last character in the input - * string, it returns HAWK_NULL to terminate tokenization. */ - return (match.ptr+match.len > substr+sublen)? HAWK_NULL: ((hawk_ooch_t*)match.ptr+match.len); - } -} - -#if 0 -hawk_ooch_t* hawk_rtx_strxnfld ( - hawk_rtx_t* rtx, hawk_ooch_t* str, hawk_oow_t len, - hawk_ooch_t fs, hawk_ooch_t ec, hawk_ooch_t lq, hawk_ooch_t rq, - hawk_oocs_t* tok) -{ - hawk_ooch_t* p = str; - hawk_ooch_t* end = str + len; - int escaped = 0, quoted = 0; - hawk_ooch_t* ts; /* token start */ - hawk_ooch_t* tp; /* points to one char past the last token char */ - hawk_ooch_t* xp; /* points to one char past the last effective char */ - - /* skip leading spaces */ - while (p < end && hawk_is_ooch_space(*p)) p++; - - /* initialize token pointers */ - ts = tp = xp = p; - - while (p < end) - { - char c = *p; - - if (escaped) - { - *tp++ = c; xp = tp; p++; - escaped = 0; - } - else - { - if (c == ec) - { - escaped = 1; - p++; - } - else if (quoted) - { - if (c == rq) - { - quoted = 0; - p++; - } - else - { - *tp++ = c; xp = tp; p++; - } - } - else - { - if (c == fs) - { - tok->ptr = ts; - tok->len = xp - ts; - p++; - - if (hawk_is_ooch_space(fs)) - { - while (p < end && *p == fs) p++; - if (p >= end) return HAWK_NULL; - } - - return p; - } - - if (c == lq) - { - quoted = 1; - p++; - } - else - { - *tp++ = c; p++; - if (!hawk_is_ooch_space(c)) xp = tp; - } - } - } - } - - if (escaped) - { - /* if it is still escaped, the last character must be - * the escaper itself. treat it as a normal character */ - *xp++ = ec; - } - - tok->ptr = ts; - tok->len = xp - ts; - return HAWK_NULL; -} -#endif static int matchtre_ucs (hawk_tre_t* tre, int opt, const hawk_ucs_t* str, hawk_ucs_t* mat, hawk_ucs_t submat[9], hawk_gem_t* errgem) { diff --git a/hawk/lib/parse.c b/hawk/lib/parse.c index bb65321b..b39f1f1c 100644 --- a/hawk/lib/parse.c +++ b/hawk/lib/parse.c @@ -2579,8 +2579,8 @@ static hawk_nde_t* parse_while (hawk_t* hawk, const hawk_loc_t* xloc) if (get_token(hawk) <= -1) goto oops; ploc = hawk->tok.loc; - test = parse_expr_withdc (hawk, &ploc); - if (test == HAWK_NULL) goto oops; + test = parse_expr_withdc(hawk, &ploc); + if (HAWK_UNLIKELY(!test)) goto oops; if (!MATCH(hawk,TOK_RPAREN)) { @@ -2591,11 +2591,11 @@ static hawk_nde_t* parse_while (hawk_t* hawk, const hawk_loc_t* xloc) if (get_token(hawk) <= -1) goto oops; ploc = hawk->tok.loc; - body = parse_statement (hawk, &ploc); - if (body == HAWK_NULL) goto oops; + body = parse_statement(hawk, &ploc); + if (HAWK_UNLIKELY(!body)) goto oops; - nde = (hawk_nde_while_t*) hawk_callocmem (hawk, HAWK_SIZEOF(*nde)); - if (nde == HAWK_NULL) + nde = (hawk_nde_while_t*)hawk_callocmem(hawk, HAWK_SIZEOF(*nde)); + if (HAWK_UNLIKELY(!nde)) { ADJERR_LOC (hawk, xloc); goto oops; @@ -2628,7 +2628,7 @@ static hawk_nde_t* parse_for (hawk_t* hawk, const hawk_loc_t* xloc) return HAWK_NULL; } if (get_token(hawk) <= -1) return HAWK_NULL; - + if (!MATCH(hawk,TOK_SEMICOLON)) { /* this line is very ugly. it checks the entire next @@ -2694,8 +2694,8 @@ static hawk_nde_t* parse_for (hawk_t* hawk, const hawk_loc_t* xloc) if (!MATCH(hawk,TOK_SEMICOLON)) { ploc = hawk->tok.loc; - test = parse_expr_withdc (hawk, &ploc); - if (test == HAWK_NULL) goto oops; + test = parse_expr_withdc(hawk, &ploc); + if (HAWK_UNLIKELY(!test)) goto oops; if (!MATCH(hawk,TOK_SEMICOLON)) { @@ -2717,8 +2717,8 @@ static hawk_nde_t* parse_for (hawk_t* hawk, const hawk_loc_t* xloc) hawk_loc_t eloc; eloc = hawk->tok.loc; - incr = parse_expr_withdc (hawk, &eloc); - if (incr == HAWK_NULL) goto oops; + incr = parse_expr_withdc(hawk, &eloc); + if (HAWK_UNLIKELY(!incr)) goto oops; } if (!MATCH(hawk,TOK_RPAREN)) @@ -2734,8 +2734,8 @@ static hawk_nde_t* parse_for (hawk_t* hawk, const hawk_loc_t* xloc) body = parse_statement (hawk, &ploc); if (body == HAWK_NULL) goto oops; - nde_for = (hawk_nde_for_t*) hawk_callocmem (hawk, HAWK_SIZEOF(*nde_for)); - if (nde_for == HAWK_NULL) + nde_for = (hawk_nde_for_t*)hawk_callocmem(hawk, HAWK_SIZEOF(*nde_for)); + if (HAWK_UNLIKELY(!nde_for)) { ADJERR_LOC (hawk, xloc); goto oops; @@ -2768,8 +2768,8 @@ static hawk_nde_t* parse_dowhile (hawk_t* hawk, const hawk_loc_t* xloc) HAWK_ASSERT (hawk->ptok.type == TOK_DO); ploc = hawk->tok.loc; - body = parse_statement (hawk, &ploc); - if (body == HAWK_NULL) goto oops; + body = parse_statement(hawk, &ploc); + if (HAWK_UNLIKELY(!body)) goto oops; while (MATCH(hawk,TOK_NEWLINE)) { @@ -2794,7 +2794,7 @@ static hawk_nde_t* parse_dowhile (hawk_t* hawk, const hawk_loc_t* xloc) ploc = hawk->tok.loc; test = parse_expr_withdc (hawk, &ploc); - if (test == HAWK_NULL) goto oops; + if (HAWK_UNLIKELY(!test)) goto oops; if (!MATCH(hawk,TOK_RPAREN)) { @@ -2803,9 +2803,9 @@ static hawk_nde_t* parse_dowhile (hawk_t* hawk, const hawk_loc_t* xloc) } if (get_token(hawk) <= -1) goto oops; - - nde = (hawk_nde_while_t*) hawk_callocmem (hawk, HAWK_SIZEOF(*nde)); - if (nde == HAWK_NULL) + + nde = (hawk_nde_while_t*)hawk_callocmem(hawk, HAWK_SIZEOF(*nde)); + if (HAWK_UNLIKELY(!nde)) { ADJERR_LOC (hawk, xloc); goto oops; @@ -2836,8 +2836,8 @@ static hawk_nde_t* parse_break (hawk_t* hawk, const hawk_loc_t* xloc) return HAWK_NULL; } - nde = (hawk_nde_break_t*) hawk_callocmem (hawk, HAWK_SIZEOF(*nde)); - if (nde == HAWK_NULL) + nde = (hawk_nde_break_t*)hawk_callocmem(hawk, HAWK_SIZEOF(*nde)); + if (HAWK_UNLIKELY(!nde)) { ADJERR_LOC (hawk, xloc); return HAWK_NULL; @@ -2845,7 +2845,7 @@ static hawk_nde_t* parse_break (hawk_t* hawk, const hawk_loc_t* xloc) nde->type = HAWK_NDE_BREAK; nde->loc = *xloc; - + return (hawk_nde_t*)nde; } @@ -2860,8 +2860,8 @@ static hawk_nde_t* parse_continue (hawk_t* hawk, const hawk_loc_t* xloc) return HAWK_NULL; } - nde = (hawk_nde_continue_t*) hawk_callocmem (hawk, HAWK_SIZEOF(*nde)); - if (nde == HAWK_NULL) + nde = (hawk_nde_continue_t*)hawk_callocmem(hawk, HAWK_SIZEOF(*nde)); + if (HAWK_UNLIKELY(!nde)) { ADJERR_LOC (hawk, xloc); return HAWK_NULL; @@ -2880,8 +2880,8 @@ static hawk_nde_t* parse_return (hawk_t* hawk, const hawk_loc_t* xloc) HAWK_ASSERT (hawk->ptok.type == TOK_RETURN); - nde = (hawk_nde_return_t*) hawk_callocmem ( hawk, HAWK_SIZEOF(*nde)); - if (nde == HAWK_NULL) + nde = (hawk_nde_return_t*)hawk_callocmem(hawk, HAWK_SIZEOF(*nde)); + if (HAWK_UNLIKELY(!nde)) { ADJERR_LOC (hawk, xloc); return HAWK_NULL; @@ -2900,8 +2900,8 @@ static hawk_nde_t* parse_return (hawk_t* hawk, const hawk_loc_t* xloc) hawk_loc_t eloc; eloc = hawk->tok.loc; - val = parse_expr_withdc (hawk, &eloc); - if (val == HAWK_NULL) + val = parse_expr_withdc(hawk, &eloc); + if (HAWK_UNLIKELY(!val)) { hawk_freemem (hawk, nde); return HAWK_NULL; diff --git a/hawk/lib/run.c b/hawk/lib/run.c index 9634e94a..9beccd5b 100644 --- a/hawk/lib/run.c +++ b/hawk/lib/run.c @@ -379,12 +379,12 @@ static int set_global (hawk_rtx_t* rtx, int idx, hawk_nde_var_t* var, hawk_val_t HAWK_ASSERT (vtype != HAWK_VAL_REX); out.type = HAWK_RTX_VALTOSTR_CPLDUP; - if (hawk_rtx_valtostr (rtx, val, &out) <= -1) return -1; + if (hawk_rtx_valtostr(rtx, val, &out) <= -1) return -1; fs_ptr = out.u.cpldup.ptr; fs_len = out.u.cpldup.len; } - if (fs_len > 1 && !(fs_len == 5 && fs_ptr[0] == HAWK_T('?'))) + if (fs_len > 1 && !(fs_len == 5 && fs_ptr[0] == '?')) { /* it's a regular expression if FS contains multiple characters. * however, it's not a regular expression if it's 5 character diff --git a/hawk/lib/tre-compile.c b/hawk/lib/tre-compile.c index 53798cd9..2fa2de38 100644 --- a/hawk/lib/tre-compile.c +++ b/hawk/lib/tre-compile.c @@ -1869,14 +1869,14 @@ tre_ast_to_tnfa(hawk_gem_t* gem, tre_ast_node_t *node, tre_tnfa_transition_t *tr } -#define ERROR_EXIT(err) \ - do \ - { \ - errcode = err; \ - if (/*CONSTCOND*/1) \ - goto error_exit; \ - } \ - while (/*CONSTCOND*/0) +#define ERROR_EXIT(err) \ + do \ + { \ + errcode = err; \ + if (/*CONSTCOND*/1) \ + goto error_exit; \ + } \ + while (/*CONSTCOND*/0) int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags) @@ -1901,11 +1901,10 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags) /* HAWK: deleted limit on the stack size stack = tre_stack_new(preg->gem, 512, 10240, 128); */ stack = tre_stack_new(preg->gem, 512, -1, 128); - if (!stack) - return REG_ESPACE; + if (HAWK_UNLIKELY(!stack)) return REG_ESPACE; /* Allocate a fast memory allocator. */ mem = tre_mem_new(preg->gem); - if (!mem) + if (HAWK_UNLIKELY(!mem)) { tre_stack_destroy(stack); return REG_ESPACE; @@ -1921,8 +1920,7 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags) parse_ctx.max_backref = -1; DPRINT(("tre_compile: parsing '%.*" STRF "'\n", (int)n, regex)); errcode = tre_parse(&parse_ctx); - if (errcode != REG_OK) - ERROR_EXIT(errcode); + if (errcode != REG_OK) ERROR_EXIT(errcode); preg->re_nsub = parse_ctx.submatch_id - 1; tree = parse_ctx.result; @@ -1941,8 +1939,8 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags) /* Allocate the TNFA struct. */ tnfa = xcalloc(preg->gem, 1, sizeof(tre_tnfa_t)); - if (tnfa == NULL) - ERROR_EXIT(REG_ESPACE); + if (HAWK_UNLIKELY(!tnfa)) ERROR_EXIT(REG_ESPACE); + tnfa->have_backrefs = parse_ctx.max_backref >= 0; tnfa->have_approx = parse_ctx.have_approx; tnfa->num_submatches = parse_ctx.submatch_id; @@ -1966,26 +1964,21 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags) { tag_directions = xmalloc(preg->gem,sizeof(*tag_directions) * (tnfa->num_tags + 1)); - if (tag_directions == NULL) - ERROR_EXIT(REG_ESPACE); + if (tag_directions == NULL) ERROR_EXIT(REG_ESPACE); tnfa->tag_directions = tag_directions; - HAWK_MEMSET(tag_directions, -1, - sizeof(*tag_directions) * (tnfa->num_tags + 1)); + HAWK_MEMSET(tag_directions, -1, sizeof(*tag_directions) * (tnfa->num_tags + 1)); } tnfa->minimal_tags = xcalloc(preg->gem, (unsigned)tnfa->num_tags * 2 + 1, sizeof(tnfa->minimal_tags)); if (tnfa->minimal_tags == NULL) ERROR_EXIT(REG_ESPACE); - submatch_data = xcalloc(preg->gem,(unsigned)parse_ctx.submatch_id, - sizeof(*submatch_data)); - if (submatch_data == NULL) - ERROR_EXIT(REG_ESPACE); + submatch_data = xcalloc(preg->gem,(unsigned)parse_ctx.submatch_id, sizeof(*submatch_data)); + if (HAWK_UNLIKELY(!submatch_data)) ERROR_EXIT(REG_ESPACE); tnfa->submatch_data = submatch_data; errcode = tre_add_tags(mem, stack, tree, tnfa, 0); - if (errcode != REG_OK) - ERROR_EXIT(errcode); + if (errcode != REG_OK) ERROR_EXIT(errcode); #ifdef TRE_DEBUG for (i = 0; i < parse_ctx.submatch_id; i++) @@ -1999,10 +1992,8 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags) } /* Expand iteration nodes. */ - errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position, - tag_directions, &tnfa->params_depth); - if (errcode != REG_OK) - ERROR_EXIT(errcode); + errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position, tag_directions, &tnfa->params_depth); + if (errcode != REG_OK) ERROR_EXIT(errcode); /* Add a dummy node for the final state. XXX - For certain patterns this dummy node can be optimized away, @@ -2010,12 +2001,10 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags) this possibility. */ tmp_ast_l = tree; tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++); - if (tmp_ast_r == NULL) - ERROR_EXIT(REG_ESPACE); + if (HAWK_UNLIKELY(!tmp_ast_r)) ERROR_EXIT(REG_ESPACE); tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r); - if (tree == NULL) - ERROR_EXIT(REG_ESPACE); + if (HAWK_UNLIKELY(!tree)) ERROR_EXIT(REG_ESPACE); #ifdef TRE_DEBUG tre_ast_print(tree); @@ -2023,16 +2012,13 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags) #endif /* TRE_DEBUG */ errcode = tre_compute_nfl(mem, stack, tree); - if (errcode != REG_OK) - ERROR_EXIT(errcode); + if (errcode != REG_OK) ERROR_EXIT(errcode); counts = xmalloc(preg->gem,sizeof(int) * parse_ctx.position); - if (counts == NULL) - ERROR_EXIT(REG_ESPACE); + if (HAWK_UNLIKELY(!counts)) ERROR_EXIT(REG_ESPACE); offs = xmalloc(preg->gem,sizeof(int) * parse_ctx.position); - if (offs == NULL) - ERROR_EXIT(REG_ESPACE); + if (HAWK_UNLIKELY(!offs)) ERROR_EXIT(REG_ESPACE); for (i = 0; i < parse_ctx.position; i++) counts[i] = 0; @@ -2046,15 +2032,13 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags) counts[i] = 0; } transitions = xcalloc(preg->gem, (unsigned)add + 1, sizeof(*transitions)); - if (transitions == NULL) - ERROR_EXIT(REG_ESPACE); + if (HAWK_UNLIKELY(!transitions)) ERROR_EXIT(REG_ESPACE); tnfa->transitions = transitions; tnfa->num_transitions = add; DPRINT(("Converting to TNFA:\n")); errcode = tre_ast_to_tnfa(preg->gem, tree, transitions, counts, offs); - if (errcode != REG_OK) - ERROR_EXIT(errcode); + if (errcode != REG_OK) ERROR_EXIT(errcode); /* If in eight bit mode, compute a table of characters that can be the first character of a match. */ @@ -2145,8 +2129,7 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags) } initial = xcalloc(preg->gem, (unsigned)i + 1, sizeof(tre_tnfa_transition_t)); - if (initial == NULL) - ERROR_EXIT(REG_ESPACE); + if (HAWK_UNLIKELY(!initial)) ERROR_EXIT(REG_ESPACE); tnfa->initial = initial; i = 0; @@ -2162,18 +2145,15 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags) int j; for (j = 0; p->tags[j] >= 0; j++); initial[i].tags = xmalloc(preg->gem,sizeof(*p->tags) * (j + 1)); - if (!initial[i].tags) - ERROR_EXIT(REG_ESPACE); + if (HAWK_UNLIKELY(!initial[i].tags)) ERROR_EXIT(REG_ESPACE); HAWK_MEMCPY (initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1)); } initial[i].params = NULL; if (p->params) { initial[i].params = xmalloc(preg->gem,sizeof(*p->params) * TRE_PARAM_LAST); - if (!initial[i].params) - ERROR_EXIT(REG_ESPACE); - HAWK_MEMCPY (initial[i].params, p->params, - sizeof(*p->params) * TRE_PARAM_LAST); + if (HAWK_UNLIKELY(!initial[i].params)) ERROR_EXIT(REG_ESPACE); + HAWK_MEMCPY (initial[i].params, p->params, sizeof(*p->params) * TRE_PARAM_LAST); } initial[i].assertions = p->assertions; i++; @@ -2198,12 +2178,9 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags) error_exit: /* Free everything that was allocated and return the error code. */ tre_mem_destroy(mem); - if (stack != NULL) - tre_stack_destroy(stack); - if (counts != NULL) - xfree(preg->gem,counts); - if (offs != NULL) - xfree(preg->gem,offs); + if (stack) tre_stack_destroy(stack); + if (counts) xfree(preg->gem,counts); + if (offs) xfree(preg->gem,offs); preg->TRE_REGEX_T_FIELD = (void *)tnfa; tre_free(preg); return errcode; diff --git a/hawk/lib/tre-match-ut.h b/hawk/lib/tre-match-ut.h index 061a5f5c..cdf91fe6 100644 --- a/hawk/lib/tre-match-ut.h +++ b/hawk/lib/tre-match-ut.h @@ -64,83 +64,83 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* Wide character and multibyte support. */ -#define GET_NEXT_WCHAR() \ - do { \ - prev_c = next_c; \ - if (type == STR_BYTE) \ - { \ - pos++; \ - if (len >= 0 && pos >= len) \ - next_c = '\0'; \ - else \ - next_c = (unsigned char)(*str_byte++); \ - } \ - else if (type == STR_WIDE) \ - { \ - pos++; \ - if (len >= 0 && pos >= len) \ - next_c = HAWK_T('\0'); \ - else \ - next_c = *str_wide++; \ - } \ - else if (type == STR_MBS) \ - { \ - pos += pos_add_next; \ - if (str_byte == NULL) \ - next_c = HAWK_T('\0'); \ - else \ - { \ - size_t w; \ - int max; \ - if (len >= 0) \ - max = len - pos; \ - else \ - max = 32; \ - if (max <= 0) \ - { \ - next_c = HAWK_T('\0'); \ - pos_add_next = 1; \ - } \ - else \ - { \ - w = hawk_mbrtowc(str_byte, (size_t)max, &next_c, &mbstate); \ - if (w <= 0 || w > max) \ - return REG_NOMATCH; \ - if (next_c == HAWK_T('\0') && len >= 0) \ - { \ - pos_add_next = 1; \ - next_c = 0; \ - str_byte++; \ - } \ - else \ - { \ - pos_add_next = w; \ - str_byte += w; \ - } \ - } \ - } \ - } \ - } while(/*CONSTCOND*/0) +#define GET_NEXT_WCHAR() \ + do { \ + prev_c = next_c; \ + if (type == STR_BYTE) \ + { \ + pos++; \ + if (len >= 0 && pos >= len) \ + next_c = '\0'; \ + else \ + next_c = (unsigned char)(*str_byte++); \ + } \ + else if (type == STR_WIDE) \ + { \ + pos++; \ + if (len >= 0 && pos >= len) \ + next_c = '\0'; \ + else \ + next_c = *str_wide++; \ + } \ + else if (type == STR_MBS) \ + { \ + pos += pos_add_next; \ + if (str_byte == NULL) \ + next_c = '\0'; \ + else \ + { \ + size_t w; \ + int max; \ + if (len >= 0) \ + max = len - pos; \ + else \ + max = 32; \ + if (max <= 0) \ + { \ + next_c = '\0'; \ + pos_add_next = 1; \ + } \ + else \ + { \ + w = hawk_mbrtowc(str_byte, (size_t)max, &next_c, &mbstate); \ + if (w <= 0 || w > max) \ + return REG_NOMATCH; \ + if (next_c == '\0' && len >= 0) \ + { \ + pos_add_next = 1; \ + next_c = 0; \ + str_byte++; \ + } \ + else \ + { \ + pos_add_next = w; \ + str_byte += w; \ + } \ + } \ + } \ + } \ + } while(/*CONSTCOND*/0) #else /* !TRE_MULTIBYTE */ /* Wide character support, no multibyte support. */ -#define GET_NEXT_WCHAR() \ -do { \ - prev_c = next_c; \ - if (type == STR_BYTE) \ - { \ - pos++; \ - if (len >= 0 && pos >= len) next_c = HAWK_BT('\0'); \ - else next_c = (unsigned char)(*str_byte++); \ - } \ - else if (type == STR_WIDE) \ - { \ - pos++; \ - if (len >= 0 && pos >= len) next_c = HAWK_T('\0'); \ - else next_c = *str_wide++; \ - } \ +#define GET_NEXT_WCHAR() \ +do { \ + prev_c = next_c; \ + if (type == STR_BYTE) \ + { \ + pos++; \ + if (len >= 0 && pos >= len) next_c = '\0'; \ + else next_c = (unsigned char)(*str_byte++); \ + } \ + else if (type == STR_WIDE) \ + { \ + pos++; \ + if (len >= 0 && pos >= len) next_c = '\0'; \ + else next_c = *str_wide++; \ + } \ } while(/*CONSTCOND*/0) #endif /* !TRE_MULTIBYTE */ @@ -166,22 +166,22 @@ do { \ #define IS_WORD_CHAR(c) ((c) == HAWK_T('_') || tre_isalnum(c)) -#define CHECK_ASSERTIONS(assertions) \ - (((assertions & ASSERT_AT_BOL) \ - && (pos > 0 || reg_notbol) \ - && (prev_c != HAWK_T('\n') || !reg_newline)) \ - || ((assertions & ASSERT_AT_EOL) \ - && (next_c != HAWK_T('\0') || reg_noteol) \ - && (next_c != HAWK_T('\n') || !reg_newline)) \ - || ((assertions & ASSERT_AT_BOW) \ - && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c))) \ - || ((assertions & ASSERT_AT_EOW) \ - && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c))) \ - || ((assertions & ASSERT_AT_WB) \ - && (pos != 0 && next_c != HAWK_T('\0') \ - && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c))) \ - || ((assertions & ASSERT_AT_WB_NEG) \ - && (pos == 0 || next_c == HAWK_T('\0') \ +#define CHECK_ASSERTIONS(assertions) \ + (((assertions & ASSERT_AT_BOL) \ + && (pos > 0 || reg_notbol) \ + && (prev_c != HAWK_T('\n') || !reg_newline)) \ + || ((assertions & ASSERT_AT_EOL) \ + && (next_c != HAWK_T('\0') || reg_noteol) \ + && (next_c != HAWK_T('\n') || !reg_newline)) \ + || ((assertions & ASSERT_AT_BOW) \ + && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c))) \ + || ((assertions & ASSERT_AT_EOW) \ + && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c))) \ + || ((assertions & ASSERT_AT_WB) \ + && (pos != 0 && next_c != HAWK_T('\0') \ + && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c))) \ + || ((assertions & ASSERT_AT_WB_NEG) \ + && (pos == 0 || next_c == HAWK_T('\0') \ || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c)))) #define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags) \ @@ -191,7 +191,7 @@ do { \ || ((trans_i->assertions & ASSERT_CHAR_CLASS) \ && (tnfa->cflags & REG_ICASE) \ && !tre_isctype(tre_tolower((tre_cint_t)prev_c),trans_i->u.class) \ - && !tre_isctype(tre_toupper((tre_cint_t)prev_c),trans_i->u.class)) \ + && !tre_isctype(tre_toupper((tre_cint_t)prev_c),trans_i->u.class)) \ || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG) \ && tre_neg_char_classes_match(trans_i->neg_classes,(tre_cint_t)prev_c,\ tnfa->cflags & REG_ICASE))) @@ -201,8 +201,7 @@ do { \ /* Returns 1 if `t1' wins `t2', 0 otherwise. */ HAWK_INLINE static int -tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions, - int *t1, int *t2) +tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions, int *t1, int *t2) { int i; for (i = 0; i < num_tags; i++) diff --git a/hawk/lib/tre-prv.h b/hawk/lib/tre-prv.h index 6cd31926..db49dc96 100644 --- a/hawk/lib/tre-prv.h +++ b/hawk/lib/tre-prv.h @@ -169,11 +169,7 @@ SUBMATCH[4] = [defg] #define tre_tolower(c) hawk_to_ooch_lower(c) #define tre_toupper(c) hawk_to_ooch_upper(c) -#if defined(HAWK_OOCH_IS_BCH) && (HAWK_SIZEOF_MCHAR_T == HAWK_SIZEOF_CHAR) - typedef unsigned char tre_char_t; -#else - typedef hawk_ooch_t tre_char_t; -#endif +typedef hawk_ooch_t tre_char_t; typedef hawk_ooci_t tre_cint_t; #define size_t hawk_oow_t diff --git a/hawk/t/h-002.hawk b/hawk/t/h-002.hawk index 6973bcb3..4760dcdd 100644 --- a/hawk/t/h-002.hawk +++ b/hawk/t/h-002.hawk @@ -291,6 +291,33 @@ function main() ensure (a[2] === @b"Is", 1, @SCRIPTNAME, @SCRIPTLINE); ensure (a[3] === @b"Some", 1, @SCRIPTNAME, @SCRIPTLINE); ensure (a[4] === @b"Data", 1, @SCRIPTNAME, @SCRIPTLINE); + + ensure (split(@b"Here===Is=Some=====Data", a, /=+/), 4, @SCRIPTNAME, @SCRIPTLINE); + ensure (a[1] === @b"Here", 1, @SCRIPTNAME, @SCRIPTLINE); + ensure (a[2] === @b"Is", 1, @SCRIPTNAME, @SCRIPTLINE); + ensure (a[3] === @b"Some", 1, @SCRIPTNAME, @SCRIPTLINE); + ensure (a[4] === @b"Data", 1, @SCRIPTNAME, @SCRIPTLINE); + + ensure (split("[Here] : [Is] : [So\\]me] :[Da:ta]", a, "?:\\[]"), 4, @SCRIPTNAME, @SCRIPTLINE); + ensure (a[1] === "Here", 1, @SCRIPTNAME, @SCRIPTLINE); + ensure (a[2] === "Is", 1, @SCRIPTNAME, @SCRIPTLINE); + ensure (a[3] === "So]me", 1, @SCRIPTNAME, @SCRIPTLINE); + ensure (a[4] === "Da:ta", 1, @SCRIPTNAME, @SCRIPTLINE); + + ensure (split(@b"[Here] : [Is] : [So\\]me] :[Da:ta]", a, "?:\\[]"), 4, @SCRIPTNAME, @SCRIPTLINE); + ensure (a[1] === @b"Here", 1, @SCRIPTNAME, @SCRIPTLINE); + ensure (a[2] === @b"Is", 1, @SCRIPTNAME, @SCRIPTLINE); + ensure (a[3] === @b"So]me", 1, @SCRIPTNAME, @SCRIPTLINE); + ensure (a[4] === @b"Da:ta", 1, @SCRIPTNAME, @SCRIPTLINE); + + ensure (split("Here===Is=Some=====Data", a, ""), 23, @SCRIPTNAME, @SCRIPTLINE); + + ensure (split("Here Is Some Data", a, / /), 7, @SCRIPTNAME, @SCRIPTLINE); + ensure (split("Here Is Some Data", a, " "), 4, @SCRIPTNAME, @SCRIPTLINE); + ensure (a[1] === "Here", 1, @SCRIPTNAME, @SCRIPTLINE); + ensure (a[2] === "Is", 1, @SCRIPTNAME, @SCRIPTLINE); + ensure (a[3] === "Some", 1, @SCRIPTNAME, @SCRIPTLINE); + ensure (a[4] === "Data", 1, @SCRIPTNAME, @SCRIPTLINE); } print "SUCCESS";