enhanced str::split() to handle byte strings better

This commit is contained in:
hyung-hwan 2020-11-13 14:56:15 +00:00
parent 4a60654b49
commit 166c18c7d0
10 changed files with 471 additions and 721 deletions

View File

@ -787,216 +787,23 @@ int hawk_fnc_substr (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi)
return 0; return 0;
} }
#if 0
/*
 * split_mbs - byte-string flavour of the split() built-in.
 *
 * Arguments taken from the runtime context:
 *   arg 0 - the value to split; obtained as a byte string with
 *           hawk_rtx_getvalbcstr().
 *   arg 1 - a reference; it is set to a freshly created array value that
 *           receives the fields.
 *   arg 2 - optional field separator (a regular-expression value or anything
 *           convertible to a byte string). When absent, the FS global is used.
 *
 * A separator longer than one character is treated as a regular expression.
 * Each token is stored under the decimal string of its 1-based index.
 *
 * On success the return value of the call is set to the number of fields and
 * 0 is returned. On failure every temporary acquired here is released and
 * -1 is returned.
 *
 * NOTE(review): the tokenizers and hawk_int_to_oocstr() called below are the
 * "oo" character variants while every buffer here is hawk_bch_t based;
 * presumably this is only type-correct when hawk_ooch_t is hawk_bch_t -
 * confirm before enabling this code.
 */
static int split_mbs (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi)
{
hawk_oow_t nargs;
hawk_val_t* a0, * a2, * t1, * t2;
hawk_val_type_t a2_vtype, t1_vtype;
hawk_bcs_t str;
hawk_bcs_t fs;
hawk_bch_t* fs_free = HAWK_NULL; /* non-null only when fs.ptr was duplicated here */
const hawk_bch_t* p;
hawk_oow_t str_left, org_len;
hawk_tre_t* fs_rex = HAWK_NULL;
hawk_tre_t* fs_rex_free = HAWK_NULL; /* non-null only when fs_rex was built here */
hawk_bcs_t tok;
hawk_int_t nflds;
int x;
/* keep str.ptr null until acquired so that the oops path can test it */
str.ptr = HAWK_NULL;
str.len = 0;
nargs = hawk_rtx_getnargs(rtx);
HAWK_ASSERT (nargs >= 2 && nargs <= 3);
a0 = hawk_rtx_getarg(rtx, 0);
a2 = (nargs >= 3)? hawk_rtx_getarg(rtx, 2): HAWK_NULL;
str.ptr = hawk_rtx_getvalbcstr(rtx, a0, &str.len);
if (HAWK_UNLIKELY(!str.ptr)) goto oops;
if (!a2)
{
/* no separator argument - get the value from FS */
t1 = hawk_rtx_getgbl(rtx, HAWK_GBL_FS);
t1_vtype = HAWK_RTX_GETVALTYPE(rtx, t1);
if (t1_vtype == HAWK_VAL_NIL)
{
/* FS is nil - fall back to a single space */
fs.ptr = " ";
fs.len = 1;
}
else if (t1_vtype == HAWK_VAL_MBS)
{
/* borrow the byte string held by the value; nothing to free */
fs.ptr = ((hawk_val_mbs_t*)t1)->val.ptr;
fs.len = ((hawk_val_mbs_t*)t1)->val.len;
}
else
{
/* any other type - convert to a duplicated byte string that is freed on exit */
fs.ptr = hawk_rtx_valtobcstrdup(rtx, t1, &fs.len);
if (HAWK_UNLIKELY(!fs.ptr)) goto oops;
fs_free = (hawk_bch_t*)fs.ptr;
}
/* a multi-character FS uses the regular expression kept in the runtime context */
if (fs.len > 1) fs_rex = rtx->gbl.fs[rtx->gbl.ignorecase];
}
else
{
a2_vtype = HAWK_RTX_GETVALTYPE(rtx, a2);
if (a2_vtype == HAWK_VAL_REX)
{
/* the third parameter is a regular expression */
fs_rex = ((hawk_val_rex_t*)a2)->code[rtx->gbl.ignorecase];
/* make the tokenizing loop below take the fs_rex branch
 * by setting fs.len greater than 1 */
fs.ptr = HAWK_NULL;
fs.len = 2;
}
else
{
if (a2_vtype == HAWK_VAL_MBS)
{
/* borrow the byte string held by the value; nothing to free */
fs.ptr = ((hawk_val_mbs_t*)a2)->val.ptr;
fs.len = ((hawk_val_mbs_t*)a2)->val.len;
}
else
{
fs.ptr = hawk_rtx_valtobcstrdup(rtx, a2, &fs.len);
if (fs.ptr == HAWK_NULL) goto oops;
fs_free = (hawk_bch_t*)fs.ptr;
}
if (fs.len > 1)
{
/* build a regular expression for the current ignorecase mode only.
 * this inner 'x' shadows the function-level 'x'. */
int x;
x = rtx->gbl.ignorecase?
hawk_rtx_buildrex(rtx, fs.ptr, fs.len, HAWK_NULL, &fs_rex):
hawk_rtx_buildrex(rtx, fs.ptr, fs.len, &fs_rex, HAWK_NULL);
if (x <= -1) goto oops;
fs_rex_free = fs_rex;
}
}
}
/* create the destination array and store it through the reference in arg 1.
 * the reference count is raised around the assignment and dropped afterwards. */
t1 = hawk_rtx_makearrval(rtx);
if (HAWK_UNLIKELY(!t1)) goto oops;
hawk_rtx_refupval (rtx, t1);
x = hawk_rtx_setrefval(rtx, (hawk_val_ref_t*)hawk_rtx_getarg(rtx, 1), t1);
hawk_rtx_refdownval (rtx, t1);
if (HAWK_UNLIKELY(x <= -1)) goto oops;
/* fill the array with actual values */
p = str.ptr; str_left = str.len; org_len = str.len;
nflds = 0;
while (p)
{
hawk_bch_t key_buf[HAWK_SIZEOF(hawk_int_t)*8+2];
hawk_oow_t key_len;
if (fs.len <= 1)
{
/* zero or one separator character - plain character tokenizer */
p = hawk_rtx_tokoocharswithoochars(rtx, p, str.len, fs.ptr, fs.len, &tok);
}
else
{
/* regular-expression tokenizer. a null result is an error only
 * if the error number is set */
p = hawk_rtx_tokoocharsbyrex(rtx, str.ptr, org_len, p, str.len, fs_rex, &tok);
if (p == HAWK_NULL && hawk_rtx_geterrnum(rtx) != HAWK_ENOERR)
{
goto oops;
}
}
if (nflds == 0 && p == HAWK_NULL && tok.len == 0)
{
/* no field at all */
break;
}
HAWK_ASSERT ((tok.ptr != HAWK_NULL && tok.len > 0) || tok.len == 0);
/* create the field string - however, the split function must
 * create a numeric value if the string is a number */
/*t2 = hawk_rtx_makembsvalwithbcs (rtx, &tok);*/
/*t2 = hawk_rtx_makenmbsvalwithbcs(rtx, &tok); */
t2 = hawk_rtx_makenumormbsvalwithbchars(rtx, tok.ptr, tok.len);
if (HAWK_UNLIKELY(!t2)) goto oops;
/* store it under the decimal form of the incremented field count */
key_len = hawk_int_to_oocstr(++nflds, 10, HAWK_NULL, key_buf, HAWK_COUNTOF(key_buf));
HAWK_ASSERT (key_len != (hawk_oow_t)-1);
if (hawk_rtx_setarrvalfld(rtx, t1, key_buf, key_len, t2) == HAWK_NULL)
{
/* presumably the up/down pair releases the value that was never stored - confirm */
hawk_rtx_refupval (rtx, t2);
hawk_rtx_refdownval (rtx, t2);
goto oops;
}
/* length remaining after the position returned by the tokenizer.
 * this is evaluated even when p is null, right before the loop ends. */
str.len = str_left - (p - str.ptr);
}
/*if (str_free) hawk_rtx_freemem (rtx, str_free);*/
/* success - release the temporaries acquired above */
hawk_rtx_freevalbcstr (rtx, a0, str.ptr);
if (fs_free) hawk_rtx_freemem (rtx, fs_free);
if (fs_rex_free)
{
/* free through the same slot that hawk_rtx_buildrex() filled */
if (rtx->gbl.ignorecase)
hawk_rtx_freerex (rtx, HAWK_NULL, fs_rex_free);
else
hawk_rtx_freerex (rtx, fs_rex_free, HAWK_NULL);
}
/*nflds--;*/
/* the return value of the call is the number of fields stored */
t1 = hawk_rtx_makeintval(rtx, nflds);
if (HAWK_UNLIKELY(!t1)) return -1;
hawk_rtx_setretval (rtx, t1);
return 0;
oops:
/* failure - release whatever has been acquired so far */
if (str.ptr) hawk_rtx_freevalbcstr (rtx, a0, str.ptr);
if (fs_free) hawk_rtx_freemem (rtx, fs_free);
if (fs_rex_free)
{
if (rtx->gbl.ignorecase)
hawk_rtx_freerex (rtx, HAWK_NULL, fs_rex_free);
else
hawk_rtx_freerex (rtx, fs_rex_free, HAWK_NULL);
}
return -1;
}
#endif
static int fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi, int use_array) static int fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi, int use_array)
{ {
hawk_oow_t nargs; hawk_oow_t nargs;
hawk_val_t* a0, * a2, * t1, * t2; hawk_val_t* a0, * a2, * t0, * t1, * t2;
hawk_val_type_t a2_vtype, t1_vtype;
hawk_oocs_t str; hawk_oocs_t str;
hawk_oocs_t fs; hawk_oocs_t fs;
hawk_ooch_t* fs_free = HAWK_NULL; hawk_ooch_t* fs_free = HAWK_NULL;
const hawk_ooch_t* p; hawk_ooch_t* p;
hawk_oow_t str_left, org_len; hawk_oow_t str_left, org_len;
hawk_tre_t* fs_rex = HAWK_NULL; hawk_tre_t* fs_rex = HAWK_NULL;
hawk_tre_t* fs_rex_free = HAWK_NULL; hawk_tre_t* fs_rex_free = HAWK_NULL;
hawk_oocs_t tok; hawk_oocs_t tok;
hawk_int_t nflds; hawk_int_t nflds;
int x; int x, byte_str, do_fld = 0;
str.ptr = HAWK_NULL; str.ptr = HAWK_NULL;
str.len = 0; str.len = 0;
@ -1007,65 +814,41 @@ static int fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi, int use_array)
a0 = hawk_rtx_getarg(rtx, 0); a0 = hawk_rtx_getarg(rtx, 0);
a2 = (nargs >= 3)? hawk_rtx_getarg (rtx, 2): HAWK_NULL; a2 = (nargs >= 3)? hawk_rtx_getarg (rtx, 2): HAWK_NULL;
str.ptr = hawk_rtx_getvaloocstr(rtx, a0, &str.len); str.ptr = HAWK_NULL;
if (HAWK_UNLIKELY(!str.ptr)) goto oops; str.len = 0;
if (!a2) /* field seperator */
t0 = a2? a2: hawk_rtx_getgbl(rtx, HAWK_GBL_FS); /* if a2 is not available, get the value from FS */
if (HAWK_RTX_GETVALTYPE(rtx, t0) == HAWK_VAL_NIL)
{ {
/* get the value from FS */ fs.ptr = HAWK_T(" ");
t1 = hawk_rtx_getgbl(rtx, HAWK_GBL_FS); fs.len = 1;
t1_vtype = HAWK_RTX_GETVALTYPE(rtx, t1);
if (t1_vtype == HAWK_VAL_NIL)
{
fs.ptr = HAWK_T(" ");
fs.len = 1;
}
else if (t1_vtype == HAWK_VAL_STR)
{
fs.ptr = ((hawk_val_str_t*)t1)->val.ptr;
fs.len = ((hawk_val_str_t*)t1)->val.len;
}
else
{
fs.ptr = hawk_rtx_valtooocstrdup(rtx, t1, &fs.len);
if (HAWK_UNLIKELY(!fs.ptr)) goto oops;
fs_free = (hawk_ooch_t*)fs.ptr;
}
if (fs.len > 1) fs_rex = rtx->gbl.fs[rtx->gbl.ignorecase];
} }
else else if (HAWK_RTX_GETVALTYPE(rtx, t0) == HAWK_VAL_REX)
{ {
a2_vtype = HAWK_RTX_GETVALTYPE (rtx, a2); /* regular expression */
fs_rex = ((hawk_val_rex_t*)t0)->code[rtx->gbl.ignorecase];
if (a2_vtype == HAWK_VAL_REX) /* make the tokenizing loop below to take fs_rex by setting fs_len greater than 1*/
fs.ptr = HAWK_NULL;
fs.len = 2;
}
else
{
fs.ptr = hawk_rtx_getvaloocstr(rtx, t0, &fs.len);
if (HAWK_UNLIKELY(!fs.ptr)) goto oops;
fs_free = fs.ptr;
if (fs.len == 5 && fs.ptr[0] == '?')
{ {
/* the third parameter is a regular expression */ do_fld = 1;
fs_rex = ((hawk_val_rex_t*)a2)->code[rtx->gbl.ignorecase];
/* make the loop below to take fs_rex by
* setting fs_len greater than 1*/
fs.ptr = HAWK_NULL;
fs.len = 2;
} }
else else if (fs.len > 1)
{ {
if (a2_vtype == HAWK_VAL_STR) if (a2)
{ {
fs.ptr = ((hawk_val_str_t*)a2)->val.ptr;
fs.len = ((hawk_val_str_t*)a2)->val.len;
}
else
{
fs.ptr = hawk_rtx_valtooocstrdup(rtx, a2, &fs.len);
if (fs.ptr == HAWK_NULL) goto oops;
fs_free = (hawk_ooch_t*)fs.ptr;
}
if (fs.len > 1)
{
int x;
x = rtx->gbl.ignorecase? x = rtx->gbl.ignorecase?
hawk_rtx_buildrex(rtx, fs.ptr, fs.len, HAWK_NULL, &fs_rex): hawk_rtx_buildrex(rtx, fs.ptr, fs.len, HAWK_NULL, &fs_rex):
hawk_rtx_buildrex(rtx, fs.ptr, fs.len, &fs_rex, HAWK_NULL); hawk_rtx_buildrex(rtx, fs.ptr, fs.len, &fs_rex, HAWK_NULL);
@ -1073,9 +856,28 @@ static int fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi, int use_array)
fs_rex_free = fs_rex; fs_rex_free = fs_rex;
} }
else
{
fs_rex = rtx->gbl.fs[rtx->gbl.ignorecase];
}
} }
} }
/* the first parameter - string to split */
if (HAWK_RTX_GETVALTYPE(rtx, a0) == HAWK_VAL_MBS)
{
byte_str = 1;
str.ptr = do_fld? hawk_rtx_valtobcstrdup(rtx, a0, &str.len):
hawk_rtx_getvalbcstr(rtx, a0, &str.len);
}
else
{
byte_str = 0;
str.ptr = do_fld? hawk_rtx_valtooocstrdup(rtx, a0, &str.len):
hawk_rtx_getvaloocstr(rtx, a0, &str.len);
}
if (HAWK_UNLIKELY(!str.ptr)) goto oops;
t1 = use_array? hawk_rtx_makearrval(rtx, 16): hawk_rtx_makemapval(rtx); t1 = use_array? hawk_rtx_makearrval(rtx, 16): hawk_rtx_makemapval(rtx);
if (HAWK_UNLIKELY(!t1)) goto oops; if (HAWK_UNLIKELY(!t1)) goto oops;
@ -1090,20 +892,23 @@ static int fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi, int use_array)
while (p) while (p)
{ {
hawk_ooch_t key_buf[HAWK_SIZEOF(hawk_int_t)*8+2]; if (fs_rex)
hawk_oow_t key_len;
if (fs.len <= 1)
{ {
p = hawk_rtx_tokoocharswithoochars(rtx, p, str.len, fs.ptr, fs.len, &tok); p = byte_str? hawk_rtx_tokbcharsbyrex(rtx, str.ptr, org_len, p, str.len, fs_rex, &tok):
hawk_rtx_tokoocharsbyrex(rtx, str.ptr, org_len, p, str.len, fs_rex, &tok);
if (p && hawk_rtx_geterrnum(rtx) != HAWK_ENOERR) goto oops;
}
else if (do_fld)
{
/* [NOTE] even if byte_str is true, the field separator is of the ooch type.
* there may be some data truncation and related issues */
p = byte_str? hawk_rtx_fldbchars(rtx, p, str.len, fs.ptr[1], fs.ptr[2], fs.ptr[3], fs.ptr[4], &tok):
hawk_rtx_fldoochars(rtx, p, str.len, fs.ptr[1], fs.ptr[2], fs.ptr[3], fs.ptr[4], &tok);
} }
else else
{ {
p = hawk_rtx_tokoocharsbyrex(rtx, str.ptr, org_len, p, str.len, fs_rex, &tok); p = byte_str? hawk_rtx_tokbcharswithbchars(rtx, p, str.len, fs.ptr, fs.len, &tok):
if (p == HAWK_NULL && hawk_rtx_geterrnum(rtx) != HAWK_ENOERR) hawk_rtx_tokoocharswithoochars(rtx, p, str.len, fs.ptr, fs.len, &tok);
{
goto oops;
}
} }
if (nflds == 0 && p == HAWK_NULL && tok.len == 0) if (nflds == 0 && p == HAWK_NULL && tok.len == 0)
@ -1118,7 +923,8 @@ static int fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi, int use_array)
* create a numeric value if the string is a number */ * create a numeric value if the string is a number */
/*t2 = hawk_rtx_makestrvalwithoocs (rtx, &tok);*/ /*t2 = hawk_rtx_makestrvalwithoocs (rtx, &tok);*/
/*t2 = hawk_rtx_makenstrvalwithoocs(rtx, &tok); */ /*t2 = hawk_rtx_makenstrvalwithoocs(rtx, &tok); */
t2 = hawk_rtx_makenumorstrvalwithoochars(rtx, tok.ptr, tok.len); t2 = byte_str? hawk_rtx_makenumormbsvalwithbchars(rtx, tok.ptr, tok.len):
hawk_rtx_makenumorstrvalwithoochars(rtx, tok.ptr, tok.len);
if (HAWK_UNLIKELY(!t2)) goto oops; if (HAWK_UNLIKELY(!t2)) goto oops;
if (use_array) if (use_array)
@ -1133,6 +939,9 @@ static int fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi, int use_array)
else else
{ {
/* put it into the map */ /* put it into the map */
hawk_ooch_t key_buf[HAWK_SIZEOF(hawk_int_t)*8+2];
hawk_oow_t key_len;
key_len = hawk_int_to_oocstr(++nflds, 10, HAWK_NULL, key_buf, HAWK_COUNTOF(key_buf)); key_len = hawk_int_to_oocstr(++nflds, 10, HAWK_NULL, key_buf, HAWK_COUNTOF(key_buf));
HAWK_ASSERT (key_len != (hawk_oow_t)-1); HAWK_ASSERT (key_len != (hawk_oow_t)-1);
@ -1144,13 +953,17 @@ static int fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi, int use_array)
} }
} }
str.len = str_left - (p - str.ptr); if (byte_str)
str.len = str_left - ((p - str.ptr) * HAWK_SIZEOF_OOCH_T);
else
str.len = str_left - (p - str.ptr);
} }
/*if (str_free) hawk_rtx_freemem (rtx, str_free);*/ if (do_fld) hawk_rtx_freemem (rtx, str.ptr);
hawk_rtx_freevaloocstr (rtx, a0, str.ptr); else if (byte_str) hawk_rtx_freevalbcstr (rtx, a0, str.ptr);
else hawk_rtx_freevaloocstr (rtx, a0, str.ptr);
if (fs_free) hawk_rtx_freemem (rtx, fs_free); if (fs_free) hawk_rtx_freevaloocstr (rtx, t0, fs_free);
if (fs_rex_free) if (fs_rex_free)
{ {
@ -1160,16 +973,19 @@ static int fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi, int use_array)
hawk_rtx_freerex (rtx, fs_rex_free, HAWK_NULL); hawk_rtx_freerex (rtx, fs_rex_free, HAWK_NULL);
} }
/*nflds--;*/ t1 = hawk_rtx_makeintval(rtx, nflds);
t1 = hawk_rtx_makeintval (rtx, nflds);
if (HAWK_UNLIKELY(!t1)) return -1; if (HAWK_UNLIKELY(!t1)) return -1;
hawk_rtx_setretval (rtx, t1); hawk_rtx_setretval (rtx, t1);
return 0; return 0;
oops: oops:
if (str.ptr) hawk_rtx_freevaloocstr (rtx, a0, str.ptr); if (str.ptr)
{
if (do_fld) hawk_rtx_freemem (rtx, str.ptr);
else if (byte_str) hawk_rtx_freevalbcstr (rtx, a0, str.ptr);
else hawk_rtx_freevaloocstr (rtx, a0, str.ptr);
}
if (fs_free) hawk_rtx_freemem (rtx, fs_free); if (fs_free) hawk_rtx_freemem (rtx, fs_free);
@ -1185,7 +1001,8 @@ oops:
int hawk_fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi) int hawk_fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi)
{ {
return fnc_split(rtx, fi, 1); /*return fnc_split(rtx, fi, 1);*/
return fnc_split(rtx, fi, 0);
} }
int hawk_fnc_tolower (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi) int hawk_fnc_tolower (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi)

View File

@ -24,6 +24,92 @@
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
/*
 * split_xchars_to_fields - extract one field from a character buffer.
 *
 * This is a template body: char_t, xcs_t, is_xch_space and the function name
 * itself are macros supplied by the including file, once per character type.
 *
 * The buffer is rewritten IN PLACE. Escape and quote characters are dropped
 * by copying the kept characters down towards the token start, so the
 * caller must pass writable memory.
 *
 *   str, len - buffer to scan and its length in characters
 *   fs       - field separator character
 *   ec       - escape character; the character after it is taken literally
 *   lq, rq   - left and right quote characters; the field separator and the
 *              left quote are taken literally between them
 *   tok      - receives the start and length of the extracted field.
 *              leading spaces and unquoted, unescaped trailing spaces are
 *              excluded.
 *
 * Returns the position to resume scanning from, or HAWK_NULL when the input
 * has been consumed. If the separator itself is a space character, a run of
 * consecutive separators is treated as one, and HAWK_NULL is returned if
 * that run reaches the end of the buffer.
 */
char_t* split_xchars_to_fields (hawk_rtx_t* rtx, char_t* str, hawk_oow_t len, char_t fs, char_t ec, char_t lq, char_t rq, xcs_t* tok)
{
	char_t* p = str;
	char_t* end = str + len;
	int escaped = 0, quoted = 0;
	char_t* ts; /* token start */
	char_t* tp; /* points to one char past the last token char */
	char_t* xp; /* points to one char past the last effective char */

	/* skip leading spaces */
	while (p < end && is_xch_space(*p)) p++;

	/* initialize token pointers */
	ts = tp = xp = p;

	while (p < end)
	{
		/* the character must be held in char_t. a plain 'char' truncates
		 * characters of a wider char_t, which breaks the comparisons
		 * against fs/ec/lq/rq and corrupts what is copied back via tp. */
		char_t c = *p;

		if (escaped)
		{
			/* the character following the escaper is always kept */
			*tp++ = c; xp = tp; p++;
			escaped = 0;
		}
		else
		{
			if (c == ec)
			{
				/* drop the escaper itself */
				escaped = 1;
				p++;
			}
			else if (quoted)
			{
				if (c == rq)
				{
					/* closing quote - dropped */
					quoted = 0;
					p++;
				}
				else
				{
					/* everything inside quotes is effective, spaces included */
					*tp++ = c; xp = tp; p++;
				}
			}
			else
			{
				if (c == fs)
				{
					/* end of the field */
					tok->ptr = ts;
					tok->len = xp - ts;
					p++;

					if (is_xch_space(fs))
					{
						/* collapse a run of space separators */
						while (p < end && *p == fs) p++;
						if (p >= end) return HAWK_NULL;
					}

					return p;
				}

				if (c == lq)
				{
					/* opening quote - dropped */
					quoted = 1;
					p++;
				}
				else
				{
					/* an unquoted space is copied but does not extend
					 * the effective end, so trailing spaces are trimmed */
					*tp++ = c; p++;
					if (!is_xch_space(c)) xp = tp;
				}
			}
		}
	}

	if (escaped)
	{
		/* if it is still escaped, the last character must be
		 * the escaper itself. treat it as a normal character */
		*xp++ = ec;
	}

	tok->ptr = ts;
	tok->len = xp - ts;
	return HAWK_NULL;
}
char_t* tokenize_xchars (hawk_rtx_t* rtx, const char_t* s, hawk_oow_t len, const char_t* delim, hawk_oow_t delim_len, xcs_t* tok) char_t* tokenize_xchars (hawk_rtx_t* rtx, const char_t* s, hawk_oow_t len, const char_t* delim, hawk_oow_t delim_len, xcs_t* tok)
{ {
const char_t* p = s, *d; const char_t* p = s, *d;
@ -214,88 +300,102 @@ exit_loop:
return (char_t*)++p; return (char_t*)++p;
} }
char_t* split_xchars_to_fields (hawk_rtx_t* rtx, char_t* str, hawk_oow_t len, char_t fs, char_t ec, char_t lq, char_t rq, xcs_t* tok)
char_t* tokenize_xchars_by_rex (hawk_rtx_t* rtx, const char_t* str, hawk_oow_t len, const char_t* substr, hawk_oow_t sublen, hawk_tre_t* rex, xcs_t* tok)
{ {
char_t* p = str; int n;
char_t* end = str + len; hawk_oow_t i;
int escaped = 0, quoted = 0; xcs_t match, s, cursub, realsub;
char_t* ts; /* token start */
char_t* tp; /* points to one char past the last token char */
char_t* xp; /* points to one char past the last effective char */
/* skip leading spaces */ s.ptr = (char_t*)str;
while (p < end && is_xch_space(*p)) p++; s.len = len;
/* initialize token pointers */ cursub.ptr = (char_t*)substr;
ts = tp = xp = p; cursub.len = sublen;
while (p < end) realsub.ptr = (char_t*)substr;
realsub.len = sublen;
while (cursub.len > 0)
{ {
char c = *p; n = match_rex_with_xcs(rtx, rex, &s, &cursub, &match, HAWK_NULL);
if (n <= -1) return HAWK_NULL;
if (escaped) if (n == 0)
{ {
*tp++ = c; xp = tp; p++; /* no match has been found. return the entire string as a token */
escaped = 0; hawk_rtx_seterrnum (rtx, HAWK_NULL, HAWK_ENOERR); /* reset HAWK_EREXNOMAT to no error */
tok->ptr = realsub.ptr;
tok->len = realsub.len;
return HAWK_NULL;
} }
else
HAWK_ASSERT (n == 1);
if (match.len == 0)
{ {
if (c == ec) /* the match length is zero. */
cursub.ptr++;
cursub.len--;
}
else if (HAWK_RTX_IS_STRIPRECSPC_ON(rtx))
{
/* match at the beginning of the input string */
if (match.ptr == substr)
{ {
escaped = 1; for (i = 0; i < match.len; i++)
p++;
}
else if (quoted)
{
if (c == rq)
{ {
quoted = 0; if (!is_xch_space(match.ptr[i])) goto exit_loop;
p++;
} }
else
{
*tp++ = c; xp = tp; p++;
}
}
else
{
if (c == fs)
{
tok->ptr = ts;
tok->len = xp - ts;
p++;
if (is_xch_space(fs)) /* the match that is all spaces at the
{ * beginning of the input string is skipped */
while (p < end && *p == fs) p++; cursub.ptr += match.len;
if (p >= end) return HAWK_NULL; cursub.len -= match.len;
}
return p; /* adjust the substring by skipping the leading
} * spaces and retry matching */
realsub.ptr = (char_t*)substr + match.len;
if (c == lq) realsub.len -= match.len;
{
quoted = 1;
p++;
}
else
{
*tp++ = c; p++;
if (!is_xch_space(c)) xp = tp;
}
} }
else break;
}
else break;
}
exit_loop:
hawk_rtx_seterrnum (rtx, HAWK_NULL, HAWK_ENOERR);
if (cursub.len <= 0)
{
tok->ptr = realsub.ptr;
tok->len = realsub.len;
return HAWK_NULL;
}
tok->ptr = realsub.ptr;
tok->len = match.ptr - realsub.ptr;
for (i = 0; i < match.len; i++)
{
if (!is_xch_space(match.ptr[i]))
{
/* the match contains a non-space character. */
return (char_t*)match.ptr+match.len;
} }
} }
if (escaped) /* the match is all spaces */
if (HAWK_RTX_IS_STRIPRECSPC_ON(rtx))
{ {
/* if it is still escaped, the last character must be /* if the match reached the last character in the input string,
* the escaper itself. treat it as a normal character */ * it returns HAWK_NULL to terminate tokenization. */
*xp++ = ec; return (match.ptr+match.len >= substr+sublen)? HAWK_NULL: ((char_t*)match.ptr+match.len);
}
else
{
/* if the match went beyond the the last character in the input
* string, it returns HAWK_NULL to terminate tokenization. */
return (match.ptr+match.len > substr+sublen)? HAWK_NULL: ((char_t*)match.ptr+match.len);
} }
tok->ptr = ts;
tok->len = xp - ts;
return HAWK_NULL;
} }

View File

@ -64,24 +64,37 @@ hawk_bch_t* hawk_rtx_tokbcharswithbchars (
const hawk_bch_t* delim, hawk_oow_t delim_len, hawk_bcs_t* tok); const hawk_bch_t* delim, hawk_oow_t delim_len, hawk_bcs_t* tok);
/*
 * Extracts the next token from a hawk_uch_t string, using a compiled regular
 * expression as the separator.
 *
 *   str, len       - the whole string handed to the matcher
 *   substr, sublen - the part still to be tokenized
 *   rex            - the compiled separator expression
 *   tok            - receives the token start and length
 *
 * Returns the position to continue from, or HAWK_NULL when tokenization is
 * finished or matching failed.
 * NOTE(review): callers presumably tell the two null cases apart with
 * hawk_rtx_geterrnum() - confirm against the implementation.
 */
hawk_uch_t* hawk_rtx_tokucharsbyrex (
hawk_rtx_t* rtx,
const hawk_uch_t* str,
hawk_oow_t len,
const hawk_uch_t* substr,
hawk_oow_t sublen,
hawk_tre_t* rex,
hawk_ucs_t* tok
);
/*
 * Extracts the next token from a hawk_bch_t (byte) string, using a compiled
 * regular expression as the separator.
 *
 *   str, len       - the whole string handed to the matcher
 *   substr, sublen - the part still to be tokenized
 *   rex            - the compiled separator expression
 *   tok            - receives the token start and length
 *
 * Returns the position to continue from, or HAWK_NULL when tokenization is
 * finished or matching failed.
 * NOTE(review): callers presumably tell the two null cases apart with
 * hawk_rtx_geterrnum() - confirm against the implementation.
 */
hawk_bch_t* hawk_rtx_tokbcharsbyrex (
hawk_rtx_t* rtx,
const hawk_bch_t* str,
hawk_oow_t len,
const hawk_bch_t* substr,
hawk_oow_t sublen,
hawk_tre_t* rex,
hawk_bcs_t* tok
);
#if defined(HAWK_OOCH_IS_UCH) #if defined(HAWK_OOCH_IS_UCH)
# define hawk_rtx_fldoochars hawk_rtx_flduchars # define hawk_rtx_fldoochars hawk_rtx_flduchars
# define hawk_rtx_tokoocharswithoochars hawk_rtx_tokucharswithuchars # define hawk_rtx_tokoocharswithoochars hawk_rtx_tokucharswithuchars
# define hawk_rtx_tokoocharsbyrex hawk_rtx_tokucharsbyrex
#else #else
# define hawk_rtx_fldoochars hawk_rtx_fldbchars # define hawk_rtx_fldoochars hawk_rtx_fldbchars
# define hawk_rtx_tokoocharswithoochars hawk_rtx_tokbcharswithbchars # define hawk_rtx_tokoocharswithoochars hawk_rtx_tokbcharswithbchars
# define hawk_rtx_tokoocharsbyrex hawk_rtx_tokbcharsbyrex
#endif #endif
/*
 * Extracts the next token from a hawk_ooch_t string, using a compiled regular
 * expression as the separator.
 *
 *   str, len       - the whole string handed to the matcher
 *   substr, sublen - the part still to be tokenized
 *   rex            - the compiled separator expression
 *   tok            - receives the token start and length
 *
 * Returns the position to continue from, or HAWK_NULL when tokenization is
 * finished or matching failed.
 * NOTE(review): callers presumably tell the two null cases apart with
 * hawk_rtx_geterrnum() - confirm against the implementation.
 */
hawk_ooch_t* hawk_rtx_tokoocharsbyrex (
hawk_rtx_t* rtx,
const hawk_ooch_t* str,
hawk_oow_t len,
const hawk_ooch_t* substr,
hawk_oow_t sublen,
hawk_tre_t* rex,
hawk_oocs_t* tok
);
int hawk_rtx_matchvalwithucs ( int hawk_rtx_matchvalwithucs (
hawk_rtx_t* rtx, hawk_val_t* val, hawk_rtx_t* rtx, hawk_val_t* val,

View File

@ -30,220 +30,41 @@
#undef char_t #undef char_t
#undef xcs_t #undef xcs_t
#undef is_xch_space #undef is_xch_space
#undef tokenize_xchars #undef match_rex_with_xcs
#undef split_xchars_to_fields #undef split_xchars_to_fields
#undef tokenize_xchars
#undef tokenize_xchars_by_rex
#define char_t hawk_bch_t #define char_t hawk_bch_t
#define xcs_t hawk_bcs_t #define xcs_t hawk_bcs_t
#define is_xch_space hawk_is_bch_space #define is_xch_space hawk_is_bch_space
#define tokenize_xchars hawk_rtx_tokbcharswithbchars #define match_rex_with_xcs hawk_rtx_matchrexwithbcs
#define split_xchars_to_fields hawk_rtx_fldbchars #define split_xchars_to_fields hawk_rtx_fldbchars
#define tokenize_xchars hawk_rtx_tokbcharswithbchars
#define tokenize_xchars_by_rex hawk_rtx_tokbcharsbyrex
#include "misc-imp.h" #include "misc-imp.h"
#undef char_t #undef char_t
#undef xcs_t #undef xcs_t
#undef is_xch_space #undef is_xch_space
#undef tokenize_xchars #undef match_rex_with_xcs
#undef split_xchars_to_fields #undef split_xchars_to_fields
#undef tokenize_xchars
#undef tokenize_xchars_by_rex
#define char_t hawk_uch_t #define char_t hawk_uch_t
#define xcs_t hawk_ucs_t #define xcs_t hawk_ucs_t
#define is_xch_space hawk_is_uch_space #define is_xch_space hawk_is_uch_space
#define tokenize_xchars hawk_rtx_tokucharswithuchars #define match_rex_with_xcs hawk_rtx_matchrexwithucs
#define split_xchars_to_fields hawk_rtx_flduchars #define split_xchars_to_fields hawk_rtx_flduchars
#define tokenize_xchars hawk_rtx_tokucharswithuchars
#define tokenize_xchars_by_rex hawk_rtx_tokucharsbyrex
#include "misc-imp.h" #include "misc-imp.h"
/*
 * hawk_rtx_tokoocharsbyrex - extract the next token from 'substr', using the
 * regular expression 'rex' as the separator.
 *
 *   str, len       - the whole string; passed to the matcher together with
 *                    the part still being scanned
 *   substr, sublen - the part to tokenize in this call
 *   rex            - compiled separator expression
 *   tok            - receives the token start and length
 *
 * Returns the position just past the separator match to continue from, or
 * HAWK_NULL when there is nothing more to tokenize. HAWK_NULL is also
 * returned when the matcher fails; on that path the error number is not
 * reset and 'tok' is left untouched. All other return paths reset the error
 * number to HAWK_ENOERR.
 */
hawk_ooch_t* hawk_rtx_tokoocharsbyrex (
hawk_rtx_t* rtx,
const hawk_ooch_t* str, hawk_oow_t len,
const hawk_ooch_t* substr, hawk_oow_t sublen,
hawk_tre_t* rex, hawk_oocs_t* tok)
{
int n;
hawk_oow_t i;
/* s       - the whole string
 * cursub  - the part the matcher scans next
 * realsub - the part the token is taken from; it differs from substr only
 *           after leading spaces have been stripped
 * match   - set by the matcher; valid only after a call that returned 1 */
hawk_oocs_t match, s, cursub, realsub;
s.ptr = (hawk_ooch_t*)str;
s.len = len;
cursub.ptr = (hawk_ooch_t*)substr;
cursub.len = sublen;
realsub.ptr = (hawk_ooch_t*)substr;
realsub.len = sublen;
while (cursub.len > 0)
{
n = hawk_rtx_matchrexwithoocs(rtx, rex, &s, &cursub, &match, HAWK_NULL);
if (n <= -1) return HAWK_NULL;
if (n == 0)
{
/* no match has been found. return the entire string as a token */
hawk_rtx_seterrnum (rtx, HAWK_NULL, HAWK_ENOERR); /* reset HAWK_EREXNOMAT to no error */
tok->ptr = realsub.ptr;
tok->len = realsub.len;
return HAWK_NULL;
}
HAWK_ASSERT (n == 1);
if (match.len == 0)
{
/* the match length is zero. advance by one character and retry */
cursub.ptr++;
cursub.len--;
}
else if (HAWK_RTX_IS_STRIPRECSPC_ON(rtx))
{
/* match at the beginning of the input string */
if (match.ptr == substr)
{
for (i = 0; i < match.len; i++)
{
/* a non-space character makes it a real separator */
if (!hawk_is_ooch_space(match.ptr[i])) goto exit_loop;
}
/* the match that is all spaces at the
 * beginning of the input string is skipped */
cursub.ptr += match.len;
cursub.len -= match.len;
/* adjust the substring by skipping the leading
 * spaces and retry matching */
realsub.ptr = (hawk_ooch_t*)substr + match.len;
realsub.len -= match.len;
}
else break;
}
else break;
}
exit_loop:
hawk_rtx_seterrnum (rtx, HAWK_NULL, HAWK_ENOERR);
if (cursub.len <= 0)
{
/* the scan ran out of input without a usable separator match.
 * what remains is the last token */
tok->ptr = realsub.ptr;
tok->len = realsub.len;
return HAWK_NULL;
}
/* the token spans from the start of realsub up to the separator match */
tok->ptr = realsub.ptr;
tok->len = match.ptr - realsub.ptr;
for (i = 0; i < match.len; i++)
{
if (!hawk_is_ooch_space(match.ptr[i]))
{
/* the match contains a non-space character. */
return (hawk_ooch_t*)match.ptr+match.len;
}
}
/* the match is all spaces */
if (HAWK_RTX_IS_STRIPRECSPC_ON(rtx))
{
/* if the match reached the last character in the input string,
 * it returns HAWK_NULL to terminate tokenization. */
return (match.ptr+match.len >= substr+sublen)? HAWK_NULL: ((hawk_ooch_t*)match.ptr+match.len);
}
else
{
/* if the match went beyond the last character in the input
 * string, it returns HAWK_NULL to terminate tokenization. */
return (match.ptr+match.len > substr+sublen)? HAWK_NULL: ((hawk_ooch_t*)match.ptr+match.len);
}
}
#if 0
/*
 * hawk_rtx_strxnfld - extract one field from a hawk_ooch_t buffer.
 *
 * The buffer is rewritten IN PLACE. Escape and quote characters are dropped
 * by copying the kept characters down towards the token start, so the
 * caller must pass writable memory.
 *
 *   str, len - buffer to scan and its length in characters
 *   fs       - field separator character
 *   ec       - escape character; the character after it is taken literally
 *   lq, rq   - left and right quote characters; the field separator and the
 *              left quote are taken literally between them
 *   tok      - receives the start and length of the extracted field.
 *              leading spaces and unquoted, unescaped trailing spaces are
 *              excluded.
 *
 * Returns the position to resume scanning from, or HAWK_NULL when the input
 * has been consumed. If the separator itself is a space character, a run of
 * consecutive separators is treated as one, and HAWK_NULL is returned if
 * that run reaches the end of the buffer.
 */
hawk_ooch_t* hawk_rtx_strxnfld (
	hawk_rtx_t* rtx, hawk_ooch_t* str, hawk_oow_t len,
	hawk_ooch_t fs, hawk_ooch_t ec, hawk_ooch_t lq, hawk_ooch_t rq,
	hawk_oocs_t* tok)
{
	hawk_ooch_t* p = str;
	hawk_ooch_t* end = str + len;
	int escaped = 0, quoted = 0;
	hawk_ooch_t* ts; /* token start */
	hawk_ooch_t* tp; /* points to one char past the last token char */
	hawk_ooch_t* xp; /* points to one char past the last effective char */

	/* skip leading spaces */
	while (p < end && hawk_is_ooch_space(*p)) p++;

	/* initialize token pointers */
	ts = tp = xp = p;

	while (p < end)
	{
		/* the character must be held in hawk_ooch_t. a plain 'char'
		 * truncates wide characters, which breaks the comparisons against
		 * fs/ec/lq/rq and corrupts what is copied back via tp. */
		hawk_ooch_t c = *p;

		if (escaped)
		{
			/* the character following the escaper is always kept */
			*tp++ = c; xp = tp; p++;
			escaped = 0;
		}
		else
		{
			if (c == ec)
			{
				/* drop the escaper itself */
				escaped = 1;
				p++;
			}
			else if (quoted)
			{
				if (c == rq)
				{
					/* closing quote - dropped */
					quoted = 0;
					p++;
				}
				else
				{
					/* everything inside quotes is effective, spaces included */
					*tp++ = c; xp = tp; p++;
				}
			}
			else
			{
				if (c == fs)
				{
					/* end of the field */
					tok->ptr = ts;
					tok->len = xp - ts;
					p++;

					if (hawk_is_ooch_space(fs))
					{
						/* collapse a run of space separators */
						while (p < end && *p == fs) p++;
						if (p >= end) return HAWK_NULL;
					}

					return p;
				}

				if (c == lq)
				{
					/* opening quote - dropped */
					quoted = 1;
					p++;
				}
				else
				{
					/* an unquoted space is copied but does not extend
					 * the effective end, so trailing spaces are trimmed */
					*tp++ = c; p++;
					if (!hawk_is_ooch_space(c)) xp = tp;
				}
			}
		}
	}

	if (escaped)
	{
		/* if it is still escaped, the last character must be
		 * the escaper itself. treat it as a normal character */
		*xp++ = ec;
	}

	tok->ptr = ts;
	tok->len = xp - ts;
	return HAWK_NULL;
}
#endif
static int matchtre_ucs (hawk_tre_t* tre, int opt, const hawk_ucs_t* str, hawk_ucs_t* mat, hawk_ucs_t submat[9], hawk_gem_t* errgem) static int matchtre_ucs (hawk_tre_t* tre, int opt, const hawk_ucs_t* str, hawk_ucs_t* mat, hawk_ucs_t submat[9], hawk_gem_t* errgem)
{ {

View File

@ -2579,8 +2579,8 @@ static hawk_nde_t* parse_while (hawk_t* hawk, const hawk_loc_t* xloc)
if (get_token(hawk) <= -1) goto oops; if (get_token(hawk) <= -1) goto oops;
ploc = hawk->tok.loc; ploc = hawk->tok.loc;
test = parse_expr_withdc (hawk, &ploc); test = parse_expr_withdc(hawk, &ploc);
if (test == HAWK_NULL) goto oops; if (HAWK_UNLIKELY(!test)) goto oops;
if (!MATCH(hawk,TOK_RPAREN)) if (!MATCH(hawk,TOK_RPAREN))
{ {
@ -2591,11 +2591,11 @@ static hawk_nde_t* parse_while (hawk_t* hawk, const hawk_loc_t* xloc)
if (get_token(hawk) <= -1) goto oops; if (get_token(hawk) <= -1) goto oops;
ploc = hawk->tok.loc; ploc = hawk->tok.loc;
body = parse_statement (hawk, &ploc); body = parse_statement(hawk, &ploc);
if (body == HAWK_NULL) goto oops; if (HAWK_UNLIKELY(!body)) goto oops;
nde = (hawk_nde_while_t*) hawk_callocmem (hawk, HAWK_SIZEOF(*nde)); nde = (hawk_nde_while_t*)hawk_callocmem(hawk, HAWK_SIZEOF(*nde));
if (nde == HAWK_NULL) if (HAWK_UNLIKELY(!nde))
{ {
ADJERR_LOC (hawk, xloc); ADJERR_LOC (hawk, xloc);
goto oops; goto oops;
@ -2628,7 +2628,7 @@ static hawk_nde_t* parse_for (hawk_t* hawk, const hawk_loc_t* xloc)
return HAWK_NULL; return HAWK_NULL;
} }
if (get_token(hawk) <= -1) return HAWK_NULL; if (get_token(hawk) <= -1) return HAWK_NULL;
if (!MATCH(hawk,TOK_SEMICOLON)) if (!MATCH(hawk,TOK_SEMICOLON))
{ {
/* this line is very ugly. it checks the entire next /* this line is very ugly. it checks the entire next
@ -2694,8 +2694,8 @@ static hawk_nde_t* parse_for (hawk_t* hawk, const hawk_loc_t* xloc)
if (!MATCH(hawk,TOK_SEMICOLON)) if (!MATCH(hawk,TOK_SEMICOLON))
{ {
ploc = hawk->tok.loc; ploc = hawk->tok.loc;
test = parse_expr_withdc (hawk, &ploc); test = parse_expr_withdc(hawk, &ploc);
if (test == HAWK_NULL) goto oops; if (HAWK_UNLIKELY(!test)) goto oops;
if (!MATCH(hawk,TOK_SEMICOLON)) if (!MATCH(hawk,TOK_SEMICOLON))
{ {
@ -2717,8 +2717,8 @@ static hawk_nde_t* parse_for (hawk_t* hawk, const hawk_loc_t* xloc)
hawk_loc_t eloc; hawk_loc_t eloc;
eloc = hawk->tok.loc; eloc = hawk->tok.loc;
incr = parse_expr_withdc (hawk, &eloc); incr = parse_expr_withdc(hawk, &eloc);
if (incr == HAWK_NULL) goto oops; if (HAWK_UNLIKELY(!incr)) goto oops;
} }
if (!MATCH(hawk,TOK_RPAREN)) if (!MATCH(hawk,TOK_RPAREN))
@ -2734,8 +2734,8 @@ static hawk_nde_t* parse_for (hawk_t* hawk, const hawk_loc_t* xloc)
body = parse_statement (hawk, &ploc); body = parse_statement (hawk, &ploc);
if (body == HAWK_NULL) goto oops; if (body == HAWK_NULL) goto oops;
nde_for = (hawk_nde_for_t*) hawk_callocmem (hawk, HAWK_SIZEOF(*nde_for)); nde_for = (hawk_nde_for_t*)hawk_callocmem(hawk, HAWK_SIZEOF(*nde_for));
if (nde_for == HAWK_NULL) if (HAWK_UNLIKELY(!nde_for))
{ {
ADJERR_LOC (hawk, xloc); ADJERR_LOC (hawk, xloc);
goto oops; goto oops;
@ -2768,8 +2768,8 @@ static hawk_nde_t* parse_dowhile (hawk_t* hawk, const hawk_loc_t* xloc)
HAWK_ASSERT (hawk->ptok.type == TOK_DO); HAWK_ASSERT (hawk->ptok.type == TOK_DO);
ploc = hawk->tok.loc; ploc = hawk->tok.loc;
body = parse_statement (hawk, &ploc); body = parse_statement(hawk, &ploc);
if (body == HAWK_NULL) goto oops; if (HAWK_UNLIKELY(!body)) goto oops;
while (MATCH(hawk,TOK_NEWLINE)) while (MATCH(hawk,TOK_NEWLINE))
{ {
@ -2794,7 +2794,7 @@ static hawk_nde_t* parse_dowhile (hawk_t* hawk, const hawk_loc_t* xloc)
ploc = hawk->tok.loc; ploc = hawk->tok.loc;
test = parse_expr_withdc (hawk, &ploc); test = parse_expr_withdc (hawk, &ploc);
if (test == HAWK_NULL) goto oops; if (HAWK_UNLIKELY(!test)) goto oops;
if (!MATCH(hawk,TOK_RPAREN)) if (!MATCH(hawk,TOK_RPAREN))
{ {
@ -2803,9 +2803,9 @@ static hawk_nde_t* parse_dowhile (hawk_t* hawk, const hawk_loc_t* xloc)
} }
if (get_token(hawk) <= -1) goto oops; if (get_token(hawk) <= -1) goto oops;
nde = (hawk_nde_while_t*) hawk_callocmem (hawk, HAWK_SIZEOF(*nde)); nde = (hawk_nde_while_t*)hawk_callocmem(hawk, HAWK_SIZEOF(*nde));
if (nde == HAWK_NULL) if (HAWK_UNLIKELY(!nde))
{ {
ADJERR_LOC (hawk, xloc); ADJERR_LOC (hawk, xloc);
goto oops; goto oops;
@ -2836,8 +2836,8 @@ static hawk_nde_t* parse_break (hawk_t* hawk, const hawk_loc_t* xloc)
return HAWK_NULL; return HAWK_NULL;
} }
nde = (hawk_nde_break_t*) hawk_callocmem (hawk, HAWK_SIZEOF(*nde)); nde = (hawk_nde_break_t*)hawk_callocmem(hawk, HAWK_SIZEOF(*nde));
if (nde == HAWK_NULL) if (HAWK_UNLIKELY(!nde))
{ {
ADJERR_LOC (hawk, xloc); ADJERR_LOC (hawk, xloc);
return HAWK_NULL; return HAWK_NULL;
@ -2845,7 +2845,7 @@ static hawk_nde_t* parse_break (hawk_t* hawk, const hawk_loc_t* xloc)
nde->type = HAWK_NDE_BREAK; nde->type = HAWK_NDE_BREAK;
nde->loc = *xloc; nde->loc = *xloc;
return (hawk_nde_t*)nde; return (hawk_nde_t*)nde;
} }
@ -2860,8 +2860,8 @@ static hawk_nde_t* parse_continue (hawk_t* hawk, const hawk_loc_t* xloc)
return HAWK_NULL; return HAWK_NULL;
} }
nde = (hawk_nde_continue_t*) hawk_callocmem (hawk, HAWK_SIZEOF(*nde)); nde = (hawk_nde_continue_t*)hawk_callocmem(hawk, HAWK_SIZEOF(*nde));
if (nde == HAWK_NULL) if (HAWK_UNLIKELY(!nde))
{ {
ADJERR_LOC (hawk, xloc); ADJERR_LOC (hawk, xloc);
return HAWK_NULL; return HAWK_NULL;
@ -2880,8 +2880,8 @@ static hawk_nde_t* parse_return (hawk_t* hawk, const hawk_loc_t* xloc)
HAWK_ASSERT (hawk->ptok.type == TOK_RETURN); HAWK_ASSERT (hawk->ptok.type == TOK_RETURN);
nde = (hawk_nde_return_t*) hawk_callocmem ( hawk, HAWK_SIZEOF(*nde)); nde = (hawk_nde_return_t*)hawk_callocmem(hawk, HAWK_SIZEOF(*nde));
if (nde == HAWK_NULL) if (HAWK_UNLIKELY(!nde))
{ {
ADJERR_LOC (hawk, xloc); ADJERR_LOC (hawk, xloc);
return HAWK_NULL; return HAWK_NULL;
@ -2900,8 +2900,8 @@ static hawk_nde_t* parse_return (hawk_t* hawk, const hawk_loc_t* xloc)
hawk_loc_t eloc; hawk_loc_t eloc;
eloc = hawk->tok.loc; eloc = hawk->tok.loc;
val = parse_expr_withdc (hawk, &eloc); val = parse_expr_withdc(hawk, &eloc);
if (val == HAWK_NULL) if (HAWK_UNLIKELY(!val))
{ {
hawk_freemem (hawk, nde); hawk_freemem (hawk, nde);
return HAWK_NULL; return HAWK_NULL;

View File

@ -379,12 +379,12 @@ static int set_global (hawk_rtx_t* rtx, int idx, hawk_nde_var_t* var, hawk_val_t
HAWK_ASSERT (vtype != HAWK_VAL_REX); HAWK_ASSERT (vtype != HAWK_VAL_REX);
out.type = HAWK_RTX_VALTOSTR_CPLDUP; out.type = HAWK_RTX_VALTOSTR_CPLDUP;
if (hawk_rtx_valtostr (rtx, val, &out) <= -1) return -1; if (hawk_rtx_valtostr(rtx, val, &out) <= -1) return -1;
fs_ptr = out.u.cpldup.ptr; fs_ptr = out.u.cpldup.ptr;
fs_len = out.u.cpldup.len; fs_len = out.u.cpldup.len;
} }
if (fs_len > 1 && !(fs_len == 5 && fs_ptr[0] == HAWK_T('?'))) if (fs_len > 1 && !(fs_len == 5 && fs_ptr[0] == '?'))
{ {
/* it's a regular expression if FS contains multiple characters. /* it's a regular expression if FS contains multiple characters.
* however, it's not a regular expression if it's 5 character * however, it's not a regular expression if it's 5 character

View File

@ -1869,14 +1869,14 @@ tre_ast_to_tnfa(hawk_gem_t* gem, tre_ast_node_t *node, tre_tnfa_transition_t *tr
} }
#define ERROR_EXIT(err) \ #define ERROR_EXIT(err) \
do \ do \
{ \ { \
errcode = err; \ errcode = err; \
if (/*CONSTCOND*/1) \ if (/*CONSTCOND*/1) \
goto error_exit; \ goto error_exit; \
} \ } \
while (/*CONSTCOND*/0) while (/*CONSTCOND*/0)
int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags) int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
@ -1901,11 +1901,10 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
/* HAWK: deleted limit on the stack size /* HAWK: deleted limit on the stack size
stack = tre_stack_new(preg->gem, 512, 10240, 128); */ stack = tre_stack_new(preg->gem, 512, 10240, 128); */
stack = tre_stack_new(preg->gem, 512, -1, 128); stack = tre_stack_new(preg->gem, 512, -1, 128);
if (!stack) if (HAWK_UNLIKELY(!stack)) return REG_ESPACE;
return REG_ESPACE;
/* Allocate a fast memory allocator. */ /* Allocate a fast memory allocator. */
mem = tre_mem_new(preg->gem); mem = tre_mem_new(preg->gem);
if (!mem) if (HAWK_UNLIKELY(!mem))
{ {
tre_stack_destroy(stack); tre_stack_destroy(stack);
return REG_ESPACE; return REG_ESPACE;
@ -1921,8 +1920,7 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
parse_ctx.max_backref = -1; parse_ctx.max_backref = -1;
DPRINT(("tre_compile: parsing '%.*" STRF "'\n", (int)n, regex)); DPRINT(("tre_compile: parsing '%.*" STRF "'\n", (int)n, regex));
errcode = tre_parse(&parse_ctx); errcode = tre_parse(&parse_ctx);
if (errcode != REG_OK) if (errcode != REG_OK) ERROR_EXIT(errcode);
ERROR_EXIT(errcode);
preg->re_nsub = parse_ctx.submatch_id - 1; preg->re_nsub = parse_ctx.submatch_id - 1;
tree = parse_ctx.result; tree = parse_ctx.result;
@ -1941,8 +1939,8 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
/* Allocate the TNFA struct. */ /* Allocate the TNFA struct. */
tnfa = xcalloc(preg->gem, 1, sizeof(tre_tnfa_t)); tnfa = xcalloc(preg->gem, 1, sizeof(tre_tnfa_t));
if (tnfa == NULL) if (HAWK_UNLIKELY(!tnfa)) ERROR_EXIT(REG_ESPACE);
ERROR_EXIT(REG_ESPACE);
tnfa->have_backrefs = parse_ctx.max_backref >= 0; tnfa->have_backrefs = parse_ctx.max_backref >= 0;
tnfa->have_approx = parse_ctx.have_approx; tnfa->have_approx = parse_ctx.have_approx;
tnfa->num_submatches = parse_ctx.submatch_id; tnfa->num_submatches = parse_ctx.submatch_id;
@ -1966,26 +1964,21 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
{ {
tag_directions = xmalloc(preg->gem,sizeof(*tag_directions) tag_directions = xmalloc(preg->gem,sizeof(*tag_directions)
* (tnfa->num_tags + 1)); * (tnfa->num_tags + 1));
if (tag_directions == NULL) if (tag_directions == NULL) ERROR_EXIT(REG_ESPACE);
ERROR_EXIT(REG_ESPACE);
tnfa->tag_directions = tag_directions; tnfa->tag_directions = tag_directions;
HAWK_MEMSET(tag_directions, -1, HAWK_MEMSET(tag_directions, -1, sizeof(*tag_directions) * (tnfa->num_tags + 1));
sizeof(*tag_directions) * (tnfa->num_tags + 1));
} }
tnfa->minimal_tags = xcalloc(preg->gem, (unsigned)tnfa->num_tags * 2 + 1, tnfa->minimal_tags = xcalloc(preg->gem, (unsigned)tnfa->num_tags * 2 + 1,
sizeof(tnfa->minimal_tags)); sizeof(tnfa->minimal_tags));
if (tnfa->minimal_tags == NULL) if (tnfa->minimal_tags == NULL)
ERROR_EXIT(REG_ESPACE); ERROR_EXIT(REG_ESPACE);
submatch_data = xcalloc(preg->gem,(unsigned)parse_ctx.submatch_id, submatch_data = xcalloc(preg->gem,(unsigned)parse_ctx.submatch_id, sizeof(*submatch_data));
sizeof(*submatch_data)); if (HAWK_UNLIKELY(!submatch_data)) ERROR_EXIT(REG_ESPACE);
if (submatch_data == NULL)
ERROR_EXIT(REG_ESPACE);
tnfa->submatch_data = submatch_data; tnfa->submatch_data = submatch_data;
errcode = tre_add_tags(mem, stack, tree, tnfa, 0); errcode = tre_add_tags(mem, stack, tree, tnfa, 0);
if (errcode != REG_OK) if (errcode != REG_OK) ERROR_EXIT(errcode);
ERROR_EXIT(errcode);
#ifdef TRE_DEBUG #ifdef TRE_DEBUG
for (i = 0; i < parse_ctx.submatch_id; i++) for (i = 0; i < parse_ctx.submatch_id; i++)
@ -1999,10 +1992,8 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
} }
/* Expand iteration nodes. */ /* Expand iteration nodes. */
errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position, errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position, tag_directions, &tnfa->params_depth);
tag_directions, &tnfa->params_depth); if (errcode != REG_OK) ERROR_EXIT(errcode);
if (errcode != REG_OK)
ERROR_EXIT(errcode);
/* Add a dummy node for the final state. /* Add a dummy node for the final state.
XXX - For certain patterns this dummy node can be optimized away, XXX - For certain patterns this dummy node can be optimized away,
@ -2010,12 +2001,10 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
this possibility. */ this possibility. */
tmp_ast_l = tree; tmp_ast_l = tree;
tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++); tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
if (tmp_ast_r == NULL) if (HAWK_UNLIKELY(!tmp_ast_r)) ERROR_EXIT(REG_ESPACE);
ERROR_EXIT(REG_ESPACE);
tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r); tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
if (tree == NULL) if (HAWK_UNLIKELY(!tree)) ERROR_EXIT(REG_ESPACE);
ERROR_EXIT(REG_ESPACE);
#ifdef TRE_DEBUG #ifdef TRE_DEBUG
tre_ast_print(tree); tre_ast_print(tree);
@ -2023,16 +2012,13 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
#endif /* TRE_DEBUG */ #endif /* TRE_DEBUG */
errcode = tre_compute_nfl(mem, stack, tree); errcode = tre_compute_nfl(mem, stack, tree);
if (errcode != REG_OK) if (errcode != REG_OK) ERROR_EXIT(errcode);
ERROR_EXIT(errcode);
counts = xmalloc(preg->gem,sizeof(int) * parse_ctx.position); counts = xmalloc(preg->gem,sizeof(int) * parse_ctx.position);
if (counts == NULL) if (HAWK_UNLIKELY(!counts)) ERROR_EXIT(REG_ESPACE);
ERROR_EXIT(REG_ESPACE);
offs = xmalloc(preg->gem,sizeof(int) * parse_ctx.position); offs = xmalloc(preg->gem,sizeof(int) * parse_ctx.position);
if (offs == NULL) if (HAWK_UNLIKELY(!offs)) ERROR_EXIT(REG_ESPACE);
ERROR_EXIT(REG_ESPACE);
for (i = 0; i < parse_ctx.position; i++) for (i = 0; i < parse_ctx.position; i++)
counts[i] = 0; counts[i] = 0;
@ -2046,15 +2032,13 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
counts[i] = 0; counts[i] = 0;
} }
transitions = xcalloc(preg->gem, (unsigned)add + 1, sizeof(*transitions)); transitions = xcalloc(preg->gem, (unsigned)add + 1, sizeof(*transitions));
if (transitions == NULL) if (HAWK_UNLIKELY(!transitions)) ERROR_EXIT(REG_ESPACE);
ERROR_EXIT(REG_ESPACE);
tnfa->transitions = transitions; tnfa->transitions = transitions;
tnfa->num_transitions = add; tnfa->num_transitions = add;
DPRINT(("Converting to TNFA:\n")); DPRINT(("Converting to TNFA:\n"));
errcode = tre_ast_to_tnfa(preg->gem, tree, transitions, counts, offs); errcode = tre_ast_to_tnfa(preg->gem, tree, transitions, counts, offs);
if (errcode != REG_OK) if (errcode != REG_OK) ERROR_EXIT(errcode);
ERROR_EXIT(errcode);
/* If in eight bit mode, compute a table of characters that can be the /* If in eight bit mode, compute a table of characters that can be the
first character of a match. */ first character of a match. */
@ -2145,8 +2129,7 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
} }
initial = xcalloc(preg->gem, (unsigned)i + 1, sizeof(tre_tnfa_transition_t)); initial = xcalloc(preg->gem, (unsigned)i + 1, sizeof(tre_tnfa_transition_t));
if (initial == NULL) if (HAWK_UNLIKELY(!initial)) ERROR_EXIT(REG_ESPACE);
ERROR_EXIT(REG_ESPACE);
tnfa->initial = initial; tnfa->initial = initial;
i = 0; i = 0;
@ -2162,18 +2145,15 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
int j; int j;
for (j = 0; p->tags[j] >= 0; j++); for (j = 0; p->tags[j] >= 0; j++);
initial[i].tags = xmalloc(preg->gem,sizeof(*p->tags) * (j + 1)); initial[i].tags = xmalloc(preg->gem,sizeof(*p->tags) * (j + 1));
if (!initial[i].tags) if (HAWK_UNLIKELY(!initial[i].tags)) ERROR_EXIT(REG_ESPACE);
ERROR_EXIT(REG_ESPACE);
HAWK_MEMCPY (initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1)); HAWK_MEMCPY (initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
} }
initial[i].params = NULL; initial[i].params = NULL;
if (p->params) if (p->params)
{ {
initial[i].params = xmalloc(preg->gem,sizeof(*p->params) * TRE_PARAM_LAST); initial[i].params = xmalloc(preg->gem,sizeof(*p->params) * TRE_PARAM_LAST);
if (!initial[i].params) if (HAWK_UNLIKELY(!initial[i].params)) ERROR_EXIT(REG_ESPACE);
ERROR_EXIT(REG_ESPACE); HAWK_MEMCPY (initial[i].params, p->params, sizeof(*p->params) * TRE_PARAM_LAST);
HAWK_MEMCPY (initial[i].params, p->params,
sizeof(*p->params) * TRE_PARAM_LAST);
} }
initial[i].assertions = p->assertions; initial[i].assertions = p->assertions;
i++; i++;
@ -2198,12 +2178,9 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
error_exit: error_exit:
/* Free everything that was allocated and return the error code. */ /* Free everything that was allocated and return the error code. */
tre_mem_destroy(mem); tre_mem_destroy(mem);
if (stack != NULL) if (stack) tre_stack_destroy(stack);
tre_stack_destroy(stack); if (counts) xfree(preg->gem,counts);
if (counts != NULL) if (offs) xfree(preg->gem,offs);
xfree(preg->gem,counts);
if (offs != NULL)
xfree(preg->gem,offs);
preg->TRE_REGEX_T_FIELD = (void *)tnfa; preg->TRE_REGEX_T_FIELD = (void *)tnfa;
tre_free(preg); tre_free(preg);
return errcode; return errcode;

View File

@ -64,83 +64,83 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* Wide character and multibyte support. */ /* Wide character and multibyte support. */
#define GET_NEXT_WCHAR() \ #define GET_NEXT_WCHAR() \
do { \ do { \
prev_c = next_c; \ prev_c = next_c; \
if (type == STR_BYTE) \ if (type == STR_BYTE) \
{ \ { \
pos++; \ pos++; \
if (len >= 0 && pos >= len) \ if (len >= 0 && pos >= len) \
next_c = '\0'; \ next_c = '\0'; \
else \ else \
next_c = (unsigned char)(*str_byte++); \ next_c = (unsigned char)(*str_byte++); \
} \ } \
else if (type == STR_WIDE) \ else if (type == STR_WIDE) \
{ \ { \
pos++; \ pos++; \
if (len >= 0 && pos >= len) \ if (len >= 0 && pos >= len) \
next_c = HAWK_T('\0'); \ next_c = '\0'; \
else \ else \
next_c = *str_wide++; \ next_c = *str_wide++; \
} \ } \
else if (type == STR_MBS) \ else if (type == STR_MBS) \
{ \ { \
pos += pos_add_next; \ pos += pos_add_next; \
if (str_byte == NULL) \ if (str_byte == NULL) \
next_c = HAWK_T('\0'); \ next_c = '\0'; \
else \ else \
{ \ { \
size_t w; \ size_t w; \
int max; \ int max; \
if (len >= 0) \ if (len >= 0) \
max = len - pos; \ max = len - pos; \
else \ else \
max = 32; \ max = 32; \
if (max <= 0) \ if (max <= 0) \
{ \ { \
next_c = HAWK_T('\0'); \ next_c = '\0'; \
pos_add_next = 1; \ pos_add_next = 1; \
} \ } \
else \ else \
{ \ { \
w = hawk_mbrtowc(str_byte, (size_t)max, &next_c, &mbstate); \ w = hawk_mbrtowc(str_byte, (size_t)max, &next_c, &mbstate); \
if (w <= 0 || w > max) \ if (w <= 0 || w > max) \
return REG_NOMATCH; \ return REG_NOMATCH; \
if (next_c == HAWK_T('\0') && len >= 0) \ if (next_c == '\0' && len >= 0) \
{ \ { \
pos_add_next = 1; \ pos_add_next = 1; \
next_c = 0; \ next_c = 0; \
str_byte++; \ str_byte++; \
} \ } \
else \ else \
{ \ { \
pos_add_next = w; \ pos_add_next = w; \
str_byte += w; \ str_byte += w; \
} \ } \
} \ } \
} \ } \
} \ } \
} while(/*CONSTCOND*/0) } while(/*CONSTCOND*/0)
#else /* !TRE_MULTIBYTE */ #else /* !TRE_MULTIBYTE */
/* Wide character support, no multibyte support. */ /* Wide character support, no multibyte support. */
#define GET_NEXT_WCHAR() \ #define GET_NEXT_WCHAR() \
do { \ do { \
prev_c = next_c; \ prev_c = next_c; \
if (type == STR_BYTE) \ if (type == STR_BYTE) \
{ \ { \
pos++; \ pos++; \
if (len >= 0 && pos >= len) next_c = HAWK_BT('\0'); \ if (len >= 0 && pos >= len) next_c = '\0'; \
else next_c = (unsigned char)(*str_byte++); \ else next_c = (unsigned char)(*str_byte++); \
} \ } \
else if (type == STR_WIDE) \ else if (type == STR_WIDE) \
{ \ { \
pos++; \ pos++; \
if (len >= 0 && pos >= len) next_c = HAWK_T('\0'); \ if (len >= 0 && pos >= len) next_c = '\0'; \
else next_c = *str_wide++; \ else next_c = *str_wide++; \
} \ } \
} while(/*CONSTCOND*/0) } while(/*CONSTCOND*/0)
#endif /* !TRE_MULTIBYTE */ #endif /* !TRE_MULTIBYTE */
@ -166,22 +166,22 @@ do { \
#define IS_WORD_CHAR(c) ((c) == HAWK_T('_') || tre_isalnum(c)) #define IS_WORD_CHAR(c) ((c) == HAWK_T('_') || tre_isalnum(c))
#define CHECK_ASSERTIONS(assertions) \ #define CHECK_ASSERTIONS(assertions) \
(((assertions & ASSERT_AT_BOL) \ (((assertions & ASSERT_AT_BOL) \
&& (pos > 0 || reg_notbol) \ && (pos > 0 || reg_notbol) \
&& (prev_c != HAWK_T('\n') || !reg_newline)) \ && (prev_c != HAWK_T('\n') || !reg_newline)) \
|| ((assertions & ASSERT_AT_EOL) \ || ((assertions & ASSERT_AT_EOL) \
&& (next_c != HAWK_T('\0') || reg_noteol) \ && (next_c != HAWK_T('\0') || reg_noteol) \
&& (next_c != HAWK_T('\n') || !reg_newline)) \ && (next_c != HAWK_T('\n') || !reg_newline)) \
|| ((assertions & ASSERT_AT_BOW) \ || ((assertions & ASSERT_AT_BOW) \
&& (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c))) \ && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c))) \
|| ((assertions & ASSERT_AT_EOW) \ || ((assertions & ASSERT_AT_EOW) \
&& (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c))) \ && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c))) \
|| ((assertions & ASSERT_AT_WB) \ || ((assertions & ASSERT_AT_WB) \
&& (pos != 0 && next_c != HAWK_T('\0') \ && (pos != 0 && next_c != HAWK_T('\0') \
&& IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c))) \ && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c))) \
|| ((assertions & ASSERT_AT_WB_NEG) \ || ((assertions & ASSERT_AT_WB_NEG) \
&& (pos == 0 || next_c == HAWK_T('\0') \ && (pos == 0 || next_c == HAWK_T('\0') \
|| IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c)))) || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c))))
#define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags) \ #define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags) \
@ -191,7 +191,7 @@ do { \
|| ((trans_i->assertions & ASSERT_CHAR_CLASS) \ || ((trans_i->assertions & ASSERT_CHAR_CLASS) \
&& (tnfa->cflags & REG_ICASE) \ && (tnfa->cflags & REG_ICASE) \
&& !tre_isctype(tre_tolower((tre_cint_t)prev_c),trans_i->u.class) \ && !tre_isctype(tre_tolower((tre_cint_t)prev_c),trans_i->u.class) \
&& !tre_isctype(tre_toupper((tre_cint_t)prev_c),trans_i->u.class)) \ && !tre_isctype(tre_toupper((tre_cint_t)prev_c),trans_i->u.class)) \
|| ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG) \ || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG) \
&& tre_neg_char_classes_match(trans_i->neg_classes,(tre_cint_t)prev_c,\ && tre_neg_char_classes_match(trans_i->neg_classes,(tre_cint_t)prev_c,\
tnfa->cflags & REG_ICASE))) tnfa->cflags & REG_ICASE)))
@ -201,8 +201,7 @@ do { \
/* Returns 1 if `t1' wins `t2', 0 otherwise. */ /* Returns 1 if `t1' wins `t2', 0 otherwise. */
HAWK_INLINE static int HAWK_INLINE static int
tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions, tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions, int *t1, int *t2)
int *t1, int *t2)
{ {
int i; int i;
for (i = 0; i < num_tags; i++) for (i = 0; i < num_tags; i++)

View File

@ -169,11 +169,7 @@ SUBMATCH[4] = [defg]
#define tre_tolower(c) hawk_to_ooch_lower(c) #define tre_tolower(c) hawk_to_ooch_lower(c)
#define tre_toupper(c) hawk_to_ooch_upper(c) #define tre_toupper(c) hawk_to_ooch_upper(c)
#if defined(HAWK_OOCH_IS_BCH) && (HAWK_SIZEOF_MCHAR_T == HAWK_SIZEOF_CHAR) typedef hawk_ooch_t tre_char_t;
typedef unsigned char tre_char_t;
#else
typedef hawk_ooch_t tre_char_t;
#endif
typedef hawk_ooci_t tre_cint_t; typedef hawk_ooci_t tre_cint_t;
#define size_t hawk_oow_t #define size_t hawk_oow_t

View File

@ -291,6 +291,33 @@ function main()
ensure (a[2] === @b"Is", 1, @SCRIPTNAME, @SCRIPTLINE); ensure (a[2] === @b"Is", 1, @SCRIPTNAME, @SCRIPTLINE);
ensure (a[3] === @b"Some", 1, @SCRIPTNAME, @SCRIPTLINE); ensure (a[3] === @b"Some", 1, @SCRIPTNAME, @SCRIPTLINE);
ensure (a[4] === @b"Data", 1, @SCRIPTNAME, @SCRIPTLINE); ensure (a[4] === @b"Data", 1, @SCRIPTNAME, @SCRIPTLINE);
ensure (split(@b"Here===Is=Some=====Data", a, /=+/), 4, @SCRIPTNAME, @SCRIPTLINE);
ensure (a[1] === @b"Here", 1, @SCRIPTNAME, @SCRIPTLINE);
ensure (a[2] === @b"Is", 1, @SCRIPTNAME, @SCRIPTLINE);
ensure (a[3] === @b"Some", 1, @SCRIPTNAME, @SCRIPTLINE);
ensure (a[4] === @b"Data", 1, @SCRIPTNAME, @SCRIPTLINE);
ensure (split("[Here] : [Is] : [So\\]me] :[Da:ta]", a, "?:\\[]"), 4, @SCRIPTNAME, @SCRIPTLINE);
ensure (a[1] === "Here", 1, @SCRIPTNAME, @SCRIPTLINE);
ensure (a[2] === "Is", 1, @SCRIPTNAME, @SCRIPTLINE);
ensure (a[3] === "So]me", 1, @SCRIPTNAME, @SCRIPTLINE);
ensure (a[4] === "Da:ta", 1, @SCRIPTNAME, @SCRIPTLINE);
ensure (split(@b"[Here] : [Is] : [So\\]me] :[Da:ta]", a, "?:\\[]"), 4, @SCRIPTNAME, @SCRIPTLINE);
ensure (a[1] === @b"Here", 1, @SCRIPTNAME, @SCRIPTLINE);
ensure (a[2] === @b"Is", 1, @SCRIPTNAME, @SCRIPTLINE);
ensure (a[3] === @b"So]me", 1, @SCRIPTNAME, @SCRIPTLINE);
ensure (a[4] === @b"Da:ta", 1, @SCRIPTNAME, @SCRIPTLINE);
ensure (split("Here===Is=Some=====Data", a, ""), 23, @SCRIPTNAME, @SCRIPTLINE);
ensure (split("Here Is Some Data", a, / /), 7, @SCRIPTNAME, @SCRIPTLINE);
ensure (split("Here Is Some Data", a, " "), 4, @SCRIPTNAME, @SCRIPTLINE);
ensure (a[1] === "Here", 1, @SCRIPTNAME, @SCRIPTLINE);
ensure (a[2] === "Is", 1, @SCRIPTNAME, @SCRIPTLINE);
ensure (a[3] === "Some", 1, @SCRIPTNAME, @SCRIPTLINE);
ensure (a[4] === "Data", 1, @SCRIPTNAME, @SCRIPTLINE);
} }
print "SUCCESS"; print "SUCCESS";