enhanced str::split() to handle byte strings better

2020-11-13 14:56:15 +00:00 · 2020-11-13 14:56:15 +00:00 · 166c18c7d0
commit 166c18c7d0
parent 4a60654b49
10 changed files with 471 additions and 721 deletions
--- a/hawk/lib/fnc.c
+++ b/hawk/lib/fnc.c
@ -787,216 +787,23 @@ int hawk_fnc_substr (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi)
 	return 0;
 }

-#if 0
-static int split_mbs (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi)
-{
-	hawk_oow_t nargs;
-	hawk_val_t* a0, * a2, * t1, * t2;
-	hawk_val_type_t a2_vtype, t1_vtype;
-
-	hawk_bcs_t str;
-	hawk_bcs_t fs;
-	hawk_bch_t* fs_free = HAWK_NULL;
-	const hawk_bch_t* p;
-	hawk_oow_t str_left, org_len;
-	hawk_tre_t* fs_rex = HAWK_NULL; 
-	hawk_tre_t* fs_rex_free = HAWK_NULL;
-
-	hawk_bcs_t tok;
-	hawk_int_t nflds;
-	int x;
-
-	str.ptr = HAWK_NULL;
-	str.len = 0;
-
-	nargs = hawk_rtx_getnargs(rtx);
-	HAWK_ASSERT (nargs >= 2 && nargs <= 3);
-
-	a0 = hawk_rtx_getarg(rtx, 0);
-	a2 = (nargs >= 3)? hawk_rtx_getarg(rtx, 2): HAWK_NULL;
-
-	str.ptr = hawk_rtx_getvalbcstr(rtx, a0, &str.len);
-	if (HAWK_UNLIKELY(!str.ptr)) goto oops;
-
-	if (!a2)
-	{
-		/* get the value from FS */
-		t1 = hawk_rtx_getgbl(rtx, HAWK_GBL_FS);
-		t1_vtype = HAWK_RTX_GETVALTYPE(rtx, t1);
-		if (t1_vtype == HAWK_VAL_NIL)
-		{
-			fs.ptr = " ";
-			fs.len = 1;
-		}
-		else if (t1_vtype == HAWK_VAL_MBS)
-		{
-			fs.ptr = ((hawk_val_mbs_t*)t1)->val.ptr;
-			fs.len = ((hawk_val_mbs_t*)t1)->val.len;
-		}
-		else
-		{
-			fs.ptr = hawk_rtx_valtobcstrdup(rtx, t1, &fs.len);
-			if (HAWK_UNLIKELY(!fs.ptr)) goto oops;
-			fs_free = (hawk_bch_t*)fs.ptr;
-		}
-
-		if (fs.len > 1) fs_rex = rtx->gbl.fs[rtx->gbl.ignorecase];
-	}
-	else 
-	{
-		a2_vtype = HAWK_RTX_GETVALTYPE(rtx, a2);
-
-		if (a2_vtype == HAWK_VAL_REX)
-		{
-			/* the third parameter is a regular expression */
-			fs_rex = ((hawk_val_rex_t*)a2)->code[rtx->gbl.ignorecase];
-
-			/* make the loop below to take fs_rex by 
-			 * setting fs_len greater than 1*/
-			fs.ptr = HAWK_NULL;
-			fs.len = 2;
-		}
-		else 
-		{
-			if (a2_vtype == HAWK_VAL_MBS)
-			{
-				fs.ptr = ((hawk_val_mbs_t*)a2)->val.ptr;
-				fs.len = ((hawk_val_mbs_t*)a2)->val.len;
-			}
-			else
-			{
-				fs.ptr = hawk_rtx_valtobcstrdup(rtx, a2, &fs.len);
-				if (fs.ptr == HAWK_NULL) goto oops;
-				fs_free = (hawk_bch_t*)fs.ptr;
-			}
-
-			if (fs.len > 1) 
-			{
-				int x;
-
-				x = rtx->gbl.ignorecase?
-					hawk_rtx_buildrex(rtx, fs.ptr, fs.len, HAWK_NULL, &fs_rex):
-					hawk_rtx_buildrex(rtx, fs.ptr, fs.len, &fs_rex, HAWK_NULL);
-				if (x <= -1) goto oops;
-
-				fs_rex_free = fs_rex;
-			}
-		}
-	}
-
-	t1 = hawk_rtx_makearrval(rtx);
-	if (HAWK_UNLIKELY(!t1)) goto oops;
-
-	hawk_rtx_refupval (rtx, t1);
-	x = hawk_rtx_setrefval(rtx, (hawk_val_ref_t*)hawk_rtx_getarg(rtx, 1), t1);
-	hawk_rtx_refdownval (rtx, t1);
-	if (HAWK_UNLIKELY(x <= -1)) goto oops;
-
-	/* fill the map with actual values */
-	p = str.ptr; str_left = str.len; org_len = str.len;
-	nflds = 0;
-
-	while (p)
-	{
-		hawk_bch_t key_buf[HAWK_SIZEOF(hawk_int_t)*8+2];
-		hawk_oow_t key_len;
-
-		if (fs.len <= 1)
-		{
-			p = hawk_rtx_tokoocharswithoochars(rtx, p, str.len, fs.ptr, fs.len, &tok);
-		}
-		else
-		{
-			p = hawk_rtx_tokoocharsbyrex(rtx, str.ptr, org_len, p, str.len, fs_rex, &tok);
-			if (p == HAWK_NULL && hawk_rtx_geterrnum(rtx) != HAWK_ENOERR)
-			{
-				goto oops;
-			}
-		}
-
-		if (nflds == 0 && p == HAWK_NULL && tok.len == 0) 
-		{
-			/* no field at all*/
-			break; 
-		}
-
-		HAWK_ASSERT ((tok.ptr != HAWK_NULL && tok.len > 0) || tok.len == 0);
-
-		/* create the field string - however, the split function must
-		 * create a numeric value if the string is a number */
-		/*t2 = hawk_rtx_makembsvalwithbcs (rtx, &tok);*/
-		/*t2 = hawk_rtx_makenmbsvalwithbcs(rtx, &tok); */
-		t2 = hawk_rtx_makenumormbsvalwithbchars(rtx, tok.ptr, tok.len);
-		if (HAWK_UNLIKELY(!t2)) goto oops;
-
-		/* put it into the map */
-		key_len = hawk_int_to_oocstr(++nflds, 10, HAWK_NULL, key_buf, HAWK_COUNTOF(key_buf));
-		HAWK_ASSERT (key_len != (hawk_oow_t)-1);
-
-		if (hawk_rtx_setarrvalfld(rtx, t1, key_buf, key_len, t2) == HAWK_NULL)
-		{
-			hawk_rtx_refupval (rtx, t2);
-			hawk_rtx_refdownval (rtx, t2);
-			goto oops;
-		}
-
-		str.len = str_left - (p - str.ptr);
-	}
-
-	/*if (str_free) hawk_rtx_freemem (rtx, str_free);*/
-	hawk_rtx_freevalbcstr (rtx, a0, str.ptr);
-
-	if (fs_free) hawk_rtx_freemem (rtx, fs_free);
-
-	if (fs_rex_free) 
-	{
-		if (rtx->gbl.ignorecase)
-			hawk_rtx_freerex (rtx, HAWK_NULL, fs_rex_free);
-		else
-			hawk_rtx_freerex (rtx, fs_rex_free, HAWK_NULL);
-	}
-
-	/*nflds--;*/
-
-	t1 = hawk_rtx_makeintval(rtx, nflds);
-	if (HAWK_UNLIKELY(!t1)) return -1;
-
-	hawk_rtx_setretval (rtx, t1);
-	return 0;
-
-oops:
-	if (str.ptr) hawk_rtx_freevalbcstr (rtx, a0, str.ptr);
-
-	if (fs_free) hawk_rtx_freemem (rtx, fs_free);
-
-	if (fs_rex_free) 
-	{
-		if (rtx->gbl.ignorecase)
-			hawk_rtx_freerex (rtx, HAWK_NULL, fs_rex_free);
-		else
-			hawk_rtx_freerex (rtx, fs_rex_free, HAWK_NULL);
-	}
-	return -1;
-}
-#endif
-
 static int fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi, int use_array)
 {
 	hawk_oow_t nargs;
-	hawk_val_t* a0, * a2, * t1, * t2;
-	hawk_val_type_t a2_vtype, t1_vtype;
+	hawk_val_t* a0, * a2, * t0, * t1, * t2;

 	hawk_oocs_t str;
 	hawk_oocs_t fs;
 	hawk_ooch_t* fs_free = HAWK_NULL;
-	const hawk_ooch_t* p;
+	hawk_ooch_t* p;
+
 	hawk_oow_t str_left, org_len;
 	hawk_tre_t* fs_rex = HAWK_NULL; 
 	hawk_tre_t* fs_rex_free = HAWK_NULL;

 	hawk_oocs_t tok;
 	hawk_int_t nflds;
-	int x;
+	int x, byte_str, do_fld = 0;

 	str.ptr = HAWK_NULL;
 	str.len = 0;
@ -1007,65 +814,41 @@ static int fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi, int use_array)
 	a0 = hawk_rtx_getarg(rtx, 0);
 	a2 = (nargs >= 3)? hawk_rtx_getarg (rtx, 2): HAWK_NULL;

-	str.ptr = hawk_rtx_getvaloocstr(rtx, a0, &str.len);
-	if (HAWK_UNLIKELY(!str.ptr)) goto oops;
+	str.ptr = HAWK_NULL;
+	str.len = 0;

-	if (!a2)
+	/* field separator */
+	t0 = a2? a2: hawk_rtx_getgbl(rtx, HAWK_GBL_FS); /* if a2 is not available, get the value from FS */
+
+	if (HAWK_RTX_GETVALTYPE(rtx, t0) == HAWK_VAL_NIL)
 	{
-		/* get the value from FS */
-		t1 = hawk_rtx_getgbl(rtx, HAWK_GBL_FS);
-		t1_vtype = HAWK_RTX_GETVALTYPE(rtx, t1);
-		if (t1_vtype == HAWK_VAL_NIL)
-		{
-			fs.ptr = HAWK_T(" ");
-			fs.len = 1;
-		}
-		else if (t1_vtype == HAWK_VAL_STR)
-		{
-			fs.ptr = ((hawk_val_str_t*)t1)->val.ptr;
-			fs.len = ((hawk_val_str_t*)t1)->val.len;
-		}
-		else
-		{
-			fs.ptr = hawk_rtx_valtooocstrdup(rtx, t1, &fs.len);
-			if (HAWK_UNLIKELY(!fs.ptr)) goto oops;
-			fs_free = (hawk_ooch_t*)fs.ptr;
-		}
-
-		if (fs.len > 1) fs_rex = rtx->gbl.fs[rtx->gbl.ignorecase];
+		fs.ptr = HAWK_T(" ");
+		fs.len = 1;
 	}
-	else 
+	else if (HAWK_RTX_GETVALTYPE(rtx, t0) == HAWK_VAL_REX)
 	{
-		a2_vtype = HAWK_RTX_GETVALTYPE (rtx, a2);
+		/* regular expression */
+		fs_rex = ((hawk_val_rex_t*)t0)->code[rtx->gbl.ignorecase];

-		if (a2_vtype == HAWK_VAL_REX)
+		/* make the tokenizing loop below to take fs_rex by setting fs_len greater than 1*/
+		fs.ptr = HAWK_NULL;
+		fs.len = 2;
+	}
+	else
+	{
+		fs.ptr = hawk_rtx_getvaloocstr(rtx, t0, &fs.len);
+		if (HAWK_UNLIKELY(!fs.ptr)) goto oops;
+
+		fs_free = fs.ptr;
+
+		if (fs.len == 5 && fs.ptr[0] == '?')
 		{
-			/* the third parameter is a regular expression */
-			fs_rex = ((hawk_val_rex_t*)a2)->code[rtx->gbl.ignorecase];
-
-			/* make the loop below to take fs_rex by 
-			 * setting fs_len greater than 1*/
-			fs.ptr = HAWK_NULL;
-			fs.len = 2;
+			do_fld = 1;
 		}
-		else 
+		else if (fs.len > 1) 
 		{
-			if (a2_vtype == HAWK_VAL_STR)
+			if (a2)
 			{
-				fs.ptr = ((hawk_val_str_t*)a2)->val.ptr;
-				fs.len = ((hawk_val_str_t*)a2)->val.len;
-			}
-			else
-			{
-				fs.ptr = hawk_rtx_valtooocstrdup(rtx, a2, &fs.len);
-				if (fs.ptr == HAWK_NULL) goto oops;
-				fs_free = (hawk_ooch_t*)fs.ptr;
-			}
-
-			if (fs.len > 1) 
-			{
-				int x;
-
 				x = rtx->gbl.ignorecase?
 					hawk_rtx_buildrex(rtx, fs.ptr, fs.len, HAWK_NULL, &fs_rex):
 					hawk_rtx_buildrex(rtx, fs.ptr, fs.len, &fs_rex, HAWK_NULL);
@ -1073,9 +856,28 @@ static int fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi, int use_array)

 				fs_rex_free = fs_rex;
 			}
+			else
+			{
+				fs_rex = rtx->gbl.fs[rtx->gbl.ignorecase];
+			}
 		}
 	}

+	/* the first parameter - string to split */
+	if (HAWK_RTX_GETVALTYPE(rtx, a0) == HAWK_VAL_MBS)
+	{
+		byte_str = 1;
+		str.ptr = do_fld? hawk_rtx_valtobcstrdup(rtx, a0, &str.len):
+		                  hawk_rtx_getvalbcstr(rtx, a0, &str.len);
+	}
+	else
+	{
+		byte_str = 0;
+		str.ptr = do_fld? hawk_rtx_valtooocstrdup(rtx, a0, &str.len):
+		                  hawk_rtx_getvaloocstr(rtx, a0, &str.len);
+	}
+	if (HAWK_UNLIKELY(!str.ptr)) goto oops;
+
 	t1 = use_array? hawk_rtx_makearrval(rtx, 16): hawk_rtx_makemapval(rtx);
 	if (HAWK_UNLIKELY(!t1)) goto oops;

@ -1090,20 +892,23 @@ static int fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi, int use_array)

 	while (p)
 	{
-		hawk_ooch_t key_buf[HAWK_SIZEOF(hawk_int_t)*8+2];
-		hawk_oow_t key_len;
-
-		if (fs.len <= 1)
+		if (fs_rex)
 		{
-			p = hawk_rtx_tokoocharswithoochars(rtx, p, str.len, fs.ptr, fs.len, &tok);
+			p = byte_str? hawk_rtx_tokbcharsbyrex(rtx, str.ptr, org_len, p, str.len, fs_rex, &tok):
+			              hawk_rtx_tokoocharsbyrex(rtx, str.ptr, org_len, p, str.len, fs_rex, &tok);
+			if (p && hawk_rtx_geterrnum(rtx) != HAWK_ENOERR) goto oops;
+		}
+		else if (do_fld)
+		{
+			/* [NOTE] even if byte_str is true, the field separator is of the ooch type.
+			 *        there may be some data truncation and related issues */
+			p = byte_str? hawk_rtx_fldbchars(rtx, p, str.len, fs.ptr[1], fs.ptr[2], fs.ptr[3], fs.ptr[4], &tok):
+			              hawk_rtx_fldoochars(rtx, p, str.len, fs.ptr[1], fs.ptr[2], fs.ptr[3], fs.ptr[4], &tok);
 		}
 		else
 		{
-			p = hawk_rtx_tokoocharsbyrex(rtx, str.ptr, org_len, p, str.len, fs_rex, &tok);
-			if (p == HAWK_NULL && hawk_rtx_geterrnum(rtx) != HAWK_ENOERR)
-			{
-				goto oops;
-			}
+			p = byte_str? hawk_rtx_tokbcharswithbchars(rtx, p, str.len, fs.ptr, fs.len, &tok):
+			              hawk_rtx_tokoocharswithoochars(rtx, p, str.len, fs.ptr, fs.len, &tok);
 		}

 		if (nflds == 0 && p == HAWK_NULL && tok.len == 0) 
@ -1118,7 +923,8 @@ static int fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi, int use_array)
 		 * create a numeric value if the string is a number */
 		/*t2 = hawk_rtx_makestrvalwithoocs (rtx, &tok);*/
 		/*t2 = hawk_rtx_makenstrvalwithoocs(rtx, &tok); */
-		t2 = hawk_rtx_makenumorstrvalwithoochars(rtx, tok.ptr, tok.len);
+		t2 = byte_str? hawk_rtx_makenumormbsvalwithbchars(rtx, tok.ptr, tok.len):
+		               hawk_rtx_makenumorstrvalwithoochars(rtx, tok.ptr, tok.len);
 		if (HAWK_UNLIKELY(!t2)) goto oops;

 		if (use_array)
@ -1133,6 +939,9 @@ static int fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi, int use_array)
 		else
 		{
 			/* put it into the map */
+			hawk_ooch_t key_buf[HAWK_SIZEOF(hawk_int_t)*8+2];
+			hawk_oow_t key_len;
+
 			key_len = hawk_int_to_oocstr(++nflds, 10, HAWK_NULL, key_buf, HAWK_COUNTOF(key_buf));
 			HAWK_ASSERT (key_len != (hawk_oow_t)-1);

@ -1144,13 +953,17 @@ static int fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi, int use_array)
 			}
 		}

-		str.len = str_left - (p - str.ptr);
+		if (byte_str)
+			str.len = str_left - ((p - str.ptr) * HAWK_SIZEOF_OOCH_T);
+		else
+			str.len = str_left - (p - str.ptr);
 	}

-	/*if (str_free) hawk_rtx_freemem (rtx, str_free);*/
-	hawk_rtx_freevaloocstr (rtx, a0, str.ptr);
+	if (do_fld) hawk_rtx_freemem (rtx, str.ptr);
+	else if (byte_str) hawk_rtx_freevalbcstr (rtx, a0, str.ptr);
+	else hawk_rtx_freevaloocstr (rtx, a0, str.ptr);

-	if (fs_free) hawk_rtx_freemem (rtx, fs_free);
+	if (fs_free) hawk_rtx_freevaloocstr (rtx, t0, fs_free);

 	if (fs_rex_free) 
 	{
@ -1160,16 +973,19 @@ static int fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi, int use_array)
 			hawk_rtx_freerex (rtx, fs_rex_free, HAWK_NULL);
 	}

-	/*nflds--;*/
-
-	t1 = hawk_rtx_makeintval (rtx, nflds);
+	t1 = hawk_rtx_makeintval(rtx, nflds);
 	if (HAWK_UNLIKELY(!t1)) return -1;

 	hawk_rtx_setretval (rtx, t1);
 	return 0;

 oops:
-	if (str.ptr) hawk_rtx_freevaloocstr (rtx, a0, str.ptr);
+	if (str.ptr) 
+	{
+		if (do_fld) hawk_rtx_freemem (rtx, str.ptr);
+		else if (byte_str) hawk_rtx_freevalbcstr (rtx, a0, str.ptr);
+		else hawk_rtx_freevaloocstr (rtx, a0, str.ptr);
+	}

 	if (fs_free) hawk_rtx_freemem (rtx, fs_free);

@ -1185,7 +1001,8 @@ oops:

 int hawk_fnc_split (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi)
 {
-	return fnc_split(rtx, fi, 1);
+	/*return fnc_split(rtx, fi, 1);*/
+	return fnc_split(rtx, fi, 0);
 }

 int hawk_fnc_tolower (hawk_rtx_t* rtx, const hawk_fnc_info_t* fi)
--- a/hawk/lib/misc-imp.h
+++ b/hawk/lib/misc-imp.h
@ -24,6 +24,92 @@
    THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

+/*
+ * Extracts the next field from the len characters starting at str.
+ *
+ * This is a template body: char_t, xcs_t and is_xch_space are supplied by
+ * the including file, once per character type.
+ *
+ * Leading spaces are skipped. The field is then compacted IN PLACE inside
+ * the input buffer (escape and quote characters are dropped while copying),
+ * so the caller must pass a writable buffer.
+ *
+ *   fs - field separator; when it is a space character, a run of
+ *        consecutive separators is treated as one.
+ *   ec - escape character; the character following it is taken literally.
+ *   lq - character that opens a quoted section.
+ *   rq - character that closes a quoted section.
+ *
+ * Unquoted, unescaped trailing spaces are excluded from the field.
+ *
+ * On return tok->ptr/tok->len describe the field. The return value is the
+ * position at which to resume for the next field, or HAWK_NULL when the
+ * input has been exhausted.
+ */
+char_t* split_xchars_to_fields (hawk_rtx_t* rtx, char_t* str, hawk_oow_t len, char_t fs, char_t ec, char_t lq, char_t rq, xcs_t* tok)
+{
+	char_t* p = str;
+	char_t* end = str + len;
+	int escaped = 0, quoted = 0;
+	char_t* ts; /* token start */
+	char_t* tp; /* points to one char past the last token char */
+	char_t* xp; /* points to one char past the last effective char */
+
+	/* skip leading spaces */
+	while (p < end && is_xch_space(*p)) p++;
+
+	/* initialize token pointers */
+	ts = tp = xp = p;
+
+	while (p < end)
+	{
+		/* keep the full character width. a plain 'char' here would
+		 * truncate the character when char_t is a wide character type */
+		char_t c = *p;
+
+		if (escaped)
+		{
+			/* an escaped character is always effective, even a space */
+			*tp++ = c; xp = tp; p++;
+			escaped = 0;
+		}
+		else
+		{
+			if (c == ec)
+			{
+				escaped = 1;
+				p++;
+			}
+			else if (quoted)
+			{
+				if (c == rq)
+				{
+					quoted = 0;
+					p++;
+				}
+				else
+				{
+					/* a quoted character is always effective */
+					*tp++ = c; xp = tp; p++;
+				}
+			}
+			else
+			{
+				if (c == fs)
+				{
+					tok->ptr = ts;
+					tok->len = xp - ts;
+					p++;
+
+					if (is_xch_space(fs))
+					{
+						/* collapse a run of space separators. if nothing
+						 * follows the run, there are no more fields */
+						while (p < end && *p == fs) p++;
+						if (p >= end) return HAWK_NULL;
+					}
+
+					return p;
+				}
+
+				if (c == lq)
+				{
+					quoted = 1;
+					p++;
+				}
+				else
+				{
+					/* a plain space is copied but only becomes effective
+					 * if a non-space character follows it in the field */
+					*tp++ = c; p++;
+					if (!is_xch_space(c)) xp = tp;
+				}
+			}
+		}
+	}
+
+	if (escaped)
+	{
+		/* if it is still escaped, the last character must be
+		 * the escaper itself. treat it as a normal character */
+		*xp++ = ec;
+	}
+
+	tok->ptr = ts;
+	tok->len = xp - ts;
+	return HAWK_NULL;
+}
+
 char_t* tokenize_xchars (hawk_rtx_t* rtx, const char_t* s, hawk_oow_t len, const char_t* delim, hawk_oow_t delim_len, xcs_t* tok)
 {
 	const char_t* p = s, *d;
@ -214,88 +300,102 @@ exit_loop:
 	return (char_t*)++p;
 }

-char_t* split_xchars_to_fields (hawk_rtx_t* rtx, char_t* str, hawk_oow_t len, char_t fs, char_t ec, char_t lq, char_t rq, xcs_t* tok)
+
+char_t* tokenize_xchars_by_rex (hawk_rtx_t* rtx, const char_t* str, hawk_oow_t len, const char_t* substr, hawk_oow_t sublen, hawk_tre_t* rex, xcs_t* tok)
 {
-	char_t* p = str;
-	char_t* end = str + len;
-	int escaped = 0, quoted = 0;
-	char_t* ts; /* token start */
-	char_t* tp; /* points to one char past the last token char */
-	char_t* xp; /* points to one char past the last effective char */
+	int n;
+	hawk_oow_t i;
+	xcs_t match, s, cursub, realsub;

-	/* skip leading spaces */
-	while (p < end && is_xch_space(*p)) p++;
+	s.ptr = (char_t*)str;
+	s.len = len;

-	/* initialize token pointers */
-	ts = tp = xp = p; 
+	cursub.ptr = (char_t*)substr;
+	cursub.len = sublen;

-	while (p < end)
+	realsub.ptr = (char_t*)substr;
+	realsub.len = sublen;
+
+	while (cursub.len > 0)
 	{
-		char c = *p;
+		n = match_rex_with_xcs(rtx, rex, &s, &cursub, &match, HAWK_NULL);
+		if (n <= -1) return HAWK_NULL;

-		if (escaped)
+		if (n == 0)
 		{
-			*tp++ = c; xp = tp; p++;
-			escaped = 0;
+			/* no match has been found. return the entire string as a token */
+			hawk_rtx_seterrnum (rtx, HAWK_NULL, HAWK_ENOERR); /* reset HAWK_EREXNOMAT to no error */
+			tok->ptr = realsub.ptr;
+			tok->len = realsub.len;
+			return HAWK_NULL; 
 		}
-		else
+
+		HAWK_ASSERT (n == 1);
+
+		if (match.len == 0)
 		{
-			if (c == ec)
+			/* the match length is zero. */
+			cursub.ptr++;
+			cursub.len--;
+		}
+		else if (HAWK_RTX_IS_STRIPRECSPC_ON(rtx))
+		{
+			/* match at the beginning of the input string */
+			if (match.ptr == substr) 
 			{
-				escaped = 1;
-				p++;
-			}
-			else if (quoted)
-			{
-				if (c == rq)
+				for (i = 0; i < match.len; i++)
 				{
-					quoted = 0;
-					p++;
+					if (!is_xch_space(match.ptr[i])) goto exit_loop;
 				}
-				else
-				{
-					*tp++ = c; xp = tp; p++;
-				}
-			}
-			else 
-			{
-				if (c == fs)
-				{
-					tok->ptr = ts;
-					tok->len = xp - ts;
-					p++;

-					if (is_xch_space(fs))
-					{
-						while (p < end && *p == fs) p++;
-						if (p >= end) return HAWK_NULL;
-					}
+				/* the match that is all spaces at the 
+				 * beginning of the input string is skipped */
+				cursub.ptr += match.len;
+				cursub.len -= match.len;

-					return p;
-				}
-		
-				if (c == lq)
-				{
-					quoted = 1;
-					p++;
-				}
-				else
-				{
-					*tp++ = c; p++;
-					if (!is_xch_space(c)) xp = tp; 
-				}
+				/* adjust the substring by skipping the leading
+				 * spaces and retry matching */
+				realsub.ptr = (char_t*)substr + match.len;
+				realsub.len -= match.len;
 			}
+			else break;
+		}
+		else break;
+	}
+
+exit_loop:
+	hawk_rtx_seterrnum (rtx, HAWK_NULL, HAWK_ENOERR);
+
+	if (cursub.len <= 0)
+	{
+		tok->ptr = realsub.ptr;
+		tok->len = realsub.len;
+		return HAWK_NULL; 
+	}
+
+	tok->ptr = realsub.ptr;
+	tok->len = match.ptr - realsub.ptr;
+
+	for (i = 0; i < match.len; i++)
+	{
+		if (!is_xch_space(match.ptr[i]))
+		{
+			/* the match contains a non-space character. */
+			return (char_t*)match.ptr+match.len;
 		}
 	}

-	if (escaped) 
+	/* the match is all spaces */
+	if (HAWK_RTX_IS_STRIPRECSPC_ON(rtx))
 	{
-		/* if it is still escaped, the last character must be 
-		 * the escaper itself. treat it as a normal character */
-		*xp++ = ec;
+		/* if the match reached the last character in the input string,
+		 * it returns HAWK_NULL to terminate tokenization. */
+		return (match.ptr+match.len >= substr+sublen)? HAWK_NULL: ((char_t*)match.ptr+match.len);
+	}
+	else
+	{
+		/* if the match went beyond the last character in the input
+		 * string, it returns HAWK_NULL to terminate tokenization. */
+		return (match.ptr+match.len > substr+sublen)? HAWK_NULL: ((char_t*)match.ptr+match.len);
 	}
-
-	tok->ptr = ts;
-	tok->len = xp - ts;
-	return HAWK_NULL;
 }
--- a/hawk/lib/misc-prv.h
+++ b/hawk/lib/misc-prv.h
@ -64,24 +64,37 @@ hawk_bch_t* hawk_rtx_tokbcharswithbchars (
 	const hawk_bch_t* delim, hawk_oow_t delim_len, hawk_bcs_t* tok);


+hawk_uch_t* hawk_rtx_tokucharsbyrex (
+	hawk_rtx_t*       rtx, 
+	const hawk_uch_t* str,
+	hawk_oow_t        len,
+	const hawk_uch_t* substr,
+	hawk_oow_t        sublen,
+	hawk_tre_t*       rex,
+	hawk_ucs_t*       tok
+);
+
+hawk_bch_t* hawk_rtx_tokbcharsbyrex (
+	hawk_rtx_t*       rtx, 
+	const hawk_bch_t* str,
+	hawk_oow_t        len,
+	const hawk_bch_t* substr,
+	hawk_oow_t        sublen,
+	hawk_tre_t*       rex,
+	hawk_bcs_t*       tok
+);
+
+
 #if defined(HAWK_OOCH_IS_UCH)
 #	define hawk_rtx_fldoochars hawk_rtx_flduchars
 #	define hawk_rtx_tokoocharswithoochars hawk_rtx_tokucharswithuchars
+#	define hawk_rtx_tokoocharsbyrex hawk_rtx_tokucharsbyrex
 #else
 #	define hawk_rtx_fldoochars hawk_rtx_fldbchars
 #	define hawk_rtx_tokoocharswithoochars hawk_rtx_tokbcharswithbchars
+#	define hawk_rtx_tokoocharsbyrex hawk_rtx_tokbcharsbyrex
 #endif

-hawk_ooch_t* hawk_rtx_tokoocharsbyrex (
-	hawk_rtx_t*        rtx, 
-	const hawk_ooch_t* str,
-	hawk_oow_t         len,
-	const hawk_ooch_t* substr,
-	hawk_oow_t         sublen,
-	hawk_tre_t*        rex,
-	hawk_oocs_t*       tok
-);
-

 int hawk_rtx_matchvalwithucs (
 	hawk_rtx_t* rtx, hawk_val_t* val,
--- a/hawk/lib/misc.c
+++ b/hawk/lib/misc.c
@ -30,220 +30,41 @@
 #undef char_t
 #undef xcs_t
 #undef is_xch_space
-#undef tokenize_xchars
+#undef match_rex_with_xcs
 #undef split_xchars_to_fields
+#undef tokenize_xchars
+#undef tokenize_xchars_by_rex
+
 #define char_t hawk_bch_t
 #define xcs_t hawk_bcs_t
 #define is_xch_space hawk_is_bch_space
-#define tokenize_xchars hawk_rtx_tokbcharswithbchars
+#define match_rex_with_xcs hawk_rtx_matchrexwithbcs
+
 #define split_xchars_to_fields hawk_rtx_fldbchars
+#define tokenize_xchars hawk_rtx_tokbcharswithbchars
+#define tokenize_xchars_by_rex hawk_rtx_tokbcharsbyrex
+
 #include "misc-imp.h"

 #undef char_t
 #undef xcs_t
 #undef is_xch_space
-#undef tokenize_xchars
+#undef match_rex_with_xcs
 #undef split_xchars_to_fields
+#undef tokenize_xchars
+#undef tokenize_xchars_by_rex
+
 #define char_t hawk_uch_t
 #define xcs_t hawk_ucs_t
 #define is_xch_space hawk_is_uch_space
-#define tokenize_xchars hawk_rtx_tokucharswithuchars
+#define match_rex_with_xcs hawk_rtx_matchrexwithucs
+
 #define split_xchars_to_fields hawk_rtx_flduchars
+#define tokenize_xchars hawk_rtx_tokucharswithuchars
+#define tokenize_xchars_by_rex hawk_rtx_tokucharsbyrex
+
 #include "misc-imp.h"

-hawk_ooch_t* hawk_rtx_tokoocharsbyrex (
-	hawk_rtx_t* rtx, 
-	const hawk_ooch_t* str, hawk_oow_t len,
-	const hawk_ooch_t* substr, hawk_oow_t sublen,
-	hawk_tre_t* rex, hawk_oocs_t* tok)
-{
-	int n;
-	hawk_oow_t i;
-	hawk_oocs_t match, s, cursub, realsub;
-
-	s.ptr = (hawk_ooch_t*)str;
-	s.len = len;
-
-	cursub.ptr = (hawk_ooch_t*)substr;
-	cursub.len = sublen;
-
-	realsub.ptr = (hawk_ooch_t*)substr;
-	realsub.len = sublen;
-
-	while (cursub.len > 0)
-	{
-		n = hawk_rtx_matchrexwithoocs(rtx, rex, &s, &cursub, &match, HAWK_NULL);
-		if (n <= -1) return HAWK_NULL;
-
-		if (n == 0)
-		{
-			/* no match has been found. return the entire string as a token */
-			hawk_rtx_seterrnum (rtx, HAWK_NULL, HAWK_ENOERR); /* reset HAWK_EREXNOMAT to no error */
-			tok->ptr = realsub.ptr;
-			tok->len = realsub.len;
-			return HAWK_NULL; 
-		}
-
-		HAWK_ASSERT (n == 1);
-
-		if (match.len == 0)
-		{
-			/* the match length is zero. */
-			cursub.ptr++;
-			cursub.len--;
-		}
-		else if (HAWK_RTX_IS_STRIPRECSPC_ON(rtx))
-		{
-			/* match at the beginning of the input string */
-			if (match.ptr == substr) 
-			{
-				for (i = 0; i < match.len; i++)
-				{
-					if (!hawk_is_ooch_space(match.ptr[i])) goto exit_loop;
-				}
-
-				/* the match that is all spaces at the 
-				 * beginning of the input string is skipped */
-				cursub.ptr += match.len;
-				cursub.len -= match.len;
-
-				/* adjust the substring by skipping the leading
-				 * spaces and retry matching */
-				realsub.ptr = (hawk_ooch_t*)substr + match.len;
-				realsub.len -= match.len;
-			}
-			else break;
-		}
-		else break;
-	}
-
-exit_loop:
-	hawk_rtx_seterrnum (rtx, HAWK_NULL, HAWK_ENOERR);
-
-	if (cursub.len <= 0)
-	{
-		tok->ptr = realsub.ptr;
-		tok->len = realsub.len;
-		return HAWK_NULL; 
-	}
-
-	tok->ptr = realsub.ptr;
-	tok->len = match.ptr - realsub.ptr;
-
-	for (i = 0; i < match.len; i++)
-	{
-		if (!hawk_is_ooch_space(match.ptr[i]))
-		{
-			/* the match contains a non-space character. */
-			return (hawk_ooch_t*)match.ptr+match.len;
-		}
-	}
-
-	/* the match is all spaces */
-	if (HAWK_RTX_IS_STRIPRECSPC_ON(rtx))
-	{
-		/* if the match reached the last character in the input string,
-		 * it returns HAWK_NULL to terminate tokenization. */
-		return (match.ptr+match.len >= substr+sublen)? HAWK_NULL: ((hawk_ooch_t*)match.ptr+match.len);
-	}
-	else
-	{
-		/* if the match went beyond the the last character in the input 
-		 * string, it returns HAWK_NULL to terminate tokenization. */
-		return (match.ptr+match.len > substr+sublen)? HAWK_NULL: ((hawk_ooch_t*)match.ptr+match.len);
-	}
-}
-
-#if 0
-hawk_ooch_t* hawk_rtx_strxnfld (
-	hawk_rtx_t* rtx, hawk_ooch_t* str, hawk_oow_t len,
-	hawk_ooch_t fs, hawk_ooch_t ec, hawk_ooch_t lq, hawk_ooch_t rq,
-	hawk_oocs_t* tok)
-{
-	hawk_ooch_t* p = str;
-	hawk_ooch_t* end = str + len;
-	int escaped = 0, quoted = 0;
-	hawk_ooch_t* ts; /* token start */
-	hawk_ooch_t* tp; /* points to one char past the last token char */
-	hawk_ooch_t* xp; /* points to one char past the last effective char */
-
-	/* skip leading spaces */
-	while (p < end && hawk_is_ooch_space(*p)) p++;
-
-	/* initialize token pointers */
-	ts = tp = xp = p; 
-
-	while (p < end)
-	{
-		char c = *p;
-
-		if (escaped)
-		{
-			*tp++ = c; xp = tp; p++;
-			escaped = 0;
-		}
-		else
-		{
-			if (c == ec)
-			{
-				escaped = 1;
-				p++;
-			}
-			else if (quoted)
-			{
-				if (c == rq)
-				{
-					quoted = 0;
-					p++;
-				}
-				else
-				{
-					*tp++ = c; xp = tp; p++;
-				}
-			}
-			else 
-			{
-				if (c == fs)
-				{
-					tok->ptr = ts;
-					tok->len = xp - ts;
-					p++;
-
-					if (hawk_is_ooch_space(fs))
-					{
-						while (p < end && *p == fs) p++;
-						if (p >= end) return HAWK_NULL;
-					}
-
-					return p;
-				}
-		
-				if (c == lq)
-				{
-					quoted = 1;
-					p++;
-				}
-				else
-				{
-					*tp++ = c; p++;
-					if (!hawk_is_ooch_space(c)) xp = tp; 
-				}
-			}
-		}
-	}
-
-	if (escaped) 
-	{
-		/* if it is still escaped, the last character must be 
-		 * the escaper itself. treat it as a normal character */
-		*xp++ = ec;
-	}
-	
-	tok->ptr = ts;
-	tok->len = xp - ts;
-	return HAWK_NULL;
-}
-#endif

 static int matchtre_ucs (hawk_tre_t* tre, int opt, const hawk_ucs_t* str, hawk_ucs_t* mat, hawk_ucs_t submat[9], hawk_gem_t* errgem)
 {
--- a/hawk/lib/parse.c
+++ b/hawk/lib/parse.c
@ -2579,8 +2579,8 @@ static hawk_nde_t* parse_while (hawk_t* hawk, const hawk_loc_t* xloc)
 	if (get_token(hawk) <= -1) goto oops;

 	ploc = hawk->tok.loc;
-	test = parse_expr_withdc (hawk, &ploc);
-	if (test == HAWK_NULL) goto oops;
+	test = parse_expr_withdc(hawk, &ploc);
+	if (HAWK_UNLIKELY(!test)) goto oops;

 	if (!MATCH(hawk,TOK_RPAREN)) 
 	{
@ -2591,11 +2591,11 @@ static hawk_nde_t* parse_while (hawk_t* hawk, const hawk_loc_t* xloc)
 	if (get_token(hawk) <= -1)  goto oops;

 	ploc = hawk->tok.loc;
-	body = parse_statement (hawk, &ploc);
-	if (body == HAWK_NULL) goto oops;
+	body = parse_statement(hawk, &ploc);
+	if (HAWK_UNLIKELY(!body)) goto oops;

-	nde = (hawk_nde_while_t*) hawk_callocmem (hawk, HAWK_SIZEOF(*nde));
-	if (nde == HAWK_NULL) 
+	nde = (hawk_nde_while_t*)hawk_callocmem(hawk, HAWK_SIZEOF(*nde));
+	if (HAWK_UNLIKELY(!nde)) 
 	{
 		ADJERR_LOC (hawk, xloc);
 		goto oops;
@ -2628,7 +2628,7 @@ static hawk_nde_t* parse_for (hawk_t* hawk, const hawk_loc_t* xloc)
 		return HAWK_NULL;
 	}
 	if (get_token(hawk) <= -1) return HAWK_NULL;
-		
+
 	if (!MATCH(hawk,TOK_SEMICOLON)) 
 	{
 		/* this line is very ugly. it checks the entire next 
@ -2694,8 +2694,8 @@ static hawk_nde_t* parse_for (hawk_t* hawk, const hawk_loc_t* xloc)
 	if (!MATCH(hawk,TOK_SEMICOLON)) 
 	{
 		ploc = hawk->tok.loc;
-		test = parse_expr_withdc (hawk, &ploc);
-		if (test == HAWK_NULL) goto oops;
+		test = parse_expr_withdc(hawk, &ploc);
+		if (HAWK_UNLIKELY(!test)) goto oops;

 		if (!MATCH(hawk,TOK_SEMICOLON)) 
 		{
@ -2717,8 +2717,8 @@ static hawk_nde_t* parse_for (hawk_t* hawk, const hawk_loc_t* xloc)
 			hawk_loc_t eloc;

 			eloc = hawk->tok.loc;
-			incr = parse_expr_withdc (hawk, &eloc);
-			if (incr == HAWK_NULL) goto oops;
+			incr = parse_expr_withdc(hawk, &eloc);
+			if (HAWK_UNLIKELY(!incr)) goto oops;
 		}

 		if (!MATCH(hawk,TOK_RPAREN)) 
@ -2734,8 +2734,8 @@ static hawk_nde_t* parse_for (hawk_t* hawk, const hawk_loc_t* xloc)
 	body = parse_statement (hawk, &ploc);
 	if (body == HAWK_NULL) goto oops;

-	nde_for = (hawk_nde_for_t*) hawk_callocmem (hawk, HAWK_SIZEOF(*nde_for));
-	if (nde_for == HAWK_NULL) 
+	nde_for = (hawk_nde_for_t*)hawk_callocmem(hawk, HAWK_SIZEOF(*nde_for));
+	if (HAWK_UNLIKELY(!nde_for)) 
 	{
 		ADJERR_LOC (hawk, xloc);
 		goto oops;
@ -2768,8 +2768,8 @@ static hawk_nde_t* parse_dowhile (hawk_t* hawk, const hawk_loc_t* xloc)
 	HAWK_ASSERT (hawk->ptok.type == TOK_DO);

 	ploc = hawk->tok.loc;
-	body = parse_statement (hawk, &ploc);
-	if (body == HAWK_NULL) goto oops;
+	body = parse_statement(hawk, &ploc);
+	if (HAWK_UNLIKELY(!body)) goto oops;

 	while (MATCH(hawk,TOK_NEWLINE))
 	{
@ -2794,7 +2794,7 @@ static hawk_nde_t* parse_dowhile (hawk_t* hawk, const hawk_loc_t* xloc)

 	ploc = hawk->tok.loc;
 	test = parse_expr_withdc (hawk, &ploc);
-	if (test == HAWK_NULL) goto oops;
+	if (HAWK_UNLIKELY(!test)) goto oops;

 	if (!MATCH(hawk,TOK_RPAREN)) 
 	{
@ -2803,9 +2803,9 @@ static hawk_nde_t* parse_dowhile (hawk_t* hawk, const hawk_loc_t* xloc)
 	}

 	if (get_token(hawk) <= -1)  goto oops;
-	
-	nde = (hawk_nde_while_t*) hawk_callocmem (hawk, HAWK_SIZEOF(*nde));
-	if (nde == HAWK_NULL) 
+
+	nde = (hawk_nde_while_t*)hawk_callocmem(hawk, HAWK_SIZEOF(*nde));
+	if (HAWK_UNLIKELY(!nde)) 
 	{
 		ADJERR_LOC (hawk, xloc);
 		goto oops;
@ -2836,8 +2836,8 @@ static hawk_nde_t* parse_break (hawk_t* hawk, const hawk_loc_t* xloc)
 		return HAWK_NULL;
 	}

-	nde = (hawk_nde_break_t*) hawk_callocmem (hawk, HAWK_SIZEOF(*nde));
-	if (nde == HAWK_NULL)
+	nde = (hawk_nde_break_t*)hawk_callocmem(hawk, HAWK_SIZEOF(*nde));
+	if (HAWK_UNLIKELY(!nde))
 	{
 		ADJERR_LOC (hawk, xloc);
 		return HAWK_NULL;
@ -2845,7 +2845,7 @@ static hawk_nde_t* parse_break (hawk_t* hawk, const hawk_loc_t* xloc)

 	nde->type = HAWK_NDE_BREAK;
 	nde->loc = *xloc;
-	
+
 	return (hawk_nde_t*)nde;
 }

@ -2860,8 +2860,8 @@ static hawk_nde_t* parse_continue (hawk_t* hawk, const hawk_loc_t* xloc)
 		return HAWK_NULL;
 	}

-	nde = (hawk_nde_continue_t*) hawk_callocmem (hawk, HAWK_SIZEOF(*nde));
-	if (nde == HAWK_NULL)
+	nde = (hawk_nde_continue_t*)hawk_callocmem(hawk, HAWK_SIZEOF(*nde));
+	if (HAWK_UNLIKELY(!nde))
 	{
 		ADJERR_LOC (hawk, xloc);
 		return HAWK_NULL;
@ -2880,8 +2880,8 @@ static hawk_nde_t* parse_return (hawk_t* hawk, const hawk_loc_t* xloc)

 	HAWK_ASSERT (hawk->ptok.type == TOK_RETURN);

-	nde = (hawk_nde_return_t*) hawk_callocmem ( hawk, HAWK_SIZEOF(*nde));
-	if (nde == HAWK_NULL)
+	nde = (hawk_nde_return_t*)hawk_callocmem(hawk, HAWK_SIZEOF(*nde));
+	if (HAWK_UNLIKELY(!nde))
 	{
 		ADJERR_LOC (hawk, xloc);
 		return HAWK_NULL;
@ -2900,8 +2900,8 @@ static hawk_nde_t* parse_return (hawk_t* hawk, const hawk_loc_t* xloc)
 		hawk_loc_t eloc;

 		eloc = hawk->tok.loc;
-		val = parse_expr_withdc (hawk, &eloc);
-		if (val == HAWK_NULL) 
+		val = parse_expr_withdc(hawk, &eloc);
+		if (HAWK_UNLIKELY(!val)) 
 		{
 			hawk_freemem (hawk, nde);
 			return HAWK_NULL;
--- a/hawk/lib/run.c
+++ b/hawk/lib/run.c
@ -379,12 +379,12 @@ static int set_global (hawk_rtx_t* rtx, int idx, hawk_nde_var_t* var, hawk_val_t
 				HAWK_ASSERT (vtype != HAWK_VAL_REX);

 				out.type = HAWK_RTX_VALTOSTR_CPLDUP;
-				if (hawk_rtx_valtostr (rtx, val, &out) <= -1) return -1;
+				if (hawk_rtx_valtostr(rtx, val, &out) <= -1) return -1;
 				fs_ptr = out.u.cpldup.ptr;
 				fs_len = out.u.cpldup.len;
 			}

-			if (fs_len > 1 && !(fs_len == 5 && fs_ptr[0] == HAWK_T('?')))
+			if (fs_len > 1 && !(fs_len == 5 && fs_ptr[0] == '?'))
 			{
 				/* it's a regular expression if FS contains multiple characters.
 				 * however, it's not a regular expression if it's 5 character
--- a/hawk/lib/tre-compile.c
+++ b/hawk/lib/tre-compile.c
@ -1869,14 +1869,14 @@ tre_ast_to_tnfa(hawk_gem_t* gem, tre_ast_node_t *node, tre_tnfa_transition_t *tr
 }


-#define ERROR_EXIT(err)		  \
-  do				  \
-    {				  \
-      errcode = err;		  \
-      if (/*CONSTCOND*/1)	  \
-      	goto error_exit;	  \
-    }				  \
- while (/*CONSTCOND*/0)
+#define ERROR_EXIT(err) \
+	do \
+	{ \
+		errcode = err; \
+		if (/*CONSTCOND*/1) \
+			goto error_exit; \
+	} \
+	while (/*CONSTCOND*/0)


 int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
@ -1901,11 +1901,10 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
 /* HAWK: deleted limit on the stack size 
 	stack = tre_stack_new(preg->gem, 512, 10240, 128); */
 	stack = tre_stack_new(preg->gem, 512, -1, 128); 
-	if (!stack)
-		return REG_ESPACE;
+	if (HAWK_UNLIKELY(!stack)) return REG_ESPACE;
 	/* Allocate a fast memory allocator. */
 	mem = tre_mem_new(preg->gem);
-	if (!mem)
+	if (HAWK_UNLIKELY(!mem))
 	{
 		tre_stack_destroy(stack);
 		return REG_ESPACE;
@ -1921,8 +1920,7 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
 	parse_ctx.max_backref = -1;
 	DPRINT(("tre_compile: parsing '%.*" STRF "'\n", (int)n, regex));
 	errcode = tre_parse(&parse_ctx);
-	if (errcode != REG_OK)
-		ERROR_EXIT(errcode);
+	if (errcode != REG_OK) ERROR_EXIT(errcode);
 	preg->re_nsub = parse_ctx.submatch_id - 1;
 	tree = parse_ctx.result;

@ -1941,8 +1939,8 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags)

 	/* Allocate the TNFA struct. */
 	tnfa = xcalloc(preg->gem, 1, sizeof(tre_tnfa_t));
-	if (tnfa == NULL)
-		ERROR_EXIT(REG_ESPACE);
+	if (HAWK_UNLIKELY(!tnfa)) ERROR_EXIT(REG_ESPACE);
+
 	tnfa->have_backrefs = parse_ctx.max_backref >= 0;
 	tnfa->have_approx = parse_ctx.have_approx;
 	tnfa->num_submatches = parse_ctx.submatch_id;
@ -1966,26 +1964,21 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
 		{
 			tag_directions = xmalloc(preg->gem,sizeof(*tag_directions)
 			                         * (tnfa->num_tags + 1));
-			if (tag_directions == NULL)
-				ERROR_EXIT(REG_ESPACE);
+			if (tag_directions == NULL) ERROR_EXIT(REG_ESPACE);
 			tnfa->tag_directions = tag_directions;
-			HAWK_MEMSET(tag_directions, -1,
-			           sizeof(*tag_directions) * (tnfa->num_tags + 1));
+			HAWK_MEMSET(tag_directions, -1, sizeof(*tag_directions) * (tnfa->num_tags + 1));
 		}
 		tnfa->minimal_tags = xcalloc(preg->gem, (unsigned)tnfa->num_tags * 2 + 1,
 		                             sizeof(tnfa->minimal_tags));
 		if (tnfa->minimal_tags == NULL)
 			ERROR_EXIT(REG_ESPACE);

-		submatch_data = xcalloc(preg->gem,(unsigned)parse_ctx.submatch_id,
-		                        sizeof(*submatch_data));
-		if (submatch_data == NULL)
-			ERROR_EXIT(REG_ESPACE);
+		submatch_data = xcalloc(preg->gem,(unsigned)parse_ctx.submatch_id, sizeof(*submatch_data));
+		if (HAWK_UNLIKELY(!submatch_data)) ERROR_EXIT(REG_ESPACE);
 		tnfa->submatch_data = submatch_data;

 		errcode = tre_add_tags(mem, stack, tree, tnfa, 0);
-		if (errcode != REG_OK)
-			ERROR_EXIT(errcode);
+		if (errcode != REG_OK) ERROR_EXIT(errcode);

 #ifdef TRE_DEBUG
 		for (i = 0; i < parse_ctx.submatch_id; i++)
@ -1999,10 +1992,8 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
 	}

 	/* Expand iteration nodes. */
-	errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
-	                         tag_directions, &tnfa->params_depth);
-	if (errcode != REG_OK)
-		ERROR_EXIT(errcode);
+	errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position, tag_directions, &tnfa->params_depth);
+	if (errcode != REG_OK) ERROR_EXIT(errcode);

 	/* Add a dummy node for the final state.
 	   XXX - For certain patterns this dummy node can be optimized away,
@ -2010,12 +2001,10 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
 	   this possibility. */
 	tmp_ast_l = tree;
 	tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
-	if (tmp_ast_r == NULL)
-		ERROR_EXIT(REG_ESPACE);
+	if (HAWK_UNLIKELY(!tmp_ast_r)) ERROR_EXIT(REG_ESPACE);

 	tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
-	if (tree == NULL)
-		ERROR_EXIT(REG_ESPACE);
+	if (HAWK_UNLIKELY(!tree)) ERROR_EXIT(REG_ESPACE);

 #ifdef TRE_DEBUG
 	tre_ast_print(tree);
@ -2023,16 +2012,13 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
 #endif /* TRE_DEBUG */

 	errcode = tre_compute_nfl(mem, stack, tree);
-	if (errcode != REG_OK)
-		ERROR_EXIT(errcode);
+	if (errcode != REG_OK) ERROR_EXIT(errcode);

 	counts = xmalloc(preg->gem,sizeof(int) * parse_ctx.position);
-	if (counts == NULL)
-		ERROR_EXIT(REG_ESPACE);
+	if (HAWK_UNLIKELY(!counts)) ERROR_EXIT(REG_ESPACE);

 	offs = xmalloc(preg->gem,sizeof(int) * parse_ctx.position);
-	if (offs == NULL)
-		ERROR_EXIT(REG_ESPACE);
+	if (HAWK_UNLIKELY(!offs)) ERROR_EXIT(REG_ESPACE);

 	for (i = 0; i < parse_ctx.position; i++)
 		counts[i] = 0;
@ -2046,15 +2032,13 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
 		counts[i] = 0;
 	}
 	transitions = xcalloc(preg->gem, (unsigned)add + 1, sizeof(*transitions));
-	if (transitions == NULL)
-		ERROR_EXIT(REG_ESPACE);
+	if (HAWK_UNLIKELY(!transitions)) ERROR_EXIT(REG_ESPACE);
 	tnfa->transitions = transitions;
 	tnfa->num_transitions = add;

 	DPRINT(("Converting to TNFA:\n"));
 	errcode = tre_ast_to_tnfa(preg->gem, tree, transitions, counts, offs);
-	if (errcode != REG_OK)
-		ERROR_EXIT(errcode);
+	if (errcode != REG_OK) ERROR_EXIT(errcode);

 	/* If in eight bit mode, compute a table of characters that can be the
 	   first character of a match. */
@ -2145,8 +2129,7 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
 	}

 	initial = xcalloc(preg->gem, (unsigned)i + 1, sizeof(tre_tnfa_transition_t));
-	if (initial == NULL)
-		ERROR_EXIT(REG_ESPACE);
+	if (HAWK_UNLIKELY(!initial)) ERROR_EXIT(REG_ESPACE);
 	tnfa->initial = initial;

 	i = 0;
@ -2162,18 +2145,15 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
 			int j;
 			for (j = 0; p->tags[j] >= 0; j++);
 			initial[i].tags = xmalloc(preg->gem,sizeof(*p->tags) * (j + 1));
-			if (!initial[i].tags)
-				ERROR_EXIT(REG_ESPACE);
+			if (HAWK_UNLIKELY(!initial[i].tags)) ERROR_EXIT(REG_ESPACE);
 			HAWK_MEMCPY (initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
 		}
 		initial[i].params = NULL;
 		if (p->params)
 		{
 			initial[i].params = xmalloc(preg->gem,sizeof(*p->params) * TRE_PARAM_LAST);
-			if (!initial[i].params)
-				ERROR_EXIT(REG_ESPACE);
-			HAWK_MEMCPY (initial[i].params, p->params,
-			            sizeof(*p->params) * TRE_PARAM_LAST);
+			if (HAWK_UNLIKELY(!initial[i].params)) ERROR_EXIT(REG_ESPACE);
+			HAWK_MEMCPY (initial[i].params, p->params, sizeof(*p->params) * TRE_PARAM_LAST);
 		}
 		initial[i].assertions = p->assertions;
 		i++;
@ -2198,12 +2178,9 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
 error_exit:
 	/* Free everything that was allocated and return the error code. */
 	tre_mem_destroy(mem);
-	if (stack != NULL)
-		tre_stack_destroy(stack);
-	if (counts != NULL)
-		xfree(preg->gem,counts);
-	if (offs != NULL)
-		xfree(preg->gem,offs);
+	if (stack) tre_stack_destroy(stack);
+	if (counts) xfree(preg->gem,counts);
+	if (offs) xfree(preg->gem,offs);
 	preg->TRE_REGEX_T_FIELD = (void *)tnfa;
 	tre_free(preg);
 	return errcode;
--- a/hawk/lib/tre-match-ut.h
+++ b/hawk/lib/tre-match-ut.h
@ -64,83 +64,83 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 /* Wide character and multibyte support. */

-#define GET_NEXT_WCHAR()						      \
-  do {									      \
-    prev_c = next_c;							      \
-    if (type == STR_BYTE)						      \
-      {									      \
-	pos++;								      \
-	if (len >= 0 && pos >= len)					      \
-	  next_c = '\0';						      \
-	else								      \
-	  next_c = (unsigned char)(*str_byte++);			      \
-      }									      \
-    else if (type == STR_WIDE)						      \
-      {									      \
-	pos++;								      \
-	if (len >= 0 && pos >= len)					      \
-	  next_c = HAWK_T('\0');						      \
-	else								      \
-	  next_c = *str_wide++;						      \
-      }									      \
-    else if (type == STR_MBS)						      \
-      {									      \
-        pos += pos_add_next;					      	      \
-	if (str_byte == NULL)						      \
-	  next_c = HAWK_T('\0');						      \
-	else								      \
-	  {								      \
-	    size_t w;							      \
-	    int max;							      \
-	    if (len >= 0)						      \
-	      max = len - pos;						      \
-	    else							      \
-	      max = 32;							      \
-	    if (max <= 0)						      \
-	      {								      \
-		next_c = HAWK_T('\0');						      \
-		pos_add_next = 1;					      \
-	      }								      \
-	    else							      \
-	      {								      \
-		w = hawk_mbrtowc(str_byte, (size_t)max, &next_c, &mbstate);    \
-		if (w <= 0 || w > max)			      \
-		  return REG_NOMATCH;					      \
-		if (next_c == HAWK_T('\0') && len >= 0)					      \
-		  {							      \
-		    pos_add_next = 1;					      \
-		    next_c = 0;						      \
-		    str_byte++;						      \
-		  }							      \
-		else							      \
-		  {							      \
-		    pos_add_next = w;					      \
-		    str_byte += w;					      \
-		  }							      \
-	      }								      \
-	  }								      \
-      }									      \
-  } while(/*CONSTCOND*/0)
+#define GET_NEXT_WCHAR() \
+	do { \
+		prev_c = next_c; \
+		if (type == STR_BYTE) \
+		{ \
+			pos++; \
+			if (len >= 0 && pos >= len) \
+				next_c = '\0'; \
+			else \
+				next_c = (unsigned char)(*str_byte++); \
+		} \
+		else if (type == STR_WIDE) \
+		{ \
+			pos++; \
+			if (len >= 0 && pos >= len) \
+				next_c = '\0'; \
+			else \
+				next_c = *str_wide++; \
+		} \
+		else if (type == STR_MBS) \
+		{ \
+			pos += pos_add_next; \
+			if (str_byte == NULL) \
+				next_c = '\0'; \
+			else \
+			{  \
+				size_t w;  \
+				int max;  \
+				if (len >= 0)  \
+					max = len - pos;  \
+				else  \
+					max = 32;  \
+				if (max <= 0)  \
+				{  \
+					next_c = '\0';  \
+					pos_add_next = 1;  \
+				}  \
+				else  \
+				{  \
+					w = hawk_mbrtowc(str_byte, (size_t)max, &next_c, &mbstate); \
+					if (w <= 0 || w > max) \
+						return REG_NOMATCH;  \
+					if (next_c == '\0' && len >= 0) \
+					{ \
+						pos_add_next = 1; \
+						next_c = 0; \
+						str_byte++; \
+					} \
+					else \
+					{ \
+						pos_add_next = w; \
+						str_byte += w; \
+					} \
+				} \
+			} \
+		} \
+	} while(/*CONSTCOND*/0)

 #else /* !TRE_MULTIBYTE */

 /* Wide character support, no multibyte support. */

-#define GET_NEXT_WCHAR()						      \
-do {									      \
-	prev_c = next_c;							      \
-	if (type == STR_BYTE)						      \
-	{									      \
-		pos++;								      \
-		if (len >= 0 && pos >= len) next_c = HAWK_BT('\0'); \
-		else	next_c = (unsigned char)(*str_byte++);		  \
-      }									      \
-	else if (type == STR_WIDE)						      \
-	{									      \
-		pos++;								      \
-		if (len >= 0 && pos >= len) next_c = HAWK_T('\0');	\
-		else next_c = *str_wide++;					\
-      }									      \
+#define GET_NEXT_WCHAR() \
+do { \
+	prev_c = next_c;  \
+	if (type == STR_BYTE) \
+	{ \
+		pos++; \
+		if (len >= 0 && pos >= len) next_c = '\0'; \
+		else	next_c = (unsigned char)(*str_byte++);   \
+	}  \
+	else if (type == STR_WIDE) \
+	{ \
+		pos++; \
+		if (len >= 0 && pos >= len) next_c = '\0'; \
+		else next_c = *str_wide++; \
+	} \
 } while(/*CONSTCOND*/0)

 #endif /* !TRE_MULTIBYTE */
@ -166,22 +166,22 @@ do {									      \

 #define IS_WORD_CHAR(c)	 ((c) == HAWK_T('_') || tre_isalnum(c))

-#define CHECK_ASSERTIONS(assertions)					      \
-  (((assertions & ASSERT_AT_BOL)					      \
-    && (pos > 0 || reg_notbol)						      \
-    && (prev_c != HAWK_T('\n') || !reg_newline))				      \
-   || ((assertions & ASSERT_AT_EOL)					      \
-       && (next_c != HAWK_T('\0') || reg_noteol)				      \
-       && (next_c != HAWK_T('\n') || !reg_newline))				      \
-   || ((assertions & ASSERT_AT_BOW)					      \
-       && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c)))	              \
-   || ((assertions & ASSERT_AT_EOW)					      \
-       && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c)))		      \
-   || ((assertions & ASSERT_AT_WB)					      \
-       && (pos != 0 && next_c != HAWK_T('\0')					      \
-	   && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c)))		      \
-   || ((assertions & ASSERT_AT_WB_NEG)					      \
-       && (pos == 0 || next_c == HAWK_T('\0')					      \
+#define CHECK_ASSERTIONS(assertions) \
+  (((assertions & ASSERT_AT_BOL) \
+    && (pos > 0 || reg_notbol) \
+    && (prev_c != HAWK_T('\n') || !reg_newline)) \
+   || ((assertions & ASSERT_AT_EOL) \
+       && (next_c != HAWK_T('\0') || reg_noteol) \
+       && (next_c != HAWK_T('\n') || !reg_newline)) \
+   || ((assertions & ASSERT_AT_BOW) \
+       && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c))) \
+   || ((assertions & ASSERT_AT_EOW) \
+       && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c))) \
+   || ((assertions & ASSERT_AT_WB) \
+       && (pos != 0 && next_c != HAWK_T('\0') \
+	   && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c))) \
+   || ((assertions & ASSERT_AT_WB_NEG) \
+       && (pos == 0 || next_c == HAWK_T('\0') \
 	   || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c))))

 #define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)                             \
@ -191,7 +191,7 @@ do {									      \
    || ((trans_i->assertions & ASSERT_CHAR_CLASS)                             \
        && (tnfa->cflags & REG_ICASE)                                         \
        && !tre_isctype(tre_tolower((tre_cint_t)prev_c),trans_i->u.class)     \
-	&& !tre_isctype(tre_toupper((tre_cint_t)prev_c),trans_i->u.class))    \
+        && !tre_isctype(tre_toupper((tre_cint_t)prev_c),trans_i->u.class))    \
    || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG)                         \
        && tre_neg_char_classes_match(trans_i->neg_classes,(tre_cint_t)prev_c,\
                                      tnfa->cflags & REG_ICASE)))
@ -201,8 +201,7 @@ do {									      \

 /* Returns 1 if `t1' wins `t2', 0 otherwise. */
 HAWK_INLINE static int
-tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
-              int *t1, int *t2)
+tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions, int *t1, int *t2)
 {
 	int i;
 	for (i = 0; i < num_tags; i++)
--- a/hawk/lib/tre-prv.h
+++ b/hawk/lib/tre-prv.h
@ -169,11 +169,7 @@ SUBMATCH[4] = [defg]
 #define tre_tolower(c)  hawk_to_ooch_lower(c)
 #define tre_toupper(c)  hawk_to_ooch_upper(c)

-#if defined(HAWK_OOCH_IS_BCH) && (HAWK_SIZEOF_MCHAR_T == HAWK_SIZEOF_CHAR)
-	typedef unsigned char tre_char_t;
-#else
-	typedef hawk_ooch_t tre_char_t;
-#endif
+typedef hawk_ooch_t tre_char_t;
 typedef hawk_ooci_t tre_cint_t;

 #define size_t hawk_oow_t
--- a/hawk/t/h-002.hawk
+++ b/hawk/t/h-002.hawk
@ -291,6 +291,33 @@ function main()
 		ensure (a[2] === @b"Is",                               1, @SCRIPTNAME, @SCRIPTLINE);
 		ensure (a[3] === @b"Some",                             1, @SCRIPTNAME, @SCRIPTLINE);
 		ensure (a[4] === @b"Data",                             1, @SCRIPTNAME, @SCRIPTLINE);
+
+		ensure (split(@b"Here===Is=Some=====Data", a, /=+/),    4, @SCRIPTNAME, @SCRIPTLINE);
+		ensure (a[1] === @b"Here",                             1, @SCRIPTNAME, @SCRIPTLINE);
+		ensure (a[2] === @b"Is",                               1, @SCRIPTNAME, @SCRIPTLINE);
+		ensure (a[3] === @b"Some",                             1, @SCRIPTNAME, @SCRIPTLINE);
+		ensure (a[4] === @b"Data",                             1, @SCRIPTNAME, @SCRIPTLINE);
+
+		ensure (split("[Here] : [Is]  : [So\\]me] :[Da:ta]", a, "?:\\[]"), 4, @SCRIPTNAME, @SCRIPTLINE);
+		ensure (a[1] === "Here",                               1, @SCRIPTNAME, @SCRIPTLINE);
+		ensure (a[2] === "Is",                                 1, @SCRIPTNAME, @SCRIPTLINE);
+		ensure (a[3] === "So]me",                              1, @SCRIPTNAME, @SCRIPTLINE);
+		ensure (a[4] === "Da:ta",                              1, @SCRIPTNAME, @SCRIPTLINE);
+
+		ensure (split(@b"[Here] : [Is]  : [So\\]me] :[Da:ta]", a, "?:\\[]"), 4, @SCRIPTNAME, @SCRIPTLINE);
+		ensure (a[1] === @b"Here",                             1, @SCRIPTNAME, @SCRIPTLINE);
+		ensure (a[2] === @b"Is",                               1, @SCRIPTNAME, @SCRIPTLINE);
+		ensure (a[3] === @b"So]me",                            1, @SCRIPTNAME, @SCRIPTLINE);
+		ensure (a[4] === @b"Da:ta",                            1, @SCRIPTNAME, @SCRIPTLINE);
+
+		ensure (split("Here===Is=Some=====Data", a, ""),       23, @SCRIPTNAME, @SCRIPTLINE);
+
+		ensure (split("Here  Is Some   Data", a, / /),         7, @SCRIPTNAME, @SCRIPTLINE);
+		ensure (split("Here  Is Some   Data", a, " "),         4, @SCRIPTNAME, @SCRIPTLINE);
+		ensure (a[1] === "Here",                               1, @SCRIPTNAME, @SCRIPTLINE);
+		ensure (a[2] === "Is",                                 1, @SCRIPTNAME, @SCRIPTLINE);
+		ensure (a[3] === "Some",                               1, @SCRIPTNAME, @SCRIPTLINE);
+		ensure (a[4] === "Data",                               1, @SCRIPTNAME, @SCRIPTLINE);
 	}

 	print "SUCCESS";