Fixed the bug of not being able to find the longest match for a multiple-character RS.

2011-04-28 08:04:13 +00:00
parent 8b63507b79
commit 40fad800f3
8 changed files with 183 additions and 92 deletions
--- a/qse/lib/awk/rio.c
+++ b/qse/lib/awk/rio.c
@@ -1,5 +1,5 @@
 /*
- * $Id: rio.c 441 2011-04-22 14:28:43Z hyunghwan.chung $
+ * $Id: rio.c 444 2011-04-27 14:04:13Z hyunghwan.chung $
 *
    Copyright 2006-2011 Chung, Hyung-Hwan.
    This file is part of QSE.
@@ -20,7 +20,7 @@

 #include "awk.h"

-enum
+enum io_mask_t
 {
 	MASK_READ  = 0x0100,
 	MASK_WRITE = 0x0200,
@@ -88,6 +88,75 @@ static int out_mask_map[] =
 	MASK_WRITE
 };

+static QSE_INLINE int match_long_rs (qse_awk_rtx_t* run, qse_str_t* buf, int eof)
+{
+	qse_cstr_t match;
+	qse_awk_errnum_t errnum;
+	int n;
+
+/* TODO: minimize the number of regular expression matches by minimizing the
+ *       calls to match_long_rs() and changing its code.
+ *       currently it is called for each character added to buf.
+ *       this is a very bad way of doing the job.
+ */
+	QSE_ASSERT (run->gbl.rs != QSE_NULL);
+
+	n = QSE_AWK_MATCHREX (
+		run->awk, run->gbl.rs,
+		((run->gbl.ignorecase)? QSE_REX_IGNORECASE: 0),
+		QSE_STR_PTR(buf), QSE_STR_LEN(buf),
+		QSE_STR_PTR(buf), QSE_STR_LEN(buf),
+		&match, &errnum);
+	if (n <= -1)
+	{
+		qse_awk_rtx_seterrnum (run, errnum, QSE_NULL);
+	}
+	else if (n >= 1)
+	{
+		if (eof)
+		{
+			/* when EOF is reached, no new character is
+			 * added to the record buffer. This function is
+			 * just called again with the same record buffer
+			 * as the previous call to this function.
+			 * A match in this case must end at the end of
+			 * the current record buffer */
+			QSE_ASSERT (
+					QSE_STR_PTR(buf) + QSE_STR_LEN(buf) ==
+					match.ptr + match.len);
+
+			/* drop the RS part. no extra character after RS to drop
+			 * because we're at EOF and the EOF condition didn't
+			 * add a new character to the buffer before the call
+			 * to this function.
+			 */
+			QSE_STR_LEN(buf) -= match.len;
+		}
+		else
+		{
+			/* the last character read so far has been added
+			 * to the record before the call to this function.
+			 * if the match is found and it ends one character
+			 * before this last character, it is the longest
+			 * match.
+			 */
+			if (QSE_STR_PTR(buf) + QSE_STR_LEN(buf) == match.ptr + match.len + 1)
+			{
+				/* drop the RS part and the one extra character after RS */
+				QSE_STR_LEN(buf) -= match.len + 1;
+			}
+			else
+			{
+				/* if the match does not end at the desired position,
+				 * it is no match as it is not the longest match */
+				n = 0;
+			}
+		}
+	}
+
+	return n;
+}
+
 int qse_awk_rtx_readio (
 	qse_awk_rtx_t* run, int in_type,
 	const qse_char_t* name, qse_str_t* buf)
@@ -209,24 +278,26 @@ int qse_awk_rtx_readio (
 	rs = qse_awk_rtx_getgbl (run, QSE_AWK_GBL_RS);
 	qse_awk_rtx_refupval (run, rs);

-	if (rs->type == QSE_AWK_VAL_NIL)
+	switch (rs->type)
 	{
+		case QSE_AWK_VAL_NIL:
 			rs_ptr = QSE_NULL;
 			rs_len = 0;
-	}
-	else if (rs->type == QSE_AWK_VAL_STR)
-	{
+			break;
+
+		case QSE_AWK_VAL_STR:
 			rs_ptr = ((qse_awk_val_str_t*)rs)->ptr;
 			rs_len = ((qse_awk_val_str_t*)rs)->len;
-	}
-	else 
-	{
+			break;
+
+		default:
 			rs_ptr = qse_awk_rtx_valtocpldup (run, rs, &rs_len);
 			if (rs_ptr == QSE_NULL)
 			{
 				qse_awk_rtx_refdownval (run, rs);
 				return -1;
 			}
+			break;
 	}

 	ret = 1;
@@ -263,44 +334,24 @@ int qse_awk_rtx_readio (

 			if (n == 0) 
 			{
+				/* EOF reached */
 				p->in.eof = 1;

 				if (QSE_STR_LEN(buf) == 0) ret = 0;
 				else if (rs_len >= 2)
 				{
-					/* when RS is multiple characters, it needs to check
-					 * for the match at the end of the input stream as
-					 * the buffer has been appened with the last character
-					 * after the previous matchrex has failed */
-
-					qse_cstr_t match;
-					qse_awk_errnum_t errnum;
-
-					QSE_ASSERT (run->gbl.rs != QSE_NULL);
-
-					n = QSE_AWK_MATCHREX (
-						run->awk, run->gbl.rs, 
-						((run->gbl.ignorecase)? QSE_REX_IGNORECASE: 0),
-						QSE_STR_PTR(buf), QSE_STR_LEN(buf), 
-						QSE_STR_PTR(buf), QSE_STR_LEN(buf), 
-						&match, &errnum);
-					if (n <= -1)
+					/* When RS is multiple characters, it should 
+					 * check for the match at the end of the 
+					 * input stream also because the previous 
+					 * match could fail as it didn't end at the
+					 * desired position to be the longest match.
+					 * At EOF, the match at the end is considered 
+					 * the longest as there are no more characters
+					 * left */
+					n = match_long_rs (run, buf, 1);
+					if (n != 0)
 					{
-						qse_awk_rtx_seterrnum (run, errnum, QSE_NULL);
-						ret = -1;
-						break;
-					}
-
-					if (n >= 1)
-					{
-						/* the match should be found at the end of
-						 * the current buffer */
-						QSE_ASSERT (
-							QSE_STR_PTR(buf) + QSE_STR_LEN(buf) ==
-							match.ptr + match.len);
-
-						/*QSE_STR_LEN(buf) -= match.len;*/
-						buf->len -= match.len;
+						if (n <= -1) ret = -1;
 						break;
 					}
 				}
@@ -323,8 +374,7 @@ int qse_awk_rtx_readio (
 				if (pc == QSE_T('\r') && 
 				    QSE_STR_LEN(buf) > 0) 
 				{
-					/*QSE_STR_LEN(buf) -= 1;*/
-					buf->len -= 1;
+					QSE_STR_LEN(buf) -= 1;
 				}
 				break;
 			}
@@ -337,8 +387,7 @@ int qse_awk_rtx_readio (
 				if (pc == QSE_T('\r') && 
 				    QSE_STR_LEN(buf) > 0) 
 				{
-					/*QSE_STR_LEN(buf) -= 1;*/
-					buf->len -= 1;
+					QSE_STR_LEN(buf) -= 1;
 				}
 			}

@@ -367,43 +416,10 @@ int qse_awk_rtx_readio (
 		}
 		else
 		{
-			qse_cstr_t match;
-			qse_awk_errnum_t errnum;
-
-/* TODO: minimize the number of regular expressoin match here...
- *       currently matchrex is called for each character added to buf.
- *       this is a very bad way of doing the job.
- */
-			QSE_ASSERT (run->gbl.rs != QSE_NULL);
-
-			n = QSE_AWK_MATCHREX (
-				run->awk, run->gbl.rs, 
-				((run->gbl.ignorecase)? QSE_REX_IGNORECASE: 0),
-				QSE_STR_PTR(buf), QSE_STR_LEN(buf), 
-				QSE_STR_PTR(buf), QSE_STR_LEN(buf), 
-				&match, &errnum);
-			if (n <= -1)
-			{
-				qse_awk_rtx_seterrnum (run, errnum, QSE_NULL);
-				ret = -1;
-				p->in.pos--; /* unread the character in c */
-				break;
+			/* I don't do anything here if RS is composed of
+			 * multiple characters. See the comment further down */
 		}

-			if (n >= 1)
-			{
-				/* the match should be found at the end of
-				 * the current buffer */
-				QSE_ASSERT (
-					QSE_STR_PTR(buf) + QSE_STR_LEN(buf) ==
-					match.ptr + match.len);
-
-				/*QSE_STR_LEN(buf) -= match.len;*/
-				buf->len -= match.len;
-				p->in.pos--; /* unread the character in c */
-				break;
-			}
-		}

 		if (qse_str_ccat (buf, c) == (qse_size_t)-1)
 		{
@@ -412,7 +428,29 @@ int qse_awk_rtx_readio (
 			break;
 		}

-		/* TODO: handle different line terminator like \r\n */
+		if (rs_len >= 2)
+		{
+			/* if RS is composed of multiple characters,
+			 * I perform the matching after having added the
+			 * current character 'c' to the record buffer 'buf'
+			 * to find the longest match. If a match found ends
+			 * one character before this character just added
+			 * to the buffer, it is the longest match.
+			 */
+			/* TODO: change the way to find the longest match
+			 *       for performance improvement. currently,
+			 *       the function is called for every character
+			 *       added to the buffer. Stupid! */
+			n = match_long_rs (run, buf, 0);
+			if (n != 0)
+			{
+				p->in.pos--; /* unread the character in c */
+				if (n <= -1) ret = -1;
+				break;
+			}
+		}
+
+/* TODO: handle different line terminator like \r\n */
 		if (c == QSE_T('\n')) line_len = 0;
 		else line_len = line_len + 1;
 	}
--- a/qse/regress/awk/Makefile.am
+++ b/qse/regress/awk/Makefile.am
@@ -104,6 +104,7 @@ EXTRA_DIST = \
 	lang-040.awk \
 	lang-041.awk \
 	lang-042.awk \
+	lang-043.awk \
 	columnate.awk \
 	levenshtein.awk \
 	levenshtein-utests.awk \
@@ -117,6 +118,7 @@ EXTRA_DIST = \
 	lang-035.dat2 \
 	lang-036.dat \
 	lang-037.dat \
+	lang-043.dat \
 	adr.dat \
 	asm.dat \
 	cou.dat \
--- a/qse/regress/awk/Makefile.in
+++ b/qse/regress/awk/Makefile.in
@@ -293,6 +293,7 @@ EXTRA_DIST = \
 	lang-040.awk \
 	lang-041.awk \
 	lang-042.awk \
+	lang-043.awk \
 	columnate.awk \
 	levenshtein.awk \
 	levenshtein-utests.awk \
@@ -306,6 +307,7 @@ EXTRA_DIST = \
 	lang-035.dat2 \
 	lang-036.dat \
 	lang-037.dat \
+	lang-043.dat \
 	adr.dat \
 	asm.dat \
 	cou.dat \
--- a/qse/regress/awk/lang-043.awk
+++ b/qse/regress/awk/lang-043.awk
@@ -0,0 +1,7 @@
+BEGIN {
+	RS="[\t\n\v\f\r ]*[\r\n]+[\t\n\v\f\r ]*"
+} 
+
+{
+	print $0
+}
--- a/qse/regress/awk/lang-043.dat
+++ b/qse/regress/awk/lang-043.dat
@@ -0,0 +1,13 @@
+abcd    
+
+
+
+
+		dcba			
+
+
+
+
+
+
+           	 j
--- a/qse/regress/awk/regress.out
+++ b/qse/regress/awk/regress.out
@@ -2087,6 +2087,20 @@ IGNORECASE= 1
 1
 1
 --------------------------------------------------------------------------------
+[CMD]  qseawk  --newline=on -o- -f lang-043.awk lang-043.dat </dev/stdin 2>&1 
+--------------------------------------------------------------------------------
+BEGIN {
+	RS = "[	\n\v\f\r ]*[\r\n]+[	\n\v\f\r ]*";
+}
+
+{
+	print $0;
+}
+
+abcd
+dcba
+j
+--------------------------------------------------------------------------------
 [CMD]  qseawk  --newline=on -F: -f columnate.awk ./passwd.dat </dev/stdin 2>&1 
 --------------------------------------------------------------------------------
 root               x  0      0      root                                /root                       /bin/bash
--- a/qse/regress/awk/regress.out.xma
+++ b/qse/regress/awk/regress.out.xma
@@ -2087,6 +2087,20 @@ IGNORECASE= 1
 1
 1
 --------------------------------------------------------------------------------
+[CMD]  qseawk -m 500000 --newline=on -o- -f lang-043.awk lang-043.dat </dev/stdin 2>&1 
+--------------------------------------------------------------------------------
+BEGIN {
+	RS = "[	\n\v\f\r ]*[\r\n]+[	\n\v\f\r ]*";
+}
+
+{
+	print $0;
+}
+
+abcd
+dcba
+j
+--------------------------------------------------------------------------------
 [CMD]  qseawk -m 500000 --newline=on -F: -f columnate.awk ./passwd.dat </dev/stdin 2>&1 
 --------------------------------------------------------------------------------
 root               x  0      0      root                                /root                       /bin/bash
--- a/qse/regress/awk/regress.sh
+++ b/qse/regress/awk/regress.sh
@@ -165,6 +165,7 @@ PROGS="
 	lang-040.awk!!!--newline=on -o-
 	lang-041.awk!!!--newline=on -o-
 	lang-042.awk!!!--newline=on -o-
+	lang-043.awk!lang-043.dat!!--newline=on -o-

 	columnate.awk!./passwd.dat!!--newline=on -F:
 	levenshtein-utests.awk!!!--newline=on --include=on