Fixed a bug where the longest match for a multiple-character RS could not be found.

2011-04-28 08:04:13 +00:00
parent 8b63507b79
commit 40fad800f3
8 changed files with 183 additions and 92 deletions
--- a/qse/lib/awk/rio.c
+++ b/qse/lib/awk/rio.c
@@ -1,5 +1,5 @@
 /*
- * $Id: rio.c 441 2011-04-22 14:28:43Z hyunghwan.chung $
+ * $Id: rio.c 444 2011-04-27 14:04:13Z hyunghwan.chung $
 *
    Copyright 2006-2011 Chung, Hyung-Hwan.
    This file is part of QSE.
@@ -20,7 +20,7 @@
 #include "awk.h"
-enum
+enum io_mask_t
 {
 	MASK_READ  = 0x0100,
 	MASK_WRITE = 0x0200,
@@ -88,6 +88,75 @@ static int out_mask_map[] =
 	MASK_WRITE
 };
 /*
  * Matches the record separator pattern run->gbl.rs against the whole
  * record buffer and, when the match is acceptable, cuts the separator
  * off the end of the buffer.
  *
  * run - runtime context supplying awk, gbl.rs and gbl.ignorecase
  * buf - record buffer read so far; its length is reduced when a
  *       match is accepted
  * eof - non-zero when called at the end of input, in which case no
  *       new character has been appended to buf before the call
  *
  * Returns a negative number if the matcher failed (the error number
  * is passed to qse_awk_rtx_seterrnum), 0 if there is no acceptable
  * match, or a positive number if the separator has been stripped.
  */
 static QSE_INLINE int match_long_rs (qse_awk_rtx_t* run, qse_str_t* buf, int eof)
 {
 	qse_awk_errnum_t errnum;
 	qse_cstr_t match;
 	int n;
 	/* TODO: reduce the number of regular expression matches by calling
 	 *       match_long_rs() less often and reworking its code.
 	 *       it is currently invoked for each character added to buf,
 	 *       which is a very inefficient way of doing the job.
 	 */
 	QSE_ASSERT (run->gbl.rs != QSE_NULL);
 	n = QSE_AWK_MATCHREX (
 		run->awk, run->gbl.rs,
 		((run->gbl.ignorecase)? QSE_REX_IGNORECASE: 0),
 		QSE_STR_PTR(buf), QSE_STR_LEN(buf),
 		QSE_STR_PTR(buf), QSE_STR_LEN(buf),
 		&match, &errnum);
 	if (n <= -1)
 	{
 		/* the matcher itself failed. report why and hand the
 		 * failure code back unchanged */
 		qse_awk_rtx_seterrnum (run, errnum, QSE_NULL);
 		return n;
 	}
 	if (n == 0)
 	{
 		/* the separator does not occur in the buffer at all */
 		return 0;
 	}
 	if (eof)
 	{
 		/* at EOF the buffer is the same one that the previous
 		 * call examined as nothing has been appended to it.
 		 * a match found now must therefore end exactly at the
 		 * end of the buffer */
 		QSE_ASSERT (
 				match.ptr + match.len ==
 				QSE_STR_PTR(buf) + QSE_STR_LEN(buf));
 		/* remove the RS part only. there is no trailing
 		 * character after RS to remove in this case */
 		QSE_STR_LEN(buf) -= match.len;
 		return n;
 	}
 	/* the most recently read character is already in the buffer.
 	 * only a match that stops one character short of it is known
 	 * to be the longest one */
 	if (match.ptr + match.len + 1 != QSE_STR_PTR(buf) + QSE_STR_LEN(buf))
 	{
 		/* a match that does not end at the desired position is
 		 * not the longest match, so it is treated as no match */
 		return 0;
 	}
 	/* remove the RS part and the single character that follows it */
 	QSE_STR_LEN(buf) -= match.len + 1;
 	return n;
 }
 int qse_awk_rtx_readio (
 	qse_awk_rtx_t* run, int in_type,
 	const qse_char_t* name, qse_str_t* buf)
@@ -209,24 +278,26 @@ int qse_awk_rtx_readio (
 	rs = qse_awk_rtx_getgbl (run, QSE_AWK_GBL_RS);
 	qse_awk_rtx_refupval (run, rs);
-	if (rs->type == QSE_AWK_VAL_NIL)
+	switch (rs->type)
 	{
 		case QSE_AWK_VAL_NIL:
 			rs_ptr = QSE_NULL;
 			rs_len = 0;
-	}
+			break;
-	else if (rs->type == QSE_AWK_VAL_STR)
+
-	{
+		case QSE_AWK_VAL_STR:
 			rs_ptr = ((qse_awk_val_str_t*)rs)->ptr;
 			rs_len = ((qse_awk_val_str_t*)rs)->len;
-	}
+			break;
-	else 
+
-	{
+		default:
 			rs_ptr = qse_awk_rtx_valtocpldup (run, rs, &rs_len);
 			if (rs_ptr == QSE_NULL)
 			{
 				qse_awk_rtx_refdownval (run, rs);
 				return -1;
 			}
 			break;
 	}
 	ret = 1;
@@ -263,44 +334,24 @@ int qse_awk_rtx_readio (
 			if (n == 0) 
 			{
 				/* EOF reached */
 				p->in.eof = 1;
 				if (QSE_STR_LEN(buf) == 0) ret = 0;
 				else if (rs_len >= 2)
 				{
-					/* when RS is multiple characters, it needs to check
+					/* When RS is multiple characters, it should 
-					 * for the match at the end of the input stream as
+					 * check for the match at the end of the 
-					 * the buffer has been appened with the last character
+					 * input stream also because the previous 
-					 * after the previous matchrex has failed */
+					 * match could fail as it didn't end at the
-
+					 * desired position to be the longest match.
-					qse_cstr_t match;
+					 * At EOF, the match at the end is considered 
-					qse_awk_errnum_t errnum;
+					 * the longest as there are no more characters
-
+					 * left */
-					QSE_ASSERT (run->gbl.rs != QSE_NULL);
+					n = match_long_rs (run, buf, 1);
-
+					if (n != 0)
 					n = QSE_AWK_MATCHREX (
 						run->awk, run->gbl.rs, 
 						((run->gbl.ignorecase)? QSE_REX_IGNORECASE: 0),
 						QSE_STR_PTR(buf), QSE_STR_LEN(buf), 
 						QSE_STR_PTR(buf), QSE_STR_LEN(buf), 
 						&match, &errnum);
 					if (n <= -1)
 					{
-						qse_awk_rtx_seterrnum (run, errnum, QSE_NULL);
+						if (n <= -1) ret = -1;
 						ret = -1;
 						break;
 					}
 					if (n >= 1)
 					{
 						/* the match should be found at the end of
 						 * the current buffer */
 						QSE_ASSERT (
 							QSE_STR_PTR(buf) + QSE_STR_LEN(buf) ==
 							match.ptr + match.len);
 						/*QSE_STR_LEN(buf) -= match.len;*/
 						buf->len -= match.len;
 						break;
 					}
 				}
@@ -323,8 +374,7 @@ int qse_awk_rtx_readio (
 				if (pc == QSE_T('\r') && 
 				    QSE_STR_LEN(buf) > 0) 
 				{
-					/*QSE_STR_LEN(buf) -= 1;*/
+					QSE_STR_LEN(buf) -= 1;
 					buf->len -= 1;
 				}
 				break;
 			}
@@ -337,8 +387,7 @@ int qse_awk_rtx_readio (
 				if (pc == QSE_T('\r') && 
 				    QSE_STR_LEN(buf) > 0) 
 				{
-					/*QSE_STR_LEN(buf) -= 1;*/
+					QSE_STR_LEN(buf) -= 1;
 					buf->len -= 1;
 				}
 			}
@@ -367,43 +416,10 @@ int qse_awk_rtx_readio (
 		}
 		else
 		{
-			qse_cstr_t match;
+			/* I don't do anything here if RS is composed of
-			qse_awk_errnum_t errnum;
+			 * multiple characters. See the comment further down */
 /* TODO: minimize the number of regular expressoin match here...
 *       currently matchrex is called for each character added to buf.
 *       this is a very bad way of doing the job.
 */
 			QSE_ASSERT (run->gbl.rs != QSE_NULL);
 			n = QSE_AWK_MATCHREX (
 				run->awk, run->gbl.rs, 
 				((run->gbl.ignorecase)? QSE_REX_IGNORECASE: 0),
 				QSE_STR_PTR(buf), QSE_STR_LEN(buf), 
 				QSE_STR_PTR(buf), QSE_STR_LEN(buf), 
 				&match, &errnum);
 			if (n <= -1)
 			{
 				qse_awk_rtx_seterrnum (run, errnum, QSE_NULL);
 				ret = -1;
 				p->in.pos--; /* unread the character in c */
 				break;
 		}
 			if (n >= 1)
 			{
 				/* the match should be found at the end of
 				 * the current buffer */
 				QSE_ASSERT (
 					QSE_STR_PTR(buf) + QSE_STR_LEN(buf) ==
 					match.ptr + match.len);
 				/*QSE_STR_LEN(buf) -= match.len;*/
 				buf->len -= match.len;
 				p->in.pos--; /* unread the character in c */
 				break;
 			}
 		}
 		if (qse_str_ccat (buf, c) == (qse_size_t)-1)
 		{
@@ -412,6 +428,28 @@ int qse_awk_rtx_readio (
 			break;
 		}
 		if (rs_len >= 2)
 		{
 			/* if RS is composed of multiple characters,
 			 * I perform the matching after having added the
 			 * current character 'c' to the record buffer 'buf'
 			 * to find the longest match. If a match found ends
 			 * one character before this character just added
 			 * to the buffer, it is the longest match.
 			 */
 			/* TODO: change the way to find the longest match
 			 *       for performance improvement. currently,
 			 *       the function is called for every character
 			 *       added to the buffer. Stupid! */
 			n = match_long_rs (run, buf, 0);
 			if (n != 0)
 			{
 				p->in.pos--; /* unread the character in c */
 				if (n <= -1) ret = -1;
 				break;
 			}
 		}
 /* TODO: handle different line terminator like \r\n */
 		if (c == QSE_T('\n')) line_len = 0;
 		else line_len = line_len + 1;
--- a/qse/regress/awk/Makefile.am
+++ b/qse/regress/awk/Makefile.am
@@ -104,6 +104,7 @@ EXTRA_DIST = \
 	lang-040.awk \
 	lang-041.awk \
 	lang-042.awk \
 	lang-043.awk \
 	columnate.awk \
 	levenshtein.awk \
 	levenshtein-utests.awk \
@@ -117,6 +118,7 @@ EXTRA_DIST = \
 	lang-035.dat2 \
 	lang-036.dat \
 	lang-037.dat \
 	lang-043.dat \
 	adr.dat \
 	asm.dat \
 	cou.dat \
--- a/qse/regress/awk/Makefile.in
+++ b/qse/regress/awk/Makefile.in
@@ -293,6 +293,7 @@ EXTRA_DIST = \
 	lang-040.awk \
 	lang-041.awk \
 	lang-042.awk \
 	lang-043.awk \
 	columnate.awk \
 	levenshtein.awk \
 	levenshtein-utests.awk \
@@ -306,6 +307,7 @@ EXTRA_DIST = \
 	lang-035.dat2 \
 	lang-036.dat \
 	lang-037.dat \
 	lang-043.dat \
 	adr.dat \
 	asm.dat \
 	cou.dat \
--- a/qse/regress/awk/lang-043.awk
+++ b/qse/regress/awk/lang-043.awk
@@ -0,0 +1,7 @@
 # Regression script for a multiple-character (regular expression) RS.
 BEGIN {
 	# A record ends at one or more CR/LF characters together with
 	# any whitespace characters directly before and after them.
 	RS="[\t\n\v\f\r ]*[\r\n]+[\t\n\v\f\r ]*"
 } 
 # Print every record as it was split by RS.
 {
 	print $0
 }
--- a/qse/regress/awk/lang-043.dat
+++ b/qse/regress/awk/lang-043.dat
@@ -0,0 +1,13 @@
 abcd    
 		dcba			
           	 j
--- a/qse/regress/awk/regress.out
+++ b/qse/regress/awk/regress.out
@@ -2087,6 +2087,20 @@ IGNORECASE= 1
 1
 1
 --------------------------------------------------------------------------------
 [CMD]  qseawk  --newline=on -o- -f lang-043.awk lang-043.dat </dev/stdin 2>&1 
 --------------------------------------------------------------------------------
 BEGIN {
 	RS = "[	\n\v\f\r ]*[\r\n]+[	\n\v\f\r ]*";
 }
 {
 	print $0;
 }
 abcd
 dcba
 j
 --------------------------------------------------------------------------------
 [CMD]  qseawk  --newline=on -F: -f columnate.awk ./passwd.dat </dev/stdin 2>&1 
 --------------------------------------------------------------------------------
 root               x  0      0      root                                /root                       /bin/bash
--- a/qse/regress/awk/regress.out.xma
+++ b/qse/regress/awk/regress.out.xma
@@ -2087,6 +2087,20 @@ IGNORECASE= 1
 1
 1
 --------------------------------------------------------------------------------
 [CMD]  qseawk -m 500000 --newline=on -o- -f lang-043.awk lang-043.dat </dev/stdin 2>&1 
 --------------------------------------------------------------------------------
 BEGIN {
 	RS = "[	\n\v\f\r ]*[\r\n]+[	\n\v\f\r ]*";
 }
 {
 	print $0;
 }
 abcd
 dcba
 j
 --------------------------------------------------------------------------------
 [CMD]  qseawk -m 500000 --newline=on -F: -f columnate.awk ./passwd.dat </dev/stdin 2>&1 
 --------------------------------------------------------------------------------
 root               x  0      0      root                                /root                       /bin/bash
--- a/qse/regress/awk/regress.sh
+++ b/qse/regress/awk/regress.sh
@@ -165,6 +165,7 @@ PROGS="
 	lang-040.awk!!!--newline=on -o-
 	lang-041.awk!!!--newline=on -o-
 	lang-042.awk!!!--newline=on -o-
 	lang-043.awk!lang-043.dat!!--newline=on -o-
 	columnate.awk!./passwd.dat!!--newline=on -F:
 	levenshtein-utests.awk!!!--newline=on --include=on