Fixed a bug where the longest match could not be found for a multiple-character RS.

This commit is contained in:
hyung-hwan 2011-04-28 08:04:13 +00:00
parent 8b63507b79
commit 40fad800f3
8 changed files with 183 additions and 92 deletions

View File

@ -1,5 +1,5 @@
/*
* $Id: rio.c 441 2011-04-22 14:28:43Z hyunghwan.chung $
* $Id: rio.c 444 2011-04-27 14:04:13Z hyunghwan.chung $
*
Copyright 2006-2011 Chung, Hyung-Hwan.
This file is part of QSE.
@ -20,7 +20,7 @@
#include "awk.h"
enum
enum io_mask_t
{
MASK_READ = 0x0100,
MASK_WRITE = 0x0200,
@ -88,6 +88,75 @@ static int out_mask_map[] =
MASK_WRITE
};
/*
 * Matches the record separator regular expression (run->gbl.rs) against
 * the whole record buffer and, when an acceptable match is found, trims
 * the separator off the end of the buffer.
 *
 * run: runtime context holding the compiled RS and the ignorecase flag.
 * buf: record buffer accumulated so far; its length is reduced on success.
 * eof: non-zero when called at end of input, i.e. no new character was
 *      appended to buf since the previous call.
 *
 * Returns a value <= -1 on a matching error (the error number is set on
 * run), 0 when there is no acceptable match, and a value >= 1 when the
 * separator was matched and removed from buf.
 */
static QSE_INLINE int match_long_rs (qse_awk_rtx_t* run, qse_str_t* buf, int eof)
{
	qse_cstr_t hit;
	qse_awk_errnum_t err;
	int rc;

	/* TODO: this function is invoked for every single character
	 * appended to buf, so the regular expression is re-run over the
	 * whole buffer each time. Reduce the number of calls and rework
	 * the code accordingly. */

	QSE_ASSERT (run->gbl.rs != QSE_NULL);

	rc = QSE_AWK_MATCHREX (
		run->awk, run->gbl.rs,
		((run->gbl.ignorecase)? QSE_REX_IGNORECASE: 0),
		QSE_STR_PTR(buf), QSE_STR_LEN(buf),
		QSE_STR_PTR(buf), QSE_STR_LEN(buf),
		&hit, &err);

	if (rc <= -1)
	{
		/* matching failed: record the error and pass it up */
		qse_awk_rtx_seterrnum (run, err, QSE_NULL);
		return rc;
	}

	/* nothing matched: the buffer is left untouched */
	if (rc == 0) return rc;

	if (eof)
	{
		/* At EOF no character was appended before this call; the
		 * buffer is the same one the previous call examined. A match
		 * found now is expected to end exactly at the end of the
		 * buffer. */
		QSE_ASSERT (
			QSE_STR_PTR(buf) + QSE_STR_LEN(buf) ==
			hit.ptr + hit.len);

		/* Remove only the RS part. There is no trailing character
		 * to remove because EOF added nothing to the buffer. */
		QSE_STR_LEN(buf) -= hit.len;
		return rc;
	}

	/* Not at EOF: the most recently read character is already in the
	 * buffer. The match counts as the longest one only when it stops
	 * exactly one character short of the end of the buffer. */
	if (QSE_STR_PTR(buf) + QSE_STR_LEN(buf) != hit.ptr + hit.len + 1)
	{
		/* The match does not end at the required position, so it
		 * is not the longest match; report it as no match. */
		return 0;
	}

	/* Remove the RS part together with the one character after it. */
	QSE_STR_LEN(buf) -= hit.len + 1;
	return rc;
}
int qse_awk_rtx_readio (
qse_awk_rtx_t* run, int in_type,
const qse_char_t* name, qse_str_t* buf)
@ -209,24 +278,26 @@ int qse_awk_rtx_readio (
rs = qse_awk_rtx_getgbl (run, QSE_AWK_GBL_RS);
qse_awk_rtx_refupval (run, rs);
if (rs->type == QSE_AWK_VAL_NIL)
switch (rs->type)
{
case QSE_AWK_VAL_NIL:
rs_ptr = QSE_NULL;
rs_len = 0;
}
else if (rs->type == QSE_AWK_VAL_STR)
{
break;
case QSE_AWK_VAL_STR:
rs_ptr = ((qse_awk_val_str_t*)rs)->ptr;
rs_len = ((qse_awk_val_str_t*)rs)->len;
}
else
{
break;
default:
rs_ptr = qse_awk_rtx_valtocpldup (run, rs, &rs_len);
if (rs_ptr == QSE_NULL)
{
qse_awk_rtx_refdownval (run, rs);
return -1;
}
break;
}
ret = 1;
@ -263,44 +334,24 @@ int qse_awk_rtx_readio (
if (n == 0)
{
/* EOF reached */
p->in.eof = 1;
if (QSE_STR_LEN(buf) == 0) ret = 0;
else if (rs_len >= 2)
{
/* when RS is multiple characters, it needs to check
* for the match at the end of the input stream as
* the buffer has been appened with the last character
* after the previous matchrex has failed */
qse_cstr_t match;
qse_awk_errnum_t errnum;
QSE_ASSERT (run->gbl.rs != QSE_NULL);
n = QSE_AWK_MATCHREX (
run->awk, run->gbl.rs,
((run->gbl.ignorecase)? QSE_REX_IGNORECASE: 0),
QSE_STR_PTR(buf), QSE_STR_LEN(buf),
QSE_STR_PTR(buf), QSE_STR_LEN(buf),
&match, &errnum);
if (n <= -1)
/* When RS is multiple characters, it should
* check for the match at the end of the
* input stream also because the previous
* match could fail as it didn't end at the
* desired position to be the longest match.
* At EOF, the match at the end is considered
* the longest as there are no more characters
* left */
n = match_long_rs (run, buf, 1);
if (n != 0)
{
qse_awk_rtx_seterrnum (run, errnum, QSE_NULL);
ret = -1;
break;
}
if (n >= 1)
{
/* the match should be found at the end of
* the current buffer */
QSE_ASSERT (
QSE_STR_PTR(buf) + QSE_STR_LEN(buf) ==
match.ptr + match.len);
/*QSE_STR_LEN(buf) -= match.len;*/
buf->len -= match.len;
if (n <= -1) ret = -1;
break;
}
}
@ -323,8 +374,7 @@ int qse_awk_rtx_readio (
if (pc == QSE_T('\r') &&
QSE_STR_LEN(buf) > 0)
{
/*QSE_STR_LEN(buf) -= 1;*/
buf->len -= 1;
QSE_STR_LEN(buf) -= 1;
}
break;
}
@ -337,8 +387,7 @@ int qse_awk_rtx_readio (
if (pc == QSE_T('\r') &&
QSE_STR_LEN(buf) > 0)
{
/*QSE_STR_LEN(buf) -= 1;*/
buf->len -= 1;
QSE_STR_LEN(buf) -= 1;
}
}
@ -367,43 +416,10 @@ int qse_awk_rtx_readio (
}
else
{
qse_cstr_t match;
qse_awk_errnum_t errnum;
/* TODO: minimize the number of regular expressoin match here...
* currently matchrex is called for each character added to buf.
* this is a very bad way of doing the job.
*/
QSE_ASSERT (run->gbl.rs != QSE_NULL);
n = QSE_AWK_MATCHREX (
run->awk, run->gbl.rs,
((run->gbl.ignorecase)? QSE_REX_IGNORECASE: 0),
QSE_STR_PTR(buf), QSE_STR_LEN(buf),
QSE_STR_PTR(buf), QSE_STR_LEN(buf),
&match, &errnum);
if (n <= -1)
{
qse_awk_rtx_seterrnum (run, errnum, QSE_NULL);
ret = -1;
p->in.pos--; /* unread the character in c */
break;
/* I don't do anything here if RS is composed of
* multiple characters. See the comment further down */
}
if (n >= 1)
{
/* the match should be found at the end of
* the current buffer */
QSE_ASSERT (
QSE_STR_PTR(buf) + QSE_STR_LEN(buf) ==
match.ptr + match.len);
/*QSE_STR_LEN(buf) -= match.len;*/
buf->len -= match.len;
p->in.pos--; /* unread the character in c */
break;
}
}
if (qse_str_ccat (buf, c) == (qse_size_t)-1)
{
@ -412,7 +428,29 @@ int qse_awk_rtx_readio (
break;
}
/* TODO: handle different line terminator like \r\n */
if (rs_len >= 2)
{
/* if RS is composed of multiple characters,
* I perform the matching after having added the
* current character 'c' to the record buffer 'buf'
* to find the longest match. If a match found ends
* one character before this character just added
* to the buffer, it is the longest match.
*/
/* TODO: change the way to find the longest match
* for performance improvement. currently,
* the function is called for every character
* added to the buffer. Stupid! */
n = match_long_rs (run, buf, 0);
if (n != 0)
{
p->in.pos--; /* unread the character in c */
if (n <= -1) ret = -1;
break;
}
}
/* TODO: handle different line terminator like \r\n */
if (c == QSE_T('\n')) line_len = 0;
else line_len = line_len + 1;
}

View File

@ -104,6 +104,7 @@ EXTRA_DIST = \
lang-040.awk \
lang-041.awk \
lang-042.awk \
lang-043.awk \
columnate.awk \
levenshtein.awk \
levenshtein-utests.awk \
@ -117,6 +118,7 @@ EXTRA_DIST = \
lang-035.dat2 \
lang-036.dat \
lang-037.dat \
lang-043.dat \
adr.dat \
asm.dat \
cou.dat \

View File

@ -293,6 +293,7 @@ EXTRA_DIST = \
lang-040.awk \
lang-041.awk \
lang-042.awk \
lang-043.awk \
columnate.awk \
levenshtein.awk \
levenshtein-utests.awk \
@ -306,6 +307,7 @@ EXTRA_DIST = \
lang-035.dat2 \
lang-036.dat \
lang-037.dat \
lang-043.dat \
adr.dat \
asm.dat \
cou.dat \

View File

@ -0,0 +1,7 @@
# Test script for a multiple-character record separator.
# RS is a regular expression: one or more CR/LF characters, optionally
# preceded and followed by any run of whitespace characters.
BEGIN {
RS="[\t\n\v\f\r ]*[\r\n]+[\t\n\v\f\r ]*"
}
# Print every record as it was split by RS.
{
print $0
}

View File

@ -0,0 +1,13 @@
abcd
dcba
j

View File

@ -2087,6 +2087,20 @@ IGNORECASE= 1
1
1
--------------------------------------------------------------------------------
[CMD] qseawk --newline=on -o- -f lang-043.awk lang-043.dat </dev/stdin 2>&1
--------------------------------------------------------------------------------
BEGIN {
RS = "[ \n\v\f\r ]*[\r\n]+[ \n\v\f\r ]*";
}
{
print $0;
}
abcd
dcba
j
--------------------------------------------------------------------------------
[CMD] qseawk --newline=on -F: -f columnate.awk ./passwd.dat </dev/stdin 2>&1
--------------------------------------------------------------------------------
root x 0 0 root /root /bin/bash

View File

@ -2087,6 +2087,20 @@ IGNORECASE= 1
1
1
--------------------------------------------------------------------------------
[CMD] qseawk -m 500000 --newline=on -o- -f lang-043.awk lang-043.dat </dev/stdin 2>&1
--------------------------------------------------------------------------------
BEGIN {
RS = "[ \n\v\f\r ]*[\r\n]+[ \n\v\f\r ]*";
}
{
print $0;
}
abcd
dcba
j
--------------------------------------------------------------------------------
[CMD] qseawk -m 500000 --newline=on -F: -f columnate.awk ./passwd.dat </dev/stdin 2>&1
--------------------------------------------------------------------------------
root x 0 0 root /root /bin/bash

View File

@ -165,6 +165,7 @@ PROGS="
lang-040.awk!!!--newline=on -o-
lang-041.awk!!!--newline=on -o-
lang-042.awk!!!--newline=on -o-
lang-043.awk!lang-043.dat!!--newline=on -o-
columnate.awk!./passwd.dat!!--newline=on -F:
levenshtein-utests.awk!!!--newline=on --include=on