fixed the bug of not able to find the longest match for a multiple-character RS.
This commit is contained in:
parent
8b63507b79
commit
40fad800f3
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* $Id: rio.c 441 2011-04-22 14:28:43Z hyunghwan.chung $
|
||||
* $Id: rio.c 444 2011-04-27 14:04:13Z hyunghwan.chung $
|
||||
*
|
||||
Copyright 2006-2011 Chung, Hyung-Hwan.
|
||||
This file is part of QSE.
|
||||
@ -20,7 +20,7 @@
|
||||
|
||||
#include "awk.h"
|
||||
|
||||
enum
|
||||
enum io_mask_t
|
||||
{
|
||||
MASK_READ = 0x0100,
|
||||
MASK_WRITE = 0x0200,
|
||||
@ -88,6 +88,75 @@ static int out_mask_map[] =
|
||||
MASK_WRITE
|
||||
};
|
||||
|
||||
static QSE_INLINE int match_long_rs (qse_awk_rtx_t* run, qse_str_t* buf, int eof)
|
||||
{
|
||||
qse_cstr_t match;
|
||||
qse_awk_errnum_t errnum;
|
||||
int n;
|
||||
|
||||
/* TODO: minimize the number of regular expression match by minimizing the call
|
||||
* to match_long_rs() and changing its code.
|
||||
* currently it is called for each character added to buf.
|
||||
* this is a very bad way of doing the job.
|
||||
*/
|
||||
QSE_ASSERT (run->gbl.rs != QSE_NULL);
|
||||
|
||||
n = QSE_AWK_MATCHREX (
|
||||
run->awk, run->gbl.rs,
|
||||
((run->gbl.ignorecase)? QSE_REX_IGNORECASE: 0),
|
||||
QSE_STR_PTR(buf), QSE_STR_LEN(buf),
|
||||
QSE_STR_PTR(buf), QSE_STR_LEN(buf),
|
||||
&match, &errnum);
|
||||
if (n <= -1)
|
||||
{
|
||||
qse_awk_rtx_seterrnum (run, errnum, QSE_NULL);
|
||||
}
|
||||
else if (n >= 1)
|
||||
{
|
||||
if (eof)
|
||||
{
|
||||
/* when EOF is reached, the record buffer
|
||||
* is not added with a new character. It's
|
||||
* just called again with the same record buffer
|
||||
* as the previous call to this function.
|
||||
* A match in this case must end at the end of
|
||||
* the current record buffer */
|
||||
QSE_ASSERT (
|
||||
QSE_STR_PTR(buf) + QSE_STR_LEN(buf) ==
|
||||
match.ptr + match.len);
|
||||
|
||||
/* drop the RS part. no extra character after RS to drop
|
||||
* because we're at EOF and the EOF condition didn't
|
||||
* add a new character to the buffer before the call
|
||||
* to this function.
|
||||
*/
|
||||
QSE_STR_LEN(buf) -= match.len;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* the last character read so far has been added
|
||||
* to the record before the call to this function.
|
||||
* if the match is found and it ends one character
|
||||
* before this last character, it is the longest
|
||||
* match.
|
||||
*/
|
||||
if (QSE_STR_PTR(buf) + QSE_STR_LEN(buf) == match.ptr + match.len + 1)
|
||||
{
|
||||
/* drop the RS part and the last one character after RS */
|
||||
QSE_STR_LEN(buf) -= match.len + 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* if the match does not ends at the desired position,
|
||||
* it is no match as it is not the longest match */
|
||||
n = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
int qse_awk_rtx_readio (
|
||||
qse_awk_rtx_t* run, int in_type,
|
||||
const qse_char_t* name, qse_str_t* buf)
|
||||
@ -209,24 +278,26 @@ int qse_awk_rtx_readio (
|
||||
rs = qse_awk_rtx_getgbl (run, QSE_AWK_GBL_RS);
|
||||
qse_awk_rtx_refupval (run, rs);
|
||||
|
||||
if (rs->type == QSE_AWK_VAL_NIL)
|
||||
switch (rs->type)
|
||||
{
|
||||
rs_ptr = QSE_NULL;
|
||||
rs_len = 0;
|
||||
}
|
||||
else if (rs->type == QSE_AWK_VAL_STR)
|
||||
{
|
||||
rs_ptr = ((qse_awk_val_str_t*)rs)->ptr;
|
||||
rs_len = ((qse_awk_val_str_t*)rs)->len;
|
||||
}
|
||||
else
|
||||
{
|
||||
rs_ptr = qse_awk_rtx_valtocpldup (run, rs, &rs_len);
|
||||
if (rs_ptr == QSE_NULL)
|
||||
{
|
||||
qse_awk_rtx_refdownval (run, rs);
|
||||
return -1;
|
||||
}
|
||||
case QSE_AWK_VAL_NIL:
|
||||
rs_ptr = QSE_NULL;
|
||||
rs_len = 0;
|
||||
break;
|
||||
|
||||
case QSE_AWK_VAL_STR:
|
||||
rs_ptr = ((qse_awk_val_str_t*)rs)->ptr;
|
||||
rs_len = ((qse_awk_val_str_t*)rs)->len;
|
||||
break;
|
||||
|
||||
default:
|
||||
rs_ptr = qse_awk_rtx_valtocpldup (run, rs, &rs_len);
|
||||
if (rs_ptr == QSE_NULL)
|
||||
{
|
||||
qse_awk_rtx_refdownval (run, rs);
|
||||
return -1;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
ret = 1;
|
||||
@ -263,44 +334,24 @@ int qse_awk_rtx_readio (
|
||||
|
||||
if (n == 0)
|
||||
{
|
||||
/* EOF reached */
|
||||
p->in.eof = 1;
|
||||
|
||||
if (QSE_STR_LEN(buf) == 0) ret = 0;
|
||||
else if (rs_len >= 2)
|
||||
{
|
||||
/* when RS is multiple characters, it needs to check
|
||||
* for the match at the end of the input stream as
|
||||
* the buffer has been appened with the last character
|
||||
* after the previous matchrex has failed */
|
||||
|
||||
qse_cstr_t match;
|
||||
qse_awk_errnum_t errnum;
|
||||
|
||||
QSE_ASSERT (run->gbl.rs != QSE_NULL);
|
||||
|
||||
n = QSE_AWK_MATCHREX (
|
||||
run->awk, run->gbl.rs,
|
||||
((run->gbl.ignorecase)? QSE_REX_IGNORECASE: 0),
|
||||
QSE_STR_PTR(buf), QSE_STR_LEN(buf),
|
||||
QSE_STR_PTR(buf), QSE_STR_LEN(buf),
|
||||
&match, &errnum);
|
||||
if (n <= -1)
|
||||
/* When RS is multiple characters, it should
|
||||
* check for the match at the end of the
|
||||
* input stream also because the previous
|
||||
* match could fail as it didn't end at the
|
||||
* desired position to be the longest match.
|
||||
* At EOF, the match at the end is considered
|
||||
* the longest as there are no more characters
|
||||
* left */
|
||||
n = match_long_rs (run, buf, 1);
|
||||
if (n != 0)
|
||||
{
|
||||
qse_awk_rtx_seterrnum (run, errnum, QSE_NULL);
|
||||
ret = -1;
|
||||
break;
|
||||
}
|
||||
|
||||
if (n >= 1)
|
||||
{
|
||||
/* the match should be found at the end of
|
||||
* the current buffer */
|
||||
QSE_ASSERT (
|
||||
QSE_STR_PTR(buf) + QSE_STR_LEN(buf) ==
|
||||
match.ptr + match.len);
|
||||
|
||||
/*QSE_STR_LEN(buf) -= match.len;*/
|
||||
buf->len -= match.len;
|
||||
if (n <= -1) ret = -1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -323,8 +374,7 @@ int qse_awk_rtx_readio (
|
||||
if (pc == QSE_T('\r') &&
|
||||
QSE_STR_LEN(buf) > 0)
|
||||
{
|
||||
/*QSE_STR_LEN(buf) -= 1;*/
|
||||
buf->len -= 1;
|
||||
QSE_STR_LEN(buf) -= 1;
|
||||
}
|
||||
break;
|
||||
}
|
||||
@ -337,8 +387,7 @@ int qse_awk_rtx_readio (
|
||||
if (pc == QSE_T('\r') &&
|
||||
QSE_STR_LEN(buf) > 0)
|
||||
{
|
||||
/*QSE_STR_LEN(buf) -= 1;*/
|
||||
buf->len -= 1;
|
||||
QSE_STR_LEN(buf) -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
@ -367,44 +416,11 @@ int qse_awk_rtx_readio (
|
||||
}
|
||||
else
|
||||
{
|
||||
qse_cstr_t match;
|
||||
qse_awk_errnum_t errnum;
|
||||
|
||||
/* TODO: minimize the number of regular expressoin match here...
|
||||
* currently matchrex is called for each character added to buf.
|
||||
* this is a very bad way of doing the job.
|
||||
*/
|
||||
QSE_ASSERT (run->gbl.rs != QSE_NULL);
|
||||
|
||||
n = QSE_AWK_MATCHREX (
|
||||
run->awk, run->gbl.rs,
|
||||
((run->gbl.ignorecase)? QSE_REX_IGNORECASE: 0),
|
||||
QSE_STR_PTR(buf), QSE_STR_LEN(buf),
|
||||
QSE_STR_PTR(buf), QSE_STR_LEN(buf),
|
||||
&match, &errnum);
|
||||
if (n <= -1)
|
||||
{
|
||||
qse_awk_rtx_seterrnum (run, errnum, QSE_NULL);
|
||||
ret = -1;
|
||||
p->in.pos--; /* unread the character in c */
|
||||
break;
|
||||
}
|
||||
|
||||
if (n >= 1)
|
||||
{
|
||||
/* the match should be found at the end of
|
||||
* the current buffer */
|
||||
QSE_ASSERT (
|
||||
QSE_STR_PTR(buf) + QSE_STR_LEN(buf) ==
|
||||
match.ptr + match.len);
|
||||
|
||||
/*QSE_STR_LEN(buf) -= match.len;*/
|
||||
buf->len -= match.len;
|
||||
p->in.pos--; /* unread the character in c */
|
||||
break;
|
||||
}
|
||||
/* I don't do anything here if RS is composed of
|
||||
* multiple characters. See the comment furthur down */
|
||||
}
|
||||
|
||||
|
||||
if (qse_str_ccat (buf, c) == (qse_size_t)-1)
|
||||
{
|
||||
qse_awk_rtx_seterrnum (run, QSE_AWK_ENOMEM, QSE_NULL);
|
||||
@ -412,7 +428,29 @@ int qse_awk_rtx_readio (
|
||||
break;
|
||||
}
|
||||
|
||||
/* TODO: handle different line terminator like \r\n */
|
||||
if (rs_len >= 2)
|
||||
{
|
||||
/* if RS is composed of multiple characters,
|
||||
* I perform the matching after having added the
|
||||
* current character 'c' to the record buffer 'buf'
|
||||
* to find the longest match. If a match found ends
|
||||
* one character before this character just added
|
||||
* to the buffer, it is the longest match.
|
||||
*/
|
||||
/* TODO: change the way to find the longest match
|
||||
* for performance improvement. currently,
|
||||
* the function is called for every character
|
||||
* added to the buffer. Stupid! */
|
||||
n = match_long_rs (run, buf, 0);
|
||||
if (n != 0)
|
||||
{
|
||||
p->in.pos--; /* unread the character in c */
|
||||
if (n <= -1) ret = -1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* TODO: handle different line terminator like \r\n */
|
||||
if (c == QSE_T('\n')) line_len = 0;
|
||||
else line_len = line_len + 1;
|
||||
}
|
||||
|
@ -104,6 +104,7 @@ EXTRA_DIST = \
|
||||
lang-040.awk \
|
||||
lang-041.awk \
|
||||
lang-042.awk \
|
||||
lang-043.awk \
|
||||
columnate.awk \
|
||||
levenshtein.awk \
|
||||
levenshtein-utests.awk \
|
||||
@ -117,6 +118,7 @@ EXTRA_DIST = \
|
||||
lang-035.dat2 \
|
||||
lang-036.dat \
|
||||
lang-037.dat \
|
||||
lang-043.dat \
|
||||
adr.dat \
|
||||
asm.dat \
|
||||
cou.dat \
|
||||
|
@ -293,6 +293,7 @@ EXTRA_DIST = \
|
||||
lang-040.awk \
|
||||
lang-041.awk \
|
||||
lang-042.awk \
|
||||
lang-043.awk \
|
||||
columnate.awk \
|
||||
levenshtein.awk \
|
||||
levenshtein-utests.awk \
|
||||
@ -306,6 +307,7 @@ EXTRA_DIST = \
|
||||
lang-035.dat2 \
|
||||
lang-036.dat \
|
||||
lang-037.dat \
|
||||
lang-043.dat \
|
||||
adr.dat \
|
||||
asm.dat \
|
||||
cou.dat \
|
||||
|
7
qse/regress/awk/lang-043.awk
Normal file
7
qse/regress/awk/lang-043.awk
Normal file
@ -0,0 +1,7 @@
|
||||
BEGIN {
|
||||
RS="[\t\n\v\f\r ]*[\r\n]+[\t\n\v\f\r ]*"
|
||||
}
|
||||
|
||||
{
|
||||
print $0
|
||||
}
|
13
qse/regress/awk/lang-043.dat
Normal file
13
qse/regress/awk/lang-043.dat
Normal file
@ -0,0 +1,13 @@
|
||||
abcd
|
||||
|
||||
|
||||
|
||||
|
||||
dcba
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
j
|
@ -2087,6 +2087,20 @@ IGNORECASE= 1
|
||||
1
|
||||
1
|
||||
--------------------------------------------------------------------------------
|
||||
[CMD] qseawk --newline=on -o- -f lang-043.awk lang-043.dat </dev/stdin 2>&1
|
||||
--------------------------------------------------------------------------------
|
||||
BEGIN {
|
||||
RS = "[ \n\v\f\r ]*[\r\n]+[ \n\v\f\r ]*";
|
||||
}
|
||||
|
||||
{
|
||||
print $0;
|
||||
}
|
||||
|
||||
abcd
|
||||
dcba
|
||||
j
|
||||
--------------------------------------------------------------------------------
|
||||
[CMD] qseawk --newline=on -F: -f columnate.awk ./passwd.dat </dev/stdin 2>&1
|
||||
--------------------------------------------------------------------------------
|
||||
root x 0 0 root /root /bin/bash
|
||||
|
@ -2087,6 +2087,20 @@ IGNORECASE= 1
|
||||
1
|
||||
1
|
||||
--------------------------------------------------------------------------------
|
||||
[CMD] qseawk -m 500000 --newline=on -o- -f lang-043.awk lang-043.dat </dev/stdin 2>&1
|
||||
--------------------------------------------------------------------------------
|
||||
BEGIN {
|
||||
RS = "[ \n\v\f\r ]*[\r\n]+[ \n\v\f\r ]*";
|
||||
}
|
||||
|
||||
{
|
||||
print $0;
|
||||
}
|
||||
|
||||
abcd
|
||||
dcba
|
||||
j
|
||||
--------------------------------------------------------------------------------
|
||||
[CMD] qseawk -m 500000 --newline=on -F: -f columnate.awk ./passwd.dat </dev/stdin 2>&1
|
||||
--------------------------------------------------------------------------------
|
||||
root x 0 0 root /root /bin/bash
|
||||
|
@ -165,6 +165,7 @@ PROGS="
|
||||
lang-040.awk!!!--newline=on -o-
|
||||
lang-041.awk!!!--newline=on -o-
|
||||
lang-042.awk!!!--newline=on -o-
|
||||
lang-043.awk!lang-043.dat!!--newline=on -o-
|
||||
|
||||
columnate.awk!./passwd.dat!!--newline=on -F:
|
||||
levenshtein-utests.awk!!!--newline=on --include=on
|
||||
|
Loading…
Reference in New Issue
Block a user