From 40fad800f3ca4d678d8caaf03550acbf14d319e2 Mon Sep 17 00:00:00 2001 From: hyung-hwan Date: Thu, 28 Apr 2011 08:04:13 +0000 Subject: [PATCH] fixed the bug of not able to find the longest match for a multiple-character RS. --- qse/lib/awk/rio.c | 222 +++++++++++++++++++------------- qse/regress/awk/Makefile.am | 2 + qse/regress/awk/Makefile.in | 2 + qse/regress/awk/lang-043.awk | 7 + qse/regress/awk/lang-043.dat | 13 ++ qse/regress/awk/regress.out | 14 ++ qse/regress/awk/regress.out.xma | 14 ++ qse/regress/awk/regress.sh | 1 + 8 files changed, 183 insertions(+), 92 deletions(-) create mode 100644 qse/regress/awk/lang-043.awk create mode 100644 qse/regress/awk/lang-043.dat diff --git a/qse/lib/awk/rio.c b/qse/lib/awk/rio.c index 4d9b71a8..51e5ce40 100644 --- a/qse/lib/awk/rio.c +++ b/qse/lib/awk/rio.c @@ -1,5 +1,5 @@ /* - * $Id: rio.c 441 2011-04-22 14:28:43Z hyunghwan.chung $ + * $Id: rio.c 444 2011-04-27 14:04:13Z hyunghwan.chung $ * Copyright 2006-2011 Chung, Hyung-Hwan. This file is part of QSE. @@ -20,7 +20,7 @@ #include "awk.h" -enum +enum io_mask_t { MASK_READ = 0x0100, MASK_WRITE = 0x0200, @@ -88,6 +88,75 @@ static int out_mask_map[] = MASK_WRITE }; +static QSE_INLINE int match_long_rs (qse_awk_rtx_t* run, qse_str_t* buf, int eof) +{ + qse_cstr_t match; + qse_awk_errnum_t errnum; + int n; + +/* TODO: minimize the number of regular expression match by minimizing the call + * to match_long_rs() and changing its code. + * currently it is called for each character added to buf. + * this is a very bad way of doing the job. + */ + QSE_ASSERT (run->gbl.rs != QSE_NULL); + + n = QSE_AWK_MATCHREX ( + run->awk, run->gbl.rs, + ((run->gbl.ignorecase)? QSE_REX_IGNORECASE: 0), + QSE_STR_PTR(buf), QSE_STR_LEN(buf), + QSE_STR_PTR(buf), QSE_STR_LEN(buf), + &match, &errnum); + if (n <= -1) + { + qse_awk_rtx_seterrnum (run, errnum, QSE_NULL); + } + else if (n >= 1) + { + if (eof) + { + /* when EOF is reached, the record buffer + * is not added with a new character. 
It's + * just called again with the same record buffer + * as the previous call to this function. + * A match in this case must end at the end of + * the current record buffer */ + QSE_ASSERT ( + QSE_STR_PTR(buf) + QSE_STR_LEN(buf) == + match.ptr + match.len); + + /* drop the RS part. no extra character after RS to drop + * because we're at EOF and the EOF condition didn't + * add a new character to the buffer before the call + * to this function. + */ + QSE_STR_LEN(buf) -= match.len; + } + else + { + /* the last character read so far has been added + * to the record before the call to this function. + * if the match is found and it ends one character + * before this last character, it is the longest + * match. + */ + if (QSE_STR_PTR(buf) + QSE_STR_LEN(buf) == match.ptr + match.len + 1) + { + /* drop the RS part and the last one character after RS */ + QSE_STR_LEN(buf) -= match.len + 1; + } + else + { + /* if the match does not end at the desired position, + * it is no match as it is not the longest match */ + n = 0; + } + } + } + + return n; +} + int qse_awk_rtx_readio ( qse_awk_rtx_t* run, int in_type, const qse_char_t* name, qse_str_t* buf) @@ -209,24 +278,26 @@ int qse_awk_rtx_readio ( rs = qse_awk_rtx_getgbl (run, QSE_AWK_GBL_RS); qse_awk_rtx_refupval (run, rs); - if (rs->type == QSE_AWK_VAL_NIL) + switch (rs->type) { - rs_ptr = QSE_NULL; - rs_len = 0; - } - else if (rs->type == QSE_AWK_VAL_STR) - { - rs_ptr = ((qse_awk_val_str_t*)rs)->ptr; - rs_len = ((qse_awk_val_str_t*)rs)->len; - } - else - { - rs_ptr = qse_awk_rtx_valtocpldup (run, rs, &rs_len); - if (rs_ptr == QSE_NULL) - { - qse_awk_rtx_refdownval (run, rs); - return -1; - } + case QSE_AWK_VAL_NIL: + rs_ptr = QSE_NULL; + rs_len = 0; + break; + + case QSE_AWK_VAL_STR: + rs_ptr = ((qse_awk_val_str_t*)rs)->ptr; + rs_len = ((qse_awk_val_str_t*)rs)->len; + break; + + default: + rs_ptr = qse_awk_rtx_valtocpldup (run, rs, &rs_len); + if (rs_ptr == QSE_NULL) + { + qse_awk_rtx_refdownval (run, rs); + return -1; 
+ } + break; } ret = 1; @@ -263,44 +334,24 @@ int qse_awk_rtx_readio ( if (n == 0) { + /* EOF reached */ p->in.eof = 1; if (QSE_STR_LEN(buf) == 0) ret = 0; else if (rs_len >= 2) { - /* when RS is multiple characters, it needs to check - * for the match at the end of the input stream as - * the buffer has been appened with the last character - * after the previous matchrex has failed */ - - qse_cstr_t match; - qse_awk_errnum_t errnum; - - QSE_ASSERT (run->gbl.rs != QSE_NULL); - - n = QSE_AWK_MATCHREX ( - run->awk, run->gbl.rs, - ((run->gbl.ignorecase)? QSE_REX_IGNORECASE: 0), - QSE_STR_PTR(buf), QSE_STR_LEN(buf), - QSE_STR_PTR(buf), QSE_STR_LEN(buf), - &match, &errnum); - if (n <= -1) + /* When RS is multiple characters, it should + * check for the match at the end of the + * input stream also because the previous + * match could fail as it didn't end at the + * desired position to be the longest match. + * At EOF, the match at the end is considered + * the longest as there are no more characters + * left */ + n = match_long_rs (run, buf, 1); + if (n != 0) { - qse_awk_rtx_seterrnum (run, errnum, QSE_NULL); - ret = -1; - break; - } - - if (n >= 1) - { - /* the match should be found at the end of - * the current buffer */ - QSE_ASSERT ( - QSE_STR_PTR(buf) + QSE_STR_LEN(buf) == - match.ptr + match.len); - - /*QSE_STR_LEN(buf) -= match.len;*/ - buf->len -= match.len; + if (n <= -1) ret = -1; break; } } @@ -323,8 +374,7 @@ int qse_awk_rtx_readio ( if (pc == QSE_T('\r') && QSE_STR_LEN(buf) > 0) { - /*QSE_STR_LEN(buf) -= 1;*/ - buf->len -= 1; + QSE_STR_LEN(buf) -= 1; } break; } @@ -337,8 +387,7 @@ int qse_awk_rtx_readio ( if (pc == QSE_T('\r') && QSE_STR_LEN(buf) > 0) { - /*QSE_STR_LEN(buf) -= 1;*/ - buf->len -= 1; + QSE_STR_LEN(buf) -= 1; } } @@ -367,44 +416,11 @@ int qse_awk_rtx_readio ( } else { - qse_cstr_t match; - qse_awk_errnum_t errnum; - -/* TODO: minimize the number of regular expressoin match here... 
- * currently matchrex is called for each character added to buf. - * this is a very bad way of doing the job. - */ - QSE_ASSERT (run->gbl.rs != QSE_NULL); - - n = QSE_AWK_MATCHREX ( - run->awk, run->gbl.rs, - ((run->gbl.ignorecase)? QSE_REX_IGNORECASE: 0), - QSE_STR_PTR(buf), QSE_STR_LEN(buf), - QSE_STR_PTR(buf), QSE_STR_LEN(buf), - &match, &errnum); - if (n <= -1) - { - qse_awk_rtx_seterrnum (run, errnum, QSE_NULL); - ret = -1; - p->in.pos--; /* unread the character in c */ - break; - } - - if (n >= 1) - { - /* the match should be found at the end of - * the current buffer */ - QSE_ASSERT ( - QSE_STR_PTR(buf) + QSE_STR_LEN(buf) == - match.ptr + match.len); - - /*QSE_STR_LEN(buf) -= match.len;*/ - buf->len -= match.len; - p->in.pos--; /* unread the character in c */ - break; - } + /* I don't do anything here if RS is composed of + * multiple characters. See the comment further down */ } + if (qse_str_ccat (buf, c) == (qse_size_t)-1) { qse_awk_rtx_seterrnum (run, QSE_AWK_ENOMEM, QSE_NULL); @@ -412,7 +428,29 @@ int qse_awk_rtx_readio ( break; } - /* TODO: handle different line terminator like \r\n */ + if (rs_len >= 2) + { + /* if RS is composed of multiple characters, + * I perform the matching after having added the + * current character 'c' to the record buffer 'buf' + * to find the longest match. If a match found ends + * one character before this character just added + * to the buffer, it is the longest match. + */ + /* TODO: change the way to find the longest match + * for performance improvement. currently, + * the function is called for every character + * added to the buffer. Stupid! 
*/ + n = match_long_rs (run, buf, 0); + if (n != 0) + { + p->in.pos--; /* unread the character in c */ + if (n <= -1) ret = -1; + break; + } + } + +/* TODO: handle different line terminator like \r\n */ if (c == QSE_T('\n')) line_len = 0; else line_len = line_len + 1; } diff --git a/qse/regress/awk/Makefile.am b/qse/regress/awk/Makefile.am index 97cb333e..2a6ee6d6 100644 --- a/qse/regress/awk/Makefile.am +++ b/qse/regress/awk/Makefile.am @@ -104,6 +104,7 @@ EXTRA_DIST = \ lang-040.awk \ lang-041.awk \ lang-042.awk \ + lang-043.awk \ columnate.awk \ levenshtein.awk \ levenshtein-utests.awk \ @@ -117,6 +118,7 @@ EXTRA_DIST = \ lang-035.dat2 \ lang-036.dat \ lang-037.dat \ + lang-043.dat \ adr.dat \ asm.dat \ cou.dat \ diff --git a/qse/regress/awk/Makefile.in b/qse/regress/awk/Makefile.in index 83e84b61..508cdee5 100644 --- a/qse/regress/awk/Makefile.in +++ b/qse/regress/awk/Makefile.in @@ -293,6 +293,7 @@ EXTRA_DIST = \ lang-040.awk \ lang-041.awk \ lang-042.awk \ + lang-043.awk \ columnate.awk \ levenshtein.awk \ levenshtein-utests.awk \ @@ -306,6 +307,7 @@ EXTRA_DIST = \ lang-035.dat2 \ lang-036.dat \ lang-037.dat \ + lang-043.dat \ adr.dat \ asm.dat \ cou.dat \ diff --git a/qse/regress/awk/lang-043.awk b/qse/regress/awk/lang-043.awk new file mode 100644 index 00000000..df7e5b63 --- /dev/null +++ b/qse/regress/awk/lang-043.awk @@ -0,0 +1,7 @@ +BEGIN { + RS="[\t\n\v\f\r ]*[\r\n]+[\t\n\v\f\r ]*" +} + +{ + print $0 +} diff --git a/qse/regress/awk/lang-043.dat b/qse/regress/awk/lang-043.dat new file mode 100644 index 00000000..abda5f39 --- /dev/null +++ b/qse/regress/awk/lang-043.dat @@ -0,0 +1,13 @@ +abcd + + + + + dcba + + + + + + + j diff --git a/qse/regress/awk/regress.out b/qse/regress/awk/regress.out index d82ebf39..ff187fec 100644 --- a/qse/regress/awk/regress.out +++ b/qse/regress/awk/regress.out @@ -2087,6 +2087,20 @@ IGNORECASE= 1 1 1 -------------------------------------------------------------------------------- +[CMD] qseawk --newline=on -o- -f 
lang-043.awk lang-043.dat &1 +-------------------------------------------------------------------------------- +BEGIN { + RS = "[ \n\v\f\r ]*[\r\n]+[ \n\v\f\r ]*"; +} + +{ + print $0; +} + +abcd +dcba +j +-------------------------------------------------------------------------------- [CMD] qseawk --newline=on -F: -f columnate.awk ./passwd.dat &1 -------------------------------------------------------------------------------- root x 0 0 root /root /bin/bash diff --git a/qse/regress/awk/regress.out.xma b/qse/regress/awk/regress.out.xma index 12ec7c46..2818141f 100644 --- a/qse/regress/awk/regress.out.xma +++ b/qse/regress/awk/regress.out.xma @@ -2087,6 +2087,20 @@ IGNORECASE= 1 1 1 -------------------------------------------------------------------------------- +[CMD] qseawk -m 500000 --newline=on -o- -f lang-043.awk lang-043.dat &1 +-------------------------------------------------------------------------------- +BEGIN { + RS = "[ \n\v\f\r ]*[\r\n]+[ \n\v\f\r ]*"; +} + +{ + print $0; +} + +abcd +dcba +j +-------------------------------------------------------------------------------- [CMD] qseawk -m 500000 --newline=on -F: -f columnate.awk ./passwd.dat &1 -------------------------------------------------------------------------------- root x 0 0 root /root /bin/bash diff --git a/qse/regress/awk/regress.sh b/qse/regress/awk/regress.sh index 0062bbb9..b0eea117 100755 --- a/qse/regress/awk/regress.sh +++ b/qse/regress/awk/regress.sh @@ -165,6 +165,7 @@ PROGS=" lang-040.awk!!!--newline=on -o- lang-041.awk!!!--newline=on -o- lang-042.awk!!!--newline=on -o- + lang-043.awk!lang-043.dat!!--newline=on -o- columnate.awk!./passwd.dat!!--newline=on -F: levenshtein-utests.awk!!!--newline=on --include=on