From 944a492c88fcd2b289e92e1fccf61ce790ca040a Mon Sep 17 00:00:00 2001 From: hyung-hwan Date: Thu, 18 Jun 2009 06:43:50 +0000 Subject: [PATCH] fixed bugs in rex and awk - fixed bugs of not parsing some forms of ranges such as {,m} {n,} properly - fixed bugs in substitution functions that did not handle 0-length substring match properly. --- qse/doc/page/sed.doc | 122 +++++++++++++++++++++++++++++--------- qse/include/qse/cmn/rex.h | 38 +++++++----- qse/lib/awk/fnc.c | 57 ++++++++++++++---- qse/lib/cmn/rex.c | 28 +++++---- qse/lib/sed/sed.c | 40 +++++++++++-- qse/test/sed/sed01.c | 7 ++- 6 files changed, 223 insertions(+), 69 deletions(-) diff --git a/qse/doc/page/sed.doc b/qse/doc/page/sed.doc index a4ee0681..9db67edf 100644 --- a/qse/doc/page/sed.doc +++ b/qse/doc/page/sed.doc @@ -1,5 +1,10 @@ /** @page sed SED STREAM EDITOR +@section sed_contents CONTENTS +- \ref sed_intro +- \ref sed_command +- \ref sed_embed + @section sed_intro INTRODUCTION The sed stream editor is a non-interactive text editing tool commonly used @@ -61,7 +66,7 @@ A command without a line selector is applied to all input lines; A command with a single address is applied to an input line that matches the address; A command with an address range is applied to all input lines within the range, inclusive; A command with a start and a step is -applied to every @b step'th line starting from the line @b start. +applied to every step'th line starting from the line @b start. Here is the summary of the commands. @@ -69,88 +74,151 @@ Here is the summary of the commands. The text beginning from # to the line end is ignored; # in a line following a \\, i \\, and c \\ is treated literally and does not introduce a comment. + - : label A label can be composed of letters, digits, periods, hyphens, and underlines. It remembers a target label for @b b and @b t commands and prohibits a line selector. + - { The left curly bracket forms a command group where you can nest other -commands. 
It should be paired with an ending }. +commands. It should be paired with an ending @b }. + - q Terminates the exection of commands. Upon termination, it prints the pattern space if #QSE_SED_QUIET is not set. + - Q Terminates the exection of commands quietly. + - a \\ \n text -Stores @b text into an append buffer which is printed after the pattern -space for each input line. If #QSE_SED_STRICT is specified, an address range -is not allowed in the line selector. +Stores @b text into the append buffer which is printed after the pattern +space for each input line. If #QSE_SED_STRICT is specified, a line selector +of an address range is not allowed. + - i \\ \n text Inserts @b text into an insert buffer which is printed before the pattern -space for each input line. If #QSE_SED_STRICT is specified, an address range -is not allowed in the line selector. +space for each input line. If #QSE_SED_STRICT is specified, a line selector +of an address range is not allowed. + - c \\ \n text If a single line is selected for the command (i.e. no line selector, a single address line selector, or a start~step line selector is specified), it changes -pattern space to @b text and branches to the end of commands for the line. -If an address range is specified, it deletes pattern space and branches +the pattern space to @b text and branches to the end of commands for the line. +If an address range is specified, it deletes the pattern space and branches to the end of commands for all input lines but the last, and changes pattern space to @b text and branches to the end of commands. + - d -Deletes pattern space and branches to the end of commands. +Deletes the pattern space and branches to the end of commands. + - D -Deletes the first line of pattern space. If the pattern space is emptied, +Deletes the first line of the pattern space. If the pattern space is emptied, it branches to the end of script. Otherwise, the commands from the first are reapplied to the current pattern space. 
+ - = Prints the current line number. If #QSE_SED_STRICT is speccified, an address range is not allowed in the line selector. + - p -Prints pattern space. +Prints the pattern space. + - P -Prints the first line of pattern space. +Prints the first line of the pattern space. + - l -Prints pattern space in a visually unambiguous form. +Prints the pattern space in a visually unambiguous form. + - h -Copies pattern space to hold space +Copies the pattern space to the hold space + - H -Appends pattern space to hold space +Appends the pattern space to the hold space + - g -Copies hold space to pattern space +Copies the hold space to the pattern space + - G -Appends hold space to pattern space +Appends the hold space to the pattern space + - x -Exchanges pattern space and hold space +Exchanges the pattern space and the hold space + - n -Prints pattern space and read the next line from the input stream to fill -pattern space. +Prints the pattern space and read the next line from the input stream to fill +the pattern space. + - N -Prints pattern space and read the next line from the input stream to append it -to pattern space with a newline inserted. +Prints the pattern space and read the next line from the input stream +to append it to the pattern space with a newline inserted. + - b Branches to the end of commands. + - b label Branches to @b label + - t Branches to the end of commands if substitution(s//) has been made successfully since the last reading of an input line or the last @b t command. + - t label Branches to @b label if substitution(s//) has been made successfully since the last reading of an input line or the last @b t command. + - r file -Reads text from @b file and prints it after printing pattern space but before -printing append buffer. Failure to read @b file does not cause an error. +Reads text from @b file and prints it after printing the pattern space but +before printing the append buffer. Failure to read @b file does not cause an +error. 
+ - R file Reads a line of text from @b file and prints it after printing pattern space -but before printing append buffer. Failure to read @b file does not cause an +but before printing the append buffer. Failure to read @b file does not cause an error. + - w file +Writes the pattern space to @b file - W file +Writes the first line of the pattern space to @b file -- s/rex/repl/opt +- s/rex/repl/opts +Finds a matching substring with @b rex in pattern space and replaces it +with @b repl. @b & in @b repl refers to the matching substring. @b opts may +be empty; you can combine the following options into @b opts: + - @b g replaces all occurrences of a matching substring with @b rex + - @b number replaces the number'th occurrence of a matching substring + with @b rex + - @b p prints pattern space if a successful replacement was made + - @b w file writes pattern space to @b file if a successful replacement + was made. It, if specified, should be the last option. - y/src/dst/ Replaces all occurrences of characters in @b src with characters in @b dst. @b src and @b dst must contain equal number of characters. + +Let's see actual examples: +- G;G;G +Triple spaces input lines. If #QSE_SED_QUIET is on, try G;G;G;p. +It works because the hold space is empty unless something is copied to it. + +- $!d +Prints the last line. If #QSE_SED_QUIET is on, try $p. + +- 1!G;h;$!d +Prints input lines in the reverse order. That is, it prints the last line +first and the first line last. + +- s/[[:space:]]{2,}/ /g +Compacts whitespaces if #QSE_SED_REXBOUND is on. 
+ +@section sed_embed HOW TO EMBED + +In the simplest form, +- Create a stream editor - qse_sed_open() +- Compile editing commands - qse_sed_comp() +- Executes compiled commands - qse_sed_exec() +- Destroy the stream editor - qse_sed_close() + */ diff --git a/qse/include/qse/cmn/rex.h b/qse/include/qse/cmn/rex.h index 7399ca63..e75e87ad 100644 --- a/qse/include/qse/cmn/rex.h +++ b/qse/include/qse/cmn/rex.h @@ -1,5 +1,5 @@ /* - * $Id: rex.h 195 2009-06-10 13:18:25Z hyunghwan.chung $ + * $Id: rex.h 203 2009-06-17 12:43:50Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. @@ -22,7 +22,8 @@ #include #include -/* +/** @file + * * Regular Esseression Syntax * A regular expression is zero or more branches, separated by '|'. * ...... @@ -30,24 +31,29 @@ * * Compiled form of a regular expression: * - * | expression | - * | header | branch | branch | branch | - * | nb | el | na | bl | cmd | arg | cmd | arg | na | bl | cmd | arg | na | bl | cmd | + * | expression | + * | header | branch | branch | branch | + * | nb | el | na | bl | cmd | arg | cmd | arg | na | bl | cmd | arg | na | bl | cmd | * - * nb: the number of branches - * el: the length of a expression including the length of nb and el - * na: the number of atoms - * bl: the length of a branch including the length of na and bl - * cmd: The command and repetition info encoded together. - * Some commands require an argument to follow them but some other don't. - * It is encoded as follows: + * - nb: the number of branches + * - el: the length of a expression including the length of nb and el + * - na: the number of atoms + * - bl: the length of a branch including the length of na and bl + * - cmd: The command and repetition info encoded together. * - * Subexpressions can be nested by having the command "GROUP" - * and a subexpression as its argument. + * Some commands require an argument to follow them but some other don't. + * It is encoded as follows: + * ................. 
+ * + * Subexpressions can be nested by having the command "GROUP" + * and a subexpression as its argument. * * Examples: - * a.c -> |1|6|5|ORD_CHAR(no bound)|a|ANY_CHAR(no bound)|ORD_CHAR(no bound)|c| - * ab|xy -> |2|10|4|ORD_CHAR(no bound)|a|ORD_CHAR(no bound)|b|4|ORD_CHAR(no bound)|x|ORD_CHAR(no bound)|y| + * a.c -> |1|6|5|ORD_CHAR(no bound)|a|ANY_CHAR(no bound)|ORD_CHAR(no bound)|c| + * ab|xy -> |2|10|4|ORD_CHAR(no bound)|a|ORD_CHAR(no bound)|b|4|ORD_CHAR(no bound)|x|ORD_CHAR(no bound)|y| + * + * @todo + * - support \\n to refer to the nth matching substring */ #define QSE_REX_NA(code) (*(qse_size_t*)(code)) diff --git a/qse/lib/awk/fnc.c b/qse/lib/awk/fnc.c index f9c65ccf..76c15615 100644 --- a/qse/lib/awk/fnc.c +++ b/qse/lib/awk/fnc.c @@ -1,5 +1,5 @@ /* - * $Id: fnc.c 199 2009-06-14 08:40:52Z hyunghwan.chung $ + * $Id: fnc.c 203 2009-06-17 12:43:50Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. @@ -919,14 +919,14 @@ static int __substitute (qse_awk_rtx_t* run, qse_long_t max_count) { qse_size_t nargs; qse_awk_val_t* a0, * a1, * a2, ** a2_ref, * v; - qse_char_t* a0_ptr, * a1_ptr, * a2_ptr; + qse_char_t* a0_ptr, * a1_ptr, * a2_ptr, * a2_end; qse_size_t a0_len, a1_len, a2_len; qse_char_t* a0_ptr_free = QSE_NULL; qse_char_t* a1_ptr_free = QSE_NULL; qse_char_t* a2_ptr_free = QSE_NULL; void* rex = QSE_NULL; int opt, n; - qse_cstr_t mat; + qse_cstr_t mat, pmat; const qse_char_t* cur_ptr; qse_size_t cur_len, i, m; qse_str_t new; @@ -1053,7 +1053,8 @@ static int __substitute (qse_awk_rtx_t* run, qse_long_t max_count) if (a0->type != QSE_AWK_VAL_REX) { - rex = QSE_AWK_BUILDREX (run->awk, a0_ptr, a0_len, &run->errinf.num); + rex = QSE_AWK_BUILDREX ( + run->awk, a0_ptr, a0_len, &run->errinf.num); if (rex == QSE_NULL) { qse_str_fini (&new); @@ -1063,11 +1064,18 @@ static int __substitute (qse_awk_rtx_t* run, qse_long_t max_count) } opt = (run->gbl.ignorecase)? 
QSE_REX_MATCH_IGNORECASE: 0; + + a2_end = a2_ptr + a2_len; cur_ptr = a2_ptr; cur_len = a2_len; sub_count = 0; - while (1) + pmat.ptr = QSE_NULL; + pmat.len = 0; + + /* perform test when cur_ptr == a2_end also because + * end of string($) needs to be tested */ + while (cur_ptr <= a2_end) { if (max_count == 0 || sub_count < max_count) { @@ -1096,17 +1104,28 @@ static int __substitute (qse_awk_rtx_t* run, qse_long_t max_count) FREE_A0_REX (run->awk, rex); qse_str_fini (&new); FREE_A_PTRS (run->awk); + qse_awk_rtx_seterrnum (run, QSE_AWK_ENOMEM); return -1; } break; } + if (mat.len == 0 && + pmat.ptr != QSE_NULL && + mat.ptr == pmat.ptr + pmat.len) + { + /* match length is 0 and the match is still at the + * end of the previous match */ + goto skip_one_char; + } + if (qse_str_ncat ( &new, cur_ptr, mat.ptr - cur_ptr) == (qse_size_t)-1) { FREE_A0_REX (run->awk, rex); qse_str_fini (&new); FREE_A_PTRS (run->awk); + qse_awk_rtx_seterrnum (run, QSE_AWK_ENOMEM); return -1; } @@ -1133,6 +1152,7 @@ static int __substitute (qse_awk_rtx_t* run, qse_long_t max_count) FREE_A0_REX (run->awk, rex); qse_str_fini (&new); FREE_A_PTRS (run->awk); + qse_awk_rtx_seterrnum (run, QSE_AWK_ENOMEM); return -1; } } @@ -1140,6 +1160,26 @@ static int __substitute (qse_awk_rtx_t* run, qse_long_t max_count) sub_count++; cur_len = cur_len - ((mat.ptr - cur_ptr) + mat.len); cur_ptr = mat.ptr + mat.len; + + pmat = mat; + + if (mat.len == 0) + { + skip_one_char: + /* special treatment is needed if match length is 0 */ + + m = qse_str_ncat (&new, cur_ptr, 1); + if (m == (qse_size_t)-1) + { + FREE_A0_REX (run->awk, rex); + qse_str_fini (&new); + FREE_A_PTRS (run->awk); + qse_awk_rtx_seterrnum (run, QSE_AWK_ENOMEM); + return -1; + } + + cur_ptr++; cur_len--; + } } FREE_A0_REX (run->awk, rex); @@ -1179,7 +1219,6 @@ static int __substitute (qse_awk_rtx_t* run, qse_long_t max_count) { qse_str_fini (&new); FREE_A_PTRS (run->awk); - /*qse_awk_rtx_seterrnum (run, QSE_AWK_ENOMEM);*/ return -1; } @@ -1196,11 
+1235,7 @@ static int __substitute (qse_awk_rtx_t* run, qse_long_t max_count) #undef FREE_A_PTRS v = qse_awk_rtx_makeintval (run, sub_count); - if (v == QSE_NULL) - { - /*qse_awk_rtx_seterrnum (run, QSE_AWK_ENOMEM);*/ - return -1; - } + if (v == QSE_NULL) return -1; qse_awk_rtx_setretval (run, v); return 0; diff --git a/qse/lib/cmn/rex.c b/qse/lib/cmn/rex.c index 8112dd4d..afc7d063 100644 --- a/qse/lib/cmn/rex.c +++ b/qse/lib/cmn/rex.c @@ -1,5 +1,5 @@ /* - * $Id: rex.c 195 2009-06-10 13:18:25Z hyunghwan.chung $ + * $Id: rex.c 203 2009-06-17 12:43:50Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. @@ -1014,18 +1014,26 @@ what if it is not in the raight format? convert it to ordinary characters?? */ { NEXT_CHAR (builder, LEVEL_RANGE); - bound = 0; - while (builder->ptn.curc.type == CT_NORMAL && - (builder->ptn.curc.value >= QSE_T('0') && - builder->ptn.curc.value <= QSE_T('9'))) + if (builder->ptn.curc.type == CT_NORMAL && + (builder->ptn.curc.value >= QSE_T('0') && + builder->ptn.curc.value <= QSE_T('9'))) { - bound = bound * 10 + builder->ptn.curc.value - QSE_T('0'); - NEXT_CHAR (builder, LEVEL_RANGE); - } + bound = 0; - cmd->ubound = bound; + do + { + bound = bound * 10 + builder->ptn.curc.value - QSE_T('0'); + NEXT_CHAR (builder, LEVEL_RANGE); + } + while (builder->ptn.curc.type == CT_NORMAL && + (builder->ptn.curc.value >= QSE_T('0') && + builder->ptn.curc.value <= QSE_T('9'))); + + cmd->ubound = bound; + } + else cmd->ubound = BOUND_MAX; } - else cmd->ubound = BOUND_MAX; + else cmd->ubound = cmd->lbound; if (cmd->lbound > cmd->ubound) { diff --git a/qse/lib/sed/sed.c b/qse/lib/sed/sed.c index bd1a6452..3a5ce89f 100644 --- a/qse/lib/sed/sed.c +++ b/qse/lib/sed/sed.c @@ -1,5 +1,5 @@ /* - * $Id: sed.c 195 2009-06-10 13:18:25Z hyunghwan.chung $ + * $Id: sed.c 203 2009-06-17 12:43:50Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. 
@@ -1850,10 +1850,10 @@ static int write_str_to_file ( static int do_subst (qse_sed_t* sed, qse_sed_cmd_t* cmd) { - qse_cstr_t mat; + qse_cstr_t mat, pmat; int opt = 0, repl = 0, n; qse_rex_errnum_t errnum; - const qse_char_t* cur_ptr, * str_ptr; + const qse_char_t* cur_ptr, * str_ptr, * str_end; qse_size_t cur_len, str_len, m, i; qse_size_t max_count, sub_count; @@ -1868,13 +1868,19 @@ static int do_subst (qse_sed_t* sed, qse_sed_cmd_t* cmd) /* TODO: support different line end convension */ if (str_len > 0 && str_ptr[str_len-1] == QSE_T('\n')) str_len--; + str_end = str_ptr + str_len; cur_ptr = str_ptr; cur_len = str_len; sub_count = 0; max_count = (cmd->u.subst.g)? 0: cmd->u.subst.occ; - while (1) + pmat.ptr = QSE_NULL; + pmat.len = 0; + + /* perform test when cur_ptr == str_end also because + * end of string($) needs to be tested */ + while (cur_ptr <= str_end) { if (max_count == 0 || sub_count < max_count) { @@ -1908,6 +1914,15 @@ static int do_subst (qse_sed_t* sed, qse_sed_cmd_t* cmd) break; } + if (mat.len == 0 && + pmat.ptr != QSE_NULL && + mat.ptr == pmat.ptr + pmat.len) + { + /* match length is 0 and the match is still at the + * end of the previous match */ + goto skip_one_char; + } + if (max_count > 0 && sub_count + 1 != max_count) { m = qse_str_ncat ( @@ -1967,6 +1982,23 @@ static int do_subst (qse_sed_t* sed, qse_sed_cmd_t* cmd) sub_count++; cur_len = cur_len - ((mat.ptr - cur_ptr) + mat.len); cur_ptr = mat.ptr + mat.len; + + pmat = mat; + + if (mat.len == 0) + { + skip_one_char: + /* special treament is need if the match length is 0 */ + + m = qse_str_ncat (&sed->e.txt.subst, cur_ptr, 1); + if (m == (qse_size_t)-1) + { + SETERR0 (sed, QSE_SED_ENOMEM, 0); + return -1; + } + + cur_ptr++; cur_len--; + } } if (str_len < QSE_STR_LEN(&sed->e.in.line)) diff --git a/qse/test/sed/sed01.c b/qse/test/sed/sed01.c index 08a4e3cc..093fb95f 100644 --- a/qse/test/sed/sed01.c +++ b/qse/test/sed/sed01.c @@ -119,13 +119,14 @@ static void print_usage (QSE_FILE* out, int 
argc, qse_char_t* argv[]) qse_fprintf (out, QSE_T(" -h show this message\n")); qse_fprintf (out, QSE_T(" -n disable auto-print\n")); qse_fprintf (out, QSE_T(" -a perform strict address check\n")); + qse_fprintf (out, QSE_T(" -r allows {n,m} in a regular expression\n")); } static int handle_args (int argc, qse_char_t* argv[]) { static qse_opt_t opt = { - QSE_T("hna"), + QSE_T("hnar"), QSE_NULL }; qse_cint_t c; @@ -165,6 +166,10 @@ static int handle_args (int argc, qse_char_t* argv[]) case QSE_T('a'): g_option |= QSE_SED_STRICT; break; + + case QSE_T('r'): + g_option |= QSE_SED_REXBOUND; + break; } }