From ba8bd06016990f5894dd69f0d35987a003924e5a Mon Sep 17 00:00:00 2001 From: hyung-hwan Date: Thu, 26 Mar 2009 08:53:10 +0000 Subject: [PATCH] fixed a bug in handling a regular expression starting with a backslash. - a regular expression like /\// could not be handled properly without this fix --- qse/include/qse/utl/sed.h | 3 ++- qse/lib/awk/err.c | 4 +-- qse/lib/awk/parse.c | 42 ++++++++++++++++++++++------- qse/lib/utl/sed.c | 57 +++++++++++++++++++++++++++++---------- 4 files changed, 79 insertions(+), 27 deletions(-) diff --git a/qse/include/qse/utl/sed.h b/qse/include/qse/utl/sed.h index da891a1b..891c64aa 100644 --- a/qse/include/qse/utl/sed.h +++ b/qse/include/qse/utl/sed.h @@ -37,6 +37,7 @@ enum qse_sed_errnum_t QSE_SED_EA2PHB, /* address 2 prohibited */ QSE_SED_ENEWLN, /* a new line is expected */ QSE_SED_EBSEXP, /* \ is expected */ + QSE_SED_EBSDEL, /* \ used as a delimiter */ QSE_SED_EGBABS, /* garbage after \ */ QSE_SED_ESCEXP, /* ; is expected */ QSE_SED_ELABTL, /* label too long */ @@ -44,7 +45,7 @@ enum qse_sed_errnum_t QSE_SED_ELABDU, /* duplicate label name */ QSE_SED_EFILEM, /* file name is empty */ QSE_SED_EFILIL, /* illegal file name */ - QSE_SED_ETSNTR, /* translation set not terminated */ + QSE_SED_ENOTRM, /* not terminated properly */ QSE_SED_ETSNSL, /* translation set not the same length*/ QSE_SED_EGRNBA, /* group brackets not balanced */ QSE_SED_EGRNTD /* group nested too deeply */ diff --git a/qse/lib/awk/err.c b/qse/lib/awk/err.c index 87cdd622..7244aadc 100644 --- a/qse/lib/awk/err.c +++ b/qse/lib/awk/err.c @@ -1,5 +1,5 @@ /* - * $Id: err.c 75 2009-02-22 14:10:34Z hyunghwan.chung $ + * $Id: err.c 113 2009-03-25 14:53:10Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan.
@@ -65,7 +65,7 @@ static const qse_char_t* __geterrstr (int errnum) QSE_T("unexpected end of source"), QSE_T("a comment not closed properly"), - QSE_T("a string not closed with a quote"), + QSE_T("a string or a regular expression not closed"), QSE_T("unexpected end of a regular expression"), QSE_T("a left brace expected in place of '${0}'"), QSE_T("a left parenthesis expected in place of '${0}'"), diff --git a/qse/lib/awk/parse.c b/qse/lib/awk/parse.c index 75bab391..40556592 100644 --- a/qse/lib/awk/parse.c +++ b/qse/lib/awk/parse.c @@ -1,5 +1,5 @@ /* - * $Id: parse.c 85 2009-02-26 10:56:12Z hyunghwan.chung $ + * $Id: parse.c 113 2009-03-25 14:53:10Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. @@ -220,7 +220,7 @@ static int get_charstr (qse_awk_t* awk); static int get_rexstr (qse_awk_t* awk); static int get_string ( qse_awk_t* awk, qse_char_t end_char, - qse_char_t esc_char, qse_bool_t keep_esc_char); + qse_char_t esc_char, qse_bool_t keep_esc_char, int preescaped); static int get_char (qse_awk_t* awk); static int unget_char (qse_awk_t* awk, qse_cint_t c); static int skip_spaces (qse_awk_t* awk); @@ -2930,7 +2930,9 @@ static qse_awk_nde_t* parse_primary (qse_awk_t* awk, qse_size_t line) int errnum; /* the regular expression is tokenized here because - * of the context-sensitivity of the slash symbol */ + * of the context-sensitivity of the slash symbol. 
+ * if TOKEN_DIV is seen as a primary, it tries to compile + * it as a regular expression */ SET_TOKEN_TYPE (awk, TOKEN_REX); qse_str_clear (awk->token.name); @@ -4567,7 +4569,6 @@ static qse_awk_nde_t* parse_print (qse_awk_t* awk, qse_size_t line, int type) return (qse_awk_nde_t*)nde; } - static int get_token (qse_awk_t* awk) { qse_cint_t c; @@ -5091,7 +5092,7 @@ static int get_charstr (qse_awk_t* awk) * has been called */ ADD_TOKEN_CHAR (awk, awk->src.lex.curc); } - return get_string (awk, QSE_T('\"'), QSE_T('\\'), QSE_FALSE); + return get_string (awk, QSE_T('\"'), QSE_T('\\'), QSE_FALSE, 0); } static int get_rexstr (qse_awk_t* awk) @@ -5099,23 +5100,44 @@ static int get_rexstr (qse_awk_t* awk) if (awk->src.lex.curc == QSE_T('/')) { /* this part of the function is different from get_charstr - * because of the way this function is called */ + * because of the way this function is called. + * this condition is met when the input is //. + * the first / has been tokenized to TOKEN_DIV already. + * if TOKEN_DIV is seen as a primary, this function is called. + * as the token buffer has been cleared by the caller and + * the token type is set to TOKEN_REX, this function can + * just return after reading the next character */ GET_CHAR (awk); return 0; } else { - ADD_TOKEN_CHAR (awk, awk->src.lex.curc); - return get_string (awk, QSE_T('/'), QSE_T('\\'), QSE_TRUE); + int escaped = 0; + if (awk->src.lex.curc == QSE_T('\\')) + { + /* for input like /\//, this condition is met. 
+ * the initial escape character is added when the + * second character is handled in get_string() */ + escaped = 1; + } + else + { + /* add other initial characters here as get_string() + * begins with reading the next character */ + ADD_TOKEN_CHAR (awk, awk->src.lex.curc); + } + return get_string (awk, + QSE_T('/'), QSE_T('\\'), QSE_TRUE, escaped); } } static int get_string ( qse_awk_t* awk, qse_char_t end_char, - qse_char_t esc_char, qse_bool_t keep_esc_char) + qse_char_t esc_char, qse_bool_t keep_esc_char, + int preescaped) { qse_cint_t c; - int escaped = 0; + int escaped = preescaped; int digit_count = 0; qse_cint_t c_acc = 0; diff --git a/qse/lib/utl/sed.c b/qse/lib/utl/sed.c index 78f06bb4..89948f00 100644 --- a/qse/lib/utl/sed.c +++ b/qse/lib/utl/sed.c @@ -152,6 +152,7 @@ const qse_char_t* qse_sed_geterrmsg (qse_sed_t* sed) QSE_T("address 2 prohibited"), QSE_T("a new line expected"), QSE_T("a backslash expected"), + QSE_T("a backslash used as a delimiter"), QSE_T("garbage after a backslash"), QSE_T("a semicolon expected"), QSE_T("label name too long"), @@ -159,7 +160,7 @@ const qse_char_t* qse_sed_geterrmsg (qse_sed_t* sed) QSE_T("duplicate label name"), QSE_T("empty file name"), QSE_T("illegal file name"), - QSE_T("translation set not terminated"), + QSE_T("command not terminated properly"), QSE_T("strings in translation set not the same length"), QSE_T("group brackets not balanced"), QSE_T("group nesting too deep") @@ -227,7 +228,6 @@ static void* compile_regex (qse_sed_t* sed, qse_char_t rxend) } if (c == QSE_T('n')) c = QSE_T('\n'); - else if (c == QSE_T('r')) c = QSE_T('\r'); // TODO: support more escaped characters??
} @@ -576,7 +576,6 @@ static int get_file_name (qse_sed_t* sed, qse_sed_cmd_t* cmd) } if (c == QSE_T('n')) c = QSE_T('\n'); - else if (c == QSE_T('r')) c = QSE_T('\r'); } if (qse_str_ccat (t, c) == (qse_size_t)-1) @@ -613,15 +612,16 @@ static int get_subst (qse_sed_t* sed, qse_sed_cmd_t* cmd) c = CURSC (sed); if (c == QSE_CHAR_EOF || IS_LINTERM(c)) { - //sed->errnum = QSE_SED_ESUNTR; + /* not terminated properly */ + sed->errnum = QSE_SED_ENOTRM; goto oops; } delim = c; if (delim == QSE_T('\\')) { - /* illegal delimiter */ - //sed->errnum = QSE_SED_ESUILD; + /* backslash is an illegal delimiter */ + sed->errnum = QSE_SED_EBSDEL; goto oops; } @@ -635,7 +635,32 @@ static int get_subst (qse_sed_t* sed, qse_sed_cmd_t* cmd) c = NXTSC (sed); while (c != delim) { - } + if (c == QSE_CHAR_EOF || IS_LINTERM(c)) + { + sed->errnum = QSE_SED_ENOTRM; + goto oops; + } + + if (c == QSE_T('\\')) + { + c = NXTSC (sed); + if (c == QSE_CHAR_EOF || IS_LINTERM(c)) + { + sed->errnum = QSE_SED_ENOTRM; + goto oops; + } + + if (c == QSE_T('n')) c = QSE_T('\n'); + } + + if (qse_str_ccat (t, c) == (qse_size_t)-1) + { + sed->errnum = QSE_SED_ENOMEM; + goto oops; + } + + c = NXTSC (sed); + } oops: if (t != QSE_NULL) qse_str_close (t); @@ -652,11 +677,17 @@ static int get_transet (qse_sed_t* sed, qse_sed_cmd_t* cmd) if (c == QSE_CHAR_EOF || IS_LINTERM(c)) { /* translation set terminated prematurely*/ - sed->errnum = QSE_SED_ETSNTR; + sed->errnum = QSE_SED_ENOTRM; goto oops; } delim = c; + if (delim == QSE_T('\\')) + { + /* backslash is an illegal delimiter */ + sed->errnum = QSE_SED_EBSDEL; + goto oops; + } t = qse_str_open (sed->mmgr, 0, 32); if (t == QSE_NULL) @@ -672,7 +703,7 @@ static int get_transet (qse_sed_t* sed, qse_sed_cmd_t* cmd) if (c == QSE_CHAR_EOF || IS_LINTERM(c)) { - sed->errnum = QSE_SED_ETSNTR; + sed->errnum = QSE_SED_ENOTRM; goto oops; } @@ -681,12 +712,11 @@ static int get_transet (qse_sed_t* sed, qse_sed_cmd_t* cmd) c = NXTSC (sed); if (c == QSE_CHAR_EOF || IS_LINTERM(c)) {
- sed->errnum = QSE_SED_ETSNTR; + sed->errnum = QSE_SED_ENOTRM; goto oops; } if (c == QSE_T('n')) c = QSE_T('\n'); - else if (c == QSE_T('r')) c = QSE_T('\r'); } b[0] = c; @@ -704,7 +734,7 @@ static int get_transet (qse_sed_t* sed, qse_sed_cmd_t* cmd) { if (c == QSE_CHAR_EOF || IS_LINTERM(c)) { - sed->errnum = QSE_SED_ETSNTR; + sed->errnum = QSE_SED_ENOTRM; goto oops; } @@ -713,12 +743,11 @@ static int get_transet (qse_sed_t* sed, qse_sed_cmd_t* cmd) c = NXTSC (sed); if (c == QSE_CHAR_EOF || IS_LINTERM(c)) { - sed->errnum = QSE_SED_ETSNTR; + sed->errnum = QSE_SED_ENOTRM; goto oops; } if (c == QSE_T('n')) c = QSE_T('\n'); - else if (c == QSE_T('r')) c = QSE_T('\r'); } if (pos >= QSE_STR_LEN(t))