From 0f8564426992df5742c93bc20bcdeb9d250c45a8 Mon Sep 17 00:00:00 2001 From: hyung-hwan Date: Wed, 7 Sep 2011 08:18:36 +0000 Subject: [PATCH] added code to handle an empty regular expression in sed --- qse/cmd/sed/sed.c | 14 ++- qse/include/qse/sed/sed.h | 3 +- qse/include/qse/types.h | 4 +- qse/lib/cmn/tre-parse.c | 3 +- qse/lib/cmn/tre.h | 6 +- qse/lib/sed/err.c | 7 +- qse/lib/sed/sed.c | 166 +++++++++++++++++++++++--------- qse/lib/sed/sed.h | 5 +- qse/regress/sed/regress.out | 2 +- qse/regress/sed/regress.out.xma | 2 +- 10 files changed, 148 insertions(+), 64 deletions(-) diff --git a/qse/cmd/sed/sed.c b/qse/cmd/sed/sed.c index 3135c750..cc0f8776 100644 --- a/qse/cmd/sed/sed.c +++ b/qse/cmd/sed/sed.c @@ -246,6 +246,7 @@ qse_char_t* load_script_file (const qse_char_t* file) if (qse_str_init (&script, QSE_MMGR_GETDFL(), 1024) <= -1) { qse_fclose (fp); + qse_fprintf (QSE_STDERR, QSE_T("ERROR: cannot load %s\n"), file); return QSE_NULL; } @@ -258,6 +259,13 @@ qse_char_t* load_script_file (const qse_char_t* file) return QSE_NULL; } } + if (qse_ferror(fp)) + { + qse_fprintf (QSE_STDERR, QSE_T("ERROR: cannot read %s\n"), file); + qse_str_fini (&script); + qse_fclose (fp); + return QSE_NULL; + } qse_str_yield (&script, &xstr, 0); qse_str_fini (&script); @@ -303,11 +311,7 @@ int sed_main (int argc, qse_char_t* argv[]) QSE_ASSERT (g_script == QSE_NULL); g_script = load_script_file (g_script_file); - if (g_script == QSE_NULL) - { - qse_fprintf (QSE_STDERR, QSE_T("ERROR: cannot load %s\n"), g_script_file); - goto oops; - } + if (g_script == QSE_NULL) goto oops; } if (qse_sed_comp (sed, g_script, qse_strlen(g_script)) == -1) diff --git a/qse/include/qse/sed/sed.h b/qse/include/qse/sed/sed.h index 535a0f41..d21e1f9f 100644 --- a/qse/include/qse/sed/sed.h +++ b/qse/include/qse/sed/sed.h @@ -1,5 +1,5 @@ /* - * $Id: sed.h 558 2011-09-02 15:27:44Z hyunghwan.chung $ + * $Id: sed.h 560 2011-09-06 14:18:36Z hyunghwan.chung $ * Copyright 2006-2011 Chung, Hyung-Hwan. This file is part of QSE. @@ -107,6 +107,7 @@ enum qse_sed_errnum_t QSE_SED_EOCSDU, /**< multiple occurrence specifiers */ QSE_SED_EOCSZE, /**< occurrence specifier zero */ QSE_SED_EOCSTL, /**< occurrence specifier too large */ + QSE_SED_ENPREX, /**< no previous regular expression */ QSE_SED_EIOFIL, /**< io error with file '${0}'*/ QSE_SED_EIOUSR /**< error returned by user io handler */ }; diff --git a/qse/include/qse/types.h b/qse/include/qse/types.h index dc145330..86666201 100644 --- a/qse/include/qse/types.h +++ b/qse/include/qse/types.h @@ -1,5 +1,5 @@ /* - * $Id: types.h 549 2011-08-14 09:07:31Z hyunghwan.chung $ + * $Id: types.h 560 2011-09-06 14:18:36Z hyunghwan.chung $ * Copyright 2006-2011 Chung, Hyung-Hwan. This file is part of QSE. @@ -365,12 +365,14 @@ typedef qse_int_t qse_intptr_t; * The qse_mchar_t type defines a multi-byte character type. */ typedef char qse_mchar_t; +#define QSE_SIZEOF_MCHAR_T QSE_SIZEOF_CHAR /** * The qse_mcint_t defines a type that can hold a qse_mchar_t value and * #QSE_MCHAR_EOF. */ typedef int qse_mcint_t; +#define QSE_SIZEOF_MCINT_T QSE_SIZEOF_INT /** @typedef qse_wchar_t * The qse_wchar_t type defines a wide character type. diff --git a/qse/lib/cmn/tre-parse.c b/qse/lib/cmn/tre-parse.c index 30a95fb5..1893f73a 100644 --- a/qse/lib/cmn/tre-parse.c +++ b/qse/lib/cmn/tre-parse.c @@ -1539,8 +1539,7 @@ parse_brace: /* Escaped character. */ DPRINT(("tre_parse: escaped: '%.*" STRF "'\n", REST(ctx->re - 1))); - result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, - ctx->position); + result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, ctx->position); ctx->position++; ctx->re++; } diff --git a/qse/lib/cmn/tre.h b/qse/lib/cmn/tre.h index dbd630df..52b93823 100644 --- a/qse/lib/cmn/tre.h +++ b/qse/lib/cmn/tre.h @@ -91,7 +91,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define tre_tolower(c) QSE_TOLOWER(c) #define tre_toupper(c) QSE_TOUPPER(c) -typedef qse_char_t tre_char_t; +#if defined(QSE_CHAR_IS_MCHAR) && (QSE_SIZEOF_MCHAR_T == QSE_SIZEOF_CHAR) + typedef unsigned char tre_char_t; +#else + typedef qse_char_t tre_char_t; +#endif typedef qse_cint_t tre_cint_t; #define size_t qse_size_t diff --git a/qse/lib/sed/err.c b/qse/lib/sed/err.c index 818628de..6388953b 100644 --- a/qse/lib/sed/err.c +++ b/qse/lib/sed/err.c @@ -1,5 +1,5 @@ /* - * $Id: err.c 441 2011-04-22 14:28:43Z hyunghwan.chung $ + * $Id: err.c 560 2011-09-06 14:18:36Z hyunghwan.chung $ * Copyright 2006-2011 Chung, Hyung-Hwan. This file is part of QSE. @@ -52,8 +52,9 @@ const qse_char_t* qse_sed_dflerrstr (qse_sed_t* sed, qse_sed_errnum_t errnum) QSE_T("multiple occurrence specifiers"), QSE_T("occurrence specifier zero"), QSE_T("occurrence specifier too large"), - QSE_T("io error with file '${0}'"), - QSE_T("error returned by user io handler") + QSE_T("no previous regular expression"), + QSE_T("I/O error with file '${0}'"), + QSE_T("error returned by user I/O handler") }; return (errnum >= 0 && errnum < QSE_COUNTOF(errstr))? diff --git a/qse/lib/sed/sed.c b/qse/lib/sed/sed.c index 357e8e8e..7c5a515f 100644 --- a/qse/lib/sed/sed.c +++ b/qse/lib/sed/sed.c @@ -1,5 +1,5 @@ /* - * $Id: sed.c 559 2011-09-04 16:21:54Z hyunghwan.chung $ + * $Id: sed.c 560 2011-09-06 14:18:36Z hyunghwan.chung $ * Copyright 2006-2011 Chung, Hyung-Hwan. This file is part of QSE. @@ -33,6 +33,8 @@ QSE_IMPLEMENT_COMMON_FUNCTIONS (sed) static void free_command (qse_sed_t* sed, qse_sed_cmd_t* cmd); static void free_all_command_blocks (qse_sed_t* sed); +#define EMPTY_REX ((void*)1) + #define SETERR0(sed,num,loc) \ do { qse_sed_seterror (sed, num, QSE_NULL, loc); } while (0) @@ -160,7 +162,7 @@ void qse_sed_setmaxdepth (qse_sed_t* sed, int ids, qse_size_t depth) if (ids & QSE_SED_DEPTH_REX_BUILD) sed->depth.rex.build = depth; if (ids & QSE_SED_DEPTH_REX_MATCH) sed->depth.rex.match = depth; } -#endif +#else static qse_tre_t* maketre ( qse_sed_t* sed, const qse_cstr_t* str, const qse_sed_loc_t* loc) @@ -237,7 +239,7 @@ static int matchtre ( } return 1; } - +#endif /* check if c is a space character */ #define IS_SPACE(c) ((c) == QSE_T(' ') || (c) == QSE_T('\t') || (c) == QSE_T('\r')) @@ -282,15 +284,27 @@ static void free_address (qse_sed_t* sed, qse_sed_cmd_t* cmd) if (cmd->a2.type == QSE_SED_ADR_REX) { QSE_ASSERT (cmd->a2.u.rex != QSE_NULL); - /*qse_freerex (sed->mmgr, cmd->a2.u.rex);*/ - freetre (sed, cmd->a2.u.rex); + if (cmd->a2.u.rex != EMPTY_REX) + { +#ifdef USE_REX + qse_freerex (sed->mmgr, cmd->a2.u.rex); +#else + freetre (sed, cmd->a2.u.rex); +#endif + } cmd->a2.type = QSE_SED_ADR_NONE; } if (cmd->a1.type == QSE_SED_ADR_REX) { QSE_ASSERT (cmd->a1.u.rex != QSE_NULL); - /*qse_freerex (sed->mmgr, cmd->a1.u.rex);*/ - freetre (sed, cmd->a1.u.rex); + if (cmd->a1.u.rex != EMPTY_REX) + { +#ifdef USE_REX + qse_freerex (sed->mmgr, cmd->a1.u.rex); +#else + freetre (sed, cmd->a1.u.rex); +#endif + } cmd->a1.type = QSE_SED_ADR_NONE; } } @@ -368,9 +382,14 @@ static void free_command (qse_sed_t* sed, qse_sed_cmd_t* cmd) QSE_MMGR_FREE (sed->mmgr, cmd->u.subst.file.ptr); if (cmd->u.subst.rpl.ptr) QSE_MMGR_FREE (sed->mmgr, cmd->u.subst.rpl.ptr); - if (cmd->u.subst.rex) - /*qse_freerex (sed->mmgr, cmd->u.subst.rex);*/ + if (cmd->u.subst.rex && cmd->u.subst.rex != EMPTY_REX) + { +#ifdef USE_REX + qse_freerex (sed->mmgr, cmd->u.subst.rex); +#else freetre (sed, cmd->u.subst.rex); +#endif + } break; case QSE_SED_CMD_TRANSLATE: @@ -485,6 +504,8 @@ static void* compile_rex (qse_sed_t* sed, qse_char_t rxend) } } + if (QSE_STR_LEN(&sed->tmp.rex) == 0) return EMPTY_REX; + #ifdef USE_REX code = qse_buildrex ( sed->mmgr, @@ -1044,29 +1065,33 @@ static int get_subst (qse_sed_t* sed, qse_sed_cmd_t* cmd) QSE_ASSERT (cmd->u.subst.rex == QSE_NULL); -#ifdef USE_REX - cmd->u.subst.rex = qse_buildrex ( - sed->mmgr, - sed->depth.rex.build, - ((sed->option&QSE_SED_EXTENDEDREX)? 0:QSE_REX_NOBOUND), - QSE_STR_PTR(t[0]), - QSE_STR_LEN(t[0]), - QSE_NULL - ); - if (cmd->u.subst.rex == QSE_NULL) + if (QSE_STR_LEN(t[0]) <= 0) cmd->u.subst.rex = EMPTY_REX; + else { - SETERR1 ( - sed, QSE_SED_EREXBL, - QSE_STR_PTR(t[0]), +#ifdef USE_REX + cmd->u.subst.rex = qse_buildrex ( + sed->mmgr, + sed->depth.rex.build, + ((sed->option&QSE_SED_EXTENDEDREX)? 0:QSE_REX_NOBOUND), + QSE_STR_PTR(t[0]), QSE_STR_LEN(t[0]), - &sed->src.loc + QSE_NULL ); - goto oops; - } + if (cmd->u.subst.rex == QSE_NULL) + { + SETERR1 ( + sed, QSE_SED_EREXBL, + QSE_STR_PTR(t[0]), + QSE_STR_LEN(t[0]), + &sed->src.loc + ); + goto oops; + } #else - cmd->u.subst.rex = maketre (sed, QSE_STR_CSTR(t[0]), &sed->src.loc); - if (cmd->u.subst.rex == QSE_NULL) goto oops; + cmd->u.subst.rex = maketre (sed, QSE_STR_CSTR(t[0]), &sed->src.loc); + if (cmd->u.subst.rex == QSE_NULL) goto oops; #endif + } qse_str_yield (t[1], &cmd->u.subst.rpl, 0); if (cmd->u.subst.g == 0 && cmd->u.subst.occ == 0) cmd->u.subst.occ = 1; @@ -1891,7 +1916,7 @@ static int write_str_clearly ( { #ifdef QSE_CHAR_IS_MCHAR WRITE_CHAR (sed, QSE_T('\\')); - WRITE_NUM (sed, c, 8, QSE_SIZEOF(qse_char_t)*3); + WRITE_NUM (sed, (unsigned char)c, 8, QSE_SIZEOF(qse_char_t)*3); #else if (QSE_SIZEOF(qse_char_t) <= 2) { @@ -2001,6 +2026,7 @@ static int do_subst (qse_sed_t* sed, qse_sed_cmd_t* cmd) #ifdef USE_REX qse_rex_errnum_t errnum; #endif + const qse_char_t* finalizer = QSE_NULL; qse_cstr_t str, cur; const qse_char_t* str_end; @@ -2019,7 +2045,19 @@ static int do_subst (qse_sed_t* sed, qse_sed_cmd_t* cmd) str.len = QSE_STR_LEN(&sed->e.in.line); /* TODO: support different line end convension */ - if (str.len > 0 && str.ptr[str.len-1] == QSE_T('\n')) str.len--; + if (str.len > 0 && str.ptr[str.len-1] == QSE_T('\n')) + { + str.len--; + if (str.len > 0 && str.ptr[str.len-1] == QSE_T('\r')) + { + finalizer = QSE_T("\r\n"); + str.len--; + } + else + { + finalizer = QSE_T("\n"); + } + } str_end = str.ptr + str.len; cur = str; @@ -2034,13 +2072,35 @@ static int do_subst (qse_sed_t* sed, qse_sed_cmd_t* cmd) * end of string($) needs to be tested */ while (cur.ptr <= str_end) { -#ifdef USE_REX +#ifndef USE_REX + qse_cstr_t submat[9]; + QSE_MEMSET (submat, 0, QSE_SIZEOF(submat)); +#endif + if (max_count == 0 || sub_count < max_count) { + void* rex; + + if (cmd->u.subst.rex == EMPTY_REX) + { + rex = sed->e.last_rex; + if (rex == QSE_NULL) + { + SETERR0 (sed, QSE_SED_ENPREX, &cmd->loc); + return -1; + } + } + else + { + rex = cmd->u.subst.rex; + sed->e.last_rex = rex; + } + +#ifdef USE_REX n = qse_matchrex ( sed->mmgr, sed->depth.rex.match, - cmd->u.subst.rex, opt, + rex, opt, &str, &cur, &mat, &errnum ); if (n <= -1) @@ -2048,24 +2108,16 @@ static int do_subst (qse_sed_t* sed, qse_sed_cmd_t* cmd) SETERR0 (sed, QSE_SED_EREXMA, &cmd->loc); return -1; } - } - else n = 0; #else - qse_cstr_t submat[9]; - - QSE_MEMSET (submat, 0, QSE_SIZEOF(submat)); - if (max_count == 0 || sub_count < max_count) - { n = matchtre ( - sed, - cmd->u.subst.rex, + sed, rex, ((str.ptr == cur.ptr)? opt: (opt | QSE_TRE_NOTBOL)), &cur, &mat, submat, &cmd->loc ); if (n <= -1) return -1; +#endif } else n = 0; -#endif if (n == 0) { @@ -2193,10 +2245,9 @@ static int do_subst (qse_sed_t* sed, qse_sed_cmd_t* cmd) } } - if (str.len < QSE_STR_LEN(&sed->e.in.line)) + if (finalizer) { - /* TODO: support different line ending convension */ - m = qse_str_ccat (&sed->e.txt.subst, QSE_T('\n')); + m = qse_str_cat (&sed->e.txt.subst, finalizer); if (m == (qse_size_t)-1) { SETERR0 (sed, QSE_SED_ENOMEM, QSE_NULL); @@ -2251,6 +2302,7 @@ static int match_a (qse_sed_t* sed, qse_sed_cmd_t* cmd, qse_sed_adr_t* a) qse_cstr_t match; #endif qse_cstr_t line; + void* rex; QSE_ASSERT (a->u.rex != QSE_NULL); @@ -2258,13 +2310,31 @@ static int match_a (qse_sed_t* sed, qse_sed_cmd_t* cmd, qse_sed_adr_t* a) line.len = QSE_STR_LEN(&sed->e.in.line); if (line.len > 0 && - line.ptr[line.len-1] == QSE_T('\n')) line.len--; + line.ptr[line.len-1] == QSE_T('\n')) + { + line.len--; + if (line.len > 0 && line.ptr[line.len-1] == QSE_T('\r')) line.len--; + } + if (a->u.rex == EMPTY_REX) + { + rex = sed->e.last_rex; + if (rex == QSE_NULL) + { + SETERR0 (sed, QSE_SED_ENPREX, &cmd->loc); + return -1; + } + } + else + { + rex = a->u.rex; + sed->e.last_rex = rex; + } #ifdef USE_REX n = qse_matchrex ( sed->mmgr, sed->depth.rex.match, - a->u.rex, 0, + rex, 0, &line, &line, &match, &errnum); if (n <= -1) @@ -2275,7 +2345,7 @@ static int match_a (qse_sed_t* sed, qse_sed_cmd_t* cmd, qse_sed_adr_t* a) return n; #else - return matchtre (sed, a->u.rex, 0, &line, QSE_NULL, QSE_NULL, &cmd->loc); + return matchtre (sed, rex, 0, &line, QSE_NULL, QSE_NULL, &cmd->loc); #endif } @@ -2867,6 +2937,8 @@ int qse_sed_exec (qse_sed_t* sed, qse_sed_io_fun_t inf, qse_sed_io_fun_t outf) #endif }; + sed->e.last_rex = QSE_NULL; + sed->e.subst_done = 0; qse_lda_clear (&sed->e.txt.appended); qse_str_clear (&sed->e.txt.read); diff --git a/qse/lib/sed/sed.h b/qse/lib/sed/sed.h index d97712b0..354abe30 100644 --- a/qse/lib/sed/sed.h +++ b/qse/lib/sed/sed.h @@ -1,5 +1,5 @@ /* - * $Id: sed.h 558 2011-09-02 15:27:44Z hyunghwan.chung $ + * $Id: sed.h 560 2011-09-06 14:18:36Z hyunghwan.chung $ * Copyright 2006-2011 Chung, Hyung-Hwan. This file is part of QSE. @@ -39,7 +39,7 @@ enum qse_sed_depth_t QSE_SED_DEPTH_REX_BUILD = (1 << 0), QSE_SED_DEPTH_REX_MATCH = (1 << 1) }; -typedef enum qse_sed_depth_t qse_sed_depth_t +typedef enum qse_sed_depth_t qse_sed_depth_t; #endif #define QSE_SED_CMD_NOOP QSE_T('\0') @@ -270,6 +270,7 @@ struct qse_sed_t /** indicates if a successful substitution has been made * since the last read on the input stream. */ int subst_done; + void* last_rex; } e; }; diff --git a/qse/regress/sed/regress.out b/qse/regress/sed/regress.out index c721269b..35246d91 100644 --- a/qse/regress/sed/regress.out +++ b/qse/regress/sed/regress.out @@ -1,5 +1,5 @@ -------------------------------------------------------------------------------- -[CMD] qsesed -n -f s001.sed s001.dat &1 +[CMD] qsesed -n -r -f s001.sed s001.dat &1 -------------------------------------------------------------------------------- ab...c AAA diff --git a/qse/regress/sed/regress.out.xma b/qse/regress/sed/regress.out.xma index bc4b6491..fbc656db 100644 --- a/qse/regress/sed/regress.out.xma +++ b/qse/regress/sed/regress.out.xma @@ -1,5 +1,5 @@ -------------------------------------------------------------------------------- -[CMD] qsesed -m 500000 -n -f s001.sed s001.dat &1 +[CMD] qsesed -m 500000 -n -r -f s001.sed s001.dat &1 -------------------------------------------------------------------------------- ab...c AAA