diff --git a/qse/cmd/awk/awk.c b/qse/cmd/awk/awk.c index efbe5f07..a245ecc7 100644 --- a/qse/cmd/awk/awk.c +++ b/qse/cmd/awk/awk.c @@ -1,5 +1,5 @@ /* - * $Id: awk.c 206 2009-06-21 13:33:05Z hyunghwan.chung $ + * $Id: awk.c 207 2009-06-22 13:01:28Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. @@ -331,20 +331,20 @@ struct opttab_t { { QSE_T("implicit"), QSE_AWK_IMPLICIT, QSE_T("allow undeclared variables") }, { QSE_T("explicit"), QSE_AWK_EXPLICIT, QSE_T("allow declared variables(local,global)") }, - { QSE_T("bxor"), QSE_AWK_BXOR, QSE_T("enable bit-wise xor operator(^)") }, + { QSE_T("bxor"), QSE_AWK_BXOR, QSE_T("enable bit-wise XOR operator(^)") }, { QSE_T("shift"), QSE_AWK_SHIFT, QSE_T("enable shift operators(<<,>>)") }, { QSE_T("idiv"), QSE_AWK_IDIV, QSE_T("enable idiv operator(//)") }, - { QSE_T("rio"), QSE_AWK_RIO, QSE_T("") }, + { QSE_T("rio"), QSE_AWK_RIO, QSE_T("enable builtin I/O including getline & print") }, { QSE_T("rwpipe"), QSE_AWK_RWPIPE, QSE_T("allow a dual-directional pipe") }, - { QSE_T("newline"), QSE_AWK_NEWLINE, QSE_T("") }, - { QSE_T("stripspaces"), QSE_AWK_STRIPSPACES, QSE_T("") }, - { QSE_T("nextofile"), QSE_AWK_NEXTOFILE, QSE_T("") }, - { QSE_T("crfl"), QSE_AWK_CRLF, QSE_T("") }, - { QSE_T("reset"), QSE_AWK_RESET, QSE_T("") }, - { QSE_T("maptovar"), QSE_AWK_MAPTOVAR, QSE_T("") }, - { QSE_T("pablock"), QSE_AWK_PABLOCK, QSE_T("") }, - { QSE_T("rexbound"), QSE_AWK_REXBOUND, QSE_T("") }, - { QSE_T("ncmponstr"), QSE_AWK_NCMPONSTR, QSE_T("") }, + { QSE_T("newline"), QSE_AWK_NEWLINE, QSE_T("enable a newline to terminate a statement") }, + { QSE_T("stripspaces"), QSE_AWK_STRIPSPACES, QSE_T("strip leading and trailing space of a record") }, + { QSE_T("nextofile"), QSE_AWK_NEXTOFILE, QSE_T("enable 'nextofile'") }, + { QSE_T("reset"), QSE_AWK_RESET, QSE_T("enable 'reset'") }, + { QSE_T("crlf"), QSE_AWK_CRLF, QSE_T("use CRLF for a newline") }, + { QSE_T("maptovar"), QSE_AWK_MAPTOVAR, QSE_T("allow a map to be assigned or returned") }, + { 
QSE_T("pablock"), QSE_AWK_PABLOCK, QSE_T("enable pattern-action loop") }, + { QSE_T("rexbound"), QSE_AWK_REXBOUND, QSE_T("enable {n,m} in a regular expression") }, + { QSE_T("ncmponstr"), QSE_AWK_NCMPONSTR, QSE_T("perform numeric comparsion on numeric strings") }, { QSE_NULL, 0 } }; @@ -357,7 +357,7 @@ static void print_usage (QSE_FILE* out, const qse_char_t* argv0) qse_fprintf (out, QSE_T("Where options are:\n")); qse_fprintf (out, QSE_T(" -h/--help print this message\n")); qse_fprintf (out, QSE_T(" -d show extra information\n")); - qse_fprintf (out, QSE_T(" -c/--call name calls a function instead of entering\n")); + qse_fprintf (out, QSE_T(" -c/--call name call a function instead of entering\n")); qse_fprintf (out, QSE_T(" the pattern-action loop\n")); qse_fprintf (out, QSE_T(" -f/--file sourcefile set the source script file\n")); qse_fprintf (out, QSE_T(" -o/--deparsed-file deparsedfile set the deparsing output file\n")); @@ -384,8 +384,8 @@ static int comparg (int argc, qse_char_t* argv[], struct arg_t* arg) { QSE_T(":newline"), QSE_T('\0') }, { QSE_T(":stripspaces"), QSE_T('\0') }, { QSE_T(":nextofile"), QSE_T('\0') }, - { QSE_T(":crlf"), QSE_T('\0') }, { QSE_T(":reset"), QSE_T('\0') }, + { QSE_T(":crlf"), QSE_T('\0') }, { QSE_T(":maptovar"), QSE_T('\0') }, { QSE_T(":pablock"), QSE_T('\0') }, { QSE_T(":rexbound"), QSE_T('\0') }, @@ -411,7 +411,6 @@ static int comparg (int argc, qse_char_t* argv[], struct arg_t* arg) qse_size_t isfc = 16; /* the capacity of isf */ qse_size_t isfl = 0; /* number of input source files */ - qse_size_t argl = 0; qse_size_t icfc = 0; /* the capacity of icf */ qse_size_t icfl = 0; /* the number of input console files */ @@ -678,8 +677,8 @@ qse_map_walk_t add_global (qse_map_t* map, qse_map_pair_t* pair, void* arg) static int awk_main (int argc, qse_char_t* argv[]) { - qse_awk_t* awk; - qse_awk_rtx_t* rtx; + qse_awk_t* awk = QSE_NULL; + qse_awk_rtx_t* rtx = QSE_NULL; qse_awk_rcb_t rcb; int i; diff --git a/qse/include/qse/awk/Awk.hpp 
b/qse/include/qse/awk/Awk.hpp index 25888cc8..c0e96413 100644 --- a/qse/include/qse/awk/Awk.hpp +++ b/qse/include/qse/awk/Awk.hpp @@ -1,5 +1,5 @@ /* - * $Id: Awk.hpp 206 2009-06-21 13:33:05Z hyunghwan.chung $ + * $Id: Awk.hpp 207 2009-06-22 13:01:28Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. @@ -557,10 +557,10 @@ public: /** Support the nextofile statement */ OPT_NEXTOFILE = QSE_AWK_NEXTOFILE, - /** Use CR+LF instead of LF for line breaking. */ - OPT_CRLF = QSE_AWK_CRLF, /** Enables the keyword 'reset' */ OPT_RESET = QSE_AWK_RESET, + /** Use CR+LF instead of LF for line breaking. */ + OPT_CRLF = QSE_AWK_CRLF, /** Allows the assignment of a map value to a variable */ OPT_MAPTOVAR = QSE_AWK_MAPTOVAR, /** Allows BEGIN, END, pattern-action blocks */ diff --git a/qse/include/qse/awk/awk.h b/qse/include/qse/awk/awk.h index 108fa9b3..c683cc5a 100644 --- a/qse/include/qse/awk/awk.h +++ b/qse/include/qse/awk/awk.h @@ -1,5 +1,5 @@ /* - * $Id: awk.h 206 2009-06-21 13:33:05Z hyunghwan.chung $ + * $Id: awk.h 207 2009-06-22 13:01:28Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. @@ -388,11 +388,11 @@ enum qse_awk_option_t /** enables @b nextofile */ QSE_AWK_NEXTOFILE = (1 << 12), - /** CR + LF by default */ - QSE_AWK_CRLF = (1 << 13), - /** enables @b reset */ - QSE_AWK_RESET = (1 << 14), + QSE_AWK_RESET = (1 << 13), + + /** CR + LF by default */ + QSE_AWK_CRLF = (1 << 14), /** allows the assignment of a map value to a variable */ QSE_AWK_MAPTOVAR = (1 << 15), diff --git a/qse/include/qse/awk/std.h b/qse/include/qse/awk/std.h index c5cbb0de..624225a2 100644 --- a/qse/include/qse/awk/std.h +++ b/qse/include/qse/awk/std.h @@ -1,5 +1,5 @@ /* - * $Id: std.h 206 2009-06-21 13:33:05Z hyunghwan.chung $ + * $Id: std.h 207 2009-06-22 13:01:28Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. 
@@ -33,6 +33,7 @@ * @todo * - StdAwk ARGV and console name handling * - add RQ and LQ for more powerful record splitting + * - improve performance in qse_awk_rtx_readio() if RS is longer than 2 chars. */ /** diff --git a/qse/lib/awk/err.c b/qse/lib/awk/err.c index 6a6a513b..cb976614 100644 --- a/qse/lib/awk/err.c +++ b/qse/lib/awk/err.c @@ -1,5 +1,5 @@ /* - * $Id: err.c 205 2009-06-20 12:47:34Z hyunghwan.chung $ + * $Id: err.c 207 2009-06-22 13:01:28Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. @@ -64,8 +64,8 @@ const qse_char_t* qse_awk_dflerrstr (qse_awk_t* awk, qse_awk_errnum_t errnum) QSE_T("cannot unget character"), QSE_T("unexpected end of source"), - QSE_T("a comment not cloawk properly"), - QSE_T("a string or a regular expression not cloawk"), + QSE_T("a comment not closed properly"), + QSE_T("a string or a regular expression not closed"), QSE_T("unexpected end of a regular expression"), QSE_T("a left brace expected in place of '${0}'"), QSE_T("a left parenthesis expected in place of '${0}'"), diff --git a/qse/lib/awk/parse.c b/qse/lib/awk/parse.c index 1cd781f1..3a8f6711 100644 --- a/qse/lib/awk/parse.c +++ b/qse/lib/awk/parse.c @@ -1,5 +1,5 @@ /* - * $Id: parse.c 205 2009-06-20 12:47:34Z hyunghwan.chung $ + * $Id: parse.c 207 2009-06-22 13:01:28Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. @@ -2558,6 +2558,7 @@ static qse_awk_nde_t* parse_concat (qse_awk_t* awk, qse_size_t line) MATCH(awk,TOKEN_MINUS) || MATCH(awk,TOKEN_PLUSPLUS) || MATCH(awk,TOKEN_MINUSMINUS) || + MATCH(awk,TOKEN_LNOT) || awk->token.type >= TOKEN_GETLINE) { /* TODO: is the check above sufficient? 
*/ @@ -5041,7 +5042,6 @@ static int get_number (qse_awk_t* awk) return 0; } - #if 0 else if (c == QSE_T('b') || c == QSE_T('B')) { /* binary number */ @@ -5054,7 +5054,6 @@ static int get_number (qse_awk_t* awk) return 0; } - #endif else if (c != '.') { /* octal number */ diff --git a/qse/lib/awk/rio.c b/qse/lib/awk/rio.c index 320a0cbb..984e609d 100644 --- a/qse/lib/awk/rio.c +++ b/qse/lib/awk/rio.c @@ -1,5 +1,5 @@ /* - * $Id: rio.c 202 2009-06-16 06:05:40Z hyunghwan.chung $ + * $Id: rio.c 207 2009-06-22 13:01:28Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. @@ -279,13 +279,13 @@ int qse_awk_rtx_readio ( QSE_STR_PTR(buf), QSE_STR_LEN(buf), QSE_STR_PTR(buf), QSE_STR_LEN(buf), &match, &run->errinf.num); - if (n == -1) + if (n <= -1) { ret = -1; break; } - if (n == 1) + if (n >= 1) { /* the match should be found at the end of * the current buffer */ @@ -359,6 +359,10 @@ int qse_awk_rtx_readio ( { qse_cstr_t match; +/* TODO: minimize the number of regular expression match here... + * currently matchrex is called for each character added to buf. + * this is a very bad way of doing the job. + */ QSE_ASSERT (run->gbl.rs != QSE_NULL); n = QSE_AWK_MATCHREX ( @@ -367,14 +371,14 @@ int qse_awk_rtx_readio ( QSE_STR_PTR(buf), QSE_STR_LEN(buf), QSE_STR_PTR(buf), QSE_STR_LEN(buf), &match, &run->errinf.num); - if (n == -1) + if (n <= -1) { ret = -1; p->in.pos--; /* unread the character in c */ break; } - if (n == 1) + if (n >= 1) { /* the match should be found at the end of * the current buffer */ diff --git a/qse/lib/awk/run.c b/qse/lib/awk/run.c index e485c7e3..e31a7b39 100644 --- a/qse/lib/awk/run.c +++ b/qse/lib/awk/run.c @@ -1,5 +1,5 @@ /* - * $Id: run.c 206 2009-06-21 13:33:05Z hyunghwan.chung $ + * $Id: run.c 207 2009-06-22 13:01:28Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. 
@@ -77,10 +77,10 @@ static qse_size_t push_arg_from_vals ( qse_awk_rtx_t* rtx, qse_awk_nde_call_t* call, void* data); static int init_rtx (qse_awk_rtx_t* rtx, qse_awk_t* awk, qse_awk_rio_t* rio); -static void fini_rtx (qse_awk_rtx_t* rtx); +static void fini_rtx (qse_awk_rtx_t* rtx, int fini_globals); static int init_globals (qse_awk_rtx_t* rtx, const qse_cstr_t* arg); -static void fini_globals (qse_awk_rtx_t* rtx); +static void refdown_globals (qse_awk_rtx_t* run, int pop); static int run_pattern_blocks (qse_awk_rtx_t* run); static int run_pattern_block_chain ( @@ -415,7 +415,7 @@ static int set_global ( qse_real_t rv; n = qse_awk_rtx_valtonum (run, val, &lv, &rv); - if (n == -1) return -1; + if (n <= -1) return -1; if (n == 1) lv = (qse_long_t)rv; if (lv < (qse_long_t)run->inrec.nflds) @@ -682,7 +682,7 @@ qse_awk_rtx_t* qse_awk_rtx_open ( if (init_globals (rtx, arg) == -1) { - fini_rtx (rtx); + fini_rtx (rtx, 0); QSE_AWK_FREE (awk, rtx); return QSE_NULL; } @@ -692,8 +692,7 @@ qse_awk_rtx_t* qse_awk_rtx_open ( void qse_awk_rtx_close (qse_awk_rtx_t* rtx) { - fini_globals (rtx); - fini_rtx (rtx); + fini_rtx (rtx, 1); QSE_AWK_FREE (rtx->awk, rtx); } @@ -854,7 +853,7 @@ static int init_rtx (qse_awk_rtx_t* rtx, qse_awk_t* awk, qse_awk_rio_t* rio) return 0; } -static void fini_rtx (qse_awk_rtx_t* rtx) +static void fini_rtx (qse_awk_rtx_t* rtx, int fini_globals) { if (rtx->pattern_range_state != QSE_NULL) QSE_AWK_FREE (rtx->awk, rtx->pattern_range_state); @@ -922,8 +921,8 @@ static void fini_rtx (qse_awk_rtx_t* rtx) qse_str_fini (&rtx->format.fmt); qse_str_fini (&rtx->format.out); - /* destroy input record. qse_awk_rtx_clrrec should be called - * before the rtx stack has been destroyed because it may try + /* destroy input record. qse_awk_rtx_clrrec() should be called + * before the stack has been destroyed because it may try * to change the value to QSE_AWK_GBL_NF. 
*/ qse_awk_rtx_clrrec (rtx, QSE_FALSE); if (rtx->inrec.flds != QSE_NULL) @@ -934,6 +933,8 @@ static void fini_rtx (qse_awk_rtx_t* rtx) } qse_str_fini (&rtx->inrec.line); + if (fini_globals) refdown_globals (rtx, 1); + /* destroy the stack if necessary */ if (rtx->stack != QSE_NULL) { @@ -1248,11 +1249,6 @@ oops: return -1; } -static void fini_globals (qse_awk_rtx_t* rtx) -{ - refdown_globals (rtx, 1); -} - struct capture_retval_data_t { qse_awk_rtx_t* rtx; @@ -4028,7 +4024,6 @@ static int __cmp_int_str ( if (rtx->awk->option & QSE_AWK_NCMPONSTR) { - const qse_char_t* end; qse_long_t ll; qse_real_t rr; @@ -4180,9 +4175,6 @@ static int __cmp_str_real ( static int __cmp_str_str ( qse_awk_rtx_t* rtx, qse_awk_val_t* left, qse_awk_val_t* right) { - int n1, n2; - qse_long_t l1, l2; - qse_real_t r1, r2; qse_awk_val_str_t* ls, * rs; ls = (qse_awk_val_str_t*)left; diff --git a/qse/lib/cmn/assert.c b/qse/lib/cmn/assert.c index c4d3ee9a..8e0ca120 100644 --- a/qse/lib/cmn/assert.c +++ b/qse/lib/cmn/assert.c @@ -71,7 +71,7 @@ void qse_assert_failed ( const qse_char_t* expr, const qse_char_t* desc, const qse_char_t* file, qse_size_t line) { - qse_sio_puts (QSE_SIO_ERR, QSE_T("=[ASSERTION FAILURE]============================================================")); + qse_sio_puts (QSE_SIO_ERR, QSE_T("=[ASSERTION FAILURE]============================================================\n")); qse_sio_puts (QSE_SIO_ERR, QSE_T("FILE ")); qse_sio_puts (QSE_SIO_ERR, file); @@ -89,7 +89,7 @@ void qse_assert_failed ( qse_sio_puts (QSE_SIO_ERR, desc); qse_sio_puts (QSE_SIO_ERR, QSE_T("\n")); } - qse_sio_puts (QSE_SIO_ERR, QSE_T("================================================================================")); + qse_sio_puts (QSE_SIO_ERR, QSE_T("================================================================================\n")); qse_sio_flush (QSE_SIO_ERR); #ifdef _WIN32 diff --git a/qse/lib/cmn/rex.c b/qse/lib/cmn/rex.c index 6318fbfc..b04eef51 100644 --- a/qse/lib/cmn/rex.c +++ 
b/qse/lib/cmn/rex.c @@ -1,5 +1,5 @@ /* - * $Id: rex.c 204 2009-06-18 12:08:06Z hyunghwan.chung $ + * $Id: rex.c 207 2009-06-22 13:01:28Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. @@ -39,7 +39,7 @@ enum LEVEL_RANGE }; -enum +enum { CMD_BOL, CMD_EOL, @@ -65,6 +65,7 @@ enum }; #define DEF_CODE_CAPA 512 + #define BOUND_MIN 0 #define BOUND_MAX (QSE_TYPE_MAX(qse_size_t)) @@ -72,7 +73,7 @@ typedef struct builder_t builder_t; typedef struct matcher_t matcher_t; typedef struct match_t match_t; -typedef struct code_t code_t; +typedef struct atom_t atom_t; typedef struct rhdr_t rhdr_t; typedef struct bhdr_t bhdr_t; typedef struct cshdr_t cshdr_t; @@ -153,12 +154,11 @@ struct match_t #include -QSE_BEGIN_PACKED_STRUCT (code_t) - /*qse_byte_t cmd;*/ - short cmd; - short negate; /* only for CMD_CHARSET */ - qse_size_t lbound; - qse_size_t ubound; +QSE_BEGIN_PACKED_STRUCT (atom_t) + short cmd; /* CMD_XXX */ + short negate; /* only for CMD_CHARSET */ + qse_size_t lbound; /* lower bound */ + qse_size_t ubound; /* upper bound */ QSE_END_PACKED_STRUCT () /* compiled regular expression header */ @@ -196,10 +196,11 @@ static int build_pattern (builder_t* rex); static int build_pattern0 (builder_t* rex); static int build_branch (builder_t* rex); static int build_atom (builder_t* rex); -static int build_charset (builder_t* rex, code_t* cmd); -static int build_occurrences (builder_t* rex, code_t* cmd); -static int build_cclass (builder_t* rex, qse_char_t* cc); -static int build_range (builder_t* rex, code_t* cmd); +static int build_atom_charset (builder_t* rex, atom_t* cmd); +static int build_atom_occ (builder_t* rex, atom_t* cmd); +static int build_atom_cclass (builder_t* rex, qse_char_t* cc); +static int build_atom_occ_range (builder_t* rex, atom_t* cmd); + static int next_char (builder_t* rex, int level); static int add_code (builder_t* rex, void* data, qse_size_t len); @@ -402,7 +403,7 @@ void* qse_buildrex ( builder.code.capa = DEF_CODE_CAPA; builder.code.size = 0; 
builder.code.buf = (qse_byte_t*) - QSE_MALLOC (builder.mmgr, builder.code.capa); + QSE_MMGR_ALLOC (builder.mmgr, builder.code.capa); if (builder.code.buf == QSE_NULL) { *errnum = QSE_REX_ENOMEM; @@ -424,14 +425,14 @@ void* qse_buildrex ( if (next_char (&builder, LEVEL_TOP) == -1) { if (errnum != QSE_NULL) *errnum = builder.errnum; - QSE_FREE (builder.mmgr, builder.code.buf); + QSE_MMGR_FREE (builder.mmgr, builder.code.buf); return QSE_NULL; } if (build_pattern (&builder) == -1) { if (errnum != QSE_NULL) *errnum = builder.errnum; - QSE_FREE (builder.mmgr, builder.code.buf); + QSE_MMGR_FREE (builder.mmgr, builder.code.buf); return QSE_NULL; } @@ -455,7 +456,7 @@ void* qse_buildrex ( } } - QSE_FREE (builder.mmgr, builder.code.buf); + QSE_MMGR_FREE (builder.mmgr, builder.code.buf); return QSE_NULL; } @@ -543,7 +544,7 @@ int qse_matchrex ( void qse_freerex (qse_mmgr_t* mmgr, void* code) { QSE_ASSERT (code != QSE_NULL); - QSE_FREE (mmgr, code); + QSE_MMGR_FREE (mmgr, code); } qse_bool_t qse_isemptyrex (void* code) @@ -636,7 +637,7 @@ static int build_branch (builder_t* builder) qse_size_t zero = 0; qse_size_t old_size; qse_size_t pos_na; - code_t* cmd; + atom_t* cmd; bhdr_t* bhdr; old_size = builder->code.size; @@ -647,7 +648,7 @@ static int build_branch (builder_t* builder) while (1) { - cmd = (code_t*)&builder->code.buf[builder->code.size]; + cmd = (atom_t*)&builder->code.buf[builder->code.size]; n = build_atom (builder); if (n == -1) @@ -658,7 +659,7 @@ static int build_branch (builder_t* builder) if (n == 0) break; /* no atom */ - n = build_occurrences (builder, cmd); + n = build_atom_occ (builder, cmd); if (n == -1) { builder->code.size = old_size; @@ -666,7 +667,7 @@ static int build_branch (builder_t* builder) } /* n == 0 no bound character. 
just continue */ - /* n == 1 bound has been applied by build_occurrences */ + /* n == 1 bound has been applied by build_atom_occ */ bhdr = (bhdr_t*)&builder->code.buf[pos_na]; bhdr->na++; @@ -681,7 +682,7 @@ static int build_branch (builder_t* builder) static int build_atom (builder_t* builder) { int n; - code_t tmp; + atom_t tmp; if (builder->ptn.curc.type == CT_EOF) return 0; @@ -733,9 +734,9 @@ static int build_atom (builder_t* builder) } else if (builder->ptn.curc.value == QSE_T('[')) { - code_t* cmd; + atom_t* cmd; - cmd = (code_t*)&builder->code.buf[builder->code.size]; + cmd = (atom_t*)&builder->code.buf[builder->code.size]; tmp.cmd = CMD_CHARSET; tmp.negate = 0; @@ -745,7 +746,7 @@ static int build_atom (builder_t* builder) NEXT_CHAR (builder, LEVEL_CHARSET); - n = build_charset (builder, cmd); + n = build_atom_charset (builder, cmd); if (n == -1) return -1; QSE_ASSERT (n != 0); @@ -782,7 +783,7 @@ static int build_atom (builder_t* builder) } } -static int build_charset (builder_t* builder, code_t* cmd) +static int build_atom_charset (builder_t* builder, atom_t* cmd) { qse_size_t zero = 0; qse_size_t old_size; @@ -814,7 +815,7 @@ static int build_charset (builder_t* builder, code_t* cmd) builder->ptn.curc.type == CT_NORMAL && builder->ptn.curc.value == QSE_T(':')) { - if (build_cclass (builder, &c1) == -1) return -1; + if (build_atom_cclass (builder, &c1) == -1) return -1; cc = cc | 1; } @@ -834,7 +835,7 @@ static int build_charset (builder_t* builder, code_t* cmd) builder->ptn.curc.type == CT_NORMAL && builder->ptn.curc.value == QSE_T(':')) { - if (build_cclass (builder, &c2) == -1) + if (build_atom_cclass (builder, &c2) == -1) { return -1; } @@ -872,7 +873,7 @@ static int build_charset (builder_t* builder, code_t* cmd) { /* invalid range */ #ifdef DEBUG_REX - DPUTS (QSE_T("build_charset: invalid character set range\n")); + DPUTS (QSE_T("build_atom_charset: invalid character set range\n")); #endif builder->errnum = QSE_REX_ECRANGE; return -1; @@ -888,7 
+889,7 @@ static int build_charset (builder_t* builder, code_t* cmd) return 1; } -static int build_cclass (builder_t* builder, qse_char_t* cc) +static int build_atom_cclass (builder_t* builder, qse_char_t* cc) { const struct __char_class_t* ccp = __char_class; qse_size_t len = builder->ptn.end - builder->ptn.curp; @@ -903,7 +904,7 @@ static int build_cclass (builder_t* builder, qse_char_t* cc) { /* wrong class name */ #ifdef DEBUG_REX - DPUTS (QSE_T("build_cclass: wrong class name\n")); + DPUTS (QSE_T("build_atom_cclass: wrong class name\n")); #endif builder->errnum = QSE_REX_ECCLASS; return -1; @@ -916,7 +917,7 @@ static int build_cclass (builder_t* builder, qse_char_t* cc) builder->ptn.curc.value != QSE_T(':')) { #ifdef DEBUG_REX - DPUTS (QSE_T("build_cclass: a colon(:) expected\n")); + DPUTS (QSE_T("build_atom_cclass: a colon(:) expected\n")); #endif builder->errnum = QSE_REX_ECOLON; return -1; @@ -929,7 +930,7 @@ static int build_cclass (builder_t* builder, qse_char_t* cc) builder->ptn.curc.value != QSE_T(']')) { #ifdef DEBUG_REX - DPUTS (QSE_T("build_cclass: ] expected\n")); + DPUTS (QSE_T("build_atom_cclass: ] expected\n")); #endif builder->errnum = QSE_REX_ERBRACKET; return -1; @@ -941,7 +942,7 @@ static int build_cclass (builder_t* builder, qse_char_t* cc) return 1; } -static int build_occurrences (builder_t* builder, code_t* cmd) +static int build_atom_occ (builder_t* builder, atom_t* cmd) { if (builder->ptn.curc.type != CT_SPECIAL) return 0; @@ -975,7 +976,7 @@ static int build_occurrences (builder_t* builder, code_t* cmd) { NEXT_CHAR (builder, LEVEL_RANGE); - if (build_range(builder, cmd) == -1) return -1; + if (build_atom_occ_range(builder, cmd) == -1) return -1; if (builder->ptn.curc.type != CT_SPECIAL || builder->ptn.curc.value != QSE_T('}')) @@ -992,7 +993,7 @@ static int build_occurrences (builder_t* builder, code_t* cmd) return 0; } -static int build_range (builder_t* builder, code_t* cmd) +static int build_atom_occ_range (builder_t* builder, 
atom_t* cmd) { qse_size_t bound; @@ -1234,7 +1235,7 @@ static int add_code (builder_t* builder, void* data, qse_size_t len) if (builder->mmgr->realloc != QSE_NULL) { - tmp = (qse_byte_t*) QSE_REALLOC ( + tmp = (qse_byte_t*) QSE_MMGR_REALLOC ( builder->mmgr, builder->code.buf, capa); if (tmp == QSE_NULL) { @@ -1244,7 +1245,7 @@ static int add_code (builder_t* builder, void* data, qse_size_t len) } else { - tmp = (qse_byte_t*) QSE_MALLOC (builder->mmgr, capa); + tmp = (qse_byte_t*) QSE_MMGR_ALLOC (builder->mmgr, capa); if (tmp == QSE_NULL) { builder->errnum = QSE_REX_ENOMEM; @@ -1254,7 +1255,7 @@ static int add_code (builder_t* builder, void* data, qse_size_t len) if (builder->code.buf != QSE_NULL) { QSE_MEMCPY (tmp, builder->code.buf, builder->code.capa); - QSE_FREE (builder->mmgr, builder->code.buf); + QSE_MMGR_FREE (builder->mmgr, builder->code.buf); } } @@ -1420,19 +1421,19 @@ static const qse_byte_t* match_atom ( }; QSE_ASSERT ( - ((code_t*)base)->cmd >= 0 && - ((code_t*)base)->cmd < QSE_COUNTOF(matchers)); + ((atom_t*)base)->cmd >= 0 && + ((atom_t*)base)->cmd < QSE_COUNTOF(matchers)); - return matchers[((code_t*)base)->cmd] (matcher, base, mat); + return matchers[((atom_t*)base)->cmd] (matcher, base, mat); } static const qse_byte_t* match_bol ( matcher_t* matcher, const qse_byte_t* base, match_t* mat) { const qse_byte_t* p = base; - const code_t* cp; + const atom_t* cp; - cp = (const code_t*)p; p += QSE_SIZEOF(*cp); + cp = (const atom_t*)p; p += QSE_SIZEOF(*cp); QSE_ASSERT (cp->cmd == CMD_BOL); /*mat->matched = (mat->match_ptr == matcher->match.str.ptr || @@ -1448,9 +1449,9 @@ static const qse_byte_t* match_eol ( matcher_t* matcher, const qse_byte_t* base, match_t* mat) { const qse_byte_t* p = base; - const code_t* cp; + const atom_t* cp; - cp = (const code_t*)p; p += QSE_SIZEOF(*cp); + cp = (const atom_t*)p; p += QSE_SIZEOF(*cp); QSE_ASSERT (cp->cmd == CMD_EOL); /*mat->matched = (mat->match_ptr == matcher->match.str.end || @@ -1466,10 +1467,10 @@ static const 
qse_byte_t* match_any_char ( matcher_t* matcher, const qse_byte_t* base, match_t* mat) { const qse_byte_t* p = base; - const code_t* cp; + const atom_t* cp; qse_size_t si = 0, lbound, ubound; - cp = (const code_t*)p; p += QSE_SIZEOF(*cp); + cp = (const atom_t*)p; p += QSE_SIZEOF(*cp); QSE_ASSERT (cp->cmd == CMD_ANY_CHAR); lbound = cp->lbound; @@ -1480,10 +1481,10 @@ static const qse_byte_t* match_any_char ( /* merge the same consecutive codes */ while (p < mat->branch_end && - cp->cmd == ((const code_t*)p)->cmd) + cp->cmd == ((const atom_t*)p)->cmd) { - lbound += ((const code_t*)p)->lbound; - ubound += ((const code_t*)p)->ubound; + lbound += ((const atom_t*)p)->lbound; + ubound += ((const atom_t*)p)->ubound; p += QSE_SIZEOF(*cp); } @@ -1518,11 +1519,11 @@ static const qse_byte_t* match_ord_char ( matcher_t* matcher, const qse_byte_t* base, match_t* mat) { const qse_byte_t* p = base; - const code_t* cp; + const atom_t* cp; qse_size_t si = 0, lbound, ubound; qse_char_t cc; - cp = (const code_t*)p; p += QSE_SIZEOF(*cp); + cp = (const atom_t*)p; p += QSE_SIZEOF(*cp); QSE_ASSERT (cp->cmd == CMD_ORD_CHAR); lbound = cp->lbound; @@ -1536,12 +1537,12 @@ static const qse_byte_t* match_ord_char ( if (matcher->option & QSE_REX_MATCH_IGNORECASE) { while (p < mat->branch_end && - cp->cmd == ((const code_t*)p)->cmd) + cp->cmd == ((const atom_t*)p)->cmd) { if (QSE_TOUPPER (*(qse_char_t*)(p+QSE_SIZEOF(*cp))) != cc) break; - lbound += ((const code_t*)p)->lbound; - ubound += ((const code_t*)p)->ubound; + lbound += ((const atom_t*)p)->lbound; + ubound += ((const atom_t*)p)->ubound; p += QSE_SIZEOF(*cp) + QSE_SIZEOF(cc); } @@ -1549,12 +1550,12 @@ static const qse_byte_t* match_ord_char ( else { while (p < mat->branch_end && - cp->cmd == ((const code_t*)p)->cmd) + cp->cmd == ((const atom_t*)p)->cmd) { if (*(qse_char_t*)(p+QSE_SIZEOF(*cp)) != cc) break; - lbound += ((const code_t*)p)->lbound; - ubound += ((const code_t*)p)->ubound; + lbound += ((const atom_t*)p)->lbound; + ubound += 
((const atom_t*)p)->ubound; p += QSE_SIZEOF(*cp) + QSE_SIZEOF(cc); } @@ -1621,10 +1622,10 @@ static const qse_byte_t* match_charset ( qse_bool_t n; qse_char_t c; - code_t* cp; + atom_t* cp; cshdr_t* cshdr; - cp = (code_t*)p; p += QSE_SIZEOF(*cp); + cp = (atom_t*)p; p += QSE_SIZEOF(*cp); QSE_ASSERT (cp->cmd == CMD_CHARSET); cshdr = (cshdr_t*)p; p += QSE_SIZEOF(*cshdr); @@ -1672,11 +1673,13 @@ static const qse_byte_t* match_group ( matcher_t* matcher, const qse_byte_t* base, match_t* mat) { const qse_byte_t* p = base; - const code_t* cp; + const atom_t* cp; match_t mat2; - qse_size_t si = 0, grp_len_static[16], * grp_len; + qse_size_t si = 0, grp_len_static[16], * grp_len, grp_len_capa; + + cp = (const atom_t*)p; + p += QSE_SIZEOF(*cp); /* points to a subpattern in a group */ - cp = (const code_t*)p; p += QSE_SIZEOF(*cp); QSE_ASSERT (cp->cmd == CMD_GROUP); mat->matched = QSE_FALSE; @@ -1708,12 +1711,16 @@ static const qse_byte_t* match_group ( if (cp->ubound < QSE_COUNTOF(grp_len_static)) { + grp_len_capa = QSE_COUNTOF(grp_len_static); grp_len = grp_len_static; } else { - grp_len = (qse_size_t*) QSE_MALLOC ( - matcher->mmgr, QSE_SIZEOF(qse_size_t) * cp->ubound); + grp_len_capa = cp->ubound; + if (grp_len_capa > 256) grp_len_capa = 256; + + grp_len = (qse_size_t*) QSE_MMGR_ALLOC ( + matcher->mmgr, QSE_SIZEOF(qse_size_t) * grp_len_capa); if (grp_len == QSE_NULL) { matcher->errnum = QSE_REX_ENOMEM; @@ -1736,11 +1743,31 @@ static const qse_byte_t* match_group ( if (match_pattern (matcher, p, &mat2) == QSE_NULL) { if (grp_len != grp_len_static) - QSE_FREE (matcher->mmgr, grp_len); + QSE_MMGR_FREE (matcher->mmgr, grp_len); return QSE_NULL; } if (!mat2.matched) break; + if ((si + 1) >= grp_len_capa) + { + qse_size_t* tmp; + + QSE_ASSERT (grp_len != grp_len_static); + + tmp = (qse_size_t*) QSE_MMGR_REALLOC ( + matcher->mmgr, grp_len, + QSE_SIZEOF(qse_size_t) * (grp_len_capa + 256) + ); + if (tmp == QSE_NULL) + { + QSE_MMGR_FREE (matcher->mmgr, grp_len); + return QSE_NULL; + 
} + + grp_len = tmp; + grp_len_capa += 256; + } + grp_len[si+1] = grp_len[si] + mat2.match_len; mat2.match_ptr += mat2.match_len; @@ -1763,6 +1790,14 @@ static const qse_byte_t* match_group ( } else { + /* consider the pattern '(abc|def){1,3}(abc)'. + * for the input abcabcabc, + * '(abc|def){1,3}' should match up to the second 'abc'. + * '(abc)' should match the last 'abc'. + * + * backtracking is needed to handle this case. + */ + QSE_ASSERT (cp->ubound > cp->lbound); do @@ -1778,11 +1813,12 @@ static const qse_byte_t* match_group ( QSE_T("match_group: GROUP si=%d [%s]\n"), (unsigned)si, mat->match_ptr); #endif + tmp = match_branch_body (matcher, p, &mat2); if (tmp == QSE_NULL) { if (grp_len != grp_len_static) - QSE_FREE (matcher->mmgr, grp_len); + QSE_MMGR_FREE (matcher->mmgr, grp_len); return QSE_NULL; } @@ -1802,7 +1838,7 @@ static const qse_byte_t* match_group ( } - if (grp_len != grp_len_static) QSE_FREE (matcher->mmgr, grp_len); + if (grp_len != grp_len_static) QSE_MMGR_FREE (matcher->mmgr, grp_len); return p; } @@ -2002,7 +2038,7 @@ static const qse_byte_t* __print_branch (qse_awk_t* awk, const qse_byte_t* p) static const qse_byte_t* __print_atom (qse_awk_t* awk, const qse_byte_t* p) { - const code_t* cp = (const code_t*)p; + const atom_t* cp = (const atom_t*)p; if (cp->cmd == CMD_BOL) { diff --git a/qse/lib/sed/err.c b/qse/lib/sed/err.c index a958cf14..97f970f7 100644 --- a/qse/lib/sed/err.c +++ b/qse/lib/sed/err.c @@ -1,5 +1,5 @@ /* - * $Id: err.c 191 2009-06-07 13:09:14Z hyunghwan.chung $ + * $Id: err.c 207 2009-06-22 13:01:28Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. 
@@ -18,6 +18,45 @@ #include "sed.h" +const qse_char_t* qse_sed_dflerrstr (qse_sed_t* sed, qse_sed_errnum_t errnum) +{ + static const qse_char_t* errstr[] = + { + QSE_T("no error"), + QSE_T("insufficient memory"), + QSE_T("command '${0}' not recognized"), + QSE_T("command code missing"), + QSE_T("command '${0}' incomplete"), + QSE_T("regular expression '${0}' incomplete"), + QSE_T("failed to compile regular expression '${0}'"), + QSE_T("failed to match regular expression"), + QSE_T("address 1 prohibited for '${0}'"), + QSE_T("address 2 prohibited for '${0}'"), + QSE_T("address 2 missing or invalid"), + QSE_T("newline expected"), + QSE_T("backslash expected"), + QSE_T("backslash used as delimiter"), + QSE_T("garbage after backslash"), + QSE_T("semicolon expected"), + QSE_T("empty label name"), + QSE_T("duplicate label name '${0}'"), + QSE_T("label '${0}' not found"), + QSE_T("empty file name"), + QSE_T("illegal file name"), + QSE_T("strings in translation set not the same length"), + QSE_T("group brackets not balanced"), + QSE_T("group nesting too deep"), + QSE_T("multiple occurrence specifiers"), + QSE_T("occurrence specifier zero"), + QSE_T("occurrence specifier too large"), + QSE_T("io error with file '${0}'"), + QSE_T("error returned by user io handler") + }; + + return (errnum >= 0 && errnum < QSE_COUNTOF(errstr))? + errstr[errnum]: QSE_T("unknown error"); +} + qse_sed_errstr_t qse_sed_geterrstr (qse_sed_t* sed) { return sed->errstr; diff --git a/qse/lib/sed/sed.c b/qse/lib/sed/sed.c index 3a5ce89f..3d4abfe1 100644 --- a/qse/lib/sed/sed.c +++ b/qse/lib/sed/sed.c @@ -1,5 +1,5 @@ /* - * $Id: sed.c 203 2009-06-17 12:43:50Z hyunghwan.chung $ + * $Id: sed.c 207 2009-06-22 13:01:28Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. 
@@ -39,45 +39,6 @@ do { \ qse_sed_seterror (sed, num, line, &__qse__err__arg__); \ } while (0) -static const qse_char_t* dflerrstr (qse_sed_t* sed, qse_sed_errnum_t errnum) -{ - static const qse_char_t* errstr[] = - { - QSE_T("no error"), - QSE_T("insufficient memory"), - QSE_T("command '${0}' not recognized"), - QSE_T("command code missing"), - QSE_T("command '${0}' incomplete"), - QSE_T("regular expression '${0}' incomplete"), - QSE_T("failed to compile regular expression '${0}'"), - QSE_T("failed to match regular expression"), - QSE_T("address 1 prohibited for '${0}'"), - QSE_T("address 2 prohibited for '${0}'"), - QSE_T("address 2 missing or invalid"), - QSE_T("newline expected"), - QSE_T("backslash expected"), - QSE_T("backslash used as delimiter"), - QSE_T("garbage after backslash"), - QSE_T("semicolon expected"), - QSE_T("empty label name"), - QSE_T("duplicate label name '${0}'"), - QSE_T("label '${0}' not found"), - QSE_T("empty file name"), - QSE_T("illegal file name"), - QSE_T("strings in translation set not the same length"), - QSE_T("group brackets not balanced"), - QSE_T("group nesting too deep"), - QSE_T("multiple occurrence specifiers"), - QSE_T("occurrence specifier zero"), - QSE_T("occurrence specifier too large"), - QSE_T("io error with file '${0}'"), - QSE_T("error returned by user io handler") - }; - - return (errnum >= 0 && errnum < QSE_COUNTOF(errstr))? 
- errstr[errnum]: QSE_T("unknown error"); -} - qse_sed_t* qse_sed_open (qse_mmgr_t* mmgr, qse_size_t xtn) { qse_sed_t* sed; @@ -114,7 +75,7 @@ static qse_sed_t* qse_sed_init (qse_sed_t* sed, qse_mmgr_t* mmgr) { QSE_MEMSET (sed, 0, QSE_SIZEOF(*sed)); sed->mmgr = mmgr; - sed->errstr = dflerrstr; + sed->errstr = qse_sed_dflerrstr; if (qse_str_init (&sed->tmp.rex, mmgr, 0) == QSE_NULL) { diff --git a/qse/lib/sed/sed.h b/qse/lib/sed/sed.h index fa324d29..5bfa01d1 100644 --- a/qse/lib/sed/sed.h +++ b/qse/lib/sed/sed.h @@ -1,5 +1,5 @@ /* - * $Id: sed.h 191 2009-06-07 13:09:14Z hyunghwan.chung $ + * $Id: sed.h 207 2009-06-22 13:01:28Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. @@ -260,4 +260,13 @@ struct qse_sed_t } e; }; +#ifdef __cplusplus +extern "C" { +#endif + +const qse_char_t* qse_sed_dflerrstr (qse_sed_t* sed, qse_sed_errnum_t errnum); + +#ifdef __cplusplus +} +#endif #endif