From 164b3d9a984a9ce2af0ddc4ff56e80b8d8a053d9 Mon Sep 17 00:00:00 2001 From: hyung-hwan Date: Sat, 16 May 2009 07:31:43 +0000 Subject: [PATCH] fixed a bug in AWK and enhanced sed - awk: fixed a bug of not handling ^ in gsub(), split(), and FS. - sed: added code for y and s command --- qse/include/qse/cmn/rex.h | 78 ++++++++++++++++-- qse/include/qse/utl/sed.h | 1 + qse/lib/awk/awk.h | 6 +- qse/lib/awk/fnc.c | 45 +++++----- qse/lib/awk/misc.c | 68 +++++++-------- qse/lib/awk/misc.h | 17 +++- qse/lib/awk/rec.c | 20 +++-- qse/lib/awk/rio.c | 22 ++--- qse/lib/awk/run.c | 13 ++- qse/lib/cmn/rex.c | 102 +++++++++++++++++++---- qse/lib/utl/sed.c | 169 +++++++++++++++++++++++++++++++++++--- qse/lib/utl/sed.h | 18 ++-- 12 files changed, 433 insertions(+), 126 deletions(-) diff --git a/qse/include/qse/cmn/rex.h b/qse/include/qse/cmn/rex.h index cbbae052..20b56316 100644 --- a/qse/include/qse/cmn/rex.h +++ b/qse/include/qse/cmn/rex.h @@ -1,5 +1,5 @@ /* - * $Id: rex.h 127 2009-05-07 13:15:04Z hyunghwan.chung $ + * $Id: rex.h 135 2009-05-15 13:31:43Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. 
@@ -76,24 +76,84 @@ enum qse_rex_errnum_t QSE_REX_EEND, /* unexpected end of the pattern */ QSE_REX_EGARBAGE /* garbage after the pattern */ }; +typedef enum qse_rex_errnum_t qse_rex_errnum_t; + +typedef struct qse_rex_t qse_rex_t; + +struct qse_rex_t +{ + QSE_DEFINE_COMMON_FIELDS (rex) + qse_rex_errnum_t errnum; + int option; + + struct + { + int build; + int match; + } depth; + + void* code; +}; #ifdef __cplusplus extern "C" { #endif +QSE_DEFINE_COMMON_FUNCTIONS (rex) + +qse_rex_t* qse_rex_open ( + qse_mmgr_t* mmgr, + qse_size_t xtn +); + +void qse_rex_close ( + qse_rex_t* rex +); + +int qse_rex_build ( + qse_rex_t* rex, + const qse_char_t* ptn, + qse_size_t len +); + +int qse_rex_match ( + qse_rex_t* rex, + const qse_char_t* str, + qse_size_t len, + const qse_char_t* substr, + qse_size_t sublen, + qse_cstr_t* match +); + void* qse_buildrex ( - qse_mmgr_t* mmgr, qse_size_t depth, - const qse_char_t* ptn, qse_size_t len, int* errnum); + qse_mmgr_t* mmgr, + qse_size_t depth, + const qse_char_t* ptn, + qse_size_t len, + qse_rex_errnum_t* errnum +); int qse_matchrex ( - qse_mmgr_t* mmgr, qse_size_t depth, - void* code, int option, - const qse_char_t* str, qse_size_t len, - const qse_char_t** match_ptr, qse_size_t* match_len, int* errnum); + qse_mmgr_t* mmgr, + qse_size_t depth, + void* code, + int option, + const qse_char_t* str, + qse_size_t len, + const qse_char_t* substr, + qse_size_t sublen, + qse_cstr_t* match, + qse_rex_errnum_t* errnum +); -void qse_freerex (qse_mmgr_t* mmgr, void* code); +void qse_freerex ( + qse_mmgr_t* mmgr, + void* code +); -qse_bool_t qse_isemptyrex (void* code); +qse_bool_t qse_isemptyrex ( + void* code +); #if 0 void qse_dprintrex (qse_rex_t* rex, void* rex); diff --git a/qse/include/qse/utl/sed.h b/qse/include/qse/utl/sed.h index 5b8bbefb..90234714 100644 --- a/qse/include/qse/utl/sed.h +++ b/qse/include/qse/utl/sed.h @@ -198,6 +198,7 @@ struct qse_sed_t { qse_lda_t appended; qse_str_t held; + qse_str_t subst; } text; }; diff --git 
a/qse/lib/awk/awk.h b/qse/lib/awk/awk.h index 7294ea8c..98096161 100644 --- a/qse/lib/awk/awk.h +++ b/qse/lib/awk/awk.h @@ -1,5 +1,5 @@ /* - * $Id: awk.h 127 2009-05-07 13:15:04Z hyunghwan.chung $ + * $Id: awk.h 135 2009-05-15 13:31:43Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. @@ -379,7 +379,7 @@ struct qse_awk_rtx_t #define QSE_AWK_ISEMPTYREX(awk,code) qse_isemptyrex(code) #define QSE_AWK_BUILDREX(awk,ptn,len,errnum) \ qse_awk_buildrex(awk,ptn,len,errnum) -#define QSE_AWK_MATCHREX(awk,code,option,str,len,match_ptr,match_len,errnum) \ - qse_awk_matchrex(awk,code,option,str,len,match_ptr,match_len,errnum) +#define QSE_AWK_MATCHREX(awk,code,option,str,len,substr,sublen,match,errnum) \ + qse_awk_matchrex(awk,code,option,str,len,substr,sublen,match,errnum) #endif diff --git a/qse/lib/awk/fnc.c b/qse/lib/awk/fnc.c index 1cb84564..dd4452f4 100644 --- a/qse/lib/awk/fnc.c +++ b/qse/lib/awk/fnc.c @@ -1,5 +1,5 @@ /* - * $Id: fnc.c 90 2009-03-01 09:58:19Z hyunghwan.chung $ + * $Id: fnc.c 135 2009-05-15 13:31:43Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. 
@@ -583,7 +583,7 @@ static int fnc_split ( qse_size_t nargs; qse_awk_val_t* a0, * a1, * a2, * t1, * t2, ** a1_ref; qse_char_t* str, * str_free, * p, * tok; - qse_size_t str_len, str_left, tok_len; + qse_size_t str_len, str_left, tok_len, org_len; qse_long_t num; qse_char_t key[QSE_SIZEOF(qse_long_t)*8+2]; qse_size_t key_len; @@ -719,7 +719,6 @@ static int fnc_split ( QSE_AWK_FREE (run->awk, fs_free); if (fs_rex_free != QSE_NULL) QSE_AWK_FREEREX (run->awk, fs_rex_free); - /*qse_awk_rtx_seterrnum (run, QSE_AWK_ENOMEM);*/ return -1; } @@ -727,7 +726,7 @@ static int fnc_split ( *a1_ref = t1; qse_awk_rtx_refupval (run, *a1_ref); - p = str; str_left = str_len; + p = str; str_left = str_len; org_len = str_len; num = 1; while (p != QSE_NULL) @@ -739,8 +738,10 @@ static int fnc_split ( } else { - p = qse_awk_rtx_strxntokbyrex (run, p, str_len, - fs_rex, &tok, &tok_len, &errnum); + p = qse_awk_rtx_strxntokbyrex ( + run, str, org_len, p, str_len, + fs_rex, &tok, &tok_len, &errnum + ); if (p == QSE_NULL && errnum != QSE_AWK_ENOERR) { if (str_free != QSE_NULL) @@ -919,8 +920,9 @@ static int __substitute (qse_awk_rtx_t* run, qse_long_t max_count) qse_char_t* a2_ptr_free = QSE_NULL; void* rex = QSE_NULL; int opt, n; - const qse_char_t* cur_ptr, * mat_ptr; - qse_size_t cur_len, mat_len, i, m; + qse_cstr_t mat; + const qse_char_t* cur_ptr; + qse_size_t cur_len, i, m; qse_str_t new; qse_long_t sub_count; @@ -1064,8 +1066,10 @@ static int __substitute (qse_awk_rtx_t* run, qse_long_t max_count) if (max_count == 0 || sub_count < max_count) { n = QSE_AWK_MATCHREX ( - run->awk, rex, opt, cur_ptr, cur_len, - &mat_ptr, &mat_len, &run->errnum); + run->awk, rex, opt, + a2_ptr, a2_len, + cur_ptr, cur_len, + &mat, &run->errnum); } else n = 0; @@ -1092,7 +1096,7 @@ static int __substitute (qse_awk_rtx_t* run, qse_long_t max_count) } if (qse_str_ncat ( - &new, cur_ptr, mat_ptr - cur_ptr) == (qse_size_t)-1) + &new, cur_ptr, mat.ptr - cur_ptr) == (qse_size_t)-1) { FREE_A0_REX (run->awk, rex); 
qse_str_fini (&new); @@ -1111,7 +1115,7 @@ static int __substitute (qse_awk_rtx_t* run, qse_long_t max_count) } else if (a1_ptr[i] == QSE_T('&')) { - m = qse_str_ncat (&new, mat_ptr, mat_len); + m = qse_str_ncat (&new, mat.ptr, mat.len); } else { @@ -1128,8 +1132,8 @@ static int __substitute (qse_awk_rtx_t* run, qse_long_t max_count) } sub_count++; - cur_len = cur_len - ((mat_ptr - cur_ptr) + mat_len); - cur_ptr = mat_ptr + mat_len; + cur_len = cur_len - ((mat.ptr - cur_ptr) + mat.len); + cur_ptr = mat.ptr + mat.len; } FREE_A0_REX (run->awk, rex); @@ -1218,8 +1222,7 @@ static int fnc_match ( qse_long_t idx; void* rex; int opt, n; - const qse_char_t* mat_ptr; - qse_size_t mat_len; + qse_cstr_t mat; nargs = qse_awk_rtx_getnargs (run); QSE_ASSERT (nargs == 2); @@ -1273,15 +1276,17 @@ static int fnc_match ( opt = (run->gbl.ignorecase)? QSE_REX_IGNORECASE: 0; n = QSE_AWK_MATCHREX ( - run->awk, rex, opt, str0, len0, - &mat_ptr, &mat_len, &run->errnum); + run->awk, rex, opt, + str0, len0, str0, len0, + &mat, &run->errnum + ); if (a0->type != QSE_AWK_VAL_STR) QSE_AWK_FREE (run->awk, str0); if (a1->type != QSE_AWK_VAL_REX) QSE_AWK_FREEREX (run->awk, rex); if (n == -1) return -1; - idx = (n == 0)? 0: ((qse_long_t)(mat_ptr-str0) + 1); + idx = (n == 0)? 0: ((qse_long_t)(mat.ptr-str0) + 1); a0 = qse_awk_rtx_makeintval (run, idx); if (a0 == QSE_NULL) @@ -1293,7 +1298,7 @@ static int fnc_match ( qse_awk_rtx_refupval (run, a0); a1 = qse_awk_rtx_makeintval (run, - ((n == 0)? (qse_long_t)-1: (qse_long_t)mat_len)); + ((n == 0)? (qse_long_t)-1: (qse_long_t)mat.len)); if (a1 == QSE_NULL) { qse_awk_rtx_refdownval (run, a0); diff --git a/qse/lib/awk/misc.c b/qse/lib/awk/misc.c index eefa781e..505b3ee6 100644 --- a/qse/lib/awk/misc.c +++ b/qse/lib/awk/misc.c @@ -1,5 +1,5 @@ /* - * $Id: misc.c 127 2009-05-07 13:15:04Z hyunghwan.chung $ + * $Id: misc.c 135 2009-05-15 13:31:43Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. 
@@ -831,24 +831,24 @@ exit_loop: } qse_char_t* qse_awk_rtx_strxntokbyrex ( - qse_awk_rtx_t* rtx, const qse_char_t* s, qse_size_t len, + qse_awk_rtx_t* rtx, + const qse_char_t* str, qse_size_t len, + const qse_char_t* substr, qse_size_t sublen, void* rex, qse_char_t** tok, qse_size_t* tok_len, int* errnum) { int n; - qse_char_t* match_ptr; - qse_size_t match_len, i; - qse_size_t left = len; - const qse_char_t* ptr = s; - const qse_char_t* str_ptr = s; - qse_size_t str_len = len; + qse_size_t i, left = sublen; + const qse_char_t* ptr = substr; + const qse_char_t* str_ptr = substr; + qse_size_t str_len = sublen; + qse_cstr_t match; - while (len > 0) + while (sublen > 0) { n = QSE_AWK_MATCHREX ( rtx->awk, rex, ((rtx->gbl.ignorecase)? QSE_REX_IGNORECASE: 0), - ptr, left, (const qse_char_t**)&match_ptr, &match_len, - errnum); + str, len, ptr, left, &match, errnum); if (n == -1) return QSE_NULL; if (n == 0) { @@ -862,7 +862,7 @@ qse_char_t* qse_awk_rtx_strxntokbyrex ( QSE_ASSERT (n == 1); - if (match_len == 0) + if (match.len == 0) { ptr++; left--; @@ -870,28 +870,28 @@ qse_char_t* qse_awk_rtx_strxntokbyrex ( else if (rtx->awk->option & QSE_AWK_STRIPSPACES) { /* match at the beginning of the input string */ - if (match_ptr == s) + if (match.ptr == substr) { - for (i = 0; i < match_len; i++) + for (i = 0; i < match.len; i++) { - if (!QSE_AWK_ISSPACE(rtx->awk, match_ptr[i])) + if (!QSE_AWK_ISSPACE(rtx->awk, match.ptr[i])) goto exit_loop; } /* the match that are all spaces at the * beginning of the input string is skipped */ - ptr += match_len; - left -= match_len; - str_ptr = s + match_len; - str_len -= match_len; + ptr += match.len; + left -= match.len; + str_ptr = substr + match.len; + str_len -= match.len; } - else break; + else break; } else break; } exit_loop: - if (len == 0) + if (sublen == 0) { *tok = (qse_char_t*)str_ptr; *tok_len = str_len; @@ -900,14 +900,14 @@ exit_loop: } *tok = (qse_char_t*)str_ptr; - *tok_len = match_ptr - str_ptr; + *tok_len = match.ptr - 
str_ptr; - for (i = 0; i < match_len; i++) + for (i = 0; i < match.len; i++) { - if (!QSE_AWK_ISSPACE(rtx->awk, match_ptr[i])) + if (!QSE_AWK_ISSPACE(rtx->awk, match.ptr[i])) { *errnum = QSE_AWK_ENOERR; - return match_ptr+match_len; + return (qse_char_t*)match.ptr+match.len; } } @@ -915,13 +915,13 @@ exit_loop: if (rtx->awk->option & QSE_AWK_STRIPSPACES) { - return (match_ptr+match_len >= s+len)? - QSE_NULL: (match_ptr+match_len); + return (match.ptr+match.len >= substr+sublen)? + QSE_NULL: ((qse_char_t*)match.ptr+match.len); } else { - return (match_ptr+match_len > s+len)? - QSE_NULL: (match_ptr+match_len); + return (match.ptr+match.len > substr+sublen)? + QSE_NULL: ((qse_char_t*)match.ptr+match.len); } } @@ -944,7 +944,7 @@ exit_loop: void* qse_awk_buildrex ( qse_awk_t* awk, const qse_char_t* ptn, qse_size_t len, int* errnum) { - int err; + qse_rex_errnum_t err; void* p; p = qse_buildrex ( @@ -956,13 +956,15 @@ void* qse_awk_buildrex ( int qse_awk_matchrex ( qse_awk_t* awk, void* code, int option, const qse_char_t* str, qse_size_t len, - const qse_char_t** match_ptr, qse_size_t* match_len, int* errnum) + const qse_char_t* substr, qse_size_t sublen, + qse_cstr_t* match, int* errnum) { - int err, x; + int x; + qse_rex_errnum_t err; x = qse_matchrex ( awk->mmgr, awk->rex.depth.max.match, - code, option, str, len, match_ptr, match_len, &err); + code, option, str, len, substr, sublen, match, &err); if (x < 0) *errnum = QSE_AWK_REXERRTOERR(err); return x; } diff --git a/qse/lib/awk/misc.h b/qse/lib/awk/misc.h index 587429fd..d8bf5f3b 100644 --- a/qse/lib/awk/misc.h +++ b/qse/lib/awk/misc.h @@ -1,5 +1,5 @@ /* - * $Id: misc.h 75 2009-02-22 14:10:34Z hyunghwan.chung $ + * $Id: misc.h 135 2009-05-15 13:31:43Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. 
@@ -42,8 +42,16 @@ qse_char_t* qse_awk_rtx_strxntok ( qse_char_t** tok, qse_size_t* tok_len); qse_char_t* qse_awk_rtx_strxntokbyrex ( - qse_awk_rtx_t* rtx, const qse_char_t* s, qse_size_t len, - void* rex, qse_char_t** tok, qse_size_t* tok_len, int* errnum); + qse_awk_rtx_t* rtx, + const qse_char_t* str, + qse_size_t len, + const qse_char_t* substr, + qse_size_t sublen, + void* rex, + qse_char_t** tok, + qse_size_t* tok_len, + int* errnum +); void* qse_awk_buildrex ( @@ -52,7 +60,8 @@ void* qse_awk_buildrex ( int qse_awk_matchrex ( qse_awk_t* awk, void* code, int option, const qse_char_t* str, qse_size_t len, - const qse_char_t** match_ptr, qse_size_t* match_len, int* errnum); + const qse_char_t* substr, qse_size_t sublen, + qse_cstr_t* match, int* errnum); #ifdef __cplusplus } diff --git a/qse/lib/awk/rec.c b/qse/lib/awk/rec.c index 68ef57e5..dbc8561c 100644 --- a/qse/lib/awk/rec.c +++ b/qse/lib/awk/rec.c @@ -1,5 +1,5 @@ /* - * $Id: rec.c 89 2009-02-28 15:27:03Z hyunghwan.chung $ + * $Id: rec.c 135 2009-05-15 13:31:43Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. 
@@ -146,8 +146,13 @@ static int split_record (qse_awk_rtx_t* run) } else { - p = qse_awk_rtx_strxntokbyrex (run, p, len, - run->gbl.fs, &tok, &tok_len, &errnum); + p = qse_awk_rtx_strxntokbyrex ( + run, + QSE_STR_PTR(&run->inrec.line), + QSE_STR_LEN(&run->inrec.line), + p, len, + run->gbl.fs, &tok, &tok_len, &errnum + ); if (p == QSE_NULL && errnum != QSE_AWK_ENOERR) { if (fs_free != QSE_NULL) @@ -203,8 +208,13 @@ static int split_record (qse_awk_rtx_t* run) } else { - p = qse_awk_rtx_strxntokbyrex (run, p, len, - run->gbl.fs, &tok, &tok_len, &errnum); + p = qse_awk_rtx_strxntokbyrex ( + run, + QSE_STR_PTR(&run->inrec.line), + QSE_STR_LEN(&run->inrec.line), + p, len, + run->gbl.fs, &tok, &tok_len, &errnum + ); if (p == QSE_NULL && errnum != QSE_AWK_ENOERR) { if (fs_free != QSE_NULL) diff --git a/qse/lib/awk/rio.c b/qse/lib/awk/rio.c index 611b0332..762534ce 100644 --- a/qse/lib/awk/rio.c +++ b/qse/lib/awk/rio.c @@ -1,5 +1,5 @@ /* - * $Id: rio.c 90 2009-03-01 09:58:19Z hyunghwan.chung $ + * $Id: rio.c 135 2009-05-15 13:31:43Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. @@ -269,8 +269,7 @@ int qse_awk_rtx_readio ( * the buffer has been appened with the last character * after the previous matchrex has failed */ - const qse_char_t* match_ptr; - qse_size_t match_len; + qse_cstr_t match; QSE_ASSERT (run->gbl.rs != QSE_NULL); @@ -278,7 +277,8 @@ int qse_awk_rtx_readio ( run->awk, run->gbl.rs, ((run->gbl.ignorecase)? 
QSE_REX_IGNORECASE: 0), QSE_STR_PTR(buf), QSE_STR_LEN(buf), - &match_ptr, &match_len, &run->errnum); + QSE_STR_PTR(buf), QSE_STR_LEN(buf), + &match, &run->errnum); if (n == -1) { ret = -1; @@ -291,9 +291,9 @@ int qse_awk_rtx_readio ( * the current buffer */ QSE_ASSERT ( QSE_STR_PTR(buf) + QSE_STR_LEN(buf) == - match_ptr + match_len); + match.ptr + match.len); - QSE_STR_LEN(buf) -= match_len; + QSE_STR_LEN(buf) -= match.len; break; } } @@ -357,8 +357,7 @@ int qse_awk_rtx_readio ( } else { - const qse_char_t* match_ptr; - qse_size_t match_len; + qse_cstr_t match; QSE_ASSERT (run->gbl.rs != QSE_NULL); @@ -366,7 +365,8 @@ int qse_awk_rtx_readio ( run->awk, run->gbl.rs, ((run->gbl.ignorecase)? QSE_REX_IGNORECASE: 0), QSE_STR_PTR(buf), QSE_STR_LEN(buf), - &match_ptr, &match_len, &run->errnum); + QSE_STR_PTR(buf), QSE_STR_LEN(buf), + &match, &run->errnum); if (n == -1) { ret = -1; @@ -380,9 +380,9 @@ int qse_awk_rtx_readio ( * the current buffer */ QSE_ASSERT ( QSE_STR_PTR(buf) + QSE_STR_LEN(buf) == - match_ptr + match_len); + match.ptr + match.len); - QSE_STR_LEN(buf) -= match_len; + QSE_STR_LEN(buf) -= match.len; p->in.pos--; /* unread the character in c */ break; } diff --git a/qse/lib/awk/run.c b/qse/lib/awk/run.c index a6ef333b..642553b8 100644 --- a/qse/lib/awk/run.c +++ b/qse/lib/awk/run.c @@ -1,5 +1,5 @@ /* - * $Id: run.c 127 2009-05-07 13:15:04Z hyunghwan.chung $ + * $Id: run.c 135 2009-05-15 13:31:43Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. @@ -3099,7 +3099,9 @@ static qse_awk_val_t* eval_expression (qse_awk_rtx_t* run, qse_awk_nde_t* nde) ((((qse_awk_rtx_t*)run)->gbl.ignorecase)? QSE_REX_IGNORECASE: 0), ((qse_awk_val_str_t*)run->inrec.d0)->ptr, ((qse_awk_val_str_t*)run->inrec.d0)->len, - QSE_NULL, QSE_NULL, &errnum); + ((qse_awk_val_str_t*)run->inrec.d0)->ptr, + ((qse_awk_val_str_t*)run->inrec.d0)->len, + QSE_NULL, &errnum); if (n == -1) { @@ -4773,7 +4775,9 @@ static qse_awk_val_t* eval_binop_match0 ( ((run->gbl.ignorecase)? 
QSE_REX_IGNORECASE: 0), ((qse_awk_val_str_t*)left)->ptr, ((qse_awk_val_str_t*)left)->len, - QSE_NULL, QSE_NULL, &errnum); + ((qse_awk_val_str_t*)left)->ptr, + ((qse_awk_val_str_t*)left)->len, + QSE_NULL, &errnum); if (n == -1) { if (right->type != QSE_AWK_VAL_REX) @@ -4810,7 +4814,8 @@ static qse_awk_val_t* eval_binop_match0 ( run->awk, rex_code, ((run->gbl.ignorecase)? QSE_REX_IGNORECASE: 0), out.u.cpldup.ptr, out.u.cpldup.len, - QSE_NULL, QSE_NULL, &errnum); + out.u.cpldup.ptr, out.u.cpldup.len, + QSE_NULL, &errnum); if (n == -1) { QSE_AWK_FREE (run->awk, out.u.cpldup.ptr); diff --git a/qse/lib/cmn/rex.c b/qse/lib/cmn/rex.c index 2d8e1361..2a00ec5f 100644 --- a/qse/lib/cmn/rex.c +++ b/qse/lib/cmn/rex.c @@ -1,5 +1,5 @@ /* - * $Id: rex.c 127 2009-05-07 13:15:04Z hyunghwan.chung $ + * $Id: rex.c 135 2009-05-15 13:31:43Z hyunghwan.chung $ * Copyright 2006-2009 Chung, Hyung-Hwan. @@ -107,7 +107,7 @@ struct builder_t qse_size_t cur; } depth; - int errnum; + qse_rex_errnum_t errnum; }; struct matcher_t @@ -121,6 +121,12 @@ struct matcher_t const qse_char_t* ptr; const qse_char_t* end; } str; + + struct + { + const qse_char_t* ptr; + const qse_char_t* end; + } realstr; } match; struct @@ -130,7 +136,7 @@ struct matcher_t } depth; int ignorecase; - int errnum; + qse_rex_errnum_t errnum; }; struct match_t @@ -331,9 +337,63 @@ static struct __char_class_t __char_class[] = { QSE_NULL, 0, QSE_NULL } }; +qse_rex_t* qse_rex_open (qse_mmgr_t* mmgr, qse_size_t xtn) +{ + qse_rex_t* rex; + + if (mmgr == QSE_NULL) + { + mmgr = QSE_MMGR_GETDFL(); + + QSE_ASSERTX (mmgr != QSE_NULL, + "Set the memory manager with QSE_MMGR_SETDFL()"); + + if (mmgr == QSE_NULL) return QSE_NULL; + } + + rex = (qse_rex_t*) QSE_MMGR_ALLOC (mmgr, QSE_SIZEOF(qse_rex_t) + xtn); + if (rex == QSE_NULL) return QSE_NULL; + + QSE_MEMSET (rex, 0, QSE_SIZEOF(*rex)); + rex->mmgr = mmgr; + + return rex; +} + +void qse_rex_close (qse_rex_t* rex) +{ + if (rex->code != QSE_NULL) qse_freerex (rex->mmgr, rex->code); + 
QSE_MMGR_FREE (rex->mmgr, rex); +} + +int qse_rex_build (qse_rex_t* rex, const qse_char_t* ptn, qse_size_t len) +{ + void* code; + + code = qse_buildrex ( + rex->mmgr, rex->depth.build, + ptn, len, &rex->errnum); + if (code == QSE_NULL) return -1; + + if (rex->code != QSE_NULL) qse_freerex (rex->mmgr, rex->code); + rex->code = code; + + return 0; +} + +int qse_rex_match ( + qse_rex_t* rex, + const qse_char_t* str, qse_size_t len, + const qse_char_t* substr, qse_size_t sublen, qse_cstr_t* match) +{ + return qse_matchrex ( + rex->mmgr, rex->depth.match, rex->code, rex->option, + str, len, substr, sublen, match, &rex->errnum); +} + void* qse_buildrex ( qse_mmgr_t* mmgr, qse_size_t depth, - const qse_char_t* ptn, qse_size_t len, int* errnum) + const qse_char_t* ptn, qse_size_t len, qse_rex_errnum_t* errnum) { builder_t builder; @@ -399,7 +459,8 @@ int qse_matchrex ( qse_mmgr_t* mmgr, qse_size_t depth, void* code, int option, const qse_char_t* str, qse_size_t len, - const qse_char_t** match_ptr, qse_size_t* match_len, int* errnum) + const qse_char_t* substr, qse_size_t sublen, + qse_cstr_t* match, qse_rex_errnum_t* errnum) { matcher_t matcher; match_t mat; @@ -409,8 +470,11 @@ int qse_matchrex ( matcher.mmgr = mmgr; /* store the source string */ - matcher.match.str.ptr = str; - matcher.match.str.end = str + len; + matcher.match.str.ptr = substr; + matcher.match.str.end = substr + sublen; + + matcher.match.realstr.ptr = str; + matcher.match.realstr.end = str + len; matcher.depth.max = depth; matcher.depth.cur = 0; @@ -418,7 +482,7 @@ int qse_matchrex ( mat.matched = QSE_FALSE; /* TODO: should it allow an offset here??? 
*/ - mat.match_ptr = str + offset; + mat.match_ptr = substr + offset; /*while (mat.match_ptr < matcher.match.str.end)*/ while (mat.match_ptr <= matcher.match.str.end) @@ -441,8 +505,11 @@ int qse_matchrex ( } */ - if (match_ptr != QSE_NULL) *match_ptr = mat.match_ptr; - if (match_len != QSE_NULL) *match_len = mat.match_len; + if (match != QSE_NULL) + { + match->ptr = mat.match_ptr; + match->len = mat.match_len; + } /*match_ptr_zero = QSE_NULL;*/ break; @@ -454,8 +521,11 @@ int qse_matchrex ( /* if (match_ptr_zero != QSE_NULL) { - if (match_ptr != QSE_NULL) *match_ptr = match_ptr_zero; - if (match_len != QSE_NULL) *match_len = 0; + if (match != QSE_NULL) + { + match->ptr = match_ptr_zero; + match->len = 0; + } return 1; } */ @@ -1349,7 +1419,9 @@ static const qse_byte_t* match_bol ( cp = (const code_t*)p; p += QSE_SIZEOF(*cp); QSE_ASSERT (cp->cmd == CMD_BOL); - mat->matched = (mat->match_ptr == matcher->match.str.ptr || + /*mat->matched = (mat->match_ptr == matcher->match.str.ptr || + (cp->lbound == cp->ubound && cp->lbound == 0));*/ + mat->matched = (mat->match_ptr == matcher->match.realstr.ptr || (cp->lbound == cp->ubound && cp->lbound == 0)); mat->match_len = 0; @@ -1365,7 +1437,9 @@ static const qse_byte_t* match_eol ( cp = (const code_t*)p; p += QSE_SIZEOF(*cp); QSE_ASSERT (cp->cmd == CMD_EOL); - mat->matched = (mat->match_ptr == matcher->match.str.end || + /*mat->matched = (mat->match_ptr == matcher->match.str.end || + (cp->lbound == cp->ubound && cp->lbound == 0));*/ + mat->matched = (mat->match_ptr == matcher->match.realstr.end || (cp->lbound == cp->ubound && cp->lbound == 0)); mat->match_len = 0; diff --git a/qse/lib/utl/sed.c b/qse/lib/utl/sed.c index dc979c4f..f7e60a90 100644 --- a/qse/lib/utl/sed.c +++ b/qse/lib/utl/sed.c @@ -110,11 +110,22 @@ qse_sed_t* qse_sed_init (qse_sed_t* sed, qse_mmgr_t* mmgr) } + if (qse_str_init (&sed->text.subst, mmgr, 256) == QSE_NULL) + { + qse_str_fini (&sed->text.held); + qse_lda_fini (&sed->text.appended); + QSE_MMGR_FREE 
(sed->mmgr, sed->cmd.buf); + qse_map_fini (&sed->labs); + qse_str_fini (&sed->rexbuf); + return QSE_NULL; + } + return sed; } void qse_sed_fini (qse_sed_t* sed) { + qse_str_fini (&sed->text.subst); qse_str_fini (&sed->text.held); qse_lda_fini (&sed->text.appended); @@ -232,7 +243,7 @@ static void free_command (qse_sed_t* sed, qse_sed_cmd_t* cmd) QSE_MMGR_FREE (sed->mmgr, cmd->u.branch.label.ptr); break; - case QSE_SED_CMD_S: + case QSE_SED_CMD_SUBSTITUTE: if (cmd->u.subst.file.ptr != QSE_NULL) QSE_MMGR_FREE (sed->mmgr, cmd->u.subst.file.ptr); if (cmd->u.subst.rpl.ptr != QSE_NULL) @@ -241,7 +252,7 @@ static void free_command (qse_sed_t* sed, qse_sed_cmd_t* cmd) qse_freerex (sed->mmgr, cmd->u.subst.rex); break; - case QSE_SED_CMD_Y: + case QSE_SED_CMD_TRANSLATE: if (cmd->u.transet.ptr != QSE_NULL) QSE_MMGR_FREE (sed->mmgr, cmd->u.transet.ptr); break; @@ -1691,6 +1702,110 @@ static int write_str_to_file ( return 0; } +static int do_subst (qse_sed_t* sed, qse_sed_cmd_t* cmd) +{ + qse_cstr_t mat; + int opt = 0; + qse_rex_errnum_t errnum; + const qse_char_t* cur_ptr, * str_ptr; + qse_size_t cur_len, str_len, m, i; + qse_size_t max_count, sub_count; + + QSE_ASSERT (cmd->type == QSE_SED_CMD_SUBSTITUTE); + + qse_str_clear (&sed->text.subst); + if (cmd->u.subst.i) opt = QSE_REX_IGNORECASE; + + str_ptr = QSE_STR_PTR(&sed->eio.in.line); + str_len = QSE_STR_LEN(&sed->eio.in.line); + + /* TODO: support different line end scheme */ + if (str_len > 0 && str_ptr[str_len-1] == QSE_T('\n')) str_len--; + + cur_ptr = str_ptr; + cur_len = str_len; + + sub_count = 0; + max_count = (cmd->u.subst.g)? 0: cmd->u.subst.occ; + + while (1) + { + int n; + + if (max_count == 0 || sub_count < max_count) + { + /* TODO: maximum match depth... 
*/ + n = qse_matchrex ( + sed->mmgr, 0, cmd->u.subst.rex, opt, + str_ptr, str_len, + cur_ptr, cur_len, + &mat, &errnum + ); + } + else n = 0; + + if (n == -1) + { + sed->errnum = QSE_SED_EREXMA; + return -1; + } + + if (n == 0) + { + /* no more match found */ + if (qse_str_ncat ( + &sed->text.subst, + cur_ptr, cur_len) == (qse_size_t)-1) + { + sed->errnum = QSE_SED_EREXMA; + return -1; + } + break; + } + + m = qse_str_ncat (&sed->text.subst, cur_ptr, mat.ptr-cur_ptr); + if (m == (qse_size_t)-1) + { + sed->errnum = QSE_SED_EREXMA; + return -1; + } + + for (i = 0; i < cmd->u.subst.rpl.len; i++) + { + if ((i+1) < cmd->u.subst.rpl.len && + cmd->u.subst.rpl.ptr[i] == QSE_T('\\') && + cmd->u.subst.rpl.ptr[i+1] == QSE_T('&')) + { + m = qse_str_ccat (&sed->text.subst, QSE_T('&')); + i++; + } + else if (cmd->u.subst.rpl.ptr[i] == QSE_T('&')) + { + m = qse_str_ncat ( + &sed->text.subst, mat.ptr, mat.len); + } + else + { + m = qse_str_ccat ( + &sed->text.subst, cmd->u.subst.rpl.ptr[i]); + } + + if (m == (qse_size_t)-1) + { + sed->errnum = QSE_SED_EREXMA; + return -1; + } + } + + sub_count++; + cur_len = cur_len - ((mat.ptr - cur_ptr) + mat.len); + cur_ptr = mat.ptr + mat.len; + } + + qse_str_swap (&sed->eio.in.line, &sed->text.subst); + return 0; +} + static int match_a (qse_sed_t* sed, qse_sed_a_t* a) { switch (a->type) @@ -1700,10 +1815,11 @@ static int match_a (qse_sed_t* sed, qse_sed_a_t* a) case QSE_SED_A_REX: { - qse_str_t match; - int errnum, n; + int n; + qse_cstr_t match; qse_str_t* line; qse_size_t llen; + qse_rex_errnum_t errnum; QSE_ASSERT (a->u.rex != QSE_NULL); @@ -1715,13 +1831,10 @@ static int match_a (qse_sed_t* sed, qse_sed_a_t* a) QSE_STR_CHAR(line,llen-1) == QSE_T('\n')) llen--; n = qse_matchrex ( - sed->mmgr, - 0, - a->u.rex, - 0, - QSE_STR_PTR(line), - llen, - &match.ptr, &match.len, &errnum); + sed->mmgr, 0, a->u.rex, 0, + QSE_STR_PTR(line), llen, + QSE_STR_PTR(line), llen, + &match, &errnum); if (n <= -1) { sed->errnum = QSE_SED_EREXMA; @@ -2047,8 
+2160,7 @@ static qse_sed_cmd_t* exec_cmd (qse_sed_t* sed, qse_sed_cmd_t* cmd) case QSE_SED_CMD_WRITE_FILELN: { const qse_char_t* ptr = QSE_STR_PTR(&sed->eio.in.line); - const qse_char_t* len = QSE_STR_LEN(&sed->eio.in.line); - qse_size_t i; + qse_size_t i, len = QSE_STR_LEN(&sed->eio.in.line); for (i = 0; i < len; i++) { /* TODO: handle different line end scheme */ @@ -2089,6 +2201,37 @@ static qse_sed_cmd_t* exec_cmd (qse_sed_t* sed, qse_sed_cmd_t* cmd) jumpto = cmd->u.branch.target; break; + + case QSE_SED_CMD_TRANSLATE: + { + qse_char_t* ptr = QSE_STR_PTR(&sed->eio.in.line); + qse_size_t i, len = QSE_STR_LEN(&sed->eio.in.line); + + /* TODO: sort cmd->u.transet and do binary search + * when sorted, you can, before the binary search, check if ptr[i] < transet[0] || ptr[i] > transet[transet_size-1]. if so, it has no matching translation */ + /* TODO: support different line end scheme */ + if (len > 0 && ptr[len-1] == QSE_T('\n')) len--; + + for (i = 0; i < len; i++) + { + const qse_char_t* tptr = cmd->u.transet.ptr; + qse_size_t j, tlen = cmd->u.transet.len; + for (j = 0; j < tlen; j += 2) + { + if (ptr[i] == tptr[j]) + { + ptr[i] = tptr[j+1]; + break; + } + } + } + break; + } + + case QSE_SED_CMD_SUBSTITUTE: + n = do_subst (sed, cmd); + if (n <= -1) return QSE_NULL; + break; } if (jumpto == NULL) jumpto = cmd + 1; diff --git a/qse/lib/utl/sed.h b/qse/lib/utl/sed.h index 169eabcf..69ba6484 100644 --- a/qse/lib/utl/sed.h +++ b/qse/lib/utl/sed.h @@ -77,18 +77,16 @@ struct qse_sed_cmd_t QSE_SED_CMD_NEXT_APPEND = QSE_T('N'), /* branch */ - QSE_SED_CMD_BRANCH = QSE_T('b'), - QSE_SED_CMD_T = QSE_T('t'), + QSE_SED_CMD_BRANCH = QSE_T('b'), + QSE_SED_CMD_T = QSE_T('t'), - QSE_SED_CMD_READ_FILE = QSE_T('r'), - QSE_SED_CMD_READ_FILELN = QSE_T('R'), - QSE_SED_CMD_WRITE_FILE = QSE_T('w'), - QSE_SED_CMD_WRITE_FILELN = 
QSE_T('W'), - /* s/regex/str/ - replace matching pattern with a new string */ - QSE_SED_CMD_S = QSE_T('s'), - /* y/s/d/ - translate characters in s to characters in d */ - QSE_SED_CMD_Y = QSE_T('y') + QSE_SED_CMD_SUBSTITUTE = QSE_T('s'), + QSE_SED_CMD_TRANSLATE = QSE_T('y') } type;