diff --git a/qse/lib/cmn/tre-match-utils.h b/qse/lib/cmn/tre-match-utils.h index fe3c7f74..d87d5175 100644 --- a/qse/lib/cmn/tre-match-utils.h +++ b/qse/lib/cmn/tre-match-utils.h @@ -129,31 +129,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* Wide character support, no multibyte support. */ #define GET_NEXT_WCHAR() \ - do { \ - prev_c = next_c; \ - if (type == STR_BYTE) \ - { \ - pos++; \ - if (len >= 0 && pos >= len) \ - next_c = '\0'; \ - else \ - next_c = (unsigned char)(*str_byte++); \ +do { \ + prev_c = next_c; \ + if (type == STR_BYTE) \ + { \ + pos++; \ + if (len >= 0 && pos >= len) next_c = QSE_MT('\0'); \ + else next_c = (unsigned char)(*str_byte++); \ } \ - else if (type == STR_WIDE) \ - { \ - pos++; \ - if (len >= 0 && pos >= len) \ - next_c = QSE_T('\0'); \ - else \ - next_c = *str_wide++; \ + else if (type == STR_WIDE) \ + { \ + pos++; \ + if (len >= 0 && pos >= len) next_c = QSE_T('\0'); \ + else next_c = *str_wide++; \ } \ - else if (type == STR_USER) \ - { \ - pos += pos_add_next; \ - str_user_end = str_source->get_next_char(&next_c, &pos_add_next, \ - str_source->context); \ - } \ - } while(/*CONSTCOND*/0) + else if (type == STR_USER) \ + { \ + pos += pos_add_next; \ + str_user_end = str_source->get_next_char(&next_c, &pos_add_next, str_source->context); \ + } \ +} while(/*CONSTCOND*/0) #endif /* !TRE_MULTIBYTE */ diff --git a/qse/lib/cmn/tre-parse.c b/qse/lib/cmn/tre-parse.c index dcfd0148..a571ddf7 100644 --- a/qse/lib/cmn/tre-parse.c +++ b/qse/lib/cmn/tre-parse.c @@ -280,40 +280,40 @@ tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate, else if (re + 1 < ctx->re_end && *re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON) { -#if 0 - char tmp_str[64]; -#endif const tre_char_t *endptr = re + 2; int len; DPRINT(("tre_parse_bracket: class: '%.*" STRF "'\n", REST(re))); - while (endptr < ctx->re_end && *endptr != CHAR_COLON) - endptr++; + while (endptr < ctx->re_end && *endptr != CHAR_COLON) endptr++; if (endptr != 
ctx->re_end) { - len = MIN(endptr - re - 2, 63); - - if (qse_getctypebyxname (re + 2, len, &class) <= -1) status = REG_ECTYPE; - - /* Optimize character classes for 8 bit character sets. */ - if (status == REG_OK && TRE_MB_CUR_MAX == 1) + /* QSE: bug fix of not checking ending ] */ + if (*(endptr + 1) != CHAR_RBRACKET) status = REG_ECTYPE; + else { - status = tre_expand_ctype(ctx->mem, class, items, + /* END QSE */ + len = MIN(endptr - re - 2, 63); + + if (qse_getctypebyxname (re + 2, len, &class) <= -1) status = REG_ECTYPE; + + /* Optimize character classes for 8 bit character sets. */ + if (status == REG_OK && TRE_MB_CUR_MAX == 1) + { + status = tre_expand_ctype(ctx->mem, class, items, &i, &max_i, ctx->cflags); - class = (tre_ctype_t)0; - skip = 1; + class = (tre_ctype_t)0; + skip = 1; + } + re = endptr + 2; } - re = endptr + 2; } - else - status = REG_ECTYPE; + else status = REG_ECTYPE; min = 0; max = TRE_CHAR_MAX; } else { DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); - if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET - && ctx->re != re) + if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET && ctx->re != re) /* Two ranges are not allowed to share and endpoint. */ status = REG_ERANGE; min = max = *re++; diff --git a/qse/lib/cmn/tre.h b/qse/lib/cmn/tre.h index 332ff6b2..26cdb4b6 100644 --- a/qse/lib/cmn/tre.h +++ b/qse/lib/cmn/tre.h @@ -55,6 +55,82 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef _QSE_LIB_CMN_TRE_H_ #define _QSE_LIB_CMN_TRE_H_ +/* TODO: MAKE TRE WORK LIKE GNU + +PATTERN: \(.\{0,1\}\)\(~[^,]*\)\([0-9]\)\(\.*\),\([^;]*\)\(;\([^;]*\(\3[^;]*\)\).*X*\1\(.*\)\) +INPUT: ~02.,3~3;0123456789;9876543210 + +------------------------------------------------------ +samples/cmn/tre01 gives the following output. this does not seem wrong, though. + +SUBMATCH[7],[8],[9]. + +SUBMATCH[0] = [~02.,3~3;0123456789;9876543210] +SUBMATCH[1] = [] +SUBMATCH[2] = [~0] +SUBMATCH[3] = [2] +SUBMATCH[4] = [.] 
+SUBMATCH[5] = [3~3] +SUBMATCH[6] = [;0123456789;9876543210] +SUBMATCH[7] = [012] +SUBMATCH[8] = [2] +SUBMATCH[9] = [3456789;9876543210] + +------------------------------------------------------ + +Using the GNU regcomp(),regexec(), the following +is printed. + +#include +#include +#include +int main (int argc, char* argv[]) +{ + regex_t tre; + regmatch_t mat[10]; + int i; + regcomp (&tre, argv[1], 0); + regexec (&tre, argv[2], 10, mat, 0); + for (i = 0; i < 10; i++) + { + if (mat[i].rm_so == -1) break; + printf ("SUBMATCH[%u] = [%.*s]\n", i, + (int)(mat[i].rm_eo - mat[i].rm_so), &argv[2][mat[i].rm_so]); + } + regfree (&tre); + return 0; +} + +SUBMATCH[0] = [~02.,3~3;0123456789;9876543210] +SUBMATCH[1] = [] +SUBMATCH[2] = [~0] +SUBMATCH[3] = [2] +SUBMATCH[4] = [.] +SUBMATCH[5] = [3~3] +SUBMATCH[6] = [;0123456789;9876543210] +SUBMATCH[7] = [0123456789] +SUBMATCH[8] = [23456789] +SUBMATCH[9] = [] + + +------------------------------------------------------ +One more example here: +$ ./tre01 "\(x*\)ab\(\(c*\1\)\(.*\)\)" "abcdefg" +Match: YES +SUBMATCH[0] = [abcdefg] +SUBMATCH[1] = [] +SUBMATCH[2] = [cdefg] +SUBMATCH[3] = [] +SUBMATCH[4] = [cdefg] + +$ ./reg "\(x*\)ab\(\(c*\1\)\(.*\)\)" "abcdefg" +SUBMATCH[0] = [abcdefg] +SUBMATCH[1] = [] +SUBMATCH[2] = [cdefg] +SUBMATCH[3] = [c] +SUBMATCH[4] = [defg] +*/ + #include #ifdef QSE_CHAR_IS_WCHAR diff --git a/qse/lib/sed/sed.c b/qse/lib/sed/sed.c index a93e84e1..89a75694 100644 --- a/qse/lib/sed/sed.c +++ b/qse/lib/sed/sed.c @@ -1,5 +1,5 @@ /* - * $Id: sed.c 563 2011-09-08 07:49:53Z hyunghwan.chung $ + * $Id: sed.c 564 2011-09-10 16:14:38Z hyunghwan.chung $ * Copyright 2006-2011 Chung, Hyung-Hwan. This file is part of QSE. 
@@ -95,8 +95,7 @@ int qse_sed_init (qse_sed_t* sed, qse_mmgr_t* mmgr) qse_map_mancbs(QSE_MAP_MANCBS_INLINE_KEY_COPIER) ); - if (qse_lda_init (&sed->e.txt.appended, mmgr, 32) <= -1) goto oops_4; - if (qse_str_init (&sed->e.txt.read, mmgr, 256) <= -1) goto oops_5; + if (qse_str_init (&sed->e.txt.appended, mmgr, 256) <= -1) goto oops_5; if (qse_str_init (&sed->e.txt.held, mmgr, 256) <= -1) goto oops_6; if (qse_str_init (&sed->e.txt.subst, mmgr, 256) <= -1) goto oops_7; @@ -111,10 +110,8 @@ int qse_sed_init (qse_sed_t* sed, qse_mmgr_t* mmgr) oops_7: qse_str_fini (&sed->e.txt.held); oops_6: - qse_str_fini (&sed->e.txt.read); + qse_str_fini (&sed->e.txt.appended); oops_5: - qse_lda_fini (&sed->e.txt.appended); -oops_4: qse_map_fini (&sed->tmp.labs); oops_3: qse_str_fini (&sed->tmp.lab); @@ -131,8 +128,7 @@ void qse_sed_fini (qse_sed_t* sed) qse_str_fini (&sed->e.txt.subst); qse_str_fini (&sed->e.txt.held); - qse_str_fini (&sed->e.txt.read); - qse_lda_fini (&sed->e.txt.appended); + qse_str_fini (&sed->e.txt.appended); qse_map_fini (&sed->tmp.labs); qse_str_fini (&sed->tmp.lab); @@ -451,14 +447,16 @@ static int pickup_rex ( int really, const qse_sed_cmd_t* cmd, qse_str_t* buf) { qse_cint_t c; - qse_size_t in_bracket = 0; qse_size_t chars_from_opening_bracket = 0; + int bracket_state = 0; qse_str_clear (buf); - for (;;) + while (1) { c = NXTSC (sed); + + shortcut: if (c == QSE_CHAR_EOF || IS_LINTERM(c)) { if (cmd) @@ -480,7 +478,7 @@ static int pickup_rex ( return -1; } - if (c == rxend && in_bracket <= 0) break; + if (c == rxend && bracket_state == 0) break; if (c == QSE_T('\\')) { @@ -509,11 +507,19 @@ static int pickup_rex ( return -1; } - if (in_bracket > 0 && nc == QSE_T(']')) + if (bracket_state > 0 && nc == QSE_T(']')) { - /* if really is not set, in_bracket is alyway 0. - * so this block is never reached */ - if (chars_from_opening_bracket > 1) in_bracket--; + /* + * if 'really' is not set, bracket_state is always 0. + * so this block is never reached. 
+ * + * a backslashed closing bracket is seen. + * it is not :]. if bracket_state is 2, this \] + * makes an illegal regular expression. but, + * let's not care.. just drop the state to 0 + * as if the outer [ is closed. + */ + if (chars_from_opening_bracket > 1) bracket_state = 0; } if (nc == QSE_T('\n')) c = nc; @@ -524,9 +530,9 @@ static int pickup_rex ( ec = trans_escaped (nc); if (ec == nc) { - /* if the character after a backslash is not special at the - * this layer, add the backslash into the regular expression - * buffer as it is. */ + /* if the character after a backslash is not special + * at this layer, add the backslash into the + * regular expression buffer as it is. */ if (qse_str_ccat (buf, QSE_T('\\')) == (qse_size_t)-1) { SETERR0 (sed, QSE_SED_ENOMEM, QSE_NULL); @@ -543,14 +549,40 @@ static int pickup_rex ( if (c == QSE_T('[')) { - if (in_bracket <= 0) chars_from_opening_bracket = 0; - in_bracket++; + if (bracket_state <= 0) + { + bracket_state = 1; + chars_from_opening_bracket = 0; + } + else if (bracket_state == 1) + { + qse_cint_t nc = NXTSC (sed); + if (nc == QSE_T(':')) bracket_state = 2; + + if (qse_str_ccat (buf, c) == (qse_size_t)-1) + { + SETERR0 (sed, QSE_SED_ENOMEM, QSE_NULL); + return -1; + } + chars_from_opening_bracket++; + c = nc; + goto shortcut; + } } - else if (in_bracket > 0 && c == QSE_T(']')) + else if (c == QSE_T(']')) { - /* if it is the first character after [, - * it is a normal character. */ - if (chars_from_opening_bracket > 1) in_bracket--; + if (bracket_state == 1) + { + /* if it is the first character after [, + * it is a normal character. 
*/ + if (chars_from_opening_bracket > 1) bracket_state--; + } + else if (bracket_state == 2) + { + /* it doesn't really care if colon was for opening bracket + * like in [[:]] */ + if (QSE_STR_LASTCHAR(buf) == QSE_T(':')) bracket_state--; + } } } @@ -559,7 +591,6 @@ static int pickup_rex ( SETERR0 (sed, QSE_SED_ENOMEM, QSE_NULL); return -1; } - chars_from_opening_bracket++; } @@ -568,9 +599,10 @@ static int pickup_rex ( static QSE_INLINE void* compile_rex_address (qse_sed_t* sed, qse_char_t rxend) { - if (pickup_rex (sed, rxend, 1, QSE_NULL, &sed->tmp.rex) <= -1) return QSE_NULL; -/* TODO: support ignore case option for address */ + if (pickup_rex (sed, rxend, 1, QSE_NULL, &sed->tmp.rex) <= -1) + return QSE_NULL; +/* TODO: support ignore case option for address */ if (QSE_STR_LEN(&sed->tmp.rex) <= 0) return EMPTY_REX; return build_rex (sed, QSE_STR_CSTR(&sed->tmp.rex), 0, &sed->src.loc); } @@ -1688,7 +1720,7 @@ static int read_file ( for (i = 0; i < n; i++) { - if (qse_str_ccat (&sed->e.txt.read, buf[i]) == (qse_size_t)-1) + if (qse_str_ccat (&sed->e.txt.appended, buf[i]) == (qse_size_t)-1) { sed->e.in.fun ( sed, QSE_SED_IO_CLOSE, @@ -1703,7 +1735,7 @@ static int read_file ( } else { - if (qse_str_ncat (&sed->e.txt.read, buf, n) == (qse_size_t)-1) + if (qse_str_ncat (&sed->e.txt.appended, buf, n) == (qse_size_t)-1) { sed->e.in.fun ( sed, QSE_SED_IO_CLOSE, @@ -2523,10 +2555,10 @@ static qse_sed_cmd_t* exec_cmd (qse_sed_t* sed, qse_sed_cmd_t* cmd) break; case QSE_SED_CMD_APPEND: - if (qse_lda_insert ( + if (qse_str_ncat ( &sed->e.txt.appended, - QSE_LDA_SIZE(&sed->e.txt.appended), - &cmd->u.text, 0) == (qse_size_t)-1) + cmd->u.text.ptr, + cmd->u.text.len) == (qse_size_t)-1) { SETERR0 (sed, QSE_SED_ENOMEM, QSE_NULL); return QSE_NULL; @@ -2801,7 +2833,11 @@ static qse_sed_cmd_t* exec_cmd (qse_sed_t* sed, qse_sed_cmd_t* cmd) * if so, it has not mathing translation */ /* TODO: support different line end convension */ - if (len > 0 && ptr[len-1] == QSE_T('\n')) len--; + 
if (len > 0 && ptr[len-1] == QSE_T('\n')) + { + len--; + if (len > 0 && ptr[len-1] == QSE_T('\r')) len--; + } for (i = 0; i < len; i++) { @@ -2941,6 +2977,39 @@ static int init_all_commands_for_exec (qse_sed_t* sed) return 0; } +static int emit_output (qse_sed_t* sed) +{ + int n; +#if 0 + qse_size_t i; +#endif + + if (!(sed->option & QSE_SED_QUIET)) + { + /* write the pattern space */ + n = write_str (sed, + QSE_STR_PTR(&sed->e.in.line), + QSE_STR_LEN(&sed->e.in.line)); + if (n <= -1) return -1; + } + + /* write text appended by a and r */ + n = write_str ( + sed, + QSE_STR_PTR(&sed->e.txt.appended), + QSE_STR_LEN(&sed->e.txt.appended) + ); + if (n <= -1) return -1; + + /* flush the output stream in case it's not flushed + * in write functions */ + n = flush (sed); + if (n <= -1) return -1; + + return 0; +} + + int qse_sed_exec (qse_sed_t* sed, qse_sed_io_fun_t inf, qse_sed_io_fun_t outf) { qse_ssize_t n; @@ -2968,8 +3037,7 @@ int qse_sed_exec (qse_sed_t* sed, qse_sed_io_fun_t inf, qse_sed_io_fun_t outf) sed->e.last_rex = QSE_NULL; sed->e.subst_done = 0; - qse_lda_clear (&sed->e.txt.appended); - qse_str_clear (&sed->e.txt.read); + qse_str_clear (&sed->e.txt.appended); qse_str_clear (&sed->e.txt.subst); qse_str_clear (&sed->e.txt.held); if (qse_str_ccat (&sed->e.txt.held, QSE_T('\n')) == (qse_size_t)-1) @@ -3045,15 +3113,11 @@ int qse_sed_exec (qse_sed_t* sed, qse_sed_io_fun_t inf, qse_sed_io_fun_t outf) while (1) { - qse_size_t i; - int quit = 0; - n = read_line (sed, 0); if (n <= -1) { ret = -1; goto done; } if (n == 0) goto done; - qse_lda_clear (&sed->e.txt.appended); - qse_str_clear (&sed->e.txt.read); + qse_str_clear (&sed->e.txt.appended); if (sed->cmd.fb.len > 0) { @@ -3077,7 +3141,11 @@ int qse_sed_exec (qse_sed_t* sed, qse_sed_io_fun_t inf, qse_sed_io_fun_t outf) j = exec_cmd (sed, c); if (j == QSE_NULL) { ret = -1; goto done; } if (j == &sed->cmd.quit_quiet) goto done; - if (j == &sed->cmd.quit) { quit = 1; break; } + if (j == &sed->cmd.quit) + { + if 
(emit_output (sed) <= -1) ret = -1; + goto done; + } if (j == &sed->cmd.again) goto again; /* go to the next command */ @@ -3085,37 +3153,8 @@ int qse_sed_exec (qse_sed_t* sed, qse_sed_io_fun_t inf, qse_sed_io_fun_t outf) } } - if (!(sed->option & QSE_SED_QUIET)) - { - /* write the pattern space */ - n = write_str (sed, - QSE_STR_PTR(&sed->e.in.line), - QSE_STR_LEN(&sed->e.in.line)); - if (n <= -1) { ret = -1; goto done; } - } - /* write text read in by the r command */ - n = write_str ( - sed, - QSE_STR_PTR(&sed->e.txt.read), - QSE_STR_LEN(&sed->e.txt.read) - ); - if (n <= -1) { ret = -1; goto done; } - - /* write appeneded text by the a command */ - for (i = 0; i < QSE_LDA_SIZE(&sed->e.txt.appended); i++) - { - qse_xstr_t* t = QSE_LDA_DPTR(&sed->e.txt.appended, i); - n = write_str (sed, t->ptr, t->len); - if (n <= -1) { ret = -1; goto done; } - } - - /* flush the output stream in case it's not flushed - * in write functions */ - n = flush (sed); - if (n <= -1) { ret = -1; goto done; } - - if (quit) break; + if (emit_output (sed) <= -1) { ret = -1; goto done; } } done: diff --git a/qse/lib/sed/sed.h b/qse/lib/sed/sed.h index 814edfd9..e8fb257f 100644 --- a/qse/lib/sed/sed.h +++ b/qse/lib/sed/sed.h @@ -1,5 +1,5 @@ /* - * $Id: sed.h 563 2011-09-08 07:49:53Z hyunghwan.chung $ + * $Id: sed.h 564 2011-09-10 16:14:38Z hyunghwan.chung $ * Copyright 2006-2011 Chung, Hyung-Hwan. This file is part of QSE. 
@@ -23,7 +23,6 @@ #include #include -#include #define QSE_MAP_AS_RBT #include @@ -262,8 +261,7 @@ struct qse_sed_t /** text buffers */ struct { - qse_lda_t appended; - qse_str_t read; + qse_str_t appended; qse_str_t held; qse_str_t subst; } txt; diff --git a/qse/samples/cmn/tre01.c b/qse/samples/cmn/tre01.c index df1dd91a..e93ad29f 100644 --- a/qse/samples/cmn/tre01.c +++ b/qse/samples/cmn/tre01.c @@ -20,7 +20,7 @@ static int test_main (int argc, qse_char_t* argv[], qse_char_t* envp[]) qse_tre_init (&tre, QSE_NULL); - if (qse_tre_comp (&tre, argv[1], &nsubmat, QSE_TRE_EXTENDED) <= -1) + if (qse_tre_comp (&tre, argv[1], &nsubmat, 0 /*QSE_TRE_EXTENDED*/) <= -1) { qse_printf (QSE_T("ERROR: Cannot compile pattern [%s] - %s\n"), argv[1], qse_tre_geterrmsg(&tre)); goto oops;