Fixed a bug in handling the r and a commands

This commit is contained in:
hyung-hwan 2011-09-11 10:14:38 +00:00
parent 3db2c566a2
commit 00e15a42e9
6 changed files with 226 additions and 118 deletions

View File

@ -129,31 +129,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* Wide character support, no multibyte support. */ /* Wide character support, no multibyte support. */
#define GET_NEXT_WCHAR() \ #define GET_NEXT_WCHAR() \
do { \ do { \
prev_c = next_c; \ prev_c = next_c; \
if (type == STR_BYTE) \ if (type == STR_BYTE) \
{ \ { \
pos++; \ pos++; \
if (len >= 0 && pos >= len) \ if (len >= 0 && pos >= len) next_c = QSE_MT('\0'); \
next_c = '\0'; \ else next_c = (unsigned char)(*str_byte++); \
else \
next_c = (unsigned char)(*str_byte++); \
} \ } \
else if (type == STR_WIDE) \ else if (type == STR_WIDE) \
{ \ { \
pos++; \ pos++; \
if (len >= 0 && pos >= len) \ if (len >= 0 && pos >= len) next_c = QSE_T('\0'); \
next_c = QSE_T('\0'); \ else next_c = *str_wide++; \
else \
next_c = *str_wide++; \
} \ } \
else if (type == STR_USER) \ else if (type == STR_USER) \
{ \ { \
pos += pos_add_next; \ pos += pos_add_next; \
str_user_end = str_source->get_next_char(&next_c, &pos_add_next, \ str_user_end = str_source->get_next_char(&next_c, &pos_add_next, str_source->context); \
str_source->context); \
} \ } \
} while(/*CONSTCOND*/0) } while(/*CONSTCOND*/0)
#endif /* !TRE_MULTIBYTE */ #endif /* !TRE_MULTIBYTE */

View File

@ -280,16 +280,17 @@ tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
else if (re + 1 < ctx->re_end else if (re + 1 < ctx->re_end
&& *re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON) && *re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
{ {
#if 0
char tmp_str[64];
#endif
const tre_char_t *endptr = re + 2; const tre_char_t *endptr = re + 2;
int len; int len;
DPRINT(("tre_parse_bracket: class: '%.*" STRF "'\n", REST(re))); DPRINT(("tre_parse_bracket: class: '%.*" STRF "'\n", REST(re)));
while (endptr < ctx->re_end && *endptr != CHAR_COLON) while (endptr < ctx->re_end && *endptr != CHAR_COLON) endptr++;
endptr++;
if (endptr != ctx->re_end) if (endptr != ctx->re_end)
{ {
/* QSE: bug fix of not checking ending ] */
if (*(endptr + 1) != CHAR_RBRACKET) status = REG_ECTYPE;
else
{
/* END QSE */
len = MIN(endptr - re - 2, 63); len = MIN(endptr - re - 2, 63);
if (qse_getctypebyxname (re + 2, len, &class) <= -1) status = REG_ECTYPE; if (qse_getctypebyxname (re + 2, len, &class) <= -1) status = REG_ECTYPE;
@ -304,16 +305,15 @@ tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
} }
re = endptr + 2; re = endptr + 2;
} }
else }
status = REG_ECTYPE; else status = REG_ECTYPE;
min = 0; min = 0;
max = TRE_CHAR_MAX; max = TRE_CHAR_MAX;
} }
else else
{ {
DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re))); DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET && ctx->re != re)
&& ctx->re != re)
/* Two ranges are not allowed to share an endpoint. */ /* Two ranges are not allowed to share an endpoint. */
status = REG_ERANGE; status = REG_ERANGE;
min = max = *re++; min = max = *re++;

View File

@ -55,6 +55,82 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef _QSE_LIB_CMN_TRE_H_ #ifndef _QSE_LIB_CMN_TRE_H_
#define _QSE_LIB_CMN_TRE_H_ #define _QSE_LIB_CMN_TRE_H_
/* TODO: MAKE TRE WORK LIKE GNU
PATTERN: \(.\{0,1\}\)\(~[^,]*\)\([0-9]\)\(\.*\),\([^;]*\)\(;\([^;]*\(\3[^;]*\)\).*X*\1\(.*\)\)
INPUT: ~02.,3~3;0123456789;9876543210
------------------------------------------------------
samples/cmn/tre01 gives the following output. this does not seem wrong, though.
SUBMATCH[7],[8],[9].
SUBMATCH[0] = [~02.,3~3;0123456789;9876543210]
SUBMATCH[1] = []
SUBMATCH[2] = [~0]
SUBMATCH[3] = [2]
SUBMATCH[4] = [.]
SUBMATCH[5] = [3~3]
SUBMATCH[6] = [;0123456789;9876543210]
SUBMATCH[7] = [012]
SUBMATCH[8] = [2]
SUBMATCH[9] = [3456789;9876543210]
------------------------------------------------------
Using the GNU regcomp(),regexec(), the following
is printed.
#include <sys/types.h>
#include <regex.h>
#include <stdio.h>
int main (int argc, char* argv[])
{
regex_t tre;
regmatch_t mat[10];
int i;
regcomp (&tre, argv[1], 0);
regexec (&tre, argv[2], 10, mat, 0);
for (i = 0; i < 10; i++)
{
if (mat[i].rm_so == -1) break;
printf ("SUBMATCH[%u] = [%.*s]\n", i,
(int)(mat[i].rm_eo - mat[i].rm_so), &argv[2][mat[i].rm_so]);
}
regfree (&tre);
return 0;
}
SUBMATCH[0] = [~02.,3~3;0123456789;9876543210]
SUBMATCH[1] = []
SUBMATCH[2] = [~0]
SUBMATCH[3] = [2]
SUBMATCH[4] = [.]
SUBMATCH[5] = [3~3]
SUBMATCH[6] = [;0123456789;9876543210]
SUBMATCH[7] = [0123456789]
SUBMATCH[8] = [23456789]
SUBMATCH[9] = []
------------------------------------------------------
One more example here:
$ ./tre01 "\(x*\)ab\(\(c*\1\)\(.*\)\)" "abcdefg"
Match: YES
SUBMATCH[0] = [abcdefg]
SUBMATCH[1] = []
SUBMATCH[2] = [cdefg]
SUBMATCH[3] = []
SUBMATCH[4] = [cdefg]
$ ./reg "\(x*\)ab\(\(c*\1\)\(.*\)\)" "abcdefg"
SUBMATCH[0] = [abcdefg]
SUBMATCH[1] = []
SUBMATCH[2] = [cdefg]
SUBMATCH[3] = [c]
SUBMATCH[4] = [defg]
*/
#include <qse/cmn/tre.h> #include <qse/cmn/tre.h>
#ifdef QSE_CHAR_IS_WCHAR #ifdef QSE_CHAR_IS_WCHAR

View File

@ -1,5 +1,5 @@
/* /*
* $Id: sed.c 563 2011-09-08 07:49:53Z hyunghwan.chung $ * $Id: sed.c 564 2011-09-10 16:14:38Z hyunghwan.chung $
* *
Copyright 2006-2011 Chung, Hyung-Hwan. Copyright 2006-2011 Chung, Hyung-Hwan.
This file is part of QSE. This file is part of QSE.
@ -95,8 +95,7 @@ int qse_sed_init (qse_sed_t* sed, qse_mmgr_t* mmgr)
qse_map_mancbs(QSE_MAP_MANCBS_INLINE_KEY_COPIER) qse_map_mancbs(QSE_MAP_MANCBS_INLINE_KEY_COPIER)
); );
if (qse_lda_init (&sed->e.txt.appended, mmgr, 32) <= -1) goto oops_4; if (qse_str_init (&sed->e.txt.appended, mmgr, 256) <= -1) goto oops_5;
if (qse_str_init (&sed->e.txt.read, mmgr, 256) <= -1) goto oops_5;
if (qse_str_init (&sed->e.txt.held, mmgr, 256) <= -1) goto oops_6; if (qse_str_init (&sed->e.txt.held, mmgr, 256) <= -1) goto oops_6;
if (qse_str_init (&sed->e.txt.subst, mmgr, 256) <= -1) goto oops_7; if (qse_str_init (&sed->e.txt.subst, mmgr, 256) <= -1) goto oops_7;
@ -111,10 +110,8 @@ int qse_sed_init (qse_sed_t* sed, qse_mmgr_t* mmgr)
oops_7: oops_7:
qse_str_fini (&sed->e.txt.held); qse_str_fini (&sed->e.txt.held);
oops_6: oops_6:
qse_str_fini (&sed->e.txt.read); qse_str_fini (&sed->e.txt.appended);
oops_5: oops_5:
qse_lda_fini (&sed->e.txt.appended);
oops_4:
qse_map_fini (&sed->tmp.labs); qse_map_fini (&sed->tmp.labs);
oops_3: oops_3:
qse_str_fini (&sed->tmp.lab); qse_str_fini (&sed->tmp.lab);
@ -131,8 +128,7 @@ void qse_sed_fini (qse_sed_t* sed)
qse_str_fini (&sed->e.txt.subst); qse_str_fini (&sed->e.txt.subst);
qse_str_fini (&sed->e.txt.held); qse_str_fini (&sed->e.txt.held);
qse_str_fini (&sed->e.txt.read); qse_str_fini (&sed->e.txt.appended);
qse_lda_fini (&sed->e.txt.appended);
qse_map_fini (&sed->tmp.labs); qse_map_fini (&sed->tmp.labs);
qse_str_fini (&sed->tmp.lab); qse_str_fini (&sed->tmp.lab);
@ -451,14 +447,16 @@ static int pickup_rex (
int really, const qse_sed_cmd_t* cmd, qse_str_t* buf) int really, const qse_sed_cmd_t* cmd, qse_str_t* buf)
{ {
qse_cint_t c; qse_cint_t c;
qse_size_t in_bracket = 0;
qse_size_t chars_from_opening_bracket = 0; qse_size_t chars_from_opening_bracket = 0;
int bracket_state = 0;
qse_str_clear (buf); qse_str_clear (buf);
for (;;) while (1)
{ {
c = NXTSC (sed); c = NXTSC (sed);
shortcut:
if (c == QSE_CHAR_EOF || IS_LINTERM(c)) if (c == QSE_CHAR_EOF || IS_LINTERM(c))
{ {
if (cmd) if (cmd)
@ -480,7 +478,7 @@ static int pickup_rex (
return -1; return -1;
} }
if (c == rxend && in_bracket <= 0) break; if (c == rxend && bracket_state == 0) break;
if (c == QSE_T('\\')) if (c == QSE_T('\\'))
{ {
@ -509,11 +507,19 @@ static int pickup_rex (
return -1; return -1;
} }
if (in_bracket > 0 && nc == QSE_T(']')) if (bracket_state > 0 && nc == QSE_T(']'))
{ {
/* if really is not set, in_bracket is alyway 0. /*
* so this block is never reached */ * if 'really' is not set, bracket_state is alyway 0.
if (chars_from_opening_bracket > 1) in_bracket--; * so this block is never reached.
*
* a backslashed closing bracket is seen.
* it is not :]. if bracket_state is 2, this \]
* makes an illegal regular expression. but,
* let's not care.. just drop the state to 0
* as if the outer [ is closed.
*/
if (chars_from_opening_bracket > 1) bracket_state = 0;
} }
if (nc == QSE_T('\n')) c = nc; if (nc == QSE_T('\n')) c = nc;
@ -524,9 +530,9 @@ static int pickup_rex (
ec = trans_escaped (nc); ec = trans_escaped (nc);
if (ec == nc) if (ec == nc)
{ {
/* if the character after a backslash is not special at /* if the character after a backslash is not special
* this layer, add the backslash into the regular expression * at this layer, add the backslash into the
* buffer as it is. */ * regular expression buffer as it is. */
if (qse_str_ccat (buf, QSE_T('\\')) == (qse_size_t)-1) if (qse_str_ccat (buf, QSE_T('\\')) == (qse_size_t)-1)
{ {
SETERR0 (sed, QSE_SED_ENOMEM, QSE_NULL); SETERR0 (sed, QSE_SED_ENOMEM, QSE_NULL);
@ -543,14 +549,40 @@ static int pickup_rex (
if (c == QSE_T('[')) if (c == QSE_T('['))
{ {
if (in_bracket <= 0) chars_from_opening_bracket = 0; if (bracket_state <= 0)
in_bracket++; {
bracket_state = 1;
chars_from_opening_bracket = 0;
} }
else if (in_bracket > 0 && c == QSE_T(']')) else if (bracket_state == 1)
{
qse_cint_t nc = NXTSC (sed);
if (nc == QSE_T(':')) bracket_state = 2;
if (qse_str_ccat (buf, c) == (qse_size_t)-1)
{
SETERR0 (sed, QSE_SED_ENOMEM, QSE_NULL);
return -1;
}
chars_from_opening_bracket++;
c = nc;
goto shortcut;
}
}
else if (c == QSE_T(']'))
{
if (bracket_state == 1)
{ {
/* if it is the first character after [, /* if it is the first character after [,
* it is a normal character. */ * it is a normal character. */
if (chars_from_opening_bracket > 1) in_bracket--; if (chars_from_opening_bracket > 1) bracket_state--;
}
else if (bracket_state == 2)
{
/* it doesn't really care if colon was for opening bracket
* like in [[:]] */
if (QSE_STR_LASTCHAR(buf) == QSE_T(':')) bracket_state--;
}
} }
} }
@ -559,7 +591,6 @@ static int pickup_rex (
SETERR0 (sed, QSE_SED_ENOMEM, QSE_NULL); SETERR0 (sed, QSE_SED_ENOMEM, QSE_NULL);
return -1; return -1;
} }
chars_from_opening_bracket++; chars_from_opening_bracket++;
} }
@ -568,9 +599,10 @@ static int pickup_rex (
static QSE_INLINE void* compile_rex_address (qse_sed_t* sed, qse_char_t rxend) static QSE_INLINE void* compile_rex_address (qse_sed_t* sed, qse_char_t rxend)
{ {
if (pickup_rex (sed, rxend, 1, QSE_NULL, &sed->tmp.rex) <= -1) return QSE_NULL; if (pickup_rex (sed, rxend, 1, QSE_NULL, &sed->tmp.rex) <= -1)
/* TODO: support ignore case option for address */ return QSE_NULL;
/* TODO: support ignore case option for address */
if (QSE_STR_LEN(&sed->tmp.rex) <= 0) return EMPTY_REX; if (QSE_STR_LEN(&sed->tmp.rex) <= 0) return EMPTY_REX;
return build_rex (sed, QSE_STR_CSTR(&sed->tmp.rex), 0, &sed->src.loc); return build_rex (sed, QSE_STR_CSTR(&sed->tmp.rex), 0, &sed->src.loc);
} }
@ -1688,7 +1720,7 @@ static int read_file (
for (i = 0; i < n; i++) for (i = 0; i < n; i++)
{ {
if (qse_str_ccat (&sed->e.txt.read, buf[i]) == (qse_size_t)-1) if (qse_str_ccat (&sed->e.txt.appended, buf[i]) == (qse_size_t)-1)
{ {
sed->e.in.fun ( sed->e.in.fun (
sed, QSE_SED_IO_CLOSE, sed, QSE_SED_IO_CLOSE,
@ -1703,7 +1735,7 @@ static int read_file (
} }
else else
{ {
if (qse_str_ncat (&sed->e.txt.read, buf, n) == (qse_size_t)-1) if (qse_str_ncat (&sed->e.txt.appended, buf, n) == (qse_size_t)-1)
{ {
sed->e.in.fun ( sed->e.in.fun (
sed, QSE_SED_IO_CLOSE, sed, QSE_SED_IO_CLOSE,
@ -2523,10 +2555,10 @@ static qse_sed_cmd_t* exec_cmd (qse_sed_t* sed, qse_sed_cmd_t* cmd)
break; break;
case QSE_SED_CMD_APPEND: case QSE_SED_CMD_APPEND:
if (qse_lda_insert ( if (qse_str_ncat (
&sed->e.txt.appended, &sed->e.txt.appended,
QSE_LDA_SIZE(&sed->e.txt.appended), cmd->u.text.ptr,
&cmd->u.text, 0) == (qse_size_t)-1) cmd->u.text.len) == (qse_size_t)-1)
{ {
SETERR0 (sed, QSE_SED_ENOMEM, QSE_NULL); SETERR0 (sed, QSE_SED_ENOMEM, QSE_NULL);
return QSE_NULL; return QSE_NULL;
@ -2801,7 +2833,11 @@ static qse_sed_cmd_t* exec_cmd (qse_sed_t* sed, qse_sed_cmd_t* cmd)
* if so, it has no matching translation */ * if so, it has no matching translation */
/* TODO: support different line end convention */ /* TODO: support different line end convention */
if (len > 0 && ptr[len-1] == QSE_T('\n')) len--; if (len > 0 && ptr[len-1] == QSE_T('\n'))
{
len--;
if (len > 0 && ptr[len-1] == QSE_T('\r')) len--;
}
for (i = 0; i < len; i++) for (i = 0; i < len; i++)
{ {
@ -2941,6 +2977,39 @@ static int init_all_commands_for_exec (qse_sed_t* sed)
return 0; return 0;
} }
/*
 * Emit the output of the current cycle.
 *
 * Writes the pattern space (sed->e.in.line) unless the QSE_SED_QUIET
 * option is set, then writes the text accumulated in sed->e.txt.appended
 * by the a and r commands, and finally flushes the output stream.
 * The appended text is written regardless of QSE_SED_QUIET.
 *
 * Returns 0 on success, or -1 as soon as write_str() or flush() reports
 * a failure (any value <= -1).
 */
static int emit_output (qse_sed_t* sed)
{
	int n;

	if (!(sed->option & QSE_SED_QUIET))
	{
		/* write the pattern space */
		n = write_str (sed,
			QSE_STR_PTR(&sed->e.in.line),
			QSE_STR_LEN(&sed->e.in.line));
		if (n <= -1) return -1;
	}

	/* write text appended by a and r */
	n = write_str (
		sed,
		QSE_STR_PTR(&sed->e.txt.appended),
		QSE_STR_LEN(&sed->e.txt.appended)
	);
	if (n <= -1) return -1;

	/* flush the output stream in case it's not flushed
	 * in write functions */
	n = flush (sed);
	if (n <= -1) return -1;

	return 0;
}
int qse_sed_exec (qse_sed_t* sed, qse_sed_io_fun_t inf, qse_sed_io_fun_t outf) int qse_sed_exec (qse_sed_t* sed, qse_sed_io_fun_t inf, qse_sed_io_fun_t outf)
{ {
qse_ssize_t n; qse_ssize_t n;
@ -2968,8 +3037,7 @@ int qse_sed_exec (qse_sed_t* sed, qse_sed_io_fun_t inf, qse_sed_io_fun_t outf)
sed->e.last_rex = QSE_NULL; sed->e.last_rex = QSE_NULL;
sed->e.subst_done = 0; sed->e.subst_done = 0;
qse_lda_clear (&sed->e.txt.appended); qse_str_clear (&sed->e.txt.appended);
qse_str_clear (&sed->e.txt.read);
qse_str_clear (&sed->e.txt.subst); qse_str_clear (&sed->e.txt.subst);
qse_str_clear (&sed->e.txt.held); qse_str_clear (&sed->e.txt.held);
if (qse_str_ccat (&sed->e.txt.held, QSE_T('\n')) == (qse_size_t)-1) if (qse_str_ccat (&sed->e.txt.held, QSE_T('\n')) == (qse_size_t)-1)
@ -3045,15 +3113,11 @@ int qse_sed_exec (qse_sed_t* sed, qse_sed_io_fun_t inf, qse_sed_io_fun_t outf)
while (1) while (1)
{ {
qse_size_t i;
int quit = 0;
n = read_line (sed, 0); n = read_line (sed, 0);
if (n <= -1) { ret = -1; goto done; } if (n <= -1) { ret = -1; goto done; }
if (n == 0) goto done; if (n == 0) goto done;
qse_lda_clear (&sed->e.txt.appended); qse_str_clear (&sed->e.txt.appended);
qse_str_clear (&sed->e.txt.read);
if (sed->cmd.fb.len > 0) if (sed->cmd.fb.len > 0)
{ {
@ -3077,7 +3141,11 @@ int qse_sed_exec (qse_sed_t* sed, qse_sed_io_fun_t inf, qse_sed_io_fun_t outf)
j = exec_cmd (sed, c); j = exec_cmd (sed, c);
if (j == QSE_NULL) { ret = -1; goto done; } if (j == QSE_NULL) { ret = -1; goto done; }
if (j == &sed->cmd.quit_quiet) goto done; if (j == &sed->cmd.quit_quiet) goto done;
if (j == &sed->cmd.quit) { quit = 1; break; } if (j == &sed->cmd.quit)
{
if (emit_output (sed) <= -1) ret = -1;
goto done;
}
if (j == &sed->cmd.again) goto again; if (j == &sed->cmd.again) goto again;
/* go to the next command */ /* go to the next command */
@ -3085,37 +3153,8 @@ int qse_sed_exec (qse_sed_t* sed, qse_sed_io_fun_t inf, qse_sed_io_fun_t outf)
} }
} }
if (!(sed->option & QSE_SED_QUIET))
{
/* write the pattern space */
n = write_str (sed,
QSE_STR_PTR(&sed->e.in.line),
QSE_STR_LEN(&sed->e.in.line));
if (n <= -1) { ret = -1; goto done; }
}
/* write text read in by the r command */ if (emit_output (sed) <= -1) { ret = -1; goto done; }
n = write_str (
sed,
QSE_STR_PTR(&sed->e.txt.read),
QSE_STR_LEN(&sed->e.txt.read)
);
if (n <= -1) { ret = -1; goto done; }
/* write appeneded text by the a command */
for (i = 0; i < QSE_LDA_SIZE(&sed->e.txt.appended); i++)
{
qse_xstr_t* t = QSE_LDA_DPTR(&sed->e.txt.appended, i);
n = write_str (sed, t->ptr, t->len);
if (n <= -1) { ret = -1; goto done; }
}
/* flush the output stream in case it's not flushed
* in write functions */
n = flush (sed);
if (n <= -1) { ret = -1; goto done; }
if (quit) break;
} }
done: done:

View File

@ -1,5 +1,5 @@
/* /*
* $Id: sed.h 563 2011-09-08 07:49:53Z hyunghwan.chung $ * $Id: sed.h 564 2011-09-10 16:14:38Z hyunghwan.chung $
* *
Copyright 2006-2011 Chung, Hyung-Hwan. Copyright 2006-2011 Chung, Hyung-Hwan.
This file is part of QSE. This file is part of QSE.
@ -23,7 +23,6 @@
#include <qse/sed/sed.h> #include <qse/sed/sed.h>
#include <qse/cmn/str.h> #include <qse/cmn/str.h>
#include <qse/cmn/lda.h>
#define QSE_MAP_AS_RBT #define QSE_MAP_AS_RBT
#include <qse/cmn/map.h> #include <qse/cmn/map.h>
@ -262,8 +261,7 @@ struct qse_sed_t
/** text buffers */ /** text buffers */
struct struct
{ {
qse_lda_t appended; qse_str_t appended;
qse_str_t read;
qse_str_t held; qse_str_t held;
qse_str_t subst; qse_str_t subst;
} txt; } txt;

View File

@ -20,7 +20,7 @@ static int test_main (int argc, qse_char_t* argv[], qse_char_t* envp[])
qse_tre_init (&tre, QSE_NULL); qse_tre_init (&tre, QSE_NULL);
if (qse_tre_comp (&tre, argv[1], &nsubmat, QSE_TRE_EXTENDED) <= -1) if (qse_tre_comp (&tre, argv[1], &nsubmat, 0 /*QSE_TRE_EXTENDED*/) <= -1)
{ {
qse_printf (QSE_T("ERROR: Cannot compile pattern [%s] - %s\n"), argv[1], qse_tre_geterrmsg(&tre)); qse_printf (QSE_T("ERROR: Cannot compile pattern [%s] - %s\n"), argv[1], qse_tre_geterrmsg(&tre));
goto oops; goto oops;