fixed a bug in the handling of the r and a commands

This commit is contained in:
hyung-hwan 2011-09-11 10:14:38 +00:00
parent 3db2c566a2
commit 00e15a42e9
6 changed files with 226 additions and 118 deletions

View File

@ -129,31 +129,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* Wide character support, no multibyte support. */
#define GET_NEXT_WCHAR() \
do { \
prev_c = next_c; \
if (type == STR_BYTE) \
{ \
pos++; \
if (len >= 0 && pos >= len) \
next_c = '\0'; \
else \
next_c = (unsigned char)(*str_byte++); \
do { \
prev_c = next_c; \
if (type == STR_BYTE) \
{ \
pos++; \
if (len >= 0 && pos >= len) next_c = QSE_MT('\0'); \
else next_c = (unsigned char)(*str_byte++); \
} \
else if (type == STR_WIDE) \
{ \
pos++; \
if (len >= 0 && pos >= len) \
next_c = QSE_T('\0'); \
else \
next_c = *str_wide++; \
else if (type == STR_WIDE) \
{ \
pos++; \
if (len >= 0 && pos >= len) next_c = QSE_T('\0'); \
else next_c = *str_wide++; \
} \
else if (type == STR_USER) \
{ \
pos += pos_add_next; \
str_user_end = str_source->get_next_char(&next_c, &pos_add_next, \
str_source->context); \
} \
} while(/*CONSTCOND*/0)
else if (type == STR_USER) \
{ \
pos += pos_add_next; \
str_user_end = str_source->get_next_char(&next_c, &pos_add_next, str_source->context); \
} \
} while(/*CONSTCOND*/0)
#endif /* !TRE_MULTIBYTE */

View File

@ -280,40 +280,40 @@ tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
else if (re + 1 < ctx->re_end
&& *re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
{
#if 0
char tmp_str[64];
#endif
const tre_char_t *endptr = re + 2;
int len;
DPRINT(("tre_parse_bracket: class: '%.*" STRF "'\n", REST(re)));
while (endptr < ctx->re_end && *endptr != CHAR_COLON)
endptr++;
while (endptr < ctx->re_end && *endptr != CHAR_COLON) endptr++;
if (endptr != ctx->re_end)
{
len = MIN(endptr - re - 2, 63);
if (qse_getctypebyxname (re + 2, len, &class) <= -1) status = REG_ECTYPE;
/* Optimize character classes for 8 bit character sets. */
if (status == REG_OK && TRE_MB_CUR_MAX == 1)
/* QSE: bug fix of not checking ending ] */
if (*(endptr + 1) != CHAR_RBRACKET) status = REG_ECTYPE;
else
{
status = tre_expand_ctype(ctx->mem, class, items,
/* END QSE */
len = MIN(endptr - re - 2, 63);
if (qse_getctypebyxname (re + 2, len, &class) <= -1) status = REG_ECTYPE;
/* Optimize character classes for 8 bit character sets. */
if (status == REG_OK && TRE_MB_CUR_MAX == 1)
{
status = tre_expand_ctype(ctx->mem, class, items,
&i, &max_i, ctx->cflags);
class = (tre_ctype_t)0;
skip = 1;
class = (tre_ctype_t)0;
skip = 1;
}
re = endptr + 2;
}
re = endptr + 2;
}
else
status = REG_ECTYPE;
else status = REG_ECTYPE;
min = 0;
max = TRE_CHAR_MAX;
}
else
{
DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET
&& ctx->re != re)
if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET && ctx->re != re)
/* Two ranges are not allowed to share an endpoint. */
status = REG_ERANGE;
min = max = *re++;

View File

@ -55,6 +55,82 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef _QSE_LIB_CMN_TRE_H_
#define _QSE_LIB_CMN_TRE_H_
/* TODO: MAKE TRE WORK LIKE GNU
PATTERN: \(.\{0,1\}\)\(~[^,]*\)\([0-9]\)\(\.*\),\([^;]*\)\(;\([^;]*\(\3[^;]*\)\).*X*\1\(.*\)\)
INPUT: ~02.,3~3;0123456789;9876543210
------------------------------------------------------
samples/cmn/tre01 gives the following output. this does not seem wrong, though.
SUBMATCH[7],[8],[9].
SUBMATCH[0] = [~02.,3~3;0123456789;9876543210]
SUBMATCH[1] = []
SUBMATCH[2] = [~0]
SUBMATCH[3] = [2]
SUBMATCH[4] = [.]
SUBMATCH[5] = [3~3]
SUBMATCH[6] = [;0123456789;9876543210]
SUBMATCH[7] = [012]
SUBMATCH[8] = [2]
SUBMATCH[9] = [3456789;9876543210]
------------------------------------------------------
Using the GNU regcomp() and regexec() through the program below,
the following is printed.
#include <sys/types.h>
#include <regex.h>
#include <stdio.h>
int main (int argc, char* argv[])
{
regex_t tre;
regmatch_t mat[10];
int i;
regcomp (&tre, argv[1], 0);
regexec (&tre, argv[2], 10, mat, 0);
for (i = 0; i < 10; i++)
{
if (mat[i].rm_so == -1) break;
printf ("SUBMATCH[%u] = [%.*s]\n", i,
(int)(mat[i].rm_eo - mat[i].rm_so), &argv[2][mat[i].rm_so]);
}
regfree (&tre);
return 0;
}
SUBMATCH[0] = [~02.,3~3;0123456789;9876543210]
SUBMATCH[1] = []
SUBMATCH[2] = [~0]
SUBMATCH[3] = [2]
SUBMATCH[4] = [.]
SUBMATCH[5] = [3~3]
SUBMATCH[6] = [;0123456789;9876543210]
SUBMATCH[7] = [0123456789]
SUBMATCH[8] = [23456789]
SUBMATCH[9] = []
------------------------------------------------------
One more example here:
$ ./tre01 "\(x*\)ab\(\(c*\1\)\(.*\)\)" "abcdefg"
Match: YES
SUBMATCH[0] = [abcdefg]
SUBMATCH[1] = []
SUBMATCH[2] = [cdefg]
SUBMATCH[3] = []
SUBMATCH[4] = [cdefg]
$ ./reg "\(x*\)ab\(\(c*\1\)\(.*\)\)" "abcdefg"
SUBMATCH[0] = [abcdefg]
SUBMATCH[1] = []
SUBMATCH[2] = [cdefg]
SUBMATCH[3] = [c]
SUBMATCH[4] = [defg]
*/
#include <qse/cmn/tre.h>
#ifdef QSE_CHAR_IS_WCHAR

View File

@ -1,5 +1,5 @@
/*
* $Id: sed.c 563 2011-09-08 07:49:53Z hyunghwan.chung $
* $Id: sed.c 564 2011-09-10 16:14:38Z hyunghwan.chung $
*
Copyright 2006-2011 Chung, Hyung-Hwan.
This file is part of QSE.
@ -95,8 +95,7 @@ int qse_sed_init (qse_sed_t* sed, qse_mmgr_t* mmgr)
qse_map_mancbs(QSE_MAP_MANCBS_INLINE_KEY_COPIER)
);
if (qse_lda_init (&sed->e.txt.appended, mmgr, 32) <= -1) goto oops_4;
if (qse_str_init (&sed->e.txt.read, mmgr, 256) <= -1) goto oops_5;
if (qse_str_init (&sed->e.txt.appended, mmgr, 256) <= -1) goto oops_5;
if (qse_str_init (&sed->e.txt.held, mmgr, 256) <= -1) goto oops_6;
if (qse_str_init (&sed->e.txt.subst, mmgr, 256) <= -1) goto oops_7;
@ -111,10 +110,8 @@ int qse_sed_init (qse_sed_t* sed, qse_mmgr_t* mmgr)
oops_7:
qse_str_fini (&sed->e.txt.held);
oops_6:
qse_str_fini (&sed->e.txt.read);
qse_str_fini (&sed->e.txt.appended);
oops_5:
qse_lda_fini (&sed->e.txt.appended);
oops_4:
qse_map_fini (&sed->tmp.labs);
oops_3:
qse_str_fini (&sed->tmp.lab);
@ -131,8 +128,7 @@ void qse_sed_fini (qse_sed_t* sed)
qse_str_fini (&sed->e.txt.subst);
qse_str_fini (&sed->e.txt.held);
qse_str_fini (&sed->e.txt.read);
qse_lda_fini (&sed->e.txt.appended);
qse_str_fini (&sed->e.txt.appended);
qse_map_fini (&sed->tmp.labs);
qse_str_fini (&sed->tmp.lab);
@ -451,14 +447,16 @@ static int pickup_rex (
int really, const qse_sed_cmd_t* cmd, qse_str_t* buf)
{
qse_cint_t c;
qse_size_t in_bracket = 0;
qse_size_t chars_from_opening_bracket = 0;
int bracket_state = 0;
qse_str_clear (buf);
for (;;)
while (1)
{
c = NXTSC (sed);
shortcut:
if (c == QSE_CHAR_EOF || IS_LINTERM(c))
{
if (cmd)
@ -480,7 +478,7 @@ static int pickup_rex (
return -1;
}
if (c == rxend && in_bracket <= 0) break;
if (c == rxend && bracket_state == 0) break;
if (c == QSE_T('\\'))
{
@ -509,11 +507,19 @@ static int pickup_rex (
return -1;
}
if (in_bracket > 0 && nc == QSE_T(']'))
if (bracket_state > 0 && nc == QSE_T(']'))
{
/* if really is not set, in_bracket is always 0.
* so this block is never reached */
if (chars_from_opening_bracket > 1) in_bracket--;
/*
* if 'really' is not set, bracket_state is always 0.
* so this block is never reached.
*
* a backslashed closing bracket is seen.
* it is not :]. if bracket_state is 2, this \]
* makes an illegal regular expression. but,
* let's not care.. just drop the state to 0
* as if the outer [ is closed.
*/
if (chars_from_opening_bracket > 1) bracket_state = 0;
}
if (nc == QSE_T('\n')) c = nc;
@ -524,9 +530,9 @@ static int pickup_rex (
ec = trans_escaped (nc);
if (ec == nc)
{
/* if the character after a backslash is not special at
* this layer, add the backslash into the regular expression
* buffer as it is. */
/* if the character after a backslash is not special
* at this layer, add the backslash into the
* regular expression buffer as it is. */
if (qse_str_ccat (buf, QSE_T('\\')) == (qse_size_t)-1)
{
SETERR0 (sed, QSE_SED_ENOMEM, QSE_NULL);
@ -543,14 +549,40 @@ static int pickup_rex (
if (c == QSE_T('['))
{
if (in_bracket <= 0) chars_from_opening_bracket = 0;
in_bracket++;
if (bracket_state <= 0)
{
bracket_state = 1;
chars_from_opening_bracket = 0;
}
else if (bracket_state == 1)
{
qse_cint_t nc = NXTSC (sed);
if (nc == QSE_T(':')) bracket_state = 2;
if (qse_str_ccat (buf, c) == (qse_size_t)-1)
{
SETERR0 (sed, QSE_SED_ENOMEM, QSE_NULL);
return -1;
}
chars_from_opening_bracket++;
c = nc;
goto shortcut;
}
}
else if (in_bracket > 0 && c == QSE_T(']'))
else if (c == QSE_T(']'))
{
/* if it is the first character after [,
* it is a normal character. */
if (chars_from_opening_bracket > 1) in_bracket--;
if (bracket_state == 1)
{
/* if it is the first character after [,
* it is a normal character. */
if (chars_from_opening_bracket > 1) bracket_state--;
}
else if (bracket_state == 2)
{
/* it doesn't really care if colon was for opening bracket
* like in [[:]] */
if (QSE_STR_LASTCHAR(buf) == QSE_T(':')) bracket_state--;
}
}
}
@ -559,7 +591,6 @@ static int pickup_rex (
SETERR0 (sed, QSE_SED_ENOMEM, QSE_NULL);
return -1;
}
chars_from_opening_bracket++;
}
@ -568,9 +599,10 @@ static int pickup_rex (
static QSE_INLINE void* compile_rex_address (qse_sed_t* sed, qse_char_t rxend)
{
if (pickup_rex (sed, rxend, 1, QSE_NULL, &sed->tmp.rex) <= -1) return QSE_NULL;
/* TODO: support ignore case option for address */
if (pickup_rex (sed, rxend, 1, QSE_NULL, &sed->tmp.rex) <= -1)
return QSE_NULL;
/* TODO: support ignore case option for address */
if (QSE_STR_LEN(&sed->tmp.rex) <= 0) return EMPTY_REX;
return build_rex (sed, QSE_STR_CSTR(&sed->tmp.rex), 0, &sed->src.loc);
}
@ -1688,7 +1720,7 @@ static int read_file (
for (i = 0; i < n; i++)
{
if (qse_str_ccat (&sed->e.txt.read, buf[i]) == (qse_size_t)-1)
if (qse_str_ccat (&sed->e.txt.appended, buf[i]) == (qse_size_t)-1)
{
sed->e.in.fun (
sed, QSE_SED_IO_CLOSE,
@ -1703,7 +1735,7 @@ static int read_file (
}
else
{
if (qse_str_ncat (&sed->e.txt.read, buf, n) == (qse_size_t)-1)
if (qse_str_ncat (&sed->e.txt.appended, buf, n) == (qse_size_t)-1)
{
sed->e.in.fun (
sed, QSE_SED_IO_CLOSE,
@ -2523,10 +2555,10 @@ static qse_sed_cmd_t* exec_cmd (qse_sed_t* sed, qse_sed_cmd_t* cmd)
break;
case QSE_SED_CMD_APPEND:
if (qse_lda_insert (
if (qse_str_ncat (
&sed->e.txt.appended,
QSE_LDA_SIZE(&sed->e.txt.appended),
&cmd->u.text, 0) == (qse_size_t)-1)
cmd->u.text.ptr,
cmd->u.text.len) == (qse_size_t)-1)
{
SETERR0 (sed, QSE_SED_ENOMEM, QSE_NULL);
return QSE_NULL;
@ -2801,7 +2833,11 @@ static qse_sed_cmd_t* exec_cmd (qse_sed_t* sed, qse_sed_cmd_t* cmd)
* if so, it has no matching translation */
/* TODO: support different line-end conventions */
if (len > 0 && ptr[len-1] == QSE_T('\n')) len--;
if (len > 0 && ptr[len-1] == QSE_T('\n'))
{
len--;
if (len > 0 && ptr[len-1] == QSE_T('\r')) len--;
}
for (i = 0; i < len; i++)
{
@ -2941,6 +2977,39 @@ static int init_all_commands_for_exec (qse_sed_t* sed)
return 0;
}
/*
 * Writes the output gathered for the current input line and flushes
 * the output stream.
 *
 * Unless the QSE_SED_QUIET option is set, the pattern space
 * (sed->e.in.line) is written first. The text accumulated in
 * sed->e.txt.appended (filled by the a and r commands) is written
 * next, whether or not QSE_SED_QUIET is set. The output stream is
 * flushed last.
 *
 * Returns 0 on success, or -1 as soon as a write or the flush fails.
 */
static int emit_output (qse_sed_t* sed)
{
	int n;

	if (!(sed->option & QSE_SED_QUIET))
	{
		/* write the pattern space */
		n = write_str (sed,
			QSE_STR_PTR(&sed->e.in.line),
			QSE_STR_LEN(&sed->e.in.line));
		if (n <= -1) return -1;
	}

	/* write text appended by a and r */
	n = write_str (
		sed,
		QSE_STR_PTR(&sed->e.txt.appended),
		QSE_STR_LEN(&sed->e.txt.appended)
	);
	if (n <= -1) return -1;

	/* flush the output stream in case it's not flushed
	 * in write functions */
	n = flush (sed);
	if (n <= -1) return -1;

	return 0;
}
int qse_sed_exec (qse_sed_t* sed, qse_sed_io_fun_t inf, qse_sed_io_fun_t outf)
{
qse_ssize_t n;
@ -2968,8 +3037,7 @@ int qse_sed_exec (qse_sed_t* sed, qse_sed_io_fun_t inf, qse_sed_io_fun_t outf)
sed->e.last_rex = QSE_NULL;
sed->e.subst_done = 0;
qse_lda_clear (&sed->e.txt.appended);
qse_str_clear (&sed->e.txt.read);
qse_str_clear (&sed->e.txt.appended);
qse_str_clear (&sed->e.txt.subst);
qse_str_clear (&sed->e.txt.held);
if (qse_str_ccat (&sed->e.txt.held, QSE_T('\n')) == (qse_size_t)-1)
@ -3045,15 +3113,11 @@ int qse_sed_exec (qse_sed_t* sed, qse_sed_io_fun_t inf, qse_sed_io_fun_t outf)
while (1)
{
qse_size_t i;
int quit = 0;
n = read_line (sed, 0);
if (n <= -1) { ret = -1; goto done; }
if (n == 0) goto done;
qse_lda_clear (&sed->e.txt.appended);
qse_str_clear (&sed->e.txt.read);
qse_str_clear (&sed->e.txt.appended);
if (sed->cmd.fb.len > 0)
{
@ -3077,7 +3141,11 @@ int qse_sed_exec (qse_sed_t* sed, qse_sed_io_fun_t inf, qse_sed_io_fun_t outf)
j = exec_cmd (sed, c);
if (j == QSE_NULL) { ret = -1; goto done; }
if (j == &sed->cmd.quit_quiet) goto done;
if (j == &sed->cmd.quit) { quit = 1; break; }
if (j == &sed->cmd.quit)
{
if (emit_output (sed) <= -1) ret = -1;
goto done;
}
if (j == &sed->cmd.again) goto again;
/* go to the next command */
@ -3085,37 +3153,8 @@ int qse_sed_exec (qse_sed_t* sed, qse_sed_io_fun_t inf, qse_sed_io_fun_t outf)
}
}
if (!(sed->option & QSE_SED_QUIET))
{
/* write the pattern space */
n = write_str (sed,
QSE_STR_PTR(&sed->e.in.line),
QSE_STR_LEN(&sed->e.in.line));
if (n <= -1) { ret = -1; goto done; }
}
/* write text read in by the r command */
n = write_str (
sed,
QSE_STR_PTR(&sed->e.txt.read),
QSE_STR_LEN(&sed->e.txt.read)
);
if (n <= -1) { ret = -1; goto done; }
/* write text appended by the a command */
for (i = 0; i < QSE_LDA_SIZE(&sed->e.txt.appended); i++)
{
qse_xstr_t* t = QSE_LDA_DPTR(&sed->e.txt.appended, i);
n = write_str (sed, t->ptr, t->len);
if (n <= -1) { ret = -1; goto done; }
}
/* flush the output stream in case it's not flushed
* in write functions */
n = flush (sed);
if (n <= -1) { ret = -1; goto done; }
if (quit) break;
if (emit_output (sed) <= -1) { ret = -1; goto done; }
}
done:

View File

@ -1,5 +1,5 @@
/*
* $Id: sed.h 563 2011-09-08 07:49:53Z hyunghwan.chung $
* $Id: sed.h 564 2011-09-10 16:14:38Z hyunghwan.chung $
*
Copyright 2006-2011 Chung, Hyung-Hwan.
This file is part of QSE.
@ -23,7 +23,6 @@
#include <qse/sed/sed.h>
#include <qse/cmn/str.h>
#include <qse/cmn/lda.h>
#define QSE_MAP_AS_RBT
#include <qse/cmn/map.h>
@ -262,8 +261,7 @@ struct qse_sed_t
/** text buffers */
struct
{
qse_lda_t appended;
qse_str_t read;
qse_str_t appended;
qse_str_t held;
qse_str_t subst;
} txt;

View File

@ -20,7 +20,7 @@ static int test_main (int argc, qse_char_t* argv[], qse_char_t* envp[])
qse_tre_init (&tre, QSE_NULL);
if (qse_tre_comp (&tre, argv[1], &nsubmat, QSE_TRE_EXTENDED) <= -1)
if (qse_tre_comp (&tre, argv[1], &nsubmat, 0 /*QSE_TRE_EXTENDED*/) <= -1)
{
qse_printf (QSE_T("ERROR: Cannot compile pattern [%s] - %s\n"), argv[1], qse_tre_geterrmsg(&tre));
goto oops;