enhanced sed escape handling

Fixed a bug in sed's do_subst() that added an extra newline after matching $.
Modified TRE slightly for QSE's own needs.
This commit is contained in:
hyung-hwan 2011-09-05 10:21:54 +00:00
parent 3fe32033cb
commit 2f15ca2335
8 changed files with 599 additions and 561 deletions

View File

@ -115,6 +115,7 @@ static qse_ssize_t out (
return 1;
case QSE_SED_IO_CLOSE:
qse_sio_flush (arg->handle);
if (arg->handle != qse_sio_out) qse_sio_close (arg->handle);
return 0;
@ -136,11 +137,12 @@ static void print_usage (QSE_FILE* out, int argc, qse_char_t* argv[])
qse_fprintf (out, QSE_T("options as follows:\n"));
qse_fprintf (out, QSE_T(" -h show this message\n"));
qse_fprintf (out, QSE_T(" -n disable auto-print\n"));
qse_fprintf (out, QSE_T(" -a perform strict address check\n"));
qse_fprintf (out, QSE_T(" -r use the extended regular expression\n"));
qse_fprintf (out, QSE_T(" -s allow text on the same line as c, a, i\n"));
qse_fprintf (out, QSE_T(" -l ensure a newline at text end\n"));
qse_fprintf (out, QSE_T(" -f file specify a script file\n"));
qse_fprintf (out, QSE_T(" -r use the extended regular expression\n"));
qse_fprintf (out, QSE_T(" -a perform strict address check\n"));
qse_fprintf (out, QSE_T(" -w allow address format of start~step\n"));
qse_fprintf (out, QSE_T(" -x allow text on the same line as c, a, i\n"));
qse_fprintf (out, QSE_T(" -y ensure a newline at text end\n"));
qse_fprintf (out, QSE_T(" -m number specify the maximum amount of memory to use in bytes\n"));
}
@ -148,7 +150,7 @@ static int handle_args (int argc, qse_char_t* argv[])
{
static qse_opt_t opt =
{
QSE_T("hnarslf:m:"),
QSE_T("hnf:rawxym:"),
QSE_NULL
};
qse_cint_t c;
@ -185,26 +187,30 @@ static int handle_args (int argc, qse_char_t* argv[])
g_option |= QSE_SED_QUIET;
break;
case QSE_T('a'):
g_option |= QSE_SED_STRICT;
case QSE_T('f'):
g_script_file = opt.arg;
break;
case QSE_T('r'):
g_option |= QSE_SED_EXTENDEDREX;
break;
case QSE_T('s'):
case QSE_T('a'):
g_option |= QSE_SED_STRICT;
break;
case QSE_T('w'):
g_option |= QSE_SED_STARTSTEP;
break;
case QSE_T('x'):
g_option |= QSE_SED_SAMELINE;
break;
case QSE_T('l'):
case QSE_T('y'):
g_option |= QSE_SED_ENSURENL;
break;
case QSE_T('f'):
g_script_file = opt.arg;
break;
case QSE_T('m'):
g_memlimit = qse_strtoulong (opt.arg);
break;

View File

@ -1,5 +1,5 @@
/*
* $Id: tio-put.c 556 2011-08-31 15:43:46Z hyunghwan.chung $
* $Id: tio-put.c 559 2011-09-04 16:21:54Z hyunghwan.chung $
*
Copyright 2006-2011 Chung, Hyung-Hwan.
This file is part of QSE.
@ -21,7 +21,7 @@
#include <qse/cmn/tio.h>
#include <qse/cmn/chr.h>
static qse_ssize_t tio_putc (qse_tio_t* tio, qse_char_t c)
static qse_ssize_t tio_putc (qse_tio_t* tio, qse_char_t c, int* flush_needed)
{
#ifdef QSE_CHAR_IS_WCHAR
qse_size_t n, i;
@ -41,7 +41,10 @@ static qse_ssize_t tio_putc (qse_tio_t* tio, qse_char_t c)
tio->outbuf[tio->outbuf_len++] = c;
if (tio->outbuf_len >= QSE_COUNTOF(tio->outbuf))
{
*flush_needed = 0;
return qse_tio_flush (tio);
}
#else /* QSE_CHAR_IS_WCHAR */
@ -62,7 +65,8 @@ static qse_ssize_t tio_putc (qse_tio_t* tio, qse_char_t c)
tio->outbuf[tio->outbuf_len++] = mc[i];
if (tio->outbuf_len >= QSE_COUNTOF(tio->outbuf))
{
if (qse_tio_flush (tio) == -1) return -1;
*flush_needed = 0;
if (qse_tio_flush (tio) <= -1) return -1;
}
}
@ -70,7 +74,8 @@ static qse_ssize_t tio_putc (qse_tio_t* tio, qse_char_t c)
if (c == QSE_T('\n') && tio->outbuf_len > 0)
{
if (qse_tio_flush (tio) == -1) return -1;
/*if (qse_tio_flush (tio) <= -1) return -1;*/
*flush_needed = 1;
}
return 1;
@ -80,6 +85,7 @@ qse_ssize_t qse_tio_write (qse_tio_t* tio, const qse_char_t* str, qse_size_t siz
{
qse_ssize_t n;
const qse_char_t* p;
int flush_needed = 0;
if (size == 0) return 0;
@ -89,8 +95,8 @@ qse_ssize_t qse_tio_write (qse_tio_t* tio, const qse_char_t* str, qse_size_t siz
{
while (*p != QSE_T('\0'))
{
n = tio_putc (tio, *p);
if (n == -1) return -1;
n = tio_putc (tio, *p, &flush_needed);
if (n <= -1) return -1;
if (n == 0) break;
p++;
}
@ -100,13 +106,14 @@ qse_ssize_t qse_tio_write (qse_tio_t* tio, const qse_char_t* str, qse_size_t siz
const qse_char_t* end = str + size;
while (p < end)
{
n = tio_putc (tio, *p);
if (n == -1) return -1;
n = tio_putc (tio, *p, &flush_needed);
if (n <= -1) return -1;
if (n == 0) break;
p++;
}
}
if (flush_needed && qse_tio_flush(tio) <= -1) return -1;
return p - str;
}

View File

@ -1,5 +1,5 @@
/*
* $Id: tio.c 556 2011-08-31 15:43:46Z hyunghwan.chung $
* $Id: tio.c 559 2011-09-04 16:21:54Z hyunghwan.chung $
*
Copyright 2006-2011 Chung, Hyung-Hwan.
This file is part of QSE.

View File

@ -358,6 +358,8 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
}
break;
}
case CATENATION:
{
tre_catenation_t *cat = node->obj;
@ -393,9 +395,10 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
/* Process left child. */
STACK_PUSHX(stack, voidptr, left);
STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
}
break;
}
case ITERATION:
{
tre_iteration_t *iter = node->obj;
@ -447,8 +450,9 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
next_tag++;
}
direction = TRE_TAG_MINIMIZE;
}
break;
}
case UNION:
{
tre_union_t *uni = node->obj;
@ -1918,7 +1922,9 @@ int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
/* Allocate a stack used throughout the compilation process for various
purposes. */
stack = tre_stack_new(preg->mmgr, 512, 10240, 128);
/* QSE: deleted limit on the stack size
stack = tre_stack_new(preg->mmgr, 512, 10240, 128); */
stack = tre_stack_new(preg->mmgr, 512, -1, 128);
if (!stack)
return REG_ESPACE;
/* Allocate a fast memory allocator. */

View File

@ -218,98 +218,6 @@ tre_compare_items(const void *a, const void *b, void* ctx)
return 0;
}
#if 0
#ifndef TRE_USE_SYSTEM_WCTYPE
/* isalnum() and the rest may be macros, so wrap them to functions. */
int tre_isalnum_func(tre_cint_t c)
{
return tre_isalnum(c);
}
int tre_isalpha_func(tre_cint_t c)
{
return tre_isalpha(c);
}
int tre_isascii_func(tre_cint_t c)
{
return !(c >> 7);
}
int tre_isblank_func(tre_cint_t c)
{
return tre_isblank(c);
}
int tre_iscntrl_func(tre_cint_t c)
{
return tre_iscntrl(c);
}
int tre_isdigit_func(tre_cint_t c)
{
return tre_isdigit(c);
}
int tre_isgraph_func(tre_cint_t c)
{
return tre_isgraph(c);
}
int tre_islower_func(tre_cint_t c)
{
return tre_islower(c);
}
int tre_isprint_func(tre_cint_t c)
{
return tre_isprint(c);
}
int tre_ispunct_func(tre_cint_t c)
{
return tre_ispunct(c);
}
int tre_isspace_func(tre_cint_t c)
{
return tre_isspace(c);
}
int tre_isupper_func(tre_cint_t c)
{
return tre_isupper(c);
}
int tre_isxdigit_func(tre_cint_t c)
{
return tre_isxdigit(c);
}
struct
{
char *name;
int (*func)(tre_cint_t);
} tre_ctype_map[] =
{
{ "alnum", &tre_isalnum_func },
{ "alpha", &tre_isalpha_func },
{ "ascii", &tre_isascii_func },
{ "blank", &tre_isblank_func },
{ "cntrl", &tre_iscntrl_func },
{ "digit", &tre_isdigit_func },
{ "graph", &tre_isgraph_func },
{ "lower", &tre_islower_func },
{ "print", &tre_isprint_func },
{ "punct", &tre_ispunct_func },
{ "space", &tre_isspace_func },
{ "upper", &tre_isupper_func },
{ "xdigit", &tre_isxdigit_func },
{ NULL, NULL}
};
tre_ctype_t tre_ctype(const char *name)
{
int i;
for (i = 0; tre_ctype_map[i].name != NULL; i++)
{
if (qse_mbscmp(name, tre_ctype_map[i].name) == 0)
return tre_ctype_map[i].func;
}
return (tre_ctype_t)0;
}
#endif /* !TRE_USE_SYSTEM_WCTYPE */
#endif
/* Maximum number of character classes that can occur in a negated bracket
expression. */
#define MAX_NEG_CLASSES 64
@ -882,11 +790,15 @@ tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
minimal = !(ctx->cflags & REG_UNGREEDY);
r++;
}
/* QSE - commented out for minimal impact on backward compatibility.
* X{x,y}* X{x,y}+ */
#if 0
else if (*r == CHAR_STAR || *r == CHAR_PLUS)
{
/* These are reserved for future extensions. */
return REG_BADRPT;
}
#endif
}
/* Create the AST node(s). */
@ -1196,12 +1108,18 @@ tre_parse(tre_parse_ctx_t *ctx)
minimal = !(ctx->cflags & REG_UNGREEDY);
ctx->re++;
}
/* QSE - TRE has provisions for ** or *+ as a special repetition operator.
* however, that seems to break backward compatibility.
* '+' in 'a*+' is not treated as a normal character with the
* following block enabled. So let me comment it out */
#if 0
else if (*(ctx->re + 1) == CHAR_STAR
|| *(ctx->re + 1) == CHAR_PLUS)
{
/* These are reserved for future extensions. */
return REG_BADRPT;
}
#endif
}
DPRINT(("tre_parse: %s star: '%.*" STRF "'\n",
@ -1512,6 +1430,7 @@ parse_brace:
ctx->re++;
if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end)
{
/* QSE */
#if 0
/* 8 bit hex char. */
char tmp[3] = {0, 0, 0};
@ -1552,6 +1471,7 @@ parse_brace:
else if (ctx->re < ctx->re_end)
{
/* Wide char. */
/* QSE */
#if 0
char tmp[32];
long val;

View File

@ -117,7 +117,9 @@ tre_stack_push(tre_stack_t *s, union tre_stack_item value)
}
else
{
if (s->size >= s->max_size)
/* QSE added check for s->max_size > 0
if (s->size >= s->max_size)*/
if (s->max_size > 0 && s->size >= s->max_size)
{
DPRINT(("tre_stack_push: stack full\n"));
return REG_ESPACE;
@ -128,7 +130,9 @@ tre_stack_push(tre_stack_t *s, union tre_stack_item value)
int new_size;
DPRINT(("tre_stack_push: trying to realloc more space\n"));
new_size = s->size + s->increment;
if (new_size > s->max_size)
/* QSE added check for s->max_size > 0
if (new_size > s->max_size) */
if (s->max_size > 0 && new_size > s->max_size)
new_size = s->max_size;
new_buffer = xrealloc(s->mmgr, s->stack, sizeof(*new_buffer) * new_size);
if (new_buffer == NULL)

View File

@ -1,5 +1,5 @@
/*
* $Id: sed.c 558 2011-09-02 15:27:44Z hyunghwan.chung $
* $Id: sed.c 559 2011-09-04 16:21:54Z hyunghwan.chung $
*
Copyright 2006-2011 Chung, Hyung-Hwan.
This file is part of QSE.
@ -177,7 +177,11 @@ static qse_tre_t* maketre (
if (qse_tre_compx (tre, str->ptr, str->len, QSE_NULL,
((sed->option & QSE_SED_EXTENDEDREX)? QSE_TRE_EXTENDED: 0)) <= -1)
{
SETERR1 (sed, QSE_SED_EREXBL, str->ptr, str->len, loc);
qse_sed_errnum_t errnum;
errnum = (QSE_TRE_ERRNUM(tre) == QSE_TRE_ENOMEM)?
QSE_TRE_ENOMEM: QSE_SED_EREXBL;
SETERR1 (sed, errnum, str->ptr, str->len, loc);
qse_tre_close (tre);
return QSE_NULL;
}
@ -201,8 +205,13 @@ static int matchtre (
n = qse_tre_execx (tre, str->ptr, str->len, match, 10, opt);
if (n <= -1)
{
qse_sed_errnum_t errnum;
if (QSE_TRE_ERRNUM(tre) == QSE_TRE_ENOMATCH) return 0;
SETERR0 (sed, QSE_SED_EREXMA, loc);
errnum = (QSE_TRE_ERRNUM(tre) == QSE_TRE_ENOMEM)?
QSE_TRE_ENOMEM: QSE_SED_EREXMA;
SETERR0 (sed, errnum, loc);
return -1;
}
@ -374,6 +383,39 @@ static void free_command (qse_sed_t* sed, qse_sed_cmd_t* cmd)
}
}
/*
 * Translate the character following a backslash into the control
 * character it denotes.
 *
 * Recognized letters are a, f, n, r, t and v. 'b' is deliberately not
 * translated because it clashes with the regular expression \b.
 *
 * Returns the translated character, or c itself when c is not one of the
 * recognized letters, so a caller can detect "no translation" by comparing
 * the result with the input.
 */
static qse_cint_t trans_escaped (qse_cint_t c)
{
	switch (c)
	{
		case QSE_T('a'):
			c = QSE_T('\a');
			break;
		/*
		Omitted for clash with regular expression \b.
		case QSE_T('b'):
			c = QSE_T('\b');
			break;
		*/
		case QSE_T('f'):
			c = QSE_T('\f');
			/* this break was missing: without it \f fell through
			 * to the 'n' case and was turned into a newline */
			break;
		case QSE_T('n'):
			c = QSE_T('\n');
			break;
		case QSE_T('r'):
			c = QSE_T('\r');
			break;
		case QSE_T('t'):
			c = QSE_T('\t');
			break;
		case QSE_T('v'):
			c = QSE_T('\v');
			break;
		default:
			/* not an escape letter: hand it back unchanged */
			break;
	}

	return c;
}
static void* compile_rex (qse_sed_t* sed, qse_char_t rxend)
{
#ifdef USE_REX
@ -401,8 +443,10 @@ static void* compile_rex (qse_sed_t* sed, qse_char_t rxend)
if (c == QSE_T('\\'))
{
c = NXTSC (sed);
if (c == QSE_CHAR_EOF || c == QSE_T('\n'))
qse_cint_t nc;
nc = NXTSC (sed);
if (nc == QSE_CHAR_EOF /*|| nc == QSE_T('\n')*/)
{
SETERR1 (
sed, QSE_SED_EREXIC,
@ -413,8 +457,25 @@ static void* compile_rex (qse_sed_t* sed, qse_char_t rxend)
return QSE_NULL;
}
if (c == QSE_T('n')) c = QSE_T('\n');
/* TODO: support more escaped characters?? */
if (nc == QSE_T('\n')) c = nc;
else
{
qse_cint_t ec;
ec = trans_escaped (nc);
if (ec == nc)
{
/* if the character after a backslash is not special at
* this layer, add the backslash into the regular expression
* buffer as it is. */
if (qse_str_ccat (&sed->tmp.rex, QSE_T('\\')) == (qse_size_t)-1)
{
SETERR0 (sed, QSE_SED_ENOMEM, QSE_NULL);
return QSE_NULL;
}
}
c = ec;
}
}
if (qse_str_ccat (&sed->tmp.rex, c) == (qse_size_t)-1)
@ -824,6 +885,16 @@ do { \
} \
} while (0)
#define CHECK_CMDIC_ESCAPED(sed,cmd,c,action) \
do { \
if (c == QSE_CHAR_EOF) \
{ \
SETERR1 (sed, QSE_SED_ECMDIC, \
&cmd->type, 1, &sed->src.loc); \
action; \
} \
} while (0)
static int get_subst (qse_sed_t* sed, qse_sed_cmd_t* cmd)
{
qse_cint_t c, delim;
@ -859,14 +930,39 @@ static int get_subst (qse_sed_t* sed, qse_sed_cmd_t* cmd)
{
CHECK_CMDIC (sed, cmd, c, goto oops);
#if 0
if (c == QSE_T('\\'))
{
c = NXTSC (sed);
CHECK_CMDIC (sed, cmd, c, goto oops);
if (c == QSE_T('n')) c = QSE_T('\n');
qse_cint_t nc;
nc = NXTSC (sed);
CHECK_CMDIC_ESCAPED (sed, cmd, nc, goto oops);
if (nc == QSE_T('\n')) c = nc;
else
{
qse_cint_t ec;
/* Escaping a known special character for the regular expression
* part is done here. However, escaping a special character for
* the replacement part is done in do_subst() except '\n' because
* it has more special characters like '&'. */
ec = trans_escaped (nc);
if (ec == nc)
{
/* if the character after a backslash is not special at the
* this layer, add the backslash into the regular expression
* buffer as it is. */
if (qse_str_ccat (t[i], QSE_T('\\')) == (qse_size_t)-1)
{
SETERR0 (sed, QSE_SED_ENOMEM, QSE_NULL);
goto oops;
}
}
c = ec;
}
}
#endif
if (qse_str_ccat (t[i], c) == (qse_size_t)-1)
{
@ -1017,8 +1113,8 @@ static int get_transet (qse_sed_t* sed, qse_sed_cmd_t* cmd)
if (c == QSE_T('\\'))
{
c = NXTSC (sed);
CHECK_CMDIC (sed, cmd, c, goto oops);
if (c == QSE_T('n')) c = QSE_T('\n');
CHECK_CMDIC_ESCAPED (sed, cmd, c, goto oops);
c = trans_escaped (c);
}
b[0] = c;
@ -1039,8 +1135,8 @@ static int get_transet (qse_sed_t* sed, qse_sed_cmd_t* cmd)
if (c == QSE_T('\\'))
{
c = NXTSC (sed);
CHECK_CMDIC (sed, cmd, c, goto oops);
if (c == QSE_T('n')) c = QSE_T('\n');
CHECK_CMDIC_ESCAPED (sed, cmd, c, goto oops);
c = trans_escaped (c);
}
if (pos >= QSE_STR_LEN(t))
@ -1659,10 +1755,21 @@ static int write_char (qse_sed_t* sed, qse_char_t c)
/*
 * Append len characters from str to the sed output buffer.
 *
 * The buffer is flushed whenever it becomes full. If a newline has been
 * buffered and no full-buffer flush has happened since, one extra flush is
 * issued after the last character, so a string is flushed at most once for
 * its newlines instead of once per newline.
 *
 * Returns 0 on success, -1 if flush() fails.
 */
static int write_str (qse_sed_t* sed, const qse_char_t* str, qse_size_t len)
{
	qse_size_t i;
	int flush_needed = 0;

	for (i = 0; i < len; i++)
	{
		/* Store the character directly. write_char() must not be
		 * called here as well, or each character would presumably be
		 * emitted twice - TODO confirm against write_char(). */
		sed->e.out.buf[sed->e.out.len++] = str[i];

		if (sed->e.out.len >= QSE_COUNTOF(sed->e.out.buf))
		{
			/* buffer full: flush now; any pending newline flush
			 * is satisfied by this one */
			if (flush (sed) <= -1) return -1;
			flush_needed = 0;
		}
		else if (str[i] == QSE_T('\n'))
		{
			/* defer the newline flush until the string ends */
			flush_needed = 1;
		}
	}

	if (flush_needed && flush (sed) <= -1) return -1;
	return 0;
}
@ -1983,29 +2090,35 @@ static int do_subst (qse_sed_t* sed, qse_sed_cmd_t* cmd)
}
if (max_count > 0 && sub_count + 1 != max_count)
{
if (cur.ptr < str_end)
{
m = qse_str_ncat (
&sed->e.txt.subst,
cur.ptr, mat.ptr-cur.ptr+mat.len
);
if (m == (qse_size_t)-1)
{
SETERR0 (sed, QSE_SED_ENOMEM, QSE_NULL);
return -1;
}
}
}
else
{
repl = 1;
if (cur.ptr < str_end)
{
m = qse_str_ncat (
&sed->e.txt.subst, cur.ptr, mat.ptr-cur.ptr);
&sed->e.txt.subst, cur.ptr, mat.ptr-cur.ptr
);
if (m == (qse_size_t)-1)
{
SETERR0 (sed, QSE_SED_ENOMEM, QSE_NULL);
return -1;
}
}
for (i = 0; i < cmd->u.subst.rpl.len; i++)
{
@ -2018,41 +2131,21 @@ static int do_subst (qse_sed_t* sed, qse_sed_cmd_t* cmd)
if (nc >= QSE_T('1') && nc <= QSE_T('9'))
{
int smi = nc - QSE_T('1');
m = qse_str_ncat (&sed->e.txt.subst, submat[smi].ptr, submat[smi].len);
m = qse_str_ncat (
&sed->e.txt.subst,
submat[smi].ptr, submat[smi].len
);
}
else
{
#endif
switch (nc)
{
case QSE_T('n'):
nc = QSE_T('\n');
break;
case QSE_T('r'):
nc = QSE_T('\r');
break;
case QSE_T('t'):
nc = QSE_T('\t');
break;
case QSE_T('f'):
nc = QSE_T('\f');
break;
case QSE_T('b'):
nc = QSE_T('\b');
break;
case QSE_T('v'):
nc = QSE_T('\v');
break;
case QSE_T('a'):
nc = QSE_T('\a');
break;
/* the known special characters have been escaped
* in get_subst(), so trans_escaped() is not called here */
m = qse_str_ccat (&sed->e.txt.subst, nc);
#ifndef USE_REX
}
#endif
m = qse_str_ccat (&sed->e.txt.subst, nc);
}
i++;
}
else if (cmd->u.subst.rpl.ptr[i] == QSE_T('&'))
@ -2085,14 +2178,16 @@ static int do_subst (qse_sed_t* sed, qse_sed_cmd_t* cmd)
if (mat.len == 0)
{
skip_one_char:
/* special treament is need if the match length is 0 */
if (cur.ptr < str_end)
{
/* special treatment is needed if the match length is 0 */
m = qse_str_ncat (&sed->e.txt.subst, cur.ptr, 1);
if (m == (qse_size_t)-1)
{
SETERR0 (sed, QSE_SED_ENOMEM, QSE_NULL);
return -1;
}
}
cur.ptr++; cur.len--;
}

View File

@ -56,7 +56,7 @@ OUTFILE_XMA="${OUTFILE}.xma"
XMAOPTS="-m 500000"
PROGS="
s001.sed/s001.dat//-n
s001.sed/s001.dat//-n -r
s002.sed/s002.dat//
s003.sed/s003.dat//
s004.sed/s004.dat//