enhanced multi-character RS handling

This commit is contained in:
hyung-hwan 2011-05-01 09:24:38 +00:00
parent 7b4914c794
commit d915a0fc7a
2 changed files with 309 additions and 198 deletions

View File

@ -1,5 +1,5 @@
/*
* $Id: str.h 442 2011-04-25 14:53:50Z hyunghwan.chung $
* $Id: str.h 446 2011-04-30 15:24:38Z hyunghwan.chung $
*
Copyright 2006-2011 Chung, Hyung-Hwan.
This file is part of QSE.
@ -29,18 +29,19 @@
*
* The #qse_cstr_t type and the #qse_xstr_t defined in <qse/types.h> help you
* deal with a string pointer and length in a structure.
*
*/
#define QSE_MBS_LEN(s) ((s)->len) /**< string length */
#define QSE_MBS_PTR(s) ((s)->ptr) /**< string buffer pointer */
#define QSE_MBS_CAPA(s) ((s)->capa) /**< string buffer capacity */
#define QSE_MBS_CHAR(s,idx) ((s)->ptr[idx]) /**< character at given position */
#define QSE_MBS_LEN(s) ((s)->len) /**< string length */
#define QSE_MBS_PTR(s) ((s)->ptr) /**< string buffer pointer */
#define QSE_MBS_CAPA(s) ((s)->capa) /**< string buffer capacity */
#define QSE_MBS_CHAR(s,idx) ((s)->ptr[idx]) /**< character at given position */
#define QSE_MBS_LASTCHAR(s) ((s)->ptr[(s)->len-1]) /**< last character. unsafe if length <= 0 */
#define QSE_WCS_LEN(s) ((s)->len) /**< string buffer length */
#define QSE_WCS_PTR(s) ((s)->ptr) /**< string buffer pointer */
#define QSE_WCS_CAPA(s) ((s)->capa) /**< string buffer capacity */
#define QSE_WCS_CHAR(s,idx) ((s)->ptr[idx]) /**< character at given position */
#define QSE_WCS_LEN(s) ((s)->len) /**< string buffer length */
#define QSE_WCS_PTR(s) ((s)->ptr) /**< string buffer pointer */
#define QSE_WCS_CAPA(s) ((s)->capa) /**< string buffer capacity */
#define QSE_WCS_CHAR(s,idx) ((s)->ptr[idx]) /**< character at given position */
#define QSE_WCS_LASTCHAR(s) ((s)->ptr[(s)->len-1]) /**< last character. unsafe if length <= 0 */
typedef struct qse_mbs_t qse_mbs_t;
typedef struct qse_wcs_t qse_wcs_t;
@ -60,6 +61,7 @@ typedef qse_size_t (*qse_wcs_sizer_t) (
# define QSE_STR_PTR(s) QSE_MBS_PTR(s)
# define QSE_STR_CAPA(s) QSE_MBS_CAPA(s)
# define QSE_STR_CHAR(s,idx) QSE_MBS_CHAR(s,idx)
# define QSE_STR_LASTCHAR(s) QSE_MBS_LASTCHAR(s)
# define qse_str_t qse_mbs_t
# define qse_str_sizer_t qse_mbs_sizer_t
#else
@ -67,6 +69,7 @@ typedef qse_size_t (*qse_wcs_sizer_t) (
# define QSE_STR_PTR(s) QSE_WCS_PTR(s)
# define QSE_STR_CAPA(s) QSE_WCS_CAPA(s)
# define QSE_STR_CHAR(s,idx) QSE_WCS_CHAR(s,idx)
# define QSE_STR_LASTCHAR(s) QSE_WCS_LASTCHAR(s)
# define qse_str_t qse_wcs_t
# define qse_str_sizer_t qse_wcs_sizer_t
#endif

View File

@ -1,5 +1,5 @@
/*
* $Id: rio.c 445 2011-04-28 14:11:19Z hyunghwan.chung $
* $Id: rio.c 446 2011-04-30 15:24:38Z hyunghwan.chung $
*
Copyright 2006-2011 Chung, Hyung-Hwan.
This file is part of QSE.
@ -25,7 +25,6 @@ enum io_mask_t
MASK_READ = 0x0100,
MASK_WRITE = 0x0200,
MASK_RDWR = 0x0400,
MASK_CLEAR = 0x00FF
};
@ -88,93 +87,13 @@ static int out_mask_map[] =
MASK_WRITE
};
/*
 * Tests the record buffer against the compiled multi-character RS.
 *
 * Returns a negative value after storing the matcher's error number in
 * the runtime context, 0 when no usable separator has been found, and
 * a positive value when one has. On a positive return the buffer length
 * has been cut so that the separator, and anything following it, is no
 * longer part of the record.
 *
 * TODO: this function is invoked for every character appended to the
 * buffer, so the regular expression is evaluated far more often than
 * necessary. Reduce the number of calls and adapt the code accordingly.
 */
static QSE_INLINE int match_long_rs (qse_awk_rtx_t* run, qse_str_t* buf, int eof)
{
	qse_cstr_t hit;
	qse_awk_errnum_t errnum;
	const qse_char_t* buf_end;
	const qse_char_t* hit_end;
	int rc;

	QSE_ASSERT (run->gbl.rs != QSE_NULL);

	rc = QSE_AWK_MATCHREX (
		run->awk, run->gbl.rs,
		((run->gbl.ignorecase)? QSE_REX_IGNORECASE: 0),
		QSE_STR_PTR(buf), QSE_STR_LEN(buf),
		QSE_STR_PTR(buf), QSE_STR_LEN(buf),
		&hit, &errnum);

	if (rc <= -1)
	{
		/* the matcher failed. hand its error number to the runtime */
		qse_awk_rtx_seterrnum (run, errnum, QSE_NULL);
		return rc;
	}

	/* nothing matched. the buffer is left untouched */
	if (rc == 0) return rc;

	buf_end = QSE_STR_PTR(buf) + QSE_STR_LEN(buf);
	hit_end = hit.ptr + hit.len;

	if (eof)
	{
		/* at EOF no new character has been appended since the
		 * previous call, so a match is expected to finish exactly
		 * at the end of the record buffer. */
		QSE_ASSERT (buf_end == hit_end);

		/* only the RS part is removed. nothing follows it. */
		QSE_STR_LEN(buf) -= hit.len;
		return rc;
	}

	if (buf_end > hit_end)
	{
		/* the match finishes before the most recently appended
		 * character(s). it is taken as the longest match, and
		 * both the RS part and the characters after it are
		 * removed from the record. */
		QSE_STR_LEN(buf) -= hit.len + (buf_end - hit_end);
		return rc;
	}

	/* a match touching the end of the buffer is not accepted as
	 * the longest match. report it as no match. */
	return 0;
}
int qse_awk_rtx_readio (
qse_awk_rtx_t* run, int in_type,
const qse_char_t* name, qse_str_t* buf)
static int find_rio_in (
qse_awk_rtx_t* run, int in_type, const qse_char_t* name,
qse_awk_rio_arg_t** rio, qse_awk_rio_fun_t* fun)
{
qse_awk_rio_arg_t* p = run->rio.chain;
qse_awk_rio_fun_t handler;
int io_type, io_mode, io_mask, ret, n;
qse_ssize_t x;
qse_awk_val_t* rs;
qse_char_t* rs_ptr;
qse_size_t rs_len;
qse_size_t line_len = 0;
qse_char_t c = QSE_T('\0'), pc;
int io_type, io_mode, io_mask;
QSE_ASSERT (in_type >= 0 && in_type <= QSE_COUNTOF(in_type_map));
QSE_ASSERT (in_type >= 0 && in_type <= QSE_COUNTOF(in_mode_map));
@ -185,11 +104,11 @@ int qse_awk_rtx_readio (
io_mode = in_mode_map[in_type];
io_mask = in_mask_map[in_type];
/* get the io handler provided by a user */
/* get the I/O handler provided by a user */
handler = run->rio.handler[io_type];
if (handler == QSE_NULL)
{
/* no io handler provided */
/* no I/O handler provided */
qse_awk_rtx_seterrnum (run, QSE_AWK_EIOUSER, QSE_NULL);
return -1;
}
@ -204,6 +123,8 @@ int qse_awk_rtx_readio (
if (p == QSE_NULL)
{
qse_ssize_t x;
/* if the name doesn't exist in the chain, create an entry
* to the chain */
p = (qse_awk_rio_arg_t*) QSE_AWK_ALLOC (
@ -246,7 +167,7 @@ int qse_awk_rtx_readio (
if (run->errinf.num == QSE_AWK_ENOERR)
{
/* if the error number has not been
/* if the error number has not been
* set by the user handler */
qse_awk_rtx_seterrnum (run, QSE_AWK_EIOIMPL, QSE_NULL);
}
@ -258,52 +179,143 @@ int qse_awk_rtx_readio (
p->next = run->rio.chain;
run->rio.chain = p;
/* usually, x == 0 indicates that it has reached the end
* of the input. the user io handler can return 0 for the
* open request if it doesn't have any files to open. One
* advantage of doing this would be that you can skip the
/* usually, x == 0 indicates that it has reached the end
* of the input. the user I/O handler can return 0 for the
* open request if it doesn't have any files to open. One
* advantage of doing this would be that you can skip the
* entire pattern-block matching and execution. */
if (x == 0)
if (x == 0) p->in.eos = 1;
}
*rio = p;
*fun = handler;
return 0;
}
/*
 * Fills 'rrs' with the pointer and length that represent the record
 * separator value 'rs'.
 *
 *  - a nil value yields a null pointer and a length of 0.
 *  - a string value yields the pointer and length stored in the value
 *    itself. nothing is copied.
 *  - any other value is converted with qse_awk_rtx_valtocpldup().
 *
 * Returns 0 on success, or -1 if the conversion returns a null pointer.
 */
static QSE_INLINE int resolve_rs (
	qse_awk_rtx_t* run, qse_awk_val_t* rs, qse_xstr_t* rrs)
{
	if (rs->type == QSE_AWK_VAL_NIL)
	{
		rrs->ptr = QSE_NULL;
		rrs->len = 0;
		return 0;
	}

	if (rs->type == QSE_AWK_VAL_STR)
	{
		qse_awk_val_str_t* sv = (qse_awk_val_str_t*)rs;

		rrs->ptr = sv->ptr;
		rrs->len = sv->len;
		return 0;
	}

	/* neither nil nor a string. obtain a converted copy */
	rrs->ptr = qse_awk_rtx_valtocpldup (run, rs, &rrs->len);
	return (rrs->ptr == QSE_NULL)? -1: 0;
}
static QSE_INLINE int match_long_rs (
qse_awk_rtx_t* run, qse_str_t* buf, qse_awk_rio_arg_t* p)
{
qse_cstr_t match;
qse_awk_errnum_t errnum;
int ret;
QSE_ASSERT (run->gbl.rs != QSE_NULL);
ret = QSE_AWK_MATCHREX (
run->awk, run->gbl.rs,
((run->gbl.ignorecase)? QSE_REX_IGNORECASE: 0),
QSE_STR_PTR(buf), QSE_STR_LEN(buf),
QSE_STR_PTR(buf), QSE_STR_LEN(buf),
&match, &errnum);
if (ret <= -1)
{
qse_awk_rtx_seterrnum (run, errnum, QSE_NULL);
}
else if (ret >= 1)
{
if (p->in.eof)
{
p->in.eos = 1;
return 0;
/* when EOF is reached, the record buffer
* is not added with a new character. It's
* just called again with the same record buffer
* as the previous call to this function.
* A match in this case must end at the end of
* the current record buffer */
QSE_ASSERT (
QSE_STR_PTR(buf) + QSE_STR_LEN(buf) == match.ptr + match.len
);
/* drop the RS part. no extra character after RS to drop
* because we're at EOF and the EOF condition didn't
* add a new character to the buffer before the call
* to this function.
*/
QSE_STR_LEN(buf) -= match.len;
}
else
{
/* If the match is found before the end of the current buffer,
* I see it as the longest match. A match ending at the end
* of the buffer is indeterminate as we don't have the
* full input yet.
*/
const qse_char_t* be = QSE_STR_PTR(buf) + QSE_STR_LEN(buf);
const qse_char_t* me = match.ptr + match.len;
if (me < be)
{
/* the match ends before the ending boundary.
* it must be the longest match. drop the RS part
* and the characters after RS. */
QSE_STR_LEN(buf) -= match.len + (be - me);
p->in.pos -= (be - me);
}
else
{
/* the match is at the ending boundary. switch to no match */
ret = 0;
}
}
}
if (p->in.eos)
{
/* no more streams. */
return 0;
}
return ret;
}
/* ready to read a record (typically a line).
* clear the buffer. */
int qse_awk_rtx_readio (
qse_awk_rtx_t* run, int in_type,
const qse_char_t* name, qse_str_t* buf)
{
qse_awk_rio_arg_t* p;
qse_awk_rio_fun_t handler;
int ret;
qse_awk_val_t* rs;
qse_xstr_t rrs;
qse_size_t line_len = 0;
qse_char_t c = QSE_T('\0'), pc;
if (find_rio_in (run, in_type, name, &p, &handler) <= -1) return -1;
if (p->in.eos) return 0; /* no more streams left */
/* ready to read a record(typically a line). clear the buffer. */
qse_str_clear (buf);
/* get the record separator */
rs = qse_awk_rtx_getgbl (run, QSE_AWK_GBL_RS);
qse_awk_rtx_refupval (run, rs);
switch (rs->type)
if (resolve_rs (run, rs, &rrs) <= -1)
{
case QSE_AWK_VAL_NIL:
rs_ptr = QSE_NULL;
rs_len = 0;
break;
case QSE_AWK_VAL_STR:
rs_ptr = ((qse_awk_val_str_t*)rs)->ptr;
rs_len = ((qse_awk_val_str_t*)rs)->len;
break;
default:
rs_ptr = qse_awk_rtx_valtocpldup (run, rs, &rs_len);
if (rs_ptr == QSE_NULL)
{
qse_awk_rtx_refdownval (run, rs);
return -1;
}
break;
qse_awk_rtx_refdownval (run, rs);
return -1;
}
ret = 1;
@ -313,7 +325,9 @@ int qse_awk_rtx_readio (
{
if (p->in.pos >= p->in.len)
{
qse_ssize_t n;
qse_ssize_t x;
/* no more data. read more */
if (p->in.eof)
{
@ -323,9 +337,9 @@ int qse_awk_rtx_readio (
qse_awk_rtx_seterrnum (run, QSE_AWK_ENOERR, QSE_NULL);
n = handler (run, QSE_AWK_RIO_READ,
x = handler (run, QSE_AWK_RIO_READ,
p, p->in.buf, QSE_COUNTOF(p->in.buf));
if (n <= -1)
if (x <= -1)
{
if (run->errinf.num == QSE_AWK_ENOERR)
{
@ -338,13 +352,21 @@ int qse_awk_rtx_readio (
break;
}
if (n == 0)
if (x == 0)
{
/* EOF reached */
p->in.eof = 1;
if (QSE_STR_LEN(buf) == 0) ret = 0;
else if (rs_len >= 2)
else if (rrs.ptr != QSE_NULL && rrs.len == 0)
{
/* TODO: handle different line terminator */
/* drop the line terminator from the record
* if RS is a blank line and EOF is reached. */
if (QSE_STR_LASTCHAR(buf) == QSE_T('\n'))
QSE_STR_LEN(buf) -= 1;
}
else if (rrs.len >= 2)
{
/* When RS is multiple characters, it should
* check for the match at the end of the
@ -354,7 +376,7 @@ int qse_awk_rtx_readio (
* At EOF, the match at the end is considered
* the longest as there are no more characters
* left */
n = match_long_rs (run, buf, 1);
int n = match_long_rs (run, buf, p);
if (n != 0)
{
if (n <= -1) ret = -1;
@ -365,76 +387,155 @@ int qse_awk_rtx_readio (
break;
}
p->in.len = n;
p->in.len = x;
p->in.pos = 0;
}
pc = c;
c = p->in.buf[p->in.pos++];
if (rs_ptr == QSE_NULL)
if (rrs.ptr == QSE_NULL)
{
/* separate by a new line */
if (c == QSE_T('\n'))
qse_size_t start_pos = p->in.pos;
qse_size_t end_pos, tmp;
do
{
if (pc == QSE_T('\r') &&
QSE_STR_LEN(buf) > 0)
pc = c;
c = p->in.buf[p->in.pos++];
end_pos = p->in.pos;
/* TODO: handle different line terminator */
/* separate by a new line */
if (c == QSE_T('\n'))
{
QSE_STR_LEN(buf) -= 1;
end_pos--;
if (pc == QSE_T('\r'))
{
if (end_pos > start_pos)
{
/* '\r' is part of the read buffer.
* simply decrementing the end_pos variable
* drops it */
end_pos--;
}
else
{
/* '\r' must have come from the previous
* read. the record buffer must contain
* it at the end. */
QSE_ASSERT (end_pos == start_pos);
QSE_ASSERT (QSE_STR_LEN(buf) > 0);
QSE_ASSERT (QSE_STR_LASTCHAR(buf) == QSE_T('\r'));
QSE_STR_LEN(buf)--;
}
}
break;
}
}
while (p->in.pos < p->in.len);
tmp = qse_str_ncat (
buf,
&p->in.buf[start_pos],
end_pos - start_pos
);
if (tmp == (qse_size_t)-1)
{
qse_awk_rtx_seterrnum (run, QSE_AWK_ENOMEM, QSE_NULL);
ret = -1;
break;
}
if (end_pos < p->in.len) break; /* RS found */
}
else if (rs_len == 0)
else if (rrs.len == 0)
{
/* separate by a blank line */
if (c == QSE_T('\n'))
int done = 0;
do
{
if (pc == QSE_T('\r') &&
QSE_STR_LEN(buf) > 0)
pc = c;
c = p->in.buf[p->in.pos++];
/* TODO: handle different line terminator */
/* separate by a blank line */
if (c == QSE_T('\n'))
{
if (pc == QSE_T('\r') &&
QSE_STR_LEN(buf) > 0)
{
QSE_STR_LEN(buf) -= 1;
}
}
if (line_len == 0 && c == QSE_T('\n'))
{
if (QSE_STR_LEN(buf) <= 0)
{
/* if the record is empty when a blank
* line is encountered, the line
* terminator should not be added to
* the record */
continue;
}
/* when a blank line is encountered,
* it needs to snip off the line
* terminator of the previous line */
QSE_STR_LEN(buf) -= 1;
done = 1;
break;
}
if (qse_str_ccat (buf, c) == (qse_size_t)-1)
{
qse_awk_rtx_seterrnum (run, QSE_AWK_ENOMEM, QSE_NULL);
ret = -1;
done = 1;
break;
}
/* TODO: handle different line terminator */
if (c == QSE_T('\n')) line_len = 0;
else line_len = line_len + 1;
}
while (p->in.pos < p->in.len);
if (done) break;
}
else if (rrs.len == 1)
{
qse_size_t start_pos = p->in.pos;
qse_size_t end_pos, tmp;
do
{
c = p->in.buf[p->in.pos++];
end_pos = p->in.pos;
if (c == rrs.ptr[0])
{
end_pos--;
break;
}
}
while (p->in.pos < p->in.len);
if (line_len == 0 && c == QSE_T('\n'))
tmp = qse_str_ncat (
buf,
&p->in.buf[start_pos],
end_pos - start_pos
);
if (tmp == (qse_size_t)-1)
{
if (QSE_STR_LEN(buf) <= 0)
{
/* if the record is empty when a blank
* line is encountered, the line
* terminator should not be added to
* the record */
continue;
}
/* when a blank line is encountered,
* it needs to snip off the line
* terminator of the previous line */
/*QSE_STR_LEN(buf) -= 1;*/
buf->len -= 1;
qse_awk_rtx_seterrnum (run, QSE_AWK_ENOMEM, QSE_NULL);
ret = -1;
break;
}
}
else if (rs_len == 1)
{
if (c == rs_ptr[0]) break;
if (end_pos < p->in.len) break; /* RS found */
}
else
{
/* I don't do anything here if RS is composed of
* multiple characters. See the comment further down */
}
qse_size_t tmp;
int n;
if (qse_str_ccat (buf, c) == (qse_size_t)-1)
{
qse_awk_rtx_seterrnum (run, QSE_AWK_ENOMEM, QSE_NULL);
ret = -1;
break;
}
if (rs_len >= 2)
{
/* if RS is composed of multiple characters,
* I perform the matching after having added the
* current character 'c' to the record buffer 'buf'
@ -442,26 +543,33 @@ int qse_awk_rtx_readio (
* one character before this character just added
* to the buffer, it is the longest match.
*/
/* TODO: change the way to find the longest match
* for performance improvement. currently,
* the function is called for every character
* added to the buffer. Stupid! */
n = match_long_rs (run, buf, 0);
tmp = qse_str_ncat (
buf,
&p->in.buf[p->in.pos],
p->in.len - p->in.pos
);
if (tmp == (qse_size_t)-1)
{
qse_awk_rtx_seterrnum (run, QSE_AWK_ENOMEM, QSE_NULL);
ret = -1;
break;
}
p->in.pos = p->in.len;
n = match_long_rs (run, buf, p);
if (n != 0)
{
p->in.pos--; /* unread the character in c */
//p->in.pos--; /* unread the character in c */
if (n <= -1) ret = -1;
break;
}
}
/* TODO: handle different line terminator like \r\n */
if (c == QSE_T('\n')) line_len = 0;
else line_len = line_len + 1;
}
if (rs_ptr != QSE_NULL &&
rs->type != QSE_AWK_VAL_STR) QSE_AWK_FREE (run->awk, rs_ptr);
if (rrs.ptr != QSE_NULL &&
rs->type != QSE_AWK_VAL_STR) QSE_AWK_FREE (run->awk, rrs.ptr);
qse_awk_rtx_refdownval (run, rs);
return ret;
@ -752,7 +860,7 @@ int qse_awk_rtx_nextio_read (
if (n == 0)
{
/* the next stream cannot be opened.
* set the eos flags so that the next call to nextio_read
* set the EOS flags so that the next call to nextio_read
* will return 0 without executing the handler */
p->in.eos = 1;
return 0;
@ -760,7 +868,7 @@ int qse_awk_rtx_nextio_read (
else
{
/* as the next stream has been opened successfully,
* the eof flag should be cleared if set */
* the EOF flag should be cleared if set */
p->in.eof = 0;
/* also the previous input buffer must be reset */