2020-11-13 02:50:20 +00:00
|
|
|
/*
|
|
|
|
Copyright (c) 2006-2020 Chung, Hyung-Hwan. All rights reserved.
|
|
|
|
|
|
|
|
Redistribution and use in source and binary forms, with or without
|
|
|
|
modification, are permitted provided that the following conditions
|
|
|
|
are met:
|
|
|
|
1. Redistributions of source code must retain the above copyright
|
|
|
|
notice, this list of conditions and the following disclaimer.
|
|
|
|
2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
notice, this list of conditions and the following disclaimer in the
|
|
|
|
documentation and/or other materials provided with the distribution.
|
|
|
|
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR
|
|
|
|
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
|
|
|
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
|
|
|
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
|
|
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
|
|
|
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
|
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
|
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
|
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
|
|
|
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*/
|
|
|
|
|
2020-11-13 14:56:15 +00:00
|
|
|
/*
 * Extracts one field from the 'len' characters starting at 'str' and
 * reports it through 'tok'.
 *
 * The buffer is rewritten in place while scanning: escape characters and
 * quote characters are dropped and the remaining characters are compacted
 * towards the start of the token, so 'str' must be writable.
 *
 *  - leading space characters are skipped before the token starts.
 *  - 'ec' makes the following character literal. it is recognized both
 *    inside and outside a quoted section.
 *  - 'lq' opens a quoted section and 'rq' closes it. 'fs' and space
 *    characters lose their special meaning inside a quoted section.
 *  - an unquoted, unescaped 'fs' terminates the token.
 *  - trailing unquoted, unescaped space characters are excluded from the
 *    token length.
 *
 * 'rtx' is not used by this function.
 *
 * On return, tok->ptr points to the token start and tok->len holds the
 * number of effective characters. The return value is the position to
 * resume scanning from, or HAWK_NULL when the input has been exhausted.
 * When 'fs' is a space character, a run of consecutive 'fs' characters is
 * consumed as a single separator and HAWK_NULL is returned if nothing
 * follows that run.
 */
char_t* split_xchars_to_fields (hawk_rtx_t* rtx, char_t* str, hawk_oow_t len, char_t fs, char_t ec, char_t lq, char_t rq, xcs_t* tok)
{
	char_t* p = str;
	char_t* end = str + len;
	int escaped = 0, quoted = 0;
	char_t* ts; /* token start */
	char_t* tp; /* points to one char past the last token char */
	char_t* xp; /* points to one char past the last effective char */

	/* skip leading spaces */
	while (p < end && is_xch_space(*p)) p++;

	/* initialize token pointers */
	ts = tp = xp = p;

	while (p < end)
	{
		/* this must be char_t, not char. a plain char would truncate
		 * the character when char_t is a wider type, which breaks the
		 * comparisons below and corrupts the data copied back */
		char_t c = *p;

		if (escaped)
		{
			/* the character after the escaper is taken literally
			 * and is always effective, even if it is a space */
			*tp++ = c; xp = tp; p++;
			escaped = 0;
		}
		else if (c == ec)
		{
			/* drop the escaper and remember it for the next character */
			escaped = 1;
			p++;
		}
		else if (quoted)
		{
			if (c == rq)
			{
				/* drop the closing quote */
				quoted = 0;
				p++;
			}
			else
			{
				/* everything inside quotes is effective */
				*tp++ = c; xp = tp; p++;
			}
		}
		else if (c == fs)
		{
			/* end of the field */
			tok->ptr = ts;
			tok->len = xp - ts;
			p++;

			if (is_xch_space(fs))
			{
				/* collapse a run of space separators into one */
				while (p < end && *p == fs) p++;
				if (p >= end) return HAWK_NULL;
			}

			return p;
		}
		else if (c == lq)
		{
			/* drop the opening quote */
			quoted = 1;
			p++;
		}
		else
		{
			/* a plain character. a space is copied but it becomes
			 * effective only if a non-space character follows it */
			*tp++ = c; p++;
			if (!is_xch_space(c)) xp = tp;
		}
	}

	if (escaped)
	{
		/* if it is still escaped, the last character must be
		 * the escaper itself. treat it as a normal character.
		 * append it at 'tp' rather than at 'xp' so that spaces copied
		 * before it are kept as interior characters of the token
		 * instead of being overwritten. 'tp' is below 'end' here
		 * because the escaper was consumed without being copied */
		*tp++ = ec;
		xp = tp;
	}

	tok->ptr = ts;
	tok->len = xp - ts;
	return HAWK_NULL;
}
|
|
|
|
|
2020-11-13 02:50:20 +00:00
|
|
|
/*
 * Finds the next token in the 'len' characters starting at 's' and reports
 * it through 'tok'. The source string is not modified.
 *
 * The delimiter set given by 'delim'/'delim_len' selects one of five modes:
 *
 *  - delim is HAWK_NULL: the whole string with leading and trailing space
 *    characters removed is a single token.
 *  - delim is empty: each character is a token.
 *  - delim holds space characters only: tokens are separated by runs of
 *    space characters. leading and trailing spaces are removed.
 *    (a single space-class delimiter other than ' ' is handled like a
 *    non-space delimiter.)
 *  - delim holds non-space characters only: each delimiter character
 *    ends a token.
 *  - delim holds both kinds: non-space delimiter characters end a token
 *    and the space characters surrounding the token are removed.
 *
 * In the last two modes, delimiter characters are compared
 * case-insensitively when rtx->gbl.ignorecase is set.
 *
 * On return, tok->ptr/tok->len describe the token. tok->ptr is HAWK_NULL
 * and tok->len is 0 if no token character was found. The return value is
 * the position to resume from, or HAWK_NULL when the end of the string has
 * been reached, in which case this function should not be called again.
 */
char_t* tokenize_xchars (hawk_rtx_t* rtx, const char_t* s, hawk_oow_t len, const char_t* delim, hawk_oow_t delim_len, xcs_t* tok)
{
	const char_t* p = s, *d;
	const char_t* end = s + len;
	const char_t* sp = HAWK_NULL, * ep = HAWK_NULL;
	/* computed only after 'delim' is known to be non-null. adding an
	 * offset to a null pointer is undefined behavior */
	const char_t* delim_end = HAWK_NULL;
	char_t c;
	int delim_mode;

	#define __DELIM_NULL      0
	#define __DELIM_EMPTY     1
	#define __DELIM_SPACES    2
	#define __DELIM_NOSPACES  3
	#define __DELIM_COMPOSITE 4

	if (delim == HAWK_NULL) delim_mode = __DELIM_NULL;
	else
	{
		delim_end = delim + delim_len;
		delim_mode = __DELIM_EMPTY;

		/* classify the delimiter set. the scan stops as soon as both
		 * a space and a non-space character have been seen */
		for (d = delim; d < delim_end; d++)
		{
			if (is_xch_space(*d))
			{
				if (delim_mode == __DELIM_EMPTY)
					delim_mode = __DELIM_SPACES;
				else if (delim_mode == __DELIM_NOSPACES)
				{
					delim_mode = __DELIM_COMPOSITE;
					break;
				}
			}
			else
			{
				if (delim_mode == __DELIM_EMPTY)
					delim_mode = __DELIM_NOSPACES;
				else if (delim_mode == __DELIM_SPACES)
				{
					delim_mode = __DELIM_COMPOSITE;
					break;
				}
			}
		}

		/* TODO: verify the following statement... */
		if (delim_mode == __DELIM_SPACES &&
		    delim_len == 1 &&
		    delim[0] != ' ') delim_mode = __DELIM_NOSPACES;
	}

	if (delim_mode == __DELIM_NULL)
	{
		/* when HAWK_NULL is given as "delim", it trims the leading
		 * and trailing space characters off the source string "s".
		 * the scan always runs to the end of the string */
		while (p < end && is_xch_space(*p)) p++;
		while (p < end)
		{
			if (!is_xch_space(*p))
			{
				if (sp == HAWK_NULL) sp = p;
				ep = p;
			}
			p++;
		}
	}
	else if (delim_mode == __DELIM_EMPTY)
	{
		/* each character in the source string "s" becomes a token. */
		if (p < end)
		{
			sp = p;
			ep = p++;
		}
	}
	else if (delim_mode == __DELIM_SPACES)
	{
		/* each token is delimited by space characters. all leading
		 * and trailing spaces are removed. */
		while (p < end && is_xch_space(*p)) p++;
		while (p < end)
		{
			c = *p;
			if (is_xch_space(c)) break;
			if (sp == HAWK_NULL) sp = p;
			ep = p++;
		}
		/* consume the separating spaces so that the next call
		 * starts at the next token */
		while (p < end && is_xch_space(*p)) p++;
	}
	else if (delim_mode == __DELIM_NOSPACES)
	{
		/* each token is delimited by one of the characters
		 * in the delimiter set "delim". */
		if (rtx->gbl.ignorecase)
		{
			while (p < end)
			{
				c = to_xch_upper(*p);
				for (d = delim; d < delim_end; d++)
				{
					if (c == to_xch_upper(*d)) goto exit_loop;
				}

				if (sp == HAWK_NULL) sp = p;
				ep = p++;
			}
		}
		else
		{
			while (p < end)
			{
				c = *p;
				for (d = delim; d < delim_end; d++)
				{
					if (c == *d) goto exit_loop;
				}

				if (sp == HAWK_NULL) sp = p;
				ep = p++;
			}
		}
	}
	else /* if (delim_mode == __DELIM_COMPOSITE) */
	{
		/* each token is delimited by one of the non-space characters
		 * in the delimiter set "delim". however, all space characters
		 * surrounding the token are removed */
		while (p < end && is_xch_space(*p)) p++;
		if (rtx->gbl.ignorecase)
		{
			while (p < end)
			{
				c = to_xch_upper(*p);
				if (is_xch_space(c))
				{
					/* a space is passed over without moving 'ep'.
					 * it ends up inside the token only if a
					 * non-space token character follows */
					p++;
					continue;
				}
				for (d = delim; d < delim_end; d++)
				{
					if (c == to_xch_upper(*d)) goto exit_loop;
				}
				if (sp == HAWK_NULL) sp = p;
				ep = p++;
			}
		}
		else
		{
			while (p < end)
			{
				c = *p;
				if (is_xch_space(c))
				{
					/* see the note in the branch above */
					p++;
					continue;
				}
				for (d = delim; d < delim_end; d++)
				{
					if (c == *d) goto exit_loop;
				}
				if (sp == HAWK_NULL) sp = p;
				ep = p++;
			}
		}
	}

exit_loop:
	if (sp == HAWK_NULL)
	{
		tok->ptr = HAWK_NULL;
		tok->len = (hawk_oow_t)0;
	}
	else
	{
		/* 'ep' points to the last token character, not one past it */
		tok->ptr = (char_t*)sp;
		tok->len = ep - sp + 1;
	}

	/* if HAWK_NULL is returned, this function should not be called again */
	if (p >= end) return HAWK_NULL;
	if (delim_mode == __DELIM_EMPTY ||
	    delim_mode == __DELIM_SPACES) return (char_t*)p;

	/* 'p' rests on the delimiter that ended the token. step over it */
	return (char_t*)++p;
}
|
|
|
|
|
2020-11-13 14:56:15 +00:00
|
|
|
|
|
|
|
/*
 * Finds the next token in 'substr' (of 'sublen' characters), using matches
 * of the regular expression 'rex' as separators. 'str'/'len' describe the
 * string that is handed to match_rex_with_xcs() together with the part
 * still to be searched.
 *
 * On return, 'tok' describes the text that precedes the separator match,
 * or all of the remaining text if no separator was found. The return value
 * is the position just after the separator match, or HAWK_NULL if
 * tokenization must stop. HAWK_NULL is also returned, with 'tok' left
 * untouched, when match_rex_with_xcs() reports a failure.
 *
 * A zero-length match is never accepted as a separator. When
 * HAWK_RTX_IS_STRIPRECSPC_ON(rtx) holds, a separator match made up of
 * space characters only is skipped if it begins at 'substr', and ends
 * tokenization if it reaches the end of 'substr'.
 */
char_t* tokenize_xchars_by_rex (hawk_rtx_t* rtx, const char_t* str, hawk_oow_t len, const char_t* substr, hawk_oow_t sublen, hawk_tre_t* rex, xcs_t* tok)
{
	xcs_t whole;  /* the string passed to the matcher as a whole */
	xcs_t rest;   /* the part still to be searched for a separator */
	xcs_t field;  /* where the token being built starts and how much is left */
	xcs_t m;      /* the latest separator match */
	char_t* resume;
	hawk_oow_t k;
	int rc;

	whole.ptr = (char_t*)str;
	whole.len = len;

	rest.ptr = (char_t*)substr;
	rest.len = sublen;

	field.ptr = (char_t*)substr;
	field.len = sublen;

	while (rest.len > 0)
	{
		rc = match_rex_with_xcs(rtx, rex, &whole, &rest, &m, HAWK_NULL);
		if (rc <= -1) return HAWK_NULL;

		if (rc == 0)
		{
			/* no separator is left. the rest of the string is the token */
			hawk_rtx_seterrnum (rtx, HAWK_NULL, HAWK_ENOERR); /* reset HAWK_EREXNOMAT to no error */
			tok->ptr = field.ptr;
			tok->len = field.len;
			return HAWK_NULL;
		}

		HAWK_ASSERT (rc == 1);

		if (m.len == 0)
		{
			/* an empty match can't separate anything.
			 * move forward by one character and try again */
			rest.ptr++;
			rest.len--;
			continue;
		}

		/* a non-empty match is accepted as the separator unless space
		 * stripping is on and it sits at the very beginning of the input */
		if (!HAWK_RTX_IS_STRIPRECSPC_ON(rtx) || m.ptr != substr) break;

		/* look for the first non-space character in the match */
		for (k = 0; k < m.len; k++)
		{
			if (!is_xch_space(m.ptr[k])) break;
		}
		if (k < m.len) break; /* not all spaces. it is a real separator */

		/* the match at the beginning is all spaces. skip over it,
		 * shrink the token candidate accordingly and match again */
		rest.ptr += m.len;
		rest.len -= m.len;

		field.ptr = (char_t*)substr + m.len;
		field.len -= m.len;
	}

	hawk_rtx_seterrnum (rtx, HAWK_NULL, HAWK_ENOERR);

	tok->ptr = field.ptr;
	if (rest.len <= 0)
	{
		/* the search ran off the end without finding a separator */
		tok->len = field.len;
		return HAWK_NULL;
	}

	/* the token is whatever precedes the separator match */
	tok->len = m.ptr - field.ptr;
	resume = (char_t*)m.ptr + m.len;

	for (k = 0; k < m.len; k++)
	{
		/* the match contains a non-space character. */
		if (!is_xch_space(m.ptr[k])) return resume;
	}

	/* the match is all spaces */
	if (HAWK_RTX_IS_STRIPRECSPC_ON(rtx))
	{
		/* if the match reached the last character in the input string,
		 * it returns HAWK_NULL to terminate tokenization. */
		return (resume >= substr + sublen)? HAWK_NULL: resume;
	}

	/* if the match went beyond the last character in the input
	 * string, it returns HAWK_NULL to terminate tokenization. */
	return (resume > substr + sublen)? HAWK_NULL: resume;
}
|