changed the awk parser to accept \u and \U in the MCHAR mode.

implemented the k/K conversion specifiers in awk's built-in printf function
This commit is contained in:
hyung-hwan 2019-03-20 07:06:00 +00:00
parent 6c61e0bab6
commit c139017593
4 changed files with 136 additions and 92 deletions

View File

@ -2466,6 +2466,10 @@ QSE_EXPORT int qse_wcshextobin (
#define QSE_BYTETOSTR_RADIXMASK (0xFF)
#define QSE_BYTETOSTR_LOWERCASE (1 << 8)
#define QSE_BYTETOMBS_RADIXMASK QSE_BYTETOSTR_RADIXMASK
#define QSE_BYTETOMBS_LOWERCASE QSE_BYTETOSTR_LOWERCASE
#define QSE_BYTETOWCS_RADIXMASK QSE_BYTETOSTR_RADIXMASK
#define QSE_BYTETOWCS_LOWERCASE QSE_BYTETOSTR_LOWERCASE
qse_size_t qse_bytetombs (
qse_byte_t byte,

View File

@ -25,6 +25,7 @@
*/
#include "awk-prv.h"
#include <qse/cmn/utf8.h>
#if !defined(QSE_AWK_DEFAULT_MODPREFIX)
# if defined(_WIN32)
@ -184,73 +185,44 @@ static int parse_progunit (qse_awk_t* awk);
static qse_awk_t* collect_globals (qse_awk_t* awk);
static void adjust_static_globals (qse_awk_t* awk);
static qse_size_t find_global (qse_awk_t* awk, const qse_cstr_t* name);
static qse_awk_t* collect_locals (
qse_awk_t* awk, qse_size_t nlcls, int istop);
static qse_awk_t* collect_locals (qse_awk_t* awk, qse_size_t nlcls, int istop);
static qse_awk_nde_t* parse_function (qse_awk_t* awk);
static qse_awk_nde_t* parse_begin (qse_awk_t* awk);
static qse_awk_nde_t* parse_end (qse_awk_t* awk);
static qse_awk_chain_t* parse_action_block (
qse_awk_t* awk, qse_awk_nde_t* ptn, int blockless);
static qse_awk_chain_t* parse_action_block (qse_awk_t* awk, qse_awk_nde_t* ptn, int blockless);
static qse_awk_nde_t* parse_block_dc (
qse_awk_t* awk, const qse_awk_loc_t* xloc, int istop);
static qse_awk_nde_t* parse_block_dc (qse_awk_t* awk, const qse_awk_loc_t* xloc, int istop);
static qse_awk_nde_t* parse_statement (
qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_statement (qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_expr_withdc (
qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_expr_withdc (qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_logical_or (
qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_logical_and (
qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_in (
qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_regex_match (
qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_bitwise_or (
qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_bitwise_xor (
qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_bitwise_and (
qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_equality (
qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_relational (
qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_shift (
qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_concat (
qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_additive (
qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_multiplicative (
qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_logical_or (qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_logical_and (qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_in (qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_regex_match (qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_bitwise_or (qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_bitwise_xor (qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_bitwise_and (qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_equality (qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_relational (qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_shift (qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_concat (qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_additive (qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_multiplicative (qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_unary (
qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_exponent (
qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_unary_exp (
qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_increment (
qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_primary (
qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_primary_ident (
qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_unary (qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_exponent (qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_unary_exp (qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_increment (qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_primary (qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_primary_ident (qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_hashidx (
qse_awk_t* awk, const qse_cstr_t* name, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_fncall (
qse_awk_t* awk, const qse_cstr_t* name,
qse_awk_fnc_t* fnc, const qse_awk_loc_t* xloc, int noarg);
static qse_awk_nde_t* parse_hashidx (qse_awk_t* awk, const qse_cstr_t* name, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_fncall (qse_awk_t* awk, const qse_cstr_t* name, qse_awk_fnc_t* fnc, const qse_awk_loc_t* xloc, int noarg);
static qse_awk_nde_t* parse_primary_ident_segs (
qse_awk_t* awk, const qse_awk_loc_t* xloc, const qse_cstr_t* full,
const qse_cstr_t segs[], int nsegs);
static qse_awk_nde_t* parse_primary_ident_segs (qse_awk_t* awk, const qse_awk_loc_t* xloc, const qse_cstr_t* full, const qse_cstr_t segs[], int nsegs);
static int get_token (qse_awk_t* awk);
static int preget_token (qse_awk_t* awk);
@ -261,8 +233,7 @@ static int skip_comment (qse_awk_t* awk);
static int classify_ident (qse_awk_t* awk, const qse_cstr_t* name);
static int deparse (qse_awk_t* awk);
static qse_htb_walk_t deparse_func (
qse_htb_t* map, qse_htb_pair_t* pair, void* arg);
static qse_htb_walk_t deparse_func (qse_htb_t* map, qse_htb_pair_t* pair, void* arg);
static int put_char (qse_awk_t* awk, qse_char_t c);
static int flush_out (qse_awk_t* awk);
@ -406,6 +377,23 @@ static global_t gtab[] =
} \
} while (0)
/*
 * ADD_TOKEN_UINT32(awk,tok,c) appends the character code 'c' to the token 'tok'.
 *
 * When QSE_CHAR_IS_MCHAR is defined, a code up to 0xFF is appended as a single
 * character with ADD_TOKEN_CHAR(); a larger code is encoded with qse_uctoutf8()
 * and the resulting units are appended one by one with ADD_TOKEN_CHAR().
 * Otherwise the code is handed straight to ADD_TOKEN_CHAR().
 *
 * Neither variant ends in a semicolon; the caller supplies it. This keeps
 * 'if (x) ADD_TOKEN_UINT32(...); else ...' well-formed as long as
 * ADD_TOKEN_CHAR() itself expands to a single statement.
 * 'c' is expanded more than once; pass an expression without side effects.
 */
#if defined(QSE_CHAR_IS_MCHAR)
# define ADD_TOKEN_UINT32(awk,tok,c) \
do { \
if ((c) <= 0xFF) ADD_TOKEN_CHAR(awk, tok, c); \
else \
{ \
qse_mchar_t __xbuf[QSE_MBLEN_MAX + 1]; \
qse_size_t __len, __i; \
__len = qse_uctoutf8((c), __xbuf, QSE_COUNTOF(__xbuf)); /* use utf8 all the time */ \
for (__i = 0; __i < __len; __i++) ADD_TOKEN_CHAR(awk, tok, __xbuf[__i]); \
} \
} while (0)
#else
# define ADD_TOKEN_UINT32(awk,tok,c) ADD_TOKEN_CHAR(awk,tok,c)
#endif
#define MATCH(awk,tok_type) ((awk)->tok.type == (tok_type))
#define MATCH_RANGE(awk,tok_type_start,tok_type_end) ((awk)->tok.type >= (tok_type_start) && (awk)->tok.type <= (tok_type_end))
@ -5627,6 +5615,12 @@ static int get_number (qse_awk_t* awk, qse_awk_tok_t* tok)
return 0;
}
/* Number of hexadecimal digits accepted after the \x escape in get_string().
 * Allowing only up to 2 hexadecimal digits seems more useful, though it
 * may break compatibility with other awk implementations. If you want
 * more than 2, define HEX_DIGIT_LIMIT_FOR_X to QSE_TYPE_MAX(qse_size_t). */
/*#define HEX_DIGIT_LIMIT_FOR_X (QSE_TYPE_MAX(qse_size_t))*/
#define HEX_DIGIT_LIMIT_FOR_X (2)
static int get_string (
qse_awk_t* awk, qse_char_t end_char,
qse_char_t esc_char, int keep_esc_char,
@ -5635,7 +5629,7 @@ static int get_string (
qse_cint_t c;
qse_size_t escaped = preescaped;
qse_size_t digit_count = 0;
qse_cint_t c_acc = 0;
qse_uint32_t c_acc = 0;
while (1)
{
@ -5656,19 +5650,19 @@ static int get_string (
if (digit_count >= escaped)
{
/* should i limit the max to 0xFF/0377?
* if (c_acc > 0377) c_acc = 0377;*/
ADD_TOKEN_CHAR (awk, tok, c_acc);
if (c_acc > 0377) c_acc = 0377; */
ADD_TOKEN_UINT32 (awk, tok, c_acc);
escaped = 0;
}
continue;
}
else
{
ADD_TOKEN_CHAR (awk, tok, c_acc);
ADD_TOKEN_UINT32 (awk, tok, c_acc);
escaped = 0;
}
}
else if (escaped == QSE_TYPE_MAX(qse_size_t) || escaped == 4 || escaped == 8)
else if (escaped == HEX_DIGIT_LIMIT_FOR_X || escaped == 4 || escaped == 8)
{
if (c >= QSE_T('0') && c <= QSE_T('9'))
{
@ -5676,7 +5670,7 @@ static int get_string (
digit_count++;
if (digit_count >= escaped)
{
ADD_TOKEN_CHAR (awk, tok, c_acc);
ADD_TOKEN_UINT32 (awk, tok, c_acc);
escaped = 0;
}
continue;
@ -5687,7 +5681,7 @@ static int get_string (
digit_count++;
if (digit_count >= escaped)
{
ADD_TOKEN_CHAR (awk, tok, c_acc);
ADD_TOKEN_UINT32 (awk, tok, c_acc);
escaped = 0;
}
continue;
@ -5698,7 +5692,7 @@ static int get_string (
digit_count++;
if (digit_count >= escaped)
{
ADD_TOKEN_CHAR (awk, tok, c_acc);
ADD_TOKEN_UINT32 (awk, tok, c_acc);
escaped = 0;
}
continue;
@ -5707,13 +5701,19 @@ static int get_string (
{
qse_char_t rc;
/*rc = (escaped == QSE_TYPE_MAX(qse_size_t))? QSE_T('x'):
(escaped == 4)? QSE_T('u'): QSE_T('U');*/
rc = (escaped == 2)? QSE_T('x'):
rc = (escaped == HEX_DIGIT_LIMIT_FOR_X)? QSE_T('x'):
(escaped == 4)? QSE_T('u'): QSE_T('U');
if (digit_count == 0)
{
/* no valid character after the escaper.
* keep the escaper as it is. consider this input:
* \xGG
* 'c' is at the first G. this part is to restore the
* \x part. since \x is not followed by any hexadecimal
* digits, it's literally 'x' */
ADD_TOKEN_CHAR (awk, tok, rc);
else ADD_TOKEN_CHAR (awk, tok, c_acc);
}
else ADD_TOKEN_UINT32 (awk, tok, c_acc);
escaped = 0;
}
@ -5753,28 +5753,29 @@ static int get_string (
}
else if (c == QSE_T('x'))
{
/*escaped = QSE_TYPE_MAX(qse_size_t);*/
escaped = 2; /* i find allowing only 2 hexadigits more useful though it may break compatibilty with other awk implementations */
escaped = HEX_DIGIT_LIMIT_FOR_X;
digit_count = 0;
c_acc = 0;
continue;
}
#if defined(QSE_CHAR_IS_WCHAR)
else if (c == QSE_T('u') && QSE_SIZEOF(qse_char_t) >= 2)
else if (c == QSE_T('u'))
{
/* in the MCHAR mode, the character given by the \u escape is converted
 * to a UTF-8 sequence. see ADD_TOKEN_UINT32(). */
escaped = 4;
digit_count = 0;
c_acc = 0;
continue;
}
else if (c == QSE_T('U') && QSE_SIZEOF(qse_char_t) >= 4)
else if (c == QSE_T('U'))
{
/* in the MCHAR mode, the character given by the \U escape is converted
 * to a UTF-8 sequence. see ADD_TOKEN_UINT32(). */
escaped = 8;
digit_count = 0;
c_acc = 0;
continue;
}
#endif
else if (keep_esc_char)
{
/* if the following character doesn't compose a proper

View File

@ -7426,13 +7426,14 @@ wp_mod_main:
qse_awk_rtx_refdownval (rtx, v);
}
else if (fmt[i] == QSE_T('s'))
else if (fmt[i] == QSE_T('s') || fmt[i] == QSE_T('k') || fmt[i] == QSE_T('K'))
{
qse_char_t* str_ptr, * str_free = QSE_NULL;
qse_size_t str_len;
qse_awk_int_t k;
qse_awk_val_t* v;
qse_awk_val_type_t vtype;
int bytetostr_flagged_radix = 16;
if (args == QSE_NULL)
{
@ -7486,7 +7487,7 @@ wp_mod_main:
SETERR_COD (rtx, QSE_AWK_EFMTCNV);
return QSE_NULL;
}
out.type = QSE_AWK_RTX_VALTOSTR_CPLDUP;
if (qse_awk_rtx_valtostr (rtx, v, &out) <= -1)
{
@ -7509,10 +7510,9 @@ wp_mod_main:
/* right align */
while (wp[WP_WIDTH] > wp[WP_PRECISION])
{
if (qse_str_ccat (out, QSE_T(' ')) == -1)
if (qse_str_ccat(out, QSE_T(' ')) == -1)
{
if (str_free != QSE_NULL)
QSE_AWK_FREE (rtx->awk, str_free);
if (str_free) QSE_AWK_FREE (rtx->awk, str_free);
qse_awk_rtx_refdownval (rtx, v);
SETERR_COD (rtx, QSE_AWK_ENOMEM);
return QSE_NULL;
@ -7521,26 +7521,65 @@ wp_mod_main:
}
}
#define BYTE_PRINTABLE(x) ((x) <= 0x7F && QSE_ISMPRINT(x) && (x) != '\\')
if (fmt[i] == QSE_T('k')) bytetostr_flagged_radix |= QSE_BYTETOSTR_LOWERCASE;
for (k = 0; k < wp[WP_PRECISION]; k++)
{
if (qse_str_ccat (out, str_ptr[k]) == -1)
{
if (str_free != QSE_NULL)
QSE_AWK_FREE (rtx->awk, str_free);
qse_awk_rtx_refdownval (rtx, v);
SETERR_COD (rtx, QSE_AWK_ENOMEM);
return QSE_NULL;
}
if (fmt[i] != QSE_T('s') && !BYTE_PRINTABLE(str_ptr[k]))
{
qse_char_t xbuf[3];
if (str_ptr[k] <= 0xFF)
{
if (qse_str_ncat (out, QSE_T("\\x"), 2) == (qse_size_t)-1) goto s_fail;
qse_bytetostr(str_ptr[k], xbuf, QSE_COUNTOF(xbuf), bytetostr_flagged_radix, QSE_T('0'));
if (qse_str_ncat (out, xbuf, 2) == (qse_size_t)-1) goto s_fail;
}
else if (str_ptr[k] <= 0xFFFF)
{
qse_uint16_t u16 = str_ptr[k];
if (qse_str_ncat (out, QSE_T("\\u"), 2) == (qse_size_t)-1) goto s_fail;
qse_bytetostr((u16 >> 8) & 0xFF, xbuf, QSE_COUNTOF(xbuf), bytetostr_flagged_radix, QSE_T('0'));
if (qse_str_ncat (out, xbuf, 2) == (qse_size_t)-1) goto s_fail;
qse_bytetostr(u16 & 0xFF, xbuf, QSE_COUNTOF(xbuf), bytetostr_flagged_radix, QSE_T('0'));
if (qse_str_ncat (out, xbuf, 2) == (qse_size_t)-1) goto s_fail;
}
else
{
qse_uint32_t u32 = str_ptr[k];
if (qse_str_ncat (out, QSE_T("\\U"), 2) == (qse_size_t)-1) goto s_fail;
qse_bytetostr((u32 >> 24) & 0xFF, xbuf, QSE_COUNTOF(xbuf), bytetostr_flagged_radix, QSE_T('0'));
if (qse_str_ncat (out, xbuf, 2) == (qse_size_t)-1) goto s_fail;
qse_bytetostr((u32 >> 16) & 0xFF, xbuf, QSE_COUNTOF(xbuf), bytetostr_flagged_radix, QSE_T('0'));
if (qse_str_ncat (out, xbuf, 2) == (qse_size_t)-1) goto s_fail;
qse_bytetostr((u32 >> 8) & 0xFF, xbuf, QSE_COUNTOF(xbuf), bytetostr_flagged_radix, QSE_T('0'));
if (qse_str_ncat (out, xbuf, 2) == (qse_size_t)-1) goto s_fail;
qse_bytetostr(u32 & 0xFF, xbuf, QSE_COUNTOF(xbuf), bytetostr_flagged_radix, QSE_T('0'));
if (qse_str_ncat (out, xbuf, 2) == (qse_size_t)-1) goto s_fail;
}
}
else
{
if (qse_str_ccat(out, str_ptr[k]) == (qse_size_t)-1)
{
s_fail:
if (str_free) QSE_AWK_FREE (rtx->awk, str_free);
qse_awk_rtx_refdownval (rtx, v);
SETERR_COD (rtx, QSE_AWK_ENOMEM);
return QSE_NULL;
}
}
}
if (str_free != QSE_NULL) QSE_AWK_FREE (rtx->awk, str_free);
if (str_free) QSE_AWK_FREE (rtx->awk, str_free);
if (flags & FLAG_MINUS)
{
/* left align */
while (wp[WP_WIDTH] > wp[WP_PRECISION])
{
if (qse_str_ccat (out, QSE_T(' ')) == -1)
if (qse_str_ccat(out, QSE_T(' ')) == (qse_size_t)-1)
{
qse_awk_rtx_refdownval (rtx, v);
SETERR_COD (rtx, QSE_AWK_ENOMEM);

View File

@ -109,7 +109,7 @@ static char_t* sprintn (char_t* nbuf, qse_uintmax_t num, int base, int *lenp, in
PUT_CHAR(__xbuf[1]); \
} while (0)
#define BYTE_PRINTABLE(x) ((x >= 'a' && x <= 'z') || (x >= 'A' && x <= 'Z') || (x >= '0' && x <= '9') || (x == ' '))
#define BYTE_PRINTABLE(x) ((x) <= 0x7F && QSE_ISMPRINT(x) && (x) != '\\')
int fmtout (const char_t* fmt, fmtout_t* data, va_list ap)
{