From 07be5e22d76d68bf3974793a18fe4160b35aad5a Mon Sep 17 00:00:00 2001 From: hyung-hwan Date: Thu, 18 Apr 2019 08:42:54 +0000 Subject: [PATCH] added quite some code for handling mbs in awk --- qse/include/qse/awk/awk.h | 18 ++-- qse/include/qse/cmn/str.h | 14 +-- qse/lib/awk/awk-prv.h | 4 +- qse/lib/awk/awk.c | 8 +- qse/lib/awk/err.c | 1 + qse/lib/awk/parse.c | 179 +++++++++++++++++++++++++++++--------- qse/lib/awk/run.c | 111 +++++++++++++++-------- qse/lib/awk/tree.c | 77 ++++++++++++++++ qse/lib/awk/tree.h | 9 ++ qse/lib/awk/val.c | 54 ++++++------ qse/lib/cmn/fmt-out.h | 1 - qse/lib/cmn/str-dyn.c | 6 +- 12 files changed, 357 insertions(+), 125 deletions(-) diff --git a/qse/include/qse/awk/awk.h b/qse/include/qse/awk/awk.h index ad361eaa..ae747e77 100644 --- a/qse/include/qse/awk/awk.h +++ b/qse/include/qse/awk/awk.h @@ -203,14 +203,14 @@ typedef struct qse_awk_val_str_t qse_awk_val_str_t; /** * The qse_awk_val_str_t type is a string type. The type field is - * #QSE_AWK_VAL_BYTEARR. + * #QSE_AWK_VAL_MBS. */ -struct qse_awk_val_bytearr_t +struct qse_awk_val_mbs_t { QSE_AWK_VAL_HDR; - qse_u8ptl_t val; + qse_mcstr_t val; }; -typedef struct qse_awk_val_bytearr_t qse_awk_val_bytearr_t; +typedef struct qse_awk_val_mbs_t qse_awk_val_mbs_t; /** * The qse_awk_val_rex_t type is a regular expression type. The type field @@ -358,7 +358,7 @@ enum qse_awk_nde_type_t /* expression */ /* if you change the following values including their order, - * you should change __eval_func of __eval_expression + * you should change __evaluator of __eval_expression * in run.c accordingly */ QSE_AWK_NDE_GRP, QSE_AWK_NDE_ASS, @@ -372,6 +372,7 @@ enum qse_awk_nde_type_t QSE_AWK_NDE_INT, QSE_AWK_NDE_FLT, QSE_AWK_NDE_STR, + QSE_AWK_NDE_MBS, QSE_AWK_NDE_REX, /* keep this order for the following items otherwise, you may have @@ -1211,6 +1212,7 @@ enum qse_awk_errnum_t QSE_AWK_EEOF, /**< unexpected end of source */ QSE_AWK_ECMTNC, /**< comment not closed properly */ QSE_AWK_ESTRNC, /**< string or regular expression not closed */ + QSE_AWK_EMBSCHR, /**< invalid mbs character '%{0}' */ QSE_AWK_ELBRACE, /**< left brace expected in place of '${0}' */ QSE_AWK_ELPAREN, /**< left parenthesis expected in place of '${0}' */ QSE_AWK_ERPAREN, /**< right parenthesis expected in place of '${0}' */ @@ -1401,7 +1403,7 @@ enum qse_awk_val_type_t QSE_AWK_VAL_INT = 1, /**< integer */ QSE_AWK_VAL_FLT = 2, /**< floating-pointer number */ QSE_AWK_VAL_STR = 3, /**< string */ - QSE_AWK_VAL_BYTEARR = 4, /**< byte array */ + QSE_AWK_VAL_MBS = 4, /**< byte array */ QSE_AWK_VAL_MAP = 5, /**< map */ QSE_AWK_VAL_REX = 6, /**< regular expression */ @@ -2474,9 +2476,9 @@ QSE_EXPORT qse_awk_val_t* qse_awk_rtx_makenstrvalwithxstr ( * The qse_awk_rtx_makebytearrvaal() function create a byte array value. * \return value on success, #QSE_NULL on failure */ -qse_awk_val_t* qse_awk_rtx_makebytearrval ( +qse_awk_val_t* qse_awk_rtx_makembsval ( qse_awk_rtx_t* rtx, - const qse_uint8_t* ptr, + const qse_mchar_t* ptr, qse_size_t len ); diff --git a/qse/include/qse/cmn/str.h b/qse/include/qse/cmn/str.h index 5f683ac1..b3e14bfc 100644 --- a/qse/include/qse/cmn/str.h +++ b/qse/include/qse/cmn/str.h @@ -3639,21 +3639,23 @@ QSE_EXPORT qse_size_t qse_wcs_vfmt ( QSE_EXPORT qse_size_t qse_mbs_ncatwcs ( qse_mbs_t* str, const qse_wchar_t* s, - qse_size_t len + qse_size_t len, + qse_cmgr_t* cmgr ); qse_size_t qse_wcs_ncatmbs ( qse_wcs_t* str, const qse_mchar_t* s, - qse_size_t len + qse_size_t len, + qse_cmgr_t* cmgr ); #if defined(QSE_CHAR_IS_MCHAR) -# define qse_str_ncatwcs(str,s,len) qse_mbs_ncatwcs(str,s,len) -# define qse_str_ncatmbs(str,s,len) qse_mbs_ncat(str,s,len) +# define qse_str_ncatwcs(str,s,len,cmgr) qse_mbs_ncatwcs(str,s,len,cmgr) +# define qse_str_ncatmbs(str,s,len,cmgr) qse_mbs_ncat(str,s,len) #else -# define qse_str_ncatwcs(str,s,len) qse_wcs_ncat(str,s,len) -# define qse_str_ncatmbs(str,s,len) qse_wcs_ncatmbs(str,s,len) +# define qse_str_ncatwcs(str,s,len,cmgr) qse_wcs_ncat(str,s,len) +# define qse_str_ncatmbs(str,s,len,cmgr) qse_wcs_ncatmbs(str,s,len,cmgr) #endif #if defined(__cplusplus) diff --git a/qse/lib/awk/awk-prv.h b/qse/lib/awk/awk-prv.h index 4a22ae93..65643d99 100644 --- a/qse/lib/awk/awk-prv.h +++ b/qse/lib/awk/awk-prv.h @@ -107,6 +107,8 @@ typedef struct qse_awk_tree_t qse_awk_tree_t; #define QSE_AWK_STRDUP(awk,str) (qse_strdup(str,(awk)->mmgr)) #define QSE_AWK_STRXDUP(awk,str,len) (qse_strxdup(str,len,(awk)->mmgr)) +#define QSE_AWK_BYTE_PRINTABLE(x) ((x) <= 0x7F && (x) != '\\' && QSE_ISMPRINT(x)) + enum qse_awk_rio_type_t { /* rio types available */ @@ -149,6 +151,7 @@ struct qse_awk_tok_t struct qse_awk_t { qse_mmgr_t* mmgr; + qse_cmgr_t* cmgr; /* primitive functions */ qse_awk_prm_t prm; @@ -385,7 +388,6 @@ struct qse_awk_rtx_t qse_awk_errinf_t errinf; qse_awk_t* awk; - qse_cmgr_t* cmgr; /* internal default cmgr */ qse_awk_rtx_ecb_t* ecb; }; diff --git a/qse/lib/awk/awk.c b/qse/lib/awk/awk.c index 47ed7e00..69ee42d2 100644 --- a/qse/lib/awk/awk.c +++ b/qse/lib/awk/awk.c @@ -25,6 +25,7 @@ */ #include "awk-prv.h" +#include static void free_fun (qse_htb_t* map, void* vptr, qse_size_t vlen) { @@ -144,6 +145,7 @@ int qse_awk_init (qse_awk_t* awk, qse_mmgr_t* mmgr, const qse_awk_prm_t* prm) /* remember the memory manager */ awk->mmgr = mmgr; + awk->cmgr = qse_getdflcmgr(); /* initialize error handling fields */ awk->errinf.num = QSE_AWK_ENOERR; @@ -166,9 +168,9 @@ int qse_awk_init (qse_awk_t* awk, qse_mmgr_t* mmgr, const qse_awk_prm_t* prm) } awk->prm = *prm; - if (init_token (mmgr, &awk->ptok) <= -1 || - init_token (mmgr, &awk->tok) <= -1 || - init_token (mmgr, &awk->ntok) <= -1) + if (init_token(mmgr, &awk->ptok) <= -1 || + init_token(mmgr, &awk->tok) <= -1 || + init_token(mmgr, &awk->ntok) <= -1) { qse_awk_seterrnum (awk, QSE_AWK_ENOMEM, QSE_NULL); goto oops; diff --git a/qse/lib/awk/err.c b/qse/lib/awk/err.c index cadad925..bb3e1052 100644 --- a/qse/lib/awk/err.c +++ b/qse/lib/awk/err.c @@ -59,6 +59,7 @@ const qse_char_t* qse_awk_dflerrstr (const qse_awk_t* awk, qse_awk_errnum_t errn QSE_T("unexpected end of input"), QSE_T("comment not closed properly"), QSE_T("string or regular expression not closed"), + QSE_T("invalid mbs character '${0}'"), QSE_T("left brace expected in place of '${0}'"), QSE_T("left parenthesis expected in place of '${0}'"), QSE_T("right parenthesis expected in place of '${0}'"), diff --git a/qse/lib/awk/parse.c b/qse/lib/awk/parse.c index 41e7e5b4..b1ec2182 100644 --- a/qse/lib/awk/parse.c +++ b/qse/lib/awk/parse.c @@ -26,6 +26,7 @@ #include "awk-prv.h" #include +#include #if !defined(QSE_AWK_DEFAULT_MODPREFIX) # if defined(_WIN32) @@ -148,6 +149,7 @@ enum tok_t TOK_INT, TOK_FLT, TOK_STR, + TOK_MBS, TOK_REX, __TOKEN_COUNT__ @@ -871,7 +873,7 @@ static int parse_progunit (qse_awk_t* awk) } if (get_token(awk) <= -1) return -1; - + if (!MATCH(awk,TOK_STR)) { SETERR_LOC (awk, QSE_AWK_EINCLSTR, &awk->ptok.loc); @@ -4410,7 +4412,53 @@ oops: return QSE_NULL; } -static qse_awk_nde_t* parse_primary_rex (qse_awk_t* awk, const qse_awk_loc_t* xloc) +static qse_awk_nde_t* parse_primary_mbs (qse_awk_t* awk, const qse_awk_loc_t* xloc) +{ + qse_awk_nde_mbs_t* nde; + + nde = (qse_awk_nde_mbs_t*)qse_awk_callocmem(awk, QSE_SIZEOF(*nde)); + if (nde == QSE_NULL) + { + ADJERR_LOC (awk, xloc); + return QSE_NULL; + } + + nde->type = QSE_AWK_NDE_MBS; + nde->loc = *xloc; + +#if defined(QSE_CHAR_IS_MCHAR) + nde->len = QSE_STR_LEN(awk->tok.name); + nde->ptr = qse_awk_cstrdup(awk, QSE_STR_XSTR(awk->tok.name)); + if (!nde->ptr) goto oops; +#else + { + qse_size_t wcslen, mbslen; + wcslen = QSE_STR_LEN(awk->tok.name); + + /* the MBS token doesn't include a character greater than 0xFF in awk->tok.name though it is a wide character string. + * so i simply use QSE_CMGR_MB8 to store it in a byte string */ + nde->ptr = qse_wcsntombsdupwithcmgr(QSE_STR_PTR(awk->tok.name), wcslen, &mbslen, awk->mmgr, qse_findcmgrbyid(QSE_CMGR_MB8)); + if (!nde->ptr) + { + qse_awk_seterror (awk, QSE_AWK_ENOMEM, QSE_NULL, xloc); + goto oops; + } + nde->len = mbslen; + } +#endif + + if (get_token(awk) <= -1) goto oops; + + return (qse_awk_nde_t*)nde; + +oops: + QSE_ASSERT (nde != QSE_NULL); + if (nde->ptr) QSE_AWK_FREE (awk, nde->ptr); + QSE_AWK_FREE (awk, nde); + return QSE_NULL; +} + +static qse_awk_nde_t* parse_primary_rex (qse_awk_t* awk, const qse_awk_loc_t* xloc) { qse_awk_nde_rex_t* nde; qse_awk_errnum_t errnum; @@ -4686,29 +4734,32 @@ static qse_awk_nde_t* parse_primary_nopipe (qse_awk_t* awk, const qse_awk_loc_t* switch (awk->tok.type) { case TOK_IDENT: - return parse_primary_ident (awk, xloc); + return parse_primary_ident(awk, xloc); case TOK_INT: - return parse_primary_int (awk, xloc); + return parse_primary_int(awk, xloc); case TOK_FLT: - return parse_primary_flt (awk, xloc); + return parse_primary_flt(awk, xloc); case TOK_STR: - return parse_primary_str (awk, xloc); + return parse_primary_str(awk, xloc); + + case TOK_MBS: + return parse_primary_mbs(awk, xloc); case TOK_DIV: case TOK_DIV_ASSN: - return parse_primary_rex (awk, xloc); + return parse_primary_rex(awk, xloc); case TOK_DOLLAR: - return parse_primary_positional (awk, xloc); + return parse_primary_positional(awk, xloc); case TOK_LPAREN: - return parse_primary_lparen (awk, xloc); + return parse_primary_lparen(awk, xloc); case TOK_GETLINE: - return parse_primary_getline (awk, xloc); + return parse_primary_getline(awk, xloc); default: /* in the tolerant mode, we treat print and printf @@ -5623,7 +5674,7 @@ static int get_number (qse_awk_t* awk, qse_awk_tok_t* tok) static int get_string ( qse_awk_t* awk, qse_char_t end_char, - qse_char_t esc_char, int keep_esc_char, + qse_char_t esc_char, int keep_esc_char, int byte_only, qse_size_t preescaped, qse_awk_tok_t* tok) { qse_cint_t c; @@ -5641,6 +5692,15 @@ static int get_string ( return -1; } + #if !defined(QSE_CHAR_IS_MCHAR) + if (byte_only && c != '\\' && !QSE_AWK_BYTE_PRINTABLE(c)) + { + qse_char_t wc = c; + SETERR_ARG_LOC (awk, QSE_AWK_EMBSCHR, &wc, 1, &awk->tok.loc); + return -1; + } + #endif + if (escaped == 3) { if (c >= QSE_T('0') && c <= QSE_T('7')) @@ -5758,7 +5818,7 @@ static int get_string ( c_acc = 0; continue; } - else if (c == QSE_T('u')) + else if (!byte_only && c == QSE_T('u')) { /* in the MCHAR mode, the \u letter will get converted to UTF-8 sequences. * see ADD_TOKEN_UINT32(). */ @@ -5767,7 +5827,7 @@ static int get_string ( c_acc = 0; continue; } - else if (c == QSE_T('U')) + else if (!byte_only && c == QSE_T('U')) { /* in the MCHAR mode, the \u letter will get converted to UTF-8 sequences * see ADD_TOKEN_UINT32(). */ @@ -5827,10 +5887,48 @@ static int get_rexstr (qse_awk_t* awk, qse_awk_tok_t* tok) * begins with reading the next character */ ADD_TOKEN_CHAR (awk, tok, awk->sio.last.c); } - return get_string (awk, QSE_T('/'), QSE_T('\\'), 1, preescaped, tok); + return get_string(awk, QSE_T('/'), QSE_T('\\'), 1, 0, preescaped, tok); } } + +static int get_single_quoted_string (qse_awk_t* awk, int byte_only, qse_awk_tok_t* tok) +{ + qse_cint_t c; + + while (1) + { + GET_CHAR_TO (awk, c); + + if (c == QSE_CHAR_EOF) + { + SETERR_LOC (awk, QSE_AWK_ESTRNC, &awk->tok.loc); + return -1; + } + + #if !defined(QSE_CHAR_IS_MCHAR) + if (byte_only && c != '\\' && !QSE_AWK_BYTE_PRINTABLE(c)) + { + qse_char_t wc = c; + SETERR_ARG_LOC (awk, QSE_AWK_EMBSCHR, &wc, 1, &awk->tok.loc); + return -1; + } + #endif + + + if (c == QSE_T('\'')) + { + /* terminating quote */ + GET_CHAR (awk); + break; + } + + ADD_TOKEN_CHAR (awk, tok, c); + } + + return 0; +} + static int skip_spaces (qse_awk_t* awk) { qse_cint_t c = awk->sio.last.c; @@ -6155,7 +6253,7 @@ retry: QSE_AWK_ISALPHA(awk, c) || QSE_AWK_ISDIGIT(awk, c)); - type = classify_ident (awk, QSE_STR_XSTR(tok->name)); + type = classify_ident(awk, QSE_STR_XSTR(tok->name)); if (type == TOK_IDENT) { SETERR_TOK (awk, QSE_AWK_EXKWNR); @@ -6163,54 +6261,52 @@ retry: } SET_TOKEN_TYPE (awk, tok, type); } + else if (c == 'M') + { + GET_CHAR_TO (awk, c); + if (c == '\"') + { + /* multi-byte string */ + SET_TOKEN_TYPE (awk, tok, TOK_MBS); + if (get_string(awk, c, QSE_T('\\'), 0, 1, 0, tok) <= -1) return -1; + } + else if (c == '\'') + { + SET_TOKEN_TYPE (awk, tok, TOK_MBS); + if (get_single_quoted_string(awk, 1, tok) <= -1) return -1; + } + else + { + goto process_identifier; + } + } else if (c == QSE_T('_') || QSE_AWK_ISALPHA(awk, c)) { int type; + process_identifier: /* identifier */ do { ADD_TOKEN_CHAR (awk, tok, c); GET_CHAR_TO (awk, c); } - while (c == QSE_T('_') || - QSE_AWK_ISALPHA(awk, c) || - QSE_AWK_ISDIGIT(awk, c)); + while (c == QSE_T('_') || QSE_AWK_ISALPHA(awk, c) || QSE_AWK_ISDIGIT(awk, c)); - type = classify_ident (awk, QSE_STR_XSTR(tok->name)); + type = classify_ident(awk, QSE_STR_XSTR(tok->name)); SET_TOKEN_TYPE (awk, tok, type); } else if (c == QSE_T('\"')) { /* double-quoted string */ SET_TOKEN_TYPE (awk, tok, TOK_STR); - if (get_string (awk, c, QSE_T('\\'), 0, 0, tok) <= -1) return -1; + if (get_string(awk, c, QSE_T('\\'), 0, 0, 0, tok) <= -1) return -1; } else if (c == QSE_T('\'')) { /* single-quoted string - no escaping */ - SET_TOKEN_TYPE (awk, tok, TOK_STR); - - while (1) - { - GET_CHAR_TO (awk, c); - - if (c == QSE_CHAR_EOF) - { - SETERR_LOC (awk, QSE_AWK_ESTRNC, &awk->tok.loc); - return -1; - } - - if (c == QSE_T('\'')) - { - /* terminating quote */ - GET_CHAR (awk); - break; - } - - ADD_TOKEN_CHAR (awk, tok, c); - } + if (get_single_quoted_string(awk, 0, tok) <= -1) return -1; } else { @@ -6916,4 +7012,3 @@ done: n = mdp->mod.query (&mdp->mod, awk, segs[1].ptr, sym); return (n <= -1)? QSE_NULL: &mdp->mod; } - diff --git a/qse/lib/awk/run.c b/qse/lib/awk/run.c index 02d747c6..3d88ed84 100644 --- a/qse/lib/awk/run.c +++ b/qse/lib/awk/run.c @@ -250,6 +250,7 @@ static qse_awk_val_t** get_reference_indexed ( static qse_awk_val_t* eval_int (qse_awk_rtx_t* run, qse_awk_nde_t* nde); static qse_awk_val_t* eval_real (qse_awk_rtx_t* run, qse_awk_nde_t* nde); static qse_awk_val_t* eval_str (qse_awk_rtx_t* run, qse_awk_nde_t* nde); +static qse_awk_val_t* eval_mbs (qse_awk_rtx_t* run, qse_awk_nde_t* nde); static qse_awk_val_t* eval_rex (qse_awk_rtx_t* run, qse_awk_nde_t* nde); static qse_awk_val_t* eval_named (qse_awk_rtx_t* run, qse_awk_nde_t* nde); static qse_awk_val_t* eval_gbl (qse_awk_rtx_t* run, qse_awk_nde_t* nde); @@ -985,7 +986,6 @@ static int init_rtx (qse_awk_rtx_t* rtx, qse_awk_t* awk, qse_awk_rio_t* rio) }; rtx->awk = awk; - rtx->cmgr = qse_getdflcmgr(); CLRERR (rtx); @@ -3294,6 +3294,7 @@ static qse_awk_val_t* eval_expression0 (qse_awk_rtx_t* run, qse_awk_nde_t* nde) eval_int, eval_real, eval_str, + eval_mbs, eval_rex, eval_named, eval_gbl, @@ -4179,7 +4180,7 @@ static QSE_INLINE int __cmp_nil_str (qse_awk_rtx_t* rtx, qse_awk_val_t* left, qs static QSE_INLINE int __cmp_nil_bytearr (qse_awk_rtx_t* rtx, qse_awk_val_t* left, qse_awk_val_t* right) { - return (((qse_awk_val_bytearr_t*)right)->val.len == 0)? 0: -1; + return (((qse_awk_val_mbs_t*)right)->val.len == 0)? 0: -1; } static QSE_INLINE int __cmp_nil_map (qse_awk_rtx_t* rtx, qse_awk_val_t* left, qse_awk_val_t* right) @@ -4467,7 +4468,7 @@ static QSE_INLINE int __cmp_str_str (qse_awk_rtx_t* rtx, qse_awk_val_t* left, qs static QSE_INLINE int __cmp_str_bytearr (qse_awk_rtx_t* rtx, qse_awk_val_t* left, qse_awk_val_t* right) { qse_awk_val_str_t* ls = (qse_awk_val_str_t*)left; - qse_awk_val_bytearr_t* rs = (qse_awk_val_bytearr_t*)right; + qse_awk_val_mbs_t* rs = (qse_awk_val_mbs_t*)right; #if (QSE_SIZEOF_MCHAR_T != QSE_SIZEOF_UINT8_T) # error Unsupported size of qse_mchar_t @@ -4501,7 +4502,7 @@ static QSE_INLINE int __cmp_str_map (qse_awk_rtx_t* rtx, qse_awk_val_t* left, qs static QSE_INLINE int __cmp_bytearr_nil (qse_awk_rtx_t* rtx, qse_awk_val_t* left, qse_awk_val_t* right) { - return (((qse_awk_val_bytearr_t*)left)->val.len == 0)? 0: 1; + return (((qse_awk_val_mbs_t*)left)->val.len == 0)? 0: 1; } static QSE_INLINE int __cmp_bytearr_int (qse_awk_rtx_t* rtx, qse_awk_val_t* left, qse_awk_val_t* right) @@ -4521,8 +4522,8 @@ static QSE_INLINE int __cmp_bytearr_str (qse_awk_rtx_t* rtx, qse_awk_val_t* left static QSE_INLINE int __cmp_bytearr_bytearr (qse_awk_rtx_t* rtx, qse_awk_val_t* left, qse_awk_val_t* right) { - qse_awk_val_bytearr_t* ls = (qse_awk_val_bytearr_t*)left; - qse_awk_val_bytearr_t* rs = (qse_awk_val_bytearr_t*)right; + qse_awk_val_mbs_t* ls = (qse_awk_val_mbs_t*)left; + qse_awk_val_mbs_t* rs = (qse_awk_val_mbs_t*)right; #if (QSE_SIZEOF_MCHAR_T != QSE_SIZEOF_UINT8_T) # error Unsupported size of qse_mchar_t #endif @@ -4609,7 +4610,7 @@ static int __cmp_val( * QSE_AWK_VAL_INT = 1 * QSE_AWK_VAL_FLT = 2 * QSE_AWK_VAL_STR = 3 - * QSE_AWK_VAL_BYTEARR = 4 + * QSE_AWK_VAL_MBS = 4 * QSE_AWK_VAL_MAP = 5 */ return func[lvtype * 6 + rvtype](rtx, left, right); @@ -4651,12 +4652,12 @@ static int teq_val (qse_awk_rtx_t* rtx, qse_awk_val_t* left, qse_awk_val_t* righ ((qse_awk_val_str_t*)right)->val.len) == 0; break; - case QSE_AWK_VAL_BYTEARR: + case QSE_AWK_VAL_MBS: n = qse_mbsxncmp ( - ((qse_awk_val_bytearr_t*)left)->val.ptr, - ((qse_awk_val_bytearr_t*)left)->val.len, - ((qse_awk_val_bytearr_t*)right)->val.ptr, - ((qse_awk_val_bytearr_t*)right)->val.len) == 0; + ((qse_awk_val_mbs_t*)left)->val.ptr, + ((qse_awk_val_mbs_t*)left)->val.len, + ((qse_awk_val_mbs_t*)right)->val.ptr, + ((qse_awk_val_mbs_t*)right)->val.len) == 0; break; default: @@ -6262,6 +6263,19 @@ static qse_awk_val_t* eval_str (qse_awk_rtx_t* run, qse_awk_nde_t* nde) return val; } +static qse_awk_val_t* eval_mbs (qse_awk_rtx_t* run, qse_awk_nde_t* nde) +{ + qse_awk_val_t* val; + + val = qse_awk_rtx_makembsval (run, + ((qse_awk_nde_mbs_t*)nde)->ptr, + ((qse_awk_nde_mbs_t*)nde)->len); + if (val == QSE_NULL) ADJERR_LOC (run, &nde->loc); + + return val; +} + + static qse_awk_val_t* eval_rex (qse_awk_rtx_t* run, qse_awk_nde_t* nde) { qse_awk_val_t* val; @@ -7329,7 +7343,7 @@ wp_mod_main: qse_awk_val_t* v; qse_awk_flt_t r; int n; - + #if defined(QSE_USE_AWK_FLTMAX) FMT_CHAR (QSE_T('j')); #else @@ -7438,6 +7452,16 @@ wp_mod_main: else ch = QSE_T('\0'); break; + case QSE_AWK_VAL_MBS: + ch_len = ((qse_awk_val_mbs_t*)v)->val.len; + if (ch_len > 0) + { + ch = ((qse_awk_val_mbs_t*)v)->val.ptr[0]; + ch_len = 1; + } + else ch = QSE_T('\0'); + break; + default: qse_awk_rtx_refdownval (rtx, v); SETERR_COD (rtx, QSE_AWK_EVALTOCHR); @@ -7531,7 +7555,7 @@ wp_mod_main: qse_awk_rtx_refupval (rtx, v); - vtype = QSE_AWK_RTX_GETVALTYPE (rtx, v); + vtype = QSE_AWK_RTX_GETVALTYPE(rtx, v); switch (vtype) { case QSE_AWK_VAL_NIL: @@ -7544,6 +7568,17 @@ wp_mod_main: str_len = ((qse_awk_val_str_t*)v)->val.len; break; + case QSE_AWK_VAL_MBS: + #if defined(QSE_CHAR_IS_MCHAR) + str_ptr = ((qse_awk_val_mbs_t*)v)->val.ptr; + str_len = ((qse_awk_val_mbs_t*)v)->val.len; + break; + #else + str_ptr = (qse_char_t*)((qse_awk_val_mbs_t*)v)->val.ptr; + str_len = ((qse_awk_val_mbs_t*)v)->val.len; + break; + #endif + default: { qse_awk_rtx_valtostr_out_t out; @@ -7556,7 +7591,7 @@ wp_mod_main: } out.type = QSE_AWK_RTX_VALTOSTR_CPLDUP; - if (qse_awk_rtx_valtostr (rtx, v, &out) <= -1) + if (qse_awk_rtx_valtostr(rtx, v, &out) <= -1) { qse_awk_rtx_refdownval (rtx, v); return QSE_NULL; @@ -7588,47 +7623,54 @@ wp_mod_main: } } - #define BYTE_PRINTABLE(x) ((x) <= 0x7F && (x) != '\\' && QSE_ISMPRINT(x)) - if (fmt[i] == QSE_T('k')) bytetostr_flagged_radix |= QSE_BYTETOSTR_LOWERCASE; for (k = 0; k < wp[WP_PRECISION]; k++) { - if (fmt[i] != QSE_T('s') && !BYTE_PRINTABLE(str_ptr[k])) + qse_char_t curc; + + #if defined(QSE_CHAR_IS_MCHAR) + curc = str_ptr[k]; + #else + if (vtype == QSE_AWK_VAL_MBS) curc = (qse_uint8_t)((qse_mchar_t*)str_ptr)[k]; + else curc = str_ptr[k]; + #endif + + if (fmt[i] != QSE_T('s') && !QSE_AWK_BYTE_PRINTABLE(curc)) { qse_char_t xbuf[3]; - if (str_ptr[k] <= 0xFF) + if (curc <= 0xFF) { - if (qse_str_ncat (out, QSE_T("\\x"), 2) == (qse_size_t)-1) goto s_fail; - qse_bytetostr(str_ptr[k], xbuf, QSE_COUNTOF(xbuf), bytetostr_flagged_radix, QSE_T('0')); - if (qse_str_ncat (out, xbuf, 2) == (qse_size_t)-1) goto s_fail; + if (qse_str_ncat(out, QSE_T("\\x"), 2) == (qse_size_t)-1) goto s_fail; + qse_bytetostr(curc, xbuf, QSE_COUNTOF(xbuf), bytetostr_flagged_radix, QSE_T('0')); + if (qse_str_ncat(out, xbuf, 2) == (qse_size_t)-1) goto s_fail; } - else if (str_ptr[k] <= 0xFFFF) + else if (curc <= 0xFFFF) { - qse_uint16_t u16 = str_ptr[k]; - if (qse_str_ncat (out, QSE_T("\\u"), 2) == (qse_size_t)-1) goto s_fail; + qse_uint16_t u16 = curc; + if (qse_str_ncat(out, QSE_T("\\u"), 2) == (qse_size_t)-1) goto s_fail; qse_bytetostr((u16 >> 8) & 0xFF, xbuf, QSE_COUNTOF(xbuf), bytetostr_flagged_radix, QSE_T('0')); - if (qse_str_ncat (out, xbuf, 2) == (qse_size_t)-1) goto s_fail; + if (qse_str_ncat(out, xbuf, 2) == (qse_size_t)-1) goto s_fail; qse_bytetostr(u16 & 0xFF, xbuf, QSE_COUNTOF(xbuf), bytetostr_flagged_radix, QSE_T('0')); - if (qse_str_ncat (out, xbuf, 2) == (qse_size_t)-1) goto s_fail; + if (qse_str_ncat(out, xbuf, 2) == (qse_size_t)-1) goto s_fail; } else { - qse_uint32_t u32 = str_ptr[k]; - if (qse_str_ncat (out, QSE_T("\\U"), 2) == (qse_size_t)-1) goto s_fail; + qse_uint32_t u32 = curc; + if (qse_str_ncat(out, QSE_T("\\U"), 2) == (qse_size_t)-1) goto s_fail; qse_bytetostr((u32 >> 24) & 0xFF, xbuf, QSE_COUNTOF(xbuf), bytetostr_flagged_radix, QSE_T('0')); - if (qse_str_ncat (out, xbuf, 2) == (qse_size_t)-1) goto s_fail; + if (qse_str_ncat(out, xbuf, 2) == (qse_size_t)-1) goto s_fail; qse_bytetostr((u32 >> 16) & 0xFF, xbuf, QSE_COUNTOF(xbuf), bytetostr_flagged_radix, QSE_T('0')); - if (qse_str_ncat (out, xbuf, 2) == (qse_size_t)-1) goto s_fail; + if (qse_str_ncat(out, xbuf, 2) == (qse_size_t)-1) goto s_fail; qse_bytetostr((u32 >> 8) & 0xFF, xbuf, QSE_COUNTOF(xbuf), bytetostr_flagged_radix, QSE_T('0')); - if (qse_str_ncat (out, xbuf, 2) == (qse_size_t)-1) goto s_fail; + if (qse_str_ncat(out, xbuf, 2) == (qse_size_t)-1) goto s_fail; qse_bytetostr(u32 & 0xFF, xbuf, QSE_COUNTOF(xbuf), bytetostr_flagged_radix, QSE_T('0')); - if (qse_str_ncat (out, xbuf, 2) == (qse_size_t)-1) goto s_fail; + if (qse_str_ncat(out, xbuf, 2) == (qse_size_t)-1) goto s_fail; } } else { - if (qse_str_ccat(out, str_ptr[k]) == (qse_size_t)-1) + if (qse_str_ccat(out, curc) == (qse_size_t)-1) { s_fail: if (str_free) QSE_AWK_FREE (rtx->awk, str_free); @@ -7657,7 +7699,6 @@ wp_mod_main: } qse_awk_rtx_refdownval (rtx, v); - } else { diff --git a/qse/lib/awk/tree.c b/qse/lib/awk/tree.c index 363d9e37..7999b4f1 100644 --- a/qse/lib/awk/tree.c +++ b/qse/lib/awk/tree.c @@ -417,6 +417,76 @@ static int print_expr (qse_awk_t* awk, qse_awk_nde_t* nde) break; } + case QSE_AWK_NDE_MBS: + { + qse_mchar_t* ptr; + qse_size_t len, i; + + PUT_SRCSTR (awk, QSE_T("M\"")); + ptr = ((qse_awk_nde_mbs_t*)nde)->ptr; + len = ((qse_awk_nde_mbs_t*)nde)->len; + for (i = 0; i < len; i++) + { + /* TODO: maybe more de-escaping?? */ + switch (ptr[i]) + { + case QSE_MT('\n'): + PUT_SRCSTR (awk, QSE_T("\\n")); + break; + case QSE_MT('\r'): + PUT_SRCSTR (awk, QSE_T("\\r")); + break; + case QSE_MT('\t'): + PUT_SRCSTR (awk, QSE_T("\\t")); + break; + case QSE_MT('\f'): + PUT_SRCSTR (awk, QSE_T("\\f")); + break; + case QSE_MT('\b'): + PUT_SRCSTR (awk, QSE_T("\\b")); + break; + case QSE_MT('\v'): + PUT_SRCSTR (awk, QSE_T("\\v")); + break; + case QSE_MT('\a'): + PUT_SRCSTR (awk, QSE_T("\\a")); + break; + case QSE_MT('\0'): + PUT_SRCSTR (awk, QSE_T("\\0")); + break; + case QSE_MT('\"'): + PUT_SRCSTR (awk, QSE_T("\\\"")); + break; + case QSE_MT('\\'): + PUT_SRCSTR (awk, QSE_T("\\\\")); + break; + default: + { + #if defined(QSE_CHAR_IS_MCHAR) + PUT_SRCSTRN (awk, &ptr[i], 1); + #else + qse_char_t wc = ptr[i]; + if (QSE_AWK_BYTE_PRINTABLE(wc)) + { + PUT_SRCSTRN (awk, &wc, 1); + } + else + { + qse_mchar_t xbuf[3]; + qse_bytetombs (wc, xbuf, QSE_COUNTOF(xbuf), 16, '0'); + PUT_SRCSTR (awk, QSE_T("\\x")); + wc = xbuf[0]; PUT_SRCSTRN (awk, &wc, 1); + wc = xbuf[1]; PUT_SRCSTRN (awk, &wc, 1); + } + #endif + break; + } + } + } + PUT_SRCSTR (awk, QSE_T("\"")); + break; + } + case QSE_AWK_NDE_REX: { PUT_SRCSTR (awk, QSE_T("/")); @@ -1317,6 +1387,13 @@ void qse_awk_clrpt (qse_awk_t* awk, qse_awk_nde_t* tree) break; } + case QSE_AWK_NDE_MBS: + { + QSE_AWK_FREE (awk, ((qse_awk_nde_mbs_t*)p)->ptr); + QSE_AWK_FREE (awk, p); + break; + } + case QSE_AWK_NDE_REX: { qse_awk_nde_rex_t* rex = (qse_awk_nde_rex_t*)p; diff --git a/qse/lib/awk/tree.h b/qse/lib/awk/tree.h index 48320907..a783a6a9 100644 --- a/qse/lib/awk/tree.h +++ b/qse/lib/awk/tree.h @@ -61,6 +61,7 @@ typedef struct qse_awk_nde_int_t qse_awk_nde_int_t; typedef struct qse_awk_nde_flt_t qse_awk_nde_flt_t; typedef struct qse_awk_nde_str_t qse_awk_nde_str_t; +typedef struct qse_awk_nde_mbs_t qse_awk_nde_mbs_t; typedef struct qse_awk_nde_rex_t qse_awk_nde_rex_t; typedef struct qse_awk_nde_var_t qse_awk_nde_var_t; typedef struct qse_awk_nde_fncall_t qse_awk_nde_fncall_t; @@ -156,6 +157,14 @@ struct qse_awk_nde_str_t qse_size_t len; }; +/* QSE_AWK_NDE_MBS */ +struct qse_awk_nde_mbs_t +{ + QSE_AWK_NDE_HDR; + qse_mchar_t* ptr; + qse_size_t len; +}; + /* QSE_AWK_NDE_REX */ struct qse_awk_nde_rex_t { diff --git a/qse/lib/awk/val.c b/qse/lib/awk/val.c index 83673c00..8c00d6f5 100644 --- a/qse/lib/awk/val.c +++ b/qse/lib/awk/val.c @@ -245,7 +245,7 @@ qse_awk_val_t* qse_awk_rtx_makestrvalwithwcs (qse_awk_rtx_t* rtx, const qse_wcha qse_awk_val_t* v; qse_mcstr_t tmp; - tmp.ptr = qse_wcstombsdup (wcs, &tmp.len, rtx->awk->mmgr); + tmp.ptr = qse_wcstombsdup(wcs, &tmp.len, rtx->awk->mmgr); if (tmp.ptr == QSE_NULL) { qse_awk_rtx_seterrnum (rtx, QSE_AWK_ENOMEM, QSE_NULL); @@ -398,26 +398,26 @@ qse_awk_val_t* qse_awk_rtx_makenstrvalwithxstr (qse_awk_rtx_t* rtx, const qse_cs return v; } -qse_awk_val_t* qse_awk_rtx_makebytearrval (qse_awk_rtx_t* rtx, const qse_uint8_t* ptr, qse_size_t len) +qse_awk_val_t* qse_awk_rtx_makembsval (qse_awk_rtx_t* rtx, const qse_mchar_t* ptr, qse_size_t len) { - qse_awk_val_bytearr_t* val = QSE_NULL; - qse_size_t xlen = len * QSE_SIZEOF(*ptr); + qse_awk_val_mbs_t* val = QSE_NULL; + qse_size_t xsz = len * QSE_SIZEOF(*ptr); - val = (qse_awk_val_bytearr_t*)QSE_AWK_ALLOC(rtx->awk, QSE_SIZEOF(qse_awk_val_bytearr_t) + xlen + QSE_SIZEOF(*ptr)); + val = (qse_awk_val_mbs_t*)QSE_AWK_ALLOC(rtx->awk, QSE_SIZEOF(qse_awk_val_mbs_t) + xsz + QSE_SIZEOF(*ptr)); if (val == QSE_NULL) { qse_awk_rtx_seterrnum (rtx, QSE_AWK_ENOMEM, QSE_NULL); return QSE_NULL; } - val->v_type = QSE_AWK_VAL_BYTEARR; + val->v_type = QSE_AWK_VAL_MBS; val->ref = 0; val->stat = 0; val->nstr = 0; val->val.len = len; - val->val.ptr = (qse_uint8_t*)(val + 1); - QSE_MEMCPY (val->val.ptr, ptr, xlen); - val->val.ptr[xlen] = 0; + val->val.ptr = (qse_mchar_t*)(val + 1); + QSE_MEMCPY (val->val.ptr, ptr, xsz); + val->val.ptr[len] = QSE_MT('\0'); return (qse_awk_val_t*)val; } @@ -812,7 +812,7 @@ void qse_awk_rtx_freeval (qse_awk_rtx_t* rtx, qse_awk_val_t* val, int cache) break; } - case QSE_AWK_VAL_BYTEARR: + case QSE_AWK_VAL_MBS: QSE_AWK_FREE (rtx->awk, val); break; @@ -976,8 +976,8 @@ int qse_awk_rtx_valtobool (qse_awk_rtx_t* rtx, const qse_awk_val_t* val) return ((qse_awk_val_flt_t*)val)->val != 0.0; case QSE_AWK_VAL_STR: return ((qse_awk_val_str_t*)val)->val.len > 0; - case QSE_AWK_VAL_BYTEARR: - return ((qse_awk_val_bytearr_t*)val)->val.len > 0; + case QSE_AWK_VAL_MBS: + return ((qse_awk_val_mbs_t*)val)->val.len > 0; case QSE_AWK_VAL_REX: /* TODO: is this correct? */ return ((qse_awk_val_rex_t*)val)->str.len > 0; case QSE_AWK_VAL_MAP: @@ -1085,7 +1085,7 @@ static int mbs_to_str (qse_awk_rtx_t* rtx, const qse_mchar_t* str, qse_size_t st mbslen = str_len; wcslen = out->u.cplcpy.len; - if (qse_mbsntowcsnallwithcmgr(str, &mbslen, out->u.cplcpy.ptr, &wcslen, rtx->cmgr) <= -1 || wcslen >= out->u.cplcpy.len) + if (qse_mbsntowcsnallwithcmgr(str, &mbslen, out->u.cplcpy.ptr, &wcslen, qse_findcmgrbyid(QSE_CMGR_MB8)) <= -1 || wcslen >= out->u.cplcpy.len) { qse_awk_rtx_seterrnum (rtx, QSE_AWK_EINVAL, QSE_NULL); /* TODO: change error code */ return -1; @@ -1103,7 +1103,7 @@ static int mbs_to_str (qse_awk_rtx_t* rtx, const qse_mchar_t* str, qse_size_t st qse_size_t mbslen, wcslen; mbslen = str_len; - tmp = qse_mbsntowcsalldupwithcmgr(str, &mbslen, &wcslen, rtx->awk->mmgr, rtx->cmgr); + tmp = qse_mbsntowcsalldupwithcmgr(str, &mbslen, &wcslen, rtx->awk->mmgr, qse_findcmgrbyid(QSE_CMGR_MB8)); if (!tmp) { qse_awk_rtx_seterrnum (rtx, QSE_AWK_ENOMEM, QSE_NULL); @@ -1120,7 +1120,7 @@ static int mbs_to_str (qse_awk_rtx_t* rtx, const qse_mchar_t* str, qse_size_t st qse_size_t n; qse_str_clear (out->u.strp); - n = qse_str_ncatmbs(out->u.strp, str, str_len); + n = qse_str_ncatmbs(out->u.strp, str, str_len, qse_findcmgrbyid(QSE_CMGR_MB8)); if (n == (qse_size_t)-1) { qse_awk_rtx_seterrnum (rtx, QSE_AWK_ENOMEM, QSE_NULL); @@ -1133,7 +1133,7 @@ static int mbs_to_str (qse_awk_rtx_t* rtx, const qse_mchar_t* str, qse_size_t st { qse_size_t n; - n = qse_str_ncatmbs(out->u.strpcat, str, str_len); + n = qse_str_ncatmbs(out->u.strpcat, str, str_len, qse_findcmgrbyid(QSE_CMGR_MB8)); if (n == (qse_size_t)-1) { qse_awk_rtx_seterrnum (rtx, QSE_AWK_ENOMEM, QSE_NULL); @@ -1469,9 +1469,9 @@ int qse_awk_rtx_valtostr ( return str_to_str(rtx, vs->val.ptr, vs->val.len, out); } - case QSE_AWK_VAL_BYTEARR: + case QSE_AWK_VAL_MBS: { - qse_awk_val_bytearr_t* vs = (qse_awk_val_bytearr_t*)v; + qse_awk_val_mbs_t* vs = (qse_awk_val_mbs_t*)v; #if defined(QSE_CHAR_IS_MCHAR) return str_to_str(rtx, vs->val.ptr, vs->val.len, out); #else @@ -1678,13 +1678,13 @@ int qse_awk_rtx_valtonum (qse_awk_rtx_t* rtx, const qse_awk_val_t* v, qse_awk_in ); } - case QSE_AWK_VAL_BYTEARR: + case QSE_AWK_VAL_MBS: { return qse_awk_rtx_mbstonum ( rtx, QSE_AWK_RTX_STRTONUM_MAKE_OPTION(0, 0), - ((qse_awk_val_bytearr_t*)v)->val.ptr, - ((qse_awk_val_bytearr_t*)v)->val.len, + ((qse_awk_val_mbs_t*)v)->val.ptr, + ((qse_awk_val_mbs_t*)v)->val.len, l, r ); } @@ -1823,9 +1823,9 @@ qse_awk_int_t qse_awk_rtx_hashval (qse_awk_rtx_t* rtx, qse_awk_val_t* v) break; } - case QSE_AWK_VAL_BYTEARR: + case QSE_AWK_VAL_MBS: { - qse_awk_val_bytearr_t* dv = (qse_awk_val_bytearr_t*)v; + qse_awk_val_mbs_t* dv = (qse_awk_val_mbs_t*)v; hv = (qse_awk_int_t)hash((qse_uint8_t*)dv->val.ptr, dv->val.len * QSE_SIZEOF(*dv->val.ptr)); break; } @@ -1886,13 +1886,13 @@ int qse_awk_rtx_setrefval (qse_awk_rtx_t* rtx, qse_awk_val_ref_t* ref, qse_awk_v return x; } - case QSE_AWK_VAL_BYTEARR: + case QSE_AWK_VAL_MBS: #if defined(QSE_CHAR_IS_MCHAR) { /* same as str in the mchar mode */ int x; qse_awk_rtx_refupval (rtx, val); - x = qse_awk_rtx_setrec(rtx, (qse_size_t)ref->adr, &((qse_awk_val_bytearr_t*)val)->val); + x = qse_awk_rtx_setrec(rtx, (qse_size_t)ref->adr, &((qse_awk_val_mbs_t*)val)->val); qse_awk_rtx_refdownval (rtx, val); return x; } @@ -2018,6 +2018,10 @@ void qse_awk_dprintval (qse_awk_rtx_t* run, qse_awk_val_t* val) qse_errputstrf (QSE_T("%s"), ((qse_awk_val_str_t*)val)->ptr); break; + case QSE_AWK_VAL_MBS: + qse_errputstrf (QSE_T("%hs"), ((qse_awk_val_mbs_t*)val)->ptr); + break; + case QSE_AWK_VAL_REX: qse_errputstrf (QSE_T("REX[%s]"), ((qse_awk_val_rex_t*)val)->ptr); break; diff --git a/qse/lib/cmn/fmt-out.h b/qse/lib/cmn/fmt-out.h index 307e566a..b2c94865 100644 --- a/qse/lib/cmn/fmt-out.h +++ b/qse/lib/cmn/fmt-out.h @@ -92,7 +92,6 @@ static char_t* sprintn (char_t* nbuf, qse_uintmax_t num, int base, int *lenp, in #undef PUT_CHAR #undef PUT_BYTE_IN_HEX -#undef BYTE_PRINTABLE #define PUT_CHAR(c) do { \ int xx; \ diff --git a/qse/lib/cmn/str-dyn.c b/qse/lib/cmn/str-dyn.c index ca0bf528..abd6255c 100644 --- a/qse/lib/cmn/str-dyn.c +++ b/qse/lib/cmn/str-dyn.c @@ -280,10 +280,9 @@ static int mbs_to_wcs ( #include "str-dyn.h" -qse_size_t qse_mbs_ncatwcs (qse_mbs_t* str, const qse_wchar_t* s, qse_size_t len) +qse_size_t qse_mbs_ncatwcs (qse_mbs_t* str, const qse_wchar_t* s, qse_size_t len, qse_cmgr_t* cmgr) { qse_size_t mbslen, wcslen; - qse_cmgr_t* cmgr = qse_getdflcmgr(); wcslen = len; if (qse_wcsntombsnwithcmgr(s, &wcslen, QSE_NULL, &mbslen, cmgr) <= -1) return (qse_size_t)-1; @@ -299,10 +298,9 @@ qse_size_t qse_mbs_ncatwcs (qse_mbs_t* str, const qse_wchar_t* s, qse_size_t len return str->val.len; } -qse_size_t qse_wcs_ncatmbs (qse_wcs_t* str, const qse_mchar_t* s, qse_size_t len) +qse_size_t qse_wcs_ncatmbs (qse_wcs_t* str, const qse_mchar_t* s, qse_size_t len, qse_cmgr_t* cmgr) { qse_size_t mbslen, wcslen; - qse_cmgr_t* cmgr = qse_getdflcmgr(); mbslen = len; if (qse_mbsntowcsnallwithcmgr(s, &mbslen, QSE_NULL, &wcslen, cmgr) <= -1) return (qse_size_t)-1;