From f2316d1d3ee24b0ae5ca472ddb155aa14563c0a2 Mon Sep 17 00:00:00 2001
From: hyung-hwan
Date: Fri, 13 Mar 2020 16:00:21 +0000
Subject: [PATCH] hawk_unescape_bcstr()/hawk_unescape_ucstr()

---
/*
 * NOTE(review): annotations only. no header, hunk line or count in this
 * patch is altered. this text sits between the three-dash line and the
 * diffstat, the free-form notes region of a format-patch mail.
 *
 * layout: the line breaks were re-derived from the +/- markers and agree
 * with every hunk header and with the diffstat totals. the tab indentation
 * and the position of blank context lines are a best-effort reconstruction
 * from a whitespace-collapsed copy. TODO confirm against commit f2316d1d
 * before applying.
 *
 * what the patch does:
 *  - utl-str.c adds hawk_unescape_ucstr() and hawk_unescape_bcstr(). each
 *    walks a NUL-terminated string with a read pointer (p1) and a write
 *    pointer (p2) that both start at 'str', so the result overwrites the
 *    input and is terminated by the final "*p2 = '\0'".
 *    handled after a backslash: n r t f b v a, an octal value of up to 3
 *    digits, x followed by up to 2 hex digits, and, in the ucstr variant
 *    only, u (up to 4 hex digits) and U (up to 8). in the bcstr variant the
 *    u/U cases are compiled out with #if 0, so they fall through to the
 *    generic path. any other character after a backslash is copied without
 *    the backslash. x/u/U followed by no hex digit is written as the plain
 *    letter.
 *  - hawk-utl.h declares both functions and maps hawk_unescape_oocstr to
 *    the ucstr or the bcstr variant depending on HAWK_OOCH_IS_UCH.
 *  - check_var_assign() in HawkStd.cpp and std.c now duplicates the whole
 *    "name=value" argument, overwrites the '=' in the copy with '\0' and,
 *    when the name part passes hawk_isvalidident(), unescapes the value
 *    part in the copy before calling hawk_rtx_setgbltostrbyname().
 *    it returns 0 when there is no '=' after at least one character or the
 *    name is not a valid identifier, -1 when the duplication or the
 *    assignment fails, and 1 otherwise.
 *  - val.c wraps the str1 test in HAWK_LIKELY() and drops one blank line.
 *
 * open questions for the author:
 *  - both unescape loops write a pending octal/hex value only when a later
 *    character is read. if the string ends in the middle of a sequence
 *    (for example a value ending in \12 or \x4), the loop exits with
 *    'escaped' still set and c_acc is never stored, so the value is lost.
 *    a trailing lone backslash is dropped the same way. this looks
 *    unintended for short octal/hex escapes. please confirm.
 *  - in hawk_unescape_bcstr() c_acc has type hawk_bch_t while three octal
 *    digits can encode values up to 0777. the in-code comment already asks
 *    whether to clamp to 0377. the width of hawk_bch_t is not visible in
 *    this patch. please confirm the intended behaviour.
 */
 hawk/lib/HawkStd.cpp |  23 +++-
 hawk/lib/hawk-utl.h  |  12 ++
 hawk/lib/std.c       |  23 +++-
 hawk/lib/utl-str.c   | 297 +++++++++++++++++++++++++++++++++++++++++++
 hawk/lib/val.c       |   3 +-
 5 files changed, 344 insertions(+), 14 deletions(-)

diff --git a/hawk/lib/HawkStd.cpp b/hawk/lib/HawkStd.cpp
index 3a07ada9..9f91bbd0 100644
--- a/hawk/lib/HawkStd.cpp
+++ b/hawk/lib/HawkStd.cpp
@@ -845,18 +845,29 @@ void HawkStd::clearConsoleOutputs ()
 
 static int check_var_assign (hawk_rtx_t* rtx, const hawk_ooch_t* str)
 {
-	hawk_ooch_t* eq, * var;
+	hawk_ooch_t* eq, * dstr;
 	int n;
 
 	eq = hawk_find_oochar_in_oocstr(str, '=');
 	if (!eq || eq <= str) return 0; /* not assignment */
 
-	var = hawk_rtx_dupoochars(rtx, str, eq - str);
-	if (HAWK_UNLIKELY(!var)) return -1;
+	dstr = hawk_rtx_dupoocstr(rtx, str, HAWK_NULL);
+	if (HAWK_UNLIKELY(!dstr)) return -1;
 
-	n = hawk_isvalidident(hawk_rtx_gethawk(rtx), var)?
-		((hawk_rtx_setgbltostrbyname(rtx, var, eq + 1) <= -1)? -1: 1): 0;
-	hawk_rtx_freemem (rtx, var);
+	eq = dstr + (eq - str);
+	*eq = '\0';
+
+	if (hawk_isvalidident(hawk_rtx_gethawk(rtx), dstr))
+	{
+		hawk_unescape_oocstr (eq + 1);
+		n = (hawk_rtx_setgbltostrbyname(rtx, dstr, eq + 1) <= -1)? -1: 1;
+	}
+	else
+	{
+		n = 0;
+	}
+
+	hawk_rtx_freemem (rtx, dstr);
 	return n;
 }
 
diff --git a/hawk/lib/hawk-utl.h b/hawk/lib/hawk-utl.h
index 8e21fb3a..e243ada8 100644
--- a/hawk/lib/hawk-utl.h
+++ b/hawk/lib/hawk-utl.h
@@ -682,6 +682,16 @@ HAWK_EXPORT hawk_bch_t* hawk_tokenize_bchars (
 	int ignorecase
 );
 
+
+HAWK_EXPORT void hawk_unescape_ucstr (
+	hawk_uch_t* str
+);
+
+HAWK_EXPORT void hawk_unescape_bcstr (
+	hawk_bch_t* str
+);
+
+
 #if defined(HAWK_OOCH_IS_UCH)
 # define hawk_equal_oochars hawk_equal_uchars
 # define hawk_comp_oochars hawk_comp_uchars
@@ -720,6 +730,7 @@ HAWK_EXPORT hawk_bch_t* hawk_tokenize_bchars (
 
 # define hawk_split_oocstr hawk_split_ucstr
 # define hawk_tokenize_oochars hawk_tokenize_uchars
+# define hawk_unescape_oocstr hawk_unescape_ucstr
 #else
 # define hawk_equal_oochars hawk_equal_bchars
 # define hawk_comp_oochars hawk_comp_bchars
@@ -758,6 +769,7 @@ HAWK_EXPORT hawk_bch_t* hawk_tokenize_bchars (
 
 # define hawk_split_oocstr hawk_split_bcstr
 # define hawk_tokenize_oochars hawk_tokenize_bchars
+# define hawk_unescape_oocstr hawk_unescape_bcstr
 #endif
 
 /* ------------------------------------------------------------------------- */
diff --git a/hawk/lib/std.c b/hawk/lib/std.c
index 21824a0f..8bc5b9cc 100644
--- a/hawk/lib/std.c
+++ b/hawk/lib/std.c
@@ -1673,18 +1673,29 @@ int hawk_parsestd (hawk_t* awk, hawk_parsestd_t in[], hawk_parsestd_t* out)
 
 static int check_var_assign (hawk_rtx_t* rtx, const hawk_ooch_t* str)
 {
-	hawk_ooch_t* eq, * var;
+	hawk_ooch_t* eq, * dstr;
 	int n;
 
 	eq = hawk_find_oochar_in_oocstr(str, '=');
 	if (!eq || eq <= str) return 0; /* not assignment */
 
-	var = hawk_rtx_dupoochars(rtx, str, eq - str);
-	if (HAWK_UNLIKELY(!var)) return -1;
+	dstr = hawk_rtx_dupoocstr(rtx, str, HAWK_NULL);
+	if (HAWK_UNLIKELY(!dstr)) return -1;
 
-	n = hawk_isvalidident(hawk_rtx_gethawk(rtx), var)?
-		((hawk_rtx_setgbltostrbyname(rtx, var, eq + 1) <= -1)? -1: 1): 0;
-	hawk_rtx_freemem (rtx, var);
+	eq = dstr + (eq - str);
+	*eq = '\0';
+
+	if (hawk_isvalidident(hawk_rtx_gethawk(rtx), dstr))
+	{
+		hawk_unescape_oocstr (eq + 1);
+		n = (hawk_rtx_setgbltostrbyname(rtx, dstr, eq + 1) <= -1)? -1: 1;
+	}
+	else
+	{
+		n = 0;
+	}
+
+	hawk_rtx_freemem (rtx, dstr);
 	return n;
 }
 
diff --git a/hawk/lib/utl-str.c b/hawk/lib/utl-str.c
index 931ff4dd..6461f54a 100644
--- a/hawk/lib/utl-str.c
+++ b/hawk/lib/utl-str.c
@@ -1836,6 +1836,303 @@ exit_loop:
 
 /* ------------------------------------------------------------------------ */
 
+void hawk_unescape_ucstr (hawk_uch_t* str)
+{
+	hawk_uch_t c, c_acc, * p1, * p2;
+	int escaped = 0, digit_count;
+
+	p1 = str;
+	p2 = str;
+	while ((c = *p1++) != '\0')
+	{
+		if (escaped == 3)
+		{
+			/* octal */
+			if (c >= '0' && c <= '7')
+			{
+				c_acc = c_acc * 8 + c - '0';
+				digit_count++;
+
+				if (digit_count >= escaped)
+				{
+					/* should i limit the max to 0xFF/0377?
+					if (c_acc > 0377) c_acc = 0377; */
+					escaped = 0;
+					*p2++ = c_acc;
+				}
+				continue;
+			}
+			else
+			{
+				escaped = 0;
+				*p2++ = c_acc;
+			}
+		}
+		else if (escaped == 2 || escaped == 4 || escaped == 8)
+		{
+			/* hexadecimal */
+			if (c >= '0' && c <= '9')
+			{
+				c_acc = c_acc * 16 + c - '0';
+				digit_count++;
+				if (digit_count >= escaped)
+				{
+					*p2++ = c_acc;
+					escaped = 0;
+				}
+				continue;
+			}
+			else if (c >= 'A' && c <= 'F')
+			{
+				c_acc = c_acc * 16 + c - 'A' + 10;
+				digit_count++;
+				if (digit_count >= escaped)
+				{
+					*p2++ = c_acc;
+					escaped = 0;
+				}
+				continue;
+			}
+			else if (c >= 'a' && c <= 'f')
+			{
+				c_acc = c_acc * 16 + c - 'a' + 10;
+				digit_count++;
+				if (digit_count >= escaped)
+				{
+					*p2++ = c_acc;
+					escaped = 0;
+				}
+				continue;
+			}
+			else
+			{
+				hawk_uch_t rc;
+
+				rc = (escaped == 2)? 'x':
+				     (escaped == 4)? 'u': 'U';
+				if (digit_count == 0)
+				{
+					/* no valid character after the escaper.
+					 * keep the escaper as it is. consider this input:
+					 * \xGG
+					 * 'c' is at the first G. this part is to restore the
+					 * \x part. since \x is not followed by any hexadecimal
+					 * digits, it's literally 'x' */
+					*p2++ = rc;
+				}
+				else *p2++ = c_acc;
+
+				escaped = 0;
+			}
+		}
+
+		if (escaped == 1)
+		{
+			switch (c)
+			{
+				case 'n': c = '\n'; break;
+				case 'r': c = '\r'; break;
+				case 't': c = '\t'; break;
+				case 'f': c = '\f'; break;
+				case 'b': c = '\b'; break;
+				case 'v': c = '\v'; break;
+				case 'a': c = '\a'; break;
+
+				case '0': case '1': case '2': case '3':
+				case '4': case '5': case '6': case '7':
+					escaped = 3;
+					digit_count = 1;
+					c_acc = c - '0';
+					continue;
+
+				case 'x':
+					escaped = 2;
+					digit_count = 0;
+					c_acc = 0;
+					continue;
+
+				case 'u':
+					escaped = 4;
+					digit_count = 0;
+					c_acc = 0;
+					continue;
+
+				case 'U':
+					escaped = 8;
+					digit_count = 0;
+					c_acc = 0;
+					continue;
+			}
+
+			*p2++ = c;
+			escaped = 0;
+			continue;
+		}
+
+		if (c == '\\')
+		{
+			escaped = 1;
+			continue;
+		}
+
+		*p2++ = c;
+	}
+
+	*p2 = '\0';
+}
+
+/* ------------------------------------------------------------------------ */
+
+void hawk_unescape_bcstr (hawk_bch_t* str)
+{
+	hawk_bch_t c, c_acc, * p1, * p2;
+	int escaped = 0, digit_count;
+
+	p1 = str;
+	p2 = str;
+	while ((c = *p1++) != '\0')
+	{
+		if (escaped == 3)
+		{
+			/* octal */
+			if (c >= '0' && c <= '7')
+			{
+				c_acc = c_acc * 8 + c - '0';
+				digit_count++;
+
+				if (digit_count >= escaped)
+				{
+					/* should i limit the max to 0xFF/0377?
+					if (c_acc > 0377) c_acc = 0377; */
+					escaped = 0;
+					*p2++ = c_acc;
+				}
+				continue;
+			}
+			else
+			{
+				escaped = 0;
+				*p2++ = c_acc;
+			}
+		}
+		else if (escaped == 2 || escaped == 4 || escaped == 8)
+		{
+			/* hexadecimal */
+			if (c >= '0' && c <= '9')
+			{
+				c_acc = c_acc * 16 + c - '0';
+				digit_count++;
+				if (digit_count >= escaped)
+				{
+					*p2++ = c_acc;
+					escaped = 0;
+				}
+				continue;
+			}
+			else if (c >= 'A' && c <= 'F')
+			{
+				c_acc = c_acc * 16 + c - 'A' + 10;
+				digit_count++;
+				if (digit_count >= escaped)
+				{
+					*p2++ = c_acc;
+					escaped = 0;
+				}
+				continue;
+			}
+			else if (c >= 'a' && c <= 'f')
+			{
+				c_acc = c_acc * 16 + c - 'a' + 10;
+				digit_count++;
+				if (digit_count >= escaped)
+				{
+					*p2++ = c_acc;
+					escaped = 0;
+				}
+				continue;
+			}
+			else
+			{
+				hawk_bch_t rc;
+
+				rc = (escaped == 2)? 'x':
+				     (escaped == 4)? 'u': 'U';
+				if (digit_count == 0)
+				{
+					/* no valid character after the escaper.
+					 * keep the escaper as it is. consider this input:
+					 * \xGG
+					 * 'c' is at the first G. this part is to restore the
+					 * \x part. since \x is not followed by any hexadecimal
+					 * digits, it's literally 'x' */
+					*p2++ = rc;
+				}
+				else *p2++ = c_acc;
+
+				escaped = 0;
+			}
+		}
+
+		if (escaped == 1)
+		{
+			switch (c)
+			{
+				case 'n': c = '\n'; break;
+				case 'r': c = '\r'; break;
+				case 't': c = '\t'; break;
+				case 'f': c = '\f'; break;
+				case 'b': c = '\b'; break;
+				case 'v': c = '\v'; break;
+				case 'a': c = '\a'; break;
+
+				case '0': case '1': case '2': case '3':
+				case '4': case '5': case '6': case '7':
+					escaped = 3;
+					digit_count = 1;
+					c_acc = c - '0';
+					continue;
+
+				case 'x':
+					escaped = 2;
+					digit_count = 0;
+					c_acc = 0;
+					continue;
+
+			#if 0
+				/* don't support \u and \U in byte string. */
+				case 'u':
+					escaped = 4;
+					digit_count = 0;
+					c_acc = 0;
+					continue;
+
+				case 'U':
+					escaped = 8;
+					digit_count = 0;
+					c_acc = 0;
+					continue;
+			#endif
+			}
+
+			*p2++ = c;
+			escaped = 0;
+			continue;
+		}
+
+		if (c == '\\')
+		{
+			escaped = 1;
+			continue;
+		}
+
+		*p2++ = c;
+	}
+
+	*p2 = '\0';
+}
+
+/* ------------------------------------------------------------------------ */
+
 hawk_oow_t hawk_int_to_oocstr (hawk_int_t value, int radix, const hawk_ooch_t* prefix, hawk_ooch_t* buf, hawk_oow_t size)
 {
 	hawk_int_t t, rem;
diff --git a/hawk/lib/val.c b/hawk/lib/val.c
index cdd5a209..7430b0ca 100644
--- a/hawk/lib/val.c
+++ b/hawk/lib/val.c
@@ -195,7 +195,7 @@ init:
 	val->fcb = 0;
 	val->val.len = len1 + len2;
 	val->val.ptr = (hawk_ooch_t*)(val + 1);
-	if (str1) hawk_copy_oochars_to_oocstr_unlimited (&val->val.ptr[0], str1, len1);
+	if (HAWK_LIKELY(str1)) hawk_copy_oochars_to_oocstr_unlimited (&val->val.ptr[0], str1, len1);
 	if (str2) hawk_copy_oochars_to_oocstr_unlimited (&val->val.ptr[len1], str2, len2);
 	val->val.ptr[val->val.len] = '\0';
 
@@ -205,7 +205,6 @@ init:
 	return (hawk_val_t*)val;
 }
 
-
 hawk_val_t* hawk_rtx_makestrvalwithuchars (hawk_rtx_t* rtx, const hawk_uch_t* ucs, hawk_oow_t len)
 {
 #if defined(HAWK_OOCH_IS_UCH)