From 164b85a6f94748611dec7bf21852eea5ccb1a49b Mon Sep 17 00:00:00 2001 From: hyung-hwan Date: Sat, 19 Jan 2013 16:21:32 +0000 Subject: [PATCH] changed the number of maximum digits after \x in an awk string --- qse/doc/page/awk-lang.md | 59 ++++++++++++++++++++++++++++------------ qse/lib/awk/parse.c | 17 ++++++------ 2 files changed, 50 insertions(+), 26 deletions(-) diff --git a/qse/doc/page/awk-lang.md b/qse/doc/page/awk-lang.md index 5c78276b..7db3f742 100644 --- a/qse/doc/page/awk-lang.md +++ b/qse/doc/page/awk-lang.md @@ -13,9 +13,9 @@ tranforms them to an internal form for execution. An QSEAWK program can be composed of the following elements at the top level. - - pattern-action blocks - *BEGIN* blocks - *END* blocks + - pattern-action blocks - user-defined functions - comments - \@global variables @@ -107,17 +107,17 @@ point with a preceeding number. 34.56e # 34.56 34.56E3 -An integer can be prefixed with 0x, 0, 0b for a hexa-decimal number, an octal number, -and a binary number respectively. For a hexa-decimal number, letters from A to F -can form a number case-insenstively in addition to numeric digits. +An integer can be prefixed with 0x, 0, 0b for a hexadecimal number, an octal +number, and a binary number respectively. For a hexadecimal number, letters +from A to F can form a number case-insensitively in addition to numeric digits. 0xA1 # 161 0xB0b0 # 45232 020 # 16 0b101 # 5 -If the prefix is not followed by any numeric digits, it is still a valid token and -represents the value of 0. +If the prefix is not followed by any numeric digits, it is still a valid token +and represents the value of 0. 0x # 0x0 but not desirable. 0b # 0b0 but not desirable. @@ -127,21 +127,44 @@ represents the value of 0. A string is enclosed in a pair of double quotes or single quotes. A character in a string encosed in the double-quotes, when preceded with -a back-slash, changes the meaning. 
- - \\ - - \a - - \b - - \uXXXX - - \UXXXXXXXX + - \\a - alert + - \\b - backspace + - \\f - formfeed + - \\n - newline + - \\r - carriage return + - \\t - horizontal tab + - \\v - vertical tab + - \\\\ - backslash + - \\" - double quote -You can use \\u and \\U in a string to specify a character by unicode if -[Character Type](@ref installation) chosen for building is the wide character -type. +You can specify a character with an octal number or a hexadecimal number. +The actual value can range between 0 and 255 inclusive. - BEGIN { - print "\uC720\uB2C8\uCF54\uB4DC \U00007D71\U00004E00\U000078BC"; - } + - \\OOO - O is an octal digit. + - \\xXX - X is a hexadecimal digit. + +In the octal sequence, you can specify up to 3 octal digits after \\; in the +hexadecimal sequence, you can specify any number of hexadecimal digits +after \\x. + +If the number doesn't fit in the range that the default character type +can represent, the character generated from the sequence is undefined. + +You can use \\u and \\U in a string to specify a character by a Unicode code +point if the [Character Type](@ref installation) chosen for building is the +wide character type. + + - \\uXXXX - X is a hexadecimal digit. + - \\UXXXXXXXX - X is a hexadecimal digit. + + +~~~~~{.awk} + BEGIN { + print "\uC720\uB2C8\uCF54\uB4DC \U00007D71\U00004E00\U000078BC"; + } +~~~~~ This program should print 유니코드 統一碼. 
diff --git a/qse/lib/awk/parse.c b/qse/lib/awk/parse.c index bf668d7d..1f4f16da 100644 --- a/qse/lib/awk/parse.c +++ b/qse/lib/awk/parse.c @@ -5415,11 +5415,11 @@ static int get_number (qse_awk_t* awk, qse_awk_tok_t* tok) static int get_string ( qse_awk_t* awk, qse_char_t end_char, qse_char_t esc_char, int keep_esc_char, - int preescaped, qse_awk_tok_t* tok) + qse_size_t preescaped, qse_awk_tok_t* tok) { qse_cint_t c; - int escaped = preescaped; - int digit_count = 0; + qse_size_t escaped = preescaped; + qse_size_t digit_count = 0; qse_cint_t c_acc = 0; while (1) @@ -5440,6 +5440,8 @@ static int get_string ( digit_count++; if (digit_count >= escaped) { + /* should i limit the max to 0xFF/0377? + * if (c_acc > 0377) c_acc = 0377;*/ ADD_TOKEN_CHAR (awk, tok, c_acc); escaped = 0; } @@ -5451,7 +5453,7 @@ static int get_string ( escaped = 0; } } - else if (escaped == 2 || escaped == 4 || escaped == 8) + else if (escaped == QSE_TYPE_MAX(qse_size_t) || escaped == 4 || escaped == 8) { if (c >= QSE_T('0') && c <= QSE_T('9')) { @@ -5490,9 +5492,8 @@ static int get_string ( { qse_char_t rc; - rc = (escaped == 2)? QSE_T('x'): + rc = (escaped == QSE_TYPE_MAX(qse_size_t))? QSE_T('x'): (escaped == 4)? QSE_T('u'): QSE_T('U'); - if (digit_count == 0) ADD_TOKEN_CHAR (awk, tok, rc); else ADD_TOKEN_CHAR (awk, tok, c_acc); @@ -5533,12 +5534,12 @@ static int get_string ( } else if (c == QSE_T('x')) { - escaped = 2; + escaped = QSE_TYPE_MAX(qse_size_t); digit_count = 0; c_acc = 0; continue; } - #ifdef QSE_CHAR_IS_WCHAR + #if defined(QSE_CHAR_IS_WCHAR) else if (c == QSE_T('u') && QSE_SIZEOF(qse_char_t) >= 2) { escaped = 4;