enhanced hawk_unescape_bcstr() to convert \uXXXX and \UXXXXXXXX to utf8 sequences
This commit is contained in:
		| @ -6140,32 +6140,36 @@ static int get_string ( | |||||||
| 			} | 			} | ||||||
| 			else | 			else | ||||||
| 			{ | 			{ | ||||||
| 				hawk_ooch_t rc; |  | ||||||
| 				 | 				 | ||||||
| 				rc = (escaped == HEX_DIGIT_LIMIT_FOR_X)? HAWK_T('x'): |  | ||||||
| 				     (escaped == 4)? HAWK_T('u'): HAWK_T('U'); |  | ||||||
| 				if (digit_count == 0)  | 				if (digit_count == 0)  | ||||||
| 				{ | 				{ | ||||||
|  | 					hawk_ooch_t ec; | ||||||
|  |  | ||||||
|  | 					ec = (escaped == HEX_DIGIT_LIMIT_FOR_X)? HAWK_T('x'): | ||||||
|  | 					     (escaped == 4)? HAWK_T('u'): HAWK_T('U'); | ||||||
|  |  | ||||||
| 					/* no valid character after the escaper. | 					/* no valid character after the escaper. | ||||||
| 					 * keep the escaper as it is. consider this input: | 					 * keep the escaper as it is. consider this input: | ||||||
| 					 *   \xGG | 					 *   \xGG | ||||||
| 					 * 'c' is at the first G. this part is to restore the | 					 * 'c' is at the first G. this part is to restore the | ||||||
| 					 * \x part. since \x is not followed by any hexadecimal | 					 * \x part. since \x is not followed by any hexadecimal | ||||||
| 					 * digits, it's literally 'x' */ | 					 * digits, it's literally 'x' */ | ||||||
| 					ADD_TOKEN_CHAR (awk, tok, rc); | 					ADD_TOKEN_CHAR (awk, tok, ec); | ||||||
| 				} | 				} | ||||||
| 				else ADD_TOKEN_UINT32 (awk, tok, c_acc); | 				else ADD_TOKEN_UINT32 (awk, tok, c_acc); | ||||||
|  |  | ||||||
| 				escaped = 0; | 				escaped = 0; | ||||||
|  | 				/* carry on to handle the current character  */ | ||||||
| 			} | 			} | ||||||
| 		} | 		} | ||||||
|  | 		else if (escaped == 99) | ||||||
| 		if (escaped == 99) |  | ||||||
| 		{ | 		{ | ||||||
| 			escaped = 0; | 			escaped = 0; | ||||||
| 			if (c == '\n') continue; /* backslash \r \n */ | 			if (c == '\n') continue; /* backslash \r \n */ | ||||||
| 		} | 		} | ||||||
|  |  | ||||||
|  | 		/* -------------------------------------- */ | ||||||
|  |  | ||||||
| 		if (escaped == 0) | 		if (escaped == 0) | ||||||
| 		{ | 		{ | ||||||
| 			if (c == end_char) | 			if (c == end_char) | ||||||
| @ -6180,14 +6184,13 @@ static int get_string ( | |||||||
| 				escaped = 1; | 				escaped = 1; | ||||||
| 				continue; | 				continue; | ||||||
| 			} | 			} | ||||||
| 			else if (!(awk->parse.pragma.trait & HAWK_MULTILINESTR) && c == '\n' || c == '\r') | 			else if (!(awk->parse.pragma.trait & HAWK_MULTILINESTR) && (c == '\n' || c == '\r')) | ||||||
| 			{ | 			{ | ||||||
| 				hawk_seterrnum (awk, &awk->tok.loc, HAWK_ESTRNC); | 				hawk_seterrnum (awk, &awk->tok.loc, HAWK_ESTRNC); | ||||||
| 				return -1; | 				return -1; | ||||||
| 			} | 			} | ||||||
| 		} | 		} | ||||||
|  | 		else if (escaped == 1) | ||||||
| 		if (escaped == 1) |  | ||||||
| 		{ | 		{ | ||||||
| 			if (c == '\n') | 			if (c == '\n') | ||||||
| 			{ | 			{ | ||||||
|  | |||||||
| @ -1838,7 +1838,8 @@ exit_loop: | |||||||
|  |  | ||||||
| void hawk_unescape_ucstr (hawk_uch_t* str) | void hawk_unescape_ucstr (hawk_uch_t* str) | ||||||
| { | { | ||||||
| 	hawk_uch_t c, c_acc, * p1, * p2; | 	hawk_uch_t c, * p1, * p2; | ||||||
|  | 	hawk_uint32_t c_acc; | ||||||
| 	int escaped = 0, digit_count; | 	int escaped = 0, digit_count; | ||||||
|  |  | ||||||
| 	p1 = str; | 	p1 = str; | ||||||
| @ -1866,6 +1867,7 @@ void hawk_unescape_ucstr (hawk_uch_t* str) | |||||||
| 			{ | 			{ | ||||||
| 				escaped = 0; | 				escaped = 0; | ||||||
| 				*p2++ = c_acc; | 				*p2++ = c_acc; | ||||||
|  | 				goto normal_char; | ||||||
| 			} | 			} | ||||||
| 		} | 		} | ||||||
| 		else if (escaped == 2 || escaped == 4 || escaped == 8) | 		else if (escaped == 2 || escaped == 4 || escaped == 8) | ||||||
| @ -1906,10 +1908,6 @@ void hawk_unescape_ucstr (hawk_uch_t* str) | |||||||
| 			} | 			} | ||||||
| 			else | 			else | ||||||
| 			{ | 			{ | ||||||
| 				hawk_uch_t rc; |  | ||||||
|  |  | ||||||
| 				rc = (escaped == 2)? 'x': |  | ||||||
| 				     (escaped == 4)? 'u': 'U'; |  | ||||||
| 				if (digit_count == 0)  | 				if (digit_count == 0)  | ||||||
| 				{ | 				{ | ||||||
| 					/* no valid character after the escaper. | 					/* no valid character after the escaper. | ||||||
| @ -1918,11 +1916,13 @@ void hawk_unescape_ucstr (hawk_uch_t* str) | |||||||
| 					 * 'c' is at the first G. this part is to restore the | 					 * 'c' is at the first G. this part is to restore the | ||||||
| 					 * \x part. since \x is not followed by any hexadecimal | 					 * \x part. since \x is not followed by any hexadecimal | ||||||
| 					 * digits, it's literally 'x' */ | 					 * digits, it's literally 'x' */ | ||||||
| 					*p2++ = rc; | 					*p2++ = (escaped == 2)? 'x': | ||||||
|  | 				             (escaped == 4)? 'u': 'U'; | ||||||
| 				} | 				} | ||||||
| 				else *p2++ = c_acc; | 				else *p2++ = c_acc; | ||||||
|  |  | ||||||
| 				escaped = 0; | 				escaped = 0; | ||||||
|  | 				goto normal_char; | ||||||
| 			} | 			} | ||||||
| 		} | 		} | ||||||
|  |  | ||||||
| @ -1969,6 +1969,7 @@ void hawk_unescape_ucstr (hawk_uch_t* str) | |||||||
| 			continue; | 			continue; | ||||||
| 		} | 		} | ||||||
|  |  | ||||||
|  | 	normal_char: | ||||||
| 		if (c == '\\')  | 		if (c == '\\')  | ||||||
| 		{ | 		{ | ||||||
| 			escaped = 1; | 			escaped = 1; | ||||||
| @ -1985,8 +1986,12 @@ void hawk_unescape_ucstr (hawk_uch_t* str) | |||||||
|  |  | ||||||
| void hawk_unescape_bcstr (hawk_bch_t* str) | void hawk_unescape_bcstr (hawk_bch_t* str) | ||||||
| { | { | ||||||
| 	hawk_bch_t c, c_acc, * p1, * p2; | 	hawk_bch_t c, * p1, * p2; | ||||||
|  | 	hawk_uint32_t c_acc; | ||||||
| 	int escaped = 0, digit_count; | 	int escaped = 0, digit_count; | ||||||
|  | 	hawk_cmgr_t* utf8_cmgr; | ||||||
|  |  | ||||||
|  | 	utf8_cmgr = hawk_get_cmgr_by_id(HAWK_CMGR_UTF8); | ||||||
|  |  | ||||||
| 	p1 = str; | 	p1 = str; | ||||||
| 	p2 = str; | 	p2 = str; | ||||||
| @ -2013,6 +2018,7 @@ void hawk_unescape_bcstr (hawk_bch_t* str) | |||||||
| 			{ | 			{ | ||||||
| 				escaped = 0; | 				escaped = 0; | ||||||
| 				*p2++ = c_acc; | 				*p2++ = c_acc; | ||||||
|  | 				goto normal_char; | ||||||
| 			} | 			} | ||||||
| 		} | 		} | ||||||
| 		else if (escaped == 2 || escaped == 4 || escaped == 8) | 		else if (escaped == 2 || escaped == 4 || escaped == 8) | ||||||
| @ -2024,7 +2030,8 @@ void hawk_unescape_bcstr (hawk_bch_t* str) | |||||||
| 				digit_count++; | 				digit_count++; | ||||||
| 				if (digit_count >= escaped)  | 				if (digit_count >= escaped)  | ||||||
| 				{ | 				{ | ||||||
| 					*p2++ = c_acc; | 					if (escaped == 2) *p2++ = c_acc; | ||||||
|  | 					else p2 += utf8_cmgr->uctobc(c_acc, p2, HAWK_TYPE_MAX(hawk_oow_t)); | ||||||
| 					escaped = 0; | 					escaped = 0; | ||||||
| 				} | 				} | ||||||
| 				continue; | 				continue; | ||||||
| @ -2035,7 +2042,8 @@ void hawk_unescape_bcstr (hawk_bch_t* str) | |||||||
| 				digit_count++; | 				digit_count++; | ||||||
| 				if (digit_count >= escaped)  | 				if (digit_count >= escaped)  | ||||||
| 				{ | 				{ | ||||||
| 					*p2++ = c_acc; | 					if (escaped == 2) *p2++ = c_acc; | ||||||
|  | 					else p2 += utf8_cmgr->uctobc(c_acc, p2, HAWK_TYPE_MAX(hawk_oow_t)); | ||||||
| 					escaped = 0; | 					escaped = 0; | ||||||
| 				} | 				} | ||||||
| 				continue; | 				continue; | ||||||
| @ -2046,17 +2054,16 @@ void hawk_unescape_bcstr (hawk_bch_t* str) | |||||||
| 				digit_count++; | 				digit_count++; | ||||||
| 				if (digit_count >= escaped)  | 				if (digit_count >= escaped)  | ||||||
| 				{ | 				{ | ||||||
| 					*p2++ = c_acc; | 					if (escaped == 2) *p2++ = c_acc; | ||||||
|  | 					else p2 += utf8_cmgr->uctobc(c_acc, p2, HAWK_TYPE_MAX(hawk_oow_t)); | ||||||
| 					escaped = 0; | 					escaped = 0; | ||||||
| 				} | 				} | ||||||
| 				continue; | 				continue; | ||||||
| 			} | 			} | ||||||
| 			else | 			else | ||||||
| 			{ | 			{ | ||||||
| 				hawk_bch_t rc; | 				/* non digit or xdigit */ | ||||||
| 				 | 				 | ||||||
| 				rc = (escaped == 2)? 'x': |  | ||||||
| 				     (escaped == 4)? 'u': 'U'; |  | ||||||
| 				if (digit_count == 0)  | 				if (digit_count == 0)  | ||||||
| 				{ | 				{ | ||||||
| 					/* no valid character after the escaper. | 					/* no valid character after the escaper. | ||||||
| @ -2065,11 +2072,19 @@ void hawk_unescape_bcstr (hawk_bch_t* str) | |||||||
| 					 * 'c' is at the first G. this part is to restore the | 					 * 'c' is at the first G. this part is to restore the | ||||||
| 					 * \x part. since \x is not followed by any hexadecimal | 					 * \x part. since \x is not followed by any hexadecimal | ||||||
| 					 * digits, it's literally 'x' */ | 					 * digits, it's literally 'x' */ | ||||||
| 					*p2++ = rc; | 					*p2++ = (escaped == 2)? 'x': | ||||||
|  | 					        (escaped == 4)? 'u': 'U'; | ||||||
|  | 				} | ||||||
|  | 				else  | ||||||
|  | 				{ | ||||||
|  | 					/* for a unicode character, the utf8 conversion can never outgrow the hexadecimal notation, including the escaper, that it replaces in the input string. | ||||||
|  | 					 * so it must be safe to specify a very large buffer size to uctobc() */ | ||||||
|  | 					if (escaped == 2) *p2++ = c_acc; | ||||||
|  | 					else p2 += utf8_cmgr->uctobc(c_acc, p2, HAWK_TYPE_MAX(hawk_oow_t)); | ||||||
| 				} | 				} | ||||||
| 				else *p2++ = c_acc; |  | ||||||
|  |  | ||||||
| 				escaped = 0; | 				escaped = 0; | ||||||
|  | 				goto normal_char; | ||||||
| 			} | 			} | ||||||
| 		} | 		} | ||||||
|  |  | ||||||
| @ -2098,8 +2113,6 @@ void hawk_unescape_bcstr (hawk_bch_t* str) | |||||||
| 					c_acc = 0; | 					c_acc = 0; | ||||||
| 					continue; | 					continue; | ||||||
|  |  | ||||||
| 			#if 0 |  | ||||||
| 				/* don't support \u and \U in byte string. */ |  | ||||||
| 				case 'u': | 				case 'u': | ||||||
| 					escaped = 4; | 					escaped = 4; | ||||||
| 					digit_count = 0; | 					digit_count = 0; | ||||||
| @ -2111,7 +2124,6 @@ void hawk_unescape_bcstr (hawk_bch_t* str) | |||||||
| 					digit_count = 0; | 					digit_count = 0; | ||||||
| 					c_acc = 0; | 					c_acc = 0; | ||||||
| 					continue; | 					continue; | ||||||
| 			#endif |  | ||||||
| 				} | 				} | ||||||
|  |  | ||||||
| 			*p2++ = c; | 			*p2++ = c; | ||||||
| @ -2119,6 +2131,7 @@ void hawk_unescape_bcstr (hawk_bch_t* str) | |||||||
| 			continue; | 			continue; | ||||||
| 		} | 		} | ||||||
|  |  | ||||||
|  | 	normal_char: | ||||||
| 		if (c == '\\')  | 		if (c == '\\')  | ||||||
| 		{ | 		{ | ||||||
| 			escaped = 1; | 			escaped = 1; | ||||||
|  | |||||||
		Reference in New Issue
	
	Block a user