Enhanced the JSON reader to handle Unicode surrogate pairs
This commit is contained in:
		| @ -39,7 +39,7 @@ static void clear_token (mio_json_t* json) | ||||
| 	if (json->tok_capa > 0) json->tok.ptr[json->tok.len] = '\0'; | ||||
| } | ||||
|  | ||||
| static int add_char_to_token (mio_json_t* json, mio_ooch_t ch) | ||||
| static int add_char_to_token (mio_json_t* json, mio_ooch_t ch, int handle_surrogate_pair) | ||||
| { | ||||
| 	if (json->tok.len >= json->tok_capa) | ||||
| 	{ | ||||
| @ -48,12 +48,34 @@ static int add_char_to_token (mio_json_t* json, mio_ooch_t ch) | ||||
|  | ||||
| 		newcapa = MIO_ALIGN_POW2(json->tok.len + 2, MIO_JSON_TOKEN_NAME_ALIGN);  /* +2 here because of -1 when setting newcapa */ | ||||
| 		tmp = (mio_ooch_t*)mio_reallocmem(json->mio, json->tok.ptr, newcapa * MIO_SIZEOF(*tmp)); | ||||
| 		if (!tmp) return -1; | ||||
| 		if (MIO_UNLIKELY(!tmp)) return -1; | ||||
|  | ||||
| 		json->tok_capa = newcapa - 1; /* -1 to secure space for terminating null */ | ||||
| 		json->tok.ptr = tmp; | ||||
| 	} | ||||
|  | ||||
| #if (MIO_SIZEOF_OOCH_T >= 4)  | ||||
| 	if (handle_surrogate_pair && ch >= 0xDC00 && ch <= 0xDFFF && json->tok.len > 0) | ||||
| 	{ | ||||
| 		/* RFC7159 | ||||
| 			To escape an extended character that is not in the Basic Multilingual | ||||
| 			Plane, the character is represented as a 12-character sequence, | ||||
| 			encoding the UTF-16 surrogate pair.  So, for example, a string | ||||
| 			containing only the G clef character (U+1D11E) may be represented as | ||||
| 			"\uD834\uDD1E". | ||||
| 		*/ | ||||
| 		mio_ooch_t pch = json->tok.ptr[json->tok.len - 1]; | ||||
| 		if (pch >= 0xD800 && pch <= 0xDBFF) | ||||
| 		{ | ||||
| 			/* X = (character outside BMP) - 0x10000; | ||||
| 			 * W1 = high ten bits of X + 0xD800 | ||||
| 			 * W2 = low ten bits of X + 0xDC00 */ | ||||
| 			json->tok.ptr[json->tok.len - 1] = (((pch - 0xD800) << 10) | (ch - 0xDC00)) + 0x10000; | ||||
| 			return 0; | ||||
| 		} | ||||
| 	} | ||||
| #endif | ||||
|  | ||||
| 	json->tok.ptr[json->tok.len++] = ch; | ||||
| 	json->tok.ptr[json->tok.len] = '\0'; | ||||
| 	return 0; | ||||
| @ -70,14 +92,13 @@ static int add_chars_to_token (mio_json_t* json, const mio_ooch_t* ptr, mio_oow_ | ||||
|  | ||||
| 		newcapa = MIO_ALIGN_POW2(json->tok.len + len + 1, MIO_JSON_TOKEN_NAME_ALIGN); | ||||
| 		tmp = (mio_ooch_t*)mio_reallocmem(json->mio, json->tok.ptr, newcapa * MIO_SIZEOF(*tmp)); | ||||
| 		if (!tmp) return -1; | ||||
| 		if (MIO_UNLIKELY(!tmp)) return -1; | ||||
|  | ||||
| 		json->tok_capa = newcapa - 1; | ||||
| 		json->tok.ptr = tmp; | ||||
| 	} | ||||
|  | ||||
| 	for (i = 0; i < len; i++)   | ||||
| 		json->tok.ptr[json->tok.len++] = ptr[i]; | ||||
| 	for (i = 0; i < len; i++) json->tok.ptr[json->tok.len++] = ptr[i]; | ||||
| 	json->tok.ptr[json->tok.len] = '\0'; | ||||
| 	return 0; | ||||
| } | ||||
| @ -237,7 +258,7 @@ static int handle_string_value_char (mio_json_t* json, mio_ooci_t c) | ||||
| 			ret = 0; | ||||
| 		add_sv_acc: | ||||
| 		#if defined(MIO_OOCH_IS_UCH) | ||||
| 			if (add_char_to_token(json, json->state_stack->u.sv.acc) <= -1) return -1; | ||||
| 			if (add_char_to_token(json, json->state_stack->u.sv.acc, json->state_stack->u.sv.escaped == 4) <= -1) return -1; | ||||
| 		#else | ||||
| 			/* convert the character to utf8 */ | ||||
| 			{ | ||||
| @ -274,10 +295,6 @@ static int handle_string_value_char (mio_json_t* json, mio_ooci_t c) | ||||
| 		} | ||||
| 		else if (c == 'u') | ||||
| 		{ | ||||
| 		#if (MIO_SIZEOF_UCH_T > 2) | ||||
| 			/* TOOD: handle UTF-16 surrogate pair  U+1D11E ->  \uD834\uDD1E*/ | ||||
|                 	/*  0xD800-0xDBFF 0xDC00-0xDFFF */ | ||||
| 		#endif | ||||
| 			json->state_stack->u.sv.escaped = 4; | ||||
| 			json->state_stack->u.sv.digit_count = 0; | ||||
| 			json->state_stack->u.sv.acc = 0; | ||||
| @ -291,7 +308,7 @@ static int handle_string_value_char (mio_json_t* json, mio_ooci_t c) | ||||
| 		else | ||||
| 		{ | ||||
| 			json->state_stack->u.sv.escaped = 0; | ||||
| 			if (add_char_to_token(json, unescape(c)) <= -1) return -1; | ||||
| 			if (add_char_to_token(json, unescape(c), 0) <= -1) return -1; | ||||
| 		} | ||||
| 	} | ||||
| 	else if (c == '\\') | ||||
| @ -305,7 +322,7 @@ static int handle_string_value_char (mio_json_t* json, mio_ooci_t c) | ||||
| 	} | ||||
| 	else | ||||
| 	{ | ||||
| 		if (add_char_to_token(json, c) <= -1) return -1; | ||||
| 		if (add_char_to_token(json, c, 0) <= -1) return -1; | ||||
| 	} | ||||
|  | ||||
| 	return ret; | ||||
| @ -315,13 +332,13 @@ static int handle_numeric_value_char (mio_json_t* json, mio_ooci_t c) | ||||
| { | ||||
| 	if (mio_is_ooch_digit(c) || (json->tok.len == 0 && (c == '+' || c == '-'))) | ||||
| 	{ | ||||
| 		if (add_char_to_token(json, c) <= -1) return -1; | ||||
| 		if (add_char_to_token(json, c, 0) <= -1) return -1; | ||||
| 		return 1; | ||||
| 	} | ||||
| 	else if (!json->state_stack->u.nv.dotted && c == '.' && | ||||
| 	         json->tok.len > 0 && mio_is_ooch_digit(json->tok.ptr[json->tok.len - 1])) | ||||
| 	{ | ||||
| 		if (add_char_to_token(json, c) <= -1) return -1; | ||||
| 		if (add_char_to_token(json, c, 0) <= -1) return -1; | ||||
| 		json->state_stack->u.nv.dotted = 1; | ||||
| 		return 1; | ||||
| 	} | ||||
| @ -344,7 +361,7 @@ static int handle_word_value_char (mio_json_t* json, mio_ooci_t c) | ||||
|  | ||||
| 	if (mio_is_ooch_alpha(c)) | ||||
| 	{ | ||||
| 		if (add_char_to_token(json, c) <= -1) return -1; | ||||
| 		if (add_char_to_token(json, c, 0) <= -1) return -1; | ||||
| 		return 1; | ||||
| 	} | ||||
|  | ||||
|  | ||||
		Reference in New Issue
	
	Block a user