From c09bd088c1125f2e3f96ddbc245a98e4f82772ab Mon Sep 17 00:00:00 2001 From: hyung-hwan Date: Fri, 7 Jan 2022 02:04:28 +0000 Subject: [PATCH] limited the utf8 ranges --- hio/lib/utf8.c | 14 +++++++++++++- hio/t/t-002.c | 19 +++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/hio/lib/utf8.c b/hio/lib/utf8.c index 2d21ec5..8e97c2a 100644 --- a/hio/lib/utf8.c +++ b/hio/lib/utf8.c @@ -24,6 +24,8 @@ #include "hio-prv.h" +/*#define RETAIN_RFC2279 1*/ + /* * from RFC 2279 UTF-8, a transformation format of ISO 10646 * @@ -34,6 +36,12 @@ * 4:4 00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx * inv 00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx * inv 04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * + * RFC3629 limits the ranges like this: + * 1:2 00000000-0000007F 0xxxxxxx + * 2:2 00000080-000007FF 110xxxxx 10xxxxxx + * 3:2 00000800-0000FFFF 1110xxxx 10xxxxxx 10xxxxxx + * 4:4 00010000-0010FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ struct __utf8_t @@ -53,9 +61,13 @@ static __utf8_t utf8_table[] = {0x00000000ul, 0x0000007Ful, 0x00, 0x80, 0x7F, 1}, {0x00000080ul, 0x000007FFul, 0xC0, 0xE0, 0x1F, 2}, {0x00000800ul, 0x0000FFFFul, 0xE0, 0xF0, 0x0F, 3}, +#if defined(RETAIN_RFC2279) {0x00010000ul, 0x001FFFFFul, 0xF0, 0xF8, 0x07, 4}, {0x00200000ul, 0x03FFFFFFul, 0xF8, 0xFC, 0x03, 5}, {0x04000000ul, 0x7FFFFFFFul, 0xFC, 0xFE, 0x01, 6} +#else + {0x00010000ul, 0x0010FFFFul, 0xF0, 0xF8, 0x07, 4} +#endif }; static HIO_INLINE __utf8_t* get_utf8_slot (hio_uch_t uc) @@ -79,7 +91,7 @@ static HIO_INLINE __utf8_t* get_utf8_slot (hio_uch_t uc) hio_oow_t hio_uc_to_utf8 (hio_uch_t uc, hio_bch_t* utf8, hio_oow_t size) { - __utf8_t* cur = get_utf8_slot (uc); + __utf8_t* cur = get_utf8_slot(uc); if (cur == HIO_NULL) return 0; /* illegal character */ diff --git a/hio/t/t-002.c b/hio/t/t-002.c index 484a3ac..08f657d 100644 --- a/hio/t/t-002.c +++ b/hio/t/t-002.c @@ -50,6 +50,25 @@ int main () T_ASSERT1 (v == 0 && *endptr == '\0' && is_sober == 1, "integer in E notation"); } + { + hio_bch_t tmp[10]; + hio_oow_t x; + hio_uch_t uc; + + x = hio_uc_to_utf8(0x2665, tmp, HIO_COUNTOF(tmp)); + T_ASSERT1 (x == 3 && (hio_uint8_t)tmp[0] == 0xE2 && (hio_uint8_t)tmp[1] == 0x99 && (hio_uint8_t)tmp[2] == 0xA5, "unicode to utf8 conversion"); + + x = hio_utf8_to_uc(tmp, x, &uc); + T_ASSERT1 (x == 3 && uc == 0x2665, "utf8 to unicode conversion"); + + #if (HIO_SIZEOF_UCH_T > 2) + x = hio_uc_to_utf8(0x1F3E9, tmp, HIO_COUNTOF(tmp)); + T_ASSERT1 (x == 4 && (hio_uint8_t)tmp[0] == 0xF0 && (hio_uint8_t)tmp[1] == 0x9F && (hio_uint8_t)tmp[2] == 0x8F && (hio_uint8_t)tmp[3] == 0xA9, "unicode to utf8 conversion"); + + x = hio_utf8_to_uc(tmp, x, &uc); + T_ASSERT1 (x == 4 && uc == 0x1F3E9, "utf8 to unicode conversion"); + #endif + } return 0; oops: