Limited the UTF-8 ranges to those permitted by RFC 3629 (maximum code point 0x10FFFF)
This commit is contained in:
parent
d90647598b
commit
c09bd088c1
@ -24,6 +24,8 @@
|
|||||||
|
|
||||||
#include "hio-prv.h"
|
#include "hio-prv.h"
|
||||||
|
|
||||||
|
/*#define RETAIN_RFC2279 1*/
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* from RFC 2279 UTF-8, a transformation format of ISO 10646
|
* from RFC 2279 UTF-8, a transformation format of ISO 10646
|
||||||
*
|
*
|
||||||
@ -34,6 +36,12 @@
|
|||||||
* 4:4 00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
* 4:4 00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||||
* inv 00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
* inv 00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||||
* inv 04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
* inv 04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||||
|
*
|
||||||
|
* RFC 3629 limits the ranges as follows (the highest code point is 0x0010FFFF):
|
||||||
|
* 1:2 00000000-0000007F 0xxxxxxx
|
||||||
|
* 2:2 00000080-000007FF 110xxxxx 10xxxxxx
|
||||||
|
* 3:2 00000800-0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
|
||||||
|
* 4:4 00010000-0010FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||||
*/
|
*/
|
||||||
|
|
||||||
struct __utf8_t
|
struct __utf8_t
|
||||||
@ -53,9 +61,13 @@ static __utf8_t utf8_table[] =
|
|||||||
{0x00000000ul, 0x0000007Ful, 0x00, 0x80, 0x7F, 1},
|
{0x00000000ul, 0x0000007Ful, 0x00, 0x80, 0x7F, 1},
|
||||||
{0x00000080ul, 0x000007FFul, 0xC0, 0xE0, 0x1F, 2},
|
{0x00000080ul, 0x000007FFul, 0xC0, 0xE0, 0x1F, 2},
|
||||||
{0x00000800ul, 0x0000FFFFul, 0xE0, 0xF0, 0x0F, 3},
|
{0x00000800ul, 0x0000FFFFul, 0xE0, 0xF0, 0x0F, 3},
|
||||||
|
#if defined(RETAIN_RFC2279)
|
||||||
{0x00010000ul, 0x001FFFFFul, 0xF0, 0xF8, 0x07, 4},
|
{0x00010000ul, 0x001FFFFFul, 0xF0, 0xF8, 0x07, 4},
|
||||||
{0x00200000ul, 0x03FFFFFFul, 0xF8, 0xFC, 0x03, 5},
|
{0x00200000ul, 0x03FFFFFFul, 0xF8, 0xFC, 0x03, 5},
|
||||||
{0x04000000ul, 0x7FFFFFFFul, 0xFC, 0xFE, 0x01, 6}
|
{0x04000000ul, 0x7FFFFFFFul, 0xFC, 0xFE, 0x01, 6}
|
||||||
|
#else
|
||||||
|
{0x00010000ul, 0x0010FFFFul, 0xF0, 0xF8, 0x07, 4}
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
static HIO_INLINE __utf8_t* get_utf8_slot (hio_uch_t uc)
|
static HIO_INLINE __utf8_t* get_utf8_slot (hio_uch_t uc)
|
||||||
@ -79,7 +91,7 @@ static HIO_INLINE __utf8_t* get_utf8_slot (hio_uch_t uc)
|
|||||||
|
|
||||||
hio_oow_t hio_uc_to_utf8 (hio_uch_t uc, hio_bch_t* utf8, hio_oow_t size)
|
hio_oow_t hio_uc_to_utf8 (hio_uch_t uc, hio_bch_t* utf8, hio_oow_t size)
|
||||||
{
|
{
|
||||||
__utf8_t* cur = get_utf8_slot (uc);
|
__utf8_t* cur = get_utf8_slot(uc);
|
||||||
|
|
||||||
if (cur == HIO_NULL) return 0; /* illegal character */
|
if (cur == HIO_NULL) return 0; /* illegal character */
|
||||||
|
|
||||||
|
@ -50,6 +50,25 @@ int main ()
|
|||||||
T_ASSERT1 (v == 0 && *endptr == '\0' && is_sober == 1, "integer in E notation");
|
T_ASSERT1 (v == 0 && *endptr == '\0' && is_sober == 1, "integer in E notation");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
hio_bch_t tmp[10];
|
||||||
|
hio_oow_t x;
|
||||||
|
hio_uch_t uc;
|
||||||
|
|
||||||
|
x = hio_uc_to_utf8(0x2665, tmp, HIO_COUNTOF(tmp));
|
||||||
|
T_ASSERT1 (x == 3 && (hio_uint8_t)tmp[0] == 0xE2 && (hio_uint8_t)tmp[1] == 0x99 && (hio_uint8_t)tmp[2] == 0xA5, "unicode to utf8 conversion");
|
||||||
|
|
||||||
|
x = hio_utf8_to_uc(tmp, x, &uc);
|
||||||
|
T_ASSERT1 (x == 3 && uc == 0x2665, "utf8 to unicode conversion");
|
||||||
|
|
||||||
|
#if (HIO_SIZEOF_UCH_T > 2)
|
||||||
|
x = hio_uc_to_utf8(0x1F3E9, tmp, HIO_COUNTOF(tmp));
|
||||||
|
T_ASSERT1 (x == 4 && (hio_uint8_t)tmp[0] == 0xF0 && (hio_uint8_t)tmp[1] == 0x9F && (hio_uint8_t)tmp[2] == 0x8F && (hio_uint8_t)tmp[3] == 0xA9, "unicode to utf8 conversion");
|
||||||
|
|
||||||
|
x = hio_utf8_to_uc(tmp, x, &uc);
|
||||||
|
T_ASSERT1 (x == 4 && uc == 0x1F3E9, "utf8 to unicode conversion");
|
||||||
|
#endif
|
||||||
|
}
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
oops:
|
oops:
|
||||||
|
Loading…
Reference in New Issue
Block a user