From b70d9a976a09721862a6228382eee50b6886be6b Mon Sep 17 00:00:00 2001 From: "hyunghwan.chung" Date: Sun, 17 May 2015 05:02:30 +0000 Subject: [PATCH] added utf8 string conversion functions --- stix/lib/comp.c | 100 ++++++++++--------- stix/lib/ignite.c | 2 +- stix/lib/main.c | 56 ++++++----- stix/lib/memo.txt | 3 +- stix/lib/obj.c | 4 +- stix/lib/stix-prv.h | 70 ++++++++----- stix/lib/stix.c | 4 +- stix/lib/stix.h | 44 ++++++--- stix/lib/sym.c | 6 +- stix/lib/utf8.c | 232 ++++++++++++++++++++++++++++++++++---------- 10 files changed, 356 insertions(+), 165 deletions(-) diff --git a/stix/lib/comp.c b/stix/lib/comp.c index 29188dc..5a49e24 100644 --- a/stix/lib/comp.c +++ b/stix/lib/comp.c @@ -219,7 +219,7 @@ static STIX_INLINE int is_closing_char (stix_cint_t c) #define ADD_TOKEN_STR(fsc,s) \ do { if (add_token_str (fsc, s) == -1) return -1; } while (0) -static STIX_INLINE int add_token_char (stix_t* fsc, stix_char_t c) +static STIX_INLINE int add_token_char (stix_t* fsc, stix_uch_t c) { if (fsc->tok.name.len >= STIX_COUNTOF(fsc->tok.buf) - 1) { @@ -232,7 +232,7 @@ static STIX_INLINE int add_token_char (stix_t* fsc, stix_char_t c) return 0; } -static STIX_INLINE int add_token_str (stix_t* fsc, const stix_char_t* str) +static STIX_INLINE int add_token_str (stix_t* fsc, const stix_uch_t* str) { stix_size_t len; @@ -247,49 +247,52 @@ static STIX_INLINE int add_token_str (stix_t* fsc, const stix_char_t* str) fsc->tok.name.len += stix_strcpy (&fsc->tok.buf[fsc->tok.name.len], str); return 0; } +#endif -static int get_char (stix_t* fsc) +static int get_char (stix_t* stix) { stix_ssize_t n; - if (fsc->sio.inp->b.pos >= fsc->sio.inp->b.len) + if (stix->c->curinp->b.pos >= stix->c->curinp->b.len) { - n = fsc->sio.impl (fsc, STIX_FSC_IO_READ, fsc->sio.inp); + n = stix->c->impl (stix, STIX_IO_READ, stix->c->curinp); if (n <= -1) return -1; if (n == 0) { - fsc->sio.inp->lxc.c = STIX_CHAR_EOF; - fsc->sio.inp->lxc.line = fsc->sio.inp->line; - fsc->sio.inp->lxc.colm = 
fsc->sio.inp->colm; - fsc->sio.inp->lxc.file = fsc->sio.inp->name; - fsc->sio.lxc = fsc->sio.inp->lxc; - return 0; +// stix->c->curinp->lxc.c = STIX_CHAR_EOF; + stix->c->curinp->lxc.c = 0; + stix->c->curinp->lxc.line = stix->c->curinp->line; + stix->c->curinp->lxc.colm = stix->c->curinp->colm; + stix->c->curinp->lxc.file = stix->c->curinp->name; + stix->c->lxc = stix->c->curinp->lxc; + return 0; /* indicate that EOF has been read */ } - fsc->sio.inp->b.pos = 0; - fsc->sio.inp->b.len = n; + stix->c->curinp->b.pos = 0; + stix->c->curinp->b.len = n; } - if (fsc->sio.inp->lxc.c == STIX_T('\n')) + if (stix->c->curinp->lxc.c == '\n') { /* if the previous charater was a newline, * increment the line counter and reset column to 1. * incrementing it line number here instead of * updating inp->lxc causes the line number for * TOK_EOF to be the same line as the lxc newline. */ - fsc->sio.inp->line++; - fsc->sio.inp->colm = 1; + stix->c->curinp->line++; + stix->c->curinp->colm = 1; } - - fsc->sio.inp->lxc.c = fsc->sio.inp->buf[fsc->sio.inp->b.pos++]; - fsc->sio.inp->lxc.line = fsc->sio.inp->line; - fsc->sio.inp->lxc.colm = fsc->sio.inp->colm++; - fsc->sio.inp->lxc.file = fsc->sio.inp->name; - fsc->sio.lxc = fsc->sio.inp->lxc; - return 0; + + stix->c->curinp->lxc.c = stix->c->curinp->buf[stix->c->curinp->b.pos++]; + stix->c->curinp->lxc.line = stix->c->curinp->line; + stix->c->curinp->lxc.colm = stix->c->curinp->colm++; + stix->c->curinp->lxc.file = stix->c->curinp->name; + stix->c->lxc = stix->c->curinp->lxc; + return 1; /* indicate that a normal character has been read */ } +#if 0 static int skip_spaces (stix_t* fsc) { while (STIX_ISSPACE(fsc->sio.lxc.c)) GET_CHAR (fsc); @@ -653,9 +656,9 @@ retry: else { stix_cstr_t ea; - stix_char_t cc; + stix_uch_t cc; - cc = (stix_char_t)c; + cc = (stix_uch_t)c; ea.ptr = &cc; ea.len = 1; @@ -685,17 +688,17 @@ static int begin_include (stix_t* fsc) stix_ioarg_t* arg; stix_link_t* link; - link = (stix_link_t*) stix_callocmem (fsc, 
STIX_SIZEOF(*link) + STIX_SIZEOF(stix_char_t) * (fsc->tok.name.len + 1)); + link = (stix_link_t*) stix_callocmem (fsc, STIX_SIZEOF(*link) + STIX_SIZEOF(stix_uch_t) * (fsc->tok.name.len + 1)); if (link == STIX_NULL) goto oops; - stix_strcpy ((stix_char_t*)(link + 1), fsc->tok.name.ptr); + stix_strcpy ((stix_uch_t*)(link + 1), fsc->tok.name.ptr); link->link = fsc->sio_names; fsc->sio_names = link; arg = (stix_ioarg_t*) stix_callocmem (fsc, STIX_SIZEOF(*arg)); if (arg == STIX_NULL) goto oops; - arg->name = (const stix_char_t*)(link + 1); + arg->name = (const stix_uch_t*)(link + 1); arg->line = 1; arg->colm = 1; arg->prev = fsc->sio.inp; @@ -777,7 +780,7 @@ static STIX_INLINE int is_tok_pseudovar (stix_t* fsc) stix_strequal(fsc->tok.name.ptr, STIX_T("false"))); } -static STIX_INLINE int is_tok_binsel (stix_t* fsc, const stix_char_t* sel) +static STIX_INLINE int is_tok_binsel (stix_t* fsc, const stix_uch_t* sel) { return fsc->tok.type == STIX_FSC_TOK_BINSEL && stix_strequal (fsc->tok.name.ptr, sel); @@ -848,7 +851,7 @@ static STIX_INLINE int is_tok_binsel (stix_t* fsc, const stix_char_t* sel) #endif static STIX_INLINE int emit_code_test ( - stix_t* fsc, const stix_char_t* high, const stix_char_t* low) + stix_t* fsc, const stix_uch_t* high, const stix_uch_t* low) { wprintf (L"CODE: %s %s\n", high, low); return 0; @@ -1037,7 +1040,7 @@ static int __add_literal (stix_t* fsc, stix_word_t literal) return fsc->literal_count - 1; } -static int __add_character_literal (stix_t* fsc, stix_char_t ch) +static int __add_character_literal (stix_t* fsc, stix_uch_t ch) { stix_word_t i, c, literal; stix_vm_t* stx = fsc->stx; @@ -1056,7 +1059,7 @@ static int __add_character_literal (stix_t* fsc, stix_char_t ch) } static int __add_string_literal ( - stix_t* fsc, const stix_char_t* str, stix_word_t size) + stix_t* fsc, const stix_uch_t* str, stix_word_t size) { stix_word_t i, c, literal; stix_vm_t* stx = fsc->stx; @@ -1077,7 +1080,7 @@ static int __add_string_literal ( } static int 
__add_symbol_literal ( - stix_t* fsc, const stix_char_t* str, stix_word_t size) + stix_t* fsc, const stix_uch_t* str, stix_word_t size) { stix_vm_t* stx = fsc->stx; return __add_literal (fsc, stix_new_symbolx(stx, str, size)); @@ -1210,7 +1213,7 @@ static int parse_expression (stix_t* fsc) stix_vm_t* stx = fsc->stx; if (fsc->tok.type == STIX_FSC_TOK_IDENT) { - stix_char_t* ident = stix_tok_yield (&fsc->tok, 0); + stix_uch_t* ident = stix_tok_yield (&fsc->tok, 0); if (ident == STIX_NULL) { fsc->errnum = STIX_FSC_ERROR_MEMORY; return -1; @@ -1242,7 +1245,7 @@ static int parse_expression (stix_t* fsc) } static int parse_basic_expression ( - stix_t* fsc, const stix_char_t* ident) + stix_t* fsc, const stix_uch_t* ident) { /* * ::= [ ] @@ -1259,7 +1262,7 @@ static int parse_basic_expression ( } static int parse_assignment ( - stix_t* fsc, const stix_char_t* target) + stix_t* fsc, const stix_uch_t* target) { /* * ::= assignmentOperator @@ -1304,7 +1307,7 @@ static int parse_assignment ( } static int parse_primary ( - stix_t* fsc, const stix_char_t* ident, int* is_super) + stix_t* fsc, const stix_uch_t* ident, int* is_super) { /* * ::= @@ -1389,7 +1392,7 @@ static int parse_primary ( } static int parse_primary_ident ( - stix_t* fsc, const stix_char_t* ident, int* is_super) + stix_t* fsc, const stix_uch_t* ident, int* is_super) { stix_word_t i; stix_vm_t* stx = fsc->stx; @@ -1616,7 +1619,7 @@ static int parse_binary_message (stix_t* fsc, int is_super) while (fsc->tok.type == STIX_FSC_TOK_BINSEL) { - stix_char_t* op = stix_tok_yield (&fsc->tok, 0); + stix_uch_t* op = stix_tok_yield (&fsc->tok, 0); if (op == STIX_NULL) { fsc->errnum = STIX_FSC_ERROR_MEMORY; return -1; @@ -1707,11 +1710,11 @@ static int parse_method (stix_t* fsc, stix_word_t method_class, void* input) #endif -static int get_class_type (const stix_char_t* str, class_type_t* type) +static int get_class_type (const stix_uch_t* str, class_type_t* type) { static struct { - stix_char_t* word; + stix_uch_t* word; 
class_type_t type; } tab[] = { @@ -1735,11 +1738,11 @@ static int get_class_type (const stix_char_t* str, class_type_t* type) return -1; } -static int get_vardef_type (const stix_char_t* str, vardef_type_t* type) +static int get_vardef_type (const stix_uch_t* str, vardef_type_t* type) { static struct { - stix_char_t* word; + stix_uch_t* word; class_type_t type; } tab[] = { @@ -2254,13 +2257,20 @@ int stix_compile (stix_t* stix, stix_ioimpl_t io) stix->c->arg.line = 1; stix->c->arg.colm = 1; stix->c->curinp = &stix->c->arg; - clear_sio_names (stix); +// clear_sio_names (stix); /* open the top-level stream */ n = stix->c->impl (stix, STIX_IO_OPEN, stix->c->curinp); if (n <= -1) return -1; - if (compile_stream (stix) <= -1) goto oops; +// if (compile_stream (stix) <= -1) goto oops; + while (get_char(stix) > 0) + { + stix_bch_t buf[16]; + stix_size_t len; + len = stix_uctoutf8 (stix->c->curinp->lxc.c, buf, STIX_COUNTOF(buf)); + printf ("%.*s", (int)len, buf); + } /* close the stream */ STIX_ASSERT (stix->c->curinp == &stix->c->arg); diff --git a/stix/lib/ignite.c b/stix/lib/ignite.c index 059ea7a..4751f2f 100644 --- a/stix/lib/ignite.c +++ b/stix/lib/ignite.c @@ -183,7 +183,7 @@ static int ignite_3 (stix_t* stix) static struct symbol_name_t { stix_oow_t len; - stix_char_t str[16]; + stix_uch_t str[16]; } symnames[] = { { 4, { 'S','t','i','x' } }, { 6, { 'O','b','j','e','c','t' } }, diff --git a/stix/lib/main.c b/stix/lib/main.c index 2a34d9d..a26b8bf 100644 --- a/stix/lib/main.c +++ b/stix/lib/main.c @@ -28,7 +28,7 @@ #include #include - +#include typedef struct xtn_t xtn_t; struct xtn_t @@ -64,31 +64,41 @@ static stix_mmgr_t sys_mmgr = }; -static STIX_INLINE stix_oow_t open_input (stix_t* stix, stix_ioarg_t* arg) +static STIX_INLINE stix_ssize_t open_input (stix_t* stix, stix_ioarg_t* arg) { if (arg->includer) { /* includee */ - xtn_t* xtn = stix_getxtn(stix); + stix_bch_t bcs[1024]; /* TODO: right buffer size */ + stix_size_t bcslen = STIX_COUNTOF(bcs); + stix_size_t 
ucslen = ~(stix_size_t)0; - arg->handle = fopen (xtn->source_path, "r"); - if (!arg->handle) + if (stix_ucstoutf8 (arg->name, &ucslen, bcs, &bcslen) <= -1) { - stix_seterrnum (stix, STIX_EIOERR); + stix_seterrnum (stix, STIX_EECERR); return -1; } } else { /* main stream */ - /*char tmp[PATH_MAX];*/ + xtn_t* xtn = stix_getxtn(stix); + arg->handle = fopen (xtn->source_path, "r"); } + + if (!arg->handle) + { + stix_seterrnum (stix, STIX_EIOERR); + return -1; + } + + return 0; } -static STIX_INLINE stix_oow_t read_input (stix_t* stix, stix_ioarg_t* arg) +static STIX_INLINE stix_ssize_t read_input (stix_t* stix, stix_ioarg_t* arg) { xtn_t* xtn = stix_getxtn(stix); - stix_size_t n, bcslen, cslen; + stix_size_t n, bcslen, ucslen, remlen; int x; STIX_ASSERT (arg->handle != STIX_NULL); @@ -100,32 +110,32 @@ static STIX_INLINE stix_oow_t read_input (stix_t* stix, stix_ioarg_t* arg) stix_seterrnum (stix, STIX_EIOERR); return -1; } - - } xtn->bchar_len += n; bcslen = xtn->bchar_len; - cslen = STIX_COUNTOF(arg->buf); - x = stix_utf8toucs (xtn->bchar_buf, &bcslen, arg->buf, &cslen); - if (x == -2) - { - /* buffer to small */ - } - if (x <= -1) + ucslen = STIX_COUNTOF(arg->buf); + x = stix_utf8toucs (xtn->bchar_buf, &bcslen, arg->buf, &ucslen); + if (x <= -1 && ucslen <= 0) { + stix_seterrnum (stix, STIX_EECERR); + return -1; } + + remlen = xtn->bchar_len - bcslen; + if (remlen > 0) memmove (xtn->bchar_buf, &xtn->bchar_buf[bcslen], remlen); + xtn->bchar_len = remlen; + return ucslen; } -static STIX_INLINE stix_oow_t close_input (stix_t* stix, stix_ioarg_t* arg) +static STIX_INLINE stix_ssize_t close_input (stix_t* stix, stix_ioarg_t* arg) { STIX_ASSERT (arg->handle != STIX_NULL); fclose (arg->handle); return 0; } -/* TODO: IMPLEMENT PROPER INPUT HANDLER */ -static stix_oow_t input_handler (stix_t* stix, stix_iocmd_t cmd, stix_ioarg_t* arg) +static stix_ssize_t input_handler (stix_t* stix, stix_iocmd_t cmd, stix_ioarg_t* arg) { switch (cmd) { @@ -217,8 +227,8 @@ int main (int 
argc, char* argv[]) } { -stix_char_t x[] = { 'S', 't', 'r', 'i', 'n', 'g', '\0' }; -stix_char_t y[] = { 'S', 'y', 'm', 'b', 'o', 'l', '\0' }; +stix_uch_t x[] = { 'S', 't', 'r', 'i', 'n', 'g', '\0' }; +stix_uch_t y[] = { 'S', 'y', 'm', 'b', 'o', 'l', '\0' }; stix_oop_t a, b; a = stix_makesymbol (stix, x, 6); diff --git a/stix/lib/memo.txt b/stix/lib/memo.txt index 6abbd97..1fdded5 100644 --- a/stix/lib/memo.txt +++ b/stix/lib/memo.txt @@ -617,4 +617,5 @@ Single line comment #! comment text (easy handling to skip hash bang) Multi-line comments - double quoted as in smalltalk -" comment text " +" comment text 설명이라지요. " + diff --git a/stix/lib/obj.c b/stix/lib/obj.c index 213ba7c..5afe2fc 100644 --- a/stix/lib/obj.c +++ b/stix/lib/obj.c @@ -114,9 +114,9 @@ static stix_oop_t alloc_numeric_array (stix_t* stix, const void* ptr, stix_oow_t return hdr; } -stix_oop_t stix_alloccharobj (stix_t* stix, const stix_char_t* ptr, stix_oow_t len) +stix_oop_t stix_alloccharobj (stix_t* stix, const stix_uch_t* ptr, stix_oow_t len) { - return alloc_numeric_array (stix, ptr, len, STIX_OBJ_TYPE_CHAR, STIX_SIZEOF(stix_char_t), 1); + return alloc_numeric_array (stix, ptr, len, STIX_OBJ_TYPE_CHAR, STIX_SIZEOF(stix_uch_t), 1); } stix_oop_t stix_allocuint8obj (stix_t* stix, const stix_uint8_t* ptr, stix_oow_t len) diff --git a/stix/lib/stix-prv.h b/stix/lib/stix-prv.h index 754a8ab..29dc29f 100644 --- a/stix/lib/stix-prv.h +++ b/stix/lib/stix-prv.h @@ -140,10 +140,10 @@ typedef enum stix_iocmd_t stix_iocmd_t; typedef struct stix_iolxc_t stix_iolxc_t; struct stix_iolxc_t { - stix_char_t c; /**< character */ - unsigned long line; /**< line */ - unsigned long colm; /**< column */ - const stix_char_t* file; /**< file specified in #include */ + stix_uch_t c; /**< character */ + unsigned long line; /**< line */ + unsigned long colm; /**< column */ + const stix_uch_t* file; /**< file specified in #include */ }; enum stix_ioarg_flag_t @@ -160,7 +160,7 @@ struct stix_ioarg_t * It is #STIX_NULL for the 
main stream and points to a non-NULL string * for an included stream. */ - const stix_char_t* name; + const stix_uch_t* name; /** * [OUT] I/O handle set by a handler. @@ -173,7 +173,7 @@ struct stix_ioarg_t /** * [OUT] place data here */ - stix_char_t buf[1024]; + stix_uch_t buf[1024]; /** * [IN] points to the data of the includer. It is #STIX_NULL for the @@ -195,7 +195,7 @@ struct stix_ioarg_t /*-----------------------------------------------------------------*/ }; -typedef stix_oow_t (*stix_ioimpl_t) ( +typedef stix_ssize_t (*stix_ioimpl_t) ( stix_t* stix, stix_iocmd_t cmd, stix_ioarg_t* arg @@ -260,13 +260,13 @@ stix_oow_t stix_hashbytes ( ); stix_oow_t stix_hashchars ( - const stix_char_t* ptr, + const stix_uch_t* ptr, stix_oow_t len ); int stix_equalchars ( - const stix_char_t* str1, - const stix_char_t* str2, + const stix_uch_t* str1, + const stix_uch_t* str2, stix_oow_t len ); @@ -289,7 +289,7 @@ stix_oop_t stix_allocoopobj ( stix_oop_t stix_alloccharobj ( stix_t* stix, - const stix_char_t* ptr, + const stix_uch_t* ptr, stix_oow_t len ); @@ -310,13 +310,13 @@ stix_oop_t stix_allocuint16obj ( /* ========================================================================= */ stix_oop_t stix_makesymbol ( stix_t* stix, - const stix_char_t* ptr, + const stix_uch_t* ptr, stix_oow_t len ); stix_oop_t stix_findsymbol ( stix_t* stix, - const stix_char_t* ptr, + const stix_uch_t* ptr, stix_oow_t len ); @@ -338,22 +338,21 @@ stix_oop_t stix_getatsysdic ( /* utf8.c */ /* ========================================================================= */ stix_size_t stix_uctoutf8 ( - stix_char_t uc, - stix_bchar_t* utf8, + stix_uch_t uc, + stix_bch_t* utf8, stix_size_t size ); stix_size_t stix_utf8touc ( - const stix_bchar_t* utf8, - stix_size_t size, - stix_char_t* uc + const stix_bch_t* utf8, + stix_size_t size, + stix_uch_t* uc ); - int stix_ucstoutf8 ( - const stix_char_t* ucs, + const stix_uch_t* ucs, stix_size_t* ucslen, - stix_bchar_t* bcs, + stix_bch_t* bcs, stix_size_t* 
bcslen ); @@ -363,27 +362,46 @@ int stix_ucstoutf8 ( * It never returns -2 if \a ucs is #STIX_NULL. * * \code - * const stix_bchar_t* bcs = "a multibyte string"; - * stix_char_t ucs[100]; + * const stix_bch_t* bcs = "test string"; + * stix_uch_t ucs[100]; * qse_size_t ucslen = STIX_COUNTOF(buf), n; - * qse_size_t bcslen = strlen(bcs); + * qse_size_t bcslen = 11; * int n; * n = qse_bcstoucs (bcs, &bcslen, ucs, &ucslen); * if (n <= -1) { invalid/incomplenete sequence or buffer to small } * \endcode * + * For a null-terminated string, you can specify ~(stix_size_t)0 in + * \a bcslen. The destination buffer \a ucs also must be large enough to + * store a terminating null. Otherwise, -2 is returned. + * + * The resulting \a ucslen can still be greater than 0 even if the return + * value is negative. The value indiates the number of characters converted + * before the error has occurred. + * * \return 0 on success. * -1 if \a bcs contains an illegal character. * -2 if the wide-character string buffer is too small. * -3 if \a bcs is not a complete sequence. */ int stix_utf8toucs ( - const stix_bchar_t* bcs, + const stix_bch_t* bcs, stix_size_t* bcslen, - stix_char_t* ucs, + stix_uch_t* ucs, stix_size_t* ucslen ); + +/** + * The stix_ucslen() function returns the number of characters before + * a terminating null. 
+ */ +/* +stix_size_t stix_ucslen ( + const stix_uch_t* ucs +); +*/ + /* ========================================================================= */ /* comp.c */ /* ========================================================================= */ diff --git a/stix/lib/stix.c b/stix/lib/stix.c index 4c48bcb..e68d23d 100644 --- a/stix/lib/stix.c +++ b/stix/lib/stix.c @@ -155,12 +155,12 @@ stix_oow_t stix_hashbytes (const stix_uint8_t* ptr, stix_oow_t len) return h; } -stix_oow_t stix_hashchars (const stix_char_t* ptr, stix_oow_t len) +stix_oow_t stix_hashchars (const stix_uch_t* ptr, stix_oow_t len) { return stix_hashbytes ((const stix_uint8_t *)ptr, len * STIX_SIZEOF(*ptr)); } -int stix_equalchars (const stix_char_t* str1, const stix_char_t* str2, stix_oow_t len) +int stix_equalchars (const stix_uch_t* str1, const stix_uch_t* str2, stix_oow_t len) { stix_oow_t i; diff --git a/stix/lib/stix.h b/stix/lib/stix.h index 170f415..b118670 100644 --- a/stix/lib/stix.h +++ b/stix/lib/stix.h @@ -44,9 +44,26 @@ typedef unsigned short int stix_uint16_t; #endif typedef unsigned long int stix_uintptr_t; typedef unsigned long int stix_size_t; +typedef long int stix_ssize_t; -typedef unsigned short int stix_char_t; /* TODO ... wchar_t??? */ -typedef char stix_bchar_t; +typedef unsigned short int stix_uch_t; /* TODO ... wchar_t??? 
*/ +typedef char stix_bch_t; + + +struct stix_ucs_t +{ + stix_uch_t* ptr; + stix_size_t len; +}; + +struct stix_bcs_t +{ + stix_bch_t* ptr; + stix_size_t len; +}; + +typedef struct stix_ucs_t stix_ucs_t; +typedef struct stix_bcs_t stix_bcs_t; /* ========================================================================= * PRIMITIVE MACROS @@ -185,15 +202,15 @@ struct stix_mmgr_t typedef struct stix_cmgr_t stix_cmgr_t; -typedef stix_size_t (*stix_cmgr_bctoc_t) ( - const stix_bchar_t* mb, +typedef stix_size_t (*stix_cmgr_bctouc_t) ( + const stix_bch_t* mb, stix_size_t size, - stix_char_t* wc + stix_uch_t* wc ); -typedef stix_size_t (*stix_cmgr_ctobc_t) ( - stix_char_t wc, - stix_bchar_t* mb, +typedef stix_size_t (*stix_cmgr_uctobc_t) ( + stix_uch_t wc, + stix_bch_t* mb, stix_size_t size ); @@ -206,8 +223,8 @@ typedef stix_size_t (*stix_cmgr_ctobc_t) ( */ struct stix_cmgr_t { - stix_cmgr_bctoc_t bctoc; - stix_cmgr_ctobc_t ctobc; + stix_cmgr_bctouc_t bctouc; + stix_cmgr_uctobc_t uctobc; }; /* ========================================================================= @@ -258,7 +275,8 @@ enum stix_errnum_t STIX_ENOMEM, /**< insufficient memory */ STIX_EINVAL, /**< invalid parameter or data */ STIX_ENOENT, /**< no matching entry */ - STIX_EIOERR /**< I/O error */ + STIX_EIOERR, /**< I/O error */ + STIX_EECERR /**< encoding conversion error */ }; typedef enum stix_errnum_t stix_errnum_t; @@ -605,7 +623,7 @@ struct stix_obj_oop_t struct stix_obj_char_t { STIX_OBJ_HEADER; - stix_char_t slot[1]; + stix_uch_t slot[1]; }; struct stix_obj_uint8_t @@ -832,7 +850,7 @@ STIX_EXPORT void stix_gc ( */ STIX_EXPORT int stix_findclass ( stix_t* vm, - const stix_char_t* name, + const stix_uch_t* name, stix_oop_t* oop ); diff --git a/stix/lib/sym.c b/stix/lib/sym.c index 70adc10..37c8037 100644 --- a/stix/lib/sym.c +++ b/stix/lib/sym.c @@ -57,7 +57,7 @@ static stix_oop_oop_t expand_bucket (stix_t* stix, stix_oop_oop_t old_bucket) return new_bucket; } -static stix_oop_t find_or_make_symbol 
(stix_t* stix, const stix_char_t* ptr, stix_oow_t len, int create) +static stix_oop_t find_or_make_symbol (stix_t* stix, const stix_uch_t* ptr, stix_oow_t len, int create) { stix_oow_t index, tally; stix_oop_char_t symbol; @@ -130,12 +130,12 @@ static stix_oop_t find_or_make_symbol (stix_t* stix, const stix_char_t* ptr, sti return (stix_oop_t)symbol; } -stix_oop_t stix_makesymbol (stix_t* stix, const stix_char_t* ptr, stix_oow_t len) +stix_oop_t stix_makesymbol (stix_t* stix, const stix_uch_t* ptr, stix_oow_t len) { return find_or_make_symbol (stix, ptr, len, 1); } -stix_oop_t stix_findsymbol (stix_t* stix, const stix_char_t* ptr, stix_oow_t len) +stix_oop_t stix_findsymbol (stix_t* stix, const stix_uch_t* ptr, stix_oow_t len) { return find_or_make_symbol (stix, ptr, len, 0); } diff --git a/stix/lib/utf8.c b/stix/lib/utf8.c index 3e09a90..300a2ae 100644 --- a/stix/lib/utf8.c +++ b/stix/lib/utf8.c @@ -26,7 +26,7 @@ #include "stix-prv.h" -#define STIX_BCLEN_MAX 16 +#define STIX_BCLEN_MAX 6 /* * from RFC 2279 UTF-8, a transformation format of ISO 10646 @@ -62,12 +62,12 @@ static __utf8_t utf8_table[] = {0x04000000ul, 0x7FFFFFFFul, 0xFC, 0xFE, 0x01, 6} }; -static STIX_INLINE __utf8_t* get_utf8_slot (stix_char_t uc) +static STIX_INLINE __utf8_t* get_utf8_slot (stix_uch_t uc) { __utf8_t* cur, * end; - STIX_ASSERT (STIX_SIZEOF(stix_bchar_t) == 1); - STIX_ASSERT (STIX_SIZEOF(stix_char_t) >= 2); + STIX_ASSERT (STIX_SIZEOF(stix_bch_t) == 1); + STIX_ASSERT (STIX_SIZEOF(stix_uch_t) >= 2); end = utf8_table + STIX_COUNTOF(utf8_table); cur = utf8_table; @@ -81,7 +81,7 @@ static STIX_INLINE __utf8_t* get_utf8_slot (stix_char_t uc) return STIX_NULL; /* invalid character */ } -stix_size_t stix_uctoutf8 (stix_char_t uc, stix_bchar_t* utf8, stix_size_t size) +stix_size_t stix_uctoutf8 (stix_uch_t uc, stix_bch_t* utf8, stix_size_t size) { __utf8_t* cur = get_utf8_slot (uc); @@ -108,14 +108,14 @@ stix_size_t stix_uctoutf8 (stix_char_t uc, stix_bchar_t* utf8, stix_size_t size) return 
(stix_size_t)cur->length; } -stix_size_t stix_utf8touc (const stix_bchar_t* utf8, stix_size_t size, stix_char_t* uc) +stix_size_t stix_utf8touc (const stix_bch_t* utf8, stix_size_t size, stix_uch_t* uc) { __utf8_t* cur, * end; STIX_ASSERT (utf8 != STIX_NULL); STIX_ASSERT (size > 0); - STIX_ASSERT (STIX_SIZEOF(stix_bchar_t) == 1); - STIX_ASSERT (STIX_SIZEOF(stix_char_t) >= 2); + STIX_ASSERT (STIX_SIZEOF(stix_bch_t) == 1); + STIX_ASSERT (STIX_SIZEOF(stix_uch_t) >= 2); end = utf8_table + STIX_COUNTOF(utf8_table); cur = utf8_table; @@ -135,7 +135,7 @@ stix_size_t stix_utf8touc (const stix_bchar_t* utf8, stix_size_t size, stix_char if (uc) { - stix_char_t w; + stix_uch_t w; w = utf8[0] & cur->fmask; for (i = 1; i < cur->length; i++) @@ -167,9 +167,9 @@ stix_size_t stix_utf8touc (const stix_bchar_t* utf8, stix_size_t size, stix_char } /* this return value can indicate both - * the correct length (len >= cur->length) + * the correct length (size >= cur->length) * and - * the incomplete seqeunce error (len < cur->length). + * the incomplete seqeunce error (size < cur->length). */ return (stix_size_t)cur->length; } @@ -179,28 +179,26 @@ stix_size_t stix_utf8touc (const stix_bchar_t* utf8, stix_size_t size, stix_char return 0; /* error - invalid sequence */ } -stix_size_t stix_utf8len (const stix_bchar_t* utf8, stix_size_t size) -{ - return stix_utf8touc (utf8, size, STIX_NULL); -} - /* ----------------------------------------------------------------------- */ -static int bcsn_to_csn_with_cmgr ( - const stix_bchar_t* bcs, stix_size_t* bcslen, - stix_char_t* cs, stix_size_t* cslen, stix_cmgr_t* cmgr, int all) +static STIX_INLINE int bcsn_to_ucsn_with_cmgr ( + const stix_bch_t* bcs, stix_size_t* bcslen, + stix_uch_t* ucs, stix_size_t* ucslen, stix_cmgr_t* cmgr, int all) { - const stix_bchar_t* p; + const stix_bch_t* p; int ret = 0; stix_size_t mlen; - if (cs) + if (ucs) { - stix_char_t* q, * qend; + /* destination buffer is specified. 
+ * copy the conversion result to the buffer */ + + stix_uch_t* q, * qend; p = bcs; - q = cs; - qend = cs + *cslen; + q = ucs; + qend = ucs + *ucslen; mlen = *bcslen; while (mlen > 0) @@ -214,7 +212,7 @@ static int bcsn_to_csn_with_cmgr ( break; } - n = cmgr->bctoc (p, mlen, q); + n = cmgr->bctouc (p, mlen, q); if (n == 0) { /* invalid sequence */ @@ -249,12 +247,18 @@ static int bcsn_to_csn_with_cmgr ( mlen -= n; } - *cslen = q - cs; + *ucslen = q - ucs; *bcslen = p - bcs; } else { - stix_char_t w; + /* no destination buffer is specified. perform conversion + * but don't copy the result. the caller can call this function + * without a buffer to find the required buffer size, allocate + * a buffer with the size and call this function again with + * the buffer. */ + + stix_uch_t w; stix_size_t wlen = 0; p = bcs; @@ -264,7 +268,7 @@ static int bcsn_to_csn_with_cmgr ( { stix_size_t n; - n = cmgr->bctoc (p, mlen, &w); + n = cmgr->bctouc (p, mlen, &w); if (n == 0) { /* invalid sequence */ @@ -291,19 +295,42 @@ static int bcsn_to_csn_with_cmgr ( wlen += 1; } - *cslen = wlen; + *ucslen = wlen; *bcslen = p - bcs; } return ret; } -static int csn_to_bcsn_with_cmgr ( - const stix_char_t* cs, stix_size_t* cslen, - stix_bchar_t* bcs, stix_size_t* bcslen, stix_cmgr_t* cmgr) +static STIX_INLINE int bcs_to_ucs_with_cmgr ( + const stix_bch_t* bcs, stix_size_t* bcslen, + stix_uch_t* ucs, stix_size_t* ucslen, stix_cmgr_t* cmgr, int all) { - const stix_char_t* p = cs; - const stix_char_t* end = cs + *cslen; + const stix_bch_t* bp; + stix_size_t mlen, wlen; + int n; + + for (bp = bcs; *bp != '\0'; bp++); + + mlen = bp - bcs; wlen = *ucslen; + n = bcsn_to_ucsn_with_cmgr (bcs, &mlen, ucs, &wlen, cmgr, all); + if (ucs) + { + /* null-terminate the target buffer if it has room for it. 
*/ + if (wlen < *ucslen) ucs[wlen] = '\0'; + else n = -2; /* buffer too small */ + } + *bcslen = mlen; *ucslen = wlen; + + return n; +} + +static STIX_INLINE int ucsn_to_bcsn_with_cmgr ( + const stix_uch_t* ucs, stix_size_t* ucslen, + stix_bch_t* bcs, stix_size_t* bcslen, stix_cmgr_t* cmgr) +{ + const stix_uch_t* p = ucs; + const stix_uch_t* end = ucs + *ucslen; int ret = 0; if (bcs) @@ -320,7 +347,7 @@ static int csn_to_bcsn_with_cmgr ( break; } - n = cmgr->ctobc (*p, bcs, rem); + n = cmgr->uctobc (*p, bcs, rem); if (n == 0) { ret = -1; @@ -338,14 +365,96 @@ static int csn_to_bcsn_with_cmgr ( } else { - stix_bchar_t bcsbuf[STIX_BCLEN_MAX]; + stix_bch_t bcsbuf[STIX_BCLEN_MAX]; stix_size_t mlen = 0; while (p < end) { stix_size_t n; - n = cmgr->ctobc (*p, bcsbuf, STIX_COUNTOF(bcsbuf)); + n = cmgr->uctobc (*p, bcsbuf, STIX_COUNTOF(bcsbuf)); + if (n == 0) + { + ret = -1; + break; /* illegal character */ + } + + /* it assumes that bcsbuf is large enough to hold a character */ + STIX_ASSERT (n <= STIX_COUNTOF(bcsbuf)); + + p++; mlen += n; + } + + /* this length excludes the terminating null character. + * this function doesn't even null-terminate the result. 
*/ + *bcslen = mlen; + } + + *ucslen = p - ucs; + return ret; +} + + +static int ucs_to_bcs_with_cmgr ( + const stix_uch_t* ucs, stix_size_t* ucslen, + stix_bch_t* bcs, stix_size_t* bcslen, stix_cmgr_t* cmgr) +{ + const stix_uch_t* p = ucs; + int ret = 0; + + if (bcs) + { + stix_size_t rem = *bcslen; + + while (*p != '\0') + { + stix_size_t n; + + if (rem <= 0) + { + ret = -2; + break; + } + + n = cmgr->uctobc (*p, bcs, rem); + if (n == 0) + { + ret = -1; + break; /* illegal character */ + } + if (n > rem) + { + ret = -2; + break; /* buffer too small */ + } + + bcs += n; rem -= n; p++; + } + + /* update bcslen to the length of the bcs string converted excluding + * terminating null */ + *bcslen -= rem; + + /* null-terminate the multibyte sequence if it has sufficient space */ + if (rem > 0) *bcs = '\0'; + else + { + /* rem is 0 here. if the whole source was consumed + * (*p == '\0'), the bcs buffer was lacking just one + * slot for the terminating null */ + ret = -2; /* buffer too small */ + } + } + else + { + stix_bch_t bcsbuf[STIX_BCLEN_MAX]; + stix_size_t mlen = 0; + + while (*p != '\0') + { + stix_size_t n; + + n = cmgr->uctobc (*p, bcsbuf, STIX_COUNTOF(bcsbuf)); if (n == 0) { ret = -1; @@ -353,22 +462,20 @@ static int csn_to_bcsn_with_cmgr ( } /* it assumes that bcs is large enough to hold a character */ - STIX_ASSERT (n <= STIX_COUNTOF(bcsbuf)); + STIX_ASSERT (n <= STIX_COUNTOF(bcsbuf)); p++; mlen += n; } - /* this length excludes the terminating null character. - * this function doesn't event null-terminate the result. */ + /* this length holds the number of resulting multi-byte characters + * excluding the terminating null character */ *bcslen = mlen; } - *cslen = p - cs; - + *ucslen = p - ucs; /* the number of wide characters handled. 
*/ return ret; } - static stix_cmgr_t utf8_cmgr = { stix_utf8touc, @@ -376,15 +483,42 @@ static stix_cmgr_t utf8_cmgr = }; int stix_utf8toucs ( - const stix_bchar_t* bcs, stix_size_t* bcslen, - stix_char_t* ucs, stix_size_t* ucslen) + const stix_bch_t* bcs, stix_size_t* bcslen, + stix_uch_t* ucs, stix_size_t* ucslen) { - return bcsn_to_csn_with_cmgr (bcs, bcslen, ucs, ucslen, &utf8_cmgr, 0); + if (*bcslen == ~(stix_size_t)0) + { + /* the source is null-terminated. */ + return bcs_to_ucs_with_cmgr (bcs, bcslen, ucs, ucslen, &utf8_cmgr, 0); + } + else + { + /* the source is length bound */ + return bcsn_to_ucsn_with_cmgr (bcs, bcslen, ucs, ucslen, &utf8_cmgr, 0); + } } int stix_ucstoutf8 ( - const stix_char_t* ucs, stix_size_t *ucslen, - stix_bchar_t* bcs, stix_size_t* bcslen) + const stix_uch_t* ucs, stix_size_t *ucslen, + stix_bch_t* bcs, stix_size_t* bcslen) { - return csn_to_bcsn_with_cmgr (ucs, ucslen, bcs, bcslen, &utf8_cmgr); + if (*ucslen == ~(stix_size_t)0) + { + /* null-terminated */ + return ucs_to_bcs_with_cmgr (ucs, ucslen, bcs, bcslen, &utf8_cmgr); + } + else + { + /* length bound */ + return ucsn_to_bcsn_with_cmgr (ucs, ucslen, bcs, bcslen, &utf8_cmgr); + } } + +/* +stix_size_t stix_ucslen (const stix_uch_t* ucs) +{ + const stix_uch_t* ptr = ucs; + while (*ptr) ptr = STIX_INCPTR(const stix_uch_t, ptr, 1); + return STIX_SUBPTR(const stix_uch_t, ptr, ucs); +} +*/