added utf8 string conversion functions

This commit is contained in:
hyunghwan.chung 2015-05-17 05:02:30 +00:00
parent 090c9ac1bf
commit b70d9a976a
10 changed files with 356 additions and 165 deletions

View File

@ -219,7 +219,7 @@ static STIX_INLINE int is_closing_char (stix_cint_t c)
#define ADD_TOKEN_STR(fsc,s) \ #define ADD_TOKEN_STR(fsc,s) \
do { if (add_token_str (fsc, s) == -1) return -1; } while (0) do { if (add_token_str (fsc, s) == -1) return -1; } while (0)
static STIX_INLINE int add_token_char (stix_t* fsc, stix_char_t c) static STIX_INLINE int add_token_char (stix_t* fsc, stix_uch_t c)
{ {
if (fsc->tok.name.len >= STIX_COUNTOF(fsc->tok.buf) - 1) if (fsc->tok.name.len >= STIX_COUNTOF(fsc->tok.buf) - 1)
{ {
@ -232,7 +232,7 @@ static STIX_INLINE int add_token_char (stix_t* fsc, stix_char_t c)
return 0; return 0;
} }
static STIX_INLINE int add_token_str (stix_t* fsc, const stix_char_t* str) static STIX_INLINE int add_token_str (stix_t* fsc, const stix_uch_t* str)
{ {
stix_size_t len; stix_size_t len;
@ -247,49 +247,52 @@ static STIX_INLINE int add_token_str (stix_t* fsc, const stix_char_t* str)
fsc->tok.name.len += stix_strcpy (&fsc->tok.buf[fsc->tok.name.len], str); fsc->tok.name.len += stix_strcpy (&fsc->tok.buf[fsc->tok.name.len], str);
return 0; return 0;
} }
#endif
/*
 * Reads the next character of the current input stream and publishes it,
 * together with its line, column and file name, in stix->c->lxc.
 * The stream buffer is refilled through the I/O handler whenever all of
 * its content has been consumed.
 *
 * Returns 1 if a character has been read, 0 if the handler reported the
 * end of the input, and -1 if the handler failed.
 */
static int get_char (stix_t* stix)
{
	stix_ioarg_t* inp = stix->c->curinp;

	if (inp->b.pos >= inp->b.len)
	{
		/* nothing is left in the buffer. ask the handler for more. */
		stix_ssize_t nread = stix->c->impl (stix, STIX_IO_READ, inp);
		if (nread <= -1) return -1;

		if (nread == 0)
		{
			/* end of input. 0 is stored in place of STIX_CHAR_EOF
			 * and the position is the one the stream ended at. */
			inp->lxc.c = 0;
			inp->lxc.line = inp->line;
			inp->lxc.colm = inp->colm;
			inp->lxc.file = inp->name;
			stix->c->lxc = inp->lxc;
			return 0; /* EOF has been read */
		}

		inp->b.pos = 0;
		inp->b.len = nread;
	}

	if (inp->lxc.c == '\n')
	{
		/* the character fetched last time was a newline, so the
		 * character fetched now begins a new line. the line counter
		 * is bumped at this point rather than when the newline was
		 * stored in inp->lxc so that an EOF following the newline is
		 * still reported on the line of that newline. */
		inp->line += 1;
		inp->colm = 1;
	}

	inp->lxc.c = inp->buf[inp->b.pos];
	inp->b.pos += 1;
	inp->lxc.line = inp->line;
	inp->lxc.colm = inp->colm;
	inp->colm += 1;
	inp->lxc.file = inp->name;
	stix->c->lxc = inp->lxc;
	return 1; /* a normal character has been read */
}
#if 0
static int skip_spaces (stix_t* fsc) static int skip_spaces (stix_t* fsc)
{ {
while (STIX_ISSPACE(fsc->sio.lxc.c)) GET_CHAR (fsc); while (STIX_ISSPACE(fsc->sio.lxc.c)) GET_CHAR (fsc);
@ -653,9 +656,9 @@ retry:
else else
{ {
stix_cstr_t ea; stix_cstr_t ea;
stix_char_t cc; stix_uch_t cc;
cc = (stix_char_t)c; cc = (stix_uch_t)c;
ea.ptr = &cc; ea.ptr = &cc;
ea.len = 1; ea.len = 1;
@ -685,17 +688,17 @@ static int begin_include (stix_t* fsc)
stix_ioarg_t* arg; stix_ioarg_t* arg;
stix_link_t* link; stix_link_t* link;
link = (stix_link_t*) stix_callocmem (fsc, STIX_SIZEOF(*link) + STIX_SIZEOF(stix_char_t) * (fsc->tok.name.len + 1)); link = (stix_link_t*) stix_callocmem (fsc, STIX_SIZEOF(*link) + STIX_SIZEOF(stix_uch_t) * (fsc->tok.name.len + 1));
if (link == STIX_NULL) goto oops; if (link == STIX_NULL) goto oops;
stix_strcpy ((stix_char_t*)(link + 1), fsc->tok.name.ptr); stix_strcpy ((stix_uch_t*)(link + 1), fsc->tok.name.ptr);
link->link = fsc->sio_names; link->link = fsc->sio_names;
fsc->sio_names = link; fsc->sio_names = link;
arg = (stix_ioarg_t*) stix_callocmem (fsc, STIX_SIZEOF(*arg)); arg = (stix_ioarg_t*) stix_callocmem (fsc, STIX_SIZEOF(*arg));
if (arg == STIX_NULL) goto oops; if (arg == STIX_NULL) goto oops;
arg->name = (const stix_char_t*)(link + 1); arg->name = (const stix_uch_t*)(link + 1);
arg->line = 1; arg->line = 1;
arg->colm = 1; arg->colm = 1;
arg->prev = fsc->sio.inp; arg->prev = fsc->sio.inp;
@ -777,7 +780,7 @@ static STIX_INLINE int is_tok_pseudovar (stix_t* fsc)
stix_strequal(fsc->tok.name.ptr, STIX_T("false"))); stix_strequal(fsc->tok.name.ptr, STIX_T("false")));
} }
static STIX_INLINE int is_tok_binsel (stix_t* fsc, const stix_char_t* sel) static STIX_INLINE int is_tok_binsel (stix_t* fsc, const stix_uch_t* sel)
{ {
return fsc->tok.type == STIX_FSC_TOK_BINSEL && return fsc->tok.type == STIX_FSC_TOK_BINSEL &&
stix_strequal (fsc->tok.name.ptr, sel); stix_strequal (fsc->tok.name.ptr, sel);
@ -848,7 +851,7 @@ static STIX_INLINE int is_tok_binsel (stix_t* fsc, const stix_char_t* sel)
#endif #endif
static STIX_INLINE int emit_code_test ( static STIX_INLINE int emit_code_test (
stix_t* fsc, const stix_char_t* high, const stix_char_t* low) stix_t* fsc, const stix_uch_t* high, const stix_uch_t* low)
{ {
wprintf (L"CODE: %s %s\n", high, low); wprintf (L"CODE: %s %s\n", high, low);
return 0; return 0;
@ -1037,7 +1040,7 @@ static int __add_literal (stix_t* fsc, stix_word_t literal)
return fsc->literal_count - 1; return fsc->literal_count - 1;
} }
static int __add_character_literal (stix_t* fsc, stix_char_t ch) static int __add_character_literal (stix_t* fsc, stix_uch_t ch)
{ {
stix_word_t i, c, literal; stix_word_t i, c, literal;
stix_vm_t* stx = fsc->stx; stix_vm_t* stx = fsc->stx;
@ -1056,7 +1059,7 @@ static int __add_character_literal (stix_t* fsc, stix_char_t ch)
} }
static int __add_string_literal ( static int __add_string_literal (
stix_t* fsc, const stix_char_t* str, stix_word_t size) stix_t* fsc, const stix_uch_t* str, stix_word_t size)
{ {
stix_word_t i, c, literal; stix_word_t i, c, literal;
stix_vm_t* stx = fsc->stx; stix_vm_t* stx = fsc->stx;
@ -1077,7 +1080,7 @@ static int __add_string_literal (
} }
static int __add_symbol_literal ( static int __add_symbol_literal (
stix_t* fsc, const stix_char_t* str, stix_word_t size) stix_t* fsc, const stix_uch_t* str, stix_word_t size)
{ {
stix_vm_t* stx = fsc->stx; stix_vm_t* stx = fsc->stx;
return __add_literal (fsc, stix_new_symbolx(stx, str, size)); return __add_literal (fsc, stix_new_symbolx(stx, str, size));
@ -1210,7 +1213,7 @@ static int parse_expression (stix_t* fsc)
stix_vm_t* stx = fsc->stx; stix_vm_t* stx = fsc->stx;
if (fsc->tok.type == STIX_FSC_TOK_IDENT) { if (fsc->tok.type == STIX_FSC_TOK_IDENT) {
stix_char_t* ident = stix_tok_yield (&fsc->tok, 0); stix_uch_t* ident = stix_tok_yield (&fsc->tok, 0);
if (ident == STIX_NULL) { if (ident == STIX_NULL) {
fsc->errnum = STIX_FSC_ERROR_MEMORY; fsc->errnum = STIX_FSC_ERROR_MEMORY;
return -1; return -1;
@ -1242,7 +1245,7 @@ static int parse_expression (stix_t* fsc)
} }
static int parse_basic_expression ( static int parse_basic_expression (
stix_t* fsc, const stix_char_t* ident) stix_t* fsc, const stix_uch_t* ident)
{ {
/* /*
* <basic expression> ::= <primary> [<messages> <cascaded messages>] * <basic expression> ::= <primary> [<messages> <cascaded messages>]
@ -1259,7 +1262,7 @@ static int parse_basic_expression (
} }
static int parse_assignment ( static int parse_assignment (
stix_t* fsc, const stix_char_t* target) stix_t* fsc, const stix_uch_t* target)
{ {
/* /*
* <assignment> ::= <assignment target> assignmentOperator <expression> * <assignment> ::= <assignment target> assignmentOperator <expression>
@ -1304,7 +1307,7 @@ static int parse_assignment (
} }
static int parse_primary ( static int parse_primary (
stix_t* fsc, const stix_char_t* ident, int* is_super) stix_t* fsc, const stix_uch_t* ident, int* is_super)
{ {
/* /*
* <primary> ::= * <primary> ::=
@ -1389,7 +1392,7 @@ static int parse_primary (
} }
static int parse_primary_ident ( static int parse_primary_ident (
stix_t* fsc, const stix_char_t* ident, int* is_super) stix_t* fsc, const stix_uch_t* ident, int* is_super)
{ {
stix_word_t i; stix_word_t i;
stix_vm_t* stx = fsc->stx; stix_vm_t* stx = fsc->stx;
@ -1616,7 +1619,7 @@ static int parse_binary_message (stix_t* fsc, int is_super)
while (fsc->tok.type == STIX_FSC_TOK_BINSEL) while (fsc->tok.type == STIX_FSC_TOK_BINSEL)
{ {
stix_char_t* op = stix_tok_yield (&fsc->tok, 0); stix_uch_t* op = stix_tok_yield (&fsc->tok, 0);
if (op == STIX_NULL) { if (op == STIX_NULL) {
fsc->errnum = STIX_FSC_ERROR_MEMORY; fsc->errnum = STIX_FSC_ERROR_MEMORY;
return -1; return -1;
@ -1707,11 +1710,11 @@ static int parse_method (stix_t* fsc, stix_word_t method_class, void* input)
#endif #endif
static int get_class_type (const stix_char_t* str, class_type_t* type) static int get_class_type (const stix_uch_t* str, class_type_t* type)
{ {
static struct static struct
{ {
stix_char_t* word; stix_uch_t* word;
class_type_t type; class_type_t type;
} tab[] = } tab[] =
{ {
@ -1735,11 +1738,11 @@ static int get_class_type (const stix_char_t* str, class_type_t* type)
return -1; return -1;
} }
static int get_vardef_type (const stix_char_t* str, vardef_type_t* type) static int get_vardef_type (const stix_uch_t* str, vardef_type_t* type)
{ {
static struct static struct
{ {
stix_char_t* word; stix_uch_t* word;
class_type_t type; class_type_t type;
} tab[] = } tab[] =
{ {
@ -2254,13 +2257,20 @@ int stix_compile (stix_t* stix, stix_ioimpl_t io)
stix->c->arg.line = 1; stix->c->arg.line = 1;
stix->c->arg.colm = 1; stix->c->arg.colm = 1;
stix->c->curinp = &stix->c->arg; stix->c->curinp = &stix->c->arg;
clear_sio_names (stix); // clear_sio_names (stix);
/* open the top-level stream */ /* open the top-level stream */
n = stix->c->impl (stix, STIX_IO_OPEN, stix->c->curinp); n = stix->c->impl (stix, STIX_IO_OPEN, stix->c->curinp);
if (n <= -1) return -1; if (n <= -1) return -1;
if (compile_stream (stix) <= -1) goto oops; // if (compile_stream (stix) <= -1) goto oops;
while (get_char(stix) > 0)
{
stix_bch_t buf[16];
stix_size_t len;
len = stix_uctoutf8 (stix->c->curinp->lxc.c, buf, STIX_COUNTOF(buf));
printf ("%.*s", (int)len, buf);
}
/* close the stream */ /* close the stream */
STIX_ASSERT (stix->c->curinp == &stix->c->arg); STIX_ASSERT (stix->c->curinp == &stix->c->arg);

View File

@ -183,7 +183,7 @@ static int ignite_3 (stix_t* stix)
static struct symbol_name_t static struct symbol_name_t
{ {
stix_oow_t len; stix_oow_t len;
stix_char_t str[16]; stix_uch_t str[16];
} symnames[] = { } symnames[] = {
{ 4, { 'S','t','i','x' } }, { 4, { 'S','t','i','x' } },
{ 6, { 'O','b','j','e','c','t' } }, { 6, { 'O','b','j','e','c','t' } },

View File

@ -28,7 +28,7 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h>
typedef struct xtn_t xtn_t; typedef struct xtn_t xtn_t;
struct xtn_t struct xtn_t
@ -64,31 +64,41 @@ static stix_mmgr_t sys_mmgr =
}; };
/*
 * Opens the input stream described by arg and stores the FILE handle in
 * arg->handle.
 *
 * For an included stream (arg->includer is set) the stream name in
 * arg->name is converted to UTF-8 and opened as a file path. For the main
 * stream the source path kept in the extension area is opened.
 *
 * Returns 0 on success. Returns -1 with STIX_EECERR if the name cannot be
 * converted, or with STIX_EIOERR if the file cannot be opened.
 */
static STIX_INLINE stix_ssize_t open_input (stix_t* stix, stix_ioarg_t* arg)
{
	if (arg->includer)
	{
		/* includee */
		stix_bch_t bcs[1024]; /* TODO: right buffer size */
		stix_size_t bcslen = STIX_COUNTOF(bcs);
		stix_size_t ucslen = ~(stix_size_t)0; /* arg->name is null-terminated */

		if (stix_ucstoutf8 (arg->name, &ucslen, bcs, &bcslen) <= -1)
		{
			stix_seterrnum (stix, STIX_EECERR);
			return -1;
		}

		/* the null-terminated conversion succeeds only when the result,
		 * including the terminating null, fits in bcs. it is therefore
		 * safe to hand bcs to fopen() as a path. without this call the
		 * handle of an included stream would never be set. */
		arg->handle = fopen (bcs, "r");
	}
	else
	{
		/* main stream */
		xtn_t* xtn = stix_getxtn(stix);
		arg->handle = fopen (xtn->source_path, "r");
	}

	if (!arg->handle)
	{
		stix_seterrnum (stix, STIX_EIOERR);
		return -1;
	}

	return 0;
}
static STIX_INLINE stix_ssize_t read_input (stix_t* stix, stix_ioarg_t* arg)
{ {
xtn_t* xtn = stix_getxtn(stix); xtn_t* xtn = stix_getxtn(stix);
stix_size_t n, bcslen, cslen; stix_size_t n, bcslen, ucslen, remlen;
int x; int x;
STIX_ASSERT (arg->handle != STIX_NULL); STIX_ASSERT (arg->handle != STIX_NULL);
@ -100,32 +110,32 @@ static STIX_INLINE stix_oow_t read_input (stix_t* stix, stix_ioarg_t* arg)
stix_seterrnum (stix, STIX_EIOERR); stix_seterrnum (stix, STIX_EIOERR);
return -1; return -1;
} }
} }
xtn->bchar_len += n; xtn->bchar_len += n;
bcslen = xtn->bchar_len; bcslen = xtn->bchar_len;
cslen = STIX_COUNTOF(arg->buf); ucslen = STIX_COUNTOF(arg->buf);
x = stix_utf8toucs (xtn->bchar_buf, &bcslen, arg->buf, &cslen); x = stix_utf8toucs (xtn->bchar_buf, &bcslen, arg->buf, &ucslen);
if (x == -2) if (x <= -1 && ucslen <= 0)
{ {
/* buffer to small */ stix_seterrnum (stix, STIX_EECERR);
} return -1;
if (x <= -1)
{
}
} }
static STIX_INLINE stix_oow_t close_input (stix_t* stix, stix_ioarg_t* arg) remlen = xtn->bchar_len - bcslen;
if (remlen > 0) memmove (xtn->bchar_buf, &xtn->bchar_buf[bcslen], remlen);
xtn->bchar_len = remlen;
return ucslen;
}
/*
 * Closes the FILE handle stored in arg->handle. The handle must have been
 * set before this is called. The result of fclose() is not inspected and
 * 0 is always returned.
 */
static STIX_INLINE stix_ssize_t close_input (stix_t* stix, stix_ioarg_t* arg)
{
	STIX_ASSERT (arg->handle != STIX_NULL);

	(void)fclose (arg->handle);

	return 0;
}
/* TODO: IMPLEMENT PROPER INPUT HANDLER */
static stix_oow_t input_handler (stix_t* stix, stix_iocmd_t cmd, stix_ioarg_t* arg) static stix_ssize_t input_handler (stix_t* stix, stix_iocmd_t cmd, stix_ioarg_t* arg)
{ {
switch (cmd) switch (cmd)
{ {
@ -217,8 +227,8 @@ int main (int argc, char* argv[])
} }
{ {
stix_char_t x[] = { 'S', 't', 'r', 'i', 'n', 'g', '\0' }; stix_uch_t x[] = { 'S', 't', 'r', 'i', 'n', 'g', '\0' };
stix_char_t y[] = { 'S', 'y', 'm', 'b', 'o', 'l', '\0' }; stix_uch_t y[] = { 'S', 'y', 'm', 'b', 'o', 'l', '\0' };
stix_oop_t a, b; stix_oop_t a, b;
a = stix_makesymbol (stix, x, 6); a = stix_makesymbol (stix, x, 6);

View File

@ -617,4 +617,5 @@ Single line comment
#! comment text (easy handling to skip hash bang) #! comment text (easy handling to skip hash bang)
Multi-line comments - double quoted as in smalltalk Multi-line comments - double quoted as in smalltalk
" comment text " " comment text 설명이라지요. "

View File

@ -114,9 +114,9 @@ static stix_oop_t alloc_numeric_array (stix_t* stix, const void* ptr, stix_oow_t
return hdr; return hdr;
} }
stix_oop_t stix_alloccharobj (stix_t* stix, const stix_char_t* ptr, stix_oow_t len) stix_oop_t stix_alloccharobj (stix_t* stix, const stix_uch_t* ptr, stix_oow_t len)
{ {
return alloc_numeric_array (stix, ptr, len, STIX_OBJ_TYPE_CHAR, STIX_SIZEOF(stix_char_t), 1); return alloc_numeric_array (stix, ptr, len, STIX_OBJ_TYPE_CHAR, STIX_SIZEOF(stix_uch_t), 1);
} }
stix_oop_t stix_allocuint8obj (stix_t* stix, const stix_uint8_t* ptr, stix_oow_t len) stix_oop_t stix_allocuint8obj (stix_t* stix, const stix_uint8_t* ptr, stix_oow_t len)

View File

@ -140,10 +140,10 @@ typedef enum stix_iocmd_t stix_iocmd_t;
typedef struct stix_iolxc_t stix_iolxc_t; typedef struct stix_iolxc_t stix_iolxc_t;
struct stix_iolxc_t struct stix_iolxc_t
{ {
stix_char_t c; /**< character */ stix_uch_t c; /**< character */
unsigned long line; /**< line */ unsigned long line; /**< line */
unsigned long colm; /**< column */ unsigned long colm; /**< column */
const stix_char_t* file; /**< file specified in #include */ const stix_uch_t* file; /**< file specified in #include */
}; };
enum stix_ioarg_flag_t enum stix_ioarg_flag_t
@ -160,7 +160,7 @@ struct stix_ioarg_t
* It is #STIX_NULL for the main stream and points to a non-NULL string * It is #STIX_NULL for the main stream and points to a non-NULL string
* for an included stream. * for an included stream.
*/ */
const stix_char_t* name; const stix_uch_t* name;
/** /**
* [OUT] I/O handle set by a handler. * [OUT] I/O handle set by a handler.
@ -173,7 +173,7 @@ struct stix_ioarg_t
/** /**
* [OUT] place data here * [OUT] place data here
*/ */
stix_char_t buf[1024]; stix_uch_t buf[1024];
/** /**
* [IN] points to the data of the includer. It is #STIX_NULL for the * [IN] points to the data of the includer. It is #STIX_NULL for the
@ -195,7 +195,7 @@ struct stix_ioarg_t
/*-----------------------------------------------------------------*/ /*-----------------------------------------------------------------*/
}; };
typedef stix_oow_t (*stix_ioimpl_t) ( typedef stix_ssize_t (*stix_ioimpl_t) (
stix_t* stix, stix_t* stix,
stix_iocmd_t cmd, stix_iocmd_t cmd,
stix_ioarg_t* arg stix_ioarg_t* arg
@ -260,13 +260,13 @@ stix_oow_t stix_hashbytes (
); );
stix_oow_t stix_hashchars ( stix_oow_t stix_hashchars (
const stix_char_t* ptr, const stix_uch_t* ptr,
stix_oow_t len stix_oow_t len
); );
int stix_equalchars ( int stix_equalchars (
const stix_char_t* str1, const stix_uch_t* str1,
const stix_char_t* str2, const stix_uch_t* str2,
stix_oow_t len stix_oow_t len
); );
@ -289,7 +289,7 @@ stix_oop_t stix_allocoopobj (
stix_oop_t stix_alloccharobj ( stix_oop_t stix_alloccharobj (
stix_t* stix, stix_t* stix,
const stix_char_t* ptr, const stix_uch_t* ptr,
stix_oow_t len stix_oow_t len
); );
@ -310,13 +310,13 @@ stix_oop_t stix_allocuint16obj (
/* ========================================================================= */ /* ========================================================================= */
stix_oop_t stix_makesymbol ( stix_oop_t stix_makesymbol (
stix_t* stix, stix_t* stix,
const stix_char_t* ptr, const stix_uch_t* ptr,
stix_oow_t len stix_oow_t len
); );
stix_oop_t stix_findsymbol ( stix_oop_t stix_findsymbol (
stix_t* stix, stix_t* stix,
const stix_char_t* ptr, const stix_uch_t* ptr,
stix_oow_t len stix_oow_t len
); );
@ -338,22 +338,21 @@ stix_oop_t stix_getatsysdic (
/* utf8.c */ /* utf8.c */
/* ========================================================================= */ /* ========================================================================= */
stix_size_t stix_uctoutf8 ( stix_size_t stix_uctoutf8 (
stix_char_t uc, stix_uch_t uc,
stix_bchar_t* utf8, stix_bch_t* utf8,
stix_size_t size stix_size_t size
); );
stix_size_t stix_utf8touc ( stix_size_t stix_utf8touc (
const stix_bchar_t* utf8, const stix_bch_t* utf8,
stix_size_t size, stix_size_t size,
stix_char_t* uc stix_uch_t* uc
); );
int stix_ucstoutf8 ( int stix_ucstoutf8 (
const stix_char_t* ucs, const stix_uch_t* ucs,
stix_size_t* ucslen, stix_size_t* ucslen,
stix_bchar_t* bcs, stix_bch_t* bcs,
stix_size_t* bcslen stix_size_t* bcslen
); );
@ -363,27 +362,46 @@ int stix_ucstoutf8 (
* It never returns -2 if \a ucs is #STIX_NULL. * It never returns -2 if \a ucs is #STIX_NULL.
* *
* \code * \code
* const stix_bchar_t* bcs = "a multibyte string"; * const stix_bch_t* bcs = "test string";
* stix_char_t ucs[100]; * stix_uch_t ucs[100];
* qse_size_t ucslen = STIX_COUNTOF(buf), n; * qse_size_t ucslen = STIX_COUNTOF(buf), n;
* qse_size_t bcslen = strlen(bcs); * qse_size_t bcslen = 11;
* int n; * int n;
* n = qse_bcstoucs (bcs, &bcslen, ucs, &ucslen); * n = qse_bcstoucs (bcs, &bcslen, ucs, &ucslen);
* if (n <= -1) { invalid/incomplenete sequence or buffer to small } * if (n <= -1) { invalid/incomplenete sequence or buffer to small }
* \endcode * \endcode
* *
* For a null-terminated string, you can specify ~(stix_size_t)0 in
* \a bcslen. The destination buffer \a ucs also must be large enough to
* store a terminating null. Otherwise, -2 is returned.
*
* The resulting \a ucslen can still be greater than 0 even if the return
* value is negative. The value indicates the number of characters converted
* before the error has occurred.
*
* \return 0 on success. * \return 0 on success.
* -1 if \a bcs contains an illegal character. * -1 if \a bcs contains an illegal character.
* -2 if the wide-character string buffer is too small. * -2 if the wide-character string buffer is too small.
* -3 if \a bcs is not a complete sequence. * -3 if \a bcs is not a complete sequence.
*/ */
int stix_utf8toucs ( int stix_utf8toucs (
const stix_bchar_t* bcs, const stix_bch_t* bcs,
stix_size_t* bcslen, stix_size_t* bcslen,
stix_char_t* ucs, stix_uch_t* ucs,
stix_size_t* ucslen stix_size_t* ucslen
); );
/**
* The stix_ucslen() function returns the number of characters before
* a terminating null.
*/
/*
stix_size_t stix_ucslen (
const stix_uch_t* ucs
);
*/
/* ========================================================================= */ /* ========================================================================= */
/* comp.c */ /* comp.c */
/* ========================================================================= */ /* ========================================================================= */

View File

@ -155,12 +155,12 @@ stix_oow_t stix_hashbytes (const stix_uint8_t* ptr, stix_oow_t len)
return h; return h;
} }
stix_oow_t stix_hashchars (const stix_char_t* ptr, stix_oow_t len) stix_oow_t stix_hashchars (const stix_uch_t* ptr, stix_oow_t len)
{ {
return stix_hashbytes ((const stix_uint8_t *)ptr, len * STIX_SIZEOF(*ptr)); return stix_hashbytes ((const stix_uint8_t *)ptr, len * STIX_SIZEOF(*ptr));
} }
int stix_equalchars (const stix_char_t* str1, const stix_char_t* str2, stix_oow_t len) int stix_equalchars (const stix_uch_t* str1, const stix_uch_t* str2, stix_oow_t len)
{ {
stix_oow_t i; stix_oow_t i;

View File

@ -44,9 +44,26 @@ typedef unsigned short int stix_uint16_t;
#endif #endif
typedef unsigned long int stix_uintptr_t; typedef unsigned long int stix_uintptr_t;
typedef unsigned long int stix_size_t; typedef unsigned long int stix_size_t;
typedef long int stix_ssize_t;
typedef unsigned short int stix_char_t; /* TODO ... wchar_t??? */ typedef unsigned short int stix_uch_t; /* TODO ... wchar_t??? */
typedef char stix_bchar_t; typedef char stix_bch_t;
/* A pointer/length pair referring to a run of stix_uch_t characters. */
struct stix_ucs_t
{
stix_uch_t* ptr; /* first character of the run */
stix_size_t len; /* length of the run - presumably counted in characters; TODO confirm */
};
/* A pointer/length pair referring to a run of stix_bch_t characters. */
struct stix_bcs_t
{
stix_bch_t* ptr; /* first character of the run */
stix_size_t len; /* length of the run - presumably counted in stix_bch_t units; TODO confirm */
};
/* shorthand names so that the struct keyword can be omitted */
typedef struct stix_ucs_t stix_ucs_t;
typedef struct stix_bcs_t stix_bcs_t;
/* ========================================================================= /* =========================================================================
* PRIMITIVE MACROS * PRIMITIVE MACROS
@ -185,15 +202,15 @@ struct stix_mmgr_t
typedef struct stix_cmgr_t stix_cmgr_t; typedef struct stix_cmgr_t stix_cmgr_t;
typedef stix_size_t (*stix_cmgr_bctoc_t) ( typedef stix_size_t (*stix_cmgr_bctouc_t) (
const stix_bchar_t* mb, const stix_bch_t* mb,
stix_size_t size, stix_size_t size,
stix_char_t* wc stix_uch_t* wc
); );
typedef stix_size_t (*stix_cmgr_ctobc_t) ( typedef stix_size_t (*stix_cmgr_uctobc_t) (
stix_char_t wc, stix_uch_t wc,
stix_bchar_t* mb, stix_bch_t* mb,
stix_size_t size stix_size_t size
); );
@ -206,8 +223,8 @@ typedef stix_size_t (*stix_cmgr_ctobc_t) (
*/ */
struct stix_cmgr_t struct stix_cmgr_t
{ {
stix_cmgr_bctoc_t bctoc; stix_cmgr_bctouc_t bctouc;
stix_cmgr_ctobc_t ctobc; stix_cmgr_uctobc_t uctobc;
}; };
/* ========================================================================= /* =========================================================================
@ -258,7 +275,8 @@ enum stix_errnum_t
STIX_ENOMEM, /**< insufficient memory */ STIX_ENOMEM, /**< insufficient memory */
STIX_EINVAL, /**< invalid parameter or data */ STIX_EINVAL, /**< invalid parameter or data */
STIX_ENOENT, /**< no matching entry */ STIX_ENOENT, /**< no matching entry */
STIX_EIOERR /**< I/O error */ STIX_EIOERR, /**< I/O error */
STIX_EECERR /**< encoding conversion error */
}; };
typedef enum stix_errnum_t stix_errnum_t; typedef enum stix_errnum_t stix_errnum_t;
@ -605,7 +623,7 @@ struct stix_obj_oop_t
struct stix_obj_char_t struct stix_obj_char_t
{ {
STIX_OBJ_HEADER; STIX_OBJ_HEADER;
stix_char_t slot[1]; stix_uch_t slot[1];
}; };
struct stix_obj_uint8_t struct stix_obj_uint8_t
@ -832,7 +850,7 @@ STIX_EXPORT void stix_gc (
*/ */
STIX_EXPORT int stix_findclass ( STIX_EXPORT int stix_findclass (
stix_t* vm, stix_t* vm,
const stix_char_t* name, const stix_uch_t* name,
stix_oop_t* oop stix_oop_t* oop
); );

View File

@ -57,7 +57,7 @@ static stix_oop_oop_t expand_bucket (stix_t* stix, stix_oop_oop_t old_bucket)
return new_bucket; return new_bucket;
} }
static stix_oop_t find_or_make_symbol (stix_t* stix, const stix_char_t* ptr, stix_oow_t len, int create) static stix_oop_t find_or_make_symbol (stix_t* stix, const stix_uch_t* ptr, stix_oow_t len, int create)
{ {
stix_oow_t index, tally; stix_oow_t index, tally;
stix_oop_char_t symbol; stix_oop_char_t symbol;
@ -130,12 +130,12 @@ static stix_oop_t find_or_make_symbol (stix_t* stix, const stix_char_t* ptr, sti
return (stix_oop_t)symbol; return (stix_oop_t)symbol;
} }
stix_oop_t stix_makesymbol (stix_t* stix, const stix_char_t* ptr, stix_oow_t len) stix_oop_t stix_makesymbol (stix_t* stix, const stix_uch_t* ptr, stix_oow_t len)
{ {
return find_or_make_symbol (stix, ptr, len, 1); return find_or_make_symbol (stix, ptr, len, 1);
} }
stix_oop_t stix_findsymbol (stix_t* stix, const stix_char_t* ptr, stix_oow_t len) stix_oop_t stix_findsymbol (stix_t* stix, const stix_uch_t* ptr, stix_oow_t len)
{ {
return find_or_make_symbol (stix, ptr, len, 0); return find_or_make_symbol (stix, ptr, len, 0);
} }

View File

@ -26,7 +26,7 @@
#include "stix-prv.h" #include "stix-prv.h"
#define STIX_BCLEN_MAX 16 #define STIX_BCLEN_MAX 6
/* /*
* from RFC 2279 UTF-8, a transformation format of ISO 10646 * from RFC 2279 UTF-8, a transformation format of ISO 10646
@ -62,12 +62,12 @@ static __utf8_t utf8_table[] =
{0x04000000ul, 0x7FFFFFFFul, 0xFC, 0xFE, 0x01, 6} {0x04000000ul, 0x7FFFFFFFul, 0xFC, 0xFE, 0x01, 6}
}; };
static STIX_INLINE __utf8_t* get_utf8_slot (stix_char_t uc) static STIX_INLINE __utf8_t* get_utf8_slot (stix_uch_t uc)
{ {
__utf8_t* cur, * end; __utf8_t* cur, * end;
STIX_ASSERT (STIX_SIZEOF(stix_bchar_t) == 1); STIX_ASSERT (STIX_SIZEOF(stix_bch_t) == 1);
STIX_ASSERT (STIX_SIZEOF(stix_char_t) >= 2); STIX_ASSERT (STIX_SIZEOF(stix_uch_t) >= 2);
end = utf8_table + STIX_COUNTOF(utf8_table); end = utf8_table + STIX_COUNTOF(utf8_table);
cur = utf8_table; cur = utf8_table;
@ -81,7 +81,7 @@ static STIX_INLINE __utf8_t* get_utf8_slot (stix_char_t uc)
return STIX_NULL; /* invalid character */ return STIX_NULL; /* invalid character */
} }
stix_size_t stix_uctoutf8 (stix_char_t uc, stix_bchar_t* utf8, stix_size_t size) stix_size_t stix_uctoutf8 (stix_uch_t uc, stix_bch_t* utf8, stix_size_t size)
{ {
__utf8_t* cur = get_utf8_slot (uc); __utf8_t* cur = get_utf8_slot (uc);
@ -108,14 +108,14 @@ stix_size_t stix_uctoutf8 (stix_char_t uc, stix_bchar_t* utf8, stix_size_t size)
return (stix_size_t)cur->length; return (stix_size_t)cur->length;
} }
stix_size_t stix_utf8touc (const stix_bchar_t* utf8, stix_size_t size, stix_char_t* uc) stix_size_t stix_utf8touc (const stix_bch_t* utf8, stix_size_t size, stix_uch_t* uc)
{ {
__utf8_t* cur, * end; __utf8_t* cur, * end;
STIX_ASSERT (utf8 != STIX_NULL); STIX_ASSERT (utf8 != STIX_NULL);
STIX_ASSERT (size > 0); STIX_ASSERT (size > 0);
STIX_ASSERT (STIX_SIZEOF(stix_bchar_t) == 1); STIX_ASSERT (STIX_SIZEOF(stix_bch_t) == 1);
STIX_ASSERT (STIX_SIZEOF(stix_char_t) >= 2); STIX_ASSERT (STIX_SIZEOF(stix_uch_t) >= 2);
end = utf8_table + STIX_COUNTOF(utf8_table); end = utf8_table + STIX_COUNTOF(utf8_table);
cur = utf8_table; cur = utf8_table;
@ -135,7 +135,7 @@ stix_size_t stix_utf8touc (const stix_bchar_t* utf8, stix_size_t size, stix_char
if (uc) if (uc)
{ {
stix_char_t w; stix_uch_t w;
w = utf8[0] & cur->fmask; w = utf8[0] & cur->fmask;
for (i = 1; i < cur->length; i++) for (i = 1; i < cur->length; i++)
@ -167,9 +167,9 @@ stix_size_t stix_utf8touc (const stix_bchar_t* utf8, stix_size_t size, stix_char
} }
/* this return value can indicate both /* this return value can indicate both
* the correct length (len >= cur->length) * the correct length (size >= cur->length)
* and * and
* the incomplete seqeunce error (len < cur->length). * the incomplete seqeunce error (size < cur->length).
*/ */
return (stix_size_t)cur->length; return (stix_size_t)cur->length;
} }
@ -179,28 +179,26 @@ stix_size_t stix_utf8touc (const stix_bchar_t* utf8, stix_size_t size, stix_char
return 0; /* error - invalid sequence */ return 0; /* error - invalid sequence */
} }
stix_size_t stix_utf8len (const stix_bchar_t* utf8, stix_size_t size)
{
return stix_utf8touc (utf8, size, STIX_NULL);
}
/* ----------------------------------------------------------------------- */ /* ----------------------------------------------------------------------- */
static int bcsn_to_csn_with_cmgr ( static STIX_INLINE int bcsn_to_ucsn_with_cmgr (
const stix_bchar_t* bcs, stix_size_t* bcslen, const stix_bch_t* bcs, stix_size_t* bcslen,
stix_char_t* cs, stix_size_t* cslen, stix_cmgr_t* cmgr, int all) stix_uch_t* ucs, stix_size_t* ucslen, stix_cmgr_t* cmgr, int all)
{ {
const stix_bchar_t* p; const stix_bch_t* p;
int ret = 0; int ret = 0;
stix_size_t mlen; stix_size_t mlen;
if (cs) if (ucs)
{ {
stix_char_t* q, * qend; /* destination buffer is specified.
* copy the conversion result to the buffer */
stix_uch_t* q, * qend;
p = bcs; p = bcs;
q = cs; q = ucs;
qend = cs + *cslen; qend = ucs + *ucslen;
mlen = *bcslen; mlen = *bcslen;
while (mlen > 0) while (mlen > 0)
@ -214,7 +212,7 @@ static int bcsn_to_csn_with_cmgr (
break; break;
} }
n = cmgr->bctoc (p, mlen, q); n = cmgr->bctouc (p, mlen, q);
if (n == 0) if (n == 0)
{ {
/* invalid sequence */ /* invalid sequence */
@ -249,12 +247,18 @@ static int bcsn_to_csn_with_cmgr (
mlen -= n; mlen -= n;
} }
*cslen = q - cs; *ucslen = q - ucs;
*bcslen = p - bcs; *bcslen = p - bcs;
} }
else else
{ {
stix_char_t w; /* no destination buffer is specified. perform conversion
* but don't copy the result. the caller can call this function
* without a buffer to find the required buffer size, allocate
* a buffer with the size and call this function again with
* the buffer. */
stix_uch_t w;
stix_size_t wlen = 0; stix_size_t wlen = 0;
p = bcs; p = bcs;
@ -264,7 +268,7 @@ static int bcsn_to_csn_with_cmgr (
{ {
stix_size_t n; stix_size_t n;
n = cmgr->bctoc (p, mlen, &w); n = cmgr->bctouc (p, mlen, &w);
if (n == 0) if (n == 0)
{ {
/* invalid sequence */ /* invalid sequence */
@ -291,19 +295,42 @@ static int bcsn_to_csn_with_cmgr (
wlen += 1; wlen += 1;
} }
*cslen = wlen; *ucslen = wlen;
*bcslen = p - bcs; *bcslen = p - bcs;
} }
return ret; return ret;
} }
static int csn_to_bcsn_with_cmgr ( static STIX_INLINE int bcs_to_ucs_with_cmgr (
const stix_char_t* cs, stix_size_t* cslen, const stix_bch_t* bcs, stix_size_t* bcslen,
stix_bchar_t* bcs, stix_size_t* bcslen, stix_cmgr_t* cmgr) stix_uch_t* ucs, stix_size_t* ucslen, stix_cmgr_t* cmgr, int all)
{ {
const stix_char_t* p = cs; const stix_bch_t* bp;
const stix_char_t* end = cs + *cslen; stix_size_t mlen, wlen;
int n;
for (bp = bcs; *bp != '\0'; bp++);
mlen = bp - bcs; wlen = *ucslen;
n = bcsn_to_ucsn_with_cmgr (bcs, &mlen, ucs, &wlen, cmgr, all);
if (ucs)
{
/* null-terminate the target buffer if it has room for it. */
if (wlen < *ucslen) ucs[wlen] = '\0';
else n = -2; /* buffer too small */
}
*bcslen = mlen; *ucslen = wlen;
return n;
}
static STIX_INLINE int ucsn_to_bcsn_with_cmgr (
const stix_uch_t* ucs, stix_size_t* ucslen,
stix_bch_t* bcs, stix_size_t* bcslen, stix_cmgr_t* cmgr)
{
const stix_uch_t* p = ucs;
const stix_uch_t* end = ucs + *ucslen;
int ret = 0; int ret = 0;
if (bcs) if (bcs)
@ -320,7 +347,7 @@ static int csn_to_bcsn_with_cmgr (
break; break;
} }
n = cmgr->ctobc (*p, bcs, rem); n = cmgr->uctobc (*p, bcs, rem);
if (n == 0) if (n == 0)
{ {
ret = -1; ret = -1;
@ -338,14 +365,96 @@ static int csn_to_bcsn_with_cmgr (
} }
else else
{ {
stix_bchar_t bcsbuf[STIX_BCLEN_MAX]; stix_bch_t bcsbuf[STIX_BCLEN_MAX];
stix_size_t mlen = 0; stix_size_t mlen = 0;
while (p < end) while (p < end)
{ {
stix_size_t n; stix_size_t n;
n = cmgr->ctobc (*p, bcsbuf, STIX_COUNTOF(bcsbuf)); n = cmgr->uctobc (*p, bcsbuf, STIX_COUNTOF(bcsbuf));
if (n == 0)
{
ret = -1;
break; /* illegal character */
}
/* it assumes that bcsbuf is large enough to hold a character */
STIX_ASSERT (n <= STIX_COUNTOF(bcsbuf));
p++; mlen += n;
}
/* this length excludes the terminating null character.
* this function doesn't even null-terminate the result. */
*bcslen = mlen;
}
*ucslen = p - ucs;
return ret;
}
static int ucs_to_bcs_with_cmgr (
const stix_uch_t* ucs, stix_size_t* ucslen,
stix_bch_t* bcs, stix_size_t* bcslen, stix_cmgr_t* cmgr)
{
const stix_uch_t* p = ucs;
int ret = 0;
if (bcs)
{
stix_size_t rem = *bcslen;
while (*p != '\0')
{
stix_size_t n;
if (rem <= 0)
{
ret = -2;
break;
}
n = cmgr->uctobc (*p, bcs, rem);
if (n == 0)
{
ret = -1;
break; /* illegal character */
}
if (n > rem)
{
ret = -2;
break; /* buffer too small */
}
bcs += n; rem -= n; p++;
}
/* update bcslen to the length of the bcs string converted excluding
* terminating null */
*bcslen -= rem;
/* null-terminate the multibyte sequence if it has sufficient space */
if (rem > 0) *bcs = '\0';
else
{
/* if ret is -2 and cs[cslen] == '\0',
* this means that the bcs buffer was lacking one
* slot for the terminating null */
ret = -2; /* buffer too small */
}
}
else
{
stix_bch_t bcsbuf[STIX_BCLEN_MAX];
stix_size_t mlen = 0;
while (*p != '\0')
{
stix_size_t n;
n = cmgr->uctobc (*p, bcsbuf, STIX_COUNTOF(bcsbuf));
if (n == 0) if (n == 0)
{ {
ret = -1; ret = -1;
@ -353,22 +462,20 @@ static int csn_to_bcsn_with_cmgr (
} }
/* it assumes that bcs is large enough to hold a character */ /* it assumes that bcs is large enough to hold a character */
STIX_ASSERT (n <= STIX_COUNTOF(bcsbuf)); STIX_ASSERT (n <= STIX_COUNTOF(bcs));
p++; mlen += n; p++; mlen += n;
} }
/* this length excludes the terminating null character. /* this length holds the number of resulting multi-byte characters
* this function doesn't event null-terminate the result. */ * excluding the terminating null character */
*bcslen = mlen; *bcslen = mlen;
} }
*cslen = p - cs; *ucslen = p - ucs; /* the number of wide characters handled. */
return ret; return ret;
} }
static stix_cmgr_t utf8_cmgr = static stix_cmgr_t utf8_cmgr =
{ {
stix_utf8touc, stix_utf8touc,
@ -376,15 +483,42 @@ static stix_cmgr_t utf8_cmgr =
}; };
int stix_utf8toucs ( int stix_utf8toucs (
const stix_bchar_t* bcs, stix_size_t* bcslen, const stix_bch_t* bcs, stix_size_t* bcslen,
stix_char_t* ucs, stix_size_t* ucslen) stix_uch_t* ucs, stix_size_t* ucslen)
{ {
return bcsn_to_csn_with_cmgr (bcs, bcslen, ucs, ucslen, &utf8_cmgr, 0); if (*bcslen == ~(stix_size_t)0)
{
/* the source is null-terminated. */
return bcs_to_ucs_with_cmgr (bcs, bcslen, ucs, ucslen, &utf8_cmgr, 0);
}
else
{
/* the source is length bound */
return bcsn_to_ucsn_with_cmgr (bcs, bcslen, ucs, ucslen, &utf8_cmgr, 0);
}
} }
int stix_ucstoutf8 ( int stix_ucstoutf8 (
const stix_char_t* ucs, stix_size_t *ucslen, const stix_uch_t* ucs, stix_size_t *ucslen,
stix_bchar_t* bcs, stix_size_t* bcslen) stix_bch_t* bcs, stix_size_t* bcslen)
{ {
return csn_to_bcsn_with_cmgr (ucs, ucslen, bcs, bcslen, &utf8_cmgr); if (*ucslen == ~(stix_size_t)0)
{
/* null-terminated */
return ucs_to_bcs_with_cmgr (ucs, ucslen, bcs, bcslen, &utf8_cmgr);
} }
else
{
/* length bound */
return ucsn_to_bcsn_with_cmgr (ucs, ucslen, bcs, bcslen, &utf8_cmgr);
}
}
/*
stix_size_t stix_ucslen (const stix_uch_t* ucs)
{
const stix_uch_t* ptr = ucs;
while (*ptr) ptr = STIX_INCPTR(const stix_uch_t, ptr, 1);
return STIX_SUBPTR(const stix_uch_t, ptr, ucs);
}
*/