added utf8 string conversion functions

This commit is contained in:
hyunghwan.chung 2015-05-17 05:02:30 +00:00
parent 090c9ac1bf
commit b70d9a976a
10 changed files with 356 additions and 165 deletions

View File

@ -219,7 +219,7 @@ static STIX_INLINE int is_closing_char (stix_cint_t c)
#define ADD_TOKEN_STR(fsc,s) \
do { if (add_token_str (fsc, s) == -1) return -1; } while (0)
static STIX_INLINE int add_token_char (stix_t* fsc, stix_char_t c)
static STIX_INLINE int add_token_char (stix_t* fsc, stix_uch_t c)
{
if (fsc->tok.name.len >= STIX_COUNTOF(fsc->tok.buf) - 1)
{
@ -232,7 +232,7 @@ static STIX_INLINE int add_token_char (stix_t* fsc, stix_char_t c)
return 0;
}
static STIX_INLINE int add_token_str (stix_t* fsc, const stix_char_t* str)
static STIX_INLINE int add_token_str (stix_t* fsc, const stix_uch_t* str)
{
stix_size_t len;
@ -247,49 +247,52 @@ static STIX_INLINE int add_token_str (stix_t* fsc, const stix_char_t* str)
fsc->tok.name.len += stix_strcpy (&fsc->tok.buf[fsc->tok.name.len], str);
return 0;
}
#endif
static int get_char (stix_t* fsc)
static int get_char (stix_t* stix)
{
stix_ssize_t n;
if (fsc->sio.inp->b.pos >= fsc->sio.inp->b.len)
if (stix->c->curinp->b.pos >= stix->c->curinp->b.len)
{
n = fsc->sio.impl (fsc, STIX_FSC_IO_READ, fsc->sio.inp);
n = stix->c->impl (stix, STIX_IO_READ, stix->c->curinp);
if (n <= -1) return -1;
if (n == 0)
{
fsc->sio.inp->lxc.c = STIX_CHAR_EOF;
fsc->sio.inp->lxc.line = fsc->sio.inp->line;
fsc->sio.inp->lxc.colm = fsc->sio.inp->colm;
fsc->sio.inp->lxc.file = fsc->sio.inp->name;
fsc->sio.lxc = fsc->sio.inp->lxc;
return 0;
// stix->c->curinp->lxc.c = STIX_CHAR_EOF;
stix->c->curinp->lxc.c = 0;
stix->c->curinp->lxc.line = stix->c->curinp->line;
stix->c->curinp->lxc.colm = stix->c->curinp->colm;
stix->c->curinp->lxc.file = stix->c->curinp->name;
stix->c->lxc = stix->c->curinp->lxc;
return 0; /* indicate that EOF has been read */
}
fsc->sio.inp->b.pos = 0;
fsc->sio.inp->b.len = n;
stix->c->curinp->b.pos = 0;
stix->c->curinp->b.len = n;
}
if (fsc->sio.inp->lxc.c == STIX_T('\n'))
if (stix->c->curinp->lxc.c == '\n')
{
/* if the previous charater was a newline,
* increment the line counter and reset column to 1.
* incrementing it line number here instead of
* updating inp->lxc causes the line number for
* TOK_EOF to be the same line as the lxc newline. */
fsc->sio.inp->line++;
fsc->sio.inp->colm = 1;
stix->c->curinp->line++;
stix->c->curinp->colm = 1;
}
fsc->sio.inp->lxc.c = fsc->sio.inp->buf[fsc->sio.inp->b.pos++];
fsc->sio.inp->lxc.line = fsc->sio.inp->line;
fsc->sio.inp->lxc.colm = fsc->sio.inp->colm++;
fsc->sio.inp->lxc.file = fsc->sio.inp->name;
fsc->sio.lxc = fsc->sio.inp->lxc;
return 0;
stix->c->curinp->lxc.c = stix->c->curinp->buf[stix->c->curinp->b.pos++];
stix->c->curinp->lxc.line = stix->c->curinp->line;
stix->c->curinp->lxc.colm = stix->c->curinp->colm++;
stix->c->curinp->lxc.file = stix->c->curinp->name;
stix->c->lxc = stix->c->curinp->lxc;
return 1; /* indicate that a normal character has been read */
}
#if 0
static int skip_spaces (stix_t* fsc)
{
while (STIX_ISSPACE(fsc->sio.lxc.c)) GET_CHAR (fsc);
@ -653,9 +656,9 @@ retry:
else
{
stix_cstr_t ea;
stix_char_t cc;
stix_uch_t cc;
cc = (stix_char_t)c;
cc = (stix_uch_t)c;
ea.ptr = &cc;
ea.len = 1;
@ -685,17 +688,17 @@ static int begin_include (stix_t* fsc)
stix_ioarg_t* arg;
stix_link_t* link;
link = (stix_link_t*) stix_callocmem (fsc, STIX_SIZEOF(*link) + STIX_SIZEOF(stix_char_t) * (fsc->tok.name.len + 1));
link = (stix_link_t*) stix_callocmem (fsc, STIX_SIZEOF(*link) + STIX_SIZEOF(stix_uch_t) * (fsc->tok.name.len + 1));
if (link == STIX_NULL) goto oops;
stix_strcpy ((stix_char_t*)(link + 1), fsc->tok.name.ptr);
stix_strcpy ((stix_uch_t*)(link + 1), fsc->tok.name.ptr);
link->link = fsc->sio_names;
fsc->sio_names = link;
arg = (stix_ioarg_t*) stix_callocmem (fsc, STIX_SIZEOF(*arg));
if (arg == STIX_NULL) goto oops;
arg->name = (const stix_char_t*)(link + 1);
arg->name = (const stix_uch_t*)(link + 1);
arg->line = 1;
arg->colm = 1;
arg->prev = fsc->sio.inp;
@ -777,7 +780,7 @@ static STIX_INLINE int is_tok_pseudovar (stix_t* fsc)
stix_strequal(fsc->tok.name.ptr, STIX_T("false")));
}
static STIX_INLINE int is_tok_binsel (stix_t* fsc, const stix_char_t* sel)
static STIX_INLINE int is_tok_binsel (stix_t* fsc, const stix_uch_t* sel)
{
return fsc->tok.type == STIX_FSC_TOK_BINSEL &&
stix_strequal (fsc->tok.name.ptr, sel);
@ -848,7 +851,7 @@ static STIX_INLINE int is_tok_binsel (stix_t* fsc, const stix_char_t* sel)
#endif
static STIX_INLINE int emit_code_test (
stix_t* fsc, const stix_char_t* high, const stix_char_t* low)
stix_t* fsc, const stix_uch_t* high, const stix_uch_t* low)
{
wprintf (L"CODE: %s %s\n", high, low);
return 0;
@ -1037,7 +1040,7 @@ static int __add_literal (stix_t* fsc, stix_word_t literal)
return fsc->literal_count - 1;
}
static int __add_character_literal (stix_t* fsc, stix_char_t ch)
static int __add_character_literal (stix_t* fsc, stix_uch_t ch)
{
stix_word_t i, c, literal;
stix_vm_t* stx = fsc->stx;
@ -1056,7 +1059,7 @@ static int __add_character_literal (stix_t* fsc, stix_char_t ch)
}
static int __add_string_literal (
stix_t* fsc, const stix_char_t* str, stix_word_t size)
stix_t* fsc, const stix_uch_t* str, stix_word_t size)
{
stix_word_t i, c, literal;
stix_vm_t* stx = fsc->stx;
@ -1077,7 +1080,7 @@ static int __add_string_literal (
}
static int __add_symbol_literal (
stix_t* fsc, const stix_char_t* str, stix_word_t size)
stix_t* fsc, const stix_uch_t* str, stix_word_t size)
{
stix_vm_t* stx = fsc->stx;
return __add_literal (fsc, stix_new_symbolx(stx, str, size));
@ -1210,7 +1213,7 @@ static int parse_expression (stix_t* fsc)
stix_vm_t* stx = fsc->stx;
if (fsc->tok.type == STIX_FSC_TOK_IDENT) {
stix_char_t* ident = stix_tok_yield (&fsc->tok, 0);
stix_uch_t* ident = stix_tok_yield (&fsc->tok, 0);
if (ident == STIX_NULL) {
fsc->errnum = STIX_FSC_ERROR_MEMORY;
return -1;
@ -1242,7 +1245,7 @@ static int parse_expression (stix_t* fsc)
}
static int parse_basic_expression (
stix_t* fsc, const stix_char_t* ident)
stix_t* fsc, const stix_uch_t* ident)
{
/*
* <basic expression> ::= <primary> [<messages> <cascaded messages>]
@ -1259,7 +1262,7 @@ static int parse_basic_expression (
}
static int parse_assignment (
stix_t* fsc, const stix_char_t* target)
stix_t* fsc, const stix_uch_t* target)
{
/*
* <assignment> ::= <assignment target> assignmentOperator <expression>
@ -1304,7 +1307,7 @@ static int parse_assignment (
}
static int parse_primary (
stix_t* fsc, const stix_char_t* ident, int* is_super)
stix_t* fsc, const stix_uch_t* ident, int* is_super)
{
/*
* <primary> ::=
@ -1389,7 +1392,7 @@ static int parse_primary (
}
static int parse_primary_ident (
stix_t* fsc, const stix_char_t* ident, int* is_super)
stix_t* fsc, const stix_uch_t* ident, int* is_super)
{
stix_word_t i;
stix_vm_t* stx = fsc->stx;
@ -1616,7 +1619,7 @@ static int parse_binary_message (stix_t* fsc, int is_super)
while (fsc->tok.type == STIX_FSC_TOK_BINSEL)
{
stix_char_t* op = stix_tok_yield (&fsc->tok, 0);
stix_uch_t* op = stix_tok_yield (&fsc->tok, 0);
if (op == STIX_NULL) {
fsc->errnum = STIX_FSC_ERROR_MEMORY;
return -1;
@ -1707,11 +1710,11 @@ static int parse_method (stix_t* fsc, stix_word_t method_class, void* input)
#endif
static int get_class_type (const stix_char_t* str, class_type_t* type)
static int get_class_type (const stix_uch_t* str, class_type_t* type)
{
static struct
{
stix_char_t* word;
stix_uch_t* word;
class_type_t type;
} tab[] =
{
@ -1735,11 +1738,11 @@ static int get_class_type (const stix_char_t* str, class_type_t* type)
return -1;
}
static int get_vardef_type (const stix_char_t* str, vardef_type_t* type)
static int get_vardef_type (const stix_uch_t* str, vardef_type_t* type)
{
static struct
{
stix_char_t* word;
stix_uch_t* word;
class_type_t type;
} tab[] =
{
@ -2254,13 +2257,20 @@ int stix_compile (stix_t* stix, stix_ioimpl_t io)
stix->c->arg.line = 1;
stix->c->arg.colm = 1;
stix->c->curinp = &stix->c->arg;
clear_sio_names (stix);
// clear_sio_names (stix);
/* open the top-level stream */
n = stix->c->impl (stix, STIX_IO_OPEN, stix->c->curinp);
if (n <= -1) return -1;
if (compile_stream (stix) <= -1) goto oops;
// if (compile_stream (stix) <= -1) goto oops;
while (get_char(stix) > 0)
{
stix_bch_t buf[16];
stix_size_t len;
len = stix_uctoutf8 (stix->c->curinp->lxc.c, buf, STIX_COUNTOF(buf));
printf ("%.*s", (int)len, buf);
}
/* close the stream */
STIX_ASSERT (stix->c->curinp == &stix->c->arg);

View File

@ -183,7 +183,7 @@ static int ignite_3 (stix_t* stix)
static struct symbol_name_t
{
stix_oow_t len;
stix_char_t str[16];
stix_uch_t str[16];
} symnames[] = {
{ 4, { 'S','t','i','x' } },
{ 6, { 'O','b','j','e','c','t' } },

View File

@ -28,7 +28,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef struct xtn_t xtn_t;
struct xtn_t
@ -64,31 +64,41 @@ static stix_mmgr_t sys_mmgr =
};
static STIX_INLINE stix_oow_t open_input (stix_t* stix, stix_ioarg_t* arg)
static STIX_INLINE stix_ssize_t open_input (stix_t* stix, stix_ioarg_t* arg)
{
if (arg->includer)
{
/* includee */
xtn_t* xtn = stix_getxtn(stix);
stix_bch_t bcs[1024]; /* TODO: right buffer size */
stix_size_t bcslen = STIX_COUNTOF(bcs);
stix_size_t ucslen = ~(stix_size_t)0;
arg->handle = fopen (xtn->source_path, "r");
if (!arg->handle)
if (stix_ucstoutf8 (arg->name, &ucslen, bcs, &bcslen) <= -1)
{
stix_seterrnum (stix, STIX_EIOERR);
stix_seterrnum (stix, STIX_EECERR);
return -1;
}
}
else
{
/* main stream */
/*char tmp[PATH_MAX];*/
xtn_t* xtn = stix_getxtn(stix);
arg->handle = fopen (xtn->source_path, "r");
}
if (!arg->handle)
{
stix_seterrnum (stix, STIX_EIOERR);
return -1;
}
return 0;
}
static STIX_INLINE stix_oow_t read_input (stix_t* stix, stix_ioarg_t* arg)
static STIX_INLINE stix_ssize_t read_input (stix_t* stix, stix_ioarg_t* arg)
{
xtn_t* xtn = stix_getxtn(stix);
stix_size_t n, bcslen, cslen;
stix_size_t n, bcslen, ucslen, remlen;
int x;
STIX_ASSERT (arg->handle != STIX_NULL);
@ -100,32 +110,32 @@ static STIX_INLINE stix_oow_t read_input (stix_t* stix, stix_ioarg_t* arg)
stix_seterrnum (stix, STIX_EIOERR);
return -1;
}
}
xtn->bchar_len += n;
bcslen = xtn->bchar_len;
cslen = STIX_COUNTOF(arg->buf);
x = stix_utf8toucs (xtn->bchar_buf, &bcslen, arg->buf, &cslen);
if (x == -2)
{
/* buffer to small */
}
if (x <= -1)
ucslen = STIX_COUNTOF(arg->buf);
x = stix_utf8toucs (xtn->bchar_buf, &bcslen, arg->buf, &ucslen);
if (x <= -1 && ucslen <= 0)
{
stix_seterrnum (stix, STIX_EECERR);
return -1;
}
remlen = xtn->bchar_len - bcslen;
if (remlen > 0) memmove (xtn->bchar_buf, &xtn->bchar_buf[bcslen], remlen);
xtn->bchar_len = remlen;
return ucslen;
}
static STIX_INLINE stix_oow_t close_input (stix_t* stix, stix_ioarg_t* arg)
static STIX_INLINE stix_ssize_t close_input (stix_t* stix, stix_ioarg_t* arg)
{
STIX_ASSERT (arg->handle != STIX_NULL);
fclose (arg->handle);
return 0;
}
/* TODO: IMPLEMENT PROPER INPUT HANDLER */
static stix_oow_t input_handler (stix_t* stix, stix_iocmd_t cmd, stix_ioarg_t* arg)
static stix_ssize_t input_handler (stix_t* stix, stix_iocmd_t cmd, stix_ioarg_t* arg)
{
switch (cmd)
{
@ -217,8 +227,8 @@ int main (int argc, char* argv[])
}
{
stix_char_t x[] = { 'S', 't', 'r', 'i', 'n', 'g', '\0' };
stix_char_t y[] = { 'S', 'y', 'm', 'b', 'o', 'l', '\0' };
stix_uch_t x[] = { 'S', 't', 'r', 'i', 'n', 'g', '\0' };
stix_uch_t y[] = { 'S', 'y', 'm', 'b', 'o', 'l', '\0' };
stix_oop_t a, b;
a = stix_makesymbol (stix, x, 6);

View File

@ -617,4 +617,5 @@ Single line comment
#! comment text (easy handling to skip hash bang)
Multi-line comments - double quoted as in smalltalk
" comment text "
" comment text 설명이라지요. "

View File

@ -114,9 +114,9 @@ static stix_oop_t alloc_numeric_array (stix_t* stix, const void* ptr, stix_oow_t
return hdr;
}
stix_oop_t stix_alloccharobj (stix_t* stix, const stix_char_t* ptr, stix_oow_t len)
stix_oop_t stix_alloccharobj (stix_t* stix, const stix_uch_t* ptr, stix_oow_t len)
{
return alloc_numeric_array (stix, ptr, len, STIX_OBJ_TYPE_CHAR, STIX_SIZEOF(stix_char_t), 1);
return alloc_numeric_array (stix, ptr, len, STIX_OBJ_TYPE_CHAR, STIX_SIZEOF(stix_uch_t), 1);
}
stix_oop_t stix_allocuint8obj (stix_t* stix, const stix_uint8_t* ptr, stix_oow_t len)

View File

@ -140,10 +140,10 @@ typedef enum stix_iocmd_t stix_iocmd_t;
typedef struct stix_iolxc_t stix_iolxc_t;
struct stix_iolxc_t
{
stix_char_t c; /**< character */
stix_uch_t c; /**< character */
unsigned long line; /**< line */
unsigned long colm; /**< column */
const stix_char_t* file; /**< file specified in #include */
const stix_uch_t* file; /**< file specified in #include */
};
enum stix_ioarg_flag_t
@ -160,7 +160,7 @@ struct stix_ioarg_t
* It is #STIX_NULL for the main stream and points to a non-NULL string
* for an included stream.
*/
const stix_char_t* name;
const stix_uch_t* name;
/**
* [OUT] I/O handle set by a handler.
@ -173,7 +173,7 @@ struct stix_ioarg_t
/**
* [OUT] place data here
*/
stix_char_t buf[1024];
stix_uch_t buf[1024];
/**
* [IN] points to the data of the includer. It is #STIX_NULL for the
@ -195,7 +195,7 @@ struct stix_ioarg_t
/*-----------------------------------------------------------------*/
};
typedef stix_oow_t (*stix_ioimpl_t) (
typedef stix_ssize_t (*stix_ioimpl_t) (
stix_t* stix,
stix_iocmd_t cmd,
stix_ioarg_t* arg
@ -260,13 +260,13 @@ stix_oow_t stix_hashbytes (
);
stix_oow_t stix_hashchars (
const stix_char_t* ptr,
const stix_uch_t* ptr,
stix_oow_t len
);
int stix_equalchars (
const stix_char_t* str1,
const stix_char_t* str2,
const stix_uch_t* str1,
const stix_uch_t* str2,
stix_oow_t len
);
@ -289,7 +289,7 @@ stix_oop_t stix_allocoopobj (
stix_oop_t stix_alloccharobj (
stix_t* stix,
const stix_char_t* ptr,
const stix_uch_t* ptr,
stix_oow_t len
);
@ -310,13 +310,13 @@ stix_oop_t stix_allocuint16obj (
/* ========================================================================= */
stix_oop_t stix_makesymbol (
stix_t* stix,
const stix_char_t* ptr,
const stix_uch_t* ptr,
stix_oow_t len
);
stix_oop_t stix_findsymbol (
stix_t* stix,
const stix_char_t* ptr,
const stix_uch_t* ptr,
stix_oow_t len
);
@ -338,22 +338,21 @@ stix_oop_t stix_getatsysdic (
/* utf8.c */
/* ========================================================================= */
stix_size_t stix_uctoutf8 (
stix_char_t uc,
stix_bchar_t* utf8,
stix_uch_t uc,
stix_bch_t* utf8,
stix_size_t size
);
stix_size_t stix_utf8touc (
const stix_bchar_t* utf8,
const stix_bch_t* utf8,
stix_size_t size,
stix_char_t* uc
stix_uch_t* uc
);
int stix_ucstoutf8 (
const stix_char_t* ucs,
const stix_uch_t* ucs,
stix_size_t* ucslen,
stix_bchar_t* bcs,
stix_bch_t* bcs,
stix_size_t* bcslen
);
@ -363,27 +362,46 @@ int stix_ucstoutf8 (
* It never returns -2 if \a ucs is #STIX_NULL.
*
* \code
* const stix_bchar_t* bcs = "a multibyte string";
* stix_char_t ucs[100];
* const stix_bch_t* bcs = "test string";
* stix_uch_t ucs[100];
* stix_size_t ucslen = STIX_COUNTOF(ucs);
* qse_size_t bcslen = strlen(bcs);
* stix_size_t bcslen = 11;
* int n;
* n = stix_utf8toucs (bcs, &bcslen, ucs, &ucslen);
* if (n <= -1) { invalid/incomplete sequence or buffer too small }
* \endcode
*
* For a null-terminated string, you can specify ~(stix_size_t)0 in
* \a bcslen. The destination buffer \a ucs also must be large enough to
* store a terminating null. Otherwise, -2 is returned.
*
* The resulting \a ucslen can still be greater than 0 even if the return
* value is negative. The value indicates the number of characters converted
* before the error has occurred.
*
* \return 0 on success.
* -1 if \a bcs contains an illegal character.
* -2 if the wide-character string buffer is too small.
* -3 if \a bcs is not a complete sequence.
*/
int stix_utf8toucs (
const stix_bchar_t* bcs,
const stix_bch_t* bcs,
stix_size_t* bcslen,
stix_char_t* ucs,
stix_uch_t* ucs,
stix_size_t* ucslen
);
/**
* The stix_ucslen() function returns the number of characters before
* a terminating null.
*/
/*
stix_size_t stix_ucslen (
const stix_uch_t* ucs
);
*/
/* ========================================================================= */
/* comp.c */
/* ========================================================================= */

View File

@ -155,12 +155,12 @@ stix_oow_t stix_hashbytes (const stix_uint8_t* ptr, stix_oow_t len)
return h;
}
stix_oow_t stix_hashchars (const stix_char_t* ptr, stix_oow_t len)
stix_oow_t stix_hashchars (const stix_uch_t* ptr, stix_oow_t len)
{
return stix_hashbytes ((const stix_uint8_t *)ptr, len * STIX_SIZEOF(*ptr));
}
int stix_equalchars (const stix_char_t* str1, const stix_char_t* str2, stix_oow_t len)
int stix_equalchars (const stix_uch_t* str1, const stix_uch_t* str2, stix_oow_t len)
{
stix_oow_t i;

View File

@ -44,9 +44,26 @@ typedef unsigned short int stix_uint16_t;
#endif
typedef unsigned long int stix_uintptr_t;
typedef unsigned long int stix_size_t;
typedef long int stix_ssize_t;
typedef unsigned short int stix_char_t; /* TODO ... wchar_t??? */
typedef char stix_bchar_t;
typedef unsigned short int stix_uch_t; /* TODO ... wchar_t??? */
typedef char stix_bch_t;
struct stix_ucs_t
{
stix_uch_t* ptr;
stix_size_t len;
};
struct stix_bcs_t
{
stix_bch_t* ptr;
stix_size_t len;
};
typedef struct stix_ucs_t stix_ucs_t;
typedef struct stix_bcs_t stix_bcs_t;
/* =========================================================================
* PRIMITIVE MACROS
@ -185,15 +202,15 @@ struct stix_mmgr_t
typedef struct stix_cmgr_t stix_cmgr_t;
typedef stix_size_t (*stix_cmgr_bctoc_t) (
const stix_bchar_t* mb,
typedef stix_size_t (*stix_cmgr_bctouc_t) (
const stix_bch_t* mb,
stix_size_t size,
stix_char_t* wc
stix_uch_t* wc
);
typedef stix_size_t (*stix_cmgr_ctobc_t) (
stix_char_t wc,
stix_bchar_t* mb,
typedef stix_size_t (*stix_cmgr_uctobc_t) (
stix_uch_t wc,
stix_bch_t* mb,
stix_size_t size
);
@ -206,8 +223,8 @@ typedef stix_size_t (*stix_cmgr_ctobc_t) (
*/
struct stix_cmgr_t
{
stix_cmgr_bctoc_t bctoc;
stix_cmgr_ctobc_t ctobc;
stix_cmgr_bctouc_t bctouc;
stix_cmgr_uctobc_t uctobc;
};
/* =========================================================================
@ -258,7 +275,8 @@ enum stix_errnum_t
STIX_ENOMEM, /**< insufficient memory */
STIX_EINVAL, /**< invalid parameter or data */
STIX_ENOENT, /**< no matching entry */
STIX_EIOERR /**< I/O error */
STIX_EIOERR, /**< I/O error */
STIX_EECERR /**< encoding conversion error */
};
typedef enum stix_errnum_t stix_errnum_t;
@ -605,7 +623,7 @@ struct stix_obj_oop_t
struct stix_obj_char_t
{
STIX_OBJ_HEADER;
stix_char_t slot[1];
stix_uch_t slot[1];
};
struct stix_obj_uint8_t
@ -832,7 +850,7 @@ STIX_EXPORT void stix_gc (
*/
STIX_EXPORT int stix_findclass (
stix_t* vm,
const stix_char_t* name,
const stix_uch_t* name,
stix_oop_t* oop
);

View File

@ -57,7 +57,7 @@ static stix_oop_oop_t expand_bucket (stix_t* stix, stix_oop_oop_t old_bucket)
return new_bucket;
}
static stix_oop_t find_or_make_symbol (stix_t* stix, const stix_char_t* ptr, stix_oow_t len, int create)
static stix_oop_t find_or_make_symbol (stix_t* stix, const stix_uch_t* ptr, stix_oow_t len, int create)
{
stix_oow_t index, tally;
stix_oop_char_t symbol;
@ -130,12 +130,12 @@ static stix_oop_t find_or_make_symbol (stix_t* stix, const stix_char_t* ptr, sti
return (stix_oop_t)symbol;
}
stix_oop_t stix_makesymbol (stix_t* stix, const stix_char_t* ptr, stix_oow_t len)
stix_oop_t stix_makesymbol (stix_t* stix, const stix_uch_t* ptr, stix_oow_t len)
{
return find_or_make_symbol (stix, ptr, len, 1);
}
stix_oop_t stix_findsymbol (stix_t* stix, const stix_char_t* ptr, stix_oow_t len)
stix_oop_t stix_findsymbol (stix_t* stix, const stix_uch_t* ptr, stix_oow_t len)
{
return find_or_make_symbol (stix, ptr, len, 0);
}

View File

@ -26,7 +26,7 @@
#include "stix-prv.h"
#define STIX_BCLEN_MAX 16
#define STIX_BCLEN_MAX 6
/*
* from RFC 2279 UTF-8, a transformation format of ISO 10646
@ -62,12 +62,12 @@ static __utf8_t utf8_table[] =
{0x04000000ul, 0x7FFFFFFFul, 0xFC, 0xFE, 0x01, 6}
};
static STIX_INLINE __utf8_t* get_utf8_slot (stix_char_t uc)
static STIX_INLINE __utf8_t* get_utf8_slot (stix_uch_t uc)
{
__utf8_t* cur, * end;
STIX_ASSERT (STIX_SIZEOF(stix_bchar_t) == 1);
STIX_ASSERT (STIX_SIZEOF(stix_char_t) >= 2);
STIX_ASSERT (STIX_SIZEOF(stix_bch_t) == 1);
STIX_ASSERT (STIX_SIZEOF(stix_uch_t) >= 2);
end = utf8_table + STIX_COUNTOF(utf8_table);
cur = utf8_table;
@ -81,7 +81,7 @@ static STIX_INLINE __utf8_t* get_utf8_slot (stix_char_t uc)
return STIX_NULL; /* invalid character */
}
stix_size_t stix_uctoutf8 (stix_char_t uc, stix_bchar_t* utf8, stix_size_t size)
stix_size_t stix_uctoutf8 (stix_uch_t uc, stix_bch_t* utf8, stix_size_t size)
{
__utf8_t* cur = get_utf8_slot (uc);
@ -108,14 +108,14 @@ stix_size_t stix_uctoutf8 (stix_char_t uc, stix_bchar_t* utf8, stix_size_t size)
return (stix_size_t)cur->length;
}
stix_size_t stix_utf8touc (const stix_bchar_t* utf8, stix_size_t size, stix_char_t* uc)
stix_size_t stix_utf8touc (const stix_bch_t* utf8, stix_size_t size, stix_uch_t* uc)
{
__utf8_t* cur, * end;
STIX_ASSERT (utf8 != STIX_NULL);
STIX_ASSERT (size > 0);
STIX_ASSERT (STIX_SIZEOF(stix_bchar_t) == 1);
STIX_ASSERT (STIX_SIZEOF(stix_char_t) >= 2);
STIX_ASSERT (STIX_SIZEOF(stix_bch_t) == 1);
STIX_ASSERT (STIX_SIZEOF(stix_uch_t) >= 2);
end = utf8_table + STIX_COUNTOF(utf8_table);
cur = utf8_table;
@ -135,7 +135,7 @@ stix_size_t stix_utf8touc (const stix_bchar_t* utf8, stix_size_t size, stix_char
if (uc)
{
stix_char_t w;
stix_uch_t w;
w = utf8[0] & cur->fmask;
for (i = 1; i < cur->length; i++)
@ -167,9 +167,9 @@ stix_size_t stix_utf8touc (const stix_bchar_t* utf8, stix_size_t size, stix_char
}
/* this return value can indicate both
* the correct length (len >= cur->length)
* the correct length (size >= cur->length)
* and
* the incomplete seqeunce error (len < cur->length).
* the incomplete seqeunce error (size < cur->length).
*/
return (stix_size_t)cur->length;
}
@ -179,28 +179,26 @@ stix_size_t stix_utf8touc (const stix_bchar_t* utf8, stix_size_t size, stix_char
return 0; /* error - invalid sequence */
}
stix_size_t stix_utf8len (const stix_bchar_t* utf8, stix_size_t size)
{
return stix_utf8touc (utf8, size, STIX_NULL);
}
/* ----------------------------------------------------------------------- */
static int bcsn_to_csn_with_cmgr (
const stix_bchar_t* bcs, stix_size_t* bcslen,
stix_char_t* cs, stix_size_t* cslen, stix_cmgr_t* cmgr, int all)
static STIX_INLINE int bcsn_to_ucsn_with_cmgr (
const stix_bch_t* bcs, stix_size_t* bcslen,
stix_uch_t* ucs, stix_size_t* ucslen, stix_cmgr_t* cmgr, int all)
{
const stix_bchar_t* p;
const stix_bch_t* p;
int ret = 0;
stix_size_t mlen;
if (cs)
if (ucs)
{
stix_char_t* q, * qend;
/* destination buffer is specified.
* copy the conversion result to the buffer */
stix_uch_t* q, * qend;
p = bcs;
q = cs;
qend = cs + *cslen;
q = ucs;
qend = ucs + *ucslen;
mlen = *bcslen;
while (mlen > 0)
@ -214,7 +212,7 @@ static int bcsn_to_csn_with_cmgr (
break;
}
n = cmgr->bctoc (p, mlen, q);
n = cmgr->bctouc (p, mlen, q);
if (n == 0)
{
/* invalid sequence */
@ -249,12 +247,18 @@ static int bcsn_to_csn_with_cmgr (
mlen -= n;
}
*cslen = q - cs;
*ucslen = q - ucs;
*bcslen = p - bcs;
}
else
{
stix_char_t w;
/* no destination buffer is specified. perform conversion
* but don't copy the result. the caller can call this function
* without a buffer to find the required buffer size, allocate
* a buffer with the size and call this function again with
* the buffer. */
stix_uch_t w;
stix_size_t wlen = 0;
p = bcs;
@ -264,7 +268,7 @@ static int bcsn_to_csn_with_cmgr (
{
stix_size_t n;
n = cmgr->bctoc (p, mlen, &w);
n = cmgr->bctouc (p, mlen, &w);
if (n == 0)
{
/* invalid sequence */
@ -291,19 +295,42 @@ static int bcsn_to_csn_with_cmgr (
wlen += 1;
}
*cslen = wlen;
*ucslen = wlen;
*bcslen = p - bcs;
}
return ret;
}
static int csn_to_bcsn_with_cmgr (
const stix_char_t* cs, stix_size_t* cslen,
stix_bchar_t* bcs, stix_size_t* bcslen, stix_cmgr_t* cmgr)
static STIX_INLINE int bcs_to_ucs_with_cmgr (
const stix_bch_t* bcs, stix_size_t* bcslen,
stix_uch_t* ucs, stix_size_t* ucslen, stix_cmgr_t* cmgr, int all)
{
const stix_char_t* p = cs;
const stix_char_t* end = cs + *cslen;
const stix_bch_t* bp;
stix_size_t mlen, wlen;
int n;
for (bp = bcs; *bp != '\0'; bp++);
mlen = bp - bcs; wlen = *ucslen;
n = bcsn_to_ucsn_with_cmgr (bcs, &mlen, ucs, &wlen, cmgr, all);
if (ucs)
{
/* null-terminate the target buffer if it has room for it. */
if (wlen < *ucslen) ucs[wlen] = '\0';
else n = -2; /* buffer too small */
}
*bcslen = mlen; *ucslen = wlen;
return n;
}
static STIX_INLINE int ucsn_to_bcsn_with_cmgr (
const stix_uch_t* ucs, stix_size_t* ucslen,
stix_bch_t* bcs, stix_size_t* bcslen, stix_cmgr_t* cmgr)
{
const stix_uch_t* p = ucs;
const stix_uch_t* end = ucs + *ucslen;
int ret = 0;
if (bcs)
@ -320,7 +347,7 @@ static int csn_to_bcsn_with_cmgr (
break;
}
n = cmgr->ctobc (*p, bcs, rem);
n = cmgr->uctobc (*p, bcs, rem);
if (n == 0)
{
ret = -1;
@ -338,14 +365,96 @@ static int csn_to_bcsn_with_cmgr (
}
else
{
stix_bchar_t bcsbuf[STIX_BCLEN_MAX];
stix_bch_t bcsbuf[STIX_BCLEN_MAX];
stix_size_t mlen = 0;
while (p < end)
{
stix_size_t n;
n = cmgr->ctobc (*p, bcsbuf, STIX_COUNTOF(bcsbuf));
n = cmgr->uctobc (*p, bcsbuf, STIX_COUNTOF(bcsbuf));
if (n == 0)
{
ret = -1;
break; /* illegal character */
}
/* it assumes that bcsbuf is large enough to hold a character */
STIX_ASSERT (n <= STIX_COUNTOF(bcsbuf));
p++; mlen += n;
}
/* this length excludes the terminating null character.
* this function doesn't even null-terminate the result. */
*bcslen = mlen;
}
*ucslen = p - ucs;
return ret;
}
static int ucs_to_bcs_with_cmgr (
const stix_uch_t* ucs, stix_size_t* ucslen,
stix_bch_t* bcs, stix_size_t* bcslen, stix_cmgr_t* cmgr)
{
const stix_uch_t* p = ucs;
int ret = 0;
if (bcs)
{
stix_size_t rem = *bcslen;
while (*p != '\0')
{
stix_size_t n;
if (rem <= 0)
{
ret = -2;
break;
}
n = cmgr->uctobc (*p, bcs, rem);
if (n == 0)
{
ret = -1;
break; /* illegal character */
}
if (n > rem)
{
ret = -2;
break; /* buffer too small */
}
bcs += n; rem -= n; p++;
}
/* update bcslen to the length of the bcs string converted excluding
* terminating null */
*bcslen -= rem;
/* null-terminate the multibyte sequence if it has sufficient space */
if (rem > 0) *bcs = '\0';
else
{
/* if ret is -2 and cs[cslen] == '\0',
* this means that the bcs buffer was lacking one
* slot for the terminating null */
ret = -2; /* buffer too small */
}
}
else
{
stix_bch_t bcsbuf[STIX_BCLEN_MAX];
stix_size_t mlen = 0;
while (*p != '\0')
{
stix_size_t n;
n = cmgr->uctobc (*p, bcsbuf, STIX_COUNTOF(bcsbuf));
if (n == 0)
{
ret = -1;
@ -353,22 +462,20 @@ static int csn_to_bcsn_with_cmgr (
}
/* it assumes that bcs is large enough to hold a character */
STIX_ASSERT (n <= STIX_COUNTOF(bcsbuf));
STIX_ASSERT (n <= STIX_COUNTOF(bcs));
p++; mlen += n;
}
/* this length excludes the terminating null character.
* this function doesn't event null-terminate the result. */
/* this length holds the number of resulting multi-byte characters
* excluding the terminating null character */
*bcslen = mlen;
}
*cslen = p - cs;
*ucslen = p - ucs; /* the number of wide characters handled. */
return ret;
}
static stix_cmgr_t utf8_cmgr =
{
stix_utf8touc,
@ -376,15 +483,42 @@ static stix_cmgr_t utf8_cmgr =
};
int stix_utf8toucs (
const stix_bchar_t* bcs, stix_size_t* bcslen,
stix_char_t* ucs, stix_size_t* ucslen)
const stix_bch_t* bcs, stix_size_t* bcslen,
stix_uch_t* ucs, stix_size_t* ucslen)
{
return bcsn_to_csn_with_cmgr (bcs, bcslen, ucs, ucslen, &utf8_cmgr, 0);
if (*bcslen == ~(stix_size_t)0)
{
/* the source is null-terminated. */
return bcs_to_ucs_with_cmgr (bcs, bcslen, ucs, ucslen, &utf8_cmgr, 0);
}
else
{
/* the source is length bound */
return bcsn_to_ucsn_with_cmgr (bcs, bcslen, ucs, ucslen, &utf8_cmgr, 0);
}
}
int stix_ucstoutf8 (
const stix_char_t* ucs, stix_size_t *ucslen,
stix_bchar_t* bcs, stix_size_t* bcslen)
const stix_uch_t* ucs, stix_size_t *ucslen,
stix_bch_t* bcs, stix_size_t* bcslen)
{
return csn_to_bcsn_with_cmgr (ucs, ucslen, bcs, bcslen, &utf8_cmgr);
if (*ucslen == ~(stix_size_t)0)
{
/* null-terminated */
return ucs_to_bcs_with_cmgr (ucs, ucslen, bcs, bcslen, &utf8_cmgr);
}
else
{
/* length bound */
return ucsn_to_bcsn_with_cmgr (ucs, ucslen, bcs, bcslen, &utf8_cmgr);
}
}
/*
stix_size_t stix_ucslen (const stix_uch_t* ucs)
{
const stix_uch_t* ptr = ucs;
while (*ptr) ptr = STIX_INCPTR(const stix_uch_t, ptr, 1);
return STIX_SUBPTR(const stix_uch_t, ptr, ucs);
}
*/