added utf8 conversion functions

This commit is contained in:
hyunghwan.chung 2015-05-16 07:31:16 +00:00
parent 75bb3e9a40
commit 090c9ac1bf
4 changed files with 369 additions and 38 deletions

View File

@ -33,9 +33,12 @@
typedef struct xtn_t xtn_t; typedef struct xtn_t xtn_t;
struct xtn_t struct xtn_t
{ {
char source_path[1024]; const char* source_path;
};
char bchar_buf[1024];
stix_size_t bchar_pos;
stix_size_t bchar_len;
};
static void* sys_alloc (stix_mmgr_t* mmgr, stix_size_t size) static void* sys_alloc (stix_mmgr_t* mmgr, stix_size_t size)
{ {
@ -82,18 +85,36 @@ static STIX_INLINE stix_oow_t open_input (stix_t* stix, stix_ioarg_t* arg)
} }
} }
static STIX_INLINE stix_oow_t read_input (stix_t* stix, stix_ioarg_t* arg) static STIX_INLINE stix_oow_t read_input (stix_t* stix, stix_ioarg_t* arg)
{ {
xtn_t* xtn = stix_getxtn(stix);
stix_size_t n, bcslen, cslen;
int x;
STIX_ASSERT (arg->handle != STIX_NULL); STIX_ASSERT (arg->handle != STIX_NULL);
if (fread (arg->buf, STIX_SIZEOF(arg->buf[0]), STIX_COUNTOF(arg->buf), arg->handle) == 0) n = fread (&xtn->bchar_buf[xtn->bchar_len], STIX_SIZEOF(xtn->bchar_buf[0]), STIX_COUNTOF(xtn->bchar_buf) - xtn->bchar_len, arg->handle);
if (n == 0)
{ {
if (ferror(arg->handle)) if (ferror(arg->handle))
{ {
stix_seterrnum (stix, STIX_EIOERR);
return -1;
} }
} }
return 0; xtn->bchar_len += n;
bcslen = xtn->bchar_len;
cslen = STIX_COUNTOF(arg->buf);
x = stix_utf8toucs (xtn->bchar_buf, &bcslen, arg->buf, &cslen);
if (x == -2)
{
/* buffer to small */
}
if (x <= -1)
{
}
} }
static STIX_INLINE stix_oow_t close_input (stix_t* stix, stix_ioarg_t* arg) static STIX_INLINE stix_oow_t close_input (stix_t* stix, stix_ioarg_t* arg)
@ -151,10 +172,19 @@ static void dump_symbol_table (stix_t* stix)
int main (int argc, char* argv[]) int main (int argc, char* argv[])
{ {
stix_t* stix; stix_t* stix;
xtn_t* xtn;
printf ("Stix 1.0.0 - max named %lu max indexed %lu\n", printf ("Stix 1.0.0 - max named %lu max indexed %lu\n",
(unsigned long int)STIX_MAX_NAMED_INSTVARS, (unsigned long int)STIX_MAX_INDEXED_INSTVARS(STIX_MAX_NAMED_INSTVARS)); (unsigned long int)STIX_MAX_NAMED_INSTVARS, (unsigned long int)STIX_MAX_INDEXED_INSTVARS(STIX_MAX_NAMED_INSTVARS));
if (argc != 2)
{
fprintf (stderr, "Usage: %s filename\n", argv[0]);
return -1;
}
{ {
stix_oow_t x; stix_oow_t x;
@ -206,7 +236,8 @@ printf ("%p\n", a);
dump_symbol_table (stix); dump_symbol_table (stix);
} }
xtn = stix_getxtn (stix);
xtn->source_path = argv[1];
if (stix_compile (stix, input_handler) <= -1) if (stix_compile (stix, input_handler) <= -1)
{ {
printf ("cannot compile code\n"); printf ("cannot compile code\n");

View File

@ -334,7 +334,55 @@ stix_oop_t stix_getatsysdic (
stix_oop_t key stix_oop_t key
); );
/* ========================================================================= */
/* utf8.c */
/* ========================================================================= */
/**
 * The stix_uctoutf8() function converts a single character \a uc to its
 * UTF-8 byte sequence, using \a utf8 as the output buffer of \a size bytes.
 *
 * NOTE(review): only a fragment of the implementation is visible next to
 * this declaration. The return convention below is the one relied upon by
 * the string-level converter in utf8.c - confirm against the full body.
 *
 * \return 0 if \a uc is not a convertible character;
 *         otherwise the length of the UTF-8 sequence in bytes.
 *         a value greater than \a size is treated by the caller as
 *         "buffer too small".
 */
stix_size_t stix_uctoutf8 (
stix_char_t uc,
stix_bchar_t* utf8,
stix_size_t size
);
/**
 * The stix_utf8touc() function decodes one character from the UTF-8 byte
 * sequence of \a size bytes pointed to by \a utf8. The implementation
 * asserts that \a utf8 is not #STIX_NULL and that \a size is greater than 0.
 *
 * stix_utf8len() calls this function with #STIX_NULL for \a uc, so \a uc is
 * presumably optional - TODO confirm against the full function body.
 *
 * \return 0 if the sequence is invalid;
 *         otherwise the length of the sequence in bytes.
 *         a value greater than \a size is treated by the string-level
 *         converter in utf8.c as an incomplete sequence.
 */
stix_size_t stix_utf8touc (
const stix_bchar_t* utf8,
stix_size_t size,
stix_char_t* uc
);
/**
 * The stix_ucstoutf8() function converts a unicode string to a UTF8 string.
 *
 * On input, \a ucslen holds the number of characters in \a ucs. On output,
 * it holds the number of characters actually converted.
 *
 * If \a bcs is not #STIX_NULL, \a bcslen holds the capacity of \a bcs in
 * bytes on input and the number of bytes written on output.
 * If \a bcs is #STIX_NULL, nothing is stored and \a bcslen is set to the
 * number of bytes that the converted characters require.
 *
 * The result is never null-terminated.
 *
 * \return 0 on success.
 *         -1 if \a ucs contains an illegal character.
 *         -2 if the UTF8 string buffer is too small.
 */
int stix_ucstoutf8 (
const stix_char_t* ucs,
stix_size_t* ucslen,
stix_bchar_t* bcs,
stix_size_t* bcslen
);
/**
 * The stix_utf8toucs() function converts a UTF8 string to a unicode string.
 *
 * On input, \a bcslen holds the number of bytes in \a bcs. On output, it
 * holds the number of bytes actually consumed.
 *
 * If \a ucs is not #STIX_NULL, \a ucslen holds the capacity of \a ucs in
 * characters on input and the number of characters stored on output.
 * If \a ucs is #STIX_NULL, nothing is stored and \a ucslen is set to the
 * number of characters that the consumed bytes decode to.
 *
 * It never returns -2 if \a ucs is #STIX_NULL.
 *
 * \code
 *   const stix_bchar_t* bcs = "a multibyte string";
 *   stix_char_t ucs[100];
 *   stix_size_t ucslen = STIX_COUNTOF(ucs);
 *   stix_size_t bcslen = strlen(bcs);
 *   int n;
 *   n = stix_utf8toucs (bcs, &bcslen, ucs, &ucslen);
 *   if (n <= -1) { invalid/incomplete sequence or buffer too small }
 * \endcode
 *
 * \return 0 on success.
 *         -1 if \a bcs contains an illegal character.
 *         -2 if the wide-character string buffer is too small.
 *         -3 if \a bcs is not a complete sequence.
 */
int stix_utf8toucs (
const stix_bchar_t* bcs,
stix_size_t* bcslen,
stix_char_t* ucs,
stix_size_t* ucslen
);
/* ========================================================================= */ /* ========================================================================= */
/* comp.c */ /* comp.c */

View File

@ -31,27 +31,9 @@
/* TODO: move this macro out to the build files.... */ /* TODO: move this macro out to the build files.... */
#define STIX_INCLUDE_COMPILER #define STIX_INCLUDE_COMPILER
#if defined(__MSDOS__) /* =========================================================================
# define STIX_INCPTR(type,base,inc) (((type __huge*)base) + (inc)) * PRIMITIVE TYPE DEFINTIONS
# define STIX_DECPTR(type,base,inc) (((type __huge*)base) - (inc)) * ========================================================================= */
# define STIX_GTPTR(type,ptr1,ptr2) (((type __huge*)ptr1) > ((type __huge*)ptr2))
# define STIX_GEPTR(type,ptr1,ptr2) (((type __huge*)ptr1) >= ((type __huge*)ptr2))
# define STIX_LTPTR(type,ptr1,ptr2) (((type __huge*)ptr1) < ((type __huge*)ptr2))
# define STIX_LEPTR(type,ptr1,ptr2) (((type __huge*)ptr1) <= ((type __huge*)ptr2))
# define STIX_EQPTR(type,ptr1,ptr2) (((type __huge*)ptr1) == ((type __huge*)ptr2))
# define STIX_SUBPTR(type,ptr1,ptr2) (((type __huge*)ptr1) - ((type __huge*)ptr2))
#else
# define STIX_INCPTR(type,base,inc) (((type*)base) + (inc))
# define STIX_DECPTR(type,base,inc) (((type*)base) - (inc))
# define STIX_GTPTR(type,ptr1,ptr2) (((type*)ptr1) > ((type*)ptr2))
# define STIX_GEPTR(type,ptr1,ptr2) (((type*)ptr1) >= ((type*)ptr2))
# define STIX_LTPTR(type,ptr1,ptr2) (((type*)ptr1) < ((type*)ptr2))
# define STIX_LEPTR(type,ptr1,ptr2) (((type*)ptr1) <= ((type*)ptr2))
# define STIX_EQPTR(type,ptr1,ptr2) (((type*)ptr1) == ((type*)ptr2))
# define STIX_SUBPTR(type,ptr1,ptr2) (((type*)ptr1) - ((type*)ptr2))
#endif
/* ========================================================================== */
/* TODO: define these types and macros using autoconf */ /* TODO: define these types and macros using autoconf */
typedef unsigned char stix_uint8_t; typedef unsigned char stix_uint8_t;
typedef unsigned short int stix_uint16_t; typedef unsigned short int stix_uint16_t;
@ -64,8 +46,11 @@ typedef unsigned long int stix_uintptr_t;
typedef unsigned long int stix_size_t; typedef unsigned long int stix_size_t;
typedef unsigned short int stix_char_t; /* TODO ... wchar_t??? */ typedef unsigned short int stix_char_t; /* TODO ... wchar_t??? */
typedef char stix_iochar_t; typedef char stix_bchar_t;
/* =========================================================================
* PRIMITIVE MACROS
* ========================================================================= */
#define STIX_SIZEOF(x) (sizeof(x)) #define STIX_SIZEOF(x) (sizeof(x))
#define STIX_COUNTOF(x) (sizeof(x) / sizeof(x[0])) #define STIX_COUNTOF(x) (sizeof(x) / sizeof(x[0]))
@ -92,7 +77,6 @@ typedef char stix_iochar_t;
# define STIX_NULL ((void*)0) # define STIX_NULL ((void*)0)
#endif #endif
/* make a low bit mask that can mask off low n bits*/ /* make a low bit mask that can mask off low n bits*/
#define STIX_LBMASK(type,n) (~(~((type)0) << (n))) #define STIX_LBMASK(type,n) (~(~((type)0) << (n)))
@ -114,16 +98,43 @@ typedef char stix_iochar_t;
/*#define STIX_BITS_MAX(type,nbits) ((((type)1) << (nbits)) - 1)*/ /*#define STIX_BITS_MAX(type,nbits) ((((type)1) << (nbits)) - 1)*/
#define STIX_BITS_MAX(type,nbits) ((~(type)0) >> (STIX_SIZEOF(type) * 8 - (nbits))) #define STIX_BITS_MAX(type,nbits) ((~(type)0) >> (STIX_SIZEOF(type) * 8 - (nbits)))
/* =========================================================================
* POINTER MANIPULATION MACROS
* ========================================================================= */
/*
 * Typed pointer arithmetic and comparison helpers. Each macro casts its
 * pointer operand(s) to 'type*' ('type __huge*' on MS-DOS) before applying
 * the operator.
 *
 * Every pointer argument is parenthesized before the cast so that an
 * expression argument such as 'p + n' is cast as a whole. Without the
 * parentheses the cast would bind to 'p' alone and the remainder would be
 * scaled by sizeof(type).
 */
#if defined(__MSDOS__)
#	define STIX_INCPTR(type,base,inc)  (((type __huge*)(base)) + (inc))
#	define STIX_DECPTR(type,base,inc)  (((type __huge*)(base)) - (inc))
#	define STIX_GTPTR(type,ptr1,ptr2)  (((type __huge*)(ptr1)) > ((type __huge*)(ptr2)))
#	define STIX_GEPTR(type,ptr1,ptr2)  (((type __huge*)(ptr1)) >= ((type __huge*)(ptr2)))
#	define STIX_LTPTR(type,ptr1,ptr2)  (((type __huge*)(ptr1)) < ((type __huge*)(ptr2)))
#	define STIX_LEPTR(type,ptr1,ptr2)  (((type __huge*)(ptr1)) <= ((type __huge*)(ptr2)))
#	define STIX_EQPTR(type,ptr1,ptr2)  (((type __huge*)(ptr1)) == ((type __huge*)(ptr2)))
#	define STIX_SUBPTR(type,ptr1,ptr2) (((type __huge*)(ptr1)) - ((type __huge*)(ptr2)))
#else
#	define STIX_INCPTR(type,base,inc)  (((type*)(base)) + (inc))
#	define STIX_DECPTR(type,base,inc)  (((type*)(base)) - (inc))
#	define STIX_GTPTR(type,ptr1,ptr2)  (((type*)(ptr1)) > ((type*)(ptr2)))
#	define STIX_GEPTR(type,ptr1,ptr2)  (((type*)(ptr1)) >= ((type*)(ptr2)))
#	define STIX_LTPTR(type,ptr1,ptr2)  (((type*)(ptr1)) < ((type*)(ptr2)))
#	define STIX_LEPTR(type,ptr1,ptr2)  (((type*)(ptr1)) <= ((type*)(ptr2)))
#	define STIX_EQPTR(type,ptr1,ptr2)  (((type*)(ptr1)) == ((type*)(ptr2)))
#	define STIX_SUBPTR(type,ptr1,ptr2) (((type*)(ptr1)) - ((type*)(ptr2)))
#endif
/* =========================================================================
* MMGR
* ========================================================================= */
typedef struct stix_mmgr_t stix_mmgr_t; typedef struct stix_mmgr_t stix_mmgr_t;
/** /**
* allocate a memory chunk of the size \a n. * allocate a memory chunk of the size \a n.
* @return pointer to a memory chunk on success, #STIX_NULL on failure. * \return pointer to a memory chunk on success, #STIX_NULL on failure.
*/ */
typedef void* (*stix_mmgr_alloc_t) (stix_mmgr_t* mmgr, stix_size_t n); typedef void* (*stix_mmgr_alloc_t) (stix_mmgr_t* mmgr, stix_size_t n);
/** /**
* resize a memory chunk pointed to by \a ptr to the size \a n. * resize a memory chunk pointed to by \a ptr to the size \a n.
* @return pointer to a memory chunk on success, #STIX_NULL on failure. * \return pointer to a memory chunk on success, #STIX_NULL on failure.
*/ */
typedef void* (*stix_mmgr_realloc_t) (stix_mmgr_t* mmgr, void* ptr, stix_size_t n); typedef void* (*stix_mmgr_realloc_t) (stix_mmgr_t* mmgr, void* ptr, stix_size_t n);
/** /**
@ -168,6 +179,41 @@ struct stix_mmgr_t
#define STIX_MMGR_FREE(mmgr,ptr) ((mmgr)->free(mmgr,ptr)) #define STIX_MMGR_FREE(mmgr,ptr) ((mmgr)->free(mmgr,ptr))
/* =========================================================================
* CMGR
* =========================================================================*/
typedef struct stix_cmgr_t stix_cmgr_t;
/**
 * The stix_cmgr_bctoc_t type defines a callback that decodes one character
 * from the byte sequence \a mb of \a size bytes and stores it into \a wc.
 * The string converters in utf8.c treat a return value of 0 as an invalid
 * sequence, a value greater than \a size as an incomplete sequence, and
 * any other value as the number of bytes consumed.
 */
typedef stix_size_t (*stix_cmgr_bctoc_t) (
const stix_bchar_t* mb,
stix_size_t size,
stix_char_t* wc
);
/**
 * The stix_cmgr_ctobc_t type defines a callback that encodes the character
 * \a wc into the byte buffer \a mb of \a size bytes.
 * The string converters in utf8.c treat a return value of 0 as an illegal
 * character, a value greater than \a size as a buffer that is too small,
 * and any other value as the number of bytes produced.
 */
typedef stix_size_t (*stix_cmgr_ctobc_t) (
stix_char_t wc,
stix_bchar_t* mb,
stix_size_t size
);
/**
 * The stix_cmgr_t type defines the character-level interface to
 * multibyte/wide-character conversion. This interface doesn't
 * provide any facility to store conversion state in a context
 * independent manner. This leads to the limitation that it can
 * handle a stateless multibyte encoding only.
 */
struct stix_cmgr_t
{
stix_cmgr_bctoc_t bctoc; /* byte sequence to character */
stix_cmgr_ctobc_t ctobc; /* character to byte sequence */
};
/* =========================================================================
* MACROS THAT CHANGE THE BEHAVIOR OF THE C COMPILER/LINKER
* =========================================================================*/
#if defined(_WIN32) || defined(__WATCOMC__) #if defined(_WIN32) || defined(__WATCOMC__)
# define STIX_IMPORT __declspec(dllimport) # define STIX_IMPORT __declspec(dllimport)
# define STIX_EXPORT __declspec(dllexport) # define STIX_EXPORT __declspec(dllexport)
@ -467,7 +513,7 @@ enum stix_obj_type_t
}; };
typedef enum stix_obj_type_t stix_obj_type_t; typedef enum stix_obj_type_t stix_obj_type_t;
/* ------------------------------------------------------------------------- /* =========================================================================
* Object header structure * Object header structure
* *
* _flags: * _flags:
@ -502,7 +548,7 @@ typedef enum stix_obj_type_t stix_obj_type_t;
* class can have normal instance variables. On the contrary, the actual byte * class can have normal instance variables. On the contrary, the actual byte
* size calculation and the access to the payload fields become more complex. * size calculation and the access to the payload fields become more complex.
* Therefore, i've dropped the idea. * Therefore, i've dropped the idea.
* ------------------------------------------------------------------------- */ * ========================================================================= */
#define STIX_OBJ_FLAGS_TYPE_BITS 6 #define STIX_OBJ_FLAGS_TYPE_BITS 6
#define STIX_OBJ_FLAGS_UNIT_BITS 5 #define STIX_OBJ_FLAGS_UNIT_BITS 5
#define STIX_OBJ_FLAGS_EXTRA_BITS 1 #define STIX_OBJ_FLAGS_EXTRA_BITS 1

View File

@ -26,6 +26,8 @@
#include "stix-prv.h" #include "stix-prv.h"
/* size in bytes of the scratch buffer that receives a single encoded
 * character when csn_to_bcsn_with_cmgr() only measures the output length */
#define STIX_BCLEN_MAX 16
/* /*
* from RFC 2279 UTF-8, a transformation format of ISO 10646 * from RFC 2279 UTF-8, a transformation format of ISO 10646
* *
@ -64,7 +66,7 @@ static STIX_INLINE __utf8_t* get_utf8_slot (stix_char_t uc)
{ {
__utf8_t* cur, * end; __utf8_t* cur, * end;
STIX_ASSERT (STIX_SIZEOF(stix_iochar_t) == 1); STIX_ASSERT (STIX_SIZEOF(stix_bchar_t) == 1);
STIX_ASSERT (STIX_SIZEOF(stix_char_t) >= 2); STIX_ASSERT (STIX_SIZEOF(stix_char_t) >= 2);
end = utf8_table + STIX_COUNTOF(utf8_table); end = utf8_table + STIX_COUNTOF(utf8_table);
@ -79,7 +81,7 @@ static STIX_INLINE __utf8_t* get_utf8_slot (stix_char_t uc)
return STIX_NULL; /* invalid character */ return STIX_NULL; /* invalid character */
} }
stix_size_t stix_uctoutf8 (stix_char_t uc, stix_iochar_t* utf8, stix_size_t size) stix_size_t stix_uctoutf8 (stix_char_t uc, stix_bchar_t* utf8, stix_size_t size)
{ {
__utf8_t* cur = get_utf8_slot (uc); __utf8_t* cur = get_utf8_slot (uc);
@ -106,13 +108,13 @@ stix_size_t stix_uctoutf8 (stix_char_t uc, stix_iochar_t* utf8, stix_size_t size
return (stix_size_t)cur->length; return (stix_size_t)cur->length;
} }
stix_size_t stix_utf8touc (const stix_iochar_t* utf8, stix_size_t size, stix_char_t* uc) stix_size_t stix_utf8touc (const stix_bchar_t* utf8, stix_size_t size, stix_char_t* uc)
{ {
__utf8_t* cur, * end; __utf8_t* cur, * end;
STIX_ASSERT (utf8 != STIX_NULL); STIX_ASSERT (utf8 != STIX_NULL);
STIX_ASSERT (size > 0); STIX_ASSERT (size > 0);
STIX_ASSERT (STIX_SIZEOF(stix_iochar_t) == 1); STIX_ASSERT (STIX_SIZEOF(stix_bchar_t) == 1);
STIX_ASSERT (STIX_SIZEOF(stix_char_t) >= 2); STIX_ASSERT (STIX_SIZEOF(stix_char_t) >= 2);
end = utf8_table + STIX_COUNTOF(utf8_table); end = utf8_table + STIX_COUNTOF(utf8_table);
@ -177,8 +179,212 @@ stix_size_t stix_utf8touc (const stix_iochar_t* utf8, stix_size_t size, stix_cha
return 0; /* error - invalid sequence */ return 0; /* error - invalid sequence */
} }
stix_size_t stix_utf8len (const stix_iochar_t* utf8, stix_size_t size) stix_size_t stix_utf8len (const stix_bchar_t* utf8, stix_size_t size)
{ {
return stix_utf8touc (utf8, size, STIX_NULL); return stix_utf8touc (utf8, size, STIX_NULL);
} }
/* ----------------------------------------------------------------------- */
/*
 * Decodes the byte string 'bcs' of '*bcslen' bytes into characters using
 * the 'bctoc' callback of 'cmgr'.
 *
 * If 'cs' is not null, decoded characters are stored into 'cs', which can
 * hold '*cslen' characters. If 'cs' is null, the characters are only
 * counted and the input value of '*cslen' is not read.
 *
 * If 'all' is non-zero, an invalid or incomplete sequence consumes a single
 * byte and yields '?' instead of stopping the conversion.
 *
 * On return, '*bcslen' is the number of bytes consumed and '*cslen' is the
 * number of characters stored (or counted).
 *
 * Returns 0 on success, -1 on an invalid sequence, -2 if 'cs' is too small,
 * and -3 on an incomplete sequence.
 */
static int bcsn_to_csn_with_cmgr (
	const stix_bchar_t* bcs, stix_size_t* bcslen,
	stix_char_t* cs, stix_size_t* cslen, stix_cmgr_t* cmgr, int all)
{
	const stix_bchar_t* src = bcs;
	stix_size_t remaining = *bcslen;
	stix_size_t produced = 0;
	stix_size_t capacity = 0;
	int status = 0;

	if (cs) capacity = *cslen;

	while (remaining > 0)
	{
		stix_char_t scratch;
		stix_char_t* dst;
		stix_size_t used;

		if (cs)
		{
			if (produced >= capacity)
			{
				/* input remains but the output buffer is full */
				status = -2;
				break;
			}
			dst = &cs[produced];
		}
		else
		{
			/* counting only - decode into a throwaway slot */
			dst = &scratch;
		}

		used = cmgr->bctoc (src, remaining, dst);
		if (used == 0 || used > remaining)
		{
			/* 0 means an invalid sequence. a length beyond the
			 * remaining input means an incomplete sequence */
			if (!all)
			{
				status = (used == 0)? -1: -3;
				break;
			}

			/* lenient mode: skip one byte and substitute '?' */
			used = 1;
			if (cs) *dst = '?';
		}

		produced++;
		src += used;
		remaining -= used;
	}

	*cslen = produced;
	*bcslen = src - bcs;
	return status;
}
/*
 * Encodes the '*cslen' characters at 'cs' into bytes using the 'ctobc'
 * callback of 'cmgr'.
 *
 * If 'bcs' is not null, the bytes are written to 'bcs', which can hold
 * '*bcslen' bytes, and '*bcslen' is updated to the number of bytes written.
 * If 'bcs' is null, nothing is written and '*bcslen' is set to the number
 * of bytes the converted characters require.
 *
 * On return, '*cslen' is the number of characters converted. The output is
 * never null-terminated.
 *
 * Returns 0 on success, -1 on an illegal character, and -2 if 'bcs' is
 * too small.
 */
static int csn_to_bcsn_with_cmgr (
	const stix_char_t* cs, stix_size_t* cslen,
	stix_bchar_t* bcs, stix_size_t* bcslen, stix_cmgr_t* cmgr)
{
	const stix_char_t* const limit = cs + *cslen;
	const stix_char_t* cur;
	int status = 0;

	if (bcs)
	{
		stix_size_t avail = *bcslen;

		for (cur = cs; cur < limit; cur++)
		{
			stix_size_t len;

			if (avail == 0)
			{
				/* characters remain but no room is left */
				status = -2;
				break;
			}

			len = cmgr->ctobc (*cur, bcs, avail);
			if (len == 0)
			{
				/* illegal character */
				status = -1;
				break;
			}
			if (len > avail)
			{
				/* the encoded form doesn't fit in the room left */
				status = -2;
				break;
			}

			bcs += len;
			avail -= len;
		}

		/* capacity minus unused room gives the bytes written */
		*bcslen -= avail;
	}
	else
	{
		stix_bchar_t scratch[STIX_BCLEN_MAX];
		stix_size_t total = 0;

		for (cur = cs; cur < limit; cur++)
		{
			stix_size_t len = cmgr->ctobc (*cur, scratch, STIX_COUNTOF(scratch));
			if (len == 0)
			{
				/* illegal character */
				status = -1;
				break;
			}

			/* the scratch buffer is assumed to be large enough
			 * to hold any single encoded character */
			STIX_ASSERT (len <= STIX_COUNTOF(scratch));
			total += len;
		}

		/* the length excludes a terminating null character as
		 * this function doesn't null-terminate the result at all */
		*bcslen = total;
	}

	*cslen = cur - cs;
	return status;
}
/* character manager that plugs the single-character UTF-8 converters into
 * the generic string conversion routines of this file. the initializers
 * are positional: bctoc first, ctobc second. */
static stix_cmgr_t utf8_cmgr =
{
stix_utf8touc, /* bctoc: UTF-8 bytes to a character */
stix_uctoutf8 /* ctobc: a character to UTF-8 bytes */
};
/*
 * Converts the UTF-8 string 'bcs' of '*bcslen' bytes to the character
 * string 'ucs' by delegating to bcsn_to_csn_with_cmgr() with the UTF-8
 * character manager. The conversion stops at the first invalid or
 * incomplete sequence instead of substituting a replacement character.
 * The length arguments and the return value are those of the delegate.
 */
int stix_utf8toucs (
	const stix_bchar_t* bcs, stix_size_t* bcslen,
	stix_char_t* ucs, stix_size_t* ucslen)
{
	/* 0 requests strict conversion - no '?' substitution */
	const int convert_all = 0;

	return bcsn_to_csn_with_cmgr (bcs, bcslen, ucs, ucslen, &utf8_cmgr, convert_all);
}
/*
 * Converts the character string 'ucs' of '*ucslen' characters to the UTF-8
 * string 'bcs' by delegating to csn_to_bcsn_with_cmgr() with the UTF-8
 * character manager. The length arguments and the return value are those
 * of the delegate.
 */
int stix_ucstoutf8 (
	const stix_char_t* ucs, stix_size_t *ucslen,
	stix_bchar_t* bcs, stix_size_t* bcslen)
{
	int status;

	status = csn_to_bcsn_with_cmgr (ucs, ucslen, bcs, bcslen, &utf8_cmgr);
	return status;
}