diff --git a/stix/lib/main.c b/stix/lib/main.c index 1f1866b..2a34d9d 100644 --- a/stix/lib/main.c +++ b/stix/lib/main.c @@ -33,9 +33,12 @@ typedef struct xtn_t xtn_t; struct xtn_t { - char source_path[1024]; -}; + const char* source_path; /* path of the source file; main() points this at argv[1] */ + char bchar_buf[1024]; /* raw bytes read by read_input(), waiting to be converted */ + stix_size_t bchar_pos; /* NOTE(review): not referenced by the hunks shown here */ + stix_size_t bchar_len; /* number of bytes currently held in bchar_buf */ +}; static void* sys_alloc (stix_mmgr_t* mmgr, stix_size_t size) { @@ -82,18 +85,36 @@ static STIX_INLINE stix_oow_t open_input (stix_t* stix, stix_ioarg_t* arg) } } - static STIX_INLINE stix_oow_t read_input (stix_t* stix, stix_ioarg_t* arg) { + xtn_t* xtn = stix_getxtn(stix); + stix_size_t n, bcslen, cslen; + int x; + STIX_ASSERT (arg->handle != STIX_NULL); - if (fread (arg->buf, STIX_SIZEOF(arg->buf[0]), STIX_COUNTOF(arg->buf), arg->handle) == 0) + n = fread (&xtn->bchar_buf[xtn->bchar_len], STIX_SIZEOF(xtn->bchar_buf[0]), STIX_COUNTOF(xtn->bchar_buf) - xtn->bchar_len, arg->handle); + if (n == 0) { if (ferror(arg->handle)) { + stix_seterrnum (stix, STIX_EIOERR); + return -1; } + + } - return 0; + xtn->bchar_len += n; + bcslen = xtn->bchar_len; + cslen = STIX_COUNTOF(arg->buf); + x = stix_utf8toucs (xtn->bchar_buf, &bcslen, arg->buf, &cslen); + if (x == -2) + { + /* arg->buf is too small to take every converted character - not handled yet */ + } + if (x <= -1) + { /* NOTE(review): conversion failures are silently ignored here - TODO handle */ + } /* NOTE(review): no return statement follows although the function is declared to return stix_oow_t, and the bytes consumed by the conversion are never removed from bchar_buf */ } static STIX_INLINE stix_oow_t close_input (stix_t* stix, stix_ioarg_t* arg) @@ -151,10 +172,19 @@ static void dump_symbol_table (stix_t* stix) int main (int argc, char* argv[]) { stix_t* stix; + xtn_t* xtn; + printf ("Stix 1.0.0 - max named %lu max indexed %lu\n", (unsigned long int)STIX_MAX_NAMED_INSTVARS, (unsigned long int)STIX_MAX_INDEXED_INSTVARS(STIX_MAX_NAMED_INSTVARS)); + if (argc != 2) + { + fprintf (stderr, "Usage: %s filename\n", argv[0]); + return -1; + } + + { stix_oow_t x; @@ -206,7 +236,8 @@ printf ("%p\n", a); dump_symbol_table (stix); } - + xtn = stix_getxtn (stix); + xtn->source_path = argv[1]; if (stix_compile (stix, input_handler) <= -1) { printf ("cannot compile code\n"); diff --git a/stix/lib/stix-prv.h b/stix/lib/stix-prv.h 
index f9c4b32..754a8ab 100644 --- a/stix/lib/stix-prv.h +++ b/stix/lib/stix-prv.h @@ -334,7 +334,55 @@ stix_oop_t stix_getatsysdic ( stix_oop_t key ); +/* ========================================================================= */ +/* utf8.c */ +/* ========================================================================= */ +stix_size_t stix_uctoutf8 ( + stix_char_t uc, + stix_bchar_t* utf8, + stix_size_t size +); +stix_size_t stix_utf8touc ( + const stix_bchar_t* utf8, + stix_size_t size, + stix_char_t* uc +); + + +int stix_ucstoutf8 ( + const stix_char_t* ucs, + stix_size_t* ucslen, + stix_bchar_t* bcs, + stix_size_t* bcslen +); + +/** + * The stix_utf8toucs() function converts a UTF-8 string to a Unicode string. + * + * It never returns -2 if \a ucs is #STIX_NULL. + * + * \code + * const stix_bchar_t* bcs = "a multibyte string"; + * stix_char_t ucs[100]; + * stix_size_t ucslen = STIX_COUNTOF(ucs); + * stix_size_t bcslen = strlen(bcs); + * int n; + * n = stix_utf8toucs (bcs, &bcslen, ucs, &ucslen); + * if (n <= -1) { invalid/incomplete sequence or buffer too small } + * \endcode + * + * \return 0 on success. + * -1 if \a bcs contains an illegal character. + * -2 if the wide-character string buffer is too small. + * -3 if \a bcs is not a complete sequence. + */ +int stix_utf8toucs ( + const stix_bchar_t* bcs, + stix_size_t* bcslen, + stix_char_t* ucs, + stix_size_t* ucslen +); /* ========================================================================= */ /* comp.c */ diff --git a/stix/lib/stix.h b/stix/lib/stix.h index 3170e23..170f415 100644 --- a/stix/lib/stix.h +++ b/stix/lib/stix.h @@ -31,27 +31,9 @@ /* TODO: move this macro out to the build files.... 
*/ #define STIX_INCLUDE_COMPILER -#if defined(__MSDOS__) -# define STIX_INCPTR(type,base,inc) (((type __huge*)base) + (inc)) -# define STIX_DECPTR(type,base,inc) (((type __huge*)base) - (inc)) -# define STIX_GTPTR(type,ptr1,ptr2) (((type __huge*)ptr1) > ((type __huge*)ptr2)) -# define STIX_GEPTR(type,ptr1,ptr2) (((type __huge*)ptr1) >= ((type __huge*)ptr2)) -# define STIX_LTPTR(type,ptr1,ptr2) (((type __huge*)ptr1) < ((type __huge*)ptr2)) -# define STIX_LEPTR(type,ptr1,ptr2) (((type __huge*)ptr1) <= ((type __huge*)ptr2)) -# define STIX_EQPTR(type,ptr1,ptr2) (((type __huge*)ptr1) == ((type __huge*)ptr2)) -# define STIX_SUBPTR(type,ptr1,ptr2) (((type __huge*)ptr1) - ((type __huge*)ptr2)) -#else -# define STIX_INCPTR(type,base,inc) (((type*)base) + (inc)) -# define STIX_DECPTR(type,base,inc) (((type*)base) - (inc)) -# define STIX_GTPTR(type,ptr1,ptr2) (((type*)ptr1) > ((type*)ptr2)) -# define STIX_GEPTR(type,ptr1,ptr2) (((type*)ptr1) >= ((type*)ptr2)) -# define STIX_LTPTR(type,ptr1,ptr2) (((type*)ptr1) < ((type*)ptr2)) -# define STIX_LEPTR(type,ptr1,ptr2) (((type*)ptr1) <= ((type*)ptr2)) -# define STIX_EQPTR(type,ptr1,ptr2) (((type*)ptr1) == ((type*)ptr2)) -# define STIX_SUBPTR(type,ptr1,ptr2) (((type*)ptr1) - ((type*)ptr2)) -#endif - -/* ========================================================================== */ +/* ========================================================================= + * PRIMITIVE TYPE DEFINITIONS + * ========================================================================= */ /* TODO: define these types and macros using autoconf */ typedef unsigned char stix_uint8_t; typedef unsigned short int stix_uint16_t; @@ -64,8 +46,11 @@ typedef unsigned long int stix_uintptr_t; typedef unsigned long int stix_size_t; typedef unsigned short int stix_char_t; /* TODO ... wchar_t??? 
*/ -typedef char stix_iochar_t; +typedef char stix_bchar_t; +/* ========================================================================= + * PRIMITIVE MACROS + * ========================================================================= */ #define STIX_SIZEOF(x) (sizeof(x)) #define STIX_COUNTOF(x) (sizeof(x) / sizeof(x[0])) @@ -92,7 +77,6 @@ typedef char stix_iochar_t; # define STIX_NULL ((void*)0) #endif - /* make a low bit mask that can mask off low n bits*/ #define STIX_LBMASK(type,n) (~(~((type)0) << (n))) @@ -114,16 +98,43 @@ typedef char stix_iochar_t; /*#define STIX_BITS_MAX(type,nbits) ((((type)1) << (nbits)) - 1)*/ #define STIX_BITS_MAX(type,nbits) ((~(type)0) >> (STIX_SIZEOF(type) * 8 - (nbits))) +/* ========================================================================= + * POINTER MANIPULATION MACROS + * ========================================================================= */ + +#if defined(__MSDOS__) +# define STIX_INCPTR(type,base,inc) (((type __huge*)base) + (inc)) +# define STIX_DECPTR(type,base,inc) (((type __huge*)base) - (inc)) +# define STIX_GTPTR(type,ptr1,ptr2) (((type __huge*)ptr1) > ((type __huge*)ptr2)) +# define STIX_GEPTR(type,ptr1,ptr2) (((type __huge*)ptr1) >= ((type __huge*)ptr2)) +# define STIX_LTPTR(type,ptr1,ptr2) (((type __huge*)ptr1) < ((type __huge*)ptr2)) +# define STIX_LEPTR(type,ptr1,ptr2) (((type __huge*)ptr1) <= ((type __huge*)ptr2)) +# define STIX_EQPTR(type,ptr1,ptr2) (((type __huge*)ptr1) == ((type __huge*)ptr2)) +# define STIX_SUBPTR(type,ptr1,ptr2) (((type __huge*)ptr1) - ((type __huge*)ptr2)) +#else +# define STIX_INCPTR(type,base,inc) (((type*)base) + (inc)) +# define STIX_DECPTR(type,base,inc) (((type*)base) - (inc)) +# define STIX_GTPTR(type,ptr1,ptr2) (((type*)ptr1) > ((type*)ptr2)) +# define STIX_GEPTR(type,ptr1,ptr2) (((type*)ptr1) >= ((type*)ptr2)) +# define STIX_LTPTR(type,ptr1,ptr2) (((type*)ptr1) < ((type*)ptr2)) +# define STIX_LEPTR(type,ptr1,ptr2) (((type*)ptr1) <= ((type*)ptr2)) +# define 
STIX_EQPTR(type,ptr1,ptr2) (((type*)ptr1) == ((type*)ptr2)) +# define STIX_SUBPTR(type,ptr1,ptr2) (((type*)ptr1) - ((type*)ptr2)) +#endif + +/* ========================================================================= + * MMGR + * ========================================================================= */ typedef struct stix_mmgr_t stix_mmgr_t; /** * allocate a memory chunk of the size \a n. - * @return pointer to a memory chunk on success, #STIX_NULL on failure. + * \return pointer to a memory chunk on success, #STIX_NULL on failure. */ typedef void* (*stix_mmgr_alloc_t) (stix_mmgr_t* mmgr, stix_size_t n); /** * resize a memory chunk pointed to by \a ptr to the size \a n. - * @return pointer to a memory chunk on success, #STIX_NULL on failure. + * \return pointer to a memory chunk on success, #STIX_NULL on failure. */ typedef void* (*stix_mmgr_realloc_t) (stix_mmgr_t* mmgr, void* ptr, stix_size_t n); /** @@ -168,6 +179,41 @@ struct stix_mmgr_t #define STIX_MMGR_FREE(mmgr,ptr) ((mmgr)->free(mmgr,ptr)) +/* ========================================================================= + * CMGR + * =========================================================================*/ + +typedef struct stix_cmgr_t stix_cmgr_t; + +typedef stix_size_t (*stix_cmgr_bctoc_t) ( + const stix_bchar_t* mb, + stix_size_t size, + stix_char_t* wc +); + +typedef stix_size_t (*stix_cmgr_ctobc_t) ( + stix_char_t wc, + stix_bchar_t* mb, + stix_size_t size +); + +/** + * The stix_cmgr_t type defines the character-level interface to + * multibyte/wide-character conversion. This interface doesn't + * provide any facility to store conversion state in a context + * independent manner. This leads to the limitation that it can + * handle a stateless multibyte encoding only. 
+ */ +struct stix_cmgr_t +{ + stix_cmgr_bctoc_t bctoc; /* multibyte sequence to character */ + stix_cmgr_ctobc_t ctobc; /* character to multibyte sequence */ +}; + +/* ========================================================================= + * MACROS THAT CHANGE THE BEHAVIOR OF THE C COMPILER/LINKER + * =========================================================================*/ + #if defined(_WIN32) || defined(__WATCOMC__) # define STIX_IMPORT __declspec(dllimport) # define STIX_EXPORT __declspec(dllexport) @@ -467,7 +513,7 @@ enum stix_obj_type_t }; typedef enum stix_obj_type_t stix_obj_type_t; -/* ------------------------------------------------------------------------- +/* ========================================================================= * Object header structure * * _flags: @@ -502,7 +548,7 @@ typedef enum stix_obj_type_t stix_obj_type_t; * class can have normal instance variables. On the contrary, the actual byte * size calculation and the access to the payload fields become more complex. * Therefore, i've dropped the idea. - * ------------------------------------------------------------------------- */ + * ========================================================================= */ #define STIX_OBJ_FLAGS_TYPE_BITS 6 #define STIX_OBJ_FLAGS_UNIT_BITS 5 #define STIX_OBJ_FLAGS_EXTRA_BITS 1 diff --git a/stix/lib/utf8.c b/stix/lib/utf8.c index 4c0ce4f..3e09a90 100644 --- a/stix/lib/utf8.c +++ b/stix/lib/utf8.c @@ -26,6 +26,8 @@ #include "stix-prv.h" +#define STIX_BCLEN_MAX 16 + /* * from RFC 2279 UTF-8, a transformation format of ISO 10646 * @@ -64,7 +66,7 @@ static STIX_INLINE __utf8_t* get_utf8_slot (stix_char_t uc) { __utf8_t* cur, * end; - STIX_ASSERT (STIX_SIZEOF(stix_iochar_t) == 1); + STIX_ASSERT (STIX_SIZEOF(stix_bchar_t) == 1); STIX_ASSERT (STIX_SIZEOF(stix_char_t) >= 2); end = utf8_table + STIX_COUNTOF(utf8_table); @@ -79,7 +81,7 @@ static STIX_INLINE __utf8_t* get_utf8_slot (stix_char_t uc) return STIX_NULL; /* invalid character */ } -stix_size_t stix_uctoutf8 (stix_char_t uc, stix_iochar_t* utf8, stix_size_t size) 
+stix_size_t stix_uctoutf8 (stix_char_t uc, stix_bchar_t* utf8, stix_size_t size) { __utf8_t* cur = get_utf8_slot (uc); @@ -106,13 +108,13 @@ stix_size_t stix_uctoutf8 (stix_char_t uc, stix_iochar_t* utf8, stix_size_t size return (stix_size_t)cur->length; } -stix_size_t stix_utf8touc (const stix_iochar_t* utf8, stix_size_t size, stix_char_t* uc) +stix_size_t stix_utf8touc (const stix_bchar_t* utf8, stix_size_t size, stix_char_t* uc) { __utf8_t* cur, * end; STIX_ASSERT (utf8 != STIX_NULL); STIX_ASSERT (size > 0); - STIX_ASSERT (STIX_SIZEOF(stix_iochar_t) == 1); + STIX_ASSERT (STIX_SIZEOF(stix_bchar_t) == 1); STIX_ASSERT (STIX_SIZEOF(stix_char_t) >= 2); end = utf8_table + STIX_COUNTOF(utf8_table); @@ -177,8 +179,212 @@ stix_size_t stix_utf8touc (const stix_iochar_t* utf8, stix_size_t size, stix_cha return 0; /* error - invalid sequence */ } -stix_size_t stix_utf8len (const stix_iochar_t* utf8, stix_size_t size) +stix_size_t stix_utf8len (const stix_bchar_t* utf8, stix_size_t size) { return stix_utf8touc (utf8, size, STIX_NULL); } +/* ----------------------------------------------------------------------- */ + +static int bcsn_to_csn_with_cmgr ( + const stix_bchar_t* bcs, stix_size_t* bcslen, + stix_char_t* cs, stix_size_t* cslen, stix_cmgr_t* cmgr, int all) +{ /* converts up to *bcslen bytes of bcs with cmgr->bctoc. on return, *bcslen is the number of bytes consumed and *cslen the number of characters stored in cs, or merely counted if cs is null. returns 0 on success, -1 on an invalid sequence, -2 if cs is full, -3 on an incomplete sequence. if all is nonzero, a bad sequence is skipped by one byte (stored as a question mark when cs is given) instead of ending the conversion. */ + const stix_bchar_t* p; + int ret = 0; + stix_size_t mlen; + + if (cs) + { + stix_char_t* q, * qend; + + p = bcs; + q = cs; + qend = cs + *cslen; + mlen = *bcslen; + + while (mlen > 0) + { + stix_size_t n; + + if (q >= qend) + { + /* buffer too small */ + ret = -2; + break; + } + + n = cmgr->bctoc (p, mlen, q); + if (n == 0) + { + /* invalid sequence */ + if (all) + { + n = 1; + *q = '?'; + } + else + { + ret = -1; + break; + } + } + if (n > mlen) + { + /* incomplete sequence */ + if (all) + { + n = 1; + *q = '?'; + } + else + { + ret = -3; + break; + } + } + + q++; + p += n; + mlen -= n; + } + + *cslen = q - cs; + *bcslen = p - bcs; + } + else + { /* cs is null: count the characters without storing them */ + stix_char_t w; + stix_size_t wlen = 0; + + p = bcs; + mlen = 
*bcslen; + + while (mlen > 0) + { + stix_size_t n; + + n = cmgr->bctoc (p, mlen, &w); + if (n == 0) + { + /* invalid sequence */ + if (all) n = 1; + else + { + ret = -1; + break; + } + } + if (n > mlen) + { + /* incomplete sequence */ + if (all) n = 1; + else + { + ret = -3; + break; + } + } + + p += n; + mlen -= n; + wlen += 1; + } + + *cslen = wlen; + *bcslen = p - bcs; + } + + return ret; +} + +static int csn_to_bcsn_with_cmgr ( + const stix_char_t* cs, stix_size_t* cslen, + stix_bchar_t* bcs, stix_size_t* bcslen, stix_cmgr_t* cmgr) +{ /* converts up to *cslen characters of cs with cmgr->ctobc. on return, *cslen is the number of characters consumed and *bcslen the number of bytes written to bcs, or the number of bytes the consumed characters would occupy if bcs is null. returns 0 on success, -1 on an illegal character, -2 if bcs is too small. */ + const stix_char_t* p = cs; + const stix_char_t* end = cs + *cslen; + int ret = 0; + + if (bcs) + { + stix_size_t rem = *bcslen; + + while (p < end) + { + stix_size_t n; + + if (rem <= 0) + { + ret = -2; /* buffer too small */ + break; + } + + n = cmgr->ctobc (*p, bcs, rem); + if (n == 0) + { + ret = -1; + break; /* illegal character */ + } + if (n > rem) + { + ret = -2; /* buffer too small */ + break; + } + bcs += n; rem -= n; p++; + } + + *bcslen -= rem; + } + else + { + stix_bchar_t bcsbuf[STIX_BCLEN_MAX]; + stix_size_t mlen = 0; + + while (p < end) + { + stix_size_t n; + + n = cmgr->ctobc (*p, bcsbuf, STIX_COUNTOF(bcsbuf)); + if (n == 0) + { + ret = -1; + break; /* illegal character */ + } + + /* it assumes that bcsbuf is large enough to hold a character */ + STIX_ASSERT (n <= STIX_COUNTOF(bcsbuf)); + + p++; mlen += n; + } + + /* this length excludes the terminating null character. + * this function doesn't even null-terminate the result. 
*/ + *bcslen = mlen; + } + + *cslen = p - cs; + + return ret; +} + + +static stix_cmgr_t utf8_cmgr = +{ + stix_utf8touc, + stix_uctoutf8 +}; + +int stix_utf8toucs ( + const stix_bchar_t* bcs, stix_size_t* bcslen, + stix_char_t* ucs, stix_size_t* ucslen) +{ + return bcsn_to_csn_with_cmgr (bcs, bcslen, ucs, ucslen, &utf8_cmgr, 0); +} + +int stix_ucstoutf8 ( + const stix_char_t* ucs, stix_size_t *ucslen, + stix_bchar_t* bcs, stix_size_t* bcslen) +{ + return csn_to_bcsn_with_cmgr (ucs, ucslen, bcs, bcslen, &utf8_cmgr); +}