From fa6dfeddc9d0a887d60f4fc855efd5b102ffd912 Mon Sep 17 00:00:00 2001 From: hyung-hwan Date: Sat, 18 Oct 2008 05:43:20 +0000 Subject: [PATCH] adding wide string and multibyte string conversion --- ase/configure | 97 +++++++++++++++++++++++++++++++ ase/configure.ac | 1 + ase/include/ase/awk/Awk.hpp | 9 ++- ase/include/ase/cmn/chr.h | 24 ++++++++ ase/include/ase/cmn/str.h | 100 +++++++++++++++++++++++++++----- ase/include/ase/cmn/tio.h | 14 +++++ ase/include/ase/config.h.in | 12 ++++ ase/lib/awk/Awk.cpp | 6 +- ase/lib/cmn/chr_cnv.c | 25 +++++--- ase/lib/cmn/str_cnv.c | 111 +++++++++++++++++++++++++++++++++++- ase/lib/cmn/tio.c | 16 ++++++ ase/lib/cmn/tio_get.c | 15 ++++- 12 files changed, 401 insertions(+), 29 deletions(-) diff --git a/ase/configure b/ase/configure index 58ad89cc..1f3426e1 100755 --- a/ase/configure +++ b/ase/configure @@ -21134,6 +21134,103 @@ fi done + + + +for ac_func in mbsnrtowcs mbsrtowcs wcsnrtombs wcsrtombs +do +as_ac_var=`echo "ac_cv_func_$ac_func" | $as_tr_sh` +{ echo "$as_me:$LINENO: checking for $ac_func" >&5 +echo $ECHO_N "checking for $ac_func... $ECHO_C" >&6; } +if { as_var=$as_ac_var; eval "test \"\${$as_var+set}\" = set"; }; then + echo $ECHO_N "(cached) $ECHO_C" >&6 +else + cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ +/* Define $ac_func to an innocuous variant, in case declares $ac_func. + For example, HP-UX 11i declares gettimeofday. */ +#define $ac_func innocuous_$ac_func + +/* System header to define __stub macros and hopefully few prototypes, + which can conflict with char $ac_func (); below. + Prefer to if __STDC__ is defined, since + exists even on freestanding compilers. */ + +#ifdef __STDC__ +# include +#else +# include +#endif + +#undef $ac_func + +/* Override any GCC internal prototype to avoid an error. 
+ Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $ac_func (); +/* The GNU C library defines this for functions which it implements + to always fail with ENOSYS. Some functions are actually named + something starting with __ and the normal name is an alias. */ +#if defined __stub_$ac_func || defined __stub___$ac_func +choke me +#endif + +int +main () +{ +return $ac_func (); + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext conftest$ac_exeext +if { (ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 + (eval "$ac_link") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest$ac_exeext && + $as_test_x conftest$ac_exeext; then + eval "$as_ac_var=yes" +else + echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + eval "$as_ac_var=no" +fi + +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ + conftest$ac_exeext conftest.$ac_ext +fi +ac_res=`eval echo '${'$as_ac_var'}'` + { echo "$as_me:$LINENO: result: $ac_res" >&5 +echo "${ECHO_T}$ac_res" >&6; } +if test `eval echo '${'$as_ac_var'}'` = yes; then + cat >>confdefs.h <<_ACEOF +#define `echo "HAVE_$ac_func" | $as_tr_cpp` 1 +_ACEOF + +fi +done + + { echo "$as_me:$LINENO: checking for char" >&5 echo $ECHO_N "checking for char... 
$ECHO_C" >&6; } if test "${ac_cv_type_char+set}" = set; then diff --git a/ase/configure.ac b/ase/configure.ac index 31f6e2ad..0acda669 100644 --- a/ase/configure.ac +++ b/ase/configure.ac @@ -86,6 +86,7 @@ AC_CHECK_TYPE([wchar_t], dnl check functions AC_CHECK_FUNCS([uselocale]) AC_CHECK_FUNCS([mbrlen mbrtowc wcrtomb]) +AC_CHECK_FUNCS([mbsnrtowcs mbsrtowcs wcsnrtombs wcsrtombs]) dnl Checks the size of primitive data types AC_CHECK_SIZEOF(char) diff --git a/ase/include/ase/awk/Awk.hpp b/ase/include/ase/awk/Awk.hpp index d1f7a8af..ab925062 100644 --- a/ase/include/ase/awk/Awk.hpp +++ b/ase/include/ase/awk/Awk.hpp @@ -1,5 +1,5 @@ /* - * $Id: Awk.hpp 399 2008-09-29 10:26:26Z baconevi $ + * $Id: Awk.hpp 430 2008-10-17 11:43:20Z baconevi $ * * {License} */ @@ -11,6 +11,7 @@ #include #include #include +#include ///////////////////////////////// ASE_BEGIN_NAMESPACE(ASE) @@ -323,8 +324,10 @@ public: public: // initialization - void* operator new (size_t n, awk_t* awk) throw (); - void* operator new[] (size_t n, awk_t* awk) throw (); + //void* operator new (size_t n, awk_t* awk) throw (); + //void* operator new[] (size_t n, awk_t* awk) throw (); + void* operator new (::size_t n, awk_t* awk) throw (); + void* operator new[] (::size_t n, awk_t* awk) throw (); #if !defined(__BORLANDC__) // deletion when initialization fails diff --git a/ase/include/ase/cmn/chr.h b/ase/include/ase/cmn/chr.h index d799becf..39f71900 100644 --- a/ase/include/ase/cmn/chr.h +++ b/ase/include/ase/cmn/chr.h @@ -54,17 +54,41 @@ ase_size_t ase_mblen ( ase_size_t mblen ); +/****f* ase.cmn.chr/ase_mbtowc + * NAME + * ase_mbtowc - convert a multibyte sequence to a wide character. + * + * RETURN + * The ase_mbtowc() function returns 0 if an invalid multibyte sequence is + * detected, mblen + 1 if the sequence is incomplete. It returns the number + * of bytes processed to form a wide character. 
+ * + * SYNOPSIS + */ ase_size_t ase_mbtowc ( const ase_mchar_t* mb, ase_size_t mblen, ase_wchar_t* wc ); +/******/ +/****f* ase.cmn.chr/ase_wctomb + * NAME + * ase_wctomb - convert a wide character to a multibyte sequence + * + * RETURN + * The ase_wctomb() function returns 0 if the wide character is illegal, + * mblen + 1 if mblen is not large enough to hold the multibyte sequence. + * On successful conversion, it returns the number of bytes in the sequence. + * + * SYNOPSIS + */ ase_size_t ase_wctomb ( ase_wchar_t wc, ase_mchar_t* mb, ase_size_t mblen ); +/******/ #ifdef __cplusplus } diff --git a/ase/include/ase/cmn/str.h b/ase/include/ase/cmn/str.h index 3889ce95..60a1bfe7 100644 --- a/ase/include/ase/cmn/str.h +++ b/ase/include/ase/cmn/str.h @@ -1,5 +1,5 @@ /* - * $Id: str.h 389 2008-09-26 08:01:24Z baconevi $ + * $Id: str.h 430 2008-10-17 11:43:20Z baconevi $ * * {License} */ @@ -229,10 +229,6 @@ ase_long_t ase_strxtolong (const ase_char_t* str, ase_size_t len); ase_uint_t ase_strxtouint (const ase_char_t* str, ase_size_t len); ase_ulong_t ase_strxtoulong (const ase_char_t* str, ase_size_t len); -/* - * dynamic string - */ - ase_str_t* ase_str_open ( ase_mmgr_t* mmgr, ase_size_t ext, @@ -346,16 +342,94 @@ ase_size_t ase_str_setcapa ( ase_size_t capa /* a new capacity */ ); -void ase_str_clear (ase_str_t* str); -void ase_str_swap (ase_str_t* str, ase_str_t* str2); +void ase_str_clear ( + ase_str_t* str +); -ase_size_t ase_str_cpy (ase_str_t* str, const ase_char_t* s); -ase_size_t ase_str_ncpy (ase_str_t* str, const ase_char_t* s, ase_size_t len); +void ase_str_swap ( + ase_str_t* str, + ase_str_t* str2 +); -ase_size_t ase_str_cat (ase_str_t* str, const ase_char_t* s); -ase_size_t ase_str_ncat (ase_str_t* str, const ase_char_t* s, ase_size_t len); -ase_size_t ase_str_ccat (ase_str_t* str, ase_char_t c); -ase_size_t ase_str_nccat (ase_str_t* str, ase_char_t c, ase_size_t len); +ase_size_t ase_str_cpy ( + ase_str_t* str, + const ase_char_t* s +); + +ase_size_t 
ase_str_ncpy ( + ase_str_t* str, + const ase_char_t* s, + ase_size_t len +); + +ase_size_t ase_str_cat ( + ase_str_t* str, + const ase_char_t* s +); + +ase_size_t ase_str_ncat ( + ase_str_t* str, + const ase_char_t* s, + ase_size_t len +); + +ase_size_t ase_str_ccat ( + ase_str_t* str, + ase_char_t c +); + +ase_size_t ase_str_nccat ( + ase_str_t* str, + ase_char_t c, + ase_size_t len +); + + +ase_size_t ase_mbstowcs ( + const ase_mchar_t* mbs, + ase_wchar_t* wcs, + ase_size_t* wcslen +); + +/****f* ase.cmn.str/ase_mbsntowcsn + * NAME + * ase_mbsntowcsn - convert a multibyte string to a wide character string + * + * RETURN + * The ase_mbsntowcsn() function returns the number of bytes handled. + * + * SYNOPSIS + */ +ase_size_t ase_mbsntowcsn ( + const ase_mchar_t* mbs, + ase_size_t mbslen, + ase_wchar_t* wcs, + ase_size_t* wcslen +); +/******/ + +ase_size_t ase_wcstombs ( + const ase_wchar_t* wcs, + ase_mchar_t* mbs, + ase_size_t* mbslen +); + +/****f* ase.cmn.str/ase_wcsntombsn + * NAME + * ase_wcsntombsn - convert a wide character string to a multibyte string + * + * RETURN + * The ase_wcsntombsn() function returns the number of wide characters handled. 
+ * + * SYNOPSIS + */ +ase_size_t ase_wcsntombsn ( + const ase_wchar_t* wcs, + ase_size_t wcslen, + ase_mchar_t* mbs, + ase_size_t* mbslen +); +/******/ #ifdef __cplusplus } diff --git a/ase/include/ase/cmn/tio.h b/ase/include/ase/cmn/tio.h index 2e9ea9d6..8478e3d8 100644 --- a/ase/include/ase/cmn/tio.h +++ b/ase/include/ase/cmn/tio.h @@ -15,6 +15,7 @@ enum ASE_TIO_ENOMEM, /* out of memory */ ASE_TIO_ENOSPC, /* no more space */ ASE_TIO_EILSEQ, /* illegal sequence */ + ASE_TIO_EICSEQ, /* incomplete sequence */ ASE_TIO_EILCHR, /* illegal character */ ASE_TIO_ENOINF, /* no input function attached */ ASE_TIO_EINPUT, /* input function returned an error */ @@ -114,6 +115,19 @@ int ase_tio_fini ( ase_tio_t* tio ); +void* ase_tio_getextension ( + ase_tio_t* tio +); + +ase_mmgr_t* ase_tio_getmmgr ( + ase_tio_t* tio +); + +void ase_tio_setmmgr ( + ase_tio_t* tio, + ase_mmgr_t* mmgr +); + /* * FUNCTION: ase_tio_geterrnum * Returns an error code diff --git a/ase/include/ase/config.h.in b/ase/include/ase/config.h.in index 337c2c2e..f49bf038 100644 --- a/ase/include/ase/config.h.in +++ b/ase/include/ase/config.h.in @@ -87,6 +87,12 @@ /* Define to 1 if you have the `mbrtowc' function. */ #undef HAVE_MBRTOWC +/* Define to 1 if you have the `mbsnrtowcs' function. */ +#undef HAVE_MBSNRTOWCS + +/* Define to 1 if you have the `mbsrtowcs' function. */ +#undef HAVE_MBSRTOWCS + /* Define to 1 if you have the header file. */ #undef HAVE_MEMORY_H @@ -126,6 +132,12 @@ /* Define to 1 if you have the `wcrtomb' function. */ #undef HAVE_WCRTOMB +/* Define to 1 if you have the `wcsnrtombs' function. */ +#undef HAVE_WCSNRTOMBS + +/* Define to 1 if you have the `wcsrtombs' function. */ +#undef HAVE_WCSRTOMBS + /* Define to 1 if you have the header file. 
*/ #undef HAVE_WCTYPE_H diff --git a/ase/lib/awk/Awk.cpp b/ase/lib/awk/Awk.cpp index 897b202f..a7a690c7 100644 --- a/ase/lib/awk/Awk.cpp +++ b/ase/lib/awk/Awk.cpp @@ -1,5 +1,5 @@ /* - * $Id: Awk.cpp 399 2008-09-29 10:26:26Z baconevi $ + * $Id: Awk.cpp 430 2008-10-17 11:43:20Z baconevi $ * * {License} */ @@ -229,7 +229,7 @@ void Awk::Argument::clear () this->inum = 0; } -void* Awk::Argument::operator new (size_t n, awk_t* awk) throw () +void* Awk::Argument::operator new (::size_t n, awk_t* awk) throw () { void* ptr = ase_awk_alloc (awk, ASE_SIZEOF(awk) + n); if (ptr == ASE_NULL) return ASE_NULL; @@ -238,7 +238,7 @@ void* Awk::Argument::operator new (size_t n, awk_t* awk) throw () return (char*)ptr+ASE_SIZEOF(awk); } -void* Awk::Argument::operator new[] (size_t n, awk_t* awk) throw () +void* Awk::Argument::operator new[] (::size_t n, awk_t* awk) throw () { void* ptr = ase_awk_alloc (awk, ASE_SIZEOF(awk) + n); if (ptr == ASE_NULL) return ASE_NULL; diff --git a/ase/lib/cmn/chr_cnv.c b/ase/lib/cmn/chr_cnv.c index c35d88d4..fe2b0c34 100644 --- a/ase/lib/cmn/chr_cnv.c +++ b/ase/lib/cmn/chr_cnv.c @@ -3,6 +3,7 @@ */ #include +#include "mem.h" #ifdef HAVE_WCHAR_H #include @@ -56,12 +57,6 @@ ase_size_t ase_wctomb (ase_wchar_t wc, ase_mchar_t* mb, ase_size_t mblen) size_t n; mbstate_t mbs = { 0 }; - if (mblen < MB_CUR_MAX) - { - /* buffer too small */ - return mblen + 1; - } - /* man mbsinit * For 8-bit encodings, all states are equivalent to the initial state. * For multibyte encodings like UTF-8, EUC-*, BIG5 or SJIS, the wide char‐ @@ -71,8 +66,22 @@ ase_size_t ase_wctomb (ase_wchar_t wc, ase_mchar_t* mb, ase_size_t mblen) * of a character. 
*/
 
-	n = wcrtomb (mb, wc, &mbs);
-	if (n == (size_t)-1) n = 0; // illegal character
+	if (mblen < MB_CUR_MAX)
+	{
+		ase_mchar_t buf[MB_CUR_MAX];
+
+		n = wcrtomb (buf, wc, &mbs);
+		if (n == (size_t)-1) return 0; /* illegal character. tested first since (size_t)-1 > mblen */
+		if (n > mblen) return mblen + 1; /* buffer too small */
+
+		ASE_MEMCPY (mb, buf, n); /* copy only the bytes wcrtomb produced */
+	}
+	else
+	{
+		n = wcrtomb (mb, wc, &mbs);
+		if (n == (size_t)-1) return 0; /* illegal character. tested first since (size_t)-1 > mblen */
+		if (n > mblen) return mblen + 1; /* buffer too small */
+	}
 
 	return n;
 #else
diff --git a/ase/lib/cmn/str_cnv.c b/ase/lib/cmn/str_cnv.c
index e665718b..e94e89c7 100644
--- a/ase/lib/cmn/str_cnv.c
+++ b/ase/lib/cmn/str_cnv.c
@@ -1,11 +1,15 @@
 /*
- * $Id: str_cnv.c 332 2008-08-18 11:21:48Z baconevi $
+ * $Id: str_cnv.c 430 2008-10-17 11:43:20Z baconevi $
  *
  * {License}
  */
 
 #include <ase/cmn/str.h>
 
+#ifdef HAVE_WCHAR_H
+#include <wchar.h>
+#endif
+
 int ase_strtoi (const ase_char_t* str)
 {
 	int v;
@@ -117,3 +121,139 @@ ase_ulong_t ase_strxtoulong (const ase_char_t* str, ase_size_t len)
 	ASE_STRXTONUM (v, str, len, ASE_NULL, 10);
 	return v;
 }
+
+/*
+ * Converts the null-terminated multibyte string mbs into wcs.
+ * On entry *wcslen is the capacity of wcs in wide characters including
+ * room for the terminating null; on return it holds the number of wide
+ * characters stored, excluding the terminator. With a capacity of 0
+ * nothing is written; with a capacity of 1 only the terminating null is
+ * stored. In both cases *wcslen is left as it was and 0 is returned.
+ * Returns the number of bytes of mbs consumed.
+ */
+ase_size_t ase_mbstowcs (
+	const ase_mchar_t* mbs, ase_wchar_t* wcs, ase_size_t* wcslen)
+{
+	const ase_mchar_t* mp;
+	ase_size_t len, wlen;
+
+	if (*wcslen <= 0) return 0;
+	if (*wcslen == 1)
+	{
+		wcs[0] = L'\0';
+		return 0;
+	}
+
+	/* ase_mbtowc needs the input length. measure mbs with a separate
+	 * cursor so that mbs still points to the first byte when it is
+	 * passed to ase_mbsntowcsn */
+	for (mp = mbs, len = 0; *mp != '\0'; mp++) len++;
+
+	wlen = *wcslen - 1;
+	len = ase_mbsntowcsn (mbs, len, wcs, &wlen);
+
+	wcs[wlen] = L'\0';
+	*wcslen = wlen;
+/* TODO: wcslen should include the length including null? */
+	return len;
+}
+
+/*
+ * Converts at most mbslen bytes of mbs into wide characters stored in wcs.
+ * On entry *wcslen is the capacity of wcs; on return it holds the number
+ * of wide characters stored. No terminating null is appended.
+ * Conversion stops early when ase_mbtowc reports 0 or a value greater
+ * than the remaining byte count.
+ * Returns the number of bytes processed.
+ */
+ase_size_t ase_mbsntowcsn (
+	const ase_mchar_t* mbs, ase_size_t mbslen,
+	ase_wchar_t* wcs, ase_size_t* wcslen)
+{
+	ase_size_t mlen = mbslen, n;
+	const ase_mchar_t* p;
+	ase_wchar_t* q, * qend ;
+
+	qend = wcs + *wcslen;
+
+	for (p = mbs, q = wcs; mlen > 0 && q < qend; p += n, mlen -= n)
+	{
+		n = ase_mbtowc (p, mlen, q);
+		if (n == 0 || n > mlen)
+		{
+			/* wrong sequence or insufficient input */
+			break;
+		}
+
+		q++;
+	}
+
+	*wcslen = q - wcs;
+	return p - mbs; /* returns the number of bytes processed */
+}
+
+/*
+ * Converts the null-terminated wide character string wcs into mbs.
+ * On entry *mbslen is the capacity of mbs in bytes including room for
+ * the terminating null; on return it holds the number of bytes stored,
+ * excluding the terminator.
+ * Returns the number of wide characters handled. The caller can detect
+ * an error or truncation by checking whether wcs[return value] is the
+ * terminating null.
+ */
+ase_size_t ase_wcstombs (
+	const ase_wchar_t* wcs, ase_mchar_t* mbs, ase_size_t* mbslen)
+{
+	const ase_wchar_t* p = wcs;
+	ase_size_t len = *mbslen;
+
+	while (*p != L'\0' && len > 1)
+	{
+		/* offer one byte less than what is left so that the
+		 * terminating null always fits */
+		ase_size_t n = ase_wctomb (*p, mbs, len - 1);
+		if (n == 0 || n > len - 1)
+		{
+			/* illegal character or buffer not enough */
+			break;
+		}
+		mbs += n; len -= n; p++;
+	}
+
+	*mbslen -= len;
+	if (len > 0) *mbs = '\0';
+
+	return p - wcs;
+}
+
+/*
+ * Converts at most wcslen wide characters of wcs into a multibyte
+ * sequence stored in mbs. On entry *mbslen is the capacity of mbs in
+ * bytes; on return it holds the number of bytes stored. No terminating
+ * null is appended.
+ * Returns the number of wide characters handled. The caller can detect
+ * an error by checking whether the return value is as large as wcslen.
+ */
+ase_size_t ase_wcsntombsn (
+	const ase_wchar_t* wcs, ase_size_t wcslen,
+	ase_mchar_t* mbs, ase_size_t* mbslen)
+{
+	const ase_wchar_t* p = wcs;
+	const ase_wchar_t* end = wcs + wcslen;
+	ase_size_t len = *mbslen;
+
+	while (p < end && len > 0)
+	{
+		ase_size_t n = ase_wctomb (*p, mbs, len);
+		if (n == 0 || n > len)
+		{
+			/* illegal character or buffer not enough */
+			break;
+		}
+		mbs += n; len -= n; p++;
+	}
+
+	*mbslen -= len;
+
+	return p - wcs;
+}
+
diff --git a/ase/lib/cmn/tio.c b/ase/lib/cmn/tio.c
index 8e9e7ed2..1903fd86 100644
--- a/ase/lib/cmn/tio.c
+++ b/ase/lib/cmn/tio.c
@@ -69,6 +69,24 @@ int ase_tio_fini (ase_tio_t* tio)
 	return 0;
 }
 
+/* returns the address right after the tio object itself */
+void* ase_tio_getextension (ase_tio_t* tio)
+{
+	return tio + 1;
+}
+
+/* returns the memory manager currently recorded in tio */
+ase_mmgr_t* ase_tio_getmmgr (ase_tio_t* tio)
+{
+	return tio->mmgr;
+}
+
+/* replaces the memory manager recorded in tio with mmgr */
+void ase_tio_setmmgr (ase_tio_t* tio, ase_mmgr_t* mmgr)
+{
+	tio->mmgr = mmgr;
+}
+
 int ase_tio_geterrnum (ase_tio_t* tio)
 {
 	return tio->errnum;
@@ -82,6 +100,7 @@ const ase_char_t* ase_tio_geterrstr (ase_tio_t* tio)
 		ASE_T("out of memory"),
 		ASE_T("no more space"),
 		ASE_T("illegal multibyte sequence"),
+		ASE_T("incomplete multibyte sequence"),
 		ASE_T("illegal wide character"),
 		ASE_T("no input function attached"),
 		ASE_T("input function returned an error"),
diff --git a/ase/lib/cmn/tio_get.c b/ase/lib/cmn/tio_get.c
index aaefcd7e..b7144731 100644
--- a/ase/lib/cmn/tio_get.c
+++ b/ase/lib/cmn/tio_get.c
@@ -39,7 +39,17 @@ ase_ssize_t ase_tio_getc (ase_tio_t* tio, ase_char_t* c)
 		n = tio->input_func (
 			ASE_TIO_IO_DATA, tio->input_arg,
 			&tio->inbuf[left], ASE_COUNTOF(tio->inbuf) - left);
-		if (n == 0) return 0;
+		if (n == 0)
+		{
+			if (tio->inbuf_curp < tio->inbuf_len)
+			{
+				/* garbage left in the buffer */
+				tio->errnum = ASE_TIO_EICSEQ;
+				return -1;
+			}
+
+			return 0;
+		}
 		if (n <= -1)
 		{
 			tio->errnum = ASE_TIO_EINPUT;
@@ -100,6 +110,7 @@ ase_ssize_t ase_tio_getc (ase_tio_t* tio, ase_char_t* c)
 			goto getc_conv;
 		}
 #endif
+
 	n = ase_mbtowc (&tio->inbuf[tio->inbuf_curp], left, &curc);
 	if (n == 0)
 	{
@@ -161,6 +172,7 @@ ase_ssize_t ase_tio_getsx (ase_tio_t* tio, ase_char_t* buf, ase_size_t size)
 		if (n == 0) break;
 		*p++ = c;
 
+		/* TODO: support a different line breaker */
 		if (c == ASE_T('\n')) break;
 	}
 
@@ -194,6 +206,7 @@ ase_ssize_t ase_tio_getstr (ase_tio_t* tio, ase_str_t* buf)
 			return -1;
 		}
 
+		/* TODO: support a different line breaker */
 		if (c == ASE_T('\n')) break;
 	}
 