qse/qse/lib/cmn/slmb.c

292 lines
7.8 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* $Id$
*
Copyright 2006-2012 Chung, Hyung-Hwan.
This file is part of QSE.
QSE is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation, either version 3 of
the License, or (at your option) any later version.
QSE is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with QSE. If not, see <http://www.gnu.org/licenses/>.
*/
#include <qse/cmn/slmb.h>
#include "mem.h"
#if !defined(QSE_HAVE_CONFIG_H)
# if defined(_WIN32) || defined(__OS2__) || defined(__DOS__)
# define HAVE_WCHAR_H
# define HAVE_STDLIB_H
# define HAVE_MBRLEN
# define HAVE_MBRTOWC
# define HAVE_WCRTOMB
# endif
#endif
#ifdef HAVE_WCHAR_H
# include <wchar.h>
#endif
#ifdef HAVE_STDLIB_H
# include <stdlib.h>
#endif
#if defined(_WIN32)
# include <windows.h>
#endif
/*
 * Converts the wide character wc to its multibyte form and stores the
 * resulting bytes in mb, a buffer of mbl bytes.
 *
 * On Windows the conversion goes through WideCharToMultiByte() with CP_ACP
 * and state is not used. Elsewhere it goes through wcrtomb() and state is
 * handed over as the conversion state.
 *
 * Returns:
 *   0        if wc can't be converted (illegal character),
 *   mbl + 1  if the buffer is too small to hold the result,
 *   otherwise the number of bytes written to mb.
 */
qse_size_t qse_slwcrtoslmb (
	qse_wchar_t wc, qse_mchar_t* mb,
	qse_size_t mbl, qse_mbstate_t* state)
{
#if defined(_WIN32)
	int n;

	/* CP_THREAD_ACP results in ERROR_INVALID_PARAMETER
	 * on an old windows os like win95 */
	n = WideCharToMultiByte (
		CP_ACP/*CP_THREAD_ACP*/, 0 /*WC_ERR_INVALID_CHARS*/,
		&wc, 1, mb, mbl, NULL, NULL);
	if (n == 0)
	{
		DWORD e = GetLastError();
		if (e == ERROR_INSUFFICIENT_BUFFER) return mbl + 1;
		/*if (e == ERROR_NO_UNICODE_TRANSLATION) return 0;*/
		/* treat all other errors as an invalid unicode character.
		 * n is 0 here, which is the value reported for that case. */
	}

	return (qse_size_t)n;

#elif defined(HAVE_WCRTOMB)
	size_t n;

	if (mbl < QSE_MBLEN_MAX)
	{
		/* the buffer given is too small. try conversion on
		 * a temporary buffer large enough to handle all locales
		 * and copy the result to the original buffer.
		 */
		qse_mchar_t buf[QSE_MBLEN_MAX];

		n = wcrtomb (buf, wc, (mbstate_t*)state);
		/* it's important that n is checked against (size_t)-1
		 * before it is checked against mbl. n > mbl is true if
		 * n is (size_t)-1. if the check came later, there would
		 * be no chance to detect an illegal character. */
		if (n == (size_t)-1) return 0; /* illegal character */
		if (n > mbl) return mbl + 1; /* buffer too small */

		/* copy only the n bytes that wcrtomb() produced. the rest
		 * of buf is uninitialized and the bytes of mb beyond the
		 * converted character must be left alone. */
		QSE_MEMCPY (mb, buf, n);
	}
	else
	{
		n = wcrtomb (mb, wc, (mbstate_t*)state);
		if (n == (size_t)-1) return 0; /* illegal character */
		if (n > mbl) return mbl + 1; /* buffer too small */
	}

	return n; /* number of bytes written to the buffer */

#else
	#error #### NOT SUPPORTED ####
#endif
}
/*
 * Converts the multibyte sequence at the start of mb (at most mbl bytes are
 * examined) to a wide character stored in *wc.
 *
 * wc may be QSE_NULL, in which case the converted character is discarded
 * and only the length is reported.
 *
 * On Windows the conversion goes through MultiByteToWideChar() with CP_ACP
 * and state is not used. Elsewhere it goes through mbrtowc() and state is
 * handed over as the conversion state.
 *
 * Returns:
 *   0        if the sequence is invalid,
 *   mbl + 1  if the sequence is incomplete,
 *   otherwise the number of bytes that make up the character
 *   (1 for a null character).
 */
qse_size_t qse_slmbrtoslwc (
	const qse_mchar_t* mb, qse_size_t mbl,
	qse_wchar_t* wc, qse_mbstate_t* state)
{
#if defined(_WIN32)
	qse_size_t dbcslen;
	qse_wchar_t dummy;
	int n;

	QSE_ASSERT (mb != QSE_NULL);
	QSE_ASSERT (mbl > 0);

	/* the mbrtowc()-based branches accept a null wc. do the same here
	 * by converting into a scratch character, since the call below asks
	 * MultiByteToWideChar() to store one character into the output
	 * buffer. */
	if (wc == QSE_NULL) wc = &dummy;

	dbcslen = IsDBCSLeadByteEx(CP_ACP/*CP_THREAD_ACP*/, *mb)? 2: 1;
	if (mbl < dbcslen) return mbl + 1; /* incomplete sequence */

	n = MultiByteToWideChar (
		CP_ACP/*CP_THREAD_ACP*/, MB_ERR_INVALID_CHARS, mb, dbcslen, wc, 1);
	if (n == 0)
	{
		/*DWORD e = GetLastError();*/
		/*if (e == ERROR_NO_UNICODE_TRANSLATION) return 0;*/
		/*if (e == ERROR_INSUFFICIENT_BUFFER) return mbl + 1;*/
		return 0; /* every failure is reported as an invalid sequence */
	}

	return dbcslen;

#elif defined(__sun__) && defined(HAVE_MBRTOWC)
	/*
	 * Read comments in qse_slmbrlen().
	 */
	size_t n;

	QSE_ASSERT (mb != QSE_NULL);
	QSE_ASSERT (mbl > 0);

	if (wc)
	{
		n = mbrtowc (wc, mb, mbl, (mbstate_t*)state);
		if (n == 0)
		{
			*wc = QSE_WT('\0');
			return 1; /* a null character */
		}
	}
	else
	{
		/* always give mbrtowc() a real output location here
		 * instead of a null pointer */
		qse_wchar_t dummy;
		n = mbrtowc (&dummy, mb, mbl, (mbstate_t*)state);
		if (n == 0) return 1; /* a null character */
	}

	if (n == (size_t)-1) return 0; /* invalid sequence */
	if (n == (size_t)-2) return mbl + 1; /* incomplete sequence */
	return (qse_size_t)n;

#elif defined(HAVE_MBRTOWC)
	size_t n;

	QSE_ASSERT (mb != QSE_NULL);
	QSE_ASSERT (mbl > 0);

	n = mbrtowc (wc, mb, mbl, (mbstate_t*)state);
	if (n == 0)
	{
		if (wc) *wc = QSE_WT('\0');
		return 1; /* a null character */
	}

	if (n == (size_t)-1) return 0; /* invalid sequence */
	if (n == (size_t)-2) return mbl + 1; /* incomplete sequence */
	return (qse_size_t)n;

#else
	#error #### NOT SUPPORTED ####
#endif
}
/*
 * Reports how many bytes the multibyte character at the start of mb
 * occupies. At most mbl bytes are examined.
 *
 * Returns:
 *   0        if the sequence is invalid,
 *   mbl + 1  if the sequence is incomplete,
 *   otherwise the length of the character in bytes
 *   (1 for a null character).
 */
qse_size_t qse_slmbrlen (
	const qse_mchar_t* mb, qse_size_t mbl, qse_mbstate_t* state)
{
#if defined(_WIN32)
	qse_size_t need;

	QSE_ASSERT (mb != QSE_NULL);
	QSE_ASSERT (mbl > 0);

	/* IsDBCSLeadByte() and IsDBCSLeadByteEx() only look at the lead
	 * byte. they don't validate the actual sequence, so an invalid
	 * sequence can't be detected here. as a result, this function and
	 * qse_slmbrtoslwc() may disagree on the length of an invalid
	 * sequence. */
	need = IsDBCSLeadByteEx(CP_ACP/*CP_THREAD_ACP*/, *mb)? 2: 1;

	/* fewer bytes available than needed means an incomplete sequence */
	return (mbl < need)? (mbl + 1): need;

#elif defined(__sun__) && defined(HAVE_MBRLEN)
	/* observations made on solaris 8:
	 *
	 *  a valid utf8 sequence on the utf8 locale
	 *    mbrlen()                        -> -1
	 *    mbrtowc(NULL, mbs, mbl, state)  -> -1
	 *    mblen()                         -> the right length
	 *    mbrtowc(wc, mbs, mbl, state)    -> the right length
	 *
	 *  a cp949 sequence on the cp949 locale
	 *    mbrlen()                        -> the right length
	 *    mbrtowc(NULL, mbs, mbl, state)  -> the right length
	 *    mblen()                         -> the right length
	 *    mbrtowc(wc, mbs, mbl, state)    -> the right length
	 *
	 * mbrlen() is buggy and can't handle a utf8 sequence properly.
	 * the quick and dirty workaround is to go through
	 * qse_slmbrtoslwc() instead of mbrlen().
	 *
	 * no version check is made, so solaris 9, 10 and later take this
	 * path as well. other platforms may have the same issue.
	 */

	/* TODO:
	 *  solaris doesn't seem to be the only platform with this kind
	 *  of a bug.
	 *
	 *  checking it in autoconf doesn't solve the problem since the
	 *  underlying system could have fixed it already.
	 *
	 *  checking it during library initialization makes sense.
	 *  qse_slmbinit() or qse_initlib() could test if mblen() and
	 *  mbrlen() return consistent results and arrange the proper
	 *  method for this slmb routine.
	 */
	return qse_slmbrtoslwc (mb, mbl, QSE_NULL, state);

#elif defined(HAVE_MBRLEN)
	size_t len;

	QSE_ASSERT (mb != QSE_NULL);
	QSE_ASSERT (mbl > 0);

	/* mblen() is not used here because its single failure value
	 * doesn't tell an invalid sequence from an incomplete one. */
	len = mbrlen (mb, mbl, (mbstate_t*)state);

	if (len == (size_t)-1) return 0; /* invalid sequence */
	if (len == (size_t)-2) return mbl + 1; /* incomplete sequence */

	/* a null character is reported as 0 by mbrlen() but it still
	 * occupies a single byte */
	return (len == 0)? 1: (qse_size_t)len;

#else
	#error #### NOT SUPPORTED ####
#endif
}
/* man mbsinit
 * For 8-bit encodings, all states are equivalent to the initial state.
 * For multibyte encodings like UTF-8, EUC-*, BIG5 or SJIS, the
 * wide-character to multibyte conversion functions never produce
 * non-initial states, but the multibyte to wide-character conversion
 * functions like mbrtowc(3) do produce non-initial states when interrupted
 * in the middle of a character.
 */
/*
 * Stateless variant of qse_slmbrtoslwc(): converts the multibyte sequence
 * at mb (mbl bytes available) into *wc using a zero-initialized conversion
 * state created for this call, and returns what qse_slmbrtoslwc() returns.
 */
qse_size_t qse_slmbtoslwc (const qse_mchar_t* mb, qse_size_t mbl, qse_wchar_t* wc)
{
	qse_mbstate_t initial = { { 0, } };
	qse_size_t len;

	len = qse_slmbrtoslwc (mb, mbl, wc, &initial);
	return len;
}
/*
 * Stateless variant of qse_slwcrtoslmb(): converts wc into the buffer mb of
 * mbl bytes using a zero-initialized conversion state created for this
 * call, and returns what qse_slwcrtoslmb() returns.
 */
qse_size_t qse_slwctoslmb (qse_wchar_t wc, qse_mchar_t* mb, qse_size_t mbl)
{
	qse_mbstate_t initial = { { 0, } };
	qse_size_t len;

	len = qse_slwcrtoslmb (wc, mb, mbl, &initial);
	return len;
}
/*
 * Stateless variant of qse_slmbrlen(): measures the multibyte character at
 * mb (mbl bytes available) using a zero-initialized conversion state
 * created for this call, and returns what qse_slmbrlen() returns.
 */
qse_size_t qse_slmblen (const qse_mchar_t* mb, qse_size_t mbl)
{
	qse_mbstate_t initial = { { 0, } };
	qse_size_t len;

	len = qse_slmbrlen (mb, mbl, &initial);
	return len;
}
/*
 * Returns the maximum number of bytes in a multibyte character:
 * MB_CUR_MAX everywhere except Windows, where the fixed value 2 is used.
 */
qse_size_t qse_slmblenmax (void)
{
#if !defined(_WIN32)
	return MB_CUR_MAX;
#else
	/* Windows doesn't handle utf8 properly even when the code page
	 * is CP_UTF8(65001). use the functions in utf8.c for utf8
	 * handling on windows. 2 is the maximum for DBCS encodings. */
	return 2;
#endif
}