/*
 * $Id$
 *
    Copyright (c) 2006-2019 Chung, Hyung-Hwan. All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:
    1. Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.
    2. Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.

    THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR
    IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
    OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
    IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
    NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
    THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
|
|
|
|
|
|
2012-01-03 14:41:15 +00:00
|
|
|
|
#include <qse/cmn/slmb.h>
|
2015-04-27 13:03:32 +00:00
|
|
|
|
#include <qse/cmn/utf8.h>
|
2016-04-29 03:55:42 +00:00
|
|
|
|
#include "mem-prv.h"
|
2011-03-17 02:37:06 +00:00
|
|
|
|
|
|
|
|
|
/* When QSE_HAVE_CONFIG_H is not defined, supply the HAVE_xxx feature
 * macros by hand for a few specific compilers/platforms. */
#ifndef QSE_HAVE_CONFIG_H
	/* headers assumed on windows, os/2, dos and on classic mac os
	 * with the metrowerks compiler */
#	if defined(_WIN32) || defined(__OS2__) || defined(__DOS__) || \
	   (defined(macintosh) && defined(__MWERKS__))
#		define HAVE_WCHAR_H
#		define HAVE_STDLIB_H
#	endif

	/* restartable conversion functions assumed on windows, os/2
	 * and dos only */
#	if defined(_WIN32) || defined(__OS2__) || defined(__DOS__)
#		define HAVE_MBRLEN
#		define HAVE_MBRTOWC
#		define HAVE_WCRTOMB
#	endif
#endif
|
2008-10-15 04:08:31 +00:00
|
|
|
|
|
2014-11-14 02:44:20 +00:00
|
|
|
|
#if defined(HAVE_WCHAR_H)
|
2011-03-15 09:40:35 +00:00
|
|
|
|
# include <wchar.h>
|
2008-10-15 04:08:31 +00:00
|
|
|
|
#endif
|
2014-11-14 02:44:20 +00:00
|
|
|
|
#if defined(HAVE_STDLIB_H)
|
2011-03-15 09:40:35 +00:00
|
|
|
|
# include <stdlib.h>
|
2008-10-15 04:08:31 +00:00
|
|
|
|
#endif
|
2012-01-06 14:38:11 +00:00
|
|
|
|
#if defined(_WIN32)
|
|
|
|
|
# include <windows.h>
|
|
|
|
|
#endif
|
2008-10-15 04:08:31 +00:00
|
|
|
|
|
2015-04-27 13:03:32 +00:00
|
|
|
|
/* Decide between the system locale and the built-in UTF-8 routines.
 * The system locale is usable on windows, or anywhere wcrtomb(),
 * mbrtowc() and mbrlen() are all available. */
#if !defined(_WIN32) && \
    !(defined(HAVE_WCRTOMB) && defined(HAVE_MBRTOWC) && defined(HAVE_MBRLEN))
	/* force to use UTF8 as required functions are not available */
#	define FORCE_UTF8 1
#else
	/* use system locale */
#	undef FORCE_UTF8
#endif
|
|
|
|
|
|
2012-01-06 14:38:11 +00:00
|
|
|
|
qse_size_t qse_slwcrtoslmb (
|
|
|
|
|
qse_wchar_t wc, qse_mchar_t* mb,
|
|
|
|
|
qse_size_t mbl, qse_mbstate_t* state)
|
2008-10-15 04:08:31 +00:00
|
|
|
|
{
|
2015-04-27 13:03:32 +00:00
|
|
|
|
|
|
|
|
|
#if defined(FORCE_UTF8)
|
|
|
|
|
|
|
|
|
|
return qse_uctoutf8 (wc, mb, mbl);
|
|
|
|
|
|
|
|
|
|
#elif defined(_WIN32)
|
2012-01-06 14:38:11 +00:00
|
|
|
|
int n;
|
|
|
|
|
|
2012-02-19 14:38:22 +00:00
|
|
|
|
/* CP_THREAD_ACP results in ERROR_INVALID_PARAMETER
|
|
|
|
|
* on an old windows os like win95 */
|
2012-01-06 14:38:11 +00:00
|
|
|
|
n = WideCharToMultiByte (
|
2012-02-19 14:38:22 +00:00
|
|
|
|
CP_ACP/*CP_THREAD_ACP*/, 0 /*WC_ERR_INVALID_CHARS*/,
|
2012-01-06 14:38:11 +00:00
|
|
|
|
&wc, 1, mb, mbl, NULL, NULL);
|
|
|
|
|
if (n == 0)
|
|
|
|
|
{
|
|
|
|
|
DWORD e = GetLastError();
|
|
|
|
|
if (e == ERROR_INSUFFICIENT_BUFFER) return mbl + 1;
|
|
|
|
|
/*if (e == ERROR_NO_UNICODE_TRANSLATION) return 0;*/
|
|
|
|
|
/* treat all other erros as invalid unicode character */
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return (qse_size_t)n;
|
|
|
|
|
|
|
|
|
|
#elif defined(HAVE_WCRTOMB)
|
2008-10-15 04:08:31 +00:00
|
|
|
|
size_t n;
|
|
|
|
|
|
2012-01-06 14:38:11 +00:00
|
|
|
|
if (mbl < QSE_MBLEN_MAX)
|
|
|
|
|
{
|
|
|
|
|
/* the buffer given is too small. try conversion on
|
|
|
|
|
* a temporary buffer large enough to handle all locales
|
|
|
|
|
* and copy the result to the original buffer.
|
|
|
|
|
*/
|
|
|
|
|
qse_mchar_t buf[QSE_MBLEN_MAX];
|
2008-10-15 04:08:31 +00:00
|
|
|
|
|
2012-01-06 14:38:11 +00:00
|
|
|
|
n = wcrtomb (buf, wc, (mbstate_t*)state);
|
|
|
|
|
/* it's important that n is checked againt (size_t)-1
|
|
|
|
|
* before againt mbl. n > mbl is true if n is (size_t)-1.
|
|
|
|
|
* if the check comes later, i won't have a chance to
|
|
|
|
|
* determine the case of an illegal character */
|
|
|
|
|
if (n == (size_t)-1) return 0; /* illegal character */
|
|
|
|
|
if (n > mbl) return mbl + 1; /* buffer to small */
|
2008-10-15 04:08:31 +00:00
|
|
|
|
|
2012-01-06 14:38:11 +00:00
|
|
|
|
QSE_MEMCPY (mb, buf, mbl);
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
n = wcrtomb (mb, wc, (mbstate_t*)state);
|
|
|
|
|
if (n == (size_t)-1) return 0; /* illegal character */
|
|
|
|
|
if (n > mbl) return mbl + 1; /* buffer to small */
|
|
|
|
|
}
|
2010-04-06 06:50:01 +00:00
|
|
|
|
|
2012-01-06 14:38:11 +00:00
|
|
|
|
return n; /* number of bytes written to the buffer */
|
2014-05-01 14:35:17 +00:00
|
|
|
|
|
2008-10-15 04:08:31 +00:00
|
|
|
|
#else
|
2014-05-01 14:35:17 +00:00
|
|
|
|
/* not supported */
|
|
|
|
|
return 0;
|
2015-04-27 13:03:32 +00:00
|
|
|
|
|
2008-10-15 04:08:31 +00:00
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
|
2019-05-13 05:22:40 +00:00
|
|
|
|
/*
 * Converts the multibyte sequence at 'mb', at most 'mbl' bytes long, to
 * a single wide character. The result is stored in '*wc' unless 'wc' is
 * a null pointer, in which case only the length is computed. 'state' is
 * passed to mbrtowc() as the conversion state when that function is used.
 *
 * Return value:
 *   - with FORCE_UTF8, whatever qse_utf8touc() returns.
 *   - 0 if the sequence is invalid (also when no conversion facility
 *     is available at all).
 *   - mbl + 1 if the sequence is incomplete.
 *   - otherwise, the number of bytes that make up the character.
 *     A null character counts as 1 byte.
 */
qse_size_t qse_slmbrtoslwc (const qse_mchar_t* mb, qse_size_t mbl, qse_wchar_t* wc, qse_mbstate_t* state)
{
#if defined(FORCE_UTF8)
	return qse_utf8touc (mb, mbl, wc);

#elif defined(_WIN32)
	qse_size_t dbcslen;
	WCHAR tc;
	int n;

	QSE_ASSERT (mb != QSE_NULL);
	QSE_ASSERT (mbl > 0);

	/* CP_ACP is used in place of CP_THREAD_ACP here as well */
	dbcslen = IsDBCSLeadByteEx(CP_ACP/*CP_THREAD_ACP*/, *mb)? 2: 1;
	if (mbl < dbcslen) return mbl + 1; /* incomplete sequence */

	/* convert into a temporary so that a null 'wc' is tolerated
	 * in the same way as in the mbrtowc() based implementation */
	n = MultiByteToWideChar (
		CP_ACP/*CP_THREAD_ACP*/, MB_ERR_INVALID_CHARS, mb, dbcslen, &tc, 1);
	if (n == 0)
	{
		/*DWORD e = GetLastError();*/
		/*if (e == ERROR_NO_UNICODE_TRANSLATION) return 0;*/
		/*if (e == ERROR_INSUFFICIENT_BUFFER) return mbl + 1;*/
		return 0; /* every failure is reported as an invalid sequence */
	}

	if (wc) *wc = tc;
	return dbcslen;

#elif defined(HAVE_MBRTOWC)
	size_t n;
	wchar_t tc;

	QSE_ASSERT (mb != QSE_NULL);
	QSE_ASSERT (mbl > 0);

	n = mbrtowc(&tc, mb, mbl, (mbstate_t*)state);
	if (n == 0)
	{
		/* a null character. report it as a 1-byte sequence */
		if (wc) *wc = QSE_WT('\0');
		return 1;
	}

	if (n == (size_t)-1) return 0; /* invalid sequence */
	if (n == (size_t)-2) return mbl + 1; /* incomplete sequence */

	if (wc) *wc = tc; /* if sizeof(qse_wchar_t) < sizeof(wchar_t), *wc may get truncated */
	return (qse_size_t)n;

#else
	/* not supported */
	return 0;
#endif
}
|
|
|
|
|
|
2019-05-13 05:22:40 +00:00
|
|
|
|
/*
 * Gets the length in bytes of the multibyte sequence at 'mb', looking at
 * no more than 'mbl' bytes. 'state' is handed to the underlying
 * restartable function where one is used.
 *
 * Return value:
 *   - with FORCE_UTF8, whatever qse_utf8len() returns.
 *   - 0 for an invalid sequence (also when unsupported).
 *   - mbl + 1 for an incomplete sequence.
 *   - otherwise, the length of the sequence. A null character yields 1.
 */
qse_size_t qse_slmbrlen (const qse_mchar_t* mb, qse_size_t mbl, qse_mbstate_t* state)
{
#if defined(FORCE_UTF8)
	return qse_utf8len (mb, mbl);

#elif defined(_WIN32)
	qse_size_t seqlen;

	QSE_ASSERT (mb != QSE_NULL);
	QSE_ASSERT (mbl > 0);

	/* IsDBCSLeadByte() or IsDBCSLeadByteEx() doesn't validate
	 * the actual sequence, so an invalid sequence goes undetected
	 * here. As a result, qse_slmbrtoslwc() may return a different
	 * length for an invalid sequence from qse_slmbrlen(). */
	seqlen = IsDBCSLeadByteEx(CP_ACP/*CP_THREAD_ACP*/, *mb)? 2: 1;
	return (mbl < seqlen)? (mbl + 1) /* incomplete sequence */: seqlen;

#elif defined(__sun__) && defined(HAVE_MBRLEN)
	/* observations on solaris 8:
	 *
	 *  for a valid utf8 sequence on the utf8 locale,
	 *    mbrlen() returned -1.
	 *    mbrtowc(NULL, mbs, mbl, state) also returned -1.
	 *    mblen() returned the right length.
	 *    mbrtowc(wc, mbs, mbl, state) returned the right length.
	 *
	 *  for a cp949 sequence on the cp949 locale,
	 *    mbrlen() returned the right length.
	 *    mbrtowc(NULL, mbs, mbl, state) returned the right length.
	 *    mblen() returned the right length.
	 *    mbrtowc(wc, mbs, mbl, state) returned the right length.
	 *
	 * The problem is a buggy mbrlen() that can't handle a utf8
	 * sequence properly. This is a quick and dirty workaround for
	 * solaris: go through qse_slmbrtoslwc() instead.
	 *
	 * Newer solaris 9 and 10 or later take this path too since
	 * no version check is made.
	 *
	 * There could be other platforms with the same issue.
	 */

	/* TODO:
	 *  it seems that solaris is not the only platform with
	 *  this kind of a bug.
	 *
	 *  checking this in autoconf doesn't solve the problem.
	 *  the underlying system could have fixed the problem already.
	 *
	 *  checking this during library initialization makes sense.
	 *  qse_slmbinit() or qse_initlib() tests if mblen() and mbrlen()
	 *  return consistent results and arranges the proper method
	 *  for this slmb routine.
	 */
	return qse_slmbrtoslwc (mb, mbl, QSE_NULL, state);

#elif defined(HAVE_MBRLEN)
	size_t len;

	QSE_ASSERT (mb != QSE_NULL);
	QSE_ASSERT (mbl > 0);

	len = mbrlen(mb, mbl, (mbstate_t*)state);

	if (len == (size_t)-2) return mbl + 1; /* incomplete sequence */
	if (len == (size_t)-1) return 0; /* invalid sequence */

	/* mbrlen() reports 0 for a null character, which is 1 byte long */
	return (len == 0)? 1: (qse_size_t)len;

#else
	/* not supported */
	return 0;
#endif
}
|
|
|
|
|
|
2011-08-22 23:26:26 +00:00
|
|
|
|
/* man mbsinit
 * For 8-bit encodings, all states are equivalent to the initial state.
 * For multibyte encodings like UTF-8, EUC-*, BIG5 or SJIS, the wide
 * character to multibyte conversion functions never produce non-initial
 * states, but the multibyte to wide-character conversion functions like
 * mbrtowc(3) do produce non-initial states when interrupted in the middle
 * of a character.
 */
|
|
|
|
|
|
2012-01-03 14:41:15 +00:00
|
|
|
|
/*
 * Stateless variant of qse_slmbrtoslwc(): converts the multibyte
 * sequence at 'mb' ('mbl' bytes at most) to a wide character in '*wc'
 * using a private, zero-filled conversion state. The return value is
 * that of qse_slmbrtoslwc().
 */
qse_size_t qse_slmbtoslwc (const qse_mchar_t* mb, qse_size_t mbl, qse_wchar_t* wc)
{
	qse_mbstate_t fresh;

	/* the state is cleared with QSE_MEMSET() rather than with
	 * an aggregate initializer */
	QSE_MEMSET (&fresh, 0, QSE_SIZEOF(fresh));

	return qse_slmbrtoslwc (mb, mbl, wc, &fresh);
}
|
|
|
|
|
|
2012-01-03 14:41:15 +00:00
|
|
|
|
/*
 * Stateless variant of qse_slwcrtoslmb(): converts the wide character
 * 'wc' to a multibyte sequence in 'mb' ('mbl' bytes of capacity) using
 * a private, zero-filled conversion state. The return value is that of
 * qse_slwcrtoslmb().
 */
qse_size_t qse_slwctoslmb (qse_wchar_t wc, qse_mchar_t* mb, qse_size_t mbl)
{
	qse_mbstate_t fresh;

	/* the state is cleared with QSE_MEMSET() rather than with
	 * an aggregate initializer */
	QSE_MEMSET (&fresh, 0, QSE_SIZEOF(fresh));

	return qse_slwcrtoslmb (wc, mb, mbl, &fresh);
}
|
|
|
|
|
|
2012-01-06 14:38:11 +00:00
|
|
|
|
/*
 * Stateless variant of qse_slmbrlen(): gets the length of the multibyte
 * sequence at 'mb' ('mbl' bytes at most) using a private, zero-filled
 * conversion state. The return value is that of qse_slmbrlen().
 */
qse_size_t qse_slmblen (const qse_mchar_t* mb, qse_size_t mbl)
{
	qse_mbstate_t fresh;

	/* the state is cleared with QSE_MEMSET() rather than with
	 * an aggregate initializer */
	QSE_MEMSET (&fresh, 0, QSE_SIZEOF(fresh));

	return qse_slmbrlen (mb, mbl, &fresh);
}
|
|
|
|
|
|
|
|
|
|
/*
 * Returns the maximum number of bytes that a single multibyte character
 * can occupy under the conversion method selected at compile time.
 */
qse_size_t qse_slmblenmax (void)
{
	qse_size_t maxlen;

#if defined(FORCE_UTF8)
	maxlen = qse_utf8lenmax ();

#elif defined(_WIN32)
	/* Windows doesn't handle utf8 properly even when your code page
	 * is CP_UTF8(65001). you should use functions in utf8.c for utf8
	 * handling on windows. 2 is the maximum for DBCS encodings. */
	maxlen = 2;

#elif defined(MB_CUR_MAX)
	/* ask the c library */
	maxlen = MB_CUR_MAX;

#elif (QSE_SIZEOF_WCHAR_T == QSE_SIZEOF_MCHAR_T)
	/* no proper multibyte string support */
	maxlen = 1;

#else
	/* fallback max utf8 value */
	maxlen = 6;
#endif

	return maxlen;
}
|
2014-05-01 14:35:17 +00:00
|
|
|
|
|