added simple utf8 functions

2011-11-08 13:36:47 +00:00
parent 10b0469ee1
commit f2615f05a5
6 changed files with 252 additions and 14 deletions
--- a/qse/include/qse/cmn/Makefile.am
+++ b/qse/include/qse/cmn/Makefile.am
@@ -28,6 +28,7 @@ pkginclude_HEADERS = \
 	time.h \
 	tio.h \
 	tre.h \
+	utf8.h \
 	xma.h 

 if ENABLE_CXX
--- a/qse/include/qse/cmn/Makefile.in
+++ b/qse/include/qse/cmn/Makefile.in
@@ -54,7 +54,7 @@ DIST_SOURCES =
 am__pkginclude_HEADERS_DIST = alg.h chr.h dll.h env.h fio.h fma.h \
 	fmt.h gdl.h htb.h lda.h main.h map.h mem.h oht.h opt.h path.h \
 	pio.h pma.h rbt.h rex.h sio.h sll.h stdio.h str.h time.h tio.h \
-	tre.h xma.h Mmgr.hpp StdMmgr.hpp Mmged.hpp
+	tre.h utf8.h xma.h Mmgr.hpp StdMmgr.hpp Mmged.hpp
 am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
 am__vpath_adj = case $$p in \
    $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
@@ -225,8 +225,8 @@ top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
 pkginclude_HEADERS = alg.h chr.h dll.h env.h fio.h fma.h fmt.h gdl.h \
 	htb.h lda.h main.h map.h mem.h oht.h opt.h path.h pio.h pma.h \
-	rbt.h rex.h sio.h sll.h stdio.h str.h time.h tio.h tre.h xma.h \
-	$(am__append_1)
+	rbt.h rex.h sio.h sll.h stdio.h str.h time.h tio.h tre.h \
+	utf8.h xma.h $(am__append_1)
 all: all-am

 .SUFFIXES:
--- a/qse/include/qse/cmn/utf8.h
+++ b/qse/include/qse/cmn/utf8.h
@@ -0,0 +1,66 @@
+/*
+ * $Id$
+ *
+    Copyright 2006-2011 Chung, Hyung-Hwan.
+    This file is part of QSE.
+
+    QSE is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as 
+    published by the Free Software Foundation, either version 3 of 
+    the License, or (at your option) any later version.
+
+    QSE is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public 
+    License along with QSE. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _QSE_CMN_UTF8_H_
+#define _QSE_CMN_UTF8_H_
+
+#include <qse/types.h>
+#include <qse/macros.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * The qse_uctoutf8len() function returns the number of bytes in the utf8
+ * sequence that would result from the unicode character @a uc.
+ * @return
+ * - 0 is returned if @a uc is invalid.
+ * - A positive integer is returned in all other cases.
+ */
+int qse_uctoutf8len (
+	qse_wchar_t uc
+);
+
+/**
+ * The qse_uctoutf8() function converts a unicode character to a utf8 sequence.
+ * @return
+ * - 0 is returned if @a uc is invalid.
+ * - A negative integer is returned if the utf8 sequence buffer is not
+ *   large enough. It is the negated buffer size required.
+ * - The number of bytes stored in @a utf8 is returned in all other cases.
+ */
+int qse_uctoutf8 (
+	qse_wchar_t  uc,
+	qse_mchar_t* utf8,
+	int          size
+);
+
+int qse_utf8touc (
+	const qse_mchar_t* utf8, /* utf8 sequence to decode */
+	int                size, /* number of bytes available in utf8 */
+	qse_wchar_t*       uc    /* set to the decoded character on success */
+); /* returns the number of bytes decoded, or 0 if invalid or incomplete */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/qse/lib/cmn/Makefile.am
+++ b/qse/lib/cmn/Makefile.am
@@ -44,6 +44,7 @@ libqsecmn_la_SOURCES = \
 	rex.c \
 	sio.c \
 	sll.c \
+	stdio.c \
 	str-beg.c \
 	str-cat.c \
 	str-chr.c \
@@ -84,7 +85,7 @@ libqsecmn_la_SOURCES = \
 	tre-match-parallel.c \
 	tre-parse.c \
 	tre-stack.c \
-	stdio.c \
+	utf8.c \
 	xma.c

 libqsecmn_la_LDFLAGS = -L$(libdir) -version-info 1:0:0 -no-undefined
--- a/qse/lib/cmn/Makefile.in
+++ b/qse/lib/cmn/Makefile.in
@@ -79,15 +79,15 @@ am_libqsecmn_la_OBJECTS = alg-search.lo alg-sort.lo assert.lo chr.lo \
 	chr-cnv.lo dll.lo env.lo gdl.lo htb.lo lda.lo fio.lo fma.lo \
 	fmt.lo main.lo mem.lo oht.lo opt.lo path-basename.lo \
 	path-canon.lo pio.lo pma.lo rbt.lo rex.lo sio.lo sll.lo \
-	str-beg.lo str-cat.lo str-chr.lo str-cnv.lo str-cmp.lo \
-	str-cpy.lo str-del.lo str-dup.lo str-dynm.lo str-dynw.lo \
-	str-end.lo str-excl.lo str-fcpy.lo str-fnmat.lo str-incl.lo \
-	str-len.lo str-pac.lo str-pbrk.lo str-put.lo str-rev.lo \
-	str-rot.lo str-set.lo str-spl.lo str-spn.lo str-str.lo \
-	str-subst.lo str-tok.lo str-trm.lo str-word.lo time.lo tio.lo \
-	tio-get.lo tio-put.lo tre.lo tre-ast.lo tre-compile.lo \
-	tre-match-backtrack.lo tre-match-parallel.lo tre-parse.lo \
-	tre-stack.lo stdio.lo xma.lo
+	stdio.lo str-beg.lo str-cat.lo str-chr.lo str-cnv.lo \
+	str-cmp.lo str-cpy.lo str-del.lo str-dup.lo str-dynm.lo \
+	str-dynw.lo str-end.lo str-excl.lo str-fcpy.lo str-fnmat.lo \
+	str-incl.lo str-len.lo str-pac.lo str-pbrk.lo str-put.lo \
+	str-rev.lo str-rot.lo str-set.lo str-spl.lo str-spn.lo \
+	str-str.lo str-subst.lo str-tok.lo str-trm.lo str-word.lo \
+	time.lo tio.lo tio-get.lo tio-put.lo tre.lo tre-ast.lo \
+	tre-compile.lo tre-match-backtrack.lo tre-match-parallel.lo \
+	tre-parse.lo tre-stack.lo utf8.lo xma.lo
 libqsecmn_la_OBJECTS = $(am_libqsecmn_la_OBJECTS)
 libqsecmn_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
@@ -313,6 +313,7 @@ libqsecmn_la_SOURCES = \
 	rex.c \
 	sio.c \
 	sll.c \
+	stdio.c \
 	str-beg.c \
 	str-cat.c \
 	str-chr.c \
@@ -353,7 +354,7 @@ libqsecmn_la_SOURCES = \
 	tre-match-parallel.c \
 	tre-parse.c \
 	tre-stack.c \
-	stdio.c \
+	utf8.c \
 	xma.c

 libqsecmn_la_LDFLAGS = -L$(libdir) -version-info 1:0:0 -no-undefined
@@ -506,6 +507,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tre-parse.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tre-stack.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tre.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/utf8.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/xma.Plo@am__quote@

 .c.o:
--- a/qse/lib/cmn/utf8.c
+++ b/qse/lib/cmn/utf8.c
@@ -0,0 +1,168 @@
+/*
+ * $Id: utf8.c 50 2009-02-10 05:48:05Z hyunghwan.chung $
+ *
+    Copyright 2006-2011 Chung, Hyung-Hwan.
+    This file is part of QSE.
+
+    QSE is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as 
+    published by the Free Software Foundation, either version 3 of 
+    the License, or (at your option) any later version.
+
+    QSE is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public 
+    License along with QSE. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <qse/cmn/utf8.h>
+
+/*
+ * from RFC 2279 UTF-8, a transformation format of ISO 10646
+ *
+ *     UCS-4 range (hex.)  UTF-8 octet sequence (binary)
+ * 1:2 00000000-0000007F  0xxxxxxx
+ * 2:2 00000080-000007FF  110xxxxx 10xxxxxx
+ * 3:2 00000800-0000FFFF  1110xxxx 10xxxxxx 10xxxxxx
+ * 4:4 00010000-001FFFFF  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * inv 00200000-03FFFFFF  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * inv 04000000-7FFFFFFF  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ */
+
+struct __utf8_t
+{
+	qse_uint32_t  lower;  /* smallest character value encoded in this length */
+	qse_uint32_t  upper;  /* largest such value. also masks the decoded bits */
+	qse_uint8_t   fbyte;  /* prefix bits carried by the first utf8 byte */
+	qse_uint8_t   mask;   /* selects the prefix bits of the first utf8 byte */
+	int           length; /* number of bytes */
+};
+
+typedef struct __utf8_t __utf8_t;
+
+static __utf8_t utf8_table[] = /* one entry per length, shortest first */
+{
+	{0x00000000ul, 0x0000007Ful, 0x00, 0x80, 1},
+	{0x00000080ul, 0x000007FFul, 0xC0, 0xE0, 2},
+	{0x00000800ul, 0x0000FFFFul, 0xE0, 0xF0, 3},
+	{0x00010000ul, 0x001FFFFFul, 0xF0, 0xF8, 4},
+	{0x00200000ul, 0x03FFFFFFul, 0xF8, 0xFC, 5},
+	{0x04000000ul, 0x7FFFFFFFul, 0xFC, 0xFE, 6}
+};
+
+/* finds the utf8_table entry whose [lower,upper] range contains uc */
+static QSE_INLINE __utf8_t* get_utf8_slot (qse_wchar_t uc)
+{
+	__utf8_t* slot;
+	__utf8_t* const limit = utf8_table + QSE_COUNTOF(utf8_table);
+
+	QSE_ASSERT (QSE_SIZEOF(qse_mchar_t) == 1);
+	QSE_ASSERT (QSE_SIZEOF(qse_wchar_t) >= 2);
+
+	for (slot = utf8_table; slot < limit; slot++)
+	{
+		/* entries are scanned in table order; skip non-matching ranges */
+		if (uc < slot->lower || uc > slot->upper) continue;
+		return slot;
+	}
+
+	return QSE_NULL; /* no range matched - invalid character */
+}
+
+int qse_uctoutf8len (qse_wchar_t uc)
+{
+	__utf8_t* slot = get_utf8_slot (uc); /* QSE_NULL for an invalid character */
+	return (slot != QSE_NULL)? slot->length: 0; /* 0 reports invalid */
+}
+
+/* encodes uc into utf8. returns the number of bytes stored, 0 if uc is
+ * invalid, or the negated number of bytes needed if size is too small */
+int qse_uctoutf8 (qse_wchar_t uc, qse_mchar_t* utf8, int size)
+{
+	__utf8_t* cur = get_utf8_slot (uc);
+	int index;
+
+	if (cur == QSE_NULL) return 0; /* invalid character */
+
+	if (cur->length > size)
+	{
+		/* buffer not big enough. return the negated size required */
+		return -cur->length;
+	}
+
+	/* store the continuation bytes from the last one backward. each holds
+	 * 6 bits (0x3F: 00111111) under the 10xxxxxx marker (0x80: 10000000) */
+	index = cur->length;
+	while (index > 1)
+	{
+		utf8[--index] = (uc & 0x3F) | 0x80;
+		uc >>= 6;
+	}
+
+	utf8[0] = uc | cur->fbyte; /* lead byte: length prefix + remaining bits */
+	return cur->length;
+}
+
+/* decodes one utf8 sequence of at most size bytes at utf8 into *uc. returns
+ * the bytes consumed, or 0 if it is invalid, overlong or truncated */
+int qse_utf8touc (
+	const qse_mchar_t* utf8, int size, qse_wchar_t* uc)
+{
+	__utf8_t* cur, * end;
+	qse_mchar_t c, t;
+	qse_wchar_t w;
+	int count = 0;
+
+	QSE_ASSERT (utf8 != QSE_NULL);
+	QSE_ASSERT (QSE_SIZEOF(qse_mchar_t) == 1);
+	QSE_ASSERT (QSE_SIZEOF(qse_wchar_t) >= 2);
+
+	if (size <= 0) return 0; /* no input: the first byte must not be read */
+
+	end = utf8_table + QSE_COUNTOF(utf8_table);
+	cur = utf8_table;
+	c = *utf8; /* the first byte decides the sequence length */
+	w = c;
+
+	while (cur < end)
+	{
+		count++; /* bytes consumed so far. equals cur->length */
+		if ((c & cur->mask) == cur->fbyte)
+		{
+			w &= cur->upper; /* strip the length prefix bits */
+			if (w < cur->lower) break; /* overlong encoding */
+			*uc = w;
+			return count;
+		}
+
+		if (size <= count) break; /* insufficient input */
+		utf8++; /* advance to the next byte in the sequence */
+		t = (*utf8 ^ 0x80) & 0xFF; /* 10xxxxxx becomes 00xxxxxx */
+		if (t & 0xC0) break; /* not a continuation byte */
+		w = (w << 6) | t;
+		cur++;
+	}
+
+	return 0; /* error - invalid sequence */
+}
+
+#if 0 /* excluded from compilation; not declared in qse/cmn/utf8.h */
+int qse_utf8len (qse_mchar_t first) /* sequence length implied by first byte */
+{
+	__utf8_t* cur, * end;
+
+	end = utf8_table + QSE_COUNTOF(utf8_table);
+	cur = utf8_table;
+
+	while (cur < end) 
+	{
+		if ((first & cur->mask) == cur->fbyte) return cur->length; /* prefix match */
+		cur++;
+	}
+
+	return 0; /* error - invalid sequence */
+}
+#endif