diff --git a/qse/include/qse/cmn/Makefile.am b/qse/include/qse/cmn/Makefile.am
index 416b4fcd..27c1358b 100644
--- a/qse/include/qse/cmn/Makefile.am
+++ b/qse/include/qse/cmn/Makefile.am
@@ -28,6 +28,7 @@ pkginclude_HEADERS = \
 	time.h \
 	tio.h \
 	tre.h \
+	utf8.h \
 	xma.h
 
 if ENABLE_CXX
diff --git a/qse/include/qse/cmn/Makefile.in b/qse/include/qse/cmn/Makefile.in
index f652f09a..ae549f93 100644
--- a/qse/include/qse/cmn/Makefile.in
+++ b/qse/include/qse/cmn/Makefile.in
@@ -54,7 +54,7 @@ DIST_SOURCES =
 am__pkginclude_HEADERS_DIST = alg.h chr.h dll.h env.h fio.h fma.h \
 	fmt.h gdl.h htb.h lda.h main.h map.h mem.h oht.h opt.h path.h \
 	pio.h pma.h rbt.h rex.h sio.h sll.h stdio.h str.h time.h tio.h \
-	tre.h xma.h Mmgr.hpp StdMmgr.hpp Mmged.hpp
+	tre.h utf8.h xma.h Mmgr.hpp StdMmgr.hpp Mmged.hpp
 am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
 am__vpath_adj = case $$p in \
     $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
@@ -225,8 +225,8 @@ top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
 pkginclude_HEADERS = alg.h chr.h dll.h env.h fio.h fma.h fmt.h gdl.h \
 	htb.h lda.h main.h map.h mem.h oht.h opt.h path.h pio.h pma.h \
-	rbt.h rex.h sio.h sll.h stdio.h str.h time.h tio.h tre.h xma.h \
-	$(am__append_1)
+	rbt.h rex.h sio.h sll.h stdio.h str.h time.h tio.h tre.h \
+	utf8.h xma.h $(am__append_1)
 all: all-am
 
 .SUFFIXES:
diff --git a/qse/include/qse/cmn/utf8.h b/qse/include/qse/cmn/utf8.h
new file mode 100644
index 00000000..ab68c513
--- /dev/null
+++ b/qse/include/qse/cmn/utf8.h
@@ -0,0 +1,77 @@
+/*
+ * $Id$
+ *
+    Copyright 2006-2011 Chung, Hyung-Hwan.
+    This file is part of QSE.
+
+    QSE is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as
+    published by the Free Software Foundation, either version 3 of
+    the License, or (at your option) any later version.
+
+    QSE is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with QSE.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _QSE_CMN_UTF8_H_
+#define _QSE_CMN_UTF8_H_
+
+#include <qse/types.h>
+#include <qse/macros.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * The qse_uctoutf8len() function returns the number of bytes in the utf8 sequence
+ * that would result from the original unicode character.
+ * @return
+ * - 0 is returned if @a uc is invalid.
+ * - A positive integer is returned in all other cases.
+ */
+int qse_uctoutf8len (
+	qse_wchar_t uc
+);
+
+/**
+ * The qse_uctoutf8() function converts a unicode character to a utf8 sequence.
+ * It writes at most @a size bytes to @a utf8.
+ * @return
+ * - 0 is returned if @a uc is invalid.
+ * - A negative integer is returned if the utf8 sequence buffer is not
+ *   large enough. It is the negated buffer size required.
+ * - A positive integer is returned in all other cases. It is the number
+ *   of bytes written to @a utf8.
+ */
+int qse_uctoutf8 (
+	qse_wchar_t  uc,
+	qse_mchar_t* utf8,
+	int          size
+);
+
+/**
+ * The qse_utf8touc() function converts a utf8 sequence to a unicode character.
+ * It reads at most @a size bytes from @a utf8 and stores the result in @a uc.
+ * @return
+ * - 0 is returned if the sequence is invalid or @a size bytes are not
+ *   enough to complete it.
+ * - A positive integer is returned in all other cases. It is the number
+ *   of bytes consumed from @a utf8.
+ */
+int qse_utf8touc (
+	const qse_mchar_t* utf8,
+	int                size,
+	qse_wchar_t*       uc
+);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/qse/lib/cmn/Makefile.am b/qse/lib/cmn/Makefile.am
index 9fe69bab..c614b6da 100644
--- a/qse/lib/cmn/Makefile.am
+++ b/qse/lib/cmn/Makefile.am
@@ -44,6 +44,7 @@ libqsecmn_la_SOURCES = \
 	rex.c \
 	sio.c \
 	sll.c \
+	stdio.c \
 	str-beg.c \
 	str-cat.c \
 	str-chr.c \
@@ -84,7 +85,7 @@ libqsecmn_la_SOURCES = \
 	tre-match-parallel.c \
 	tre-parse.c \
 	tre-stack.c \
-	stdio.c \
+	utf8.c \
 	xma.c
 
 libqsecmn_la_LDFLAGS = -L$(libdir) -version-info 1:0:0 -no-undefined
diff --git a/qse/lib/cmn/Makefile.in b/qse/lib/cmn/Makefile.in
index 157fecac..8fc35211 100644
--- a/qse/lib/cmn/Makefile.in
+++ b/qse/lib/cmn/Makefile.in
@@ -79,15 +79,15 @@ am_libqsecmn_la_OBJECTS = alg-search.lo alg-sort.lo assert.lo chr.lo \
 	chr-cnv.lo dll.lo env.lo gdl.lo htb.lo lda.lo fio.lo fma.lo \
 	fmt.lo main.lo mem.lo oht.lo opt.lo path-basename.lo \
 	path-canon.lo pio.lo pma.lo rbt.lo rex.lo sio.lo sll.lo \
-	str-beg.lo str-cat.lo str-chr.lo str-cnv.lo str-cmp.lo \
-	str-cpy.lo str-del.lo str-dup.lo str-dynm.lo str-dynw.lo \
-	str-end.lo str-excl.lo str-fcpy.lo str-fnmat.lo str-incl.lo \
-	str-len.lo str-pac.lo str-pbrk.lo str-put.lo str-rev.lo \
-	str-rot.lo str-set.lo str-spl.lo str-spn.lo str-str.lo \
-	str-subst.lo str-tok.lo str-trm.lo str-word.lo time.lo tio.lo \
-	tio-get.lo tio-put.lo tre.lo tre-ast.lo tre-compile.lo \
-	tre-match-backtrack.lo tre-match-parallel.lo tre-parse.lo \
-	tre-stack.lo stdio.lo xma.lo
+	stdio.lo str-beg.lo str-cat.lo str-chr.lo str-cnv.lo \
+	str-cmp.lo str-cpy.lo str-del.lo str-dup.lo str-dynm.lo \
+	str-dynw.lo str-end.lo str-excl.lo str-fcpy.lo str-fnmat.lo \
+	str-incl.lo str-len.lo str-pac.lo str-pbrk.lo str-put.lo \
+	str-rev.lo str-rot.lo str-set.lo str-spl.lo str-spn.lo \
+	str-str.lo str-subst.lo str-tok.lo str-trm.lo str-word.lo \
+	time.lo tio.lo tio-get.lo
tio-put.lo tre.lo tre-ast.lo \
+	tre-compile.lo tre-match-backtrack.lo tre-match-parallel.lo \
+	tre-parse.lo tre-stack.lo utf8.lo xma.lo
 libqsecmn_la_OBJECTS = $(am_libqsecmn_la_OBJECTS)
 libqsecmn_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
@@ -313,6 +313,7 @@ libqsecmn_la_SOURCES = \
 	rex.c \
 	sio.c \
 	sll.c \
+	stdio.c \
 	str-beg.c \
 	str-cat.c \
 	str-chr.c \
@@ -353,7 +354,7 @@ libqsecmn_la_SOURCES = \
 	tre-match-parallel.c \
 	tre-parse.c \
 	tre-stack.c \
-	stdio.c \
+	utf8.c \
 	xma.c
 
 libqsecmn_la_LDFLAGS = -L$(libdir) -version-info 1:0:0 -no-undefined
@@ -506,6 +507,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tre-parse.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tre-stack.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tre.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/utf8.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/xma.Plo@am__quote@
 
 .c.o:
diff --git a/qse/lib/cmn/utf8.c b/qse/lib/cmn/utf8.c
new file mode 100644
index 00000000..a88f1dc5
--- /dev/null
+++ b/qse/lib/cmn/utf8.c
@@ -0,0 +1,195 @@
+/*
+ * $Id: utf8.c 50 2009-02-10 05:48:05Z hyunghwan.chung $
+ *
+    Copyright 2006-2011 Chung, Hyung-Hwan.
+    This file is part of QSE.
+
+    QSE is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as
+    published by the Free Software Foundation, either version 3 of
+    the License, or (at your option) any later version.
+
+    QSE is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with QSE.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <qse/cmn/utf8.h>
+
+/*
+ * from RFC 2279 UTF-8, a transformation format of ISO 10646
+ *
+ *     UCS-4 range (hex.)  UTF-8 octet sequence (binary)
+ * 1:2 00000000-0000007F   0xxxxxxx
+ * 2:2 00000080-000007FF   110xxxxx 10xxxxxx
+ * 3:2 00000800-0000FFFF   1110xxxx 10xxxxxx 10xxxxxx
+ * 4:4 00010000-001FFFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * inv 00200000-03FFFFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * inv 04000000-7FFFFFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ */
+
+/*
+ * One row of the conversion table: the range of character values that
+ * encode to a sequence of 'length' bytes, and the shape of its first byte.
+ */
+struct __utf8_t
+{
+	qse_uint32_t  lower;  /* smallest character value of the row */
+	qse_uint32_t  upper;  /* largest character value of the row; the decoder also uses it as a bit mask */
+	qse_uint8_t   fbyte;  /* marker bits set in the first utf8 byte */
+	qse_uint8_t   mask;   /* bits of the first utf8 byte compared against fbyte */
+	int           length; /* number of bytes */
+};
+
+typedef struct __utf8_t __utf8_t;
+
+/* utf8_table[n] describes the sequences of (n + 1) bytes */
+static __utf8_t utf8_table[] =
+{
+	{0x00000000ul, 0x0000007Ful, 0x00, 0x80, 1},
+	{0x00000080ul, 0x000007FFul, 0xC0, 0xE0, 2},
+	{0x00000800ul, 0x0000FFFFul, 0xE0, 0xF0, 3},
+	{0x00010000ul, 0x001FFFFFul, 0xF0, 0xF8, 4},
+	{0x00200000ul, 0x03FFFFFFul, 0xF8, 0xFC, 5},
+	{0x04000000ul, 0x7FFFFFFFul, 0xFC, 0xFE, 6}
+};
+
+/*
+ * Returns the table row whose [lower, upper] range contains uc,
+ * or QSE_NULL if no row does.
+ */
+static QSE_INLINE __utf8_t* get_utf8_slot (qse_wchar_t uc)
+{
+	__utf8_t* cur, * end;
+
+	QSE_ASSERT (QSE_SIZEOF(qse_mchar_t) == 1);
+	QSE_ASSERT (QSE_SIZEOF(qse_wchar_t) >= 2);
+
+	end = utf8_table + QSE_COUNTOF(utf8_table);
+	cur = utf8_table;
+
+	while (cur < end)
+	{
+		if (uc >= cur->lower && uc <= cur->upper) return cur;
+		cur++;
+	}
+
+	return QSE_NULL; /* invalid character */
+}
+
+/* Returns the number of bytes needed to encode uc, or 0 if uc is invalid. */
+int qse_uctoutf8len (qse_wchar_t uc)
+{
+	__utf8_t* cur = get_utf8_slot (uc);
+	return (cur == QSE_NULL)? 0: cur->length;
+}
+
+/*
+ * Encodes uc into the buffer utf8 that can hold size bytes.
+ * Returns the number of bytes written, 0 if uc is invalid, or the negated
+ * number of bytes required if size is too small. utf8 is left untouched
+ * in the two failure cases.
+ */
+int qse_uctoutf8 (qse_wchar_t uc, qse_mchar_t* utf8, int size)
+{
+	__utf8_t* cur = get_utf8_slot (uc);
+	int index;
+
+	if (cur == QSE_NULL) return 0; /* invalid character */
+
+	if (cur->length > size)
+	{
+		/* buffer not big enough. report the negated size needed */
+		return -cur->length;
+	}
+
+	/* fill the trailing bytes from the last to the second, 6 bits each */
+	index = cur->length;
+	while (index > 1)
+	{
+		/*
+		 * 0x3F: 00111111
+		 * 0x80: 10000000
+		 */
+		utf8[--index] = (uc & 0x3F) | 0x80;
+		uc >>= 6;
+	}
+
+	utf8[0] = uc | cur->fbyte;
+	return cur->length;
+}
+
+/*
+ * Decodes the utf8 sequence that starts at utf8 and spans at most size
+ * bytes into *uc. Returns the number of bytes consumed, or 0 if the
+ * sequence is invalid, overlong, or cut short by size.
+ */
+int qse_utf8touc (
+	const qse_mchar_t* utf8, int size, qse_wchar_t* uc)
+{
+	__utf8_t* cur, * end;
+	qse_mchar_t c, t;
+	qse_wchar_t w;
+	int count = 0;
+
+	QSE_ASSERT (utf8 != QSE_NULL);
+	QSE_ASSERT (QSE_SIZEOF(qse_mchar_t) == 1);
+	QSE_ASSERT (QSE_SIZEOF(qse_wchar_t) >= 2);
+
+	if (size <= 0) return 0; /* nothing to read */
+
+	end = utf8_table + QSE_COUNTOF(utf8_table);
+	cur = utf8_table;
+
+	c = *utf8;
+	w = c;
+
+	while (cur < end)
+	{
+		count++;
+
+		/* c stays the first byte; does it announce a count-byte sequence? */
+		if ((c & cur->mask) == cur->fbyte)
+		{
+			w &= cur->upper; /* strip the marker bits of the first byte */
+			if (w < cur->lower) break; /* wrong value */
+			*uc = w;
+			return count;
+		}
+
+		if (size <= count) break; /* insufficient input */
+		utf8++; /* advance to the next character in the sequence */
+
+		/* a trailing byte must be 10xxxxxx; t keeps its low 6 bits */
+		t = (*utf8 ^ 0x80) & 0xFF;
+		if (t & 0xC0) break;
+		w = (w << 6) | t;
+
+		cur++;
+	}
+
+	return 0; /* error - invalid sequence */
+}
+
+#if 0
+/* disabled: returns the sequence length announced by the first byte */
+int qse_utf8len (qse_mchar_t first)
+{
+	__utf8_t* cur, * end;
+
+	end = utf8_table + QSE_COUNTOF(utf8_table);
+	cur = utf8_table;
+
+	while (cur < end)
+	{
+		if ((first & cur->mask) == cur->fbyte) return cur->length;
+		cur++;
+	}
+
+	return 0; /* error - invalid sequence */
+}
+#endif