From bcf50fe3817370e569f179388a6000b8a988a51d Mon Sep 17 00:00:00 2001 From: hyung-hwan Date: Tue, 14 Jan 2014 14:22:06 +0000 Subject: [PATCH] added Unicode_To_Utf8 converter --- cmd/scheme.adb | 10 +++-- cmd/scheme.gpr.in | 2 +- cmd/stream.adb | 21 +++++---- cmd/stream.ads | 2 + lib/h2-scheme.adb | 17 +++---- lib/h2-scheme.ads | 21 +++++---- lib/h2-utf8.adb | 110 ++++++++++++++++++++++++++++++++++++++++++++++ lib/h2-utf8.ads | 20 +++++++++ lib/h2.ads | 19 +++++++- lib/lib.gpr.in | 9 ++-- 10 files changed, 195 insertions(+), 36 deletions(-) create mode 100644 lib/h2-utf8.adb create mode 100644 lib/h2-utf8.ads diff --git a/cmd/scheme.adb b/cmd/scheme.adb index 520912f..ab397c9 100644 --- a/cmd/scheme.adb +++ b/cmd/scheme.adb @@ -6,7 +6,6 @@ with Stream; with Ada.Text_IO; with Ada.Unchecked_Deallocation; - procedure scheme is --package S renames H2.Scheme; --package S is new H2.Scheme (Wide_Character, Wide_String); @@ -43,8 +42,13 @@ begin ); File_Stream.Name := File_Name'Unchecked_Access; - S.Set_Input_Stream (SI, File_Stream); -- specify main input stream - --S.Set_Input_Stream (SI, String_Stream); + begin + S.Set_Input_Stream (SI, File_Stream); -- specify main input stream + --S.Set_Input_Stream (SI, String_Stream); + exception + when others => + Ada.Text_IO.Put_Line ("Cannot open Input Stream"); + end; --S.Set_Output_Stream (SI, Stream); -- specify main output stream. Ada.Text_IO.Put_Line ("-------------------------------------------"); diff --git a/cmd/scheme.gpr.in b/cmd/scheme.gpr.in index 728aa02..2d90c4f 100644 --- a/cmd/scheme.gpr.in +++ b/cmd/scheme.gpr.in @@ -23,7 +23,7 @@ project Scheme is package Compiler is for Default_Switches ("Ada") use ( - "-gnata", "-gnato", "-gnatN", "-gnatwl", "-gnat95", + "-gnata", "-gnato", "-gnatN", "-gnatwl", "-gnat95", "-gnatW8", "-I@abs_srcdir@/../lib" ); end Compiler; diff --git a/cmd/stream.adb b/cmd/stream.adb index 672fcd0..dfc4634 100644 --- a/cmd/stream.adb +++ b/cmd/stream.adb @@ -1,7 +1,8 @@ with H2.Pool; -with Ada.Characters.Conversions; with Ada.Unchecked_Conversion; +with Ada.Text_IO; -- for debugging + package body Stream is ------------------------------------------------------------------ @@ -9,13 +10,13 @@ package body Stream is procedure Open (Stream: in out String_Input_Stream_Record) is begin -Ada.Wide_Text_IO.Put_Line ("****** OPEN STRING STREAM ******"); +Ada.Text_IO.Put_Line ("****** OPEN STRING STREAM ******"); Stream.Pos := 0; end Open; procedure Close (Stream: in out String_Input_Stream_Record) is begin -Ada.Wide_Text_IO.Put_Line ("****** CLOSE STRING STREAM ******"); +Ada.Text_IO.Put_Line ("****** CLOSE STRING STREAM ******"); Stream.Pos := Stream.Str'Last; end Close; @@ -50,18 +51,18 @@ Ada.Wide_Text_IO.Put_Line ("****** CLOSE STRING STREAM ******"); ------------------------------------------------------------------ procedure Open (Stream: in out File_Stream_Record) is - subtype Wide_String is Standard.Wide_String(1 .. Standard.Natural(Stream.Name'Length)); - function To_Wide_String is new Ada.Unchecked_Conversion (S.Object_String, Wide_String); begin -Ada.Wide_Text_IO.Put_Line (">>>>> OPEN File STREAM <<<<< " & To_Wide_String(Stream.Name.all)); - Ada.Wide_Text_IO.Open (Stream.Handle, Ada.Wide_Text_IO.In_File, Ada.Characters.Conversions.To_String(To_Wide_String(Stream.Name.all))); +Ada.Text_IO.Put_Line (">>>>> OPEN File STREAM <<<<< " & Standard.String(UTF8.Unicode_To_Utf8(UTF8.Unicode_String(Stream.Name.all)))); + --Ada.Wide_Text_IO.Open (Stream.Handle, Ada.Wide_Text_IO.In_File, Ada.Characters.Conversions.To_String(Standard.Wide_String(Stream.Name.all))); + Ada.Wide_Text_IO.Open (Stream.Handle, Ada.Wide_Text_IO.In_File, Standard.String(UTF8.Unicode_To_Utf8(UTF8.Unicode_String(Stream.Name.all)))); end Open; procedure Close (Stream: in out File_Stream_Record) is subtype Wide_String is Standard.Wide_String(1 .. Standard.Natural(Stream.Name'Length)); function To_Wide_String is new Ada.Unchecked_Conversion (S.Object_String, Wide_String); begin -Ada.Wide_Text_IO.Put_Line (">>>>> CLOSE File STREAM <<<<< " & To_Wide_String(Stream.Name.all)); +--Ada.Wide_Text_IO.Put_Line (">>>>> CLOSE File STREAM <<<<< " & Standard.Wide_String(Stream.Name.all)); +Ada.Text_IO.Put_Line (">>>>> CLOSE File STREAM <<<<< " & Standard.String(UTF8.Unicode_To_Utf8(UTF8.Unicode_String(Stream.Name.all)))); Ada.Wide_Text_IO.Close (Stream.Handle); end Close; @@ -71,6 +72,10 @@ Ada.Wide_Text_IO.Put_Line (">>>>> CLOSE File STREAM <<<<< " & To_Wide_String(Str begin for I in Data'First .. Data'Last loop begin + if Ada.Wide_Text_IO.End_Of_File (Stream.Handle) then + Last := I - 1; + return; + end if; Ada.Wide_Text_IO.Get_Immediate (Stream.Handle, Data(I)); exception when Ada.Wide_Text_IO.End_Error => diff --git a/cmd/stream.ads b/cmd/stream.ads index 7c3051c..a4e80fa 100644 --- a/cmd/stream.ads +++ b/cmd/stream.ads @@ -1,9 +1,11 @@ with H2.Scheme; +with H2.UTF8; with Ada.Wide_Text_IO; package Stream is package S is new H2.Scheme (Standard.Wide_Character); + package UTF8 is new H2.UTF8 (Standard.Wide_Character, Standard.Character); ------------------------------------------------------------ --type Object_String_Pointer is access all S.Object_String; diff --git a/lib/h2-scheme.adb b/lib/h2-scheme.adb index fb72e00..9a5e654 100644 --- a/lib/h2-scheme.adb +++ b/lib/h2-scheme.adb @@ -1,20 +1,13 @@ with H2.Ascii; with H2.Pool; -with System.Address_To_Access_Conversions; - -with Ada.Unchecked_Deallocation; -- for h2scm c interface. TOOD: move it to a separate file -with Interfaces.C; - -- XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXx -with Ada.Characters.Handling; -with Ada.Wide_Characters.Handling; - -- TODO: delete these after debugging +with Ada.Unchecked_Deallocation; -- for h2scm c interface. TOOD: move it to a separate file +with Interfaces.C; with ada.text_io; with ada.wide_text_io; -- TODO: delete above after debugging - -- XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXx package body H2.Scheme is @@ -1480,8 +1473,8 @@ Ada.Text_IO.Put_Line ("2222222222222222222222222"); Make_Procedure_Objects; Ada.Text_IO.Put_Line ("99999"); -Ada.Text_IO.Put_Line (IO_Character_Record'Size'Img); -Ada.Text_IO.Put_Line (IO_Character_Record'Max_Size_In_Storage_Elements'Img); +Ada.Text_IO.Put_Line (Object_Size'Image(IO_Character_Record'Size)); +Ada.Text_IO.Put_Line (Object_Size'Image(IO_Character_Record'Max_Size_In_Storage_Elements)); exception when others => Deinitialize_Heap (Interp); @@ -2819,6 +2812,8 @@ Ada.Text_IO.Put_Line ("INFO: NO MORE TOKEN "); Result: out Object_Pointer) is -- standard read-eval-print loop begin + pragma Assert (Interp.Base_Input.Stream /= null); + Result := Nil_Pointer; loop diff --git a/lib/h2-scheme.ads b/lib/h2-scheme.ads index 7e210a9..18a6460 100644 --- a/lib/h2-scheme.ads +++ b/lib/h2-scheme.ads @@ -117,17 +117,20 @@ package H2.Scheme is -- The Object_Size type defines the size of object payload. -- It is the number of payload items for each object kind. --type Object_Size is new Object_Word range 0 .. (2 ** (System.Word_Size - 1)) - 1; - --type Object_Size is new Object_Word range 0 .. 1000; -- TODO: remove this line and uncommect the live above - type Object_Size is new Object_Word; + --type Object_Size is new Object_Word range 0 .. 1000; + --type Object_Size is new Object_Word; + type Object_Size is new System_Size; for Object_Size'Size use Object_Pointer_Bits; -- for GC + subtype Object_Index is Object_Size range Object_Size(System_Index'First) .. Object_Size(System_Index'Last); type Object_Byte is mod 2 ** System.Storage_Unit; for Object_Byte'Size use System.Storage_Unit; subtype Object_Character is Character_Type; - subtype Object_String_Size is Object_Size range 0 .. Object_Size'Last - 1; - subtype Object_String_Range is Object_Size range 1 .. Object_Size'Last - 1; - type Object_String is array (Object_String_Range range <>) of Object_Character; + + subtype Object_String_Size is Object_Size; + subtype Object_String_Index is Object_Index; + type Object_String is array (Object_String_Index range <>) of Object_Character; type Object_String_Pointer is access all Object_String; for Object_String_Pointer'Size use Object_Pointer_Bits; @@ -135,14 +138,14 @@ package H2.Scheme is for Constant_Object_String_Pointer'Size use Object_Pointer_Bits; -- TODO: are these Thin_XXXX necessary? - subtype Thin_Object_String is Object_String (Object_String_Range'Range); + subtype Thin_Object_String is Object_String(Object_Index'Range); type Thin_Object_String_Pointer is access all Thin_Object_String; for Thin_Object_String_Pointer'Size use Object_Pointer_Bits; - type Object_Byte_Array is array (Object_Size range <>) of Object_Byte; + type Object_Byte_Array is array (Object_Index range <>) of Object_Byte; subtype Object_Character_Array is Object_String; - type Object_Pointer_Array is array (Object_Size range <>) of Object_Pointer; - type Object_Word_Array is array (Object_Size range <>) of Object_Word; + type Object_Pointer_Array is array (Object_Index range <>) of Object_Pointer; + type Object_Word_Array is array (Object_Index range <>) of Object_Word; type Object_Kind is ( Moved_Object, -- internal use only diff --git a/lib/h2-utf8.adb b/lib/h2-utf8.adb new file mode 100644 index 0000000..ee560a0 --- /dev/null +++ b/lib/h2-utf8.adb @@ -0,0 +1,110 @@ +with ada.text_io; + +package body H2.UTF8 is + + type Uint8 is mod 2 ** 8; + type Uint32 is mod 2 ** 32; + + type Conv_Record is record + Lower: Uint32; + Upper: Uint32; + Fbyte: Uint8; -- Mask to the first utf8 byte */ + Mask: Uint8; + Fmask: Uint8; + Length: Uint8; -- number of bytes + end record; + + type Conv_Record_Array is array(System_Index range<>) of Conv_Record; + + Conv_Table: constant Conv_Record_Array := ( + (16#0000_0000#, 16#0000_007F#, 16#00#, 16#80#, 16#7F#, 1), + (16#0000_0080#, 16#0000_07FF#, 16#C0#, 16#E0#, 16#1F#, 2), + (16#0000_0800#, 16#0000_FFFF#, 16#E0#, 16#F0#, 16#0F#, 3), + (16#0001_0000#, 16#001F_FFFF#, 16#F0#, 16#F8#, 16#07#, 4), + (16#0020_0000#, 16#03FF_FFFF#, 16#F8#, 16#FC#, 16#03#, 5), + (16#0400_0000#, 16#7FFF_FFFF#, 16#FC#, 16#FE#, 16#01#, 6) + ); + + function Get_UTF8_Slot (UV: in Uint32) return System_Size is + pragma Inline (Get_UTF8_Slot); + begin + for I in Conv_Table'Range loop + if UV >= Conv_Table(I).Lower and then UV <= Conv_Table(I).Upper then + return I; + end if; + end loop; + return System_Size'First; + end Get_UTF8_Slot; + + function Unicode_To_UTF8 (UC: in Unicode_Character) return UTF8_String is + UV: Uint32; + I: System_Size; + begin + UV := Unicode_Character'Pos(UC); + + I := Get_UTF8_Slot(UV); + if I not in System_Index'Range then + raise Invalid_Unicode_Character; + end if; + + declare + subtype Result_String is UTF8_String(1 .. System_Index(Conv_Table(I).Length)); + Result: Result_String; + begin + for J in reverse Result_String'First + 1 .. Result_String'Last loop + -- 2#0011_1111#: 16#3F# + -- 2#1000_0000#: 16#80# + Result(J) := UTF8_Character'Val((UV and 2#0011_1111#) or 2#1000_0000#); + UV := UV / (2 ** 6); --UV := UV >> 6; + end loop; + + Result(Result_String'First) := UTF8_Character'Val(UV or Uint32(Conv_Table(I).Fbyte)); + return Result; + end; + end Unicode_To_UTF8; + + + function Unicode_To_UTF8 (US: in Unicode_String) return UTF8_String is + -- this function has high stack pressur if the input string is too long + -- TODO: create a procedure to overcome this problem. + Tmp: System_Size; + begin + Tmp := 0; + for I in US'Range loop + declare + UTF8: UTF8_String := Unicode_To_UTF8(US(I)); + begin + Tmp := Tmp + UTF8'Length; + end; + end loop; + + declare + subtype Result_String is UTF8_String(1 .. Tmp); + Result: Result_String; + begin + Tmp := Result'First; + for I in US'Range loop + declare + UTF8: UTF8_String := Unicode_To_UTF8(US(I)); + begin + Result(Tmp .. Tmp + UTF8'Length - 1) := UTF8; + Tmp := Tmp + UTF8'Length; + end; + end loop; + return Result; + end; + end Unicode_To_UTF8; + + procedure UTF8_To_Unicode (UTF8: in UTF8_String; + UC: out Unicode_Character) is + begin + null; + end UTF8_To_Unicode; + + procedure UTF8_To_Unicode (UTF8: in UTF8_String; + US: in out Unicode_String) is + begin + null; + end UTF8_To_Unicode; + +end H2.UTF8; diff --git a/lib/h2-utf8.ads b/lib/h2-utf8.ads new file mode 100644 index 0000000..273f473 --- /dev/null +++ b/lib/h2-utf8.ads @@ -0,0 +1,20 @@ +generic + type Unicode_Character_Type is (<>); + type UTF8_Character_Type is (<>); +package H2.UTF8 is + + Invalid_Unicode_Character: exception; + + subtype Unicode_Character is Unicode_Character_Type; + subtype UTF8_Character is UTF8_Character_Type; + + type UTF8_String is array(System_Index range<>) of UTF8_Character; + type Unicode_String is array(System_Index range<>) of Unicode_Character; + + function Unicode_To_UTF8 (UC: in Unicode_Character) return UTF8_String; + function Unicode_To_UTF8 (US: in Unicode_String) return UTF8_String; + + --procedure UTF8_To_Unicode (UTF8: in UTF8_String; + -- UC: out Unicode_Character_Type); + +end H2.UTF8; diff --git a/lib/h2.ads b/lib/h2.ads index 88b5a42..eff3fde 100644 --- a/lib/h2.ads +++ b/lib/h2.ads @@ -1,8 +1,25 @@ +with System; with System.Storage_Pools; package H2 is - type Storage_Pool_Pointer is + System_Word_Bits: constant := System.Word_Size; + System_Word_Bytes: constant := System_Word_Bits / System.Storage_Unit; + + --type System_Byte is mod 2 ** System.Storage_Unit; + --for System_Byte'Size use System.Storage_Unit; + + type System_Word is mod 2 ** System_Word_Bits; + --for System_Word'Size use System_Word_Bits; + + type System_Signed_Word is range -(2 ** (System_Word_Bits - 1)) .. + +(2 ** (System_Word_Bits - 1)) - 1; + --for System_Signed_Word'Size use System_Word_Bits; + + type System_Size is new System_Word range 0 .. (2 ** System_Word_Bits) - 1; + subtype System_Index is System_Size range 1 .. System_Size'Last; + + type Storage_Pool_Pointer is access all System.Storage_Pools.Root_Storage_Pool'Class; end H2; diff --git a/lib/lib.gpr.in b/lib/lib.gpr.in index 9198393..7c6b9a9 100644 --- a/lib/lib.gpr.in +++ b/lib/lib.gpr.in @@ -14,18 +14,21 @@ project Lib is "h2-pool.ads", "h2-scheme.adb", "h2-scheme.ads", - "h2-scheme-token.adb" + "h2-scheme-token.adb", + "h2-utf8.adb", + "h2-utf8.ads" ); for Library_Interface use ( "h2", "h2.ascii", "h2.pool", - "h2.scheme" + "h2.scheme", + "h2.utf8" ); package Compiler is for Default_Switches ("Ada") use ( - "-gnata", "-gnato", "-gnatN", "-gnatwl", "-gnat95" + "-gnata", "-gnato", "-gnatN", "-gnatwl", "-gnat95", "-gnatW8" ); end Compiler;