added Unicode_To_Utf8 converter

This commit is contained in:
hyung-hwan 2014-01-14 14:22:06 +00:00
parent d839e8c41a
commit bcf50fe381
10 changed files with 195 additions and 36 deletions

View File

@ -6,7 +6,6 @@ with Stream;
with Ada.Text_IO; with Ada.Text_IO;
with Ada.Unchecked_Deallocation; with Ada.Unchecked_Deallocation;
procedure scheme is procedure scheme is
--package S renames H2.Scheme; --package S renames H2.Scheme;
--package S is new H2.Scheme (Wide_Character, Wide_String); --package S is new H2.Scheme (Wide_Character, Wide_String);
@ -43,8 +42,13 @@ begin
); );
File_Stream.Name := File_Name'Unchecked_Access; File_Stream.Name := File_Name'Unchecked_Access;
S.Set_Input_Stream (SI, File_Stream); -- specify main input stream begin
--S.Set_Input_Stream (SI, String_Stream); S.Set_Input_Stream (SI, File_Stream); -- specify main input stream
--S.Set_Input_Stream (SI, String_Stream);
exception
when others =>
Ada.Text_IO.Put_Line ("Cannot open Input Stream");
end;
--S.Set_Output_Stream (SI, Stream); -- specify main output stream. --S.Set_Output_Stream (SI, Stream); -- specify main output stream.
Ada.Text_IO.Put_Line ("-------------------------------------------"); Ada.Text_IO.Put_Line ("-------------------------------------------");

View File

@ -23,7 +23,7 @@ project Scheme is
package Compiler is package Compiler is
for Default_Switches ("Ada") use ( for Default_Switches ("Ada") use (
"-gnata", "-gnato", "-gnatN", "-gnatwl", "-gnat95", "-gnata", "-gnato", "-gnatN", "-gnatwl", "-gnat95", "-gnatW8",
"-I@abs_srcdir@/../lib" "-I@abs_srcdir@/../lib"
); );
end Compiler; end Compiler;

View File

@ -1,7 +1,8 @@
with H2.Pool; with H2.Pool;
with Ada.Characters.Conversions;
with Ada.Unchecked_Conversion; with Ada.Unchecked_Conversion;
with Ada.Text_IO; -- for debugging
package body Stream is package body Stream is
------------------------------------------------------------------ ------------------------------------------------------------------
@ -9,13 +10,13 @@ package body Stream is
procedure Open (Stream: in out String_Input_Stream_Record) is procedure Open (Stream: in out String_Input_Stream_Record) is
begin begin
Ada.Wide_Text_IO.Put_Line ("****** OPEN STRING STREAM ******"); Ada.Text_IO.Put_Line ("****** OPEN STRING STREAM ******");
Stream.Pos := 0; Stream.Pos := 0;
end Open; end Open;
procedure Close (Stream: in out String_Input_Stream_Record) is procedure Close (Stream: in out String_Input_Stream_Record) is
begin begin
Ada.Wide_Text_IO.Put_Line ("****** CLOSE STRING STREAM ******"); Ada.Text_IO.Put_Line ("****** CLOSE STRING STREAM ******");
Stream.Pos := Stream.Str'Last; Stream.Pos := Stream.Str'Last;
end Close; end Close;
@ -50,18 +51,18 @@ Ada.Wide_Text_IO.Put_Line ("****** CLOSE STRING STREAM ******");
------------------------------------------------------------------ ------------------------------------------------------------------
procedure Open (Stream: in out File_Stream_Record) is procedure Open (Stream: in out File_Stream_Record) is
subtype Wide_String is Standard.Wide_String(1 .. Standard.Natural(Stream.Name'Length));
function To_Wide_String is new Ada.Unchecked_Conversion (S.Object_String, Wide_String);
begin begin
Ada.Wide_Text_IO.Put_Line (">>>>> OPEN File STREAM <<<<< " & To_Wide_String(Stream.Name.all)); Ada.Text_IO.Put_Line (">>>>> OPEN File STREAM <<<<< " & Standard.String(UTF8.Unicode_To_Utf8(UTF8.Unicode_String(Stream.Name.all))));
Ada.Wide_Text_IO.Open (Stream.Handle, Ada.Wide_Text_IO.In_File, Ada.Characters.Conversions.To_String(To_Wide_String(Stream.Name.all))); --Ada.Wide_Text_IO.Open (Stream.Handle, Ada.Wide_Text_IO.In_File, Ada.Characters.Conversions.To_String(Standard.Wide_String(Stream.Name.all)));
Ada.Wide_Text_IO.Open (Stream.Handle, Ada.Wide_Text_IO.In_File, Standard.String(UTF8.Unicode_To_Utf8(UTF8.Unicode_String(Stream.Name.all))));
end Open; end Open;
procedure Close (Stream: in out File_Stream_Record) is procedure Close (Stream: in out File_Stream_Record) is
subtype Wide_String is Standard.Wide_String(1 .. Standard.Natural(Stream.Name'Length)); subtype Wide_String is Standard.Wide_String(1 .. Standard.Natural(Stream.Name'Length));
function To_Wide_String is new Ada.Unchecked_Conversion (S.Object_String, Wide_String); function To_Wide_String is new Ada.Unchecked_Conversion (S.Object_String, Wide_String);
begin begin
Ada.Wide_Text_IO.Put_Line (">>>>> CLOSE File STREAM <<<<< " & To_Wide_String(Stream.Name.all)); --Ada.Wide_Text_IO.Put_Line (">>>>> CLOSE File STREAM <<<<< " & Standard.Wide_String(Stream.Name.all));
Ada.Text_IO.Put_Line (">>>>> CLOSE File STREAM <<<<< " & Standard.String(UTF8.Unicode_To_Utf8(UTF8.Unicode_String(Stream.Name.all))));
Ada.Wide_Text_IO.Close (Stream.Handle); Ada.Wide_Text_IO.Close (Stream.Handle);
end Close; end Close;
@ -71,6 +72,10 @@ Ada.Wide_Text_IO.Put_Line (">>>>> CLOSE File STREAM <<<<< " & To_Wide_String(Str
begin begin
for I in Data'First .. Data'Last loop for I in Data'First .. Data'Last loop
begin begin
if Ada.Wide_Text_IO.End_Of_File (Stream.Handle) then
Last := I - 1;
return;
end if;
Ada.Wide_Text_IO.Get_Immediate (Stream.Handle, Data(I)); Ada.Wide_Text_IO.Get_Immediate (Stream.Handle, Data(I));
exception exception
when Ada.Wide_Text_IO.End_Error => when Ada.Wide_Text_IO.End_Error =>

View File

@ -1,9 +1,11 @@
with H2.Scheme; with H2.Scheme;
with H2.UTF8;
with Ada.Wide_Text_IO; with Ada.Wide_Text_IO;
package Stream is package Stream is
package S is new H2.Scheme (Standard.Wide_Character); package S is new H2.Scheme (Standard.Wide_Character);
package UTF8 is new H2.UTF8 (Standard.Wide_Character, Standard.Character);
------------------------------------------------------------ ------------------------------------------------------------
--type Object_String_Pointer is access all S.Object_String; --type Object_String_Pointer is access all S.Object_String;

View File

@ -1,20 +1,13 @@
with H2.Ascii; with H2.Ascii;
with H2.Pool; with H2.Pool;
with System.Address_To_Access_Conversions;
with Ada.Unchecked_Deallocation; -- for h2scm c interface. TOOD: move it to a separate file
with Interfaces.C;
-- XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXx -- XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXx
with Ada.Characters.Handling;
with Ada.Wide_Characters.Handling;
-- TODO: delete these after debugging -- TODO: delete these after debugging
with Ada.Unchecked_Deallocation; -- for h2scm c interface. TOOD: move it to a separate file
with Interfaces.C;
with ada.text_io; with ada.text_io;
with ada.wide_text_io; with ada.wide_text_io;
-- TODO: delete above after debugging -- TODO: delete above after debugging
-- XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXx -- XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXx
package body H2.Scheme is package body H2.Scheme is
@ -1480,8 +1473,8 @@ Ada.Text_IO.Put_Line ("2222222222222222222222222");
Make_Procedure_Objects; Make_Procedure_Objects;
Ada.Text_IO.Put_Line ("99999"); Ada.Text_IO.Put_Line ("99999");
Ada.Text_IO.Put_Line (IO_Character_Record'Size'Img); Ada.Text_IO.Put_Line (Object_Size'Image(IO_Character_Record'Size));
Ada.Text_IO.Put_Line (IO_Character_Record'Max_Size_In_Storage_Elements'Img); Ada.Text_IO.Put_Line (Object_Size'Image(IO_Character_Record'Max_Size_In_Storage_Elements));
exception exception
when others => when others =>
Deinitialize_Heap (Interp); Deinitialize_Heap (Interp);
@ -2819,6 +2812,8 @@ Ada.Text_IO.Put_Line ("INFO: NO MORE TOKEN ");
Result: out Object_Pointer) is Result: out Object_Pointer) is
-- standard read-eval-print loop -- standard read-eval-print loop
begin begin
pragma Assert (Interp.Base_Input.Stream /= null);
Result := Nil_Pointer; Result := Nil_Pointer;
loop loop

View File

@ -117,17 +117,20 @@ package H2.Scheme is
-- The Object_Size type defines the size of object payload. -- The Object_Size type defines the size of object payload.
-- It is the number of payload items for each object kind. -- It is the number of payload items for each object kind.
--type Object_Size is new Object_Word range 0 .. (2 ** (System.Word_Size - 1)) - 1; --type Object_Size is new Object_Word range 0 .. (2 ** (System.Word_Size - 1)) - 1;
--type Object_Size is new Object_Word range 0 .. 1000; -- TODO: remove this line and uncommect the live above --type Object_Size is new Object_Word range 0 .. 1000;
type Object_Size is new Object_Word; --type Object_Size is new Object_Word;
type Object_Size is new System_Size;
for Object_Size'Size use Object_Pointer_Bits; -- for GC for Object_Size'Size use Object_Pointer_Bits; -- for GC
subtype Object_Index is Object_Size range Object_Size(System_Index'First) .. Object_Size(System_Index'Last);
type Object_Byte is mod 2 ** System.Storage_Unit; type Object_Byte is mod 2 ** System.Storage_Unit;
for Object_Byte'Size use System.Storage_Unit; for Object_Byte'Size use System.Storage_Unit;
subtype Object_Character is Character_Type; subtype Object_Character is Character_Type;
subtype Object_String_Size is Object_Size range 0 .. Object_Size'Last - 1;
subtype Object_String_Range is Object_Size range 1 .. Object_Size'Last - 1; subtype Object_String_Size is Object_Size;
type Object_String is array (Object_String_Range range <>) of Object_Character; subtype Object_String_Index is Object_Index;
type Object_String is array (Object_String_Index range <>) of Object_Character;
type Object_String_Pointer is access all Object_String; type Object_String_Pointer is access all Object_String;
for Object_String_Pointer'Size use Object_Pointer_Bits; for Object_String_Pointer'Size use Object_Pointer_Bits;
@ -135,14 +138,14 @@ package H2.Scheme is
for Constant_Object_String_Pointer'Size use Object_Pointer_Bits; for Constant_Object_String_Pointer'Size use Object_Pointer_Bits;
-- TODO: are these Thin_XXXX necessary? -- TODO: are these Thin_XXXX necessary?
subtype Thin_Object_String is Object_String (Object_String_Range'Range); subtype Thin_Object_String is Object_String(Object_Index'Range);
type Thin_Object_String_Pointer is access all Thin_Object_String; type Thin_Object_String_Pointer is access all Thin_Object_String;
for Thin_Object_String_Pointer'Size use Object_Pointer_Bits; for Thin_Object_String_Pointer'Size use Object_Pointer_Bits;
type Object_Byte_Array is array (Object_Size range <>) of Object_Byte; type Object_Byte_Array is array (Object_Index range <>) of Object_Byte;
subtype Object_Character_Array is Object_String; subtype Object_Character_Array is Object_String;
type Object_Pointer_Array is array (Object_Size range <>) of Object_Pointer; type Object_Pointer_Array is array (Object_Index range <>) of Object_Pointer;
type Object_Word_Array is array (Object_Size range <>) of Object_Word; type Object_Word_Array is array (Object_Index range <>) of Object_Word;
type Object_Kind is ( type Object_Kind is (
Moved_Object, -- internal use only Moved_Object, -- internal use only

110
lib/h2-utf8.adb Normal file
View File

@ -0,0 +1,110 @@
with ada.text_io;
package body H2.UTF8 is
type Uint8 is mod 2 ** 8;
type Uint32 is mod 2 ** 32;
type Conv_Record is record
Lower: Uint32;
Upper: Uint32;
Fbyte: Uint8; -- Mask to the first utf8 byte */
Mask: Uint8;
Fmask: Uint8;
Length: Uint8; -- number of bytes
end record;
type Conv_Record_Array is array(System_Index range<>) of Conv_Record;
Conv_Table: constant Conv_Record_Array := (
(16#0000_0000#, 16#0000_007F#, 16#00#, 16#80#, 16#7F#, 1),
(16#0000_0080#, 16#0000_07FF#, 16#C0#, 16#E0#, 16#1F#, 2),
(16#0000_0800#, 16#0000_FFFF#, 16#E0#, 16#F0#, 16#0F#, 3),
(16#0001_0000#, 16#001F_FFFF#, 16#F0#, 16#F8#, 16#07#, 4),
(16#0020_0000#, 16#03FF_FFFF#, 16#F8#, 16#FC#, 16#03#, 5),
(16#0400_0000#, 16#7FFF_FFFF#, 16#FC#, 16#FE#, 16#01#, 6)
);
function Get_UTF8_Slot (UV: in Uint32) return System_Size is
pragma Inline (Get_UTF8_Slot);
begin
for I in Conv_Table'Range loop
if UV >= Conv_Table(I).Lower and then UV <= Conv_Table(I).Upper then
return I;
end if;
end loop;
return System_Size'First;
end Get_UTF8_Slot;
function Unicode_To_UTF8 (UC: in Unicode_Character) return UTF8_String is
UV: Uint32;
I: System_Size;
begin
UV := Unicode_Character'Pos(UC);
I := Get_UTF8_Slot(UV);
if I not in System_Index'Range then
raise Invalid_Unicode_Character;
end if;
declare
subtype Result_String is UTF8_String(1 .. System_Index(Conv_Table(I).Length));
Result: Result_String;
begin
for J in reverse Result_String'First + 1 .. Result_String'Last loop
-- 2#0011_1111#: 16#3F#
-- 2#1000_0000#: 16#80#
Result(J) := UTF8_Character'Val((UV and 2#0011_1111#) or 2#1000_0000#);
UV := UV / (2 ** 6); --UV := UV >> 6;
end loop;
Result(Result_String'First) := UTF8_Character'Val(UV or Uint32(Conv_Table(I).Fbyte));
return Result;
end;
end Unicode_To_UTF8;
function Unicode_To_UTF8 (US: in Unicode_String) return UTF8_String is
-- this function has high stack pressur if the input string is too long
-- TODO: create a procedure to overcome this problem.
Tmp: System_Size;
begin
Tmp := 0;
for I in US'Range loop
declare
UTF8: UTF8_String := Unicode_To_UTF8(US(I));
begin
Tmp := Tmp + UTF8'Length;
end;
end loop;
declare
subtype Result_String is UTF8_String(1 .. Tmp);
Result: Result_String;
begin
Tmp := Result'First;
for I in US'Range loop
declare
UTF8: UTF8_String := Unicode_To_UTF8(US(I));
begin
Result(Tmp .. Tmp + UTF8'Length - 1) := UTF8;
Tmp := Tmp + UTF8'Length;
end;
end loop;
return Result;
end;
end Unicode_To_UTF8;
procedure UTF8_To_Unicode (UTF8: in UTF8_String;
UC: out Unicode_Character) is
begin
null;
end UTF8_To_Unicode;
procedure UTF8_To_Unicode (UTF8: in UTF8_String;
US: in out Unicode_String) is
begin
null;
end UTF8_To_Unicode;
end H2.UTF8;

20
lib/h2-utf8.ads Normal file
View File

@ -0,0 +1,20 @@
generic
type Unicode_Character_Type is (<>);
type UTF8_Character_Type is (<>);
package H2.UTF8 is
Invalid_Unicode_Character: exception;
subtype Unicode_Character is Unicode_Character_Type;
subtype UTF8_Character is UTF8_Character_Type;
type UTF8_String is array(System_Index range<>) of UTF8_Character;
type Unicode_String is array(System_Index range<>) of Unicode_Character;
function Unicode_To_UTF8 (UC: in Unicode_Character) return UTF8_String;
function Unicode_To_UTF8 (US: in Unicode_String) return UTF8_String;
--procedure UTF8_To_Unicode (UTF8: in UTF8_String;
-- UC: out Unicode_Character_Type);
end H2.UTF8;

View File

@ -1,8 +1,25 @@
with System;
with System.Storage_Pools; with System.Storage_Pools;
package H2 is package H2 is
type Storage_Pool_Pointer is System_Word_Bits: constant := System.Word_Size;
System_Word_Bytes: constant := System_Word_Bits / System.Storage_Unit;
--type System_Byte is mod 2 ** System.Storage_Unit;
--for System_Byte'Size use System.Storage_Unit;
type System_Word is mod 2 ** System_Word_Bits;
--for System_Word'Size use System_Word_Bits;
type System_Signed_Word is range -(2 ** (System_Word_Bits - 1)) ..
+(2 ** (System_Word_Bits - 1)) - 1;
--for System_Signed_Word'Size use System_Word_Bits;
type System_Size is new System_Word range 0 .. (2 ** System_Word_Bits) - 1;
subtype System_Index is System_Size range 1 .. System_Size'Last;
type Storage_Pool_Pointer is
access all System.Storage_Pools.Root_Storage_Pool'Class; access all System.Storage_Pools.Root_Storage_Pool'Class;
end H2; end H2;

View File

@ -14,18 +14,21 @@ project Lib is
"h2-pool.ads", "h2-pool.ads",
"h2-scheme.adb", "h2-scheme.adb",
"h2-scheme.ads", "h2-scheme.ads",
"h2-scheme-token.adb" "h2-scheme-token.adb",
"h2-utf8.adb",
"h2-utf8.ads"
); );
for Library_Interface use ( for Library_Interface use (
"h2", "h2",
"h2.ascii", "h2.ascii",
"h2.pool", "h2.pool",
"h2.scheme" "h2.scheme",
"h2.utf8"
); );
package Compiler is package Compiler is
for Default_Switches ("Ada") use ( for Default_Switches ("Ada") use (
"-gnata", "-gnato", "-gnatN", "-gnatwl", "-gnat95" "-gnata", "-gnato", "-gnatN", "-gnatwl", "-gnat95", "-gnatW8"
); );
end Compiler; end Compiler;