added Unicode_To_Utf8 converter

This commit is contained in:
2014-01-14 14:22:06 +00:00
parent d839e8c41a
commit bcf50fe381
10 changed files with 195 additions and 36 deletions

View File

@ -1,20 +1,13 @@
with H2.Ascii;
with H2.Pool;
with System.Address_To_Access_Conversions;
with Ada.Unchecked_Deallocation; -- for h2scm c interface. TOOD: move it to a separate file
with Interfaces.C;
-- XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXx
with Ada.Characters.Handling;
with Ada.Wide_Characters.Handling;
-- TODO: delete these after debugging
with Ada.Unchecked_Deallocation; -- for h2scm c interface. TOOD: move it to a separate file
with Interfaces.C;
with ada.text_io;
with ada.wide_text_io;
-- TODO: delete above after debugging
-- XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXx
package body H2.Scheme is
@ -1480,8 +1473,8 @@ Ada.Text_IO.Put_Line ("2222222222222222222222222");
Make_Procedure_Objects;
Ada.Text_IO.Put_Line ("99999");
Ada.Text_IO.Put_Line (IO_Character_Record'Size'Img);
Ada.Text_IO.Put_Line (IO_Character_Record'Max_Size_In_Storage_Elements'Img);
Ada.Text_IO.Put_Line (Object_Size'Image(IO_Character_Record'Size));
Ada.Text_IO.Put_Line (Object_Size'Image(IO_Character_Record'Max_Size_In_Storage_Elements));
exception
when others =>
Deinitialize_Heap (Interp);
@ -2819,6 +2812,8 @@ Ada.Text_IO.Put_Line ("INFO: NO MORE TOKEN ");
Result: out Object_Pointer) is
-- standard read-eval-print loop
begin
pragma Assert (Interp.Base_Input.Stream /= null);
Result := Nil_Pointer;
loop

View File

@ -117,17 +117,20 @@ package H2.Scheme is
-- The Object_Size type defines the size of object payload.
-- It is the number of payload items for each object kind.
--type Object_Size is new Object_Word range 0 .. (2 ** (System.Word_Size - 1)) - 1;
--type Object_Size is new Object_Word range 0 .. 1000; -- TODO: remove this line and uncommect the live above
type Object_Size is new Object_Word;
--type Object_Size is new Object_Word range 0 .. 1000;
--type Object_Size is new Object_Word;
type Object_Size is new System_Size;
for Object_Size'Size use Object_Pointer_Bits; -- for GC
subtype Object_Index is Object_Size range Object_Size(System_Index'First) .. Object_Size(System_Index'Last);
type Object_Byte is mod 2 ** System.Storage_Unit;
for Object_Byte'Size use System.Storage_Unit;
subtype Object_Character is Character_Type;
subtype Object_String_Size is Object_Size range 0 .. Object_Size'Last - 1;
subtype Object_String_Range is Object_Size range 1 .. Object_Size'Last - 1;
type Object_String is array (Object_String_Range range <>) of Object_Character;
subtype Object_String_Size is Object_Size;
subtype Object_String_Index is Object_Index;
type Object_String is array (Object_String_Index range <>) of Object_Character;
type Object_String_Pointer is access all Object_String;
for Object_String_Pointer'Size use Object_Pointer_Bits;
@ -135,14 +138,14 @@ package H2.Scheme is
for Constant_Object_String_Pointer'Size use Object_Pointer_Bits;
-- TODO: are these Thin_XXXX necessary?
subtype Thin_Object_String is Object_String (Object_String_Range'Range);
subtype Thin_Object_String is Object_String(Object_Index'Range);
type Thin_Object_String_Pointer is access all Thin_Object_String;
for Thin_Object_String_Pointer'Size use Object_Pointer_Bits;
type Object_Byte_Array is array (Object_Size range <>) of Object_Byte;
type Object_Byte_Array is array (Object_Index range <>) of Object_Byte;
subtype Object_Character_Array is Object_String;
type Object_Pointer_Array is array (Object_Size range <>) of Object_Pointer;
type Object_Word_Array is array (Object_Size range <>) of Object_Word;
type Object_Pointer_Array is array (Object_Index range <>) of Object_Pointer;
type Object_Word_Array is array (Object_Index range <>) of Object_Word;
type Object_Kind is (
Moved_Object, -- internal use only

110
lib/h2-utf8.adb Normal file
View File

@ -0,0 +1,110 @@
with ada.text_io;
package body H2.UTF8 is
type Uint8 is mod 2 ** 8;
type Uint32 is mod 2 ** 32;
type Conv_Record is record
Lower: Uint32;
Upper: Uint32;
Fbyte: Uint8; -- Mask to the first utf8 byte */
Mask: Uint8;
Fmask: Uint8;
Length: Uint8; -- number of bytes
end record;
type Conv_Record_Array is array(System_Index range<>) of Conv_Record;
Conv_Table: constant Conv_Record_Array := (
(16#0000_0000#, 16#0000_007F#, 16#00#, 16#80#, 16#7F#, 1),
(16#0000_0080#, 16#0000_07FF#, 16#C0#, 16#E0#, 16#1F#, 2),
(16#0000_0800#, 16#0000_FFFF#, 16#E0#, 16#F0#, 16#0F#, 3),
(16#0001_0000#, 16#001F_FFFF#, 16#F0#, 16#F8#, 16#07#, 4),
(16#0020_0000#, 16#03FF_FFFF#, 16#F8#, 16#FC#, 16#03#, 5),
(16#0400_0000#, 16#7FFF_FFFF#, 16#FC#, 16#FE#, 16#01#, 6)
);
function Get_UTF8_Slot (UV: in Uint32) return System_Size is
pragma Inline (Get_UTF8_Slot);
begin
for I in Conv_Table'Range loop
if UV >= Conv_Table(I).Lower and then UV <= Conv_Table(I).Upper then
return I;
end if;
end loop;
return System_Size'First;
end Get_UTF8_Slot;
function Unicode_To_UTF8 (UC: in Unicode_Character) return UTF8_String is
UV: Uint32;
I: System_Size;
begin
UV := Unicode_Character'Pos(UC);
I := Get_UTF8_Slot(UV);
if I not in System_Index'Range then
raise Invalid_Unicode_Character;
end if;
declare
subtype Result_String is UTF8_String(1 .. System_Index(Conv_Table(I).Length));
Result: Result_String;
begin
for J in reverse Result_String'First + 1 .. Result_String'Last loop
-- 2#0011_1111#: 16#3F#
-- 2#1000_0000#: 16#80#
Result(J) := UTF8_Character'Val((UV and 2#0011_1111#) or 2#1000_0000#);
UV := UV / (2 ** 6); --UV := UV >> 6;
end loop;
Result(Result_String'First) := UTF8_Character'Val(UV or Uint32(Conv_Table(I).Fbyte));
return Result;
end;
end Unicode_To_UTF8;
function Unicode_To_UTF8 (US: in Unicode_String) return UTF8_String is
-- this function has high stack pressur if the input string is too long
-- TODO: create a procedure to overcome this problem.
Tmp: System_Size;
begin
Tmp := 0;
for I in US'Range loop
declare
UTF8: UTF8_String := Unicode_To_UTF8(US(I));
begin
Tmp := Tmp + UTF8'Length;
end;
end loop;
declare
subtype Result_String is UTF8_String(1 .. Tmp);
Result: Result_String;
begin
Tmp := Result'First;
for I in US'Range loop
declare
UTF8: UTF8_String := Unicode_To_UTF8(US(I));
begin
Result(Tmp .. Tmp + UTF8'Length - 1) := UTF8;
Tmp := Tmp + UTF8'Length;
end;
end loop;
return Result;
end;
end Unicode_To_UTF8;
procedure UTF8_To_Unicode (UTF8: in UTF8_String;
UC: out Unicode_Character) is
begin
null;
end UTF8_To_Unicode;
procedure UTF8_To_Unicode (UTF8: in UTF8_String;
US: in out Unicode_String) is
begin
null;
end UTF8_To_Unicode;
end H2.UTF8;

20
lib/h2-utf8.ads Normal file
View File

@ -0,0 +1,20 @@
generic
type Unicode_Character_Type is (<>);
type UTF8_Character_Type is (<>);
package H2.UTF8 is
Invalid_Unicode_Character: exception;
subtype Unicode_Character is Unicode_Character_Type;
subtype UTF8_Character is UTF8_Character_Type;
type UTF8_String is array(System_Index range<>) of UTF8_Character;
type Unicode_String is array(System_Index range<>) of Unicode_Character;
function Unicode_To_UTF8 (UC: in Unicode_Character) return UTF8_String;
function Unicode_To_UTF8 (US: in Unicode_String) return UTF8_String;
--procedure UTF8_To_Unicode (UTF8: in UTF8_String;
-- UC: out Unicode_Character_Type);
end H2.UTF8;

View File

@ -1,8 +1,25 @@
with System;
with System.Storage_Pools;
package H2 is
type Storage_Pool_Pointer is
System_Word_Bits: constant := System.Word_Size;
System_Word_Bytes: constant := System_Word_Bits / System.Storage_Unit;
--type System_Byte is mod 2 ** System.Storage_Unit;
--for System_Byte'Size use System.Storage_Unit;
type System_Word is mod 2 ** System_Word_Bits;
--for System_Word'Size use System_Word_Bits;
type System_Signed_Word is range -(2 ** (System_Word_Bits - 1)) ..
+(2 ** (System_Word_Bits - 1)) - 1;
--for System_Signed_Word'Size use System_Word_Bits;
type System_Size is new System_Word range 0 .. (2 ** System_Word_Bits) - 1;
subtype System_Index is System_Size range 1 .. System_Size'Last;
type Storage_Pool_Pointer is
access all System.Storage_Pools.Root_Storage_Pool'Class;
end H2;

View File

@ -14,18 +14,21 @@ project Lib is
"h2-pool.ads",
"h2-scheme.adb",
"h2-scheme.ads",
"h2-scheme-token.adb"
"h2-scheme-token.adb",
"h2-utf8.adb",
"h2-utf8.ads"
);
for Library_Interface use (
"h2",
"h2.ascii",
"h2.pool",
"h2.scheme"
"h2.scheme",
"h2.utf8"
);
package Compiler is
for Default_Switches ("Ada") use (
"-gnata", "-gnato", "-gnatN", "-gnatwl", "-gnat95"
"-gnata", "-gnato", "-gnatN", "-gnatwl", "-gnat95", "-gnatW8"
);
end Compiler;