From de3770b6ef71bbf7e8d1c43b42c139ea186a4d23 Mon Sep 17 00:00:00 2001 From: hyung-hwan Date: Sun, 7 Nov 2021 17:32:50 +0000 Subject: [PATCH] added utf8 functions --- lib2/h3-arrays.adb | 8 +- lib2/h3-arrays.ads | 7 +- lib2/h3-compilers.adb | 105 +++++++++++++++++--- lib2/h3-compilers.ads | 25 ++++- lib2/h3-utf8.adb | 217 ++++++++++++++++++++++++++++++++++++++++++ lib2/h3-utf8.ads | 51 ++++++++++ lib2/hello3.adb | 2 +- 7 files changed, 389 insertions(+), 26 deletions(-) create mode 100644 lib2/h3-utf8.adb create mode 100644 lib2/h3-utf8.ads diff --git a/lib2/h3-arrays.adb b/lib2/h3-arrays.adb index 727d4f1..212ce41 100644 --- a/lib2/h3-arrays.adb +++ b/lib2/h3-arrays.adb @@ -347,15 +347,15 @@ package body H3.Arrays is return System_Size'First; end Find; - function "=" (Obj: in Elastic_Array; Obj2: in Elastic_Array) return Boolean is + function Equals (Obj: in Elastic_Array; Obj2: in Elastic_Array) return Boolean is begin return Obj.Buffer = Obj2.Buffer or else Obj.Buffer.Slot(Get_First_Index(Obj) .. Get_Last_Index(Obj)) = Obj2.Buffer.Slot(Get_First_Index(Obj2) .. Get_Last_Index(Obj2)); - end "="; + end Equals; - function "=" (Obj: in Elastic_Array; Obj2: in Item_Array) return Boolean is + function Equals (Obj: in Elastic_Array; Obj2: in Item_Array) return Boolean is begin return Obj.Buffer.Slot(Get_First_Index(Obj) .. Get_Last_Index(Obj)) = Obj2; - end "="; + end Equals; -- --------------------------------------------------------------------- -- Controlled Management diff --git a/lib2/h3-arrays.ads b/lib2/h3-arrays.ads index 2fc2ba9..ec77ab1 100644 --- a/lib2/h3-arrays.ads +++ b/lib2/h3-arrays.ads @@ -69,8 +69,11 @@ package H3.Arrays is function Find (Obj: in Elastic_Array; V: in Item_Type; Start_Pos: in System_Index; Find_Dir: in Direction := DIRECTION_FORWARD) return System_Size; function Find (Obj: in Elastic_Array; V: in Item_Array; Start_Pos: in System_Index; Find_Dir: in Direction := DIRECTION_FORWARD) return System_Size; - function "=" (Obj: in Elastic_Array; Obj2: in Elastic_Array) return Boolean; - function "=" (Obj: in Elastic_Array; Obj2: in Item_Array) return Boolean; + function Equals (Obj: in Elastic_Array; Obj2: in Elastic_Array) return Boolean; + function Equals (Obj: in Elastic_Array; Obj2: in Item_Array) return Boolean; + + function "=" (Obj: in Elastic_Array; Obj2: in Elastic_Array) return Boolean renames Equals; + function "=" (Obj: in Elastic_Array; Obj2: in Item_Array) return Boolean renames Equals; private type Buffer_Record(Capa: System_Size) is limited record diff --git a/lib2/h3-compilers.adb b/lib2/h3-compilers.adb index 87d263e..79dc048 100644 --- a/lib2/h3-compilers.adb +++ b/lib2/h3-compilers.adb @@ -1,7 +1,12 @@ +with H3.Utf8; with ada.text_io; package body H3.Compilers is + type Char_Array is array(System_Index range<>) of Standard.Character; + package Utf8 is new H3.Utf8(Standard.Character, S.Rune, Char_Array, S.Rune_Array); + LB_EOF: constant S.Rune_Array := (R.V.Left_Arrow,R.V.UC_E,R.V.UC_O,R.V.UC_F,R.V.Right_Arrow); -- + LB_XINCLUDE: constant S.Rune_Array := (R.V.Number_Sign,R.V.LC_I,R.V.LC_N,R.V.LC_C,R.V.LC_L,R.V.LC_U,R.V.LC_D,R.V.LC_E); -- #include procedure Set_Lexer_State (C: in out Compiler; State: in Lexer_State) is begin @@ -21,21 +26,13 @@ package body H3.Compilers is Set_Lexer_State (C, State, R.To_Rune(Code)); end Set_Lexer_State; - procedure Got_Token (C: in out Compiler) is + procedure Set_Parser_State (C: in out Compiler; State: in Parser_State) is begin - --case C.P.State IS - -- when START => - -- null; - --end case; - -ada.text_io.put (C.Tk.Id'Img); -ada.text_io.put (" "); -for i in C.Tk.Buf.Get_First_Index .. C.Tk.Buf.Get_Last_Index loop - ada.text_io.put (standard.character'val(S.Rune'Pos(C.Tk.Buf.Get_Item(i)))); -end loop; -ada.text_io.put_line(""); - + C.Ps.State := State; + end Set_Parser_State; + procedure Parse_Start (C: in out Compiler) is + begin case C.Tk.Id is when TK_BSTR => null; @@ -46,8 +43,11 @@ ada.text_io.put_line(""); when TK_CSTR => null; when TK_DIRECTIVE => - --Push_Feed_Layer (... - null; + if C.Tk.Buf.Equals(LB_XINCLUDE) then + Set_Parser_State (C, PS_INCLUDE); + else + raise Syntax_Error; + end if; when TK_EOF => null; when TK_EOL => @@ -65,6 +65,69 @@ ada.text_io.put_line(""); when TK_SEMICOLON => null; end case; + end Parse_Start; + + procedure Start_Inclusion (C: in out Compiler; Name: in S.Rune_Array) is + Top: System_Index; + begin + if C.St.Top = C.St.Items'Last then + raise Syntax_Error; -- TODO: inclusion depth too deep + end if; + + Top := C.St.Top + 1; + Ada.Text_IO.Open (C.St.Items(Top).Handle, Ada.Text_IO.In_File, Standard.String(Utf8.From_Unicode_String(Name))); + C.St.Top := Top; + end Start_Inclusion; + + procedure End_Inclusion (C: in out Compiler) is + begin + Ada.Text_IO.Close (C.St.Items(C.St.Top).Handle); + C.St.Top := C.St.Top - 1; + end End_Inclusion; + + procedure Parse_Include (C: in out Compiler) is + begin + if C.Tk.Id = TK_CSTR then + -- arrange to feed more data from the included file. + Start_Inclusion (C, S.To_Rune_Array(C.Tk.Buf)); + null; + else + raise Syntax_Error; -- string literal required + end if; + end Parse_Include; + + procedure Parse_Include_End (C: in out Compiler) is + begin + if C.Tk.Id /= TK_SEMICOLON then + raise Syntax_Error; + end if; + + -- TODO: put the state back to START??? + end Parse_Include_End; + + procedure Got_Token (C: in out Compiler) is + begin + --case C.P.State IS + -- when START => + -- null; + --end case; + +ada.text_io.put (C.Tk.Id'Img); +ada.text_io.put (" "); +for i in C.Tk.Buf.Get_First_Index .. C.Tk.Buf.Get_Last_Index loop + ada.text_io.put (standard.character'val(S.Rune'Pos(C.Tk.Buf.Get_Item(i)))); +end loop; +ada.text_io.put_line(""); + + case C.Ps.State is + when PS_START => + Parse_Start (C); + when PS_INCLUDE => + Parse_Include (C); + when others => + raise Syntax_Error; -- TODO: change this... + end case; + end Got_Token; procedure Start_Token (C: in out Compiler) is @@ -208,6 +271,18 @@ end if; begin for i in Data'Range loop Feed_Char_Code (C, R.To_Code(Data(i))); + + if C.St.Top > 0 then + declare + Ch: Standard.Character; + begin + while not Ada.Text_IO.End_Of_File(C.St.Items(C.St.Top).Handle) loop + Ada.Text_IO.Get (C.St.Items(C.St.Top).Handle, Ch); + Feed_Char_Code (C, Standard.Character'Pos(Ch)); + --if inclusion stack is not Empty??? + end loop; + end; + end if; end loop; end Feed; diff --git a/lib2/h3-compilers.ads b/lib2/h3-compilers.ads index 5da7dc2..c9aa9a8 100644 --- a/lib2/h3-compilers.ads +++ b/lib2/h3-compilers.ads @@ -1,5 +1,6 @@ with H3.Runes; with H3.Strings; +with Ada.Text_IO; generic type Rune_Type is (<>); @@ -9,7 +10,7 @@ package H3.Compilers is Syntax_Error: exception; - type Compiler is tagged private; + type Compiler is tagged limited private; procedure Feed (C: in out Compiler; Data: in S.Rune_Array); procedure End_Feed (C: in out Compiler); @@ -28,6 +29,17 @@ private State: Lexer_State := LX_START; end record; + type Stream is record + Handle: Ada.Text_IO.File_Type; + --Handle: System_Size; + end record; + + type Stream_Array is array(System_Index range <>) of Stream; + type Stream_Stack(Capa: System_Index) is record + Items: Stream_Array(System_Index'First .. Capa); + Top: System_Size := 0; + end record; + type Token_Id is ( TK_BSTR, TK_BYTE, @@ -48,13 +60,18 @@ private Buf: S.Elastic_String; end record; - type Parser_State is (START, INCLUDE); + type Parser_State is ( + PS_START, + PS_INCLUDE + ); type Parser is record - State: Parser_State := START; + State: Parser_State := PS_START; end record; - type Compiler is tagged record + type Compiler is tagged limited record Lx: Lexer; Tk: Token; + Ps: Parser; + St: Stream_Stack(32); end record; end H3.Compilers; diff --git a/lib2/h3-utf8.adb b/lib2/h3-utf8.adb new file mode 100644 index 0000000..80daa6f --- /dev/null +++ b/lib2/h3-utf8.adb @@ -0,0 +1,217 @@ +with Interfaces; + +package body H3.Utf8 is + +--|---------------------------------------------------------------------------- +--| From RFC 2279 UTF-8, a transformation format of ISO 10646 +--| +--| UCS-4 range (hex.) UTF-8 octet sequence (binary) +--| 1:2 00000000-0000007F 0xxxxxxx +--| 2:2 00000080-000007FF 110xxxxx 10xxxxxx +--| 3:2 00000800-0000FFFF 1110xxxx 10xxxxxx 10xxxxxx +--| 4:4 00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx +--| inv 00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx +--| inv 04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx +--|---------------------------------------------------------------------------- + + --type Uint8 is mod 2 ** 8; + --type Uint32 is mod 2 ** 32; + use type Interfaces.Unsigned_8; + use type Interfaces.Unsigned_32; + subtype Uint8 is Interfaces.Unsigned_8; + subtype Uint32 is Interfaces.Unsigned_32; + + type Conv_Record is record + Lower: Uint32; + Upper: Uint32; + + Fbyte: Uint8; + Mask: Uint8; -- Mask for getting the fixed bits in the first byte. + -- (First-Byte and Mask) = Fbyte + + Fmask: Uint8; -- Mask for getting the actual values bits off the first byte. + + Length: System_Size; -- Number of bytes + end record; + + type Conv_Record_Array is array(System_Index range<>) of Conv_Record; + + Conv_Table: constant Conv_Record_Array := ( + (16#0000_0000#, 16#0000_007F#, 2#0000_0000#, 2#1000_0000#, 2#0111_1111#, 1), + (16#0000_0080#, 16#0000_07FF#, 2#1100_0000#, 2#1110_0000#, 2#0001_1111#, 2), + (16#0000_0800#, 16#0000_FFFF#, 2#1110_0000#, 2#1111_0000#, 2#0000_1111#, 3), + (16#0001_0000#, 16#001F_FFFF#, 2#1111_0000#, 2#1111_1000#, 2#0000_0111#, 4), + (16#0020_0000#, 16#03FF_FFFF#, 2#1111_1000#, 2#1111_1100#, 2#0000_0011#, 5), + (16#0400_0000#, 16#7FFF_FFFF#, 2#1111_1100#, 2#1111_1110#, 2#0000_0001#, 6) + ); + + function Get_Utf8_Slot (UV: in Uint32) return System_Size is + pragma Inline (Get_Utf8_Slot); + begin + for I in Conv_Table'Range loop + if UV >= Conv_Table(I).Lower and then UV <= Conv_Table(I).Upper then + return I; + end if; + end loop; + return System_Size'First; + end Get_Utf8_Slot; + + function From_Unicode_Character (Chr: in Unicode_Character) return Utf8_String is + UV: Uint32; + I: System_Size; + begin + UV := Unicode_Character'Pos(Chr); + + I := Get_Utf8_Slot(UV); + if I not in System_Index'Range then + raise Invalid_Unicode_Character; + end if; + + declare + Result: Utf8_String (1 .. System_Index(Conv_Table(I).Length)); + begin + for J in reverse Result'First + 1 .. Result'Last loop + -- 2#0011_1111#: 16#3F# + -- 2#1000_0000#: 16#80# + Result(J) := Utf8_Character'Val((UV and Uint32'(2#0011_1111#)) or Uint32'(2#1000_0000#)); + --UV := UV / (2 ** 6); --UV := UV >> 6; + UV := Interfaces.Shift_Right (UV, 6); + end loop; + + Result(Result'First) := Utf8_Character'Val(UV or Uint32(Conv_Table(I).Fbyte)); + return Result; + end; + end From_Unicode_Character; + + function From_Unicode_String (Str: in Unicode_String) return Utf8_String is + -- this function has high stack pressure if the input string is too long + -- TODO: create a procedure to overcome this problem. + Tmp: System_Size; + begin + -- Calculate the length first + Tmp := 0; + for I in Str'Range loop + declare + Utf8: Utf8_String := From_Unicode_Character(Chr => Str(I)); + begin + Tmp := Tmp + Utf8'Length; + end; + end loop; + + declare + Result: Utf8_String (1 .. Tmp); + begin + Tmp := Result'First; + for I in Str'Range loop + declare + Utf8: Utf8_String := From_Unicode_Character(Str(I)); + begin + Result(Tmp .. Tmp + Utf8'Length - 1) := Utf8; + Tmp := Tmp + Utf8'Length; + end; + end loop; + return Result; + end; + end From_Unicode_String; + + function Sequence_Length (Seq: in Utf8_Character) return System_Size is + begin + for I in Conv_Table'Range loop + if (Utf8_Character'Pos(Seq) and Conv_Table(I).Mask) = Conv_Table(I).Fbyte then + return Conv_Table(I).Length; + end if; + end loop; + return System_Size'First; + end Sequence_Length; + + procedure To_Unicode_Character (Seq: in Utf8_String; + Seq_Len: out System_Size; + Chr: out Unicode_Character) is + W: Uint32; + begin + for I in Conv_Table'Range loop + + -- Check if the first byte matches the desired bit patterns. + if (Utf8_Character'Pos(Seq(Seq'First)) and Conv_Table(I).Mask) = Conv_Table(I).Fbyte then + + if Seq'Length < Conv_Table(I).Length then + raise Insufficient_Utf8_Sequence; + end if; + + -- Get the values bits off the first byte. + W := Utf8_Character'Pos(Seq(Seq'First)) and Uint32(Conv_Table(I).Fmask); + + -- Get the values bits off subsequent bytes. + for J in 1 .. Conv_Table(I).Length - 1 loop + if (Utf8_Character'Pos(Seq(Seq'First + J)) and Uint32'(2#1100_0000#)) /= Uint32'(2#1000_0000#) then + -- Each UTF8 byte except the first must be set with 2#1000_0000. + raise Invalid_Utf8_Sequence; + end if; + W := Interfaces.Shift_Left(W, 6) or (Utf8_Character'Pos(Seq(Seq'First + J)) and Uint32'(2#0011_1111#)); + end loop; + + -- Return the character matching the word + Chr := Unicode_Character'Val(W); + Seq_Len := Conv_Table(I).Length; + return; + end if; + end loop; + + raise Invalid_Utf8_Sequence; + end To_Unicode_Character; + + function To_Unicode_Character (Seq: in Utf8_String) return Unicode_Character is + Seq_Len: System_Size; + Chr: Unicode_Character; + begin + To_Unicode_Character (Seq, Seq_Len, Chr); + return Chr; + end To_Unicode_Character; + + procedure To_Unicode_String (Seq: in Utf8_String; + Seq_Len: out System_Size; + Str: out Unicode_String; + Str_Len: out System_Size) is + Seq_Pos: System_Index := Seq'First; + Str_Pos: System_Index := Str'First; + Len: System_Size; + begin + while Seq_Pos <= Seq'Last and then Str_Pos <= Str'Last loop + To_Unicode_Character(Seq(Seq_Pos .. Seq'Last), Len, Str(Str_Pos)); + Seq_Pos := Seq_Pos + Len; + Str_Pos := Str_Pos + 1; + end loop; + + Seq_Len := Seq_Pos - Seq'First; + Str_Len := Str_Pos - Str'First; + end To_Unicode_String; + + function To_Unicode_String (Seq: in Utf8_String) return Unicode_String is + UL: System_Size := 0; + begin + declare + Chr: Unicode_Character; + Pos: System_Index := Seq'First; + Seq_Len: System_Size; + begin + while Pos <= Seq'Last loop + To_Unicode_Character(Seq(Pos .. Seq'Last), Seq_Len, Chr); + UL := UL + 1; + Pos := Pos + Seq_Len; + end loop; + end; + + declare + Str: Unicode_String (1 .. UL); + Pos: System_Index := Seq'First; + Seq_Len: System_Size; + begin + for I in Str'Range loop + To_Unicode_Character(Seq(Pos .. Seq'Last), Seq_Len, Str(I)); + Pos := Pos + Seq_Len; + end loop; + return Str; + end; + end To_Unicode_String; + +end H3.Utf8; diff --git a/lib2/h3-utf8.ads b/lib2/h3-utf8.ads new file mode 100644 index 0000000..6d3e2e8 --- /dev/null +++ b/lib2/h3-utf8.ads @@ -0,0 +1,51 @@ +generic + type Slim_Character is (<>); + type Wide_Character is (<>); + type Slim_String is array(System_Index range<>) of Slim_Character; + type Wide_String is array(System_Index range<>) of Wide_Character; +package H3.Utf8 is + --pragma Preelaborate (Utf8); + + --Invalid_Unicode_Character: exception renames Invalid_Wide_Character; + --Invalid_Utf8_Sequence: exception renames Invalid_Slim_Sequence; + --Insufficient_Utf8_Sequence: exception renames Insifficient_Slim_Sequence; + Invalid_Unicode_Character: exception; + Invalid_Utf8_Sequence: exception; + Insufficient_Utf8_Sequence: exception; + + subtype Utf8_Character is Slim_Character; + subtype Unicode_Character is Wide_Character; + subtype Utf8_String is Slim_String; + subtype Unicode_String is Wide_String; + subtype Utf8_Sequence is Utf8_String; + + --type Unicode_Character_Kit is record + -- Seq: System_Size; -- sequence length + -- Chr: Unicode_Character; + --end record; + + --type Unicode_String_Kit(Length: System_Size) is record + -- Seq: System_Size; + -- Str: Unicode_String(System_Index'First .. Length); + --end record; + + function From_Unicode_Character (Chr: in Unicode_Character) return Utf8_String; + function From_Unicode_String (Str: in Unicode_String) return Utf8_String; + --| The Sequence_Length function returns the length of a full UTF8 + --| sequence representing a single Unicode character given the first + --| sequence byte. It returns 0 if the first byte is invalid. + function Sequence_Length (Seq: in Utf8_Character) return System_Size; + + procedure To_Unicode_Character (Seq: in Utf8_String; + Seq_Len: out System_Size; + Chr: out Unicode_Character); + + procedure To_Unicode_String (Seq: in Utf8_String; + Seq_Len: out System_Size; + Str: out Unicode_String; + Str_Len: out System_Size); + + function To_Unicode_Character (Seq: in Utf8_String) return Unicode_Character; + function To_Unicode_String (Seq: in Utf8_String) return Unicode_String; + +end H3.Utf8; diff --git a/lib2/hello3.adb b/lib2/hello3.adb index dcbc777..30df9aa 100644 --- a/lib2/hello3.adb +++ b/lib2/hello3.adb @@ -6,6 +6,6 @@ procedure hello3 is Compiler: C.Compiler; begin - Compiler.Feed ("<<=hello world"); + Compiler.Feed ("#include 'abc.txt'"); Compiler.End_Feed; end hello3;