added an incomplete and experimental file package
This commit is contained in:
@ -1,29 +1,51 @@
|
||||
with Interfaces;
|
||||
|
||||
package body H2.Utf8 is
|
||||
|
||||
type Uint8 is mod 2 ** 8;
|
||||
type Uint32 is mod 2 ** 32;
|
||||
--|----------------------------------------------------------------------------
|
||||
--| From RFC 2279 UTF-8, a transformation format of ISO 10646
|
||||
--|
|
||||
--| UCS-4 range (hex.) UTF-8 octet sequence (binary)
|
||||
--| 1:2 00000000-0000007F 0xxxxxxx
|
||||
--| 2:2 00000080-000007FF 110xxxxx 10xxxxxx
|
||||
--| 3:2 00000800-0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
|
||||
--| 4:4 00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
--| inv 00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
--| inv 04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
--|----------------------------------------------------------------------------
|
||||
|
||||
--type Uint8 is mod 2 ** 8;
|
||||
--type Uint32 is mod 2 ** 32;
|
||||
use type Interfaces.Unsigned_8;
|
||||
use type Interfaces.Unsigned_32;
|
||||
subtype Uint8 is Interfaces.Unsigned_8;
|
||||
subtype Uint32 is Interfaces.Unsigned_32;
|
||||
|
||||
type Conv_Record is record
|
||||
Lower: Uint32;
|
||||
Upper: Uint32;
|
||||
Fbyte: Uint8; -- Mask to the first utf8 byte */
|
||||
Mask: Uint8;
|
||||
Fmask: Uint8;
|
||||
Length: Uint8; -- number of bytes
|
||||
Lower: Uint32;
|
||||
Upper: Uint32;
|
||||
|
||||
Fbyte: Uint8;
|
||||
Mask: Uint8; -- Mask for getting the fixed bits in the first byte.
|
||||
-- (First-Byte and Mask) = Fbyte
|
||||
|
||||
Fmask: Uint8; -- Mask for getting the actual values bits off the first byte.
|
||||
|
||||
Length: System_Length; -- Number of bytes
|
||||
end record;
|
||||
|
||||
type Conv_Record_Array is array(System_Index range<>) of Conv_Record;
|
||||
|
||||
Conv_Table: constant Conv_Record_Array := (
|
||||
(16#0000_0000#, 16#0000_007F#, 16#00#, 16#80#, 16#7F#, 1),
|
||||
(16#0000_0080#, 16#0000_07FF#, 16#C0#, 16#E0#, 16#1F#, 2),
|
||||
(16#0000_0800#, 16#0000_FFFF#, 16#E0#, 16#F0#, 16#0F#, 3),
|
||||
(16#0001_0000#, 16#001F_FFFF#, 16#F0#, 16#F8#, 16#07#, 4),
|
||||
(16#0020_0000#, 16#03FF_FFFF#, 16#F8#, 16#FC#, 16#03#, 5),
|
||||
(16#0400_0000#, 16#7FFF_FFFF#, 16#FC#, 16#FE#, 16#01#, 6)
|
||||
(16#0000_0000#, 16#0000_007F#, 2#0000_0000#, 2#1000_0000#, 2#0111_1111#, 1),
|
||||
(16#0000_0080#, 16#0000_07FF#, 2#1100_0000#, 2#1110_0000#, 2#0001_1111#, 2),
|
||||
(16#0000_0800#, 16#0000_FFFF#, 2#1110_0000#, 2#1111_0000#, 2#0000_1111#, 3),
|
||||
(16#0001_0000#, 16#001F_FFFF#, 2#1111_0000#, 2#1111_1000#, 2#0000_0111#, 4),
|
||||
(16#0020_0000#, 16#03FF_FFFF#, 2#1111_1000#, 2#1111_1100#, 2#0000_0011#, 5),
|
||||
(16#0400_0000#, 16#7FFF_FFFF#, 2#1111_1100#, 2#1111_1110#, 2#0000_0001#, 6)
|
||||
);
|
||||
|
||||
function Get_Utf8_Slot (UV: in Uint32) return System_Size is
|
||||
function Get_Utf8_Slot (UV: in Uint32) return System_Length is
|
||||
pragma Inline (Get_Utf8_Slot);
|
||||
begin
|
||||
for I in Conv_Table'Range loop
|
||||
@ -31,45 +53,46 @@ package body H2.Utf8 is
|
||||
return I;
|
||||
end if;
|
||||
end loop;
|
||||
return System_Size'First;
|
||||
return System_Length'First;
|
||||
end Get_Utf8_Slot;
|
||||
|
||||
function Unicode_To_Utf8 (UC: in Unicode_Character) return Utf8_String is
|
||||
|
||||
function From_Unicode_Character (Chr: in Unicode_Character) return Utf8_String is
|
||||
UV: Uint32;
|
||||
I: System_Size;
|
||||
I: System_Length;
|
||||
begin
|
||||
UV := Unicode_Character'Pos(UC);
|
||||
|
||||
UV := Unicode_Character'Pos(Chr);
|
||||
|
||||
I := Get_Utf8_Slot(UV);
|
||||
if I not in System_Index'Range then
|
||||
raise Invalid_Unicode_Character;
|
||||
end if;
|
||||
|
||||
|
||||
declare
|
||||
Result: Utf8_String (1 .. System_Index(Conv_Table(I).Length));
|
||||
begin
|
||||
for J in reverse Result'First + 1 .. Result'Last loop
|
||||
-- 2#0011_1111#: 16#3F#
|
||||
-- 2#0011_1111#: 16#3F#
|
||||
-- 2#1000_0000#: 16#80#
|
||||
Result(J) := Utf8_Character'Val((UV and 2#0011_1111#) or 2#1000_0000#);
|
||||
UV := UV / (2 ** 6); --UV := UV >> 6;
|
||||
Result(J) := Utf8_Character'Val((UV and Uint32'(2#0011_1111#)) or Uint32'(2#1000_0000#));
|
||||
--UV := UV / (2 ** 6); --UV := UV >> 6;
|
||||
UV := Interfaces.Shift_Right (UV, 6);
|
||||
end loop;
|
||||
|
||||
Result(Result'First) := Utf8_Character'Val(UV or Uint32(Conv_Table(I).Fbyte));
|
||||
return Result;
|
||||
end;
|
||||
end Unicode_To_Utf8;
|
||||
end From_Unicode_Character;
|
||||
|
||||
|
||||
function Unicode_To_Utf8 (US: in Unicode_String) return Utf8_String is
|
||||
function From_Unicode_String (Str: in Unicode_String) return Utf8_String is
|
||||
-- this function has high stack pressure if the input string is too long
|
||||
-- TODO: create a procedure to overcome this problem.
|
||||
Tmp: System_Size;
|
||||
Tmp: System_Length;
|
||||
begin
|
||||
-- Calculate the length first
|
||||
Tmp := 0;
|
||||
for I in US'Range loop
|
||||
for I in Str'Range loop
|
||||
declare
|
||||
Utf8: Utf8_String := Unicode_To_Utf8(US(I));
|
||||
Utf8: Utf8_String := From_Unicode_Character(Chr => Str(I));
|
||||
begin
|
||||
Tmp := Tmp + Utf8'Length;
|
||||
end;
|
||||
@ -79,9 +102,9 @@ package body H2.Utf8 is
|
||||
Result: Utf8_String (1 .. Tmp);
|
||||
begin
|
||||
Tmp := Result'First;
|
||||
for I in US'Range loop
|
||||
for I in Str'Range loop
|
||||
declare
|
||||
Utf8: Utf8_String := Unicode_To_Utf8(US(I));
|
||||
Utf8: Utf8_String := From_Unicode_Character(Str(I));
|
||||
begin
|
||||
Result(Tmp .. Tmp + Utf8'Length - 1) := Utf8;
|
||||
Tmp := Tmp + Utf8'Length;
|
||||
@ -89,18 +112,106 @@ package body H2.Utf8 is
|
||||
end loop;
|
||||
return Result;
|
||||
end;
|
||||
end Unicode_To_Utf8;
|
||||
end From_Unicode_String;
|
||||
|
||||
procedure Utf8_To_Unicode (Utf8: in Utf8_String;
|
||||
UC: out Unicode_Character) is
|
||||
function Sequence_Length (Seq: in Utf8_Character) return System_Length is
|
||||
begin
|
||||
null;
|
||||
end Utf8_To_Unicode;
|
||||
for I in Conv_Table'Range loop
|
||||
if (Utf8_Character'Pos(Seq) and Conv_Table(I).Mask) = Conv_Table(I).Fbyte then
|
||||
return Conv_Table(I).Length;
|
||||
end if;
|
||||
end loop;
|
||||
return System_Length'First;
|
||||
end Sequence_Length;
|
||||
|
||||
procedure Utf8_To_Unicode (Utf8: in Utf8_String;
|
||||
US: in out Unicode_String) is
|
||||
procedure To_Unicode_Character (Seq: in Utf8_String;
|
||||
Seq_Len: out System_Length;
|
||||
Chr: out Unicode_Character) is
|
||||
W: Uint32;
|
||||
begin
|
||||
null;
|
||||
end Utf8_To_Unicode;
|
||||
for I in Conv_Table'Range loop
|
||||
|
||||
-- Check if the first byte matches the desired bit patterns.
|
||||
if (Utf8_Character'Pos(Seq(Seq'First)) and Conv_Table(I).Mask) = Conv_Table(I).Fbyte then
|
||||
|
||||
if Seq'Length < Conv_Table(I).Length then
|
||||
raise Insufficient_Utf8_Sequence;
|
||||
end if;
|
||||
|
||||
-- Get the values bits off the first byte.
|
||||
W := Utf8_Character'Pos(Seq(Seq'First)) and Uint32(Conv_Table(I).Fmask);
|
||||
|
||||
-- Get the values bits off subsequent bytes.
|
||||
for J in 1 .. Conv_Table(I).Length - 1 loop
|
||||
if (Utf8_Character'Pos(Seq(Seq'First + J)) and Uint32'(2#1100_0000#)) /= Uint32'(2#1000_0000#) then
|
||||
-- Each UTF8 byte except the first must be set with 2#1000_0000.
|
||||
raise Invalid_Utf8_Sequence;
|
||||
end if;
|
||||
W := Interfaces.Shift_Left(W, 6) or (Utf8_Character'Pos(Seq(Seq'First + J)) and Uint32'(2#0011_1111#));
|
||||
end loop;
|
||||
|
||||
-- Return the character matching the word
|
||||
Chr := Unicode_Character'Val(W);
|
||||
Seq_Len := Conv_Table(I).Length;
|
||||
return;
|
||||
end if;
|
||||
end loop;
|
||||
|
||||
raise Invalid_Utf8_Sequence;
|
||||
end To_Unicode_Character;
|
||||
|
||||
function To_Unicode_Character (Seq: in Utf8_String) return Unicode_Character is
|
||||
Seq_Len: System_Length;
|
||||
Chr: Unicode_Character;
|
||||
begin
|
||||
To_Unicode_Character (Seq, Seq_Len, Chr);
|
||||
return Chr;
|
||||
end To_Unicode_Character;
|
||||
|
||||
procedure To_Unicode_String (Seq: in Utf8_String;
|
||||
Seq_Len: out System_Length;
|
||||
Str: in out Unicode_String;
|
||||
Str_Len: out System_Length) is
|
||||
Seq_Pos: System_Index := Seq'First;
|
||||
Str_Pos: System_Index := Str'First;
|
||||
Len: System_Length;
|
||||
begin
|
||||
while Seq_Pos <= Seq'Last and then Str_Pos <= Str'Last loop
|
||||
To_Unicode_Character(Seq(Seq_Pos .. Seq'Last), Len, Str(Str_Pos));
|
||||
Seq_Pos := Seq_Pos + Len;
|
||||
Str_Pos := Str_Pos + 1;
|
||||
end loop;
|
||||
|
||||
Seq_Len := Seq_Pos - Seq'First;
|
||||
Str_Len := Str_Pos - Str'First;
|
||||
end To_Unicode_String;
|
||||
|
||||
function To_Unicode_String (Seq: in Utf8_String) return Unicode_String is
|
||||
UL: System_Length := 0;
|
||||
begin
|
||||
declare
|
||||
Chr: Unicode_Character;
|
||||
Pos: System_Index := Seq'First;
|
||||
Seq_Len: System_Length;
|
||||
begin
|
||||
while Pos <= Seq'Last loop
|
||||
To_Unicode_Character(Seq(Pos .. Seq'Last), Seq_Len, Chr);
|
||||
UL := UL + 1;
|
||||
Pos := Pos + Seq_Len;
|
||||
end loop;
|
||||
end;
|
||||
|
||||
declare
|
||||
Str: Unicode_String (1 .. UL);
|
||||
Pos: System_Index := Seq'First;
|
||||
Seq_Len: System_Length;
|
||||
begin
|
||||
for I in Str'Range loop
|
||||
To_Unicode_Character(Seq(Pos .. Seq'Last), Seq_Len, Str(I));
|
||||
Pos := Pos + Seq_Len;
|
||||
end loop;
|
||||
return Str;
|
||||
end;
|
||||
end To_Unicode_String;
|
||||
|
||||
end H2.Utf8;
|
||||
|
Reference in New Issue
Block a user