2014-01-15 09:21:26 +00:00
|
|
|
package body H2.Utf8 is
|
2014-01-14 14:22:06 +00:00
|
|
|
|
|
|
|
type Uint8 is mod 2 ** 8;
|
|
|
|
type Uint32 is mod 2 ** 32;
|
|
|
|
|
|
|
|
type Conv_Record is record
|
|
|
|
Lower: Uint32;
|
|
|
|
Upper: Uint32;
|
|
|
|
Fbyte: Uint8; -- Mask to the first utf8 byte */
|
|
|
|
Mask: Uint8;
|
|
|
|
Fmask: Uint8;
|
|
|
|
Length: Uint8; -- number of bytes
|
|
|
|
end record;
|
|
|
|
|
|
|
|
type Conv_Record_Array is array(System_Index range<>) of Conv_Record;
|
|
|
|
|
|
|
|
Conv_Table: constant Conv_Record_Array := (
|
|
|
|
(16#0000_0000#, 16#0000_007F#, 16#00#, 16#80#, 16#7F#, 1),
|
|
|
|
(16#0000_0080#, 16#0000_07FF#, 16#C0#, 16#E0#, 16#1F#, 2),
|
|
|
|
(16#0000_0800#, 16#0000_FFFF#, 16#E0#, 16#F0#, 16#0F#, 3),
|
|
|
|
(16#0001_0000#, 16#001F_FFFF#, 16#F0#, 16#F8#, 16#07#, 4),
|
|
|
|
(16#0020_0000#, 16#03FF_FFFF#, 16#F8#, 16#FC#, 16#03#, 5),
|
|
|
|
(16#0400_0000#, 16#7FFF_FFFF#, 16#FC#, 16#FE#, 16#01#, 6)
|
|
|
|
);
|
|
|
|
|
2014-01-15 09:21:26 +00:00
|
|
|
function Get_Utf8_Slot (UV: in Uint32) return System_Size is
|
|
|
|
pragma Inline (Get_Utf8_Slot);
|
2014-01-14 14:22:06 +00:00
|
|
|
begin
|
|
|
|
for I in Conv_Table'Range loop
|
|
|
|
if UV >= Conv_Table(I).Lower and then UV <= Conv_Table(I).Upper then
|
|
|
|
return I;
|
|
|
|
end if;
|
|
|
|
end loop;
|
|
|
|
return System_Size'First;
|
2014-01-15 09:21:26 +00:00
|
|
|
end Get_Utf8_Slot;
|
2014-01-14 14:22:06 +00:00
|
|
|
|
2014-01-15 09:21:26 +00:00
|
|
|
function Unicode_To_Utf8 (UC: in Unicode_Character) return Utf8_String is
|
2014-01-14 14:22:06 +00:00
|
|
|
UV: Uint32;
|
|
|
|
I: System_Size;
|
|
|
|
begin
|
|
|
|
UV := Unicode_Character'Pos(UC);
|
|
|
|
|
2014-01-15 09:21:26 +00:00
|
|
|
I := Get_Utf8_Slot(UV);
|
2014-01-14 14:22:06 +00:00
|
|
|
if I not in System_Index'Range then
|
|
|
|
raise Invalid_Unicode_Character;
|
|
|
|
end if;
|
|
|
|
|
|
|
|
declare
|
2014-01-15 09:21:26 +00:00
|
|
|
Result: Utf8_String (1 .. System_Index(Conv_Table(I).Length));
|
2014-01-14 14:22:06 +00:00
|
|
|
begin
|
2014-01-15 09:21:26 +00:00
|
|
|
for J in reverse Result'First + 1 .. Result'Last loop
|
2014-01-14 14:22:06 +00:00
|
|
|
-- 2#0011_1111#: 16#3F#
|
|
|
|
-- 2#1000_0000#: 16#80#
|
2014-01-15 09:21:26 +00:00
|
|
|
Result(J) := Utf8_Character'Val((UV and 2#0011_1111#) or 2#1000_0000#);
|
2014-01-14 14:22:06 +00:00
|
|
|
UV := UV / (2 ** 6); --UV := UV >> 6;
|
|
|
|
end loop;
|
|
|
|
|
2014-01-15 09:21:26 +00:00
|
|
|
Result(Result'First) := Utf8_Character'Val(UV or Uint32(Conv_Table(I).Fbyte));
|
2014-01-14 14:22:06 +00:00
|
|
|
return Result;
|
|
|
|
end;
|
2014-01-15 09:21:26 +00:00
|
|
|
end Unicode_To_Utf8;
|
2014-01-14 14:22:06 +00:00
|
|
|
|
|
|
|
|
2014-01-15 09:21:26 +00:00
|
|
|
function Unicode_To_Utf8 (US: in Unicode_String) return Utf8_String is
|
2014-03-09 18:01:38 +00:00
|
|
|
-- this function has high stack pressure if the input string is too long
|
2014-01-14 14:22:06 +00:00
|
|
|
-- TODO: create a procedure to overcome this problem.
|
|
|
|
Tmp: System_Size;
|
|
|
|
begin
|
|
|
|
Tmp := 0;
|
|
|
|
for I in US'Range loop
|
|
|
|
declare
|
2014-01-15 09:21:26 +00:00
|
|
|
Utf8: Utf8_String := Unicode_To_Utf8(US(I));
|
2014-01-14 14:22:06 +00:00
|
|
|
begin
|
2014-01-15 09:21:26 +00:00
|
|
|
Tmp := Tmp + Utf8'Length;
|
2014-01-14 14:22:06 +00:00
|
|
|
end;
|
|
|
|
end loop;
|
|
|
|
|
|
|
|
declare
|
2014-01-15 09:21:26 +00:00
|
|
|
Result: Utf8_String (1 .. Tmp);
|
2014-01-14 14:22:06 +00:00
|
|
|
begin
|
|
|
|
Tmp := Result'First;
|
|
|
|
for I in US'Range loop
|
|
|
|
declare
|
2014-01-15 09:21:26 +00:00
|
|
|
Utf8: Utf8_String := Unicode_To_Utf8(US(I));
|
2014-01-14 14:22:06 +00:00
|
|
|
begin
|
2014-01-15 09:21:26 +00:00
|
|
|
Result(Tmp .. Tmp + Utf8'Length - 1) := Utf8;
|
|
|
|
Tmp := Tmp + Utf8'Length;
|
2014-01-14 14:22:06 +00:00
|
|
|
end;
|
|
|
|
end loop;
|
|
|
|
return Result;
|
|
|
|
end;
|
2014-01-15 09:21:26 +00:00
|
|
|
end Unicode_To_Utf8;
|
2014-01-14 14:22:06 +00:00
|
|
|
|
2014-03-09 18:01:38 +00:00
|
|
|
procedure Utf8_To_Unicode (Utf8: in Utf8_String;
|
|
|
|
UC: out Unicode_Character) is
|
2014-01-14 14:22:06 +00:00
|
|
|
begin
|
|
|
|
null;
|
2014-01-15 09:21:26 +00:00
|
|
|
end Utf8_To_Unicode;
|
2014-01-14 14:22:06 +00:00
|
|
|
|
2014-03-09 18:01:38 +00:00
|
|
|
procedure Utf8_To_Unicode (Utf8: in Utf8_String;
|
|
|
|
US: in out Unicode_String) is
|
2014-01-14 14:22:06 +00:00
|
|
|
begin
|
|
|
|
null;
|
2014-01-15 09:21:26 +00:00
|
|
|
end Utf8_To_Unicode;
|
2014-01-14 14:22:06 +00:00
|
|
|
|
2014-01-15 09:21:26 +00:00
|
|
|
end H2.Utf8;
|