hcl/lib/h2-utf8.adb

111 lines
2.9 KiB
Ada
Raw Normal View History

2014-01-14 14:22:06 +00:00
with ada.text_io;
package body H2.UTF8 is
type Uint8 is mod 2 ** 8;
type Uint32 is mod 2 ** 32;
type Conv_Record is record
Lower: Uint32;
Upper: Uint32;
Fbyte: Uint8; -- Mask to the first utf8 byte */
Mask: Uint8;
Fmask: Uint8;
Length: Uint8; -- number of bytes
end record;
type Conv_Record_Array is array(System_Index range<>) of Conv_Record;
Conv_Table: constant Conv_Record_Array := (
(16#0000_0000#, 16#0000_007F#, 16#00#, 16#80#, 16#7F#, 1),
(16#0000_0080#, 16#0000_07FF#, 16#C0#, 16#E0#, 16#1F#, 2),
(16#0000_0800#, 16#0000_FFFF#, 16#E0#, 16#F0#, 16#0F#, 3),
(16#0001_0000#, 16#001F_FFFF#, 16#F0#, 16#F8#, 16#07#, 4),
(16#0020_0000#, 16#03FF_FFFF#, 16#F8#, 16#FC#, 16#03#, 5),
(16#0400_0000#, 16#7FFF_FFFF#, 16#FC#, 16#FE#, 16#01#, 6)
);
function Get_UTF8_Slot (UV: in Uint32) return System_Size is
pragma Inline (Get_UTF8_Slot);
begin
for I in Conv_Table'Range loop
if UV >= Conv_Table(I).Lower and then UV <= Conv_Table(I).Upper then
return I;
end if;
end loop;
return System_Size'First;
end Get_UTF8_Slot;
function Unicode_To_UTF8 (UC: in Unicode_Character) return UTF8_String is
UV: Uint32;
I: System_Size;
begin
UV := Unicode_Character'Pos(UC);
I := Get_UTF8_Slot(UV);
if I not in System_Index'Range then
raise Invalid_Unicode_Character;
end if;
declare
subtype Result_String is UTF8_String(1 .. System_Index(Conv_Table(I).Length));
Result: Result_String;
begin
for J in reverse Result_String'First + 1 .. Result_String'Last loop
-- 2#0011_1111#: 16#3F#
-- 2#1000_0000#: 16#80#
Result(J) := UTF8_Character'Val((UV and 2#0011_1111#) or 2#1000_0000#);
UV := UV / (2 ** 6); --UV := UV >> 6;
end loop;
Result(Result_String'First) := UTF8_Character'Val(UV or Uint32(Conv_Table(I).Fbyte));
return Result;
end;
end Unicode_To_UTF8;
function Unicode_To_UTF8 (US: in Unicode_String) return UTF8_String is
-- this function has high stack pressur if the input string is too long
-- TODO: create a procedure to overcome this problem.
Tmp: System_Size;
begin
Tmp := 0;
for I in US'Range loop
declare
UTF8: UTF8_String := Unicode_To_UTF8(US(I));
begin
Tmp := Tmp + UTF8'Length;
end;
end loop;
declare
subtype Result_String is UTF8_String(1 .. Tmp);
Result: Result_String;
begin
Tmp := Result'First;
for I in US'Range loop
declare
UTF8: UTF8_String := Unicode_To_UTF8(US(I));
begin
Result(Tmp .. Tmp + UTF8'Length - 1) := UTF8;
Tmp := Tmp + UTF8'Length;
end;
end loop;
return Result;
end;
end Unicode_To_UTF8;
procedure UTF8_To_Unicode (UTF8: in UTF8_String;
UC: out Unicode_Character) is
begin
null;
end UTF8_To_Unicode;
procedure UTF8_To_Unicode (UTF8: in UTF8_String;
US: in out Unicode_String) is
begin
null;
end UTF8_To_Unicode;
end H2.UTF8;