with Interfaces;

package body H2.Utf8 is

--|----------------------------------------------------------------------------
--| From RFC 2279 UTF-8, a transformation format of ISO 10646
--|
--|    UCS-4 range (hex.) UTF-8 octet sequence (binary)
--| 1:2 00000000-0000007F 0xxxxxxx
--| 2:2 00000080-000007FF 110xxxxx 10xxxxxx
--| 3:2 00000800-0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
--| 4:4 00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
--| inv 00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
--| inv 04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
--|----------------------------------------------------------------------------

	--type Uint8 is mod 2 ** 8;
	--type Uint32 is mod 2 ** 32;
	use type Interfaces.Unsigned_8;
	use type Interfaces.Unsigned_32;
	subtype Uint8 is Interfaces.Unsigned_8;
	subtype Uint32 is Interfaces.Unsigned_32;

	type Conv_Record is record
		Lower: Uint32;
		Upper: Uint32;

		Fbyte: Uint8; 
		Mask: Uint8; -- Mask for getting the fixed bits in the first byte.
		             -- (First-Byte and Mask) = Fbyte

		Fmask: Uint8; -- Mask for getting the actual values bits off the first byte.

		Length: System_Length; -- Number of bytes
	end record;

	type Conv_Record_Array is array(System_Index range<>) of Conv_Record;

	Conv_Table: constant Conv_Record_Array := (
		(16#0000_0000#, 16#0000_007F#, 2#0000_0000#, 2#1000_0000#, 2#0111_1111#, 1),
		(16#0000_0080#, 16#0000_07FF#, 2#1100_0000#, 2#1110_0000#, 2#0001_1111#, 2),
		(16#0000_0800#, 16#0000_FFFF#, 2#1110_0000#, 2#1111_0000#, 2#0000_1111#, 3),
		(16#0001_0000#, 16#001F_FFFF#, 2#1111_0000#, 2#1111_1000#, 2#0000_0111#, 4),
		(16#0020_0000#, 16#03FF_FFFF#, 2#1111_1000#, 2#1111_1100#, 2#0000_0011#, 5),
		(16#0400_0000#, 16#7FFF_FFFF#, 2#1111_1100#, 2#1111_1110#, 2#0000_0001#, 6)
	);

	function Get_Utf8_Slot (UV: in Uint32) return System_Length is
		pragma Inline (Get_Utf8_Slot);
	begin
		for I in Conv_Table'Range loop
			if UV >= Conv_Table(I).Lower and then UV <= Conv_Table(I).Upper then
				return I;
			end if;
		end loop;
		return System_Length'First;
	end  Get_Utf8_Slot;

	function From_Unicode_Character (Chr: in Unicode_Character) return Utf8_String is
		UV: Uint32;
		I: System_Length;
	begin
		UV := Unicode_Character'Pos(Chr);

		I := Get_Utf8_Slot(UV);
		if I not in System_Index'Range then
			raise Invalid_Unicode_Character;
		end if;

		declare
			Result: Utf8_String (1 .. System_Index(Conv_Table(I).Length));
		begin
			for J in reverse Result'First + 1 .. Result'Last loop
				-- 2#0011_1111#: 16#3F#
				-- 2#1000_0000#: 16#80#
				Result(J) := Utf8_Character'Val((UV and Uint32'(2#0011_1111#)) or Uint32'(2#1000_0000#));
				--UV := UV / (2 ** 6); --UV := UV >> 6;
				UV := Interfaces.Shift_Right (UV, 6);
			end loop;

			Result(Result'First) := Utf8_Character'Val(UV or Uint32(Conv_Table(I).Fbyte));
			return Result;
		end;
	end From_Unicode_Character;

	function From_Unicode_String (Str: in Unicode_String) return Utf8_String is
		-- this function has high stack pressure if the input string is too long
		-- TODO: create a procedure to overcome this problem.
		Tmp: System_Length;
	begin
		-- Calculate the length first
		Tmp := 0;
		for I in Str'Range loop
			declare
				Utf8: Utf8_String := From_Unicode_Character(Chr => Str(I));
			begin
				Tmp := Tmp + Utf8'Length;
			end;
		end loop;

		declare
			Result: Utf8_String (1 .. Tmp);
		begin
			Tmp := Result'First;
			for I in Str'Range loop
				declare
					Utf8: Utf8_String := From_Unicode_Character(Str(I));
				begin
					Result(Tmp .. Tmp + Utf8'Length - 1) := Utf8;
					Tmp := Tmp + Utf8'Length;
				end;
			end loop;
			return Result;
		end;
	end From_Unicode_String;

	function Sequence_Length (Seq: in Utf8_Character) return System_Length is
	begin
		for I in Conv_Table'Range loop
			if (Utf8_Character'Pos(Seq) and Conv_Table(I).Mask) = Conv_Table(I).Fbyte then
				return Conv_Table(I).Length;
			end if;
		end loop;
		return System_Length'First;
	end Sequence_Length;

	procedure To_Unicode_Character (Seq:     in  Utf8_String; 
	                                Seq_Len: out System_Length;
	                                Chr:     out Unicode_Character) is
		W: Uint32;
	begin
		for I in Conv_Table'Range loop

			-- Check if the first byte matches the desired bit patterns.
			if (Utf8_Character'Pos(Seq(Seq'First)) and Conv_Table(I).Mask) = Conv_Table(I).Fbyte then
				
				if Seq'Length < Conv_Table(I).Length then
					raise Insufficient_Utf8_Sequence;
				end if;

				-- Get the values bits off the first byte.
				W := Utf8_Character'Pos(Seq(Seq'First)) and Uint32(Conv_Table(I).Fmask);

				-- Get the values bits off subsequent bytes.
				for J in 1 .. Conv_Table(I).Length - 1 loop
					if (Utf8_Character'Pos(Seq(Seq'First + J)) and Uint32'(2#1100_0000#)) /= Uint32'(2#1000_0000#) then
						-- Each UTF8 byte except the first must be set with 2#1000_0000.
						raise Invalid_Utf8_Sequence;
					end if;
					W := Interfaces.Shift_Left(W, 6) or (Utf8_Character'Pos(Seq(Seq'First + J)) and Uint32'(2#0011_1111#)); 
				end loop;

				-- Return the character matching the word
				Chr := Unicode_Character'Val(W);
				Seq_Len := Conv_Table(I).Length;
				return;
			end if;
		end loop;
		
		raise Invalid_Utf8_Sequence;
	end To_Unicode_Character;

	function To_Unicode_Character (Seq: in Utf8_String) return Unicode_Character is
		Seq_Len: System_Length;
		Chr: Unicode_Character;
	begin
		To_Unicode_Character (Seq, Seq_Len, Chr);
		return Chr;
	end To_Unicode_Character;

	procedure To_Unicode_String (Seq:     in  Utf8_String; 
	                             Seq_Len: out System_Length;
	                             Str:     out Unicode_String;
	                             Str_Len: out System_Length) is
		Seq_Pos: System_Index := Seq'First;
		Str_Pos: System_Index := Str'First;
		Len: System_Length;
	begin
		while Seq_Pos <= Seq'Last and then Str_Pos <= Str'Last loop
			To_Unicode_Character(Seq(Seq_Pos .. Seq'Last), Len, Str(Str_Pos));
			Seq_Pos := Seq_Pos + Len;
			Str_Pos := Str_Pos + 1;
		end loop;

		Seq_Len := Seq_Pos - Seq'First;
		Str_Len := Str_Pos - Str'First;
	end To_Unicode_String;

	function To_Unicode_String (Seq: in Utf8_String) return Unicode_String is
		UL: System_Length := 0;
	begin
		declare
			Chr: Unicode_Character;
			Pos: System_Index := Seq'First;
			Seq_Len: System_Length;
		begin
			while Pos <= Seq'Last loop
				To_Unicode_Character(Seq(Pos .. Seq'Last), Seq_Len, Chr);
				UL := UL + 1;
				Pos := Pos + Seq_Len;
			end loop;
		end;

		declare
			Str: Unicode_String (1 .. UL);
			Pos: System_Index := Seq'First;
			Seq_Len: System_Length;
		begin
			for I in Str'Range loop
				To_Unicode_Character(Seq(Pos .. Seq'Last), Seq_Len, Str(I));
				Pos := Pos + Seq_Len;
			end loop;
			return Str;
		end;
	end To_Unicode_String;

end H2.Utf8;