| 
									
										
										
										
											2014-05-30 03:15:40 +00:00
										 |  |  | with Interfaces; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-01-15 09:21:26 +00:00
										 |  |  | package body H2.Utf8 is | 
					
						
							| 
									
										
										
										
											2014-01-14 14:22:06 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-05-30 03:15:40 +00:00
										 |  |  | --|----------------------------------------------------------------------------
 | 
					
						
							|  |  |  | --| From RFC 2279 UTF-8, a transformation format of ISO 10646
 | 
					
						
							|  |  |  | --|
 | 
					
						
							|  |  |  | --|    UCS-4 range (hex.) UTF-8 octet sequence (binary)
 | 
					
						
							|  |  |  | --| 1:2 00000000-0000007F 0xxxxxxx
 | 
					
						
							|  |  |  | --| 2:2 00000080-000007FF 110xxxxx 10xxxxxx
 | 
					
						
							|  |  |  | --| 3:2 00000800-0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
 | 
					
						
							|  |  |  | --| 4:4 00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 | 
					
						
							|  |  |  | --| inv 00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 | 
					
						
							|  |  |  | --| inv 04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 | 
					
						
							|  |  |  | --|----------------------------------------------------------------------------
 | 
					
						
							|  |  |  | 
 | 
					
						
							-- Unsigned 8-bit and 32-bit modular aliases used throughout this body.
							-- They are subtypes of the Interfaces types (rather than locally declared
							-- "mod 2 ** N" types) so that Interfaces.Shift_Left and
							-- Interfaces.Shift_Right can be applied to them directly.
							subtype Uint8 is Interfaces.Unsigned_8;
							subtype Uint32 is Interfaces.Unsigned_32;

							-- Make the primitive operators ("and", "or", "=", "<", ...) of both
							-- modular types directly visible.
							use type Interfaces.Unsigned_8;
							use type Interfaces.Unsigned_32;
					
						
							| 
									
										
										
										
											2014-01-14 14:22:06 +00:00
										 |  |  | 
 | 
					
						
							-- One row of the UTF-8 conversion table. Each row describes one
							-- sequence length together with the value range it encodes.
							type Conv_Record is record
								-- Inclusive range of values encoded with this sequence length.
								Lower, Upper: Uint32;

								-- Fixed marker bits carried by the first byte of the sequence.
								Fbyte: Uint8;

								-- Selects the fixed bits of the first byte:
								-- (First-Byte and Mask) = Fbyte
								Mask: Uint8;

								-- Selects the value bits of the first byte.
								Fmask: Uint8;

								-- Number of bytes in the sequence.
								Length: System_Length;
							end record;

							type Conv_Record_Array is array(System_Index range<>) of Conv_Record;

							-- Rows are ordered by increasing sequence length; their value
							-- ranges are contiguous and do not overlap.
							Conv_Table: constant Conv_Record_Array := (
								(Lower => 16#0000_0000#, Upper => 16#0000_007F#,
								 Fbyte => 2#0000_0000#, Mask => 2#1000_0000#, Fmask => 2#0111_1111#,
								 Length => 1),
								(Lower => 16#0000_0080#, Upper => 16#0000_07FF#,
								 Fbyte => 2#1100_0000#, Mask => 2#1110_0000#, Fmask => 2#0001_1111#,
								 Length => 2),
								(Lower => 16#0000_0800#, Upper => 16#0000_FFFF#,
								 Fbyte => 2#1110_0000#, Mask => 2#1111_0000#, Fmask => 2#0000_1111#,
								 Length => 3),
								(Lower => 16#0001_0000#, Upper => 16#001F_FFFF#,
								 Fbyte => 2#1111_0000#, Mask => 2#1111_1000#, Fmask => 2#0000_0111#,
								 Length => 4),
								(Lower => 16#0020_0000#, Upper => 16#03FF_FFFF#,
								 Fbyte => 2#1111_1000#, Mask => 2#1111_1100#, Fmask => 2#0000_0011#,
								 Length => 5),
								(Lower => 16#0400_0000#, Upper => 16#7FFF_FFFF#,
								 Fbyte => 2#1111_1100#, Mask => 2#1111_1110#, Fmask => 2#0000_0001#,
								 Length => 6)
							);
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-05-30 03:15:40 +00:00
										 -- Locate the Conv_Table row whose value range contains UV.
										 -- Returns the index of that row, or System_Length'First when
										 -- no row covers UV.
										 function Get_Utf8_Slot (UV: in Uint32) return System_Length is
										 	pragma Inline (Get_Utf8_Slot);
										 begin
										 	for Slot in Conv_Table'Range loop
										 		if UV in Conv_Table(Slot).Lower .. Conv_Table(Slot).Upper then
										 			return Slot;
										 		end if;
										 	end loop;

										 	-- No row matched.
										 	return System_Length'First;
										 end Get_Utf8_Slot;
					
						
							| 
									
										
										
										
											2014-05-30 03:15:40 +00:00
										 |  |  | 
 | 
					
						
							-- Encode a single character as a UTF-8 byte sequence.
							-- Raises Invalid_Unicode_Character when the position value of Chr
							-- is not covered by any row of Conv_Table.
							function From_Unicode_Character (Chr: in Unicode_Character) return Utf8_String is
								-- Low six bits carried by every byte after the first (16#3F#).
								Payload_Mask: constant Uint32 := 2#0011_1111#;
								-- Marker bits set on every byte after the first (16#80#).
								Continuation_Tag: constant Uint32 := 2#1000_0000#;

								Code: Uint32 := Unicode_Character'Pos(Chr);
								Slot: constant System_Length := Get_Utf8_Slot(Code);
							begin
								if Slot not in System_Index'Range then
									raise Invalid_Unicode_Character;
								end if;

								declare
									Row: Conv_Record renames Conv_Table(Slot);
									Encoded: Utf8_String (1 .. System_Index(Row.Length));
								begin
									-- Fill the trailing bytes from last to second, consuming
									-- six bits of the value per byte.
									for Pos in reverse Encoded'First + 1 .. Encoded'Last loop
										Encoded(Pos) := Utf8_Character'Val((Code and Payload_Mask) or Continuation_Tag);
										Code := Interfaces.Shift_Right (Code, 6);
									end loop;

									-- The remaining bits go into the first byte together with
									-- the marker bits of this row.
									Encoded(Encoded'First) := Utf8_Character'Val(Code or Uint32(Row.Fbyte));
									return Encoded;
								end;
							end From_Unicode_Character;
					
						
							| 
									
										
										
										
											2014-01-14 14:22:06 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-05-30 03:15:40 +00:00
										 -- Encode every character of Str as UTF-8 and return the
										 -- concatenated byte sequence. Any exception raised by
										 -- From_Unicode_Character propagates to the caller.
										 function From_Unicode_String (Str: in Unicode_String) return Utf8_String is
										 	-- NOTE: this function puts high pressure on the stack when the
										 	-- input string is very long.
										 	-- TODO: create a procedure to overcome this problem.
										 	Total: System_Length := 0;
										 begin
										 	-- First pass: measure the encoded length.
										 	for I in Str'Range loop
										 		Total := Total + From_Unicode_Character(Chr => Str(I))'Length;
										 	end loop;

										 	-- Second pass: encode again and copy each piece into place.
										 	declare
										 		Result: Utf8_String (1 .. Total);
										 		Cursor: System_Length := Result'First;
										 	begin
										 		for I in Str'Range loop
										 			declare
										 				Piece: constant Utf8_String := From_Unicode_Character(Str(I));
										 				Next: constant System_Length := Cursor + Piece'Length;
										 			begin
										 				Result(Cursor .. Next - 1) := Piece;
										 				Cursor := Next;
										 			end;
										 		end loop;
										 		return Result;
										 	end;
										 end From_Unicode_String;
					
						
							| 
									
										
										
										
											2014-01-14 14:22:06 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-05-30 03:15:40 +00:00
										 -- Report the length in bytes of the UTF-8 sequence that starts
										 -- with the byte Seq. Returns System_Length'First when Seq matches
										 -- none of the first-byte patterns in Conv_Table.
										 function Sequence_Length (Seq: in Utf8_Character) return System_Length is
										 	Lead: constant Uint8 := Utf8_Character'Pos(Seq);
										 begin
										 	for Slot in Conv_Table'Range loop
										 		declare
										 			Row: Conv_Record renames Conv_Table(Slot);
										 		begin
										 			-- The fixed bits of the lead byte identify the row.
										 			if (Lead and Row.Mask) = Row.Fbyte then
										 				return Row.Length;
										 			end if;
										 		end;
										 	end loop;

										 	return System_Length'First;
										 end Sequence_Length;
					
						
							| 
									
										
										
										
											2014-01-14 14:22:06 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-05-30 03:15:40 +00:00
										 -- Decode the single UTF-8 sequence found at the start of Seq.
										 --
										 --   Seq      bytes to decode; only the leading sequence is examined,
										 --            bytes following it are ignored.
										 --   Seq_Len  set to the number of bytes the decoded sequence occupies.
										 --   Chr      set to the decoded character.
										 --
										 -- Raises Insufficient_Utf8_Sequence when Seq is empty or shorter
										 -- than the length announced by its first byte.
										 -- Raises Invalid_Utf8_Sequence when the first byte matches no
										 -- pattern in Conv_Table, when a following byte is not of the form
										 -- 10xxxxxx, or when the sequence is an overlong encoding (the value
										 -- would fit in a shorter sequence).
										 procedure To_Unicode_Character (Seq:     in  Utf8_String;
										                                 Seq_Len: out System_Length;
										                                 Chr:     out Unicode_Character) is
										 	W: Uint32;
										 begin
										 	-- An empty input has no first byte to inspect. Report it as
										 	-- insufficient input instead of failing on Seq(Seq'First).
										 	if Seq'Length = 0 then
										 		raise Insufficient_Utf8_Sequence;
										 	end if;

										 	for I in Conv_Table'Range loop

										 		-- Check if the first byte matches the desired bit patterns.
										 		if (Utf8_Character'Pos(Seq(Seq'First)) and Conv_Table(I).Mask) = Conv_Table(I).Fbyte then

										 			if Seq'Length < Conv_Table(I).Length then
										 				raise Insufficient_Utf8_Sequence;
										 			end if;

										 			-- Get the value bits off the first byte.
										 			W := Utf8_Character'Pos(Seq(Seq'First)) and Uint32(Conv_Table(I).Fmask);

										 			-- Get the value bits off subsequent bytes.
										 			for J in 1 .. Conv_Table(I).Length - 1 loop
										 				if (Utf8_Character'Pos(Seq(Seq'First + J)) and Uint32'(2#1100_0000#)) /= Uint32'(2#1000_0000#) then
										 					-- Each UTF-8 byte except the first must have
										 					-- its two top bits set to 2#10#.
										 					raise Invalid_Utf8_Sequence;
										 				end if;
										 				W := Interfaces.Shift_Left(W, 6) or (Utf8_Character'Pos(Seq(Seq'First + J)) and Uint32'(2#0011_1111#));
										 			end loop;

										 			-- Reject overlong encodings: a value below the lower
										 			-- bound of this row must use a shorter sequence.
										 			-- Accepting it would let one character have several
										 			-- byte representations (e.g. 16#C0# 16#80# for NUL).
										 			if W < Conv_Table(I).Lower then
										 				raise Invalid_Utf8_Sequence;
										 			end if;

										 			-- Return the character matching the word.
										 			Chr := Unicode_Character'Val(W);
										 			Seq_Len := Conv_Table(I).Length;
										 			return;
										 		end if;
										 	end loop;

										 	-- The first byte matched none of the known patterns.
										 	raise Invalid_Utf8_Sequence;
										 end To_Unicode_Character;
					
						
							|  |  |  | 
 | 
					
						
							-- Function form of the decoder: decode the UTF-8 sequence at the
							-- start of Seq and return the character. The number of bytes
							-- consumed is discarded. Exceptions raised by the procedure form
							-- propagate unchanged.
							function To_Unicode_Character (Seq: in Utf8_String) return Unicode_Character is
								Decoded: Unicode_Character;
								Consumed: System_Length; -- required by the procedure, unused here
							begin
								To_Unicode_Character (Seq => Seq, Seq_Len => Consumed, Chr => Decoded);
								return Decoded;
							end To_Unicode_Character;
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-06-17 15:23:35 +00:00
										 -- Decode UTF-8 sequences from Seq into Str until either the
										 -- input is exhausted or Str is full.
										 --
										 --   Seq      UTF-8 bytes to decode.
										 --   Seq_Len  set to the number of bytes of Seq that were consumed.
										 --   Str      receives the decoded characters, starting at Str'First.
										 --   Str_Len  set to the number of characters stored in Str.
										 --
										 -- Exceptions raised by To_Unicode_Character propagate to the caller.
										 procedure To_Unicode_String (Seq:     in  Utf8_String;
										                              Seq_Len: out System_Length;
										                              Str:     out Unicode_String;
										                              Str_Len: out System_Length) is
										 	Seq_Cursor: System_Index := Seq'First;
										 	Str_Cursor: System_Index := Str'First;
										 	Consumed: System_Length;
										 begin
										 	loop
										 		exit when Seq_Cursor > Seq'Last or else Str_Cursor > Str'Last;

										 		-- Decode one character from the unread tail of Seq.
										 		To_Unicode_Character (Seq     => Seq(Seq_Cursor .. Seq'Last),
										 		                      Seq_Len => Consumed,
										 		                      Chr     => Str(Str_Cursor));
										 		Seq_Cursor := Seq_Cursor + Consumed;
										 		Str_Cursor := Str_Cursor + 1;
										 	end loop;

										 	-- Both cursors point one past the last item processed.
										 	Seq_Len := Seq_Cursor - Seq'First;
										 	Str_Len := Str_Cursor - Str'First;
										 end To_Unicode_String;
					
						
							|  |  |  | 
 | 
					
						
							-- Decode the whole of Seq and return the resulting string.
							-- The input is walked twice: once to count the characters and
							-- once to store them. Exceptions raised by To_Unicode_Character
							-- propagate to the caller.
							function To_Unicode_String (Seq: in Utf8_String) return Unicode_String is
								Count: System_Length := 0;
								Cursor: System_Index := Seq'First;
								Consumed: System_Length;
								Scratch: Unicode_Character; -- decoded value is ignored while counting
							begin
								-- First pass: count the characters encoded in Seq.
								while Cursor <= Seq'Last loop
									To_Unicode_Character(Seq(Cursor .. Seq'Last), Consumed, Scratch);
									Count := Count + 1;
									Cursor := Cursor + Consumed;
								end loop;

								-- Second pass: decode again, this time keeping the characters.
								Cursor := Seq'First;
								declare
									Result: Unicode_String (1 .. Count);
								begin
									for I in Result'Range loop
										To_Unicode_Character(Seq(Cursor .. Seq'Last), Consumed, Result(I));
										Cursor := Cursor + Consumed;
									end loop;
									return Result;
								end;
							end To_Unicode_String;
					
						
							| 
									
										
										
										
											2014-01-14 14:22:06 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-01-15 09:21:26 +00:00
										 |  |  | end H2.Utf8; |