class EL_UTF_8_CONVERTER_IMP

(source code)

description

UTF-8 string converter accessible from EL_MODULE_UTF_8

note
	description: "UTF-8 string converter accessible from ${EL_MODULE_UTF_8}"

	author: "Finnian Reilly"
	copyright: "Copyright (c) 2001-2022 Finnian Reilly"
	contact: "finnian at eiffel hyphen loop dot com"

	license: "MIT license (See: en.wikipedia.org/wiki/MIT_License)"
	date: "2025-04-13 17:06:35 GMT (Sunday 13th April 2025)"
	revision: "9"

class
	EL_UTF_8_CONVERTER_IMP

inherit
	ANY

	EL_STRING_GENERAL_ROUTINES_I

feature -- Conversion

	frozen unicode (area: SPECIAL [CHARACTER_8]; leading_byte: NATURAL_32; offset, byte_count: INTEGER): NATURAL
		-- Unicode code point decoded from a UTF-8 sequence of `byte_count' bytes.
		-- `leading_byte' is the code of the first byte of the sequence and `offset' is its
		-- index in `area'. Only the continuation bytes that follow it are read from `area'.
		local
			index, stop_index: INTEGER; payload: NATURAL_32
		do
			-- Keep only the payload bits of the leading byte
			inspect byte_count
				when 4 then -- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
					Result := leading_byte & 0x7
				when 3 then -- 1110xxxx 10xxxxxx 10xxxxxx
					Result := leading_byte & 0xF
				when 2 then -- 110xxxxx 10xxxxxx
					Result := leading_byte & 0x1F
				when 1 then -- 0xxxxxxx
					Result := leading_byte
			else
				-- any other `byte_count': `Result' starts at zero
			end
			-- Shift in the low 6 bits of each continuation byte
			stop_index := offset + byte_count
			from index := offset + 1 until index = stop_index loop
				payload := area [index].natural_32_code & 0x3F
				Result := (Result |<< 6) | payload
				index := index + 1
			end
		end

	utf_32_string_to_string_8 (s: READABLE_STRING_GENERAL): STRING_8
		-- UTF-8 encoding of `s', where `s' is interpreted as a UTF-32 sequence.
		-- The conversion routine is chosen according to `string_storage_type (s)'.
		local
			converter: UTF_CONVERTER
		do
			inspect string_storage_type (s)
				when '4' then
					-- 32-bit string: convert with the STRING_32 specific routine
					if attached {READABLE_STRING_32} s as general_32 then
						Result := converter.string_32_to_utf_8_string_8 (general_32)
					end
				when 'X' then
					-- ZSTRING supplies its own UTF-8 conversion
					if attached {ZSTRING} s as z_string then
						Result := z_string.to_utf_8
					end
			else
				-- every other storage type goes through the general routine
				Result := converter.utf_32_string_to_utf_8_string_8 (s)
			end
		end

	string_32_to_string_8 (s: READABLE_STRING_32): STRING_8
		-- UTF-8 encoding of the 32-bit string `s'.
		-- A ZSTRING argument is converted with its own `to_utf_8'.
		local
			converter: UTF_CONVERTER
		do
			if string_storage_type (s) = 'X' then
				if attached {ZSTRING} s as z_string then
					Result := z_string.to_utf_8
				end
			else
				Result := converter.string_32_to_utf_8_string_8 (s)
			end
		end

	to_string_32 (utf_8: READABLE_STRING_8): STRING_32
		-- New STRING_32 decoded from the UTF-8 sequence `utf_8'.
		local
			character_count: INTEGER
		do
			-- size the result by decoded character count rather than by byte count
			character_count := unicode_count (utf_8)
			create Result.make (character_count)
			string_8_into_string_general (utf_8, Result)
		end

feature -- Measurement

	frozen sequence_count (first_code: NATURAL): INTEGER
		-- Number of bytes in the UTF-8 sequence that starts with byte code `first_code'.
		-- A code above 0xF7 fails the `valid_utf_8_character' check and is counted as 1 byte.

		-- Benchmark of this comparison chain against a bit-shifting version:
		-- selected		 : 296.0 times (100%)
		-- bit-shifting : 294.0 times (-0.7%)
		-- The bit-shifting algorithm is slightly slower than the selected algorithm
		do
			if first_code < 0x80 then -- 0xxxxxxx
				Result := 1
			elseif first_code < 0xE0 then -- 110xxxxx 10xxxxxx
				Result := 2
			elseif first_code < 0xF0 then -- 1110xxxx 10xxxxxx 10xxxxxx
				Result := 3
			elseif first_code < 0xF8 then -- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
				Result := 4
			else
				check
					valid_utf_8_character: False
				end
				-- fall back to a single byte so that callers still advance
				Result := 1
			end
		end

	frozen unicode_count (str: READABLE_STRING_8): INTEGER
		-- Number of Unicode characters encoded by the UTF-8 sequence `str'.
		do
			if attached super_readable_8 (str) as readable then
				-- count over the whole area range occupied by `str'
				Result := array_unicode_count (readable.area, readable.index_lower, readable.index_upper)
			end
		end

	frozen unicode_substring_count (str: READABLE_STRING_8; start_index, end_index: INTEGER): INTEGER
		-- Number of Unicode characters encoded by UTF-8 bytes `start_index' to `end_index' of `str'.
		-- An empty range (`end_index = start_index - 1') gives zero.
		require
			valid_start_index: str.valid_index (start_index)
			valid_end_index: end_index >= start_index - 1 and end_index <= str.count
		local
			area_first, area_last: INTEGER
		do
			if attached super_readable_8 (str) as readable then
				-- translate string indices into indices of the underlying area
				area_first := readable.index_lower + start_index - 1
				area_last := area_first + (end_index - start_index)
				Result := array_unicode_count (readable.area, area_first, area_last)
			end
		end

	frozen array_unicode_count (area: SPECIAL [CHARACTER]; first_index, last_index: INTEGER): INTEGER
		-- Number of UTF-8 sequences that start in `area' between `first_index' and `last_index'.
		-- Each step jumps by the length that `sequence_count' reports for the leading byte.
		local
			position, character_count: INTEGER
		do
			from position := first_index until position > last_index loop
				position := position + sequence_count (area [position].natural_32_code)
				character_count := character_count + 1
			end
			Result := character_count
		end

	frozen memory_unicode_count (area: MANAGED_POINTER; first_index, last_index: INTEGER): INTEGER
		-- Number of UTF-8 sequences that start in `area' between byte positions
		-- `first_index' and `last_index'.
		-- An empty range is expressed as `last_index = first_index - 1' and gives zero.
		require
			-- The range may be empty but never inverted. (The comparison was previously
			-- reversed, which rejected every non-empty range.)
			-- NOTE(review): `read_character' is called with each position up to `last_index';
			-- confirm that `last_index = area.count' is really addressable, the bound may
			-- need to be `last_index < area.count'
			valid_indices: first_index <= last_index + 1 and then last_index <= area.count
		local
			i: INTEGER
		do
			from i := first_index until i > last_index loop
				Result := Result + 1
				-- jump to the leading byte of the next sequence
				i := i + sequence_count (area.read_character (i).natural_32_code)
			end
		end

	frozen storage_count (iterable_list: ITERABLE [READABLE_STRING_GENERAL]; separator_count: INTEGER): INTEGER
		-- Total UTF-8 byte count needed to store the strings of `iterable_list'
		-- joined by a separator of `separator_count' ASCII characters.
		-- NOTE(review): a separator is only counted once the running total is above zero,
		-- so empty strings at the start of the list add no separator - confirm this is intended
		local
			total: INTEGER
		do
			across iterable_list as string loop
				if total > 0 then
					total := total + separator_count
				end
				if attached super_readable_general (string.item) as general then
					total := total + general.utf_8_byte_count
				end
			end
			Result := total
		end

feature -- Status report

	is_valid_string_8 (s: READABLE_STRING_8): BOOLEAN
			-- True if `s' is a valid UTF-8 sequence according to `{UTF_CONVERTER}.is_valid_utf_8_string_8'
		local
			converter: UTF_CONVERTER
		do
			Result := converter.is_valid_utf_8_string_8 (s)
		end

feature -- Basic operations

	string_8_into_string_general (s: READABLE_STRING_8; a_result: STRING_GENERAL)
		-- Decode the whole UTF-8 sequence `s' and append the characters to `a_result'.
		do
			-- delegate to the substring routine over the full range 1 .. `s.count'
			substring_8_into_string_general (s, 1, s.count, a_result)
		end

	substring_8_into_string_general (str: READABLE_STRING_8; start_index, end_index: INTEGER; a_result: STRING_GENERAL)
		-- Decode the UTF-8 sequence `str.substring (start_index, end_index)' and append the
		-- resulting characters to `a_result'. Nothing is appended when the range is empty.
		local
			position, stop_position, byte_total, lower, width: INTEGER; lead_code: NATURAL_32
			bytes: SPECIAL [CHARACTER_8]; decoded: SPECIAL [CHARACTER_32]
		do
			if attached super_readable_8 (str) as s then
				lower := s.index_lower; bytes := s.area
			end
			byte_total := end_index - start_index + 1
			if byte_total > 0 then
				-- every sequence occupies at least one byte, so `byte_total' is an
				-- upper bound for the number of decoded characters
				create decoded.make_empty (byte_total)
				-- first area position after the substring
				stop_position := lower + end_index
				from position := lower + start_index - 1 until position >= stop_position loop
					lead_code := bytes [position].natural_32_code
					width := sequence_count (lead_code)
					decoded.extend (unicode (bytes, lead_code, position, width).to_character_32)
					position := position + width
				end
				super_general (a_result).append_area_32 (decoded)
			end
		ensure
			roundtrip: attached str.substring (start_index, end_index) as s and then is_valid_string_8 (s)
				implies
					utf_32_string_to_string_8 (a_result.substring (old a_result.count + 1, a_result.count)).same_string (s)
		end

end