class EL_UTF_CONVERTER_IMP

(source code)

description

UTF string converter

note
	description: "UTF string converter"

	author: "Finnian Reilly"
	copyright: "Copyright (c) 2001-2022 Finnian Reilly"
	contact: "finnian at eiffel hyphen loop dot com"

	license: "MIT license (See: en.wikipedia.org/wiki/MIT_License)"
	date: "2024-08-27 7:51:23 GMT (Tuesday 27th August 2024)"
	revision: "18"

class
	EL_UTF_CONVERTER_IMP

inherit
	UTF_CONVERTER
		export
			{NONE} utf_8_string_8_into_string_32
		redefine
			is_valid_utf_16
		end

	EL_STRING_HANDLER

	EL_SHARED_STRING_8_CURSOR; EL_SHARED_IMMUTABLE_8_MANAGER

feature -- Status report

	is_utf_16_le_file (text: READABLE_STRING_8): BOOLEAN
			-- Does file content `text' begin with the little-endian UTF-16 byte-order-mark?
			-- Only the leading mark is examined, the remainder of `text' is not checked.
		local
			bom: STRING_8
		do
			bom := Utf_16le_bom_to_string_8
			Result := text.starts_with (bom)
		end

	is_utf_8_file (text: READABLE_STRING_8): BOOLEAN
			-- Does file content `text' begin with the UTF-8 byte-order-mark?
			-- Only the leading mark is examined, the remainder of `text' is not checked.
		local
			bom: STRING_8
		do
			bom := Utf_8_bom_to_string_8
			Result := text.starts_with (bom)
		end

	is_valid_utf_16 (s: SPECIAL [NATURAL_16]): BOOLEAN
			-- Is `s' a valid UTF-16 Unicode sequence?
			-- Scanning stops at the first null code unit, or at the end of `s' if there is none.
			-- A lead surrogate (0xD800 .. 0xDBFF) must be immediately followed by a trail
			-- surrogate (0xDC00 .. 0xDFFF); a trail surrogate without a preceding lead is invalid.
		local
			i, n: INTEGER; c1, c2: NATURAL_32
		do
			from
				i := 0
				n := s.count
				Result := True
			until
				i >= n or not Result
			loop
				c1 := s.item (i)
				if c1 = 0 then
						-- We hit our null terminating character, we can stop
					i := n + 1
				elseif c1 < 0xD800 or c1 >= 0xE000 then
						-- Codepoint from Basic Multilingual Plane: one 16-bit code unit, this is valid Unicode.
					i := i + 1
				elseif c1 <= 0xDBFF then
						-- Lead surrogate: the next code unit must exist and be a trail surrogate.
					i := i + 1
					if i < n then
							-- `s' is zero-based so the last valid index is `n - 1'
						c2 := s.item (i)
						Result := 0xDC00 <= c2 and c2 <= 0xDFFF
							-- Step over the trail surrogate so that it is not re-examined as a lead unit
						i := i + 1
					else
							-- Surrogate pair is incomplete, clearly not a valid UTF-16 sequence.
						Result := False
					end
				else
						-- Invalid starting surrogate pair which should be between 0xD800 and 0xDBFF.
					Result := False
				end
			end
		end

feature -- Conversion

	bomless_utf_16_le (text: STRING_8): READABLE_STRING_8
			-- Content of file `text' that follows its leading little-endian UTF-16 byte-order-mark
		require
			has_16le_bom: is_utf_16_le_file (text)
		local
			bom: STRING_8
		do
			bom := Utf_16le_bom_to_string_8
			Result := bomless_text (bom, text)
		end

	bomless_utf_8 (text: STRING_8): READABLE_STRING_8
			-- Content of file `text' that follows its leading UTF-8 byte-order-mark
		require
			has_utf_8_bom: is_utf_8_file (text)
		local
			bom: STRING_8
		do
			bom := Utf_8_bom_to_string_8
			Result := bomless_text (bom, text)
		end

feature -- UTF-16 operations

	utf_16_be_0_pointer_into_string_32 (p: MANAGED_POINTER; a_result: STRING_32)
			-- Append to `a_result' the characters decoded from the zero-terminated
			-- UTF-16BE sequence held in `p'.
			-- Decoding covers every code unit of `p' but stops early at the first null code unit.
		require
			minimum_size: p.count >= 2
			valid_count: p.count \\ 2 = 0
		local
			last_unit: INTEGER
		do
				-- `p.count' is in bytes: index of the final 16-bit code unit
			last_unit := p.count // 2 - 1
			utf_16_be_0_subpointer_into_string_32 (p, 0, last_unit, True, a_result)
		end

	utf_16_be_0_subpointer_into_string_32 (p: MANAGED_POINTER; start_pos, end_pos: INTEGER; a_stop_at_null: BOOLEAN; a_result: STRING_32)
			-- Append to `a_result' the characters decoded from the UTF-16BE sequence in `p'
			-- between code units `start_pos' and `end_pos' (zero-based, inclusive).
			-- If `a_stop_at_null' then decoding ends at the first null code unit encountered.
			-- A surrogate code unit in the final position has no partner and is skipped.
		require
			minimum_size: p.count >= 2
			start_position_big_enough: start_pos >= 0
			end_position_big_enough: start_pos <= end_pos + 1
			end_pos_small_enough: end_pos < p.count // 2
		local
			offset, last_offset: INTEGER
			unit, trail: NATURAL_32
		do
				-- Reserve room in `a_result' for one character per code unit in the range.
			a_result.grow (a_result.count + end_pos - start_pos + 1)
			from
					-- Code unit positions converted to byte offsets within `p'
				offset := start_pos * 2
				last_offset := end_pos * 2
			until
				offset > last_offset
			loop
				unit := p.read_natural_16_be (offset)
				if a_stop_at_null and then unit = 0 then
						-- Null terminator reached: force loop exit
					offset := last_offset + 1
				elseif unit < 0xD800 or unit >= 0xE000 then
						-- Basic Multilingual Plane: a single 16-bit code unit is the code point.
					a_result.extend (unit.to_character_32)
					offset := offset + 2
				elseif offset + 2 <= last_offset then
						-- Supplementary Planes: combine this code unit with the one that follows.
						-- 0x35FDC00 = (0xD800 |<< 10) + 0xDC00 - 0x10000
					trail := p.read_natural_16_be (offset + 2)
					a_result.extend (((unit |<< 10) + trail - 0x35FDC00).to_character_32)
					offset := offset + 4
				else
						-- Surrogate in last position: nothing to pair it with, so skip it.
					offset := offset + 2
				end
			end
		end

feature {NONE} -- Implementation

	bomless_text (bom, text: STRING_8): READABLE_STRING_8
			-- File `text' without the leading byte-order-mark `bom'.
			-- The result is a substring obtained from `Immutable_8.shared_substring' when the
			-- mark is removed. `text' itself is returned when `bom' is empty or when `text'
			-- does not begin with `bom', so that content is never truncated by mistake.
		do
			if bom.count > 0 and then text.starts_with (bom) then
					-- Keep everything after the last character of the mark
				Result := Immutable_8.shared_substring (text, bom.count + 1, text.count)
			else
				Result := text
			end
		end
end