class EL_ZSTRING_IMPLEMENTATION

(source code)

Client examples: ZSTRING_CONVERTABLE_TEST_SET

description

Core implementation of ZSTRING using an 8-bit array to store characters encodable by the codec, and a compacted array of 32-bit arrays to encode any character not defined by the 8-bit encoding.

note
	description: "[
		Core implementation of ${ZSTRING} using an 8 bit array to store characters encodeable
		by `codec', and a compacted array of 32-bit arrays to encode any character not defined by the 8-bit encoding.
	]"

	author: "Finnian Reilly"
	copyright: "Copyright (c) 2001-2022 Finnian Reilly"
	contact: "finnian at eiffel hyphen loop dot com"

	license: "MIT license (See: en.wikipedia.org/wiki/MIT_License)"
	date: "2024-04-15 9:58:11 GMT (Monday 15th April 2024)"
	revision: "101"

deferred class
	EL_ZSTRING_IMPLEMENTATION

inherit
	EL_COMPACT_SUBSTRINGS_32_I
		rename
			append as append_unencoded,
			append_intervals as append_unencoded_intervals,
			area as unencoded_area,
			buffer as unencoded_buffer,
			code as unencoded_code,
			combined_area as unencoded_combined_area,
			count_greater_than_zero_flags as respective_encoding,
			empty_buffer as empty_unencoded_buffer,
			fill as unencoded_fill,
			fill_list as unencoded_fill_list,
			first_lower as unencoded_first_lower,
			first_upper as unencoded_first_upper,
			extended_hash_code as unencoded_hash_code_to,
			has as unencoded_has,
			has_between as unencoded_has_between,
			index_of as unencoded_index_of,
			interval_sequence as unencoded_interval_sequence,
			insert as insert_unencoded,
			intersects as has_unencoded_between,
			item as unencoded_item,
			i_th_substring as unencoded_i_th_substring,
			interval_count as unencoded_interval_count,
			last_index_of as unencoded_last_index_of,
			last_upper as unencoded_last_upper,
			make as make_unencoded,
			make_filled as make_unencoded_filled,
			make_from_other as make_unencoded_from_other,
			minimal_increase as minimal_unencoded_increase,
			new_filled_area as new_filled_unencoded_area,
			not_empty as has_mixed_encoding,
			occurrences as unencoded_occurrences,
			overlaps as overlaps_unencoded,
			put as put_unencoded,
			remove as remove_unencoded,
			remove_substring as remove_unencoded_substring,
			replace_character as replace_unencoded_character,
			same_characters as same_unencoded_characters,
			same_string as same_unencoded_string,
			set_area as set_unencoded_area,
			set_from_buffer as set_unencoded_from_buffer,
			shift as shift_unencoded,
			shift_from as shift_unencoded_from,
			shifted as shifted_unencoded,
			substring_list as unencoded_substring_list,
			character_count as unencoded_count,
			to_lower as unencoded_to_lower,
			to_upper as unencoded_to_upper,
			utf_8_byte_count as unencoded_utf_8_byte_count,
			write as write_unencoded,
			z_code as unencoded_z_code,
			is_valid as is_unencoded_valid
		undefine
			is_equal, copy, out
		end

	EL_ZSTRING_CHARACTER_8_IMPLEMENTATION
		rename
			fill_character as internal_fill_character,
			hash_code as area_hash_code,
			item as internal_item,
			index_of as internal_index_of,
			insert_string as internal_insert_string,
			keep_head as internal_keep_head,
			keep_tail as internal_keep_tail,
			last_index_of as internal_last_index_of,
			make as internal_make,
			order_comparison as internal_order_comparison,
			remove as internal_remove,
			same_characters as internal_same_characters,
			same_string as internal_same_string,
			share as internal_share,
			string as internal_string,
			substring as internal_substring,
			wipe_out as internal_wipe_out
		export
			{STRING_HANDLER} area, area_lower
		undefine
			copy, is_equal, out
		end

	EL_READABLE_ZSTRING_I

feature -- Access

	item alias "[]", at alias "@" (i: INTEGER): CHARACTER_32 assign put
		-- Unicode character at position `i'.
		-- The byte stored at `area [i - 1]' decides how the character is obtained.
		local
			l_encoded: CHARACTER
		do
			l_encoded := area [i - 1]
			inspect l_encoded
				when Substitute then
					-- placeholder byte: the code is looked up with `unencoded_code'
					Result := unencoded_code (i).to_character_32

				when Control_0 .. Control_25, Control_27 .. Max_ascii then
					-- the byte value itself is the character code
					Result := l_encoded.to_character_32
			else
				-- every other byte is translated through `Unicode_table'
				Result := Unicode_table [l_encoded.code]
			end
		end

	item_8 (i: INTEGER): CHARACTER_8
		-- Internal (encoded) character at position `i', read directly from `area'
		-- without any decoding. `i' is 1-based while `area' is indexed from `i - 1'.
		do
			Result := area [i - 1]
		end

	item_code (i: INTEGER): INTEGER
		-- Code of `item (i)' converted to a signed 32-bit integer.
		obsolete
			"Due to potential truncation it is recommended to use `code (i)' instead."
		do
			Result := item (i).natural_32_code.to_integer_32
		end

	unicode (i: INTEGER): NATURAL
		-- Unicode code of the character at position `i'.
		-- Follows the same three-way decoding as `item' but yields the numeric code.
		local
			l_encoded: CHARACTER
		do
			l_encoded := area [i - 1]
			inspect l_encoded
				when Substitute then
					-- placeholder byte: the code is looked up with `unencoded_code'
					Result := unencoded_code (i)

				when Control_0 .. Control_25, Control_27 .. Max_ascii then
					-- the byte value itself is the code
					Result := l_encoded.natural_32_code
			else
				-- every other byte is translated through `Unicode_table'
				Result := Unicode_table [l_encoded.code].natural_32_code
			end
		end

feature -- Element change

	put (uc: CHARACTER_32; i: INTEGER)
			-- Store `uc' at position `i', keeping `area' and the unencoded
			-- substrings consistent with each other.
		require else -- from STRING_GENERAL
			valid_index: valid_index (i)
		local
			l_previous: CHARACTER
		do
			if attached area as l_area then
				l_previous := l_area [i - 1]
				l_area [i - 1] := Codec.encoded_character (uc)
				if l_area [i - 1] = Substitute then
					-- `uc' has no codec encoding: record it as an unencoded character
					put_unencoded (uc, i)
				elseif l_previous = Substitute then
					-- an unencoded character was overwritten by an encodable one
					remove_unencoded (i)
				end
				reset_hash
			end
		ensure then
			inserted: item (i) = uc
			stable_count: count = old count
			stable_before_i: Elks_checking implies substring (1, i - 1) ~ (old substring (1, i - 1))
			stable_after_i: Elks_checking implies substring (i + 1, count) ~ (old substring (i + 1, count))
		end

	put_z_code (a_z_code: like z_code; i: INTEGER)
		-- Set the character at position `i' from the hybrid code `a_z_code'.
		-- Codes up to 0xFF are stored directly as a byte in `area'; larger codes
		-- are stored as `Substitute' plus an unencoded character.
		--
		-- Benchmark figures carried over from the original comment:
		-- Passes over 3000 millisecs (in descending order)
		-- append_zcode     :  7979.3 times (100%)
		-- append_character :  7924.4 times (-0.7%)
		do
			if attached area as l_area then
				if a_z_code <= 0xFF then
					if l_area [i - 1] = Substitute then
						-- discard the unencoded character that is being replaced
						remove_unencoded (i)
					end
					l_area [i - 1] := a_z_code.to_character_8
				else
					l_area [i - 1] := Substitute
					put_unencoded (z_code_to_unicode (a_z_code).to_character_32, i)
				end
			end
		end

feature -- Status query

	has (uc: CHARACTER_32): BOOLEAN
		-- `True' if string contains at least one `uc'
		local
			l_encoded: CHARACTER
		do
			inspect uc.code
			-- code 26 is left out of the ranges so that `uc = 26' can map to
			-- the unicode substitute character via the codec branch
				when 0 .. 25, 27 .. Max_ascii_code then
					Result := has_character_8 (area, uc.to_character_8, count - 1)
			else
				l_encoded := Codec.encoded_character (uc)
				if l_encoded = Substitute then
					-- not encodable by the codec: search the unencoded substrings
					Result := unencoded_has (uc)
				else
					Result := has_character_8 (area, l_encoded, count - 1)
				end
			end
		end

	has_z_code (a_z_code: NATURAL): BOOLEAN
		-- `True' if a character with hybrid code `a_z_code' occurs in the string.
		-- Codes above 0xFF are searched for among the unencoded characters,
		-- all others as a single byte.
		do
			if a_z_code > 0xFF then
				Result := unencoded_has (z_code_to_unicode (a_z_code).to_character_32)
			else
				Result := String_8.has (Current, a_z_code.to_character_8)
			end
		end

	is_ascii: BOOLEAN
		-- `True' if all characters are in the range 0 to 127 and `has_mixed_encoding' is false
		local
			c: EL_CHARACTER_8_ROUTINES
		do
			-- `Result' keeps its default `False' when unencoded characters are present
			if not has_mixed_encoding then
				Result := c.is_ascii_area (area, area_lower, area_upper)
			end
		end

	valid_index (i: INTEGER): BOOLEAN
		-- Deferred query on position `i'. Within this class it is the precondition
		-- of `put' and the first test made by `valid_substring_indices'.
		deferred
		end

feature -- Contract Support

	is_valid: BOOLEAN
			-- `True' if the position and number of `Substitute' characters in `area' are
			-- consistent with the substrings stored in `unencoded_area'.
			-- `unencoded_area' is walked as a sequence of records: a lower index, an upper
			-- index, then `upper - lower + 1' further items.
		local
			i, j, lower, upper, l_count, interval_count, sum_count: INTEGER
		do
			if has_mixed_encoding then
				l_count := count
				if attached area as l_area and then attached unencoded_area as area_32 then
					Result := True
					from i := 0 until not Result or else i = area_32.count loop
						-- first two items of each record are the 1-based interval bounds
						lower := area_32 [i].code; upper := area_32 [i + 1].code
						interval_count := upper - lower + 1
						if upper <= l_count then
							-- every position of the interval must hold `Substitute' in `area'
							from j := lower until not Result or else j > upper loop
								Result := Result and l_area [j - 1] = Substitute
								j := j + 1
							end
						else
							-- interval extends beyond the end of the string
							Result := False
						end
						sum_count := sum_count + interval_count
						-- skip the two bounds plus the interval's items to reach the next record
						i := i + interval_count + 2
					end
				end
				-- total number of `Substitute' characters must match the summed interval sizes
				Result := Result and String_8.occurrences (Current, Substitute) = sum_count
			else
				-- without unencoded characters no `Substitute' may appear at all
				Result := String_8.occurrences (Current, Substitute) = 0
			end
		end

	shared_substring (start_index, end_index: INTEGER): EL_READABLE_ZSTRING
		-- `Current' if `start_index = 1' and `end_index = count',
		-- otherwise a new string obtained from `substring'.
		do
			-- The object test is needed because this class does not itself conform
			-- to `EL_READABLE_ZSTRING'. If it ever fails, the `else' branch still
			-- gives a valid result.
			if start_index = 1 and then end_index = count
				and then attached {EL_READABLE_ZSTRING} Current as l_readable
			then
				Result := l_readable
			else
				Result := substring (start_index, end_index)
			end
		end

	valid_substring_indices (start_index, end_index: INTEGER): BOOLEAN
		-- `True' if `start_index' .. `end_index' designates a substring of the string.
		-- `start_index' must satisfy `valid_index'. The empty interval
		-- `end_index = start_index - 1' is accepted, matching the `valid_order'
		-- precondition of `leading_ascii_count'.
		do
			if valid_index (start_index) then
				Result := end_index >= start_index - 1 and end_index <= count
			end
		end

feature {EL_ZSTRING_IMPLEMENTATION} -- Status query

	elks_checking: BOOLEAN
		-- Deferred flag. When `True', the `stable_before_i' and `stable_after_i'
		-- postconditions of `put' compare substrings before and after the call.
		deferred
		end

	has_character_8 (a_area: like area; c: CHARACTER_8; upper_index: INTEGER): BOOLEAN
		-- `True' if `c' occurs in `a_area' at some zero-based index in 0 .. `upper_index'
		local
			l_index: INTEGER
		do
			from l_index := 0 until Result or else l_index > upper_index loop
				if a_area [l_index] = c then
					Result := True
				end
				l_index := l_index + 1
			end
		end

	has_substitutes_between (a_area: like area; start_index, end_index: INTEGER): BOOLEAN
		-- `True' if `a_area' holds a `Substitute' at any 1-based position
		-- in `start_index' .. `end_index'
		local
			l_index: INTEGER
		do
			-- 1-based positions map to zero-based items `start_index - 1' .. `end_index - 1'
			from l_index := start_index - 1 until Result or else l_index = end_index loop
				if a_area [l_index] = Substitute then
					Result := True
				end
				l_index := l_index + 1
			end
		end

	has_unencoded_between_optimal (a_area: like area; start_index, end_index: INTEGER): BOOLEAN
		-- Same answer as `has_unencoded_between' but chooses between two methods:
		-- `has_unencoded_between' when `end_index - start_index' is 50 or more,
		-- otherwise a direct scan of `a_area' with `has_substitutes_between'.
		do
			if end_index - start_index >= 50 then
				Result := has_unencoded_between (start_index, end_index)
			else
				Result := has_substitutes_between (a_area, start_index, end_index)
			end
		end

feature {NONE} -- Implementation

	adapted_argument (general: READABLE_STRING_GENERAL; index: INTEGER): EL_ZSTRING
		-- `general' as an `EL_ZSTRING', via `adapted_argument_for_type' with the
		-- storage type code that `Class_id.string_storage_type' reports for `general'.
		-- `index' is passed through unchanged.
		do
			Result := adapted_argument_for_type (general, Class_id.string_storage_type (general), index)
		end

	adapted_argument_for_type (general: READABLE_STRING_GENERAL; type_code: CHARACTER; index: INTEGER): EL_ZSTRING
		-- `general' as an `EL_ZSTRING'.
		-- For `type_code' 'X' the argument itself is returned when it is a `ZSTRING'.
		-- For any other code the content of `general' is appended to a string taken
		-- from `Once_adapted_argument' (for `index' in 1 .. 3) or to a newly created one.
		-- A result taken from `Once_adapted_argument' is wiped out on each call, so
		-- callers should not hold on to it across calls with the same `index'.
		require
			valid_type_code: valid_string_storage_type (type_code)
			valid_index: 1 <= index and index <= Once_adapted_argument.count
		do
			inspect type_code
				when 'X' then
					-- NOTE(review): if the object test fails `Result' is left unassigned here;
					-- presumably 'X' is only reported for ZSTRING arguments - confirm
					if attached {ZSTRING} general as z_str then
						Result := z_str
					end

			else
				inspect index
					when 1 .. 3 then
						-- slots are zero-based, hence `index - 1'
						Result := Once_adapted_argument [index - 1]
						Result.wipe_out
				else
					create Result.make (general.count)
				end
				Result.append_string_general_for_type (general, type_code)
			end
		end

	encode (a_unicode: READABLE_STRING_GENERAL; area_offset: INTEGER)
		-- Encode all of `a_unicode' by calling `encode_substring' on the
		-- interval 1 .. `a_unicode.count' with the same `area_offset'.
		do
			encode_substring (a_unicode, 1, a_unicode.count, area_offset)
		end

	encode_substring (a_unicode: READABLE_STRING_GENERAL; start_index, end_index, area_offset: INTEGER)
		-- Encode characters `start_index' .. `end_index' of `a_unicode' into `area'
		-- using `codec', then store any intervals the codec reported in the shared
		-- interval list as unencoded substrings.
		-- NOTE(review): presumably `area_offset' is the zero-based position in `area'
		-- where the first encoded byte is written - confirm against
		-- `encode_substring_general'.
		require
			valid_area_offset: valid_area_offset (a_unicode, start_index, end_index, area_offset)
		local
			r: EL_READABLE_STRING_GENERAL_ROUTINES
		do
			-- `Once_interval_list' is a shared object; `emptied' is called before it is passed on
			if attached Once_interval_list.emptied as unencoded_intervals then
				codec.encode_substring_general (a_unicode, area, start_index, end_index, area_offset, unencoded_intervals)

				if unencoded_intervals.count > 0 and then attached r.shared_cursor (a_unicode) as l_cursor then
					-- third argument is `area_offset - start_index + 1' in both branches
					if has_mixed_encoding then
						-- unencoded substrings already exist: add the new intervals to them
						append_unencoded_intervals (l_cursor, unencoded_intervals, area_offset - start_index + 1)
					else
						-- no unencoded substrings yet: build them from the intervals
						make_from_intervals (l_cursor, unencoded_intervals, area_offset - start_index + 1)
					end
				end
			end
		end

	encoded_character (uc: CHARACTER_32): CHARACTER
		-- Single-byte encoding of `uc'.
		-- Codes up to `Max_ascii_code' are converted directly; only higher
		-- codes are handed to `codec'.
		do
			if uc.code > Max_ascii_code then
				Result := codec.encoded_character (uc)
			else
				Result := uc.to_character_8
			end
		end

	leading_ascii_count (a_area: SPECIAL [CHARACTER]; start_index, end_index: INTEGER): INTEGER
		-- Number of consecutive items of `a_area', starting at `start_index' and not
		-- going past `end_index', that fall within the ranges
		-- `Control_0' .. `Control_25' or `Control_27' .. `Max_ascii'
		require
			valid_order: start_index <= end_index + 1
			valid_start_index: start_index <= end_index implies a_area.valid_index (start_index)
			valid_end_index: start_index <= end_index implies a_area.valid_index (end_index)
		local
			l_index: INTEGER; l_done: BOOLEAN
		do
			from l_index := start_index until l_done or else l_index > end_index loop
				inspect a_area [l_index]
					when Control_0 .. Control_25, Control_27 .. Max_ascii then
						Result := Result + 1
						l_index := l_index + 1
				else
					-- `Substitute' or any other byte outside the ranges ends the run
					l_done := True
				end
			end
		end

	put_unicode (a_code: NATURAL_32; i: INTEGER)
			-- Put the character with unicode `a_code' at the `i'-th position
			-- by converting it to `CHARACTER_32' and calling `put'.
		do
			put (a_code.to_character_32, i)
		end

	compatible_string_8 (general: READABLE_STRING_GENERAL): detachable READABLE_STRING_8
		-- Result of `compatible_substring_8' applied to the whole of `general',
		-- i.e. the interval 1 .. `general.count'.
		do
			Result := compatible_substring_8 (general, 1, general.count)
		end

	compatible_substring_8 (
		general: READABLE_STRING_GENERAL; start_index, end_index: INTEGER

	): detachable READABLE_STRING_8
		-- `general' cast to type `READABLE_STRING_8' if all characters in
		-- `start_index' .. `end_index' are unchanged by `Codec' encoding.
		-- `Void' if any character has a different encoding.
		-- Note that the result is `general' itself, not the substring.
		require
			is_string_8: general.is_string_8
		local
			l_area: like area; i_lower, i_upper, i: INTEGER; c_i: CHARACTER
		do
			-- Step 1: find the character area and the zero-based bounds to scan
			if general.is_immutable then
				if attached {READABLE_STRING_8} general as readable_8	and then attached cursor_8 (readable_8) as c then
					l_area := c.area
					-- the cursor supplies the index of the string's first character in `l_area'
					i_lower := c.area_first_index + start_index - 1
					i_upper := i_lower + end_index - start_index
					Result := readable_8
				end

			elseif attached {STRING_8} general as str_8 then
				l_area := str_8.area
				i_lower := start_index - 1; i_upper := end_index - 1
				Result := str_8
			end
			-- NOTE(review): if neither branch above assigned `l_area', the loop below still
			-- runs for index 0 - presumably the precondition rules this out; confirm
			-- Step 2: reject the candidate on the first character the codec would change
			if attached Codec.unicode_table as table then
				from i := i_lower until i > i_upper loop
					c_i := l_area [i]
					inspect c_i
						when Control_0 .. Control_25, Control_27 .. Max_ascii then
							-- these bytes are accepted without consulting `table'
							i := i + 1

						when Substitute then
							-- `Substitute' is always treated as incompatible
							Result := Void
							i := i_upper + 1 -- break
					else
						-- compatible only if `table' maps the byte to itself
						if table [c_i.code] /= c_i then
							Result := Void
							i := i_upper + 1 -- break
						else
							i := i + 1
						end
					end
				end
			end
		end

	to_lower_area (a: like area; start_index, end_index: INTEGER)
		-- Replace all characters in `a' between `start_index' and `end_index'
		-- with their lower version when available.
		-- Delegates to `codec.to_lower', passing `Current' as the last argument.
		do
			codec.to_lower (a, start_index, end_index, Current)
		end

	to_proper_area (a: like area; start_index, end_index: INTEGER)
		-- Replace all characters in `a' between `start_index' and `end_index'
		-- with their propercase version when available.
		-- Delegates to `codec.to_proper', passing `Current' as the last argument.
		do
			codec.to_proper (a, start_index, end_index, Current)
		end

	to_upper_area (a: like area; start_index, end_index: INTEGER)
		-- Replace all characters in `a' between `start_index' and `end_index'
		-- with their upper version when available.
		-- Delegates to `codec.to_upper', passing `Current' as the last argument.
		do
			codec.to_upper (a, start_index, end_index, Current)
		end

	valid_area_offset (a_unicode: READABLE_STRING_GENERAL; start_index, end_index, area_offset: INTEGER): BOOLEAN
		-- `True' if the interval `start_index' .. `end_index' is empty, or if the last
		-- item it would occupy in `area', counting from `area_offset', is a valid index.
		-- `a_unicode' is not inspected.
		local
			l_count: INTEGER
		do
			l_count := end_index - start_index + 1
			if l_count > 0 then
				Result := area.valid_index (l_count + area_offset - 1)
			else
				Result := True
			end
		end

	z_code (i: INTEGER): NATURAL_32
			-- Hybrid code of latin and unicode for the character at position `i'.
			-- Single byte codes are reserved for latin encoding.
			-- Unicode characters below 0xFF have bit number 31 set to 1 using `Sign_bit' so
			-- that any zcode <= 0xFF can be assumed to be an encoded character using some codec.

			-- Implementation of {READABLE_STRING_GENERAL}.code
			-- Client classes include `EL_ZSTRING_SEARCHER'
		local
			l_encoded: CHARACTER
		do
			l_encoded := area [i - 1]
			if l_encoded = Substitute then
				-- placeholder byte: the code comes from the unencoded substrings
				Result := unencoded_z_code (i)
			else
				Result := l_encoded.natural_32_code
			end
		ensure then
			first_byte_is_reserved_for_latin: area [i - 1] = Substitute implies Result > 0xFF
			reversible: Codec.z_code_as_unicode (Result) = unicode (i)
		end

feature {NONE} -- Constants

	Latin_1_codec: EL_ZCODEC
		-- Codec that `Codec_factory' supplies for the `Latin_1' encoding type.
		-- Being a once function, the lookup is made on the first call only.
		once
			Result := Codec_factory.codec_by ({EL_ENCODING_TYPE}.Latin_1)
		end

	Once_adapted_argument: SPECIAL [ZSTRING]
		-- Three shared strings (zero-based slots 0 .. 2) used by
		-- `adapted_argument_for_type' for `index' values 1 .. 3.
		once
			create Result.make_filled (create {ZSTRING}.make_empty, 3)
			-- slots 1 and 2 are given strings of their own so that they do not
			-- share the object created for the `make_filled' call
			Result [1] := create {ZSTRING}.make_empty
			Result [2] := create {ZSTRING}.make_empty
		end

	Once_interval_list: EL_ARRAYED_INTERVAL_LIST
		-- Shared interval list, created empty on first use.
		-- `encode_substring' calls `emptied' on it before collecting intervals.
		once
			create Result.make_empty
		end

	Once_split_intervals: EL_ZSTRING_SPLIT_INTERVALS
		-- Shared split-intervals object, created empty on first use.
		once
			create Result.make_empty
		end

	Once_substring_indices: EL_ARRAYED_LIST [INTEGER]
		-- Shared list of substring indices, created on first use with an initial
		-- capacity of 5. The same object is returned on every call, so it may
		-- still hold items from an earlier use.
		once
			create Result.make (5)
		end

end