class EL_ZSTRING_SEARCHER

(source code)

description

Searcher for ZSTRING strings

note
	description: "Searcher for ${ZSTRING} strings"

	author: "Finnian Reilly"
	copyright: "Copyright (c) 2001-2022 Finnian Reilly"
	contact: "finnian at eiffel hyphen loop dot com"

	license: "MIT license (See: en.wikipedia.org/wiki/MIT_License)"
	date: "2025-04-15 7:54:38 GMT (Tuesday 15th April 2025)"
	revision: "20"

frozen class
	EL_ZSTRING_SEARCHER

inherit
	STRING_SEARCHER
		rename
			max_code_point_value as max_code_point_integer
		redefine
			internal_initialize_deltas, make
		end

	EL_STRING_HANDLER

	EL_ZCODE_CONVERSION

	EL_ZSTRING_CONSTANTS

create
	make

feature {NONE} -- Initialization

	make
		require else
			not_using_shared_extended_strings: not attached {EL_STRING_GENERAL_ROUTINES_I} Current
		do
			Precursor
			create Z_code_pattern.make_empty
			create super_readable_32.make_empty
			create super_readable_8.make_empty
		end

feature -- Initialization

	initialize_z_code_deltas (pattern: READABLE_STRING_GENERAL)
		do
			initialize_z_code_deltas_for_type (pattern, string_storage_type (pattern))
		end

	initialize_z_code_deltas_for_type (pattern: READABLE_STRING_GENERAL; type_code: CHARACTER)
		local
			extended_string: detachable EL_EXTENDED_READABLE_STRING_I [COMPARABLE]
		do
			inspect type_code
				when '1' then
					extended_string := super_readable_8

				when '4' then
					extended_string := super_readable_32
			else
				if attached {ZSTRING} pattern as z_str then
					z_str.fill_z_codes (z_code_pattern)
				end
			end
			if extended_string /= void and then attached extended_string as extended then
				extended.set_target (pattern)
				extended.fill_z_codes (z_code_pattern)
			end
			initialize_deltas (z_code_pattern)
		end

feature -- Search

	fuzzy_index (a_string: like String_type; a_pattern: READABLE_STRING_GENERAL; start_pos, end_pos, fuzzy: INTEGER): INTEGER
			-- Position of first occurrence of `a_pattern' at or after `start_pos' in
			-- `a_string' with 0..`fuzzy' mismatches between `a_string' and `a_pattern'.
			-- 0 if there are no fuzzy matches.
		local
			i, j, l_min_offset, end_index, l_area_lower, pattern_count, l_nb_mismatched, block_index: INTEGER
			iter: EL_COMPACT_SUBSTRINGS_32_ITERATION; area_32: like String_type.unencoded_area
			l_matched: BOOLEAN; l_deltas_array: like deltas_array
			l_area: SPECIAL [CHARACTER]; char_code: NATURAL
		do
			if fuzzy = a_pattern.count then
					-- More mismatches than the pattern length.
				Result := start_pos
			else
				if fuzzy = 0 then
					Result := substring_index (a_string, a_pattern, start_pos, end_pos)
				else
					initialize_fuzzy_deltas (a_pattern, fuzzy)
					l_deltas_array := deltas_array
					if l_deltas_array /= Void then
						l_area := a_string.area; area_32 := a_string.unencoded_area
						from
							pattern_count := a_pattern.count
							l_area_lower := a_string.area_lower
							i := start_pos + l_area_lower
							end_index := end_pos + 1 + l_area_lower
						until
							i + pattern_count > end_index
						loop
							from
								j := 0
								l_nb_mismatched := 0
								l_matched := True
							until
								j = pattern_count
							loop
								char_code := iter.i_th_z_code ($block_index, l_area, area_32, i + j - 1)
								if char_code /= a_pattern.code (j + 1) then
									l_nb_mismatched := l_nb_mismatched + 1;
									if l_nb_mismatched > fuzzy then
											-- Too many mismatched, so we stop
										j := pattern_count - 1	-- Jump out of loop
										l_matched := False
									end
								end
								j := j + 1
							end

							if l_matched then
									-- We got the substring
								Result := i - l_area_lower
								i := end_index	-- Jump out of loop
							else
								if i + pattern_count <= end_pos then
										-- Pattern was not found, compute shift to next location
									from
										j := 0
										l_min_offset := pattern_count + 1
									until
										j > fuzzy
									loop
										char_code := iter.i_th_z_code ($block_index, l_area, area_32, i + pattern_count - j - 1)
										if char_code > Max_code_point_value then
												-- No optimization for a characters above
												-- `Max_code_point_value'.
											l_min_offset := 1
											j := fuzzy + 1 -- Jump out of loop
										else
											l_min_offset := l_min_offset.min (l_deltas_array.item (j).item (char_code.to_integer_32))
										end
										j := j + 1
									end
									i := i + l_min_offset
								else
									i := i + 1
								end
							end
						end
					end
					deltas_array := Void
				end
			end
		end

	sub_zstring_index (a_string: like string_type; a_pattern: EL_READABLE_ZSTRING; start_pos, end_pos: INTEGER): INTEGER
		do
			a_pattern.fill_z_codes (z_code_pattern)
			Result := substring_index (a_string, z_code_pattern, start_pos, end_pos)
		end

	substring_index_with_deltas (
		a_string: like String_type; a_pattern: READABLE_STRING_GENERAL; start_pos, end_pos: INTEGER
	): INTEGER
		-- Position of first occurrence of `a_pattern' at or after `start_pos' in `a_string'.
		-- 0 if there are no matches.
		local
			i, j, end_index, pattern_count, l_area_lower, block_index: INTEGER; char_code: NATURAL
			l_matched: BOOLEAN; l_deltas: like deltas; l_area: SPECIAL [CHARACTER]
			iter: EL_COMPACT_SUBSTRINGS_32_ITERATION; area_32: like String_type.unencoded_area
		do
			if a_string = a_pattern then
				if start_pos = 1 then
					Result := 1
				end
			else
				pattern_count := a_pattern.count
				check l_pattern_count_positive: pattern_count > 0 end
				l_area := a_string.area; area_32 := a_string.unencoded_area
				from
					l_area_lower := a_string.area_lower
					i := start_pos + l_area_lower
					l_deltas := deltas
					end_index := end_pos + 1 + l_area_lower
				until
					i + pattern_count > end_index
				loop
					from
						j := 0
						l_matched := True
					until
						j = pattern_count
					loop
						char_code := iter.i_th_z_code ($block_index, l_area, area_32, i + j - 1)
						if char_code /= a_pattern.code (j + 1) then
						-- Mismatch, so we stop
							j := pattern_count - 1 -- Jump out of loop
							l_matched := False
						end
						j := j + 1
					end

					if l_matched then
					-- We got the substring
						Result := i - l_area_lower
						i := end_index	-- Jump out of loop
					else
					-- Pattern was not found, shift to next location
						if i + pattern_count <= end_pos then
							char_code := iter.i_th_z_code ($block_index, l_area, area_32, i + pattern_count - 1)
							if char_code > Max_code_point_value then
									-- Character is too big, we revert to a slow comparison
								i := i + 1
							else
								i := i + l_deltas.item (char_code.to_integer_32)
							end
						else
							i := i + 1
						end
					end
				end
			end
		end

	substring_index_with_z_code_pattern (a_string: like String_type; start_pos, end_pos: INTEGER): INTEGER
		-- substring index with pattern previously initialized by `initialize_z_code_deltas'
		do
			Result := substring_index_with_deltas (a_string, z_code_pattern, start_pos, end_pos)
		end

feature {NONE} -- Implementation

	internal_initialize_deltas (a_pattern: READABLE_STRING_GENERAL; a_pattern_count: INTEGER; a_deltas: like deltas)
			-- Initialize `a_deltas' with `a_pattern'.
			-- Optimized for the top `max_code_point_value' characters only.
		local
			i: INTEGER; char_code: NATURAL
		do
				-- Initialize the delta table (one more than pattern count).
			a_deltas.fill_with (a_pattern_count + 1, 0, Max_code_point_integer)

				-- Now for each character within the pattern, fill in shifting necessary
				-- to bring the pattern to the character. The rightmost value is kept, as
				-- we scan the pattern from left to right (ie. if there is two 'a', only the
				-- delta associated with the second --rightmost-- will be kept).
			from
				i := 0
			until
				i = a_pattern_count
			loop
				char_code := a_pattern.code (i + 1)
				if char_code <= max_code_point_value then
					a_deltas.put (a_pattern_count - i, char_code.to_integer_32)
				end
				i := i + 1
			end
		end

feature {STRING_HANDLER} -- Internal attributes

	z_code_pattern: STRING_32

	super_readable_32: EL_READABLE_STRING_32

	super_readable_8: EL_READABLE_STRING_8

feature {NONE} -- Constants

	Max_code_point_integer: INTEGER = 2000
		-- We optimize the search for the first 2000 code points of Unicode (i.e. using 8KB of memory).

		-- Ideally we want this to be type NATURAL but we are stuck with how it's defined in STRING_SEARCHER

	Max_code_point_value: NATURAL = 2_000
		-- We optimize the search for the first 2000 code points of Unicode (i.e. using 8KB of memory).

		-- We need the NATURAL value because of the z_code `Sign_bit', which means we cannot
		-- compare with an integer. This conditional will not branch correctly:

		--    if char_code <= max_code_point_integer then

	String_type: EL_READABLE_ZSTRING
		require else
			never_called: False
		once
			Result := Empty_string
		end

end