class EL_BOYER_MOORE_SEARCHER_32
Efficient Boyer-Moore Search for Unicode Strings See: codeproject.com Article
note
description: "[
Efficient Boyer-Moore Search for Unicode Strings
See: [http://www.codeproject.com/KB/recipes/bmsearch.aspx codeproject.com Article]
]"
author: "Finnian Reilly"
copyright: "Copyright (c) 2001-2022 Finnian Reilly"
contact: "finnian at eiffel hyphen loop dot com"
license: "MIT license (See: en.wikipedia.org/wiki/MIT_License)"
date: "2024-09-23 7:54:14 GMT (Monday 23rd September 2024)"
revision: "8"
class
EL_BOYER_MOORE_SEARCHER_32
create
make
feature {NONE} -- Initialization
make (pattern: STRING_32)
require
pattern_not_empty: not pattern.is_empty
local
max_shift, i, new_suffix_pos: INTEGER
suffix, new_suffix, new_pattern: STRING_32
do
create char_shifts.make (17)
char_shifts.compare_objects
pattern_count := pattern.count
max_shift := pattern_count
from i := 1 until i > pattern_count loop
char_shifts.put (create {like other_char_shifts}.make_filled (0, 1, pattern_count), pattern [i])
i := i + 1
end
create other_char_shifts.make_filled (0, 1, pattern_count)
from char_shifts.start until char_shifts.after loop
char_shifts.item_for_iteration [pattern_count] := max_shift
char_shifts.forth
end
other_char_shifts [pattern_count] := max_shift
from i := pattern_count until i = 0 loop
suffix := pattern.substring (i + 1, pattern_count)
if pattern.starts_with (suffix) then
max_shift := i
end
other_char_shifts [i] := max_shift
new_pattern := pattern.string
if suffix.is_empty or else last_index_of (new_pattern, suffix) > 1 then
from char_shifts.start until char_shifts.after loop
create new_suffix.make (suffix.count)
new_suffix.append_character (char_shifts.key_for_iteration)
new_suffix.append (suffix)
new_suffix_pos := last_index_of (new_pattern, new_suffix)
if new_suffix_pos > 0 then
char_shifts.item_for_iteration [i] := i - new_suffix_pos
else
char_shifts.item_for_iteration [i] := max_shift
end
if char_shifts.key_for_iteration = pattern [i] then
char_shifts.item_for_iteration [i] := 0
end
char_shifts.forth
end
else
from char_shifts.start until char_shifts.after loop
char_shifts.item_for_iteration [i] := max_shift
if char_shifts.key_for_iteration = pattern [i] then
char_shifts.item_for_iteration [i] := 0
end
char_shifts.forth
end
end
i := i - 1
end
end
feature -- Access
index (text: STRING_32; start_index: INTEGER): INTEGER
-- index of first pattern occurrence in text starting from 'start_index'
-- Returns 0 if not found
local
pos, i, shift: INTEGER; position_shifted, found: BOOLEAN
shifts: like char_shifts; other_shifts: like other_char_shifts
do
shifts := char_shifts; other_shifts := other_char_shifts
from pos := start_index until found or pos > text.count - pattern_count + 1 loop
position_shifted := False
from i := pattern_count until position_shifted or i = 0 loop
if shifts.has_key (text [pos + i - 1]) then
shift := shifts.found_item [i]
if shift /= 0 then
pos := pos + shift
position_shifted := True
elseif i = 1 then
Result := pos
found := True
end
else
pos := pos + other_shifts [i]
position_shifted := True
end
i := i - 1
end
end
end
feature -- Basic operations
log_print
do
from char_shifts.start until char_shifts.after loop
-- log.put_character (char_shifts.key_for_iteration.to_character_8)
-- log.put_string (": ")
char_shifts.item_for_iteration.do_all_with_index (agent log_shifts)
-- log.put_new_line
char_shifts.forth
end
-- log.put_string ("*: ")
other_char_shifts.do_all_with_index (agent log_shifts)
-- log.put_new_line
end
log_shifts (shift, pos: INTEGER)
do
if pos > 1 then
-- log.put_string (", ")
end
-- log.put_integer (shift)
end
feature {NONE} -- Implementation
last_index_of (str, pattern: STRING_32): INTEGER
-- last pos of pattern in str
local
pos: INTEGER
match_found: BOOLEAN
do
if pattern.count <= str.count then
from pos := str.count - pattern.count + 1 until match_found or pos = 0 loop
if pattern ~ str.substring (pos, pos + pattern.count - 1) then
match_found := True
else
pos := pos - 1
end
end
Result := pos
else
Result := 0
end
end
char_shifts: EL_HASH_TABLE [like other_char_shifts, CHARACTER_32]
other_char_shifts: ARRAY [INTEGER]
pattern_count: INTEGER
end