class EL_ZCODEC
Client examples: DOCUMENT_NODE_STRING_TEST_SET ; STRING_CONVERSION_TEST_SET ; TEXT_DATA_TEST_SET ; ZCODEC_AS_Z_CODE
Base class for Latin, Windows and UTF-8 codecs
Proven to be thread-safe in repository publishing command
note
description: "Base class for Latin, Windows and UTF-8 codecs"
notes: "Proven to be thread-safe in repository publishing command"
author: "Finnian Reilly"
copyright: "Copyright (c) 2001-2022 Finnian Reilly"
contact: "finnian at eiffel hyphen loop dot com"
license: "MIT license (See: en.wikipedia.org/wiki/MIT_License)"
date: "2024-11-08 9:11:25 GMT (Friday 8th November 2024)"
revision: "80"
deferred class
EL_ZCODEC
inherit
EL_ZCODEC_IMPLEMENTATION
export
{EL_ZSTRING_IMPLEMENTATION} shared_interval_list
end
EL_SET [CHARACTER]
rename
has as in_latin_1_disjoint_set
end
feature {EL_ZCODEC_FACTORY} -- Initialization
make
-- Creation procedure: build the lookup tables used by the encoding and decoding routines.
-- The order of the statements matters (see the note on `utf_8_byte_count_table').
do
make_default
create latin_characters.make_filled ('%U', 1)
unicode_table := new_unicode_table
-- must come after the `unicode_table' assignment because
-- `new_utf_8_byte_count_table' reads `unicode_table'
utf_8_byte_count_table := new_utf_8_byte_count_table
shared_interval_list := Empty_string.Once_interval_list
-- deferred hook implemented by each concrete codec
initialize_latin_sets
end
feature -- Measurement
utf_8_byte_count (latin_area: SPECIAL [CHARACTER]; count: INTEGER): INTEGER
		-- Sum of the `utf_8_byte_count_table' entries for the first `count' characters
		-- of `latin_area' (indices 0 .. `count' - 1)
	require
		valid_count: count > 0 implies latin_area.valid_index (count - 1)
	local
		index: INTEGER; byte_counts: like utf_8_byte_count_table
	do
		byte_counts := utf_8_byte_count_table
		from until index = count loop
			Result := Result + byte_counts [latin_area [index].code]
			index := index + 1
		end
	end
feature -- Character query
is_alpha (code: NATURAL): BOOLEAN
-- Is character `code' alphabetic? Deferred: answered by each concrete codec.
-- NOTE(review): callers in this class pass `CHARACTER.natural_32_code', so `code' is presumably
-- a single-byte code in the current encoding rather than a Unicode code point - confirm
deferred
end
is_alphanumeric (code: NATURAL): BOOLEAN
		-- `True' if `code' satisfies `is_numeric' or, failing that, `is_alpha'
	do
		if is_numeric (code) then
			Result := True
		else
			Result := is_alpha (code)
		end
	end
is_lower (code: NATURAL): BOOLEAN
-- Is character `code' lower case? Deferred: answered by each concrete codec.
-- Used by `same_caseless' to decide whether `to_upper_offset' applies.
deferred
end
is_numeric (code: NATURAL): BOOLEAN
		-- `True' if `code' lies in the range 48 .. 57 (ASCII digits '0' .. '9')
	do
		Result := code >= 48 and code <= 57
	end
is_upper (code: NATURAL): BOOLEAN
-- Is character `code' upper case? Deferred: answered by each concrete codec.
-- Used by `same_caseless' to decide whether `to_lower_offset' applies.
deferred
end
same_caseless (a, b: CHARACTER; b_unicode: CHARACTER_32): BOOLEAN
		-- `True' if `a' and `b' are the same character regardless of case.
		-- When `a' has a case but no single-byte case offset, the result of
		-- `unicode_case_change_substitute' is compared with `b_unicode' instead.
	require
		valid_caseless: valid_caseless_argument (b, b_unicode)
	local
		other_case: CHARACTER_32; offset: INTEGER
		code: NATURAL; has_case: BOOLEAN
	do
		if a = b then
			Result := True
		else
			code := a.natural_32_code
			if is_lower (code) then
				offset := to_upper_offset (code)
				has_case := True
			elseif is_upper (code) then
				offset := to_lower_offset (code)
				has_case := True
			end
			if offset /= 0 then
				-- the other case of `a' is also a single-byte character
				Result := a + offset = b
			elseif has_case then
				other_case := unicode_case_change_substitute (code)
				if other_case > '%U' then
					Result := other_case = b_unicode
				end
			end
		end
	end
same_caseless_characters (area, other_area: SPECIAL [CHARACTER]; other_offset, start_index, count: INTEGER): BOOLEAN
		-- `True' if the `count' characters of `area' starting at `start_index' match the characters
		-- of `other_area' at the same indices shifted by `other_offset', ignoring case.
		-- A `Substitute' character only matches another `Substitute' character: the unencoded
		-- characters they stand for are not compared here.
	local
		i, j: INTEGER; c, other_c: CHARACTER
	do
		Result := True
		from i := 0 until not Result or i = count loop
			j := start_index + i
			c := area [j]; other_c := other_area [j + other_offset]
			if c = Substitute or other_c = Substitute then
				-- Never pass `Substitute' to `same_caseless' with a null `b_unicode':
				-- that violates its `valid_caseless' precondition
				Result := c = other_c
			else
				Result := same_caseless (c, other_c, '%U')
			end
			i := i + 1
		end
	end
feature -- Contract Support
is_encodeable_as_string_8 (str: EL_READABLE_ZSTRING; start_index, end_index: INTEGER): BOOLEAN
-- `True' if contents of `str' is encodeable as a single-byte string
-- using the `unicode_table' of the current codec.
-- Note: when `str.codec' shares the same `unicode_table' object as `Current', the answer is
-- `not str.has_mixed_encoding' for the whole string; `start_index' and `end_index' are not consulted.
require
valid_start_index: str.valid_index (start_index) and str.valid_index (end_index)
local
i, in_offset, block_index, i_upper, i_lower: INTEGER; uc_i: CHARACTER_32
iter: EL_COMPACT_SUBSTRINGS_32_ITERATION; c_i: CHARACTER
do
if attached str.area as str_area and then attached str.unencoded_area as area_32
and then attached unicode_table as unicode
and then attached str.codec.unicode_table as zstring_unicode
then
in_offset := str.area_lower
Result := True
-- reference comparison of the two tables
if unicode = zstring_unicode then
Result := not str.has_mixed_encoding
else
-- convert 1-based string indices to `str_area' indices
i_lower := start_index + in_offset - 1
i_upper := end_index + in_offset - 1
-- stop at the first character that cannot be encoded
from i := i_lower until not Result or i > i_upper loop
c_i := str_area [i]
-- obtain the Unicode character for position `i'
inspect c_i
when Substitute then
-- unencoded character: look it up in `area_32' by 1-based string index
uc_i := iter.item ($block_index, area_32, i - in_offset + 1)
else
uc_i := zstring_unicode [c_i.code]
end
inspect uc_i.code
when 0 .. Max_ascii_code then
-- do nothing for ASCII
when 0x80 .. Max_8_bit_code then
-- same code point as the current table entry: encodeable as is
if unicode [uc_i.code] /= uc_i then
Result := latin_character (uc_i) /= '%U'
end
else
Result := latin_character (uc_i) /= '%U'
end
i := i + 1
end
end
end
end
is_compatible_string_8 (latin_1_area: SPECIAL [CHARACTER]; i_lower, i_upper: INTEGER): BOOLEAN
		-- `True' if none of the Latin-1 encoded characters in `latin_1_area' from `i_lower'
		-- to `i_upper' needs to be changed to match the current `encoding'
	local
		routines: EL_CHARACTER_8_ROUTINES
	do
		-- For any encoding other than Latin-1 the answer depends on `in_latin_1_disjoint_set':
		-- `Current' is passed as the `EL_SET [CHARACTER]' to test membership against
		Result := encoding = Latin_1 or else not routines.has_member (Current, latin_1_area, i_lower, i_upper)
	end
valid_caseless_argument (b: CHARACTER; b_unicode: CHARACTER_32): BOOLEAN
		-- `True' if `b_unicode' is consistent with `b': a `Substitute' character must come with
		-- a non-null `b_unicode', and any other character with a null `b_unicode'
	do
		if b /= Substitute then
			Result := b_unicode = '%U'
		else
			Result := b_unicode.code /= 0
		end
	end
valid_offset_and_count (source_count: INTEGER; encoded_out: SPECIAL [CHARACTER]; out_offset: INTEGER;): BOOLEAN
		-- `True' if `encoded_out' holds at least `source_count' items and, for a non-empty source,
		-- the last output index `source_count + out_offset - 1' is valid
	do
		Result := encoded_out.count >= source_count
			and then (source_count > 0 implies encoded_out.valid_index (source_count + out_offset - 1))
	end
feature {EL_SHARED_ZSTRING_CODEC, EL_ENCODING_BASE, STRING_HANDLER} -- Access
order_comparison (a_zcode, b_zcode: NATURAL): INTEGER
		-- Unicode value of `b_zcode' minus Unicode value of `a_zcode'.
		-- Comparison must be done as unicode and never Latin-X or Windows-X
	local
		a_unicode, b_unicode: INTEGER
	do
		a_unicode := z_code_as_unicode (a_zcode).to_integer_32
		b_unicode := z_code_as_unicode (b_zcode).to_integer_32
		Result := b_unicode - a_unicode
	end
unicode_table: like new_unicode_table
-- map latin to unicode: indexed by single-byte character code,
-- each item is the corresponding Unicode character. Set once in `make'.
utf_8_byte_count_table: SPECIAL [INTEGER_8]
-- precomputed utf-8 byte counts for characters in `unicode_table'
-- The entry for `Substitute_code' is forced to 0 by `new_utf_8_byte_count_table'.
feature -- Encoding operations
append_encoded_to (str: READABLE_STRING_8; output: ZSTRING)
		-- Append `str', encoded with the current `encoding', to `output'
	do
		if not encoded_as_latin (1) then
			-- convert through the shared unicode buffer first
			Unicode_buffer.set_from_encoded (Current, str)
			output.append_string_general (Unicode_buffer)
		else
			output.append_string_general (str)
		end
	end
append_encoded_to_string_8 (unicode_in: READABLE_STRING_GENERAL; output: STRING)
		-- Append `unicode_in', encoded with the current codec, to the end of `output'
	local
		old_count, grown_count: INTEGER
	do
		old_count := output.count
		grown_count := old_count + unicode_in.count
		-- make room first, then encode directly into the area of `output'
		output.grow (grown_count)
		output.set_count (grown_count)
		encode_substring_general (
			unicode_in, output.area, 1, unicode_in.count, old_count, shared_interval_list.emptied
		)
	end
encode (
	unicode_in: READABLE_STRING_GENERAL; encoded_out: SPECIAL [CHARACTER]; out_offset: INTEGER;
	unencoded_intervals: EL_ARRAYED_INTERVAL_LIST
)
		-- Encode all of `unicode_in' into `encoded_out' starting at `out_offset'
		-- by calling `encode_substring_general' for the range 1 .. `unicode_in.count'
	local
		last_index: INTEGER
	do
		last_index := unicode_in.count
		encode_substring_general (unicode_in, encoded_out, 1, last_index, out_offset, unencoded_intervals)
	end
encode_as_string_8 (unicode_in: READABLE_STRING_GENERAL; encoded_out: SPECIAL [CHARACTER]; out_offset: INTEGER)
		-- Same as `encode' but using the emptied `shared_interval_list' as the interval list
	local
		scratch_intervals: EL_ARRAYED_INTERVAL_LIST
	do
		scratch_intervals := shared_interval_list.emptied
		encode (unicode_in, encoded_out, out_offset, scratch_intervals)
	end
encode_substring_general (
	unicode_in: READABLE_STRING_GENERAL; encoded_out: SPECIAL [CHARACTER]
	start_index, end_index, out_offset: INTEGER; unencoded_intervals: EL_ARRAYED_INTERVAL_LIST
)
		-- encode general `unicode_in' characters from `start_index' to `end_index' as current `encoding',
		-- writing to `encoded_out' from index `out_offset'.
		-- Set unencodeable characters as the `Substitute' character (26) and record location in `unencoded_intervals'
		-- Dispatches on `string_storage_type': '1' is handled here as Latin-1 `READABLE_STRING_8' input,
		-- '4' by `encode_substring_32' and 'X' by `encode_substring_z'.
	require
		valid_offset_and_count: valid_offset_and_count (end_index - start_index + 1, encoded_out, out_offset)
		empty_intervals: unencoded_intervals.is_empty
	local
		i, out_i, in_offset, i_lower, i_upper: INTEGER; interval: NATURAL_64
		area: SPECIAL [CHARACTER]; unicode: like unicode_table; c_i, latin_c: CHARACTER
	do
		inspect string_storage_type (unicode_in)
			when '1' then
				if attached {READABLE_STRING_8} unicode_in as s_8 and then attached cursor_8 (s_8) as c_8 then
					unicode := unicode_table; in_offset := c_8.area_first_index; area := c_8.area
					-- convert 1-based string indices to `area' indices
					i_lower := start_index + in_offset - 1
					i_upper := end_index + in_offset - 1
					out_i := i_lower - in_offset + out_offset - start_index + 1
					from i := i_lower until i > i_upper loop
						c_i := area [i]
						inspect character_8_band (c_i)
							when Ascii_range, Substitute then
								encoded_out [out_i] := c_i
						else
							-- Compare as CHARACTER_32: table entries may exceed 0xFF and so
							-- cannot safely be narrowed with `to_character_8'
							if unicode [c_i.code] = c_i.to_character_32 then
								-- same code in Latin-1 and in the current encoding
								encoded_out [out_i] := c_i
							else
								latin_c := latin_character (c_i)
								if latin_c = '%U' then
									encoded_out [out_i] := Substitute
									interval := unencoded_intervals.extend_next_upper (interval, out_i + 1)
								else
									-- the character lives at a different code in the current encoding:
									-- write the translated byte, not the Latin-1 byte
									encoded_out [out_i] := latin_c
								end
							end
						end
						i := i + 1; out_i := out_i + 1
					end
					unencoded_intervals.extend_compact (interval)
				end
			when '4' then
				if attached {READABLE_STRING_32} unicode_in as str_32 then
					encode_substring_32 (str_32, encoded_out, start_index, end_index, out_offset, unencoded_intervals)
				end
			when 'X' then
				if attached {EL_READABLE_ZSTRING} unicode_in as str_z then
					encode_substring_z (str_z, encoded_out, start_index, end_index, out_offset, unencoded_intervals)
				end
		end
	end
encode_utf (
utf_in: READABLE_STRING_8; encoded_out: SPECIAL [CHARACTER]; utf_type, unicode_count, out_offset: INTEGER
unencoded_characters: EL_COMPACT_SUBSTRINGS_32_BUFFER
)
-- encode UTF-8 (`utf_type' = 8) or UTF-16 little-endian (`utf_type' = 16) bytes in `utf_in' as latin,
-- writing one character per decoded code point to `encoded_out' starting at `out_offset'
-- Set unencodeable characters as the Substitute character (26) and record each one,
-- with its 1-based output position, in `unencoded_characters'
require
valid_utf_type: utf_type = 8 or utf_type = 16
valid_utf_16_input: utf_type = 16 implies utf_in.count \\ 2 = 0
valid_offset_and_count: valid_offset_and_count (unicode_count, encoded_out, out_offset)
local
i, j, byte_count, end_index, last_upper: INTEGER; leading_byte, unicode, code_1: NATURAL
uc: CHARACTER_32; c: CHARACTER; area: SPECIAL [CHARACTER]
l_unicodes: like unicode_table; is_utf_8_in: BOOLEAN
utf_8: EL_UTF_8_CONVERTER; utf_16_le: EL_UTF_16_LE_CONVERTER
do
l_unicodes := unicode_table; is_utf_8_in := utf_type = 8
if attached cursor_8 (utf_in) as cursor then
area := cursor.area; end_index := cursor.area_last_index
last_upper := unencoded_characters.last_upper
-- `i' indexes input bytes, `j' indexes output characters
from i := cursor.area_first_index; j := out_offset until i > end_index loop
if is_utf_8_in then
leading_byte := area [i].natural_32_code
byte_count := utf_8.sequence_count (leading_byte)
unicode := utf_8.unicode (area, leading_byte, i, byte_count)
else
-- assemble the first 16-bit unit from two bytes, low byte first
code_1 := area [i].natural_32_code | (area [i + 1].natural_32_code |<< 8)
byte_count := utf_16_le.sequence_count (code_1)
unicode := utf_16_le.unicode (area, code_1, i, byte_count)
end
uc := unicode.to_character_32
-- same code in Unicode and in the current encoding: copy as is
if unicode <= 0xFF and then l_unicodes [uc.code] = uc then
encoded_out [j] := uc.to_character_8
else
c := latin_character (uc)
if c = '%U' then
encoded_out [j] := Substitute
last_upper := unencoded_characters.extend (uc, last_upper, j + 1)
else
encoded_out [j] := c
end
end
-- advance by the whole byte sequence but by a single output character
i := i + byte_count
j := j + 1
end
unencoded_characters.set_last_upper (last_upper)
end
end
re_encode_substring (
other: EL_ZCODEC; str_8: READABLE_STRING_8; encoded_out: SPECIAL [CHARACTER]
start_index, end_index, out_offset: INTEGER; unencoded_intervals: EL_ARRAYED_INTERVAL_LIST
)
-- re-encode single-byte `str_8' characters encoded with `other' codec
-- from `start_index' to `end_index', writing to `encoded_out' from index `out_offset'
-- Set unencodeable characters as the `Substitute' character (26) and record location in `unencoded_intervals'
-- When `other' has the same `id' as `Current' the bytes are copied unchanged
-- and `unencoded_intervals' is left untouched.
require
valid_offset_and_count: valid_offset_and_count (end_index - start_index + 1, encoded_out, out_offset)
local
i, out_i, code_i, in_offset: INTEGER; interval: NATURAL_64; c: CHARACTER; uc: CHARACTER_32
c_8_area: SPECIAL [CHARACTER_8]; o_unicode, unicode: like unicode_table
do
if attached cursor_8 (str_8) as c_8 then
in_offset := c_8.area_first_index; c_8_area := c_8.area
if id = other.id then
-- same codec: plain block copy
encoded_out.copy_data (c_8_area, start_index + in_offset - 1, out_offset, end_index - start_index + 1)
else
o_unicode := other.unicode_table; unicode := unicode_table
-- `i' is a 1-based index into `str_8'
from i := start_index until i > end_index loop
c := c_8_area [i + in_offset - 1]; code_i := c.code
out_i := i + out_offset - start_index
if code_i <= Max_ascii_code then -- ASCII characters are the same
encoded_out [out_i] := c
else
-- decode with the `other' table; from here on `code_i' is the Unicode code
uc := o_unicode [code_i]; code_i := uc.code
if unicode.valid_index (code_i) and then unicode [code_i] = uc then
encoded_out [out_i] := uc.to_character_8
else
c := latin_character (uc)
if c = '%U' then
encoded_out [out_i] := Substitute
interval := unencoded_intervals.extend_next_upper (interval, out_i + 1)
else
encoded_out [out_i] := c
end
end
end
i := i + 1
end
unencoded_intervals.extend_compact (interval)
end
end
end
write_encoded (unicode_in: READABLE_STRING_GENERAL; writeable: EL_WRITABLE)
		-- Encode `unicode_in' into a string borrowed from `String_8_pool' and write the
		-- encoded characters one by one to `writeable'
	local
		index, char_count: INTEGER
	do
		char_count := unicode_in.count
		if attached String_8_pool.sufficient_item (char_count) as loan then
			if attached loan.empty as buffer and then attached buffer.area as bytes then
				encode_as_string_8 (unicode_in, bytes, 0)
				from until index = char_count loop
					writeable.write_encoded_character_8 (bytes [index])
					index := index + 1
				end
			end
			-- hand the borrowed string back to the pool
			loan.return
		end
	end
write_encoded_character (uc: CHARACTER_32; writeable: EL_WRITABLE)
		-- Write `uc' to `writeable' as the single byte given by `encoded_character'
	local
		encoded: CHARACTER
	do
		encoded := encoded_character (uc)
		writeable.write_encoded_character_8 (encoded)
	end
feature -- Basic operations
decode (a_count: INTEGER; latin_in: SPECIAL [CHARACTER]; unicode_out: SPECIAL [CHARACTER_32]; out_offset: INTEGER)
		-- Decode the first `a_count' characters of `latin_in' to unicode, writing them to
		-- `unicode_out' from index `out_offset'. Positions holding `Substitute' are skipped.
	require
		enough_latin_characters: latin_in.count > a_count
		unicode_out_big_enough: unicode_out.count >= a_count + out_offset
	local
		i: INTEGER; is_latin_1: BOOLEAN; c_i: CHARACTER; unicode: like unicode_table
	do
		is_latin_1 := encoded_as_latin (1)
		unicode := unicode_table
		from until i = a_count loop
			c_i := latin_in [i]
			inspect character_8_band (c_i)
				when Ascii_range then
					unicode_out [i + out_offset] := c_i.code.to_character_32
				when Substitute then
					-- Filled in later by call to `{EL_COMPACT_SUBSTRINGS_32_I}.write'
			else
				if is_latin_1 then
					unicode_out [i + out_offset] := c_i.to_character_32
				else
					unicode_out [i + out_offset] := unicode [c_i.code]
				end
			end
			i := i + 1
		end
	end
to_lower (
characters: SPECIAL [CHARACTER]; start_index, end_index: INTEGER; unencoded_characters: EL_COMPACT_SUBSTRINGS_32_I
)
-- Replace all characters in `characters' between `start_index' and `end_index'
-- with their lower version when available.
-- Delegates to `change_case' with `{EL_CASE}.Lower'.
do
change_case (characters, start_index, end_index, {EL_CASE}.Lower, unencoded_characters)
end
to_proper (
characters: SPECIAL [CHARACTER]; start_index, end_index: INTEGER; unencoded_characters: EL_COMPACT_SUBSTRINGS_32_I
)
-- Convert all characters in `characters' between `start_index' and `end_index' to proper case:
-- the first letter of each alphabetic run becomes upper case and the remaining characters
-- lower case, when such a version is available.
-- Delegates to `change_case' with `{EL_CASE}.Proper'.
do
change_case (characters, start_index, end_index, {EL_CASE}.Proper, unencoded_characters)
end
to_upper (
characters: SPECIAL [CHARACTER]; start_index, end_index: INTEGER; unencoded_characters: EL_COMPACT_SUBSTRINGS_32_I
)
-- Replace all characters in `characters' between `start_index' and `end_index'
-- with their upper version when available.
-- Delegates to `change_case' with `{EL_CASE}.Upper'.
do
change_case (characters, start_index, end_index, {EL_CASE}.Upper, unencoded_characters)
end
feature -- Text conversion
as_unicode (encoded: READABLE_STRING_8; keeping_ref: BOOLEAN): READABLE_STRING_GENERAL
		-- `encoded' as unicode, assuming its encoding matches the `Current' codec.
		-- The result is either `encoded' itself or the shared `Unicode_buffer', so specify
		-- `keeping_ref' as `True' to obtain a twin when keeping a reference to `Result'
	do
		if not encoded_as_latin (1) and then not cursor_8 (encoded).all_ascii then
			Unicode_buffer.set_from_encoded (Current, encoded)
			Result := Unicode_buffer
		else
			-- Latin-1 or pure ASCII needs no conversion
			Result := encoded
		end
		if keeping_ref then
			Result := Result.twin
		end
	end
feature -- Character conversion
as_z_code (uc: CHARACTER_32): NATURAL
		-- Returns hybrid code of latin and unicode
		-- Single byte codes are reserved for latin encoding.
		-- Unicode characters below 0xFF are shifted into the private use range: 0xE000 .. 0xF8FF
		-- See https://en.wikipedia.org/wiki/Private_Use_Areas
	local
		latin_c: CHARACTER; unicode: INTEGER
	do
		unicode := uc.code
		if unicode <= Max_ascii_code
			or else (unicode <= Max_8_bit_code and then unicode_table [unicode] = uc)
		then
			-- ASCII, or same code in unicode and in the current encoding
			Result := uc.natural_32_code
		else
			latin_c := latin_character (uc)
			if latin_c /= '%U' then
				Result := latin_c.natural_32_code
			else
				Result := unicode_to_z_code (uc.natural_32_code)
			end
		end
	ensure
		reversible: z_code_as_unicode (Result) = uc.natural_32_code
	end
encoded_character (uc: CHARACTER_32): CHARACTER
		-- `uc' as a single byte of the current encoding,
		-- or `Substitute' when `latin_character' has no translation for it
	local
		code: INTEGER
	do
		code := uc.code
		if code <= Max_ascii_code
			or else (code <= Max_8_bit_code and then unicode_table [code] = uc)
		then
			-- ASCII, or same code in unicode and in the current encoding
			Result := uc.to_character_8
		else
			Result := latin_character (uc)
			if Result = '%U' then
				Result := Substitute
			end
		end
	end
latin_character (uc: CHARACTER_32): CHARACTER
-- unicode to latin translation
-- Returns '%U' if translation is the same as ISO-8859-1 or else not in current set
-- Callers in this class therefore test for an identical code in `unicode_table' first
-- and treat a '%U' result as "not encodeable".
deferred
ensure
valid_latin: Result /= '%U' implies unicode_table [Result.code] = uc
end
z_code_as_unicode (z_code: NATURAL): NATURAL
		-- Unicode code for `z_code': single-byte codes are looked up in `unicode_table',
		-- larger codes are converted with `multi_byte_z_code_to_unicode'
	do
		if z_code <= 0xFF then
			Result := unicode_table [z_code.to_integer_32].natural_32_code
		else
			Result := multi_byte_z_code_to_unicode (z_code)
		end
	end
feature {NONE} -- Implementation
change_case (
	latin_in: SPECIAL [CHARACTER]; start_index, end_index: INTEGER; case: NATURAL_8
	unencoded_characters: EL_COMPACT_SUBSTRINGS_32_I
)
		-- Change in place the case of characters in `latin_in' from `start_index' to `end_index'
		-- according to `case' (`{EL_CASE}.Upper', `{EL_CASE}.Lower' or `{EL_CASE}.Proper').
		-- `Substitute' positions are changed in `unencoded_characters' instead.
		-- A single-byte character whose case change only exists in Unicode is replaced by
		-- `Substitute' and its Unicode counterpart is put into `unencoded_characters'.
	require
		valid_case: case /= {EL_CASE}.Default and then (create {EL_CASE}).is_valid (case)
	local
		unicode_substitute, uc_i: CHARACTER_32; new_c, c_i: CHARACTER; i: INTEGER
		state_alpha, block_index: INTEGER; iter: EL_COMPACT_SUBSTRINGS_32_ITERATION
		area_32: SPECIAL [CHARACTER_32]
	do
		area_32 := unencoded_characters.area
		from i := start_index until i > end_index loop
			c_i := latin_in [i]
			inspect character_8_band (c_i)
				when Substitute then
					-- unencoded character: positions in `area_32' are 1-based, hence `i + 1'
					uc_i := iter.item ($block_index, area_32, i + 1)
					inspect case
						when {EL_CASE}.Upper then
							if not uc_i.is_upper then
								iter.put (area_32, uc_i.upper, block_index, i + 1)
							end
						when {EL_CASE}.Lower then
							if not uc_i.is_lower then
								iter.put (area_32, uc_i.lower, block_index, i + 1)
							end
						when {EL_CASE}.Proper then
							if is_first_alpha ($state_alpha, state_alpha, uc_i.is_alpha) then
								if not uc_i.is_upper then
									iter.put (area_32, uc_i.upper, block_index, i + 1)
								end
							else
								if not uc_i.is_lower then
									iter.put (area_32, uc_i.lower, block_index, i + 1)
								end
							end
					else
					end
				when Ascii_range then
					inspect case
						when {EL_CASE}.Upper then
							latin_in [i] := c_i.as_upper
						when {EL_CASE}.Lower then
							latin_in [i] := c_i.as_lower
						when {EL_CASE}.Proper then
							if is_first_alpha ($state_alpha, state_alpha, c_i.is_alpha) then
								latin_in [i] := c_i.as_upper
							else
								latin_in [i] := c_i.as_lower
							end
					else
					end
			else
				-- non-ASCII byte of the current encoding: use the codec's own character queries
				inspect case
					when {EL_CASE}.Upper then
						new_c := as_upper (c_i.natural_32_code).to_character_8
					when {EL_CASE}.Lower then
						new_c := as_lower (c_i.natural_32_code).to_character_8
					when {EL_CASE}.Proper then
						-- `is_alpha' of the codec, not `c_i.is_alpha': the byte has to be classified
						-- in the current encoding, consistent with `as_upper' and `as_lower' below
						if is_first_alpha ($state_alpha, state_alpha, is_alpha (c_i.natural_32_code)) then
							new_c := as_upper (c_i.natural_32_code).to_character_8
						else
							new_c := as_lower (c_i.natural_32_code).to_character_8
						end
				else
				end
				if new_c = c_i then
					-- no single-byte case change: try for one that only exists in Unicode
					unicode_substitute := unicode_case_change_substitute (c_i.natural_32_code)
					if unicode_substitute.natural_32_code > 0 then
						new_c := Substitute
						unencoded_characters.put (unicode_substitute, i + 1)
					end
				end
				if new_c /= c_i then
					latin_in [i] := new_c
				end
			end
			i := i + 1
		end
	end
encode_substring_z (
zstr_in: EL_READABLE_ZSTRING; encoded_out: SPECIAL [CHARACTER]
start_index, end_index, out_offset: INTEGER
unencoded_intervals: EL_ARRAYED_INTERVAL_LIST
)
-- encode `zstr_in' characters from `start_index' to `end_index' as current `encoding',
-- writing to `encoded_out' from index `out_offset'
-- Set unencodeable characters as the `Substitute' character (26) and record location in `unencoded_intervals'
require
valid_offset_and_count: valid_offset_and_count (end_index - start_index + 1, encoded_out, out_offset)
local
i, i_lower, i_upper, out_i, code_i, in_offset, block_index, count: INTEGER; interval: NATURAL_64
uc_i: CHARACTER_32; iter: EL_COMPACT_SUBSTRINGS_32_ITERATION; latin_c, c_i: CHARACTER
encode_default: BOOLEAN
do
if attached zstr_in.area as area and then attached zstr_in.unencoded_area as area_32
and then attached unicode_table as unicode
and then attached zstr_in.codec.unicode_table as zstring_unicode
then
in_offset := zstr_in.area_lower
-- convert 1-based string indices to `area' indices
i_lower := start_index + in_offset - 1
i_upper := end_index + in_offset - 1
out_i := i_lower - in_offset + out_offset - start_index + 1
-- reference comparison of the two tables
if unicode = zstring_unicode then
-- same encoding
count := end_index - start_index + 1
encoded_out.copy_data (zstr_in.area, i_lower, out_offset, count)
if zstr_in.has_mixed_encoding then
-- bytes are already copied: only the `Substitute' positions need recording
from i := i_lower until i > i_upper loop
c_i := area [i]
inspect c_i
when Substitute then
interval := unencoded_intervals.extend_next_upper (interval, out_i + 1)
else
end
i := i + 1; out_i := out_i + 1
end
unencoded_intervals.extend_compact (interval)
end
else
from i := i_lower until i > i_upper loop
c_i := area [i]
-- obtain the Unicode character for position `i'
inspect c_i
when Substitute then
uc_i := iter.item ($block_index, area_32, i - in_offset + 1)
else
uc_i := zstring_unicode [c_i.code]
end
code_i := uc_i.code
inspect code_i
when 0 .. Max_ascii_code then
encoded_out [out_i] := uc_i.to_character_8
when 0x80 .. Max_8_bit_code then
-- same code in Unicode and in the current encoding: copy as is
if unicode [code_i] = uc_i then
encoded_out [out_i] := uc_i.to_character_8
else
encode_default := True
end
else
encode_default := True
end
if encode_default then
latin_c := latin_character (uc_i)
inspect latin_c
when '%U' then
encoded_out [out_i] := Substitute
interval := unencoded_intervals.extend_next_upper (interval, out_i + 1)
else
encoded_out [out_i] := latin_c
end
-- reset the flag for the next character
encode_default := False
end
i := i + 1; out_i := out_i + 1
end
unencoded_intervals.extend_compact (interval)
end
end
end
encode_substring_32 (
unicode_in: READABLE_STRING_32; encoded_out: SPECIAL [CHARACTER]
start_index, end_index, out_offset: INTEGER
unencoded_intervals: EL_ARRAYED_INTERVAL_LIST
)
-- encode `unicode_in' characters from `start_index' to `end_index' as current `encoding',
-- writing to `encoded_out' from index `out_offset'
-- Set unencodeable characters as the `Substitute' character (26) and record location in `unencoded_intervals'
require
not_zstring_in: string_storage_type (unicode_in) /= 'X'
valid_offset_and_count: valid_offset_and_count (end_index - start_index + 1, encoded_out, out_offset)
local
i, i_lower, i_upper, out_i, in_offset: INTEGER; latin_c: CHARACTER
interval: NATURAL_64; encode_default: BOOLEAN; uc_i: CHARACTER_32
do
if attached cursor_32 (unicode_in) as c_32 and then attached c_32.area as area_32
and then attached unicode_table as unicode
then
in_offset := c_32.area_first_index
-- convert 1-based string indices to `area_32' indices
i_lower := start_index + in_offset - 1
i_upper := end_index + in_offset - 1
out_i := i_lower - in_offset + out_offset - start_index + 1
from i := i_lower until i > i_upper loop
uc_i := area_32 [i]
inspect uc_i.code
when 0 .. Max_ascii_code then
encoded_out [out_i] := uc_i.to_character_8
when 0x80 .. Max_8_bit_code then
-- same code in Unicode and in the current encoding: copy as is
if unicode [uc_i.code] = uc_i then
encoded_out [out_i] := uc_i.to_character_8
else
encode_default := True
end
else
encode_default := True
end
if encode_default then
latin_c := latin_character (uc_i)
inspect latin_c
when '%U' then
encoded_out [out_i] := Substitute
interval := unencoded_intervals.extend_next_upper (interval, out_i + 1)
else
encoded_out [out_i] := latin_c
end
-- reset the flag for the next character
encode_default := False
end
i := i + 1; out_i := out_i + 1
end
unencoded_intervals.extend_compact (interval)
end
end
is_first_alpha (state_alpha_ptr: TYPED_POINTER [INTEGER]; state_alpha: INTEGER; is_alpha_item: BOOLEAN): BOOLEAN
		-- `True' if `is_alpha_item' is true while `state_alpha' is 0, i.e. the item is the first
		-- letter after a non-letter. The new state (1 inside a run of letters, 0 outside)
		-- is written through `state_alpha_ptr'.
	require
		not_default_pointer: not state_alpha_ptr.is_default_pointer
	local
		pointer: EL_POINTER_ROUTINES
	do
		inspect state_alpha
			when 0 then
				if is_alpha_item then
					Result := True
					pointer.put_integer_32 (1, state_alpha_ptr)
				end
			when 1 then
				if not is_alpha_item then
					pointer.put_integer_32 (0, state_alpha_ptr)
				end
		end
	end
new_utf_8_byte_count_table: SPECIAL [INTEGER_8]
		-- New table of `character_utf_8_byte_count' for items 0 .. 0xFF of `unicode_table',
		-- with the entry for `Substitute_code' set to zero
	local
		code: INTEGER; table: like unicode_table
	do
		table := unicode_table
		create Result.make_empty (table.count)
		from code := 0 until code > 0xFF loop
			Result.extend (character_utf_8_byte_count (table [code].natural_32_code).to_integer_8)
			code := code + 1
		end
		-- special case for SUB character
		Result [Substitute_code] := 0
	end
feature {EL_ZSTRING} -- Deferred implementation
as_lower (code: NATURAL): NATURAL
-- Lower case counterpart of character `code'. Deferred: supplied by each concrete codec.
-- `change_case' treats a result equal to `code' as "no single-byte case change".
deferred
ensure then
reversible: code /= Result implies code = as_upper (Result)
end
as_upper (code: NATURAL): NATURAL
-- Upper case counterpart of character `code'. Deferred: supplied by each concrete codec.
-- `change_case' treats a result equal to `code' as "no single-byte case change".
deferred
ensure then
reversible: code /= Result implies code = as_lower (Result)
end
initialize_latin_sets
-- Deferred initialization hook, called as the last step of `make'
-- after the lookup tables have been created.
deferred
end
in_latin_1_disjoint_set (c: CHARACTER): BOOLEAN
-- true if `c' is `Substitute' character or else in set of 8-bit characters not
-- present in Latin-1
-- This is the `has' feature of `EL_SET [CHARACTER]' renamed in the inherit clause;
-- `is_compatible_string_8' relies on it by passing `Current' as the set.
deferred
end
new_unicode_table: SPECIAL [CHARACTER_32]
-- map latin to unicode
-- Deferred factory for `unicode_table', called once from `make'.
-- `new_utf_8_byte_count_table' reads items 0 .. 0xFF of the result.
deferred
end
to_lower_offset (code: NATURAL): INTEGER
-- Offset to add to character `code' to obtain its lower case counterpart.
-- `same_caseless' treats a zero result as "no single-byte counterpart".
deferred
ensure
reversible: Result /= 0 implies as_upper ((code.to_character_8 + Result).natural_32_code) = code
end
to_upper_offset (code: NATURAL): INTEGER
-- Offset to add to character `code' to obtain its upper case counterpart.
-- `same_caseless' treats a zero result as "no single-byte counterpart".
deferred
ensure
reversible: Result /= 0 implies as_lower ((code.to_character_8 + Result).natural_32_code) = code
end
unicode_case_change_substitute (code: NATURAL): CHARACTER_32
-- Returns Unicode case change character if character `code' does not have a latin case change
-- or else the Null character
-- Callers (`same_caseless', `change_case') test the result against the Null character
-- before using it.
deferred
end
end