class CODEC_INFO

(source code)

Description

Codec information extracted from C source file

note
	description: "Codec information extracted from C source file"

	author: "Finnian Reilly"
	copyright: "Copyright (c) 2001-2022 Finnian Reilly"
	contact: "finnian at eiffel hyphen loop dot com"

	license: "MIT license (See: en.wikipedia.org/wiki/MIT_License)"
	date: "2023-03-30 12:23:15 GMT (Thursday 30th March 2023)"
	revision: "21"

class
	CODEC_INFO

inherit
	EL_FILE_PARSER
		rename
			new_pattern as assignment_pattern
		export
			{NONE} all
		redefine
			make_default
		end

	EVOLICITY_EIFFEL_CONTEXT
		redefine
			make_default
		end

	TP_C_LANGUAGE_FACTORY

	EL_MODULE_LIO

	EL_SHARED_UNICODE_PROPERTY

create
	make

feature {NONE} -- Initialization

	make (a_codec_name: ZSTRING)
			-- Create all container attributes via `make_default' and then
			-- record `a_codec_name' as `codec_name'
		do
			make_default
			codec_name := a_codec_name
		end

	make_default
			-- Create every container attribute and then run both inherited
			-- versions of `make_default'
		do
			create unicode_intervals.make_empty
			create single_case_character_set.make (2)
			create upper_case_offsets.make ("Upper")
			create lower_case_offsets.make ("Lower")
			create latin_characters.make (128)

				-- one table entry for each of the 256 codes, each converted from
				-- the NATURAL_32 value of its own code
			create latin_table.make_empty (0x100)
			across 0 |..| 0xFF as code loop
				latin_table.extend (code.item.to_natural_32)
			end

			Precursor {EL_FILE_PARSER}
			Precursor {EVOLICITY_EIFFEL_CONTEXT}
		end

feature -- Access

	alpha_set: CODE_INTERVAL_LIST
			-- New interval list made with `make_latin_subset' from `latin_table'
			-- and the `is_alpha' query of LATIN_CHARACTER
			-- (presumably the codes of all alphabetic characters; confirm in CODE_INTERVAL_LIST)
		do
			create Result.make_latin_subset (latin_table, agent {LATIN_CHARACTER}.is_alpha)
		end

	codec_base_name: ZSTRING
			-- "ISO_8859" if `codec_name' contains "iso", otherwise "WINDOWS"
		do
			Result := "WINDOWS"
			if codec_name.has_substring ("iso") then
				Result := "ISO_8859"
			end
		end

	codec_id: INTEGER
			-- Integer value of the part of `codec_name' that follows its last underscore
		local
			underscore_index: INTEGER; id_part: ZSTRING
		do
			underscore_index := codec_name.last_index_of ('_', codec_name.count)
			id_part := codec_name.substring_end (underscore_index + 1)
			Result := id_part.to_integer
		end

	codec_name: ZSTRING
			-- name passed to `make'; `codec_base_name' and `codec_id' are derived from it

	numeric_set: CODE_INTERVAL_LIST
			-- New interval list made with `make_latin_subset' from `latin_table'
			-- and the `is_digit' query of LATIN_CHARACTER
			-- (presumably the codes of all digit characters; confirm in CODE_INTERVAL_LIST)
		do
			create Result.make_latin_subset (latin_table, agent {LATIN_CHARACTER}.is_digit)
		end

	unicode_intervals: EL_SORTABLE_ARRAYED_LIST [UNICODE_INTERVAL]
			-- empty after `make_default'; replaced by `set_unicode_intervals'
			-- with the result of `new_unicode_intervals'

feature -- Element change

	add_assignment (text: ZSTRING)
			-- Set `text' as the parser source text and parse it.
			-- The parse uses `assignment_pattern', so a match triggers
			-- `on_latin_code', `on_unicode' and optionally `on_comment'
		do
			set_source_text (text)
			parse
		end

feature -- Basic operations

	set_case_change_offsets
			-- For each alphabetic entry of `latin_table', look up the entry whose unicode
			-- is the opposite-case form and record the pair of latin codes in
			-- `upper_case_offsets' (for upper case characters) or `lower_case_offsets'
			-- (for lower case characters).
			-- Characters with no opposite-case entry in the table are added to
			-- `single_case_character_set' and reported via `lio'.
		local
			searchable: EL_ARRAYED_LIST [LATIN_CHARACTER]; l_char: LATIN_CHARACTER
			offsets: like lower_case_offsets
			code: INTEGER; original, opposite: CHARACTER_32
		do
			create searchable.make_from_special (latin_table)

			from code := 0 until code > 255 loop
				l_char := latin_table.item (code)
				if l_char.is_alpha then
					original := l_char.unicode.to_character_32
					if original.is_upper then
						opposite := original.as_lower
						offsets := upper_case_offsets

					elseif original.is_lower then
						opposite := original.as_upper
						offsets := lower_case_offsets
					else
							-- null character marks "no case conversion available"
						opposite := '%U'
					end
					if opposite /= '%U' then
						searchable.find_first_equal (opposite.natural_32_code, agent {LATIN_CHARACTER}.unicode)
						if not searchable.after then
								-- list positions are 1-based while latin codes start at 0
							offsets.extend (code.to_character_8, (searchable.index - 1).to_character_8)
						else
							single_case_character_set.extend (l_char)
							lio.put_string_field (offsets.name + " case character", l_char.unicode_string)
							lio.put_string_field (" has no latin case change", l_char.inverse_case_unicode_string)
							lio.put_new_line
						end
					else
						lio.put_string_field ("Alpha is neither upper or lower", l_char.unicode_string)
						lio.put_new_line
					end
				end
				code := code + 1
			end
		end

	set_unicode_intervals
			-- Replace `unicode_intervals' with a freshly computed `new_unicode_intervals'
		do
			unicode_intervals := new_unicode_intervals
		end

feature {NONE} -- Pattern definitions

	assignment_pattern: like all_of
			-- Pattern for one table assignment of the C source: two tabs, an identifier,
			-- '[', a hexadecimal constant (passed to `on_latin_code'), the text "] = (char) (",
			-- a second hexadecimal constant (passed to `on_unicode') and ");".
			-- This may be followed by optional white space and a comment introduced by
			-- "//" or "/*", the remainder of which is passed to `on_comment'.
			-- (Renamed from `new_pattern' of EL_FILE_PARSER in the inherit clause)
		do
			Result := all_of (<<
				string_literal ("%T%T"), identifier, character_literal ('['),
				hexadecimal_constant |to| agent on_latin_code,
				string_literal ("] = (char) ("),
				hexadecimal_constant |to| agent on_unicode,
				string_literal (");"),
				optional (
					all_of (<<
						optional_nonbreaking_white_space,
						character_literal ('/'),
						one_character_from ("/*"),
						one_or_more (any_character) |to| agent on_comment
					>>)
				)
			>>)
		end

feature {NONE} -- Match actions

	on_comment (start_index, end_index: INTEGER)
			-- Use the comment text matched between `start_index' and `end_index' as the
			-- name of the character at `last_latin_code' in `latin_table'.
			-- Surrounding white space and any trailing '/' and '*' characters
			-- (the closing delimiter of a C block comment) are removed first.
		local
			l_name: ZSTRING
		do
			l_name := source_substring (start_index, end_index, True)
			l_name.left_adjust
				-- white space after a closing "*/" would otherwise stop the pruning below
			l_name.right_adjust
			l_name.prune_all_trailing ('/')
			l_name.prune_all_trailing ('*')
				-- remove the space that separated the name from the closing "*/"
			l_name.right_adjust
			latin_table.item (last_latin_code).set_name (l_name)
		end

	on_latin_code (start_index, end_index: INTEGER)
			-- Convert the matched hexadecimal text to `last_latin_code', then add a new
			-- character for that code to `latin_characters' and store the same object
			-- in `latin_table' at index `last_latin_code'
		local
			converter: EL_HEXADECIMAL_CONVERTER; new_character: LATIN_CHARACTER
		do
			last_latin_code := converter.to_integer (source_substring (start_index, end_index, False))
			new_character := last_latin_code.to_natural_32
			latin_characters.extend (new_character)
			latin_table.put (new_character, last_latin_code)
		end

	on_unicode (start_index, end_index: INTEGER)
			-- Convert the matched hexadecimal text to a natural number and set it as the
			-- unicode of the character stored at `last_latin_code' in `latin_table'
		local
			converter: EL_HEXADECIMAL_CONVERTER
		do
			latin_table [last_latin_code].set_unicode (
				converter.to_natural_32 (source_substring (start_index, end_index, False))
			)
		end

feature {NONE} -- Implementation

	differing_unicodes: ARRAYED_LIST [LATIN_CHARACTER]
			-- Characters of `latin_table', in ascending order of latin code,
			-- whose unicode is not the same as their latin code
		do
			create Result.make (128)
			across latin_table as table loop
				if attached table.item as l_char and then l_char.unicode /= l_char.code then
					Result.extend (l_char)
				end
			end
		end

	is_case_changeable (latin: LATIN_CHARACTER): BOOLEAN
			-- True if some item of `lower_case_offsets' or, failing that, some item
			-- of `upper_case_offsets' answers True to `has_character (latin)'
		do
			Result := across << lower_case_offsets, upper_case_offsets >> as table some
				across table.item as set some set.item.has_character (latin) end
			end
		end

	new_unicode_intervals: like unicode_intervals
			-- New list of intervals covering the unicodes of all `differing_unicodes'.
			-- The characters are sorted, and a character whose unicode directly follows
			-- the upper bound of the previous interval extends that interval; any other
			-- character starts a new single-value interval. Each character is also
			-- registered with its interval via `extend_latin'.
			-- The result is empty if no latin code differs from its unicode.
		local
			differing: like differing_unicodes
			ascending_unicodes: SORTABLE_ARRAY [LATIN_CHARACTER]
			i, unicode: INTEGER
		do
			differing := differing_unicodes
			create Result.make (differing.count)
				-- guard: with nothing to process there is no first item to read
			if not differing.is_empty then
				create ascending_unicodes.make_from_array (differing.to_array)
				ascending_unicodes.sort
				from i := 1 until i > ascending_unicodes.count loop
					unicode := ascending_unicodes [i].unicode.to_integer_32
					if Result.is_empty or else Result.last.upper + 1 /= unicode then
							-- first character, or a gap after the previous interval
						Result.extend (unicode |..| unicode)
					else
						Result.last.extend (unicode)
					end
					Result.last.extend_latin (ascending_unicodes [i])
					i := i + 1
				end
				Result.ascending_sort
			end
		end

feature {NONE} -- Internal attributes

	last_latin_code: INTEGER
			-- latin code most recently parsed by `on_latin_code'; `on_unicode' and
			-- `on_comment' use it to address the matching `latin_table' entry

	latin_characters: ARRAYED_LIST [LATIN_CHARACTER]
			-- characters created by `on_latin_code', in the order they were parsed

	latin_table: SPECIAL [LATIN_CHARACTER]
			-- 256 characters indexed by latin code 0 .. 255; filled in `make_default'
			-- and overwritten per code by `on_latin_code'

	lower_case_offsets: CASE_OFFSETS_TABLE
			-- created with name "Lower"; `set_case_change_offsets' adds the latin code
			-- pair (character, upper case counterpart) for each lower case character

	upper_case_offsets: CASE_OFFSETS_TABLE
			-- created with name "Upper"; `set_case_change_offsets' adds the latin code
			-- pair (character, lower case counterpart) for each upper case character

	single_case_character_set: ARRAYED_LIST [LATIN_CHARACTER]
			-- alphabetic characters for which `set_case_change_offsets' found no
			-- opposite-case character in `latin_table'

feature {NONE} -- Evolicity fields

	get_unchangeable_case_set_string: STRING
			-- Latin codes, separated by ", ", of the alphabetic characters in
			-- `latin_table' for which `is_case_changeable' is False
		local
			latin: LATIN_CHARACTER
		do
			create Result.make_empty
			across latin_table as table loop
				latin := table.item
				if latin.is_alpha and then not is_case_changeable (latin) then
					if Result.count > 0 then
						Result.append (", ")
					end
					Result.append_natural_32 (latin.code)
				end
			end
		end

	getter_function_table: like getter_functions
			-- Table associating each Evolicity field name with an agent returning its value.
			-- Notable entries: "codec_name" yields `codec_name' in upper case, and
			-- "has_thai_numerals" is true only when `codec_id' is 11.
		do
			create Result.make (<<
				["has_thai_numerals",				agent: BOOLEAN_REF do Result := (codec_id = 11).to_reference end],
				["codec_name", 						agent: ZSTRING do Result := codec_name.as_upper end],
				["codec_base_name", 					agent: ZSTRING do Result := codec_base_name end],
				["latin_characters", 				agent: ITERABLE [LATIN_CHARACTER] do Result := latin_characters end],
				["lower_case_offsets", 				agent: ITERABLE [STRING] do Result := lower_case_offsets.to_string_table end],
				["upper_case_offsets", 				agent: ITERABLE [STRING] do Result := upper_case_offsets.to_string_table end],
				["lower_case_set_string", 			agent: STRING do Result := lower_case_offsets.case_set_string end],
				["upper_case_set_string", 			agent: STRING do Result := upper_case_offsets.case_set_string end],
				["unchangeable_case_set_string", agent get_unchangeable_case_set_string],
				["alpha_set_string", 				agent: STRING do Result := alpha_set.to_string end],
				["numeric_set_string",				agent: STRING do Result := numeric_set.to_string end],
				["codec_id",							agent: INTEGER_REF do Result := codec_id.to_reference end],
				["single_case_character_set", 	agent: ITERABLE [LATIN_CHARACTER] do Result := single_case_character_set end],
				["unicode_intervals", 				agent: ITERABLE [UNICODE_INTERVAL] do Result := unicode_intervals end]
			>>)
		end

end