class FILTER_INVALID_UTF_8

(source code)

Description

Command to filter out all invalid UTF-8 lines from a file

note
	description: "Command to Filter out all invalid UTF-8 lines from file"

	author: "Finnian Reilly"
	copyright: "Copyright (c) 2001-2022 Finnian Reilly"
	contact: "finnian at eiffel hyphen loop dot com"

	license: "MIT license (See: en.wikipedia.org/wiki/MIT_License)"
	date: "2022-11-23 8:31:54 GMT (Wednesday 23rd November 2022)"
	revision: "12"

class
	FILTER_INVALID_UTF_8

inherit
	EL_APPLICATION_COMMAND

	EL_MODULE_LOG

create
	make

feature {EL_COMMAND_CLIENT} -- Initialization

	make (a_source_path: like source_path)
			-- Set `source_path' to `a_source_path' and derive `output_path' from it by
			-- removing the extension, appending the extension "fixed" and then
			-- re-appending the extension of `source_path'
		do
			source_path := a_source_path
			output_path := source_path.without_extension
			output_path.add_extension ("fixed")
			output_path.add_extension (source_path.extension)
		end

feature -- Constants

	Description: STRING = "Filter out all invalid UTF-8 lines from file"

feature -- Basic operations

	execute
			-- Copy each line of the file at `source_path' accepted by
			-- `{EL_UTF_CONVERTER}.is_valid_utf_8_string_8' to the file at `output_path',
			-- and report the number of rejected lines
		local
			in_file, out_file: PLAIN_TEXT_FILE; line: STRING
			bad_count: INTEGER; c: EL_UTF_CONVERTER
			line_written: BOOLEAN
		do
			log.enter ("execute")
			lio.put_path_field ("Filtering", source_path)
			lio.put_new_line

			create in_file.make_open_read (source_path)
			create out_file.make_open_write (output_path)

			from until in_file.end_of_file loop
				in_file.read_line
				line := in_file.last_string
				if c.is_valid_utf_8_string_8 (line) then
					-- A separator goes before every written line except the first.
					-- `out_file.position > 0' cannot serve as the test: the position is
					-- still 0 after an empty first line has been written, which caused
					-- leading empty lines to be dropped from the output.
					if line_written then
						out_file.put_new_line
					end
					out_file.put_string (line)
					line_written := True
				else
					bad_count := bad_count + 1
				end
			end
			out_file.close; in_file.close
			lio.put_integer_field ("Found bad lines", bad_count)
			lio.put_new_line
			log.exit
		end

feature {NONE} -- Internal attributes

	output_path: FILE_PATH
			-- Path of the filtered copy written by `execute'

	source_path: FILE_PATH
			-- Path of the file read by `execute'

end