Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better error descriptions if CSV is invalid #215

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 58 additions & 10 deletions app/controllers/batches_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def create
redirect_to new_batch_step_preprocess_path(@batch)
end
else
@batch.destroy
format.html { return redirect_to new_batch_path }
end
else
Expand Down Expand Up @@ -62,23 +63,70 @@ def format_csv_validation_error(error)
end

def spreadsheet_ok?
continue = false
Batch.csv_validator_for(@batch) do |validator|
csv_lint_ok? && csv_parse_ok?
end

def csv_lint_ok?
Batch.csv_lint_validator_for(@batch) do |validator|
if validator.valid? && within_csv_row_limit?(validator.row_count)
@batch.update(num_rows: validator.row_count)
continue = true
elsif !within_csv_row_limit?(validator.row_count)
@batch.destroy # scrap it, they'll have to start over
return true
end

if !within_csv_row_limit?(validator.row_count)
flash[:csv_too_long] = true
elsif invalid_encoding?(validator.errors)
flash[:invalid_encoding] = invalid_encoding_errs
else
@batch.destroy # scrap it, they'll have to start over
flash[:csv_lint] = validator.errors
.map{ |err| format_csv_validation_error(err) }
.join('|||')
flash[:csv_lint] = format_csvlint_errors(validator.errors)
end
false
end
end

def csv_parse_ok?
Batch.csv_parse_validator_for(@batch) do |parsed|
return true if parsed == :success

# Catches blank rows between data rows or at the end of the CSV
# that have mixed EOL characters, which Csvlint does not flag as
# invalid, but that raise a malformed CSV error in the CSV library
#
# This also catches extraneous CRLF EOL at the end of final data
# line, when EOL char used elsewhere in file is CR
if parsed.start_with?('New line must be')
flash[:blank_rows] = true
# Catches extraneous EOL chars at end of final row that do not match
# the EOL char used in the rest of the file, which Csvlint calls
# calls valid, but that raise a malformed CSV error in the CSV library
elsif parsed.start_with?('Unquoted fields do not allow new line')
flash[:last_row_eol] = true
# Catches any other malformed CSV errors raised by the CSV library, for
# files Csvlint called valid
else
flash[:malformed_csv] = parsed
end
false
end
end

def invalid_encoding_errs
char_finder = @batch.spreadsheet.open do |csv|
InvalidCharacterFinder.new(File.read(csv.path))
end

char_finder.call.join('|||')
end

# @param errors [Array<Csvlint::ErrorMessage>]
def format_csvlint_errors(errors)
flash[:csv_lint] = errors.map { |err| format_csv_validation_error(err) }
.join('|||')
end

continue
# @param errors [Array<Csvlint::ErrorMessage>]
def invalid_encoding?(errors)
errors.any? { |err| err.type == :invalid_encoding }
end

def set_batch
Expand Down
18 changes: 17 additions & 1 deletion app/models/batch.rb
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def self.content_types
CONTENT_TYPES
end

def self.csv_validator_for(batch)
def self.csv_lint_validator_for(batch)
# raise unless batch.spreadsheet.attached? # TODO

batch.spreadsheet.open do |spreadsheet|
Expand All @@ -111,12 +111,28 @@ def self.csv_validator_for(batch)
end
end

def self.csv_parse_validator_for(batch)
# raise unless batch.spreadsheet.attached? # TODO

batch.spreadsheet.open do |spreadsheet|
yield batch.parse_spreadsheet(spreadsheet.path)
end
end

def self.expired(&block)
all.in_batches do |b|
b.find_all(&:expired?).each(&block)
end
end

def parse_spreadsheet(path)
File.open(path, encoding: 'bom|utf-8') { |file| CSV.table(file) }
rescue CSV::MalformedCSVError => e
e.message
else
:success
end

private

def ensure_batch_config_is_json
Expand Down
63 changes: 63 additions & 0 deletions app/services/invalid_character_finder.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# frozen_string_literal: true

# This service returns non-UTF-8 characters from an uploaded CSV, with their
# surrounding context, so that users can find the characters that need to
# be replaced
class InvalidCharacterFinder
def initialize(csv_string)
@csv_string = csv_string
@char_ct = csv_string.length
@context_chars = 20
@min_idx = context_chars - 1
@max_idx = char_ct - context_chars - 1
@replacement_char = '�'
end

# @return [Array<String>]
def call
invalid_char_indexes.values
.map do |idx|
[context(idx, :before),
replacement_char,
context(idx, :after)].join
end
end

private

attr_reader :csv_string, :char_ct, :context_chars, :min_idx, :max_idx,
:replacement_char

# @return [Hash<String=>Integer>] unique invalid characters in CSV as keys;
# index of first occurrence of the string as values
def invalid_char_indexes
csv_string.chars
.map.with_index { |char, idx| char.is_utf8? ? nil : [char, idx] }
.compact
.group_by { |arr| arr[0] }
.values
.map { |arr| arr.first }
.to_h
end

# @return [String] with any invalid UTF-8 characters occurring in the context
# replaced by replacement_char
def context(idx, mode)
range = if mode == :before
Range.new(min(idx), idx - 1)
else
Range.new(idx + 1, max(idx))
end
csv_string.slice(range)
.chars
.map { |char| char.is_utf8? ? char : replacement_char }
end

def min(idx)
idx < min_idx ? 0 : idx - context_chars
end

def max(idx)
idx > max_idx ? char_ct - 1 : idx + context_chars
end
end
Loading
Loading