Skip to content

Commit

Permalink
refactor auto_detection and file_io into separate files (#260)
Browse files Browse the repository at this point in the history
* refactor auto_detection and file_io into separate files

* eliminate Hash extension
  • Loading branch information
tilo authored Dec 11, 2023
1 parent b074910 commit 9ddfad0
Show file tree
Hide file tree
Showing 6 changed files with 128 additions and 154 deletions.
9 changes: 0 additions & 9 deletions lib/core_ext/hash.rb

This file was deleted.

6 changes: 3 additions & 3 deletions lib/smarter_csv.rb
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# frozen_string_literal: true

require "core_ext/hash"

require "smarter_csv/version"
require "smarter_csv/file_io"
require "smarter_csv/options_processing"
require "smarter_csv/auto_detection"
require "smarter_csv/variables"
require "smarter_csv/headers"
require "smarter_csv/parse"
Expand All @@ -18,7 +18,7 @@
require_relative "smarter_csv/smarter_csv"
# :nocov:
end
rescue Exception
rescue Exception # rubocop:disable Lint/RescueException
# require_relative 'smarter_csv/smarter_csv'
end
# :nocov:
Expand Down
73 changes: 73 additions & 0 deletions lib/smarter_csv/auto_detection.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# frozen_string_literal: true

module SmarterCSV
  class << self
    protected

    # Guesses the column separator by counting candidate characters.
    #
    # After skipping options[:skip_lines] lines, it samples one line when
    # options[:headers_in_file] is truthy, otherwise up to five lines (fewer if
    # the input ends early). The input is rewound before returning.
    #
    # Returns the candidate with the highest count; on a tie the candidate that
    # comes first in the list below wins. If no candidate occurs at all, ','
    # is returned when the last sampled line (row separator chomped) consists
    # of word characters only, i.e. looks like a single column; otherwise
    # SmarterCSV::NoColSepDetected is raised.
    #
    # NOTE: if the input ends before a single line could be sampled, the tally
    # stays empty and nil is returned.
    def guess_column_separator(filehandle, options)
      skip_lines(filehandle, options)

      separators = [',', "\t", ';', ':', '|']
      tally = Hash.new(0)
      sample_size = options[:headers_in_file] ? 1 : 5
      last_line = nil

      sample_size.times do
        begin
          last_line = readline_with_counts(filehandle, options)
        rescue EOFError # short files
          break
        end
        separators.each { |sep| tally[sep] += last_line.scan(sep).size }
      end
      rewind(filehandle)

      top_score = tally.values.max
      return tally.key(top_score) unless top_score == 0

      # none of the candidates showed up: accept a lone word as a single-column file
      raise SmarterCSV::NoColSepDetected unless /^\w+$/.match?(last_line.chomp(options[:row_sep]))

      ','
    end

    # Guesses the row separator by counting "\n", "\r" and "\r\n" occurrences
    # that are not enclosed in options[:quote_char].
    #
    # limitation: unless options[:auto_row_sep_chars] is a positive number, the
    # whole input is read before a decision is made. With a positive value the
    # scan stops after that many characters outside of quotes were examined.
    #
    # The input is rewound before returning. Returns the most frequent line
    # ending; when counts are equal the earlier entry of the tally wins, so
    # "\n" is the answer when no line ending was seen at all.
    def guess_line_ending(filehandle, options)
      tally = { "\n" => 0, "\r" => 0, "\r\n" => 0 }
      quote = options[:quote_char]
      limit = options[:auto_row_sep_chars]
      inside_quotes = false
      previous = nil
      examined = 0

      filehandle.each_char do |char|
        # a quote character flips the state; everything inside quotes is ignored
        inside_quotes = !inside_quotes if char == quote
        next if inside_quotes

        if previous == "\r"
          # a "\r" is only classified once the character after it is known
          tally[char == "\n" ? "\r\n" : "\r"] += 1
        elsif char == "\n"
          tally["\n"] += 1
        end

        previous = char
        examined += 1
        break if limit && limit > 0 && examined >= limit
      end
      rewind(filehandle)

      # a "\r" that was the very last examined character has not been counted yet
      tally["\r"] += 1 if previous == "\r"

      tally.max_by { |_ending, hits| hits }.first
    end
  end
end
50 changes: 50 additions & 0 deletions lib/smarter_csv/file_io.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# frozen_string_literal: true

module SmarterCSV
  class << self
    protected

    # Reads the next line, terminated by options[:row_sep], from filehandle and
    # increments both @file_line_count and @csv_line_count.
    # A byte order mark is stripped from the line that brings @csv_line_count
    # to 1, i.e. the first line read after the counters were reset.
    # Whatever filehandle.readline raises (e.g. EOFError) is propagated.
    def readline_with_counts(filehandle, options)
      line = filehandle.readline(options[:row_sep])
      @file_line_count += 1
      @csv_line_count += 1
      line = remove_bom(line) if @csv_line_count == 1
      line
    end

    # Consumes options[:skip_lines] lines from filehandle (none when the option
    # is nil or zero); the skipped lines are included in the line counters.
    def skip_lines(filehandle, options)
      options[:skip_lines].to_i.times do
        readline_with_counts(filehandle, options)
      end
    end

    # Resets both line counters to zero and rewinds filehandle.
    def rewind(filehandle)
      @file_line_count = 0
      @csv_line_count = 0
      filehandle.rewind
    end

    private

    # Byte order marks as lowercase hex strings, in the form Integer#to_s(16)
    # produces them (a NUL byte is "0", not "00").
    UTF_32_BOM = %w[0 0 fe ff].freeze
    UTF_32LE_BOM = %w[ff fe 0 0].freeze
    UTF_8_BOM = %w[ef bb bf].freeze
    UTF_16_BOM = %w[fe ff].freeze
    UTF_16LE_BOM = %w[ff fe].freeze

    # Returns str without a leading UTF-32, UTF-8 or UTF-16 byte order mark;
    # str is returned unchanged when it does not start with one.
    def remove_bom(str)
      # a BOM is at most four bytes long, so only those are converted to hex
      # instead of every byte of the line
      head = str.each_byte.first(4).map { |byte| byte.to_s(16) }
      # if string does not start with one of the bytes, there is no BOM
      return str unless %w[ef fe ff 0].include?(head[0])

      # UTF-32 must be tested before UTF-16: the UTF-32LE mark starts with the UTF-16LE one
      return str.byteslice(4..-1) if [UTF_32_BOM, UTF_32LE_BOM].include?(head)
      return str.byteslice(3..-1) if head.first(3) == UTF_8_BOM
      return str.byteslice(2..-1) if [UTF_16_BOM, UTF_16LE_BOM].include?(head.first(2))

      # Reached for any first byte of 0xEF, 0xFE, 0xFF or 0x00 that is not
      # followed by the rest of a known BOM.
      # :nocov:
      puts "SmarterCSV found unhandled BOM! #{str.chars[0..7].inspect}"
      str
      # :nocov:
    end
  end
end
112 changes: 2 additions & 110 deletions lib/smarter_csv/smarter_csv.rb
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def SmarterCSV.process(input, given_options = {}, &block) # rubocop:disable Lint
# if all values are blank, then ignore this line
next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))

hash = Hash.zip(@headers, dataA) # from Facets of Ruby library
hash = @headers.zip(dataA).to_h

# make sure we delete any key/value pairs from the hash, which the user wanted to delete:
hash.delete(nil)
Expand Down Expand Up @@ -165,6 +165,7 @@ def SmarterCSV.process(input, given_options = {}, &block) # rubocop:disable Lint
ensure
fh.close if fh.respond_to?(:close)
end

if block_given?
@chunk_count # when we do processing through a block we only care how many chunks we processed
else
Expand All @@ -191,26 +192,6 @@ def has_acceleration?

protected

def readline_with_counts(filehandle, options)
  # read one row_sep-terminated line and account for it in both counters
  raw = filehandle.readline(options[:row_sep])
  @file_line_count += 1
  @csv_line_count += 1
  # only the first line read after the counters were reset can carry a BOM
  @csv_line_count == 1 ? remove_bom(raw) : raw
end

def skip_lines(filehandle, options)
  # a missing :skip_lines option becomes 0 via nil.to_i, so nothing is skipped
  lines_to_skip = options[:skip_lines].to_i
  lines_to_skip.times { readline_with_counts(filehandle, options) }
end

def rewind(filehandle)
  # start counting from scratch, then move the input back to its beginning
  @file_line_count = @csv_line_count = 0
  filehandle.rewind
end

# SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
# and in the future we might also include UTF-8 space characters: https://www.compart.com/en/unicode/category/Zs
BLANK_RE = /\A\s*\z/.freeze
Expand Down Expand Up @@ -258,94 +239,5 @@ def only_or_except_limit_execution(options, option_name, key)
end
false
end

# If file has headers, then guesses column separator from headers.
# Otherwise guesses column separator from contents.
# Raises exception if none is found.
def guess_column_separator(filehandle, options)
  skip_lines(filehandle, options)

  separators = [',', "\t", ';', ':', '|']
  tally = Hash.new(0)
  # with headers only the header line is sampled, otherwise up to five lines
  sample_size = options[:headers_in_file] ? 1 : 5
  last_line = nil

  sample_size.times do
    begin
      last_line = readline_with_counts(filehandle, options)
    rescue EOFError # short files
      break
    end
    separators.each { |sep| tally[sep] += last_line.scan(sep).size }
  end
  rewind(filehandle)

  # the tally is empty (and nil is returned) when no line could be read at all
  top_score = tally.values.max
  return tally.key(top_score) unless top_score == 0

  # none of the candidates showed up: accept a lone word as a single-column file
  raise SmarterCSV::NoColSepDetected unless /^\w+$/.match?(last_line.chomp(options[:row_sep]))

  ','
end

# limitation: unless options[:auto_row_sep_chars] is a positive number, this reads the whole file in before making a decision
def guess_line_ending(filehandle, options)
  tally = { "\n" => 0, "\r" => 0, "\r\n" => 0 }
  quote = options[:quote_char]
  limit = options[:auto_row_sep_chars]
  inside_quotes = false
  previous = nil
  examined = 0 # characters looked at outside of quotes

  # count how many of the pre-defined line-endings we find
  # ignoring those contained within quote characters
  filehandle.each_char do |char|
    inside_quotes = !inside_quotes if char == quote
    next if inside_quotes

    if previous == "\r"
      # a "\r" is only classified once the character after it is known
      tally[char == "\n" ? "\r\n" : "\r"] += 1
    elsif char == "\n"
      tally["\n"] += 1
    end

    previous = char
    examined += 1
    break if limit && limit > 0 && examined >= limit
  end
  rewind(filehandle)

  # a "\r" that was the very last examined character has not been counted yet
  tally["\r"] += 1 if previous == "\r"

  # most frequent ending; on equal counts the earlier tally entry wins
  tally.max_by { |_ending, hits| hits }.first
end

private

# Byte order marks as lowercase hex strings, in the form Integer#to_s(16)
# produces them (a NUL byte is "0", not "00").
UTF_32_BOM = %w[0 0 fe ff].freeze
UTF_32LE_BOM = %w[ff fe 0 0].freeze
UTF_8_BOM = %w[ef bb bf].freeze
UTF_16_BOM = %w[fe ff].freeze
UTF_16LE_BOM = %w[ff fe].freeze

# Returns str without a leading UTF-32, UTF-8 or UTF-16 byte order mark;
# str is returned unchanged when it does not start with one.
def remove_bom(str)
  # a BOM is at most four bytes long, so only those are converted to hex
  # instead of every byte of the line
  head = str.each_byte.first(4).map { |byte| byte.to_s(16) }
  # if string does not start with one of the bytes, there is no BOM
  return str unless %w[ef fe ff 0].include?(head[0])

  # UTF-32 must be tested before UTF-16: the UTF-32LE mark starts with the UTF-16LE one
  return str.byteslice(4..-1) if [UTF_32_BOM, UTF_32LE_BOM].include?(head)
  return str.byteslice(3..-1) if head.first(3) == UTF_8_BOM
  return str.byteslice(2..-1) if [UTF_16_BOM, UTF_16LE_BOM].include?(head.first(2))

  # Reached for any first byte of 0xEF, 0xFE, 0xFF or 0x00 that is not
  # followed by the rest of a known BOM.
  # :nocov:
  puts "SmarterCSV found unhandled BOM! #{str.chars[0..7].inspect}"
  str
  # :nocov:
end
end
end
32 changes: 0 additions & 32 deletions spec/core_ext/hash_spec.rb

This file was deleted.

0 comments on commit 9ddfad0

Please sign in to comment.