Skip to content

Commit

Permalink
Replace kantan.csv with scala-csv
Browse files Browse the repository at this point in the history
  • Loading branch information
mtomko committed Jan 10, 2024
1 parent 37db39f commit afcb56d
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 35 deletions.
8 changes: 2 additions & 6 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@ lazy val versions = new {
val commonsMath3 = "3.6.1"
val fastutil = "8.5.12"
val fs2 = "3.9.3"
val kantanCodecs = "0.5.3"
val kantanCsv = "0.7.0"
val log4s = "1.10.0"
val logback = "1.2.13"
val munit = "0.7.29"
Expand All @@ -46,12 +44,11 @@ lazy val libraries = new {
val fastutil = "it.unimi.dsi" % "fastutil" % versions.fastutil
val fs2Core = "co.fs2" %% "fs2-core" % versions.fs2
val fs2Io = "co.fs2" %% "fs2-io" % versions.fs2
val kantanCodecs = "com.nrinaudo" %% "kantan.codecs" % versions.kantanCodecs
val kantanCsv = "com.nrinaudo" %% "kantan.csv" % versions.kantanCsv
val log4s = "org.log4s" %% "log4s" % versions.log4s
val logbackCore = "ch.qos.logback" % "logback-core" % versions.logback
val logbackClassic = "ch.qos.logback" % "logback-classic" % versions.logback
val samtools = "com.github.samtools" % "htsjdk" % versions.samTools
val scalaCsv = "com.github.tototoshi" %% "scala-csv" % versions.scalaCsv
val scopt = "com.github.scopt" %% "scopt" % versions.scopt
val slf4j = "org.slf4j" % "slf4j-api" % versions.slf4j

Expand All @@ -72,12 +69,11 @@ lazy val dependencies =
libraries.commonsIo,
libraries.commonsMath3,
libraries.fastutil,
libraries.kantanCodecs,
libraries.kantanCsv,
libraries.log4s,
libraries.logbackCore % Runtime,
libraries.logbackClassic % Runtime,
libraries.samtools,
libraries.scalaCsv,
libraries.scopt,
libraries.slf4j,
libraries.betterFiles % Test,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@ import java.nio.file.Path

import scala.util.Using

import kantan.csv._
import kantan.csv.ops._
import com.github.tototoshi.csv._
import org.apache.commons.io.ByteOrderMark
import org.apache.commons.io.input.BOMInputStream
import org.broadinstitute.gpp.poolq3.reports.{GctDialect, PoolQ2Dialect, ReportsDialect}
Expand Down Expand Up @@ -77,34 +76,34 @@ object ReferenceData {
.setInclude(false)
.get()
val br = new BufferedReader(new InputStreamReader(in))
val delimiter = guessDelimiter(br)
val config =
CsvConfiguration(delimiter, quote, CsvConfiguration.QuotePolicy.WhenNeeded, CsvConfiguration.Header.None)
val guessedDelimiter = guessDelimiter(br)
// CSV format for this reference file: DefaultCSVFormat with only the delimiter and quote
// character overridden. Declared implicit so that scala-csv readers opened in this scope
// (which take an implicit format parameter) use it instead of the library default.
implicit object CSVFormat extends DefaultCSVFormat {
  // delimiter detected from the file contents (see `guessedDelimiter`)
  override val delimiter: Char = guessedDelimiter
  // quote character taken from the `quote` value in the enclosing scope
  override val quoteChar: Char = quote
}
skipHeader(br, LineRegex)
val reader = br.asCsvReader[List[String]](config)
val barcodes = reader.map {
case Right(xs) =>
xs match {
case barcodeRaw :: idRaw :: _ =>
// if the CSV parser leaves spaces, we should remove them
val barcode = barcodeRaw.trim()
val id = idRaw.trim()

// N.B. empty IDs are commonly used and must be supported; as long as the barcode is a non-empty, valid
// DNA string, we must accept the row. However, sometimes Excel leaves empty lines in exported CSV; as
// long as *both* the barcode and ID are empty, it's safe to just skip the row. For now we'll be paranoid
// and reject cases where the barcode is empty but the ID is non-empty
if (barcode.isEmpty && id.isEmpty) None
else if (isReferenceBarcode(barcode)) Some(ReferenceEntry(barcode, id))
else throw InvalidFileException(file, s"Invalid DNA barcode '$barcode' for ID '$id'")
case _ =>
throw InvalidFileException(
file,
s"Incorrect number of columns. At least 2 required, got: ${xs.length}: $xs"
)
}
case Left(value) => throw InvalidFileException(file, s"Unable to parse data ${value.getMessage}")
}.toList
val rows = CSVReader.open(br).all()
val barcodes = rows.map { case xs =>
xs match {
case barcodeRaw :: idRaw :: _ =>
// if the CSV parser leaves spaces, we should remove them
val barcode = barcodeRaw.trim()
val id = idRaw.trim()

// N.B. empty IDs are commonly used and must be supported; as long as the barcode is a non-empty, valid
// DNA string, we must accept the row. However, sometimes Excel leaves empty lines in exported CSV; as
// long as *both* the barcode and ID are empty, it's safe to just skip the row. For now we'll be paranoid
// and reject cases where the barcode is empty but the ID is non-empty
if (barcode.isEmpty && id.isEmpty) None
else if (isReferenceBarcode(barcode)) Some(ReferenceEntry(barcode, id))
else throw InvalidFileException(file, s"Invalid DNA barcode '$barcode' for ID '$id'")
case _ =>
throw InvalidFileException(
file,
s"Incorrect number of columns. At least 2 required, got: ${xs.length}: $xs"
)
}
}

if (barcodes.isEmpty) {
throw InvalidFileException(file, "Empty reference file")
Expand Down

0 comments on commit afcb56d

Please sign in to comment.