diff --git a/src/main/scala/org/broadinstitute/gpp/poolq3/PoolQ.scala b/src/main/scala/org/broadinstitute/gpp/poolq3/PoolQ.scala index 56b01fe..1a29723 100644 --- a/src/main/scala/org/broadinstitute/gpp/poolq3/PoolQ.scala +++ b/src/main/scala/org/broadinstitute/gpp/poolq3/PoolQ.scala @@ -116,8 +116,10 @@ object PoolQ { ExactReference(referenceData.mappings, identity, includeAmbiguous = false) } + val colBarcodePolicyOrLength: Either[Int, BarcodePolicy] = colBarcodePolicyOpt.toRight(colReference.barcodeLength) + val barcodes: CloseableIterable[Barcodes] = - barcodeSource(config.input, rowBarcodePolicy, revRowBarcodePolicyOpt, colBarcodePolicyOpt, umiInfo.map(_._2)) + barcodeSource(config.input, rowBarcodePolicy, revRowBarcodePolicyOpt, colBarcodePolicyOrLength, umiInfo.map(_._2)) lazy val unexpectedSequenceCacheDir: Option[Path] = if (config.skipUnexpectedSequenceReport) None diff --git a/src/main/scala/org/broadinstitute/gpp/poolq3/PoolQConfig.scala b/src/main/scala/org/broadinstitute/gpp/poolq3/PoolQConfig.scala index 138b76b..f37e9e2 100644 --- a/src/main/scala/org/broadinstitute/gpp/poolq3/PoolQConfig.scala +++ b/src/main/scala/org/broadinstitute/gpp/poolq3/PoolQConfig.scala @@ -105,8 +105,12 @@ final case class PoolQConfig( def isPairedEnd = reverseRowBarcodePolicyStr.isDefined && (input.readsSourceE match { - case Right(ReadsSource.PairedEnd(_, _, _)) => true - case _ => false + case Right(ReadsSource.PairedEnd(_, _, _)) => true + case Right(ReadsSource.DmuxedPairedEnd(_, _)) => true + case Right(ReadsSource.SelfContained(_)) => false + case Right(ReadsSource.Split(_, _)) => false + case Right(ReadsSource.Dmuxed(_)) => false + case Left(_) => false }) } diff --git a/src/main/scala/org/broadinstitute/gpp/poolq3/barcode/Dmuxed.scala b/src/main/scala/org/broadinstitute/gpp/poolq3/barcode/Dmuxed.scala new file mode 100644 index 0000000..8755847 --- /dev/null +++ b/src/main/scala/org/broadinstitute/gpp/poolq3/barcode/Dmuxed.scala @@ -0,0 +1,18 @@ +/* + * Copyright 
(c) 2022 The Broad Institute, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+package org.broadinstitute.gpp.poolq3.barcode
+
+/** Helpers used by the demultiplexed (`Dmuxed*`) barcode sources. */
+object Dmuxed {
+  /** Builds a parser that reads a `length`-base barcode off the end of a read ID. The ID must begin with `@`
+    * and the barcode (only A, C, G, T or N) must directly follow a character that is none of those bases.
+    * The parser returns `None` for any ID that does not have that shape, e.g. a longer or shorter base run.
+    */
+  private[barcode] def barcodeFromId(length: Int): String => Option[FoundBarcode] = {
+    val idPattern = s"@.*[^ACGTN]([ACGTN]{$length})$$".r // built once per parser; must match the entire ID
+    id => PartialFunction.condOpt(id) { case idPattern(bases) => FoundBarcode(bases.toCharArray, 0) }
+  }
+}
diff --git a/src/main/scala/org/broadinstitute/gpp/poolq3/barcode/DmuxedBarcodeSource.scala b/src/main/scala/org/broadinstitute/gpp/poolq3/barcode/DmuxedBarcodeSource.scala
index 2f2a6c2..23de4de 100644
--- a/src/main/scala/org/broadinstitute/gpp/poolq3/barcode/DmuxedBarcodeSource.scala
+++ b/src/main/scala/org/broadinstitute/gpp/poolq3/barcode/DmuxedBarcodeSource.scala
@@ -8,8 +8,15 @@ package org.broadinstitute.gpp.poolq3.barcode
 import org.broadinstitute.gpp.poolq3.parser.{CloseableIterable, CloseableIterator, DmuxedIterable}
 import org.broadinstitute.gpp.poolq3.types.Read
 
-final class DmuxedBarcodeSource(parser: DmuxedIterable, rowPolicy: BarcodePolicy, umiPolicyOpt: Option[BarcodePolicy])
-    extends CloseableIterable[Barcodes] {
+final class DmuxedBarcodeSource(
+  parser: DmuxedIterable,
+  rowPolicy: BarcodePolicy,
+  umiPolicyOpt: Option[BarcodePolicy],
+  colBarcodeLength: Int
+) extends CloseableIterable[Barcodes] {
+
+  // used to attempt to parse barcodes out of ids if the file has no associated barcode
+  private val colBarcodeParser = Dmuxed.barcodeFromId(colBarcodeLength)
 
   private def colBarcodeOpt = parser.indexBarcode
 
@@ -20,7 +27,7 @@ final class DmuxedBarcodeSource(parser: DmuxedIterable, rowPolicy: BarcodePolicy
       val nextRead = iterator.next()
       val rowBarcodeOpt = rowPolicy.find(nextRead)
       val umiBarcodeOpt = umiPolicyOpt.flatMap(_.find(nextRead))
-      Barcodes(rowBarcodeOpt, None, colBarcodeOpt, umiBarcodeOpt)
+      Barcodes(rowBarcodeOpt, None, colBarcodeOpt.orElse(colBarcodeParser(nextRead.id)), umiBarcodeOpt)
     }
 
     override def close(): Unit =
iterator.close() diff --git a/src/main/scala/org/broadinstitute/gpp/poolq3/barcode/DmuxedPairedEndBarcodeSource.scala b/src/main/scala/org/broadinstitute/gpp/poolq3/barcode/DmuxedPairedEndBarcodeSource.scala index e0e2c88..8eca5ce 100644 --- a/src/main/scala/org/broadinstitute/gpp/poolq3/barcode/DmuxedPairedEndBarcodeSource.scala +++ b/src/main/scala/org/broadinstitute/gpp/poolq3/barcode/DmuxedPairedEndBarcodeSource.scala @@ -14,7 +14,8 @@ class DmuxedPairedEndBarcodeSource( rowPolicy: BarcodePolicy, revRowPolicy: BarcodePolicy, umiPolicyOpt: Option[BarcodePolicy], - readIdCheckPolicy: ReadIdCheckPolicy + readIdCheckPolicy: ReadIdCheckPolicy, + colBarcodeLength: Int ) extends CloseableIterable[Barcodes] { // the index barcode _is_ the column barcode; we get it from the row parser @@ -24,6 +25,9 @@ class DmuxedPairedEndBarcodeSource( private[this] class BarcodeIterator(rowIterator: CloseableIterator[Read], revRowIterator: CloseableIterator[Read]) extends CloseableIterator[Barcodes] { + // used to attempt to parse barcodes out of ids if the file has no associated barcode + private val colBarcodeParser = Dmuxed.barcodeFromId(colBarcodeLength) + final override def hasNext: Boolean = rowIterator.hasNext && revRowIterator.hasNext final override def next(): Barcodes = { @@ -33,7 +37,7 @@ class DmuxedPairedEndBarcodeSource( val rowBarcodeOpt = rowPolicy.find(nextRow) val revRowBarcodeOpt = revRowPolicy.find(nextRevRow) val umiBarcodeOpt = umiPolicyOpt.flatMap(_.find(nextRow)) - Barcodes(rowBarcodeOpt, revRowBarcodeOpt, colBarcodeOpt, umiBarcodeOpt) + Barcodes(rowBarcodeOpt, revRowBarcodeOpt, colBarcodeOpt.orElse(colBarcodeParser(nextRow.id)), umiBarcodeOpt) } final override def close(): Unit = diff --git a/src/main/scala/org/broadinstitute/gpp/poolq3/barcode/package.scala b/src/main/scala/org/broadinstitute/gpp/poolq3/barcode/package.scala index 40738ff..eab96e6 100644 --- a/src/main/scala/org/broadinstitute/gpp/poolq3/barcode/package.scala +++ 
b/src/main/scala/org/broadinstitute/gpp/poolq3/barcode/package.scala @@ -26,11 +26,11 @@ package object barcode { config: PoolQInput, rowBarcodePolicy: BarcodePolicy, revRowBarcodePolicyOpt: Option[BarcodePolicy], - colBarcodePolicyOpt: Option[BarcodePolicy], + colBarcodePolicyOpt: Either[Int, BarcodePolicy], umiBarcodePolicyOpt: Option[BarcodePolicy] ): CloseableIterable[Barcodes] = (config.readsSource, revRowBarcodePolicyOpt, colBarcodePolicyOpt) match { - case (ReadsSource.Split(index, forward), None, Some(colBarcodePolicy)) => + case (ReadsSource.Split(index, forward), None, Right(colBarcodePolicy)) => new TwoFileBarcodeSource( parserFor(forward.toList), parserFor(index.toList), @@ -39,7 +39,7 @@ package object barcode { umiBarcodePolicyOpt, config.readIdCheckPolicy ) - case (ReadsSource.PairedEnd(index, forward, reverse), Some(revRowBarcodePolicy), Some(colBarcodePolicy)) => + case (ReadsSource.PairedEnd(index, forward, reverse), Some(revRowBarcodePolicy), Right(colBarcodePolicy)) => new ThreeFileBarcodeSource( parserFor(forward.toList), parserFor(reverse.toList), @@ -50,22 +50,24 @@ package object barcode { umiBarcodePolicyOpt, config.readIdCheckPolicy ) - case (ReadsSource.SelfContained(paths), None, Some(colBarcodePolicy)) => + case (ReadsSource.SelfContained(paths), None, Right(colBarcodePolicy)) => new SingleFileBarcodeSource(parserFor(paths.toList), rowBarcodePolicy, colBarcodePolicy, umiBarcodePolicyOpt) - case (ReadsSource.Dmuxed(read1), _, _) => + case (ReadsSource.Dmuxed(read1), _, Left(colBarcodeLength)) => new DmuxedBarcodeSource( DmuxedIterable(read1.toList, parserFor(_).iterator), rowBarcodePolicy, - umiBarcodePolicyOpt + umiBarcodePolicyOpt, + colBarcodeLength ) - case (ReadsSource.DmuxedPairedEnd(read1, read2), Some(revRowBarcodePolicy), _) => + case (ReadsSource.DmuxedPairedEnd(read1, read2), Some(revRowBarcodePolicy), Left(colBarcodeLength)) => new DmuxedPairedEndBarcodeSource( DmuxedIterable(read1.toList, parserFor(_).iterator), 
DmuxedIterable(read2.toList, parserFor(_).iterator), rowBarcodePolicy, revRowBarcodePolicy, umiBarcodePolicyOpt, - config.readIdCheckPolicy + config.readIdCheckPolicy, + colBarcodeLength ) case _ => throw new IllegalArgumentException("Incompatible reads and barcode policy settings") diff --git a/src/main/scala/org/broadinstitute/gpp/poolq3/parser/CloseableIterable.scala b/src/main/scala/org/broadinstitute/gpp/poolq3/parser/CloseableIterable.scala index f8ff960..91c78d7 100644 --- a/src/main/scala/org/broadinstitute/gpp/poolq3/parser/CloseableIterable.scala +++ b/src/main/scala/org/broadinstitute/gpp/poolq3/parser/CloseableIterable.scala @@ -50,9 +50,12 @@ object DmuxedIterable { val data2: List[(Option[String], List[Read])] = data.map { case (bco, seqs) => (bco, seqs.zipWithIndex.map { case (seq, i) => Read(i.toString, seq) }) } - new DmuxedIterableImpl(data2, CloseableIterator.ofList) + DmuxedIterable.forReads(data2) } + def forReads(data: List[(Option[String], List[Read])]): DmuxedIterable = + new DmuxedIterableImpl(data, CloseableIterator.ofList) + private class DmuxedIterableImpl[A](src: Iterable[(Option[String], A)], makeIterator: A => CloseableIterator[Read]) extends DmuxedIterable { diff --git a/src/test/scala/org/broadinstitute/gpp/poolq3/barcode/DmuxedBarcodeSourceTest.scala b/src/test/scala/org/broadinstitute/gpp/poolq3/barcode/DmuxedBarcodeSourceTest.scala index 33dab40..adbe37a 100644 --- a/src/test/scala/org/broadinstitute/gpp/poolq3/barcode/DmuxedBarcodeSourceTest.scala +++ b/src/test/scala/org/broadinstitute/gpp/poolq3/barcode/DmuxedBarcodeSourceTest.scala @@ -8,6 +8,7 @@ package org.broadinstitute.gpp.poolq3.barcode import cats.syntax.all._ import munit.FunSuite import org.broadinstitute.gpp.poolq3.parser.DmuxedIterable +import org.broadinstitute.gpp.poolq3.types.Read class DmuxedBarcodeSourceTest extends FunSuite { @@ -26,7 +27,7 @@ class DmuxedBarcodeSourceTest extends FunSuite { ) ) - val src = new DmuxedBarcodeSource(iterable, rowPolicy, None) + 
val src = new DmuxedBarcodeSource(iterable, rowPolicy, None, 8) assertEquals( src.toList, List( @@ -40,9 +41,26 @@ class DmuxedBarcodeSourceTest extends FunSuite { ) } + test("barcodes from read IDs") { + val undeterminedReads = List(Read("@eeeeee ACGTAA", "AAAAAAAAAA"), Read("@eeeeee ACTCAG", "CCCCCCCCCC")) + val aacctgReads = List(Read("@a read", "GGGGGGGGGG"), Read("@another read", "TTTTTTTTTT")) + val iterable = DmuxedIterable.forReads(List(None -> undeterminedReads, Some("AACCTG") -> aacctgReads)) + + val src = new DmuxedBarcodeSource(iterable, rowPolicy, None, 6) + assertEquals( + src.toList, + List( + fb("ACGTAA", "AAAAAAAAAA"), + fb("ACTCAG", "CCCCCCCCCC"), + fb("AACCTG", "GGGGGGGGGG"), + fb("AACCTG", "TTTTTTTTTT") + ) + ) + } + test("nothing works") { val iterable = DmuxedIterable(Nil) - val src = new DmuxedBarcodeSource(iterable, rowPolicy, None) + val src = new DmuxedBarcodeSource(iterable, rowPolicy, None, 8) assertEquals(src.toList, Nil) } diff --git a/src/test/scala/org/broadinstitute/gpp/poolq3/barcode/DmuxedPairedEndBarcodeSourceTest.scala b/src/test/scala/org/broadinstitute/gpp/poolq3/barcode/DmuxedPairedEndBarcodeSourceTest.scala index fd6dc74..19e405c 100644 --- a/src/test/scala/org/broadinstitute/gpp/poolq3/barcode/DmuxedPairedEndBarcodeSourceTest.scala +++ b/src/test/scala/org/broadinstitute/gpp/poolq3/barcode/DmuxedPairedEndBarcodeSourceTest.scala @@ -8,7 +8,7 @@ package org.broadinstitute.gpp.poolq3.barcode import cats.syntax.all._ import munit.FunSuite import org.broadinstitute.gpp.poolq3.parser.DmuxedIterable -import org.broadinstitute.gpp.poolq3.types.ReadIdCheckPolicy +import org.broadinstitute.gpp.poolq3.types.{Read, ReadIdCheckPolicy} class DmuxedPairedEndBarcodeSourceTest extends FunSuite { @@ -32,7 +32,7 @@ class DmuxedPairedEndBarcodeSourceTest extends FunSuite { val iter2 = DmuxedIterable(List(None -> List("AGA", "CTC", "GAG"), Some("CTCGAG") -> List("TGT", "CAC", "TCT"))) - val src = new DmuxedPairedEndBarcodeSource(iter1, iter2, 
rowPolicy, revRowPolicy, None, ReadIdCheckPolicy.Lax) + val src = new DmuxedPairedEndBarcodeSource(iter1, iter2, rowPolicy, revRowPolicy, None, ReadIdCheckPolicy.Lax, 8) assertEquals( src.toList, List( @@ -46,10 +46,31 @@ class DmuxedPairedEndBarcodeSourceTest extends FunSuite { ) } + test("barcodes from read IDs") { + val undeterminedRead1s = List(Read("@eeeeee ACGTAA", "AAAA"), Read("@eeeeee ACTCAG", "CCCC")) + val undeterminedRead2s = List(Read("@eeeeee ACGTAA", "AAA"), Read("@eeeeee ACTCAG", "CCC")) + val aacctgRead1s = List(Read("@a read", "GGGG"), Read("@another read", "TTTT")) + val aacctgRead2s = List(Read("@a read", "GGG"), Read("@another read", "TTT")) + + val iter1 = DmuxedIterable.forReads(List(None -> undeterminedRead1s, Some("AACCTG") -> aacctgRead1s)) + val iter2 = DmuxedIterable.forReads(List(None -> undeterminedRead2s, Some("AACCTG") -> aacctgRead2s)) + + val src = new DmuxedPairedEndBarcodeSource(iter1, iter2, rowPolicy, revRowPolicy, None, ReadIdCheckPolicy.Lax, 6) + assertEquals( + src.toList, + List( + fb("ACGTAA", "AAAA", "AAA"), + fb("ACTCAG", "CCCC", "CCC"), + fb("AACCTG", "GGGG", "GGG"), + fb("AACCTG", "TTTT", "TTT") + ) + ) + } + test("nothing works") { val i1 = DmuxedIterable(Nil) val i2 = DmuxedIterable(Nil) - val src = new DmuxedPairedEndBarcodeSource(i1, i2, rowPolicy, revRowPolicy, None, ReadIdCheckPolicy.Illumina) + val src = new DmuxedPairedEndBarcodeSource(i1, i2, rowPolicy, revRowPolicy, None, ReadIdCheckPolicy.Illumina, 8) assertEquals(src.toList, Nil) } diff --git a/src/test/scala/org/broadinstitute/gpp/poolq3/barcode/DmuxedTest.scala b/src/test/scala/org/broadinstitute/gpp/poolq3/barcode/DmuxedTest.scala new file mode 100644 index 0000000..9f3a902 --- /dev/null +++ b/src/test/scala/org/broadinstitute/gpp/poolq3/barcode/DmuxedTest.scala @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2022 The Broad Institute, Inc. All rights reserved. 
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+package org.broadinstitute.gpp.poolq3.barcode
+
+import munit.FunSuite
+
+class DmuxedTest extends FunSuite {
+
+  // parser under test: every case asks for an 8-base barcode at the end of the read ID
+  private val parse8 = Dmuxed.barcodeFromId(8)
+
+  // the value the parser is expected to yield when it recognizes `bases`
+  private def found(bases: String): Option[FoundBarcode] = Some(FoundBarcode(bases.toCharArray, 0))
+
+  test("extracting a barcode with Ns from an illumina read") {
+    assertEquals(parse8("@A01379:680:HC37HDRX3:1:2101:1163:1000 1:N:0:GGNGNANT"), found("GGNGNANT"))
+  }
+
+  test("extracting a barcode with no Ns from an illumina read") {
+    assertEquals(parse8("@A01379:680:HC37HDRX3:1:2101:3224:1000 1:N:0:AAATGCGA"), found("AAATGCGA"))
+  }
+
+  test("ignore a barcode-like sequence that's too long") {
+    assertEquals(parse8("@A01379:680:HC37HDRX3:1:2101:3224:1000 1:N:0:AAATGCGAGG"), None)
+  }
+
+  test("ignore a barcode-like sequence that's too short") {
+    assertEquals(parse8("@A01379:680:HC37HDRX3:1:2101:3224:1000 1:N:0:TGCGAGG"), None)
+  }
+
+}