Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parse barcodes from read IDs in demultiplexed mode #26

Merged
merged 5 commits into from
Oct 30, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/main/scala/org/broadinstitute/gpp/poolq3/PoolQ.scala
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,10 @@ object PoolQ {
ExactReference(referenceData.mappings, identity, includeAmbiguous = false)
}

val colBarcodePolicyOrLength: Either[Int, BarcodePolicy] = colBarcodePolicyOpt.toRight(colReference.barcodeLength)

val barcodes: CloseableIterable[Barcodes] =
barcodeSource(config.input, rowBarcodePolicy, revRowBarcodePolicyOpt, colBarcodePolicyOpt, umiInfo.map(_._2))
barcodeSource(config.input, rowBarcodePolicy, revRowBarcodePolicyOpt, colBarcodePolicyOrLength, umiInfo.map(_._2))

lazy val unexpectedSequenceCacheDir: Option[Path] =
if (config.skipUnexpectedSequenceReport) None
Expand Down
18 changes: 18 additions & 0 deletions src/main/scala/org/broadinstitute/gpp/poolq3/barcode/Dmuxed.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
/*
* Copyright (c) 2022 The Broad Institute, Inc. All rights reserved.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
package org.broadinstitute.gpp.poolq3.barcode

/** Helpers for demultiplexed inputs whose barcode has to be recovered from the read ID. */
object Dmuxed {

  /** Builds a function that tries to pull a trailing barcode of exactly `length` bases out of a read ID.
    *
    * The returned function yields a barcode only when the whole ID has this shape:
    *   - it starts with `@`,
    *   - it ends with exactly `length` characters drawn from `A`, `C`, `G`, `T`, `N`,
    *   - and the character immediately before those bases is not one of `A`, `C`, `G`, `T`, `N`.
    *
    * The last condition is what rejects a trailing run of bases that is longer than `length`.
    *
    * @param length the exact number of bases the trailing barcode must have
    * @return a function from read ID to `Some(FoundBarcode)` holding the captured bases, or `None` when
    *         the ID does not have the shape described above
    */
  private[barcode] def barcodeFromId(length: Int): String => Option[FoundBarcode] = {
    // compiled once per call to `barcodeFromId`, then reused by every invocation of the returned function
    val trailingBarcode = s"@.*[^ACGTN]([ACGTN]{$length})$$".r

    (readId: String) =>
      readId match {
        // NOTE(review): the second argument is always 0 here - presumably an offset; confirm against FoundBarcode
        case trailingBarcode(bases) => Some(FoundBarcode(bases.toCharArray, 0))
        case _                      => None
      }
  }

}
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,15 @@ package org.broadinstitute.gpp.poolq3.barcode
import org.broadinstitute.gpp.poolq3.parser.{CloseableIterable, CloseableIterator, DmuxedIterable}
import org.broadinstitute.gpp.poolq3.types.Read

final class DmuxedBarcodeSource(parser: DmuxedIterable, rowPolicy: BarcodePolicy, umiPolicyOpt: Option[BarcodePolicy])
extends CloseableIterable[Barcodes] {
final class DmuxedBarcodeSource(
parser: DmuxedIterable,
rowPolicy: BarcodePolicy,
umiPolicyOpt: Option[BarcodePolicy],
colBarcodeLength: Int
) extends CloseableIterable[Barcodes] {

// used to attempt to parse barcodes out of ids if the file has no associated barcode
private val colBarcodeParser = Dmuxed.barcodeFromId(colBarcodeLength)

private def colBarcodeOpt = parser.indexBarcode

Expand All @@ -20,7 +27,7 @@ final class DmuxedBarcodeSource(parser: DmuxedIterable, rowPolicy: BarcodePolicy
val nextRead = iterator.next()
val rowBarcodeOpt = rowPolicy.find(nextRead)
val umiBarcodeOpt = umiPolicyOpt.flatMap(_.find(nextRead))
Barcodes(rowBarcodeOpt, None, colBarcodeOpt, umiBarcodeOpt)
Barcodes(rowBarcodeOpt, None, colBarcodeOpt.orElse(colBarcodeParser(nextRead.id)), umiBarcodeOpt)
}

override def close(): Unit = iterator.close()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ class DmuxedPairedEndBarcodeSource(
rowPolicy: BarcodePolicy,
revRowPolicy: BarcodePolicy,
umiPolicyOpt: Option[BarcodePolicy],
readIdCheckPolicy: ReadIdCheckPolicy
readIdCheckPolicy: ReadIdCheckPolicy,
colBarcodeLength: Int
) extends CloseableIterable[Barcodes] {

// the index barcode _is_ the column barcode; we get it from the row parser
Expand All @@ -24,6 +25,9 @@ class DmuxedPairedEndBarcodeSource(
private[this] class BarcodeIterator(rowIterator: CloseableIterator[Read], revRowIterator: CloseableIterator[Read])
extends CloseableIterator[Barcodes] {

// used to attempt to parse barcodes out of ids if the file has no associated barcode
private val colBarcodeParser = Dmuxed.barcodeFromId(colBarcodeLength)

final override def hasNext: Boolean = rowIterator.hasNext && revRowIterator.hasNext

final override def next(): Barcodes = {
Expand All @@ -33,7 +37,7 @@ class DmuxedPairedEndBarcodeSource(
val rowBarcodeOpt = rowPolicy.find(nextRow)
val revRowBarcodeOpt = revRowPolicy.find(nextRevRow)
val umiBarcodeOpt = umiPolicyOpt.flatMap(_.find(nextRow))
Barcodes(rowBarcodeOpt, revRowBarcodeOpt, colBarcodeOpt, umiBarcodeOpt)
Barcodes(rowBarcodeOpt, revRowBarcodeOpt, colBarcodeOpt.orElse(colBarcodeParser(nextRow.id)), umiBarcodeOpt)
}

final override def close(): Unit =
Expand Down
18 changes: 10 additions & 8 deletions src/main/scala/org/broadinstitute/gpp/poolq3/barcode/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@ package object barcode {
config: PoolQInput,
rowBarcodePolicy: BarcodePolicy,
revRowBarcodePolicyOpt: Option[BarcodePolicy],
colBarcodePolicyOpt: Option[BarcodePolicy],
colBarcodePolicyOpt: Either[Int, BarcodePolicy],
umiBarcodePolicyOpt: Option[BarcodePolicy]
): CloseableIterable[Barcodes] =
(config.readsSource, revRowBarcodePolicyOpt, colBarcodePolicyOpt) match {
case (ReadsSource.Split(index, forward), None, Some(colBarcodePolicy)) =>
case (ReadsSource.Split(index, forward), None, Right(colBarcodePolicy)) =>
new TwoFileBarcodeSource(
parserFor(forward.toList),
parserFor(index.toList),
Expand All @@ -39,7 +39,7 @@ package object barcode {
umiBarcodePolicyOpt,
config.readIdCheckPolicy
)
case (ReadsSource.PairedEnd(index, forward, reverse), Some(revRowBarcodePolicy), Some(colBarcodePolicy)) =>
case (ReadsSource.PairedEnd(index, forward, reverse), Some(revRowBarcodePolicy), Right(colBarcodePolicy)) =>
new ThreeFileBarcodeSource(
parserFor(forward.toList),
parserFor(reverse.toList),
Expand All @@ -50,22 +50,24 @@ package object barcode {
umiBarcodePolicyOpt,
config.readIdCheckPolicy
)
case (ReadsSource.SelfContained(paths), None, Some(colBarcodePolicy)) =>
case (ReadsSource.SelfContained(paths), None, Right(colBarcodePolicy)) =>
new SingleFileBarcodeSource(parserFor(paths.toList), rowBarcodePolicy, colBarcodePolicy, umiBarcodePolicyOpt)
case (ReadsSource.Dmuxed(read1), _, _) =>
case (ReadsSource.Dmuxed(read1), _, Left(colBarcodeLength)) =>
new DmuxedBarcodeSource(
DmuxedIterable(read1.toList, parserFor(_).iterator),
rowBarcodePolicy,
umiBarcodePolicyOpt
umiBarcodePolicyOpt,
colBarcodeLength
)
case (ReadsSource.DmuxedPairedEnd(read1, read2), Some(revRowBarcodePolicy), _) =>
case (ReadsSource.DmuxedPairedEnd(read1, read2), Some(revRowBarcodePolicy), Left(colBarcodeLength)) =>
new DmuxedPairedEndBarcodeSource(
DmuxedIterable(read1.toList, parserFor(_).iterator),
DmuxedIterable(read2.toList, parserFor(_).iterator),
rowBarcodePolicy,
revRowBarcodePolicy,
umiBarcodePolicyOpt,
config.readIdCheckPolicy
config.readIdCheckPolicy,
colBarcodeLength
)
case _ =>
throw new IllegalArgumentException("Incompatible reads and barcode policy settings")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,12 @@ object DmuxedIterable {
val data2: List[(Option[String], List[Read])] = data.map { case (bco, seqs) =>
(bco, seqs.zipWithIndex.map { case (seq, i) => Read(i.toString, seq) })
}
new DmuxedIterableImpl(data2, CloseableIterator.ofList)
DmuxedIterable.forReads(data2)
}

/** Builds a [[DmuxedIterable]] directly from in-memory [[Read]]s.
  *
  * @param data pairs of an optional string and the reads grouped under it
  *             (presumably the group's index barcode, with `None` meaning no barcode - confirm against callers)
  * @return a `DmuxedIterableImpl` over `data` that uses `CloseableIterator.ofList` to iterate each group's reads
  */
def forReads(data: List[(Option[String], List[Read])]): DmuxedIterable =
new DmuxedIterableImpl(data, CloseableIterator.ofList)

private class DmuxedIterableImpl[A](src: Iterable[(Option[String], A)], makeIterator: A => CloseableIterator[Read])
extends DmuxedIterable {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ package org.broadinstitute.gpp.poolq3.barcode
import cats.syntax.all._
import munit.FunSuite
import org.broadinstitute.gpp.poolq3.parser.DmuxedIterable
import org.broadinstitute.gpp.poolq3.types.Read

class DmuxedBarcodeSourceTest extends FunSuite {

Expand All @@ -26,7 +27,7 @@ class DmuxedBarcodeSourceTest extends FunSuite {
)
)

val src = new DmuxedBarcodeSource(iterable, rowPolicy, None)
val src = new DmuxedBarcodeSource(iterable, rowPolicy, None, 8)
assertEquals(
src.toList,
List(
Expand All @@ -40,9 +41,26 @@ class DmuxedBarcodeSourceTest extends FunSuite {
)
}

test("barcodes from read IDs") {
mtomko marked this conversation as resolved.
Show resolved Hide resolved
val undeterminedReads = List(Read("@eeeeee ACGTAA", "AAAAAAAAAA"), Read("@eeeeee ACTCAG", "CCCCCCCCCC"))
val aacctgReads = List(Read("@a read", "GGGGGGGGGG"), Read("@another read", "TTTTTTTTTT"))
val iterable = DmuxedIterable.forReads(List(None -> undeterminedReads, Some("AACCTG") -> aacctgReads))

val src = new DmuxedBarcodeSource(iterable, rowPolicy, None, 6)
assertEquals(
src.toList,
List(
fb("ACGTAA", "AAAAAAAAAA"),
fb("ACTCAG", "CCCCCCCCCC"),
fb("AACCTG", "GGGGGGGGGG"),
fb("AACCTG", "TTTTTTTTTT")
)
)
}

test("nothing works") {
val iterable = DmuxedIterable(Nil)
val src = new DmuxedBarcodeSource(iterable, rowPolicy, None)
val src = new DmuxedBarcodeSource(iterable, rowPolicy, None, 8)
assertEquals(src.toList, Nil)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ package org.broadinstitute.gpp.poolq3.barcode
import cats.syntax.all._
import munit.FunSuite
import org.broadinstitute.gpp.poolq3.parser.DmuxedIterable
import org.broadinstitute.gpp.poolq3.types.ReadIdCheckPolicy
import org.broadinstitute.gpp.poolq3.types.{Read, ReadIdCheckPolicy}

class DmuxedPairedEndBarcodeSourceTest extends FunSuite {

Expand All @@ -32,7 +32,7 @@ class DmuxedPairedEndBarcodeSourceTest extends FunSuite {

val iter2 = DmuxedIterable(List(None -> List("AGA", "CTC", "GAG"), Some("CTCGAG") -> List("TGT", "CAC", "TCT")))

val src = new DmuxedPairedEndBarcodeSource(iter1, iter2, rowPolicy, revRowPolicy, None, ReadIdCheckPolicy.Lax)
val src = new DmuxedPairedEndBarcodeSource(iter1, iter2, rowPolicy, revRowPolicy, None, ReadIdCheckPolicy.Lax, 8)
assertEquals(
src.toList,
List(
Expand All @@ -46,10 +46,31 @@ class DmuxedPairedEndBarcodeSourceTest extends FunSuite {
)
}

// Checks the fallback to read IDs: groups keyed by None carry no barcode of their own, so the expected
// barcode is the 6-base suffix of each read ID; groups keyed by Some(...) are expected to report that barcode.
test("barcodes from read IDs") {
// reads grouped under None: each ID ends in a 6-base sequence
val undeterminedRead1s = List(Read("@eeeeee ACGTAA", "AAAA"), Read("@eeeeee ACTCAG", "CCCC"))
val undeterminedRead2s = List(Read("@eeeeee ACGTAA", "AAA"), Read("@eeeeee ACTCAG", "CCC"))
// reads grouped under Some("AACCTG"): their IDs do not end in a 6-base sequence
val aacctgRead1s = List(Read("@a read", "GGGG"), Read("@another read", "TTTT"))
val aacctgRead2s = List(Read("@a read", "GGG"), Read("@another read", "TTT"))

val iter1 = DmuxedIterable.forReads(List(None -> undeterminedRead1s, Some("AACCTG") -> aacctgRead1s))
val iter2 = DmuxedIterable.forReads(List(None -> undeterminedRead2s, Some("AACCTG") -> aacctgRead2s))

// the trailing 6 is the column barcode length used when parsing barcodes out of read IDs
val src = new DmuxedPairedEndBarcodeSource(iter1, iter2, rowPolicy, revRowPolicy, None, ReadIdCheckPolicy.Lax, 6)
assertEquals(
src.toList,
List(
// NOTE(review): `fb` is defined outside this view; it appears to take (column barcode, read 1 sequence, read 2 sequence)
fb("ACGTAA", "AAAA", "AAA"),
fb("ACTCAG", "CCCC", "CCC"),
fb("AACCTG", "GGGG", "GGG"),
fb("AACCTG", "TTTT", "TTT")
)
)
}

test("nothing works") {
val i1 = DmuxedIterable(Nil)
val i2 = DmuxedIterable(Nil)
val src = new DmuxedPairedEndBarcodeSource(i1, i2, rowPolicy, revRowPolicy, None, ReadIdCheckPolicy.Illumina)
val src = new DmuxedPairedEndBarcodeSource(i1, i2, rowPolicy, revRowPolicy, None, ReadIdCheckPolicy.Illumina, 8)
assertEquals(src.toList, Nil)
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
* Copyright (c) 2022 The Broad Institute, Inc. All rights reserved.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
package org.broadinstitute.gpp.poolq3.barcode

import munit.FunSuite

/** Unit tests for [[Dmuxed.barcodeFromId]], all using an expected barcode length of 8. */
class DmuxedTest extends FunSuite {

  // a single parser is shared by every case; each test expects an 8-base barcode
  private val parseBarcode: String => Option[FoundBarcode] = Dmuxed.barcodeFromId(8)

  // the value a successful parse is expected to produce for the given bases
  private def found(bases: String): Option[FoundBarcode] = Some(FoundBarcode(bases.toCharArray, 0))

  test("extracting a barcode with Ns from an illumina read") {
    val readId = "@A01379:680:HC37HDRX3:1:2101:1163:1000 1:N:0:GGNGNANT"
    assertEquals(parseBarcode(readId), found("GGNGNANT"))
  }

  test("extracting a barcode with no Ns from an illumina read") {
    val readId = "@A01379:680:HC37HDRX3:1:2101:3224:1000 1:N:0:AAATGCGA"
    assertEquals(parseBarcode(readId), found("AAATGCGA"))
  }

  test("ignore a barcode-like sequence that's too long") {
    // 10 trailing bases: the character before the final 8 is itself a base, so nothing is extracted
    val readId = "@A01379:680:HC37HDRX3:1:2101:3224:1000 1:N:0:AAATGCGAGG"
    assertEquals(parseBarcode(readId), None)
  }

  test("ignore a barcode-like sequence that's too short") {
    // only 7 trailing bases after the last ':'
    val readId = "@A01379:680:HC37HDRX3:1:2101:3224:1000 1:N:0:TGCGAGG"
    assertEquals(parseBarcode(readId), None)
  }

}