Skip to content

Commit

Permalink
Update to README & allows pipeline to evaluate .Rtab gene presence ma…
Browse files Browse the repository at this point in the history
…trices.
  • Loading branch information
maxgmarin committed Mar 26, 2024
1 parent db2dce6 commit b2360f6
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 52 deletions.
11 changes: 10 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,13 @@ panqc nrc -a InputAsmPaths.tsv -r pan_genome_reference.fa -m gene_presence_absen

The above command will output an adjusted gene presence absence matrix along with additional statistics to the specified output directory (`NRC_results/`).


Alternatively, if you would like to use a `gene_presence_absence.Rtab` file instead of a CSV matrix of gene presence, use add the `--is-rtab` flag.

```
panqc nrc -a InputAsmPaths.tsv -r pan_genome_reference.fa -m gene_presence_absence.Rtab --is-rtab -o NRC_results/
```

### Analyzing included test data set

If you wish to run an `panqc nrc` on an artifical (and abridged) test data set, you can run the following commands:
Expand Down Expand Up @@ -119,7 +126,7 @@ optional arguments:
Input pan-genome nucleotide reference. Typically output as `pan_genome_reference.fasta` by Panaroo/Roary (FASTA)
-m, --gene_matrix gene_presence_absence.csv
Input pan-genome gene presence/absence matrix. Typically output as `gene_presence_absence.csv` by Panaroo/Roary (CSV)
Input pan-genome gene presence/absence matrix. By default is assumed to be a `gene_presence_absence.csv` output by Panaroo/Roary (CSV) If the user provides the --is-rtab flag, the input is assumed to be an .Rtab (TSV)file.
-o, --results_dir RESULTS_DIR
Output directory for analysis results.
Expand All @@ -138,6 +145,7 @@ optional arguments:
-t, --min-ksim MIN_KSIM
Minimum k-mer similarity (maximum jaccard containment of k-mers between pair of sequences) to cluster sequences into the same "nucleotide similarity cluster" (Default: 0.8))
--is-rtab Flag indicating that the input gene matrix is a tab-delimited .Rtab file
```


Expand Down Expand Up @@ -165,6 +173,7 @@ optional arguments:
```


>🚧 Check back soon for full usage for each of the utility sub-pipelines of the panqc toolkit 🚧

Expand Down
62 changes: 36 additions & 26 deletions panqc/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python3

### Authors: Max Marin ([email protected])
# Pan-genome QC toolkit (PQGC)
# Pan-genome QC (panqc) toolkit


import sys
Expand All @@ -22,21 +22,23 @@ def _nrc_cli(args):
## 1) Set input parameters and PATHs ####
input_Assemblies_TSV = args.asms
input_PG_Ref_FA = args.pg_ref
input_PresAbs_CSV = args.gene_matrix
input_PresAbs_File = args.gene_matrix
results_dir = args.results_dir
min_query_cov = args.min_query_cov
min_seq_id = args.min_seq_id
kmer_size = args.kmer_size
ksim_cluster_thresh = args.min_ksim
prefix = args.prefix
rtab_input = args.is_rtab

## 2) Run the assembly sequence check function ####

Gene_PresAbs_WiAsmSeqCheck_DF = asmseqcheck_frompaths(input_PresAbs_CSV,
input_PG_Ref_FA,
input_Assemblies_TSV,
min_query_cov,
min_seq_id)
Gene_PresAbs_WiAsmSeqCheck_DF = asmseqcheck_frompaths(input_PresAbs_File,
input_PG_Ref_FA,
input_Assemblies_TSV,
min_query_cov,
min_seq_id,
rtab_input)

# 3) Print the general QC Stats
ASC_Stats_DF = get_AsmSeqCheck_QCStatsDF(Gene_PresAbs_WiAsmSeqCheck_DF)
Expand Down Expand Up @@ -124,11 +126,11 @@ def _asmseqcheck_cli(args):
## 1) Set input parameters and PATHs ####

# Define input paths
input_PG_Ref_FA = args.in_pg_ref
input_PG_Ref_FA = args.pg_ref

input_AsmFA_TSV = args.in_assemblies
input_Assemblies_TSV = args.asms

input_PresAbs_CSV = args.in_gene_matrix
input_PresAbs_File = args.gene_matrix

# Define output path
output_PresAbs_WiDNASeqCheck = args.out_gene_matrix_wi_geneseqcheck
Expand All @@ -137,16 +139,18 @@ def _asmseqcheck_cli(args):
min_query_cov = args.min_query_cov
min_seq_id = args.min_seq_id

## 2) Run the assembly sequence check function ####
# Set variable which defines whether the input gene matrix is an .Rtab file (or CSV file)
rtab_input = args.is_rtab

Gene_PresAbs_WiAsmSeqCheck_DF = asmseqcheck_frompaths(input_PresAbs_CSV,
input_PG_Ref_FA,
input_AsmFA_TSV,
min_query_cov,
min_seq_id)
## 2) Run the assembly sequence check function ####
Gene_PresAbs_WiAsmSeqCheck_DF = asmseqcheck_frompaths(input_PresAbs_File,
input_PG_Ref_FA,
input_Assemblies_TSV,
min_query_cov,
min_seq_id,
rtab_input)

# 3) Print the general QC Stats

_ = get_AsmSeqCheck_QCStats(Gene_PresAbs_WiAsmSeqCheck_DF)


Expand Down Expand Up @@ -195,7 +199,6 @@ def _nscluster_cli(args):



# fg("blue")
def main():
parser = argparse.ArgumentParser(description = "Toolkit for focused on augmenting common CDS based pan-genome analysis with nucleotide sequence comparison.")
sub_parser_1 = parser.add_subparsers(required=True, help='Please select one of the pipelines of the PanQC toolkit.')
Expand All @@ -209,7 +212,7 @@ def main():
help="Input pan-genome nucleotide reference. Typically output as `pan_genome_reference.fasta` by Panaroo/Roary (FASTA)")

nrc_parser.add_argument('-m', '--gene_matrix', type=str, required=True, metavar="gene_presence_absence.csv",
help="Input pan-genome gene presence/absence matrix. Typically output as `gene_presence_absence.csv` by Panaroo/Roary (CSV)")
help="Input pan-genome gene presence/absence matrix. By default is assumed to be a `gene_presence_absence.csv` output by Panaroo/Roary (CSV) \n If the user provides the --is-rtab flag, the input is assumed to be an .Rtab (TSV) file.")

nrc_parser.add_argument('-o', '--results_dir', type=str, required=True,
help="Output directory for analysis results.")
Expand All @@ -228,7 +231,10 @@ def main():

nrc_parser.add_argument('-t', '--min-ksim',type=float, default=0.8,
help='Minimum k-mer similarity (maximum jaccard containment of k-mers between pair of sequences) to cluster sequences into the same "nucleotide similarity cluster" (Default: 0.8))')


nrc_parser.add_argument('--is-rtab', action='store_true',
help="Flag indicating that the input gene matrix is a tab-delimited .Rtab file")

nrc_parser.set_defaults(func=_nrc_cli)


Expand All @@ -240,14 +246,14 @@ def main():
utils_sub_parser = utils_parser.add_subparsers(required=True, help='Please select one of the utilility pipelines of the PanQC toolkit.')

asmseqcheck_parser = utils_sub_parser.add_parser("asmseqcheck", help="")
asmseqcheck_parser.add_argument('-a', '--in_assemblies', type=str, required=True,
help="Paths to input assemblies. (TSV)")
asmseqcheck_parser.add_argument('-a', '--asms', type=str, required=True, metavar="PathToAsms.tsv",
help="Table with SampleID & Paths to each input assemblies. (TSV)")

asmseqcheck_parser.add_argument('-r', '--in_pg_ref', type=str, required=True,
help="Input pan-genome nucleotide reference. Typically output as `pan_genome_reference.fasta` by Panaroo/Roary (FASTA)")
asmseqcheck_parser.add_argument('-r', '--pg-ref', type=str, required=True, metavar="pan_genome_reference.fasta",
help="Input pan-genome nucleotide reference. Typically output as `pan_genome_reference.fasta` by Panaroo/Roary (FASTA)")

asmseqcheck_parser.add_argument('-m', '--in_gene_matrix', type=str, required=True,
help="Input pan-genome gene presence/absence matrix. Typically output as `gene_presence_absence.csv` by Panaroo/Roary (CSV)")
asmseqcheck_parser.add_argument('-m', '--gene_matrix', type=str, required=True, metavar="gene_presence_absence.csv",
help="Input pan-genome gene presence/absence matrix. By default is assumed to be a `gene_presence_absence.csv` output by Panaroo/Roary (CSV) \n If the user provides the --is-rtab flag, the input is assumed to be an .Rtab (TSV) file.")

asmseqcheck_parser.add_argument('-o', '--out_gene_matrix_wi_geneseqcheck',type=str, required=True,
help="Output pan-genome gene presence/absence matrix with updated gene presence/absence calls. (CSV). \n NOTE: 2 reflects that similar gene sequence is present at the nucleotide level (CSV)")
Expand All @@ -258,6 +264,10 @@ def main():
asmseqcheck_parser.add_argument('-i', '--min_seq_id', type=float, default=0.9,
help="Minimum sequence identity to classify a gene as present within an assembly (0-1)")

asmseqcheck_parser.add_argument('--is-rtab', action='store_true',
help="Flag indicating that the input gene matrix is a tab-delimited .Rtab file")


asmseqcheck_parser.set_defaults(func=_asmseqcheck_cli)


Expand Down
45 changes: 22 additions & 23 deletions panqc/asm_gene_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# Set the logging level to INFO
#logging.basicConfig(level=logging.INFO)

from .utils import parse_PresAbs_CSV_General, get_columns_excluding, parse_PG_Ref_FA
from .utils import parse_PresAbs_CSV_General, parse_PresAbs_Rtab, get_columns_excluding, parse_PG_Ref_FA


#### Define function for parsing Mappy alignment hits
Expand Down Expand Up @@ -239,59 +239,58 @@ def create_LRandSR_AsmFA_PATH_Dict(i_AsmFA_LRandSR_TSV):
'Order within Fragment', 'Accessory Fragment', 'Accessory Order with Fragment', 'QC',
'Min group size nuc', 'Max group size nuc', 'Avg group size nuc']

def asmseqcheck_frompaths(i_Gene_PresAbs_CSV_PATH,
def asmseqcheck_frompaths(i_Gene_PresAbs_File_PATH,
i_PG_Ref_FA_PATH,
i_AsmFA_TSV,
MinQueryCov, MinQuerySeqID):
MinQueryCov,
MinQuerySeqID,
rtab_matrix = False):
"""
Searches for gene sequences in genome assemblies and returns a DataFrame with presence/absence information.
Args:
i_Gene_PresAbs_CSV_PATH (str): Path to the gene presence/absence matrix CSV file.
i_PG_Ref_FA_PATH (str): Path to the pan-genome (nucleotide) reference fasta file.
i_Gene_PresAbs_File_PATH (str): Path to the gene presence/absence matrix file.
i_PG_Ref_FA_PATH (str): Path to the pan-genome reference fasta file.
i_AsmFA_TSV (str): Path to the TSV file containing the genome assembly fasta file paths.
MinQueryCov (float): Minimum query coverage required for a gene sequence to be considered present.
MinQuerySeqID (float): Minimum query sequence identity required for a gene sequence to be considered present.
rtab_matrix (bool): Flag indicating whether the gene presence/absence matrix is in .Rtab format.
Returns:
pandas.DataFrame: A DataFrame with presence/absence information for each gene in each sample.
"""

# 1) Parse in Gene Pres/Abs matrix and select sampleIDs

# Parse the gene presence/absence matrix and select sampleIDs
print("Parsing input gene presence/absence matrix...")
Gene_PresAbs_DF = parse_PresAbs_CSV_General(i_Gene_PresAbs_CSV_PATH)

i_SampleIDs = get_columns_excluding(Gene_PresAbs_DF, PresAbs_NonSampleID_ColNames)
if rtab_matrix:
print(" Gene presence/absence matrix being parsed as TSV (.Rtab)")
Gene_PresAbs_DF = parse_PresAbs_Rtab(i_Gene_PresAbs_File_PATH)
print(" Dimensions of parsed matrix", Gene_PresAbs_DF.shape)
else:
print(" Gene presence/absence matrix being parsed as CSV")
Gene_PresAbs_DF = parse_PresAbs_CSV_General(i_Gene_PresAbs_File_PATH)
print(" Dimensions of parsed matrix", Gene_PresAbs_DF.shape)

i_SampleIDs = get_columns_excluding(Gene_PresAbs_DF, PresAbs_NonSampleID_ColNames)
print("Finished parsing gene presence/absence matrix\n")

# 2) Read in pan-genome (nucleotide) reference fasta
# Read in the pan-genome reference fasta
print("Parsing input gene reference fasta")
PG_Ref_NucSeqs = parse_PG_Ref_FA(i_PG_Ref_FA_PATH)
print("Finished parsing gene reference fasta\n")

# 3) Read in genome assembly fasta PATH TSV, convert to dict
# Read in the genome assembly fasta PATH TSV and convert to dict
print("Parsing TSV of input assembly PATHs")
AsmFA_Dict = create_AsmFA_PATH_Dict(i_AsmFA_TSV)
print("Finished parsing input assembly PATHs \n")

# 4) Run alignments to check for gene sequences in assemblies
print("Running alignments to check for gene sequences in assemblies...")

# Gene_PresAbs_WiAsmSeqCheck_DF = PresAbsQC_CheckAsmForGeneSeq(Gene_PresAbs_DF, PG_Ref_NucSeqs,
# AsmFA_Dict, i_SampleIDs,
# MinQueryCov, MinQuerySeqID)

# Run alignments to check for gene sequences in assemblies
print("Running alignments to check for gene sequences in assemblies...")
Gene_PresAbs_WiAsmSeqCheck_DF = PresAbsQC_CheckAsmForGeneSeq_V2(Gene_PresAbs_DF, PG_Ref_NucSeqs,
AsmFA_Dict, i_SampleIDs,
MinQueryCov, MinQuerySeqID)



print("Finished searching for gene sequences in assemblies \n")


return Gene_PresAbs_WiAsmSeqCheck_DF


Expand Down
3 changes: 2 additions & 1 deletion panqc/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@

def parse_PresAbs_Rtab(PresAbs_Rtab_PATH):
'''
This function parsesthe `gene_presence_absence.csv` file output by Panaroo '''
This function parses the `gene_presence_absence.Rtab` file output by many pan-genome analysis softwares (Panaroo, Roary)
'''

i_Gene_PresAbs_DF = pd.read_csv(PresAbs_Rtab_PATH, sep = "\t")

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ build-backend = "hatchling.build"
name = "panqc"
description = "Package & software for analysis of nucleotide redundancy within CDS-based pan-genome analyses"
readme = "README.md"
version = "0.0.3"
version = "0.0.4"
authors = [
{ name = "Maximillian Marin", email = "[email protected]" }
]
Expand Down

0 comments on commit b2360f6

Please sign in to comment.