Specify queries #242

Merged · 11 commits · May 28, 2024
.github/workflows/test_on_push.yml (4 additions, 4 deletions)
@@ -1,10 +1,10 @@
 on:
   push:
-    branches: [ master ]
+    branches: [ main ]
     paths-ignore:
       - '**/*.md'
   pull_request:
-    branches: [ master ]
+    branches: [ main ]
     paths-ignore:
       - '**/*.md'

@@ -38,9 +38,9 @@ jobs:
       - name: Test mapping coverage with 8 yeast genomes (PAF output)
         run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/scerevisiae8.fa.gz -p 95 -n 7 -m -L -Y '#' > scerevisiae8.paf; scripts/test.sh data/scerevisiae8.fa.gz.fai scerevisiae8.paf 0.92
       - name: Test mapping+alignment with a subset of the LPA dataset (PAF output)
-        run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -n 10 -T wflign_info. -u ./ -L --path-patching-tsv x.tsv > LPA.subset.paf && head LPA.subset.paf && head x.tsv
+        run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -n 10 -L > LPA.subset.paf && head LPA.subset.paf
       - name: Test mapping+alignment with a subset of the LPA dataset (SAM output)
-        run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -N -a -T wflign_info. -L > LPA.subset.sam && samtools view LPA.subset.sam -bS | samtools sort > LPA.subset.bam && samtools index LPA.subset.bam && samtools view LPA.subset.bam | head | cut -f 1-9
+        run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -N -a -L > LPA.subset.sam && samtools view LPA.subset.sam -bS | samtools sort > LPA.subset.bam && samtools index LPA.subset.bam && samtools view LPA.subset.bam | head | cut -f 1-9
       - name: Test mapping+alignment with short reads (500 bps) to a reference (SAM output)
         run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/reference.fa.gz data/reads.500bps.fa.gz -s 0.5k -N -a > reads.500bps.sam && samtools view reads.500bps.sam -bS | samtools sort > reads.500bps.bam && samtools index reads.500bps.bam && samtools view reads.500bps.bam | head
       - name: Test mapping+alignment with short reads (255bps) (PAF output)
scripts/all2all_jobs.py (90 additions, 0 deletions)
@@ -0,0 +1,90 @@
import argparse
import gzip
import itertools

def parse_fasta_index(fasta_file):
    fai_file = fasta_file + '.fai'
    sequences = []
    with open(fai_file, 'r') as file:
        for line in file:
            sequence_name = line.strip().split('\t')[0]
            sequences.append(sequence_name)
    return sequences

def group_sequences(sequences, grouping):
    grouped_sequences = {}
    for sequence in sequences:
        if '#' in sequence:
            fields = sequence.split('#')
            if grouping in ['g', 'genome']:
                group_key = fields[0]
            elif grouping in ['h', 'haplotype']:
                group_key = '#'.join(fields[:2])
            elif grouping in ['c', 'contig']:
                group_key = sequence
            else:
                raise ValueError(f"Invalid grouping: {grouping}")
        else:
            group_key = sequence

        if group_key not in grouped_sequences:
            grouped_sequences[group_key] = []
        grouped_sequences[group_key].append(sequence)

    return grouped_sequences

def generate_pairings(target_grouped_sequences, query_grouped_sequences, num_queries):
    pairings = []
    target_groups = list(target_grouped_sequences.keys())
    query_groups = list(query_grouped_sequences.keys())

    for target_group in target_groups:
        query_pool = [group for group in query_groups if group != target_group]

        for query_chunk in itertools.zip_longest(*[iter(query_pool)] * num_queries):
            query_chunk = [q for q in query_chunk if q is not None]
            pairings.append((target_group, query_chunk))

    return pairings

def main():
    parser = argparse.ArgumentParser(description='Generate pairings or wfmash command lines for all-to-all alignment using PanSN format.')
    parser.add_argument('fasta_file', help='Path to the FASTA file (can be gzipped)')
    parser.add_argument('-n', '--num-queries', type=int, default=4, help='Number of query groups per target group (default: 4)')
    parser.add_argument('-t', '--target-grouping', choices=['g', 'genome', 'h', 'haplotype', 'c', 'contig'], default='haplotype', help='Grouping level for targets: g/genome, h/haplotype, or c/contig (default: haplotype)')
    parser.add_argument('-q', '--query-grouping', choices=['g', 'genome', 'h', 'haplotype', 'c', 'contig'], default='haplotype', help='Grouping level for queries: g/genome, h/haplotype, or c/contig (default: haplotype)')
    parser.add_argument('-o', '--output', help='Output file to save the pairings or command lines')

    args, wfmash_args = parser.parse_known_args()

    # Parse the FASTA index file
    sequences = parse_fasta_index(args.fasta_file)

    # Group sequences based on the specified grouping levels
    target_grouped_sequences = group_sequences(sequences, args.target_grouping)
    query_grouped_sequences = group_sequences(sequences, args.query_grouping)

    # Generate pairings
    pairings = generate_pairings(target_grouped_sequences, query_grouped_sequences, args.num_queries)

    # Save or print the pairings or command lines
    if wfmash_args:
        wfmash_options = ' '.join(wfmash_args)
        if args.output:
            with open(args.output, 'w') as file:
                for target_group, query_groups in pairings:
                    file.write(f"wfmash {wfmash_options} -T {target_group} -Q {','.join(query_groups)}\n")
        else:
            for target_group, query_groups in pairings:
                print(f"wfmash {wfmash_options} -T {target_group} -Q {','.join(query_groups)}")
    else:
        if args.output:
            with open(args.output, 'w') as file:
                for target_group, query_groups in pairings:
                    file.write(f"{target_group}\t{','.join(query_groups)}\n")
        else:
            for target_group, query_groups in pairings:
                print(f"{target_group}\t{','.join(query_groups)}")

if __name__ == '__main__':
    main()
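
Below is a minimal usage sketch of the grouping and pairing helpers added above. It assumes the scripts/ directory is on PYTHONPATH so the module can be imported, and it uses hypothetical PanSN-style names (sample#haplotype#contig) in place of names that would normally be read from a FASTA .fai index.

# Minimal sketch; the sequence names are hypothetical, not from the repository's test data.
from all2all_jobs import group_sequences, generate_pairings

# PanSN-style names: sample#haplotype#contig
seqs = ["HG002#1#chr1", "HG002#1#chr2", "HG002#2#chr1", "HG00733#1#chr1"]

# Haplotype-level grouping keys: HG002#1, HG002#2, HG00733#1
groups = group_sequences(seqs, 'haplotype')

# Pair each target haplotype with chunks of up to two other haplotypes,
# mirroring the target/query columns (or -T/-Q arguments) the script emits.
for target, queries in generate_pairings(groups, groups, num_queries=2):
    print(f"{target}\t{','.join(queries)}")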
scripts/make_source_targball.sh (20 additions, 0 deletions)
@@ -0,0 +1,20 @@
#!/bin/bash

# run from root of the repository

mkdir source-tarball
cd source-tarball
git clone --recursive https://github.com/waveygang/wfmash
cd wfmash
git fetch --tags origin
LATEST_TAG="$(git describe --tags `git rev-list --tags --max-count=1`)"
git checkout "${LATEST_TAG}"
git submodule update --init --recursive
bash scripts/generate_git_version.sh $PWD/src
sed 's/execute_process(COMMAND bash/#execute_process(COMMAND bash/g' CMakeLists.txt -i
rm -Rf .git
find src/common -name ".git" -exec rm -Rf "{}" \;
cd ..
mv wfmash "wfmash-${LATEST_TAG}"
tar -czf "wfmash-${LATEST_TAG}.tar.gz" "wfmash-${LATEST_TAG}"
rm -Rf "wfmash-${LATEST_TAG}"
src/common/seqiter.hpp (53 additions, 0 deletions)
@@ -106,4 +106,57 @@ void for_each_seq_in_file(
    }
}

void for_each_seq_in_file(
    faidx_t* fai,
    const std::vector<std::string>& seq_names,
    const std::function<void(const std::string&, const std::string&)>& func) {
    for (const auto& seq_name : seq_names) {
        int len;
        char* seq = fai_fetch(fai, seq_name.c_str(), &len);
        if (seq != nullptr) {
            func(seq_name, std::string(seq));
            free(seq);
        }
    }
}

void for_each_seq_in_file_filtered(
    const std::string& filename,
    const std::vector<std::string>& query_prefix,
    const std::unordered_set<std::string>& query_list,
    const std::function<void(const std::string&, const std::string&)>& func) {
    faidx_t* fai = fai_load(filename.c_str());
    if (fai == nullptr) {
        std::cerr << "Error: Failed to load FASTA index for file " << filename << std::endl;
        return;
    }

    std::vector<std::string> query_seq_names;
    int num_seqs = faidx_nseq(fai);
    for (int i = 0; i < num_seqs; i++) {
        const char* seq_name = faidx_iseq(fai, i);
        bool prefix_skip = true;
        for (const auto& prefix : query_prefix) {
            if (strncmp(seq_name, prefix.c_str(), prefix.size()) == 0) {
                prefix_skip = false;
                break;
            }
        }
        if (!query_prefix.empty() && prefix_skip) {
            continue;
        }
        if (!query_list.empty() && query_list.count(seq_name) == 0) {
            continue;
        }
        query_seq_names.push_back(seq_name);
    }

    for_each_seq_in_file(
        fai,
        query_seq_names,
        func);

    fai_destroy(fai);
}

} // namespace seqiter
src/common/wflign/src/wflign.cpp (4 additions, 7 deletions)
@@ -695,6 +695,7 @@ void WFlign::wflign_affine_wavefront(
 
     // Free
     delete wflambda_aligner;
+    delete wf_aligner;
 
 #ifdef WFA_PNG_TSV_TIMING
     if (extend_data.emit_png) {
@@ -972,9 +973,6 @@
         }
 
         if (merge_alignments) {
-            // Free old aligner
-            delete wf_aligner;
-
             // use biWFA for all patching
             wfa::WFAlignerGapAffine2Pieces* wf_aligner =
                 new wfa::WFAlignerGapAffine2Pieces(
@@ -1029,7 +1027,9 @@
                 emit_patching_tsv,
                 out_patching_tsv
 #endif
-                );
+            );
+
+            delete wf_aligner;
         } else {
             // todo old implementation (and SAM format is not supported)
             for (auto x = trace.rbegin(); x != trace.rend(); ++x) {
@@ -1050,9 +1050,6 @@
                 }
             }
         }
-
-        // Free
-        delete wf_aligner;
     }
 }
 
src/interface/parse_args.hpp (23 additions, 17 deletions)
@@ -64,9 +64,9 @@ void parse_args(int argc,
     args::Group mandatory_opts(parser, "[ MANDATORY OPTIONS ]");
     args::Positional<std::string> target_sequence_file(mandatory_opts, "target", "alignment target/reference sequence file");
 
-    args::Group io_opts(parser, "[ Files IO Options ]");
-    args::PositionalList<std::string> query_sequence_files(io_opts, "queries", "query sequences file");
-    args::ValueFlag<std::string> query_sequence_file_list(io_opts, "queries", "alignment queries files list", {'Q', "query-file-list"});
+    args::Group io_opts(parser, "[ Files IO Options ]");
+    args::PositionalList<std::string> query_sequence_files(io_opts, "queries", "query sequence file(s)");
+    //args::ValueFlag<std::string> query_sequence_file_list(io_opts, "queries", "alignment queries files list", {'Q', "query-file-list"});
 
     args::Group mapping_opts(parser, "[ Mapping Options ]");
     args::ValueFlag<float> map_pct_identity(mapping_opts, "%", "percent identity in the mashmap step [default: 90]", {'p', "map-pct-id"});
@@ -80,8 +80,10 @@
     args::Flag skip_self(mapping_opts, "", "skip self mappings when the query and target name is the same (for all-vs-all mode)", {'X', "skip-self"});
     args::Flag one_to_one(mapping_opts, "", "Perform one-to-one filtering", {'4', "one-to-one"});
     args::ValueFlag<char> skip_prefix(mapping_opts, "C", "skip mappings when the query and target have the same prefix before the last occurrence of the given character C", {'Y', "skip-prefix"});
-    args::ValueFlag<std::string> target_prefix(mapping_opts, "pfx", "use only targets whose name starts with this prefix", {'P', "target-prefix"});
-    args::ValueFlag<std::string> target_list(mapping_opts, "FILE", "file containing list of target sequence names to use", {'A', "target-list"});
+    args::ValueFlag<std::string> target_prefix(mapping_opts, "pfx", "use only targets whose names start with this prefix", {'T', "target-prefix"});
+    args::ValueFlag<std::string> target_list(mapping_opts, "FILE", "file containing list of target sequence names to use", {'R', "target-list"});
+    args::ValueFlag<std::string> query_prefix(mapping_opts, "pfx[,pfx,...]", "use only queries whose names start with these prefixes (comma delimited)", {'Q', "query-prefix"});
+    args::ValueFlag<std::string> query_list(mapping_opts, "FILE", "file containing list of query sequence names", {'A', "query-list"});
     args::Flag approx_mapping(mapping_opts, "approx-map", "skip base-level alignment, producing an approximate mapping in PAF", {'m',"approx-map"});
     args::Flag no_split(mapping_opts, "no-split", "disable splitting of input sequences during mapping [default: enabled]", {'N',"no-split"});
     args::ValueFlag<std::string> chain_gap(mapping_opts, "N", "chain mappings closer than this distance in query and target, sets approximate maximum variant length detectable in alignment [default: 4*segment_length, up to 20k]", {'c', "chain-gap"});
@@ -102,19 +104,19 @@
     args::Group alignment_opts(parser, "[ Alignment Options ]");
     args::ValueFlag<std::string> align_input_paf(alignment_opts, "FILE", "derive precise alignments for this input PAF", {'i', "input-paf"});
     args::Flag invert_filtering(alignment_opts, "A", "if an input PAF is specified, remove alignments with gap-compressed identity below --map-pct-id x 0.8, else keep all alignments "
-                        "[default: if an input PAF is specified, keep all alignments, else remove alignments with gap-compressed identity below --map-pct-id x 0.8]",
+                                "[default: if an input PAF is specified, keep all alignments, else remove alignments with gap-compressed identity below --map-pct-id x 0.8]",
                         {'O', "invert-filtering"});
     args::ValueFlag<uint16_t> wflambda_segment_length(alignment_opts, "N", "wflambda segment length: size (in bp) of segment mapped in hierarchical WFA problem [default: 256]", {'W', "wflamda-segment"});
     args::ValueFlag<std::string> wfa_score_params(alignment_opts, "mismatch,gap1,ext1",
-                        "score parameters for the wfa alignment (affine); match score is fixed at 0 [default: 6,8,1]",
-                        {"wfa-params"});
+                                                  "score parameters for the wfa alignment (affine); match score is fixed at 0 [default: 6,8,1]",
+                                                  {"wfa-params"});
     args::ValueFlag<std::string> wfa_patching_score_params(alignment_opts, "mismatch,gap1,ext1,gap2,ext2",
-                        "score parameters for the wfa patching alignment (convex); match score is fixed at 0 [default: 5,8,2,49,1]",
-                        {"wfa-patching-params"});
+                                                           "score parameters for the wfa patching alignment (convex); match score is fixed at 0 [default: 5,8,2,49,1]",
+                                                           {"wfa-patching-params"});
     //wflign parameters
     args::ValueFlag<std::string> wflign_score_params(alignment_opts, "mismatch,gap1,ext1",
-                        "score parameters for the wflign alignment (affine); match score is fixed at 0 [default: 4,6,1]",
-                        {"wflign-params"});
+                                                     "score parameters for the wflign alignment (affine); match score is fixed at 0 [default: 4,6,1]",
+                                                     {"wflign-params"});
     args::ValueFlag<float> wflign_max_mash_dist(alignment_opts, "N", "maximum mash distance to perform the alignment in a wflambda segment [default: adaptive with respect to the estimated identity]", {'b', "max-mash-dist"});
     args::ValueFlag<int> wflign_min_wavefront_length(alignment_opts, "N", "min wavefront length for heuristic WFlign [default: 1024]", {'j', "wflign-min-wf-len"});
     args::ValueFlag<int> wflign_max_distance_threshold(alignment_opts, "N", "max distance threshold for heuristic WFlign [default: 2048/(estimated_identity^2)]", {'q', "wflign-max-distance"});
@@ -139,7 +141,7 @@
 
 #ifdef WFA_PNG_TSV_TIMING
     args::Group debugging_opts(parser, "[ Debugging Options ]");
-    args::ValueFlag<std::string> prefix_wavefront_info_in_tsv(parser, "PREFIX", " write wavefronts' information for each alignment in TSV format files with this PREFIX", {'T', "tsv"});
+    args::ValueFlag<std::string> prefix_wavefront_info_in_tsv(parser, "PREFIX", " write wavefronts' information for each alignment in TSV format files with this PREFIX", {'G', "tsv"});
     args::ValueFlag<std::string> prefix_wavefront_plot_in_png(parser, "PREFIX", " write wavefronts' plot for each alignment in PNG format files with this PREFIX", {'u', "prefix-png"});
     args::ValueFlag<uint64_t> wfplot_max_size(parser, "N", "max size of the wfplot [default: 1500]", {'z', "wfplot-max-size"});
     args::ValueFlag<std::string> path_patching_info_in_tsv(parser, "FILE", " write patching information for each alignment in TSV format in FILE", {"path-patching-tsv"});
@@ -202,6 +204,14 @@
     if (target_prefix) {
         map_parameters.target_prefix = args::get(target_prefix);
     }
+
+    if (query_list) {
+        map_parameters.query_list = args::get(query_list);
+    }
+
+    if (query_prefix) {
+        map_parameters.query_prefix = skch::CommonFunc::split(args::get(query_prefix), ',');
+    }
 
     if (target_sequence_file) {
         map_parameters.refSequences.push_back(args::get(target_sequence_file));
@@ -215,10 +225,6 @@
             align_parameters.querySequences.push_back(q);
         }
     }
-    if (query_sequence_file_list) {
-        skch::parseFileList(args::get(query_sequence_file_list), map_parameters.querySequences);
-        skch::parseFileList(args::get(query_sequence_file_list), align_parameters.querySequences);
-    }
 
     if (target_sequence_file && map_parameters.querySequences.empty()
         && map_parameters.refSequences.size() == 1