Skip to content

Commit

Permalink
Merge branch 'main' into fix_patching_and_stats
Browse files Browse the repository at this point in the history
  • Loading branch information
AndreaGuarracino committed Mar 6, 2024
2 parents f80da58 + 7205bf7 commit d11aa09
Show file tree
Hide file tree
Showing 14 changed files with 335 additions and 103 deletions.
29 changes: 28 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -175,10 +175,37 @@ Note that this may make the tool a little bit slower.

If you have `nix`, build and installation in your profile are as simple as:

```
```shell
nix-build && nix-env -i ./result
```

### guix

If you have `guix`:

```shell
guix build -f guix.scm
```

#### Docker and Singularity images with nix

Nix is also able to build an Docker image, which can then be loaded by Docker and converted to a Singularity image.

```
nix-build docker.nix
docker load < result
singularity build wfmash.sif docker-daemon://wfmash-docker:latest
```

This can be run with Singularity like this:

```
singularity run wfmash.sif $ARGS
```

Where `$ARGS` are your typical command line arguments to `wfmash`.


### Bioconda

`wfmash` recipes for Bioconda are available at https://anaconda.org/bioconda/wfmash.
Expand Down
2 changes: 1 addition & 1 deletion default.nix
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{ pkgs ? import <nixpkgs> {} }:

pkgs.callPackage ./wfmash.nix {
inherit (pkgs) stdenv fetchFromGitHub cmake gsl gmp makeWrapper jemalloc htslib git zlib pkgconfig;
inherit (pkgs) stdenv fetchFromGitHub cmake gsl gmp makeWrapper jemalloc htslib git zlib pkg-config;
}
15 changes: 15 additions & 0 deletions docker.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{ pkgs ? import <nixpkgs> { }
, pkgsLinux ? import <nixpkgs> { system = "x86_64-linux"; }
}:

let
wfmash = pkgs.callPackage ./wfmash.nix { };
in
pkgs.dockerTools.buildImage {
name = "wfmash-docker";
tag = "latest";
copyToRoot = [ wfmash ];
config = {
Entrypoint = [ "${wfmash}/bin/wfmash" ];
};
}
86 changes: 86 additions & 0 deletions guix.scm
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
;; To use this file to build a static version of wfmash using git HEAD:
;;
;; guix build -f guix.scm
;;
;; To get a development container using a recent guix (see `guix pull`)
;;
;; guix shell -C -D -f guix.scm
;;
;; and inside the container
;;
;; mkdir build
;; cd build
;; cmake ..
;; make
;;
;; For the tests you may need /usr/bin/env. Inside the container:
;;
;; mkdir -p /usr/bin ; ln -s $GUIX_ENVIRONMENT/bin/env /usr/bin/env
;;
;; by Pjotr Prins & Andrea Guarracino (c) 2023

(use-modules
((guix licenses) #:prefix license:)
(guix gexp)
(guix packages)
(guix git-download)
(guix build-system cmake)
; (guix gexp)
(guix utils)
(gnu packages algebra)
(gnu packages base)
(gnu packages bioinformatics)
(gnu packages build-tools)
(gnu packages compression)
; (gnu packages curl)
(gnu packages gcc)
(gnu packages jemalloc)
(gnu packages llvm)
(gnu packages maths)
(gnu packages multiprecision)
(gnu packages pkg-config)
(gnu packages python)
(gnu packages version-control)
(srfi srfi-1)
(ice-9 popen)
(ice-9 rdelim))

(define %source-dir (dirname (current-filename)))

(define %git-commit
(read-string (open-pipe "git show HEAD | head -1 | cut -d ' ' -f 2" OPEN_READ)))

(define-public wfmash-git
(package
(name "wfmash-git")
(version (git-version "0.10.7" "HEAD" %git-commit))
(source (local-file %source-dir #:recursive? #t))
(build-system cmake-build-system)
(arguments
`(#:tests? #f)) ;; Disable the test phase
(inputs
`(
;; ("clang" ,clang) ; add this to test clang builds
;; ("lld" ,lld) ; add this to test clang builds
("gcc" ,gcc-12)
("gsl" ,gsl)
("gmp" ,gmp)
("make" ,gnu-make)
("pkg-config" ,pkg-config)
("jemalloc" ,jemalloc)
("htslib" ,htslib)
("git" ,git)
; ("bc" ,bc) ; for tests
("coreutils" ,coreutils) ; for echo and env in tests
; ("curl" ,curl)
; ("parallel" ,parallel) ; for wfmash-parallel
("bzip2" ,bzip2) ; libz2 part of htslib
("xz" ,xz) ;
("zlib" ,zlib)))
(synopsis "wfmash")
(description
"wfmash.")
(home-page "https://github.com/waveygang/wfmash")
(license license:expat)))

wfmash-git
2 changes: 1 addition & 1 deletion scripts/split_approx_mappings_in_chunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def split_chunks(l, n):
# We could avoid keeping everything in memory by reading the file again later
rank_to_mapping_dict[rank] = line

_, _, query_start, query_end, _, _, _, target_start, target_end, _, _, _, estimated_identity = line.strip().split('\t')
_, _, query_start, query_end, _, _, _, target_start, target_end, _, _, _, estimated_identity = line.strip().split('\t')[:13]

num_mapped_bases = max(int(query_end) - int(query_start), int(target_end) - int(target_start))
estimated_identity = float(estimated_identity.split('id:f:')[1]) / 100.0
Expand Down
2 changes: 1 addition & 1 deletion src/align/include/computeAlignments.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ namespace align
// Extract the mashmap identity from the string
const vector<string> mm_id_vec = skch::CommonFunc::split(tokens[12], ':');
// if the estimated identity is missing, avoid assuming too low values
const float mm_id = wfmash::is_a_number(mm_id_vec.back()) ? std::stof(mm_id_vec.back())/(float) 100.0 : skch::fixed::percentage_identity; // divide by 100 for consistency with block alignment
const float mm_id = wfmash::is_a_number(mm_id_vec.back()) ? std::stof(mm_id_vec.back()) : skch::fixed::percentage_identity;

//Save words into currentRecord
{
Expand Down
2 changes: 1 addition & 1 deletion src/align/include/parseCmdArgs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ Example usage: \n\
std::cerr << "[wfmash::align] Reference = " << parameters.refSequences << std::endl;
std::cerr << "[wfmash::align] Query = " << parameters.querySequences << std::endl;
std::cerr << "[wfmash::align] Mapping file = " << parameters.mashmapPafFile << std::endl;
std::cerr << "[wfmash::align] Alignment identity cutoff = " << parameters.min_identity << "\%" << std::endl;
std::cerr << "[wfmash::align] Alignment identity cutoff = " << std::fixed << std::setprecision(2) << parameters.min_identity * 100.0 << "\%" << std::endl;
std::cerr << "[wfmash::align] Alignment output file = " << parameters.pafOutputFile << std::endl;
}

Expand Down
6 changes: 3 additions & 3 deletions src/common/wflign/src/wflign_patch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1631,9 +1631,9 @@ query_start : query_end)
<< std::round(float2phred(1.0 - block_identity))
//<< "\t" << "as:i:" << total_score
<< "\t"
<< "gi:f:" << gap_compressed_identity * 100.0 << "\t"
<< "gi:f:" << gap_compressed_identity << "\t"
<< "bi:f:"
<< block_identity * 100.0
<< block_identity
//<< "\t" << "md:f:" << mash_dist_sum / trace.size()
//<< "\t" << "ma:i:" << matches
//<< "\t" << "mm:i:" << mismatches
Expand All @@ -1642,7 +1642,7 @@ query_start : query_end)
//<< "\t" << "nd:i:" << deletions
//<< "\t" << "dd:i:" << deleted_bp
<< "\t"
<< "md:f:" << mashmap_estimated_identity * 100.0;
<< "md:f:" << mashmap_estimated_identity;

if (emit_md_tag) {
out << "\t";
Expand Down
8 changes: 7 additions & 1 deletion src/interface/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,12 @@ int main(int argc, char** argv) {
std::chrono::duration<double> timeRefSketch = skch::Time::now() - t0;
std::cerr << "[wfmash::map] time spent computing the reference index: " << timeRefSketch.count() << " sec" << std::endl;

if (referSketch.minmerIndex.size() == 0)
{
std::cerr << "[wfmash::map] ERROR, reference sketch is empty. Reference sequences shorter than the segment length are not indexed" << std::endl;
return 1;
}

//Map the sequences in query file
t0 = skch::Time::now();

Expand Down Expand Up @@ -151,7 +157,7 @@ int main(int argc, char** argv) {
<< "\t" << 0
<< "\t" << std::max(e.rEndPos - e.rStartPos, e.qEndPos - e.qStartPos)
<< "\t" << 255
<< "\t" << "id:f:" << e.mashmap_estimated_identity * 100.0
<< "\t" << "id:f:" << e.mashmap_estimated_identity
<< "\n";
}
}
Expand Down
17 changes: 9 additions & 8 deletions src/interface/parse_args.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,13 @@ void parse_args(int argc,
args::ValueFlag<float> map_pct_identity(mapping_opts, "%", "percent identity in the mashmap step [default: 90]", {'p', "map-pct-id"});
args::ValueFlag<std::string> segment_length(mapping_opts, "N", "segment seed length for mapping [default: 5k]", {'s', "segment-length"});
args::ValueFlag<std::string> block_length(mapping_opts, "N", "keep merged mappings supported by homologies of this total length [default: 5*segment-length]", {'l', "block-length"});
args::ValueFlag<uint32_t> num_mappings_for_segments(mapping_opts, "N", "number of mappings to retain for each segment [default: 1]", {'n', "num-mappings-for-segment"});
args::ValueFlag<uint32_t> num_mappings_for_short_seq(mapping_opts, "N", "number of mappings to retain for each sequence shorter than segment length [default: 1]", {'S', "num-mappings-for-short-seq"});
args::ValueFlag<uint32_t> num_mappings_for_segments(mapping_opts, "N", "number of mappings to retain for each query/reference pair [default: 1]", {'n', "num-mappings-for-segment"});
args::ValueFlag<uint32_t> num_mappings_for_short_seq(mapping_opts, "N", "number of mappings to retain for each query/reference pair where the query sequence is shorter than segment length [default: 1]", {'S', "num-mappings-for-short-seq"});
args::ValueFlag<int> kmer_size(mapping_opts, "N", "kmer size [default: 19]", {'k', "kmer"});
args::ValueFlag<float> kmer_pct_threshold(mapping_opts, "%", "ignore the top % most-frequent kmers [default: 0.001]", {'H', "kmer-threshold"});
args::Flag lower_triangular(mapping_opts, "", "only map shorter sequences against longer", {'L', "lower-triangular"});
args::Flag skip_self(mapping_opts, "", "skip self mappings when the query and target name is the same (for all-vs-all mode)", {'X', "skip-self"});
args::Flag one_to_one(mapping_opts, "", "Perform one-to-one filtering", {'4', "one-to-one"});
args::ValueFlag<char> skip_prefix(mapping_opts, "C", "skip mappings when the query and target have the same prefix before the last occurrence of the given character C", {'Y', "skip-prefix"});
args::ValueFlag<std::string> target_prefix(mapping_opts, "pfx", "use only targets whose name starts with this prefix", {'P', "target-prefix"});
args::ValueFlag<std::string> target_list(mapping_opts, "FILE", "file containing list of target sequence names to use", {'A', "target-list"});
Expand All @@ -86,6 +87,7 @@ void parse_args(int argc,
args::Flag drop_low_map_pct_identity(mapping_opts, "K", "drop mappings with estimated identity below --map-pct-id=%", {'K', "drop-low-map-id"});
args::Flag no_filter(mapping_opts, "MODE", "disable mapping filtering", {'f', "no-filter"});
args::ValueFlag<double> map_sparsification(mapping_opts, "FACTOR", "keep this fraction of mappings", {'x', "sparsify-mappings"});
//ToFix: args::Flag keep_ties(mapping_opts, "", "keep all mappings with equal score even if it results in more than n mappings", {'D', "keep-ties"});
args::ValueFlag<int64_t> sketch_size(mapping_opts, "N", "sketch size for sketching.", {'w', "sketch-size"});
args::ValueFlag<double> kmer_complexity(mapping_opts, "F", "Drop segments w/ predicted kmer complexity below this cutoff. Kmer complexity defined as #kmers / (s - k + 1)", {'J', "kmer-complexity"});
args::Flag no_hg_filter(mapping_opts, "", "Don't use the hypergeometric filtering and instead use the MashMap2 first pass filtering.", {'1', "no-hg-filter"});
Expand All @@ -111,7 +113,7 @@ void parse_args(int argc,
{'G', "wflign-params"});
args::ValueFlag<float> wflign_max_mash_dist(alignment_opts, "N", "maximum mash distance to perform the alignment in a wflambda segment [default: adaptive with respect to the estimated identity]", {'b', "max-mash-dist"});
args::ValueFlag<int> wflign_min_wavefront_length(alignment_opts, "N", "min wavefront length for heuristic WFlign [default: 1024]", {'j', "wflign-min-wf-len"});
args::ValueFlag<int> wflign_max_distance_threshold(alignment_opts, "N", "max distance threshold for heuristic WFlign [default: 2048/(estimated_identity^2)]", {'q', "wflign-max-disttance"});
args::ValueFlag<int> wflign_max_distance_threshold(alignment_opts, "N", "max distance threshold for heuristic WFlign [default: 2048/(estimated_identity^2)]", {'q', "wflign-max-distance"});

// patching parameter
args::ValueFlag<std::string> wflign_max_len_major(alignment_opts, "N", "maximum length to patch in the major axis [default: 512*segment-length]", {'C', "max-patch-major"});
Expand Down Expand Up @@ -236,11 +238,8 @@ void parse_args(int argc,
if (no_filter) {
map_parameters.filterMode = skch::filter::NONE;
} else {
if (map_parameters.skip_self || map_parameters.skip_prefix) {
// before we set skch::filter::ONETOONE here
// but this does not provide a clear benefit in all-to-all
// as it sometimes introduces cases of over-filtering
map_parameters.filterMode = skch::filter::MAP;
if (one_to_one) {
map_parameters.filterMode = skch::filter::ONETOONE;
} else {
map_parameters.filterMode = skch::filter::MAP;
}
Expand Down Expand Up @@ -339,6 +338,7 @@ void parse_args(int argc,
align_parameters.sam_format = args::get(sam_format);
align_parameters.no_seq_in_sam = args::get(no_seq_in_sam);
map_parameters.split = !args::get(no_split);
map_parameters.dropRand = false;//ToFix: !args::get(keep_ties);
align_parameters.split = !args::get(no_split);

map_parameters.mergeMappings = !args::get(no_merge);
Expand Down Expand Up @@ -566,6 +566,7 @@ void parse_args(int argc,
map_parameters.filterLengthMismatches = true;

map_parameters.stage1_topANI_filter = !bool(no_hg_filter);
map_parameters.stage2_full_scan = true;

if (hg_filter_ani_diff)
{
Expand Down
Loading

0 comments on commit d11aa09

Please sign in to comment.