Skip to content

Commit

Permalink
Fix issues in processing contaminants refseq genomes, including do no…
Browse files Browse the repository at this point in the history
…t chopping off for the pipe sign in the header
  • Loading branch information
mourisl committed May 9, 2024
1 parent b749f9e commit 2832c62
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 2 deletions.
7 changes: 6 additions & 1 deletion centrifuger-download
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,13 @@ cut_after_first_space_or_second_pipe() {
}
export -f cut_after_first_space_or_second_pipe

# Filter stdin down to FASTA header lines and truncate each at its first space.
#   stdin : text, one line per record line
#   stdout: only the lines that start with '>', with everything from the first
#           space to end-of-line removed. The leading '>' is kept, and any '|'
#           characters before the first space are left untouched.
cut_after_first_space() {
grep '^>' | sed 's/ .*//'
}
# Export the function so child bash processes can call it by name.
# NOTE(review): presumably needed for xargs/parallel-style subshells elsewhere
# in this script -- confirm against the callers.
export -f cut_after_first_space

# Emit one "<sequence-id> <taxid>" line per FASTA header read from stdin.
#   $1    : value appended after each sequence ID (the taxid); it is spliced
#           into a sed replacement, so it must not contain '/', '&' or '\'.
#   stdin : FASTA text
#   stdout: for every line starting with '>', the header truncated at its first
#           space, with the leading '>' stripped, followed by a space and $1.
# Only the cut_after_first_space pipeline is kept. The earlier
# cut_after_first_space_or_second_pipe pipeline must not run alongside it:
# it would consume all of stdin first and still print pipe-truncated IDs,
# leaving this pipeline with no input.
map_headers_to_taxid() {
    grep '^>' | cut_after_first_space | sed -e "s/^>//" -e "s/\$/ $1/"
}
# Export the function so child bash processes can call it by name.
export -f map_headers_to_taxid

Expand Down
3 changes: 2 additions & 1 deletion indices/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,8 @@ endif
$(REFERENCE_SEQUENCES_DIR)/contaminants.fna: | $(REFERENCE_SEQUENCES_DIR)
[[ -d $(TMP_DIR) ]] && rm -rf $(TMP_DIR); mkdir -p $(TMP_DIR)
centrifuger-download -o $(TMP_DIR) contaminants > $(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@))
find $(TMP_DIR)/contaminants -name "*.fna.gz" | xargs zcat > $@.tmp && mv $@.tmp $@
#find $(TMP_DIR)/contaminants -name "*.fna.gz" | xargs zcat > $@.tmp && mv $@.tmp $@
find $(TMP_DIR)/contaminants -name "*.fna" | xargs cat > $@.tmp && mv $@.tmp $@
mv $(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@)) $(patsubst %.fna,%$(TAXID_SUFFIX),$@)
ifeq (1,$(KEEP_FILES))
[[ -d $(DL_DIR)/contaminants ]] || mkdir -p $(DL_DIR)/contaminants
Expand Down

0 comments on commit 2832c62

Please sign in to comment.