-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathextract_qiimetax.R
114 lines (104 loc) · 3.08 KB
/
extract_qiimetax.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env R
#' @title Extract taxonomy from FASTA file and output QIIME formatted taxonomy. Made specifically for SILVA, download from https://www.arb-silva.de/no_cache/download/archive/release_138.1/Exports/.
#' @description extracts taxonomy from fasta sequence headers and outputs a QIIME formatted taxonomy table as well as the sequences without the taxonomy string. Eukaryotes are filtered for now. To keep Eukaryotes everything in between domain+phylum should be removed and only 7 levels kingdom/domain -> species should remain.
#'
#' @param input_seqs (Required) Path to input fasta file
#' @param output_seqs Path to output fasta file
#' @param output_tax Path to output taxonomy TSV table
#'
#' @return A list
#' @import Biostrings data.table stringi
#' @author Kasper Skytte Andersen \email{ksa@@bio.aau.dk}
extract_qiimetax <- function(
input_seqs,
output_seqs = paste0(
tools::file_path_sans_ext(input_seqs),
"_noeuk.",
tools::file_ext(input_seqs)
),
output_tax = paste0(
tools::file_path_sans_ext(input_seqs),
"_noeuk_qiimetax.tsv"
)
) {
req_pkgs <- c("Biostrings", "data.table", "stringi")
pkg_status <- lapply(req_pkgs, require, character.only = TRUE)
if (!all(unlist(pkg_status))) {
stop(
"The following packages are required, please install manually:\n",
paste(req_pkgs, collapse = "\n"), call. = FALSE
)
}
#read fasta file
seqs <- readBStringSet(input_seqs)
#extract taxonomy
tax <- trimws(
stri_extract_all_regex(
names(seqs),
"[\t ]+.*$"
)
)
#filter Eukaryotes
non_euk_ids <- stri_detect_regex(
tolower(tax),
"^eukar",
negate = TRUE
)
seqs <- seqs[non_euk_ids]
tax <- tax[non_euk_ids]
#extract sequence IDs' from headers and rename input seqs
seq_ids <- stri_replace_all_regex(
names(seqs),
"[\t ].*$",
""
)
names(seqs) <- seq_ids
#generate taxonomy table
taxonomy <- data.table(
seq_ids,
tax
)
tax_levels <- c(
"Kingdom",
"Phylum",
"Class",
"Order",
"Family",
"Genus",
"Species"
)
#split up taxonomy column into separate columns with tax levels
taxonomy[, c(tax_levels) := tstrsplit(tax, ";", fixed = FALSE)] # nolint
#trim white spaces and add tax level prefix
taxonomy[, Kingdom := trimws(paste0("k__", Kingdom))] # nolint
taxonomy[, Phylum := trimws(paste0("p__", Phylum))] # nolint
taxonomy[, Class := trimws(paste0("c__", Class))] # nolint
taxonomy[, Order := trimws(paste0("o__", Order))] # nolint
taxonomy[, Family := trimws(paste0("f__", Family))] # nolint
taxonomy[, Genus := trimws(paste0("g__", Genus))] # nolint
taxonomy[, Species := trimws(paste0("s__", Species))] # nolint
#combine columns again with "; " separator
qiime_tax <- taxonomy[
,
.(seq_ids, tax = do.call(paste, c(.SD, sep = "; "))),
.SDcols = tax_levels
]
#write out files
Biostrings::writeXStringSet(
seqs,
output_seqs
)
fwrite(
qiime_tax,
output_tax,
col.names = FALSE,
row.names = FALSE,
sep = "\t"
)
invisible(
list(
seqs = seqs,
qiime_tax = qiime_tax
)
)
}