forked from labroo2/rtassel_supp
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbarcode_faker_parallel.R
116 lines (92 loc) · 4.81 KB
/
barcode_faker_parallel.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# This function allows parallelization of barcode_faker.R via the R package doSNOW
# The feature was implemented by Luis Diaz ([email protected])
# Remember to use install.packages() to install doSNOW. Set the number of cores with the n.cores option.
# Please see barcode_faker.R for other documentation.
barcode_faker <- function(fastq_dir, read_length = 100L,n.cores=25){
print("Always run barcode_faker on a folder containing all files you intend to use in a given GBSv2 run. Multiple separate runs
of barcode_faker can cause barcodes to be repeated across sample files, though unlikely, and will certainly cause
file names to be repeated.")
#set read length to integer
read_length <- as.integer(read_length)
#get the fastq file paths
fastqs <- list.files(path = fastq_dir, full.names = TRUE)
#check that all files in dir are fastqs
if(length(grep(".fastq", fastqs)) != length(fastqs) | length(grep(".fq", fastqs)) != length(fastqs) &&
length(grep(".fastq", fastqs)) != 0 && length(grep(".fq", fastqs)) != 0){
stop("Not all of the files in your input folder seem to have .fq or .fastq extensions. Please move or delete such files.")
}
#loop through the files to get a vector of trimmed read lengths
sequence_lengths <- c()
for(i in 1:length(fastqs)){
incon <- file(fastqs[i], open = "r")
sequence_lengths[i] <- nchar(readLines(incon, n = 4L, warn = FALSE)[2])
close(incon, warn = FALSE)
}
#this function recursively makes a vector of unique fake barcodes to ensure no non-unique barcodes are generated
#this step does not protect against enzyme cut site being found in the barcode
barcode_inventor <- function(fake_barcodes, read_length, sequence_lengths){
fake_barcodes <- c()
for(i in 1:length(sequence_lengths)){
fake_barcodes[i] <- paste(sample(c("A", "T" ,"C" ,"G"), size = read_length - sequence_lengths[i],
replace = TRUE), collapse = "")
}
if(length(fake_barcodes) == length(unique(fake_barcodes))){
return(fake_barcodes)
} else {
barcode_inventor
}
}
fake_barcodes <- barcode_inventor(fake_barcodes, read_length, sequence_lengths)
#make the fake perfect quality scores matching the barcode length
fake_quals <- c()
for(i in 1:length(sequence_lengths)){
fake_quals[i] <- paste(rep("E", times = read_length - sequence_lengths[i]), collapse = "")
}
#make fake header files for FASTQ reads
#headers are totally fake (do not refer to input file)
#fake headers allows interoperability between TASSEL and demultiplexing software FASTQ formats
fake_headers <- c()
fake_start <- "D00553R:56:"
fake_flowcell_base <- "C8B56ANXX"
fake_flowcell_no <- seq(from = 1, to = length(fastqs), by = 1)
fake_ends <- ":3:1101:1203:2037 1:N:0:3"
fake_headers <- paste(fake_start, fake_flowcell_base, fake_flowcell_no, fake_ends, sep = "")
fake_flowcells <- paste(fake_flowcell_base, fake_flowcell_no, sep = "")
fake_lanes <- rep(3, times = length(fastqs))
#make fake file names that appropriately reference fake flowcell and lane
newfile_names <- paste(fake_flowcells, "_", "3","_","fastq.fastq", sep = "")
#stop if files with the fake file names already exist
workdir <- getwd()
dirfil <- list.files(path = workdir, full.names = FALSE)
if(sum(newfile_names %in% dirfil) != 0){
stop("Files in your working directory seem to have same names as those
output by this function. Please move them, delete them, or choose a new working directory.")
}
#write a key of barcode, flowcell, lane and new file names
key <- cbind(fastqs, newfile_names, fake_flowcells, fake_lanes, fake_barcodes)
write.csv(key, "fakebarcodes_key.csv", row.names = FALSE)
#read 400 lines per file at a time, paste on the barcodes and quality scores, make unique flowcells,
#and write them out with unique names
myfunc<-function(ffastqs,nnewfile_names,ffake_headers,ffake_barcodes,ffake_quals){
incon <- file(ffastqs, open = "r")
outcon <- file(nnewfile_names, open = "w")
while(length(mylines <- readLines(incon, n = 400, warn = FALSE))){
head_pos <- seq(1, length(mylines), by = 4)
seq_pos <- seq(2, length(mylines), by = 4)
qual_pos <- seq(4, length(mylines), by = 4)
for(j in 1:length(mylines)){
mylines[head_pos[j]] <- ffake_headers
mylines[seq_pos[j]] <- paste(ffake_barcodes, mylines[seq_pos[j]], sep = "")
mylines[qual_pos[j]] <- paste(ffake_quals, mylines[qual_pos[j]], sep = "")
}
writeLines(mylines, outcon)
}
#close the connections
close(outcon, warn = FALSE)
close(incon, warn = FALSE)
}
cl <- snow::makeCluster(n.cores)
registerDoSNOW(cl)
foreach(k= 1:length(fastqs)) %dopar% myfunc(fastqs[k],newfile_names[k],fake_headers[k],fake_barcodes[k],fake_quals[k])
snow::stopCluster(cl)
}