-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcount_barcodes_10X.py
154 lines (142 loc) · 6.39 KB
/
count_barcodes_10X.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#counts TF barcodes from fastq for TF screening with 10X scRNA-seq readout
from Bio import SeqIO
import csv
from collections import OrderedDict
import numpy as np
import sys
import argparse
from Bio.SeqIO.QualityIO import FastqGeneralIterator
from Bio import pairwise2
KEY_REGION_START = 25 #start index of key region
KEY_REGION_END = 50 #end index of key region
BARCODE_LENGTH = 24
MAX_BARCODE_DIFF = 1 #allows 1 mismatch for barcode mapping
KEY = "GAAAGGACGA" #identifies sequence before barcode to determine barcode position
def align_barcodes(barcode1, barcode2, max_barcode_diff):
diff = 0
if len(barcode1) != len(barcode2):
return False
for i, base in enumerate(barcode1):
if barcode1[i] != barcode2[i]:
diff += 1
if diff > max_barcode_diff:
return False
return True
def count_barcodes(tf_input_file, cell_input_file, r1_fastq_file, r2_fastq_file, output_prefix):
"""
reads TF barcodes from tf_input_file, reads cell barcodes from cell_input_file, creates a dictionary with cell barcodes and
associated TF counts from r1_fastq_file and r2_fastq_file, writes results to files with output_prefix
tf_input_file: csv file of all possible TF barcodes with 1 barcode per line
cell_input_file: csv file of all possible cell barcodes with 1 barcode per line
r1_fastq_file: read 1 fastq file
r2_fastq_file: read 2 fastq file
output_prefix: prefix of files to write barcode count results to
"""
num_reads = 0 #total number of reads processed
perfect_matches = 0 # barcodes with perfect match to library
non_perfect_matches = 0 #number of barcodes without a perfect match to the library
key_not_found = 0 #count of reads where key was not found
pair_mismatch = 0
try:
# open TF barcodes and initiate dictionary of read counts for each barcode
with open(tf_input_file, 'r') as infile:
reader = csv.reader(infile)
dictionary = {rows[0]:0 for rows in reader}
# open cell barcodes and initiate dictionary of read counts for each barcode
with open(cell_input_file, 'r') as infile:
reader = csv.reader(infile)
cell_dict = {rows[0]:{} for rows in reader}
except:
print('could not open input file')
f_iter = FastqGeneralIterator(open(r1_fastq_file,"r"))
r_iter = FastqGeneralIterator(open(r2_fastq_file,"r"))
for (f_id, f_seq, f_q), (r_id, r_seq, r_q) in zip(f_iter,r_iter):
try:
assert f_id.split()[0] == r_id.split()[0]
except:
print(f_id)
print(r_id)
pair_mismatch += 1
break
num_reads += 1
cell_barcode = str.upper(str(f_seq))
if "N" in cell_barcode:
continue
read_sequence = str.upper(str(r_seq))
key_region = read_sequence[KEY_REGION_START:KEY_REGION_END]
key_index = key_region.find(KEY)
if key_index >= 0:
start_index = key_index + KEY_REGION_START + len(KEY)
tf_barcode = read_sequence[start_index:(start_index + BARCODE_LENGTH)]
barcode_found = False
for b in dictionary.keys():
if align_barcodes(b, tf_barcode, MAX_BARCODE_DIFF):
dictionary[b] += 1
perfect_matches += 1
barcode_found = True
if cell_barcode in cell_dict.keys():
if b in cell_dict[cell_barcode].keys():
cell_dict[cell_barcode][b] += 1
else:
cell_dict[cell_barcode][b] = 1
if not barcode_found:
non_perfect_matches += 1
else:
key_not_found += 1
# create ordered dictionary with barcodes and respective counts and output as a csv file
dict_sorted = OrderedDict(sorted(dictionary.items(), key=lambda t: t[0]))
with open(output_prefix + '_barcodes.csv', 'w') as csvfile:
mywriter = csv.writer(csvfile, delimiter=',')
for barcode in dict_sorted:
count = dict_sorted[barcode]
mywriter.writerow([barcode,count])
with open(output_prefix + '_cells.csv', 'w') as csvfile:
mywriter = csv.writer(csvfile, delimiter=',')
for barcode in cell_dict.keys():
tfs = cell_dict[barcode]
tfs_keys = sorted(tfs, key=tfs.__getitem__, reverse=True)
mywriter.writerow([barcode] + [(x + ':' + str(tfs[x])) for x in tfs_keys])
# percentage of barcodes that matched perfectly
if perfect_matches > 0:
percent_matched = round(perfect_matches/float(perfect_matches + non_perfect_matches) * 100, 1)
else:
percent_matched = 0
# percentage of undetected barcodes with no read counts
all_barcode_counts = list(dictionary.values())
barcodes_with_reads = np.count_nonzero(all_barcode_counts)
barcodes_no_reads = len(all_barcode_counts) - barcodes_with_reads
percent_no_reads = round(barcodes_no_reads/float(len(all_barcode_counts)) * 100, 1)
# skew ratio of top 10% to bottom 10% of barcode counts
top_10 = np.percentile(all_barcode_counts, 90)
bottom_10 = np.percentile(all_barcode_counts, 10)
if top_10 != 0 and bottom_10 != 0:
skew_ratio = top_10/bottom_10
else:
skew_ratio = 'Not enough perfect matches to determine skew ratio'
# write analysis statistics to stats.txt
with open(output_prefix+'_stats.txt', 'w') as infile:
infile.write('Number of perfect barcode matches: ' + str(perfect_matches) + '\n')
infile.write('Number of nonperfect barcode matches: ' + str(non_perfect_matches) + '\n')
infile.write('Number of reads where key was not found: ' + str(key_not_found) + '\n')
infile.write('Number of reads processed: ' + str(num_reads) + '\n')
infile.write('Percentage of barcodes that matched perfectly: ' + str(percent_matched) + '\n')
infile.write('Percentage of undetected barcodes: ' + str(percent_no_reads) + '\n')
infile.write('Skew ratio of top 10% to bottom 10%: ' + str(skew_ratio) + '\n')
infile.write('Pairend mismatches: ' + str(pair_mismatch))
infile.close()
return
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Analyze sequencing data for sgRNA library distribution')
parser.add_argument('-f', '--fastq1', type=str, dest='r1_fastq_file',
help='read 1 fastq file name', default='r1_NGS.fastq')
parser.add_argument('-r', '--fastq2', type=str, dest='r2_fastq_file',
help='read 2 fastq file name', default='r2_NGS.fastq')
parser.add_argument('-o', '--output-prefix', type=str, dest='output_prefix',
help='output file name prefix', default='counts')
parser.add_argument('-i', '--input', type=str, dest='tf_input_file',
help='input file name', default='library_sequences.csv')
parser.add_argument('-ci', '--cell-input', type=str, dest='cell_input_file',
help='cell input file name', default='cell_barcodes.csv')
args = parser.parse_args()
count_barcodes(args.tf_input_file, args.cell_input_file, args.r1_fastq_file, args.r2_fastq_file, args.output_prefix)