# Demultiplexor
# James M. Heather, August 2016, UCL
# https://innate2adaptive.github.io/Decombinator/
##################
### BACKGROUND ###
##################
# Takes all three reads and simultaneously demultiplexes and formats read 1 for vDCR.py analysis
# i.e. adds the first 30 nucleotides of R2 to R1
# This will contain the random barcode followed by any V(D)J information downstream, allowing collapsing
# Demultiplexes using a combination of indexes from third read and another index nested in read 1
# A derivative of DualIndexDemultiplexing.py/FuzzyDID; fuzzily demultiplexes FASTQ data produced with the ligation TCR sequencing protocol
# i.e. allows a specified number of mismatches in the index sequence
##################
###### INPUT #####
##################
# Requires the command line input of at least 3 file names, giving the three Illumina reads
# Files may be uncompressed, or gzipped (and be named accordingly, e.g. File.fastq.gz)
# A fourth, optional, comma-delimited file detailing sample index specifics is strongly recommended, as it allows production of correctly named output files
# File must give the following details, one sample (or index combination) per line, with no empty lines:
# Sample name, SP1/R1 index (I), SP2/R2 index (L):
# e.g.: AlphaSample,1,11
# If you include one and only one chain description (i.e. alpha, beta, gamma or delta) in your sample name, you need not set the chain in Decombinator
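# As a sketch, a small hypothetical index file (sample names invented for illustration) might read:
#   AlphaSample,1,11
#   BetaSample,2,12
#   GammaSample,3,13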
# e.g. run: python Demultiplexor.py -r1 read1.fastq -r2 read2.fastq -i1 indexread1.fastq -ix indexes.ndx
# Other optional flags:
# -s/--suppresssummary: Suppress the production of a summary file containing details of the run into a 'Logs' directory.
# -a/--outputall: Output the results of all possible index combinations currently used in protocol
# e.g. Useful in finding potential cross-contaminating or incorrectly indexed samples
# NB - This option can be run even if an index list is provided (although only those provided by the index list will be named)
# -t/--threshold: Specifies the threshold by which indexes can be clustered by fuzzy string matching, allowing for sequencing errors
# Default = 2. Setting it to zero turns off fuzzy matching, i.e. only exact string matches are allowed (see the sketch after this option list)
# -dz/--dontgzip: Suppress the automatic compression of output demultiplexed FASTQ files with gzip.
# Using this flag makes the script execute faster, but data will require more storage space.
# -dc/--dontcount: Suppress the running read count that is otherwise printed every 100,000 reads.
# The count helps in monitoring the progress of large batches.
# -fl/--fuzzylist: Output a list of FASTQ IDs of reads which are demultiplexed using fuzzy (i.e. non-exact) index matching, within the specified threshold.
# Default = False, but can be useful to investigate suspected cases of poor quality index reads or clashing sequences.
# -ex/--extension: Allows users to specify the file extension of the demultiplexed FASTQ files produced.
# To see all options, run: python Demultiplexor.py -h
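# A minimal sketch of the fuzzy threshold in action; the first compound index below is a real
# SP1+SP2 pair from the dictionaries further down, its two comparators are invented:
#   import Levenshtein as lev
#   lev.distance("ATCACGCGTGAT", "ATCACGCGTGAA")   # == 1 -> within default threshold, fuzzy match
#   lev.distance("ATCACGCGTGAT", "ATTAAGCGTGAA")   # == 3 -> beyond threshold, read goes to Undetermined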
##################
##### OUTPUT #####
##################
# A fastq file will be produced for each sample listed in the index file, in the modified format, containing all reads that matched
# So we go from: R1  - [6s|X1|----J(D)V-----]
#                R2  - [X2]
#                R3  - [8s|N1|8s|N2|2s|-----5'UTR-----]
# To: ========> out  - [8s|N1|8s|N2|2s|X1|X2|----J(D)V-----]
# Where X = hexamer index sequence, N = random barcode sequence, and Ys = a spacer sequence of Y nucleotides
# The 8s sequences can be used downstream to identify the presence and location of random barcode N sequences
# 2s is also kept to allow for the possibility of finding N sequences produced from slipped reads
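# A minimal worked example of the rearrangement (read bases invented, spacers shown as lower-case s;
# the index hexamers are real entries from the dictionaries below):
#   R1 = "ssssssATCACGTTTGGGAAA..."          -> X1 = R1[6:12] ("ATCACG"), read = R1[12:]
#   R2 = "GTAGCC"                            -> X2 = the whole 6 nt index read
#   R3 = "ssssssssNNNNNNssssssssNNNNNNss..." -> first 30 nt carried over verbatim
#   out = R3[0:30] + X1 + X2 + R1[12:]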
##################
#### PACKAGES ####
##################
from __future__ import division
from itertools import izip
import time
import sys
import argparse
import gzip
import os
import Levenshtein as lev
import collections as coll
__version__ = '2.1'
##########################################################
############# READ IN COMMAND LINE ARGUMENTS #############
##########################################################
def args():
"""args(): Obtains command line arguments which dictate the script's behaviour"""
# Help flag
parser = argparse.ArgumentParser(
description='Script to demultiplex FASTQ data produced using the Chain labs\' ligation TCRseq protocol. Please see https://innate2adaptive.github.io/Decombinator/ for details.')
# Add arguments
parser.add_argument(
'-r1', '--read1', type=str, help='Read 1 FASTQ file', required=True)
parser.add_argument(
'-r2', '--read2', type=str, help='Read 2 FASTQ file', required=True)
parser.add_argument(
'-i1', '--index1', type=str, help='Index read FASTQ file', required=True)
parser.add_argument(
'-ix', '--indexlist', type=str, help='File containing sample/index table', required=False)
parser.add_argument(
'-t', '--threshold', type=int, help='Edit distance allowed for fuzzy-string index matching (default=2)', required=False, default=2)
    parser.add_argument(
        '-s', '--suppresssummary', action='store_true', help='Suppress the production of the summary log file', required=False)
parser.add_argument(
'-a', '--outputall', action='store_true', help='Output all possible index combinations', required=False)
    parser.add_argument(
        '-dz', '--dontgzip', action='store_true', help='Suppress the automatic gzip compression of output demultiplexed FASTQ files', required=False)
    parser.add_argument(
        '-dc', '--dontcount', action='store_true', help='Suppress the running read count printed every 100,000 reads', required=False)
    parser.add_argument(
        '-fl', '--fuzzylist', action='store_true', help='Output a list of those reads that demultiplexed using fuzzy index matching', required=False)
parser.add_argument(
'-ex', '--extension', type=str, help='Specify the file extension of the output FASTQ files. Default = \"fq\"', required=False, default="fq")
return parser.parse_args()
############################################
############# FASTQ PROCESSING #############
############################################
def fastq_check(infile):
"""fastq_check(file): Performs a rudimentary sanity check to see whether a file is indeed a FASTQ file"""
success = True
if infile.endswith('.gz'):
with gzip.open(infile) as possfq:
read = [next(possfq) for x in range(4)]
else:
with open(infile) as possfq:
read = [next(possfq) for x in range(4)]
    # @ check
    if read[0][0] != "@":
        success = False
    # Descriptor check
    if read[2][0] != "+":
        success = False
    # Read/quality match check
    if len(read[1]) != len(read[3]):
        success = False
    return success
def readfq(fp):
"""readfq(file):Heng Li's Python implementation of his readfq function
See https://github.com/lh3/readfq/blob/master/readfq.py"""
last = None # this is a buffer keeping the last unprocessed line
while True: # mimic closure; is it a bad idea?
if not last: # the first record or a record following a fastq
for l in fp: # search for the start of the next record
if l[0] in '>@': # fasta/q header line
last = l[:-1] # save this line
break
if not last: break
name, seqs, last = last[1:].partition(" ")[0], [], None
for l in fp: # read the sequence
if l[0] in '@+>':
last = l[:-1]
break
seqs.append(l[:-1])
if not last or last[0] != '+': # this is a fasta record
yield name, ''.join(seqs), None # yield a fasta record
if not last: break
else: # this is a fastq record
seq, leng, seqs = ''.join(seqs), 0, []
for l in fp: # read the quality
seqs.append(l[:-1])
leng += len(l) - 1
if leng >= len(seq): # have read enough quality
last = None
yield name, seq, ''.join(seqs); # yield a fastq record
break
if last: # reach EOF before reading enough quality
yield name, seq, None # yield a fasta record instead
break
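# Example usage of readfq (file name hypothetical): iterate records as (ID, sequence, quality) tuples
#   for name, seq, qual in readfq(open("example.fq")):
#       print name, len(seq)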
def sort_permissions(fl):
# Need to ensure proper file permissions on output data
# If users are running pipeline through Docker might otherwise require root access
if oct(os.stat(fl).st_mode)[4:] != '666':
os.chmod(str(fl), 0o666)
inputargs = vars(args())
if inputargs['outputall'] == False and not inputargs['indexlist']:
print "No indexing file provided, and output all option not enabled; one (or both) is required."
sys.exit()
for f in [inputargs['read1'], inputargs['read2'], inputargs['index1']]:
if fastq_check(f) == False:
print "FASTQ sanity check failed reading", f, "- please ensure that this file is properly formatted and/or gzip compressed."
sys.exit()
##########################################################
############ CREATE DICTIONARIES FOR INDEXES #############
##########################################################
# SP1 index = R1 (our own, RC1 proximal index)
X1dict = {"1":"ATCACG", "2":"CGATGT", "3":"TTAGGC", "4":"TGACCA", "5":"ACAGTG", "6":"GCCAAT", "11":"GGCTAC", "12":"CTTGTA", "13":"TAGACT"}
# NB: One index removed due to similarity to others, but exists in earlier datasets: "14":"ACACGG"
# 'SP2' index = R2 (index read, comes first in rearranged sequence)
X2dict = {"1":"CGTGAT", "2":"ACATCG", "3":"GCCTAA", "4":"TGGTCA", "5":"CACTGT", "6":"ATTGGC", "7":"GATCTG", "8":"TCAAGT", "9":"CTGATC", "10":"AAGCTA", "11":"GTAGCC", "12":"TACAAG", "13":"TTGACT", "14":"GGAACT"}
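# e.g. an index-file line "AlphaSample,1,11" (sample name hypothetical) resolves to the compound index
#   X1dict["1"] + X2dict["11"]  ==  "ATCACG" + "GTAGCC"  ==  "ATCACGGTAGCC"
# which becomes that sample's key in XXdict below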
##########################################################
########### GENERATE SAMPLE-NAMED OUTPUT FILES ###########
##########################################################
suffix = "." + inputargs['extension']
failed = open("Undetermined" + suffix, "w")
outputreads = coll.Counter()
outputreads["Undetermined"] = 0
usedindexes = coll.defaultdict(list) # This keeps a track of all files that have been generated to house demultiplexed reads
XXdict = {}
# If given an indexlist, use that to generate named output files
if inputargs['indexlist']:
indexes = list(open(inputargs['indexlist'], "rU"))
for x in indexes:
if x == "\n":
print "Empty line detected in index file, presumed end of file."
break
elements = x.strip("\n").split(",")
sample = elements[0]
open(sample + suffix, "w").close()
compound_index = X1dict[elements[1]] + X2dict[elements[2]]
XXdict[compound_index] = open(sample + suffix, "a")
outputreads[sample] = 0
usedindexes[sample] = compound_index
# If the outputall option is chosen, output all possible index combinations that exist in the data
# Note that if an indexlist is provided, those names are still used in the appropriate output files
# Also note that while all combinations are looked for, those which remain unused at the end will be deleted
if inputargs['outputall'] == True:
# generate all possible index combinations, and then check if they have been generated yet (via an index file)
# only make those that haven't
allXcombs = [x + "-" + y for x in X1dict.keys() for y in X2dict.keys()]
for x in allXcombs:
compound_index = X1dict[x.split("-")[0]] + X2dict[x.split("-")[1]]
if compound_index not in usedindexes.values():
XXdict[compound_index] = open("Indexes_" + x + suffix, "a")
outputreads["Indexes_" + x] = 0
usedindexes["Indexes_" + x] = compound_index
count = 0
dmpd_count = 0 # number successfully demultiplexed
fuzzy_count = 0 # number of sequences that were demultiplexed using non-exact index matches
clash_count = 0 # number of fuzzy ID clashes
fuzzies = [] # list to record IDs matched using fuzzy indexes
t0 = time.time() # Begin timer
##########################################################
########### LOOP THROUGH ALL READ FILES IN SYNC ##########
######## PROCESS INTO CORRECT FORMAT & DEMULTIPLEX #######
##########################################################
print "Reading input files..."
# Open read files
if inputargs['read1'].endswith('.gz'):
fq1 = readfq(gzip.open(inputargs['read1']))
else:
fq1 = readfq(open(inputargs['read1']))
if inputargs['index1'].endswith('.gz'):
fq2 = readfq(gzip.open(inputargs['index1']))
else:
fq2 = readfq(open(inputargs['index1']))
if inputargs['read2'].endswith('.gz'):
fq3 = readfq(gzip.open(inputargs['read2']))
else:
fq3 = readfq(open(inputargs['read2']))
print "Demultiplexing data..."
for record1, record2, record3 in izip(fq1, fq2, fq3):
    # The readfq function returns each read from each file as a 3-part tuple
    # ('ID', 'SEQUENCE', 'QUALITY')
count += 1
if count % 100000 == 0 and inputargs['dontcount'] == False:
print '\t read', count
### NB For non-standard Illumina encoded fastqs, might need to change which fields are carried into fq_* vars
fq_id = record1[0]
# N relates to barcode random nucleotides, X denotes index bases
### FORMATTING OUTPUT READ ###
Nseq = record3[1][0:30]
X1seq = record1[1][6:12]
X2seq = record2[1]
readseq = record1[1][12:]
Nqual = record3[2][0:30]
X1qual = record1[2][6:12]
X2qual = record2[2]
readqual = record1[2][12:]
fq_seq = Nseq + X1seq + X2seq + readseq
fq_qual = Nqual + X1qual + X2qual + readqual
new_record = str("@" + fq_id + "\n" + fq_seq + "\n+\n" + fq_qual + "\n")
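    # The assembled read thus has a fixed layout (0-based coordinates, assuming a 6 nt index read):
    #   fq_seq[0:30]  = barcode region from R3 (spacers plus random N hexamers)
    #   fq_seq[30:36] = X1 (SP1 hexamer index)
    #   fq_seq[36:42] = X2 (SP2 hexamer index)
    #   fq_seq[42:]   = the J(D)V-containing read itself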
### DEMULTIPLEXING ###
seqX = X1seq + X2seq
if seqX in XXdict:
# Exact index matches
XXdict[seqX].write(new_record)
dmpd_count += 1
        outputreads[XXdict[seqX].name[:-len(suffix)]] += 1
else:
# Otherwise allow fuzzy matching
matches = []
for ndx in XXdict.keys():
if lev.distance(ndx, seqX) <= inputargs['threshold']:
matches.append(ndx)
if len(matches) == 1:
# Only allow fuzzy match if there is one candidate match within threshold
XXdict[matches[0]].write(new_record)
dmpd_count += 1
fuzzy_count += 1
fuzzies.append(fq_id)
            outputreads[XXdict[matches[0]].name[:-len(suffix)]] += 1
else:
if len(matches) > 1:
clash_count += 1
failed.write(new_record)
outputreads['Undetermined'] += 1
for x in XXdict.values():
x.close()
sort_permissions(x.name)
failed.close()
sort_permissions(failed.name)
fq1.close()
fq2.close()
fq3.close()
# If output all is allowed, delete all unused index combinations
if inputargs['outputall'] == True:
for f in outputreads.keys():
if outputreads[f] == 0:
os.remove(f + suffix)
del outputreads[f]
del usedindexes[f]
# Gzip compress output
if inputargs['dontgzip'] == False:
print "Compressing demultiplexed files..."
for f in outputreads.keys():
with open(f + suffix) as infile, gzip.open(f + suffix + '.gz', 'wb') as outfile:
outfile.writelines(infile)
sort_permissions(outfile.name)
os.unlink(f + suffix)
#################################################
################## STATISTICS ###################
#################################################
timed = time.time() - t0
took = round(timed,2)
#print count, 'reads processed from', rd1file, 'and', fq2file, 'and output into', outfq #FIX
if took < 60:
print '\t\t\t\t\t\t\tTook', took, 'seconds to demultiplex samples'
else:
print '\t\t\t\t\t\t\tTook', round((timed/60),2), 'minutes to jimmy indexes and hexamers around'
print count, "reads processed"
print dmpd_count, "reads demultiplexed"
print fuzzy_count, "reads demultiplexed using fuzzy index matching"
if clash_count > 0:
print clash_count, "reads had fuzzy index clashes (i.e. could have assigned to >1 index) and were discarded"
# Write data to summary file
if inputargs['suppresssummary'] == False:
# Check for directory and make summary file
if not os.path.exists('Logs'):
os.makedirs('Logs')
date = time.strftime("%Y_%m_%d")
# Check for existing date-stamped file
summaryname = "Logs/" + date + "_Demultiplexing_Summary.csv"
if not os.path.exists(summaryname):
summaryfile = open(summaryname, "w")
else:
        # If one exists, append an incrementing counter to the filename
for i in range(2,10000):
summaryname = "Logs/" + date + "_Demultiplexing_Summary_" + str(i) + ".csv"
if not os.path.exists(summaryname):
summaryfile = open(summaryname, "w")
break
# Generate string to write to summary file
summstr = "Property,Value\nDirectory," + os.getcwd() + "\nDateFinished," + date + "\nTimeFinished," + time.strftime("%H:%M:%S") + "\nTimeTaken(Seconds)," + str(round(timed,2)) + "\n"
for s in ['read1', 'read2', 'index1', 'indexlist', 'extension', 'threshold', 'outputall', 'dontgzip', 'fuzzylist']:
summstr = summstr + s + "," + str(inputargs[s]) + "\n"
summstr = summstr + "NumberReadsInput," + str(count) + "\nNumberReadsDemultiplexed," + str(dmpd_count) + "\nNumberFuzzyDemultiplexed," + str(fuzzy_count) + "\nNumberIndexClash," + str(clash_count) + "\n\nOutputFile,IndexUsed\n"
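    # Sketch of the resulting CSV's opening block (values illustrative):
    #   Property,Value
    #   Directory,/path/to/run
    #   DateFinished,2016_08_27
    #   TimeFinished,12:00:00
    #   TimeTaken(Seconds),42.0
    # followed by one line per input argument and the per-file tables appended below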
# Write out number of reads in and details of each individual output file
for x in sorted(usedindexes.keys()):
summstr = summstr + x + "," + usedindexes[x] + "\n"
if inputargs['indexlist']:
summstr = summstr + "\nOutputFile,IndexNumbersUsed(SP1&SP2)\n"
        for x in indexes:
            splt = x.rstrip().split(",")
            # Guard against the empty/trailing lines tolerated when the index list was first read
            if len(splt) < 3:
                continue
            summstr = summstr + splt[0] + "," + splt[1] + " & " + splt[2] + "\n"
summstr = summstr + "\nOutputFile,NumberReads\n"
for x in sorted(outputreads.keys()):
summstr = summstr + x + "," + str(outputreads[x]) + "\n"
print >> summaryfile, summstr
summaryfile.close()
sort_permissions(summaryname)
# Write out list of fuzzy matched sequences, so can fish out later if needed
if inputargs['threshold'] > 0 and inputargs['fuzzylist'] == True:
print "\nOutputting list of reads demultiplexed using fuzzy index matching"
# Check for directory and make summary file
if not os.path.exists('Logs'):
os.makedirs('Logs')
date = time.strftime("%Y_%m_%d")
fuzzname = "Logs/" + date + "_FuzzyMatchedIDs.txt"
fuzzout = open(fuzzname, "w")
for f in fuzzies:
print >> fuzzout, f
fuzzout.close()
sort_permissions(fuzzname)