forked from dcouvin/CRISPRCasFinder
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCRISPRCasFinder.pl
5365 lines (4367 loc) · 177 KB
/
CRISPRCasFinder.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env perl
######################################################################################
# CRISPRCasFinder, an update of CRISRFinder, includes a portable version,
# enhanced performance and integrates search for cas genes.
#
# Copyright (C) 2017- CRISPR-Cas++ team
# (CNRS, Université Paris-Saclay I2BC, Institut Pasteur C3BI, Université de Lille)
#
# See the COPYRIGHT file for details.
#
# CRISPRCasFinder is distributed under the terms of the GNU General Public License
# (GPLv3). See the COPYING file for details.
######################################################################################
# CPAN modules
## insert perlib ##
use strict;
use Class::Struct; #charge le module qui construit les struct
use warnings;
use File::Copy;
use File::Basename;
use Data::Dumper;
use Cwd;
use Date::Calc qw(:all);
use Getopt::Long;
use Unix::Sysexits;
# Bio Perl modules
use Bio::AlignIO;
use Bio::SeqIO;
use Bio::DB::Fasta; #to extract sequence from fasta file
# TODO add URLescape module in case the GFF attributes get messy
#local modules
my $version = "4.2.20";
# set parameters for the Vmatch program
#$ENV{'LD_LIBRARY_PATH'} = '.';
## Parameters
my $SpSim = 60; # maximal allowed percentage of similarity between Spacers (default value=60)
my $Sp2 = 2.5; # maximal Spacers size in function of DR size (default value=2.5)
my $Sp1 = 0.6; # minimal Spacers size in function of DR size (default value=0.6)
my $mismOne = 1; # allow mismatchs (default value=1)
my $S2 = 60; # maximal size of Spacers (default value=60)
my $S1 = 25; # minimal size of Spacers (default value=25)
my $M2 = 55; # maximal size of DRs (default value=55)
my $M1 = 23; # minimal size of DRs (default value=23)
my $DRtrunMism = 33.3; # %mismatchs allowed for truncated DR (default value=33.3)
my $DRerrors = 20; # %mismatchs allowed between DRs (default value=20)
my $userfile = "";
my $launchCasFinder = 0; # boolean variable indicating if we use casfinder or not (default value=0)
my $casfinder = "CasFinder-2.0.3"; # repository containing new CasFinder (default 'CasFinder-2.0.3')
my $kingdom = "Bacteria"; # allow to choose analysis between Archaea and Bacteria (default 'Bacteria')
my $vicinity = 600; # number of nucleotides separating CRISPR array from next Cas system (default value=600)
my $writeFullReport = 0; # boolean variable indicating if we write crispr-cas_vicinity_report or not (default value=0)
my $so = "./sel392v2.so"; # path to shared object (.so) file (former name: $pathSoFile)
my $crisprdb = ""; # path to all CRISPR candidates contained in CRISPRdb (from last update)
my $repeats = ""; # path to file containing repeat sequences, IDs, and Nb in CRISPRdb (last update)
my $dirRepeat = ""; # path to file containing repeat IDs and Orientation according to CRISPRDirection
my $html = 0; # boolean variable indicating if we use html visualization or not (default value=0)
my $outputDirName = ""; # repository name containing results to be set by user (default: word 'Result_' followed by basename of input file)
my $flankingRegion = 100; # size of flanking regions in base pairs (bp) for each analyzed CRISPR array (default value=100)
my $cpuProkka = 1; # number of CPUs to use for Prokka (default value=1)
my $cpuMacSyFinder = 1; # number of CPUs to use for MacSyFinder (default value=1)
my $rcfowce = 0; # option allowing to run CasFinder only when any CRISPR exists (default value=0) (set if -cas is set)
my $metagenome = 0; # option allowing to analyze metagenome (default value=0)
my $logOption = 0; # option allowing to write LOG files (default value=0)
my $keep = 0; # option allowing to keep secondary folders/files (Prodigal/Prokka, CasFinder, rawFASTA, Properties); default value=0
my $definition = "SubTyping"; # option allowing to specify CasFinder definition (if option -cas is set) to be more or less stringent (default value="SubTyping"). Other options include "Typing" and "General".
my $userGFF = ""; # option allowing user to provide an annotation GFF file (if options -cas and -faa are set) (default value='')
my $userFAA = ""; # option allowing user to provide a proteome file '.faa' (if options -cas and -gff are set) (default value='')
my $clusteringThreshold = 0; # option allowing (if option -cas is set) to constitute clusters or groups of CRISPR or Cas systemes given a determined threshold e.g. 20kb (default value=0)
my $useProdigal = 1; # option allowing to use Prodigal as default option instead of Prokka (default value=1)
my $useProkka = 0; # option allowing to use Prokka instead of Prodigal (default value=0)
my $gscf = 0; # option allowing to get summary file of Cas-finder and copy it to TSV repository (default value=0)
my $cssFile = ""; # option allowing to copy CSS file (crispr.css) to get the same design as CRISPRdb when using option -HTML (default value='')
my $genCode = 11; # option allowing to modify the genetic code (translation table) for CDS annotation (default value=11)
my $levelMin = 1; # option allowing to choose the minimum evidence-level corresponding to CRISPR arrays we want to display (default value=1)
my $quiet = 0; # option allowing to run the program quieter (default value=0)
my $seqMinSize = 0; #NV option allowing to fix a sequence minimum size to search CRISPR and Cas systems (lighter process on big Data) (default value=0)
my $fast = 0; # option allowing to run the program faster (default value=0)
my $force = 0; # option allowing to force/foster detection of specific CRISPR arrays (default value=0)
my $fosteredDRLength = 30; #option allowing to foster a specific repeat's length when option '-force' is set (default value=30)
my $fosteredDRBegin = "G"; #option allowing to foster a specific repeat's beginning when option '-force' is set (default value='G'), regular expressions could be considered
my $fosteredDREnd = "AA."; #option allowing to foster a specific repeat's ending when option '-force' is set (default value='AA.'), regular expressions could be considered
my $repeatsQuery = ""; # option allowing to specify a query file containing repeats to be matched (default value='')
my $minNbSpacers = 1; # option allowing to specifiy the minimum number of spacers required for each array (default value=1)
my $betterDetectTruncatedDR = 0; # option allowing to better detect the truncated DR (default value=0)
my $percentageMismatchesHalfDR = 4; # option allowing to set the percentage of allowed mismatches in truncated DR (default value=4)
my $classifySmall = 0; # option allowing to change evidence level status of small arrays (evidence-level 1) having the same consensus repeat as an evidence-level 4 array (default value=0)
# Variables used to determine mode
my $print_version = 0; # Print version
my $print_help = 0; # Print help
##
## Manage arguments
# Note that standard overloading is possible i.e.
# using -cpuMacSysFinder after -fast will set the
# option to the desired value (it was the case with the
# previous implementation).
# Note that $DRtrunMism and $DRerrors are modified after.
GetOptions (
# General options
"help|h" => \$print_help,
"version|v" => \$print_version,
# Input/Output and -so
"in|i=s" => \$userfile,
"outdir|out=s" => \$outputDirName,
"keepAll|keep" => \$keep,
"LOG|log" => \$logOption,
"HTML|html" => \$html,
"copyCSS=s" => \$cssFile,
"soFile|so=s" => \$so,
"quiet|q" => \$quiet,
"faster|fast" => sub {
# Fast mode
$fast = 1;
# Use all CPU for Prokka and MacSyFinder
$cpuProkka = 0;
$cpuMacSyFinder = 0;
},
"minSeqSize|mSS=i" => \$seqMinSize,
# Options for detection of CRISPR arrays
"mismDRs|md=f" => \$DRerrors,
"truncDR|t=f" => \$DRtrunMism,
"minDR|mr=i" => \$M1,
"maxDR|xr=i" => \$M2,
"minSP|ms=i" => \$S1,
"maxSP|xs=i" => \$S2,
"noMism|n" => sub { $mismOne=0; },
"percSPmin|pm=f" => \$Sp1,
"percSPmax|px=f" => \$Sp2,
"spSim|s=f" => \$SpSim,
"DBcrispr|dbc=s" => \$crisprdb,
"repeats|rpts=s" => \$repeats,
"DIRrepeat|drpt=s" => \$dirRepeat,
"flank|fl=i" => \$flankingRegion,
"levelMin|lMin=i" => \$levelMin,
"classifySmallArrays|classifySmall|cSA" => \$classifySmall,
"forceDetection|force" => sub {
$force = 1;
$M1 = $fosteredDRLength;
},
"fosterDRLength|fDRL=i" => \$fosteredDRLength,
"fosterDRBegin|fDRB=s" => \$fosteredDRBegin,
"fosterDREnd|fDRE=s" => \$fosteredDREnd,
"MatchingRepeats|Mrpts=s" => \$repeatsQuery,
"minNbSpacers|mNS=i" => \$minNbSpacers,
"betterDetectTrunc|bDT" => \$betterDetectTruncatedDR,
"PercMismTrunc|PMT=f" => \$percentageMismatchesHalfDR,
# Options for detection of Cas clusters
"cas|cs" => \$launchCasFinder,
"ccvRep|ccvr" => \$writeFullReport,
"vicinity|vi=i" => \$vicinity,
"CASFinder|cf|CasFinder=s" => \$casfinder,
"cpuMacSyFinder|cpuM=i" => \$cpuMacSyFinder,
"rcfowce" => \$rcfowce,
"definition|def=s" => \$definition, # Values: "SubTyping", "Typing", "General"
"gffAnnot|gff=s" => \$userGFF,
"proteome|faa=s" => \$userFAA, # Need -cas and -gff
"cluster|ccc=i" => \$clusteringThreshold, # Need -cas
"getSummaryCasfinder|gscf" => \$gscf,
"geneticCode|gcode=i" => \$genCode,
"metagenome|meta" => \$metagenome,
# Options to use Prokka
"useProkka|prokka" => sub { $useProkka = 1; $useProdigal = 0; },
"cpuProkka|cpuP=i" => \$cpuProkka,
"ArchaCas|ac" => sub { $launchCasFinder=1; $kingdom = "Archaea"; },
) or do {
print STDERR "Error in command line arguments\n";
printhelpbasic($0);
exit EX_USAGE;
};
# If a positional argument remains, it's the input file
my $n_rem_arguments = @ARGV;
if ($n_rem_arguments) {
$userfile = $ARGV[0];
}
# Print help and exit
if ($print_help)
{
printhelpall($0);
exit EX_OK;
}
# Print version and exit
if ($print_version)
{
printversion($0);
exit EX_OK;
}
# Basic checks of CLI arguments
# Output dir is created later if not given
# Check that input file was given and exists
if (!$userfile) {
print STDERR "Error: input file not given.\n";
printhelpbasic($0);
exit EX_USAGE;
}
if (not -e $userfile) {
print STDERR "Error: file $userfile not found. Please check that the file exists or enter a correct file name.\n";
printhelpbasic($0);
exit EX_NOINPUT;
}
print "################################################################\n";
print "# --> Welcome to $0 (version $version) \n";
print "################################################################\n\n\n";
## Control dependencies
my $vmatchProg = isProgInstalled("vmatch2");
if($vmatchProg){
print "vmatch2 is...............OK \n";
}
else
{
print "vmatch2 is not installed, please install it and try again.\n";
exit EX_CONFIG;
}
my $mkvtreeProg = isProgInstalled("mkvtree2");
if($mkvtreeProg){
print "mkvtree2 is...............OK \n";
}
else
{
print "mkvtree2 (Vmatch dependency) is not installed, please install it and try again.\n";
exit EX_CONFIG;
}
my $vsubseqselectProg = isProgInstalled("vsubseqselect2");
if($vsubseqselectProg){
print "vsubseqselect2 is...............OK \n";
}
else
{
print "vsubseqselect2 (Vmatch dependency) is not installed, please install it and try again.\n";
exit EX_CONFIG;
}
#--
my $fuzznuc = isProgInstalled("fuzznuc");
if($fuzznuc){
print "fuzznuc (from emboss) is...............OK \n";
}
else
{
print "fuzznuc (from emboss) is not installed, please install it and try again.\n";
exit EX_CONFIG;
}
my $needle = isProgInstalled("needle");
if($needle){
print "needle (from emboss) is...............OK \n\n\n";
}
else
{
print "needle (from emboss) is not installed, please install it and try again.\n";
exit EX_CONFIG;
}
##
## Manage DRs
$DRtrunMism = 100/$DRtrunMism;
$DRerrors = $DRerrors/100;
##
## Time - Begin
#my ($start_hour,$start_min,$start_sec) = Now(); # Now([+1]) replaced by Now()
my($start_year,$start_month,$start_day, $start_hour,$start_min,$start_sec) = Today_and_Now();
##
my $seqIO = Bio::SeqIO->new(-format=>'Fasta', -file=>$userfile);
my $inputfileCount=0;
my ($seq,$inputfile);
my $basename = basename($userfile);
my @outdir = split /\./, $basename;
my $outdir = $outdir[0];
$outdir .= "_".$start_day."_".$start_month."_".$start_year."_".$start_hour."_".$start_min."_".$start_sec;
# DC - 11/05/2017 Retrieve date (localtime) and default repository
my $ResultDir = "";
if($outputDirName eq ""){
$ResultDir = "Result_".$outdir;
}
else{
$ResultDir = $outputDirName;
}
my $ResultDirFinal = $ResultDir;
# LK
if(-d $ResultDirFinal){
die "Output directory $ResultDirFinal already exists. Remove the directory before continuing.\nSuggestion:\nrm -rf $ResultDirFinal/\n";
}
# $ResultDir now corresponds to the temporary result directory ($outdir)
$ResultDir = $outdir;
mkdir $ResultDir unless -d $ResultDir;
mkdir $ResultDir."/CRISPRFinderProperties" unless -d $ResultDir."/CRISPRFinderProperties"; #NV
#create a directory GFF and move all GFFs to this directory
mkdir $ResultDir."/GFF" unless -d $ResultDir."/GFF";
struct Rep => {
Pos1 => '$',
Length => '$',
DRseq => '$',
};
my $htmlFile = "index.html"; # web page allowing a simple visualization of CRISPRs and Cas genes ($ResultDir/ removed)
if($html){
my @status = stat($userfile);
#open (HTML, ">$htmlFile") or die "open : $!";
open(HTML,">",$htmlFile) or die("Could not open the file $htmlFile because $!\n");
print HTML "<!DOCTYPE html>\n";
print HTML "<html>\n";
print HTML "<head>\n";
print HTML "<title>CRISPR-Cas viewer</title>\n";
print HTML "<link rel='stylesheet' type='text/css' href='crispr.css'>\n";
print HTML "</head>\n";
print HTML "<body>\n";
print HTML "<div class = content>\n";
print HTML "<h1> CRISPRs arrays and Cas genes found in the submitted sequence(s) </h1>\n";
print HTML "<br/>\n";
print HTML "<pre>\n";
print HTML "File name : <font COLOR= \#FF0000> $userfile </font>\n";
print HTML "File size (in bytes): $status[7]\n";
print HTML "</pre>\n";
}
my @statusLOG = stat($userfile); # Get input file size in bytes
# Two additional log files (logfile and logSequences) -DC 06/2017
my $logfile = "logFile_".$start_day."_".$start_month."_".$start_year."_".$start_hour."_".$start_min."_".$start_sec.".txt"; # to write command lines
my $logSeq = "logSequences_".$start_day."_".$start_month."_".$start_year."_".$start_hour."_".$start_min."_".$start_sec.".tsv"; # to write quick information on sequences
if($logOption){
open (LOG, ">$logfile") or die "open : $!";
open (LOGSEQ, ">$logSeq") or die "open : $!";
print LOGSEQ "Sequence ID\tName\tSize (bp)\tAT%\tNb_CRISPRs\tTotal Time (s)\tCRISPR Time (s)\tCas Time (s)\tFile size (bytes)\n";
}
#my $jsonSeq = "sequences.json"; # JSON file to get information on sequence name and sequence length (bp)
#open (JSONSEQ, ">$jsonSeq") or die "open : $!";
#print JSONSEQ "[\n";
#my $jsonLineSeq = "";
## JSON file "result.json"
my $jsonResult = "result.json"; # JSON result to get all info concerning CRISPR-Cas and sequences JSON files
open (JSONRES, ">$jsonResult") or die "open : $!";
print JSONRES "{\n";
my $dateJSON = $start_day."/".$start_month."/".$start_year."_".$start_hour.":".$start_min.":".$start_sec;
my $cmdLine = "perl $0 @ARGV";
my $jsonLineRes = "\"Date\":\"".$dateJSON."\",\n\"Version\":\"".$version."\",\n\"Command\":\"".$cmdLine."\",\n\"Sequences\":\n[";
##
if($logOption){
print LOG "[$start_hour:$start_min:$start_sec] $cmdLine\n---> Results will be stored in $ResultDirFinal\n\n"; # the main command line and results path
}
if($quiet){
}
else{
print "[$start_hour:$start_min:$start_sec] ---> Results will be stored in $ResultDirFinal\n\n";
}
#my $totalNumberOfCrisprs=0; #LK
my $allFoundCrisprs = 0; #DC
my $allCrisprs = 0;
my $nbrAllCas = 0; # Total nbrCas
my $currentRepository = getcwd();
#actualMetaOptionValue
my $actualMetaOptionValue = $metagenome;
#my header CCS to write title of CSS tsv file
my $resultsCRISPRCasSummary1 = $ResultDir."/CRISPR-Cas_summary.tsv";
my $headerCCS = "Sequence(s)\tCRISPR array(s)\tNb CRISPRs\tEvidence-levels\tCas cluster(s)\tNb Cas\tCas Types/Subtypes\n";
open (RESULTCCSONE, ">$resultsCRISPRCasSummary1") or die "open : $!"; #Open $resultsCRISPRCasSummary1
print RESULTCCSONE $headerCCS;
close(RESULTCCSONE);
#my Header for CRISPR-Cas_clusters.tsv
if($clusteringThreshold and $launchCasFinder){
my $clusterResultsHead = $ResultDir."/CRISPR-Cas_clusters.tsv";
my $headerCCC = "Sequence\tCluster Id\tNb CRISPRs\tNb Cas\tStart\tEnd\tDescription\n";
open (RESULTCCC, ">$clusterResultsHead") or die "open : $!";
print RESULTCCC $headerCCC;
close (RESULTCCC);
}
#my Header for smallArraysReclassification.xls
if ($classifySmall){
my $smallArrays = $ResultDir."/smallArraysReclassification.xls";
open (SMALL, ">$smallArrays") or die "open : $!";
print SMALL "CRISPR_Id\tConsensus_Repeat\tFormer_Evidence-level\tNew_Evidence-level\n";
close (SMALL);
}
#sequence version
my $sequenceVersion = 0;
while($seq = $seqIO->next_seq()){ # DC - replace 'next_seq' by 'next_seq()'
$inputfileCount++;
$inputfile = $basename;
my $start_run = time();
# DC - 05/2017 - print $inputfile and seq->id
my $seqID1 = $seq->id; # ID of sequence
my $seqDesc = $seq->desc; # Description of sequence
if($seqDesc eq ""){ $seqDesc = "Unknown"; }# If description is missing, the sequence will be labeled as "Unknown"
$sequenceVersion = $seqID1;
my $seqLength = $seq->length(); #Sequence size
# -meta option as given by user
$metagenome=$actualMetaOptionValue;
# DC - when $seqLength < 100000 bases, consider it as a metagenomic dataset for Prodigal (if option -meta was not set)
if(!$metagenome and ($seqLength < 100000) ){
$metagenome=1;
}
#else{
# $metagenome=$actualMetaOptionValue;
#}
#NV condition for mss option
if ($seqLength >= $seqMinSize){
my $globalSeq = $seq->seq; # whole sequence to calculate global AT% # DC modif to perform better
my $globalAT = atpercent($globalSeq); # calculate AT% of running sequence # DC modif to perform better
my @seqID1 = (); # DC
if ($seqID1 =~ /\|/){
@seqID1 = split (/\|/,$seqID1);
$seqID1 = pop(@seqID1);
@seqID1 = ();
@seqID1 = split (/\./,$seqID1);
$seqID1 = $seqID1[0];
}
else{
if($seqID1 =~ /\./){
@seqID1 = split (/\./,$seqID1);
$seqID1 = $seqID1[0];
}
}
# Add sequence number to variable $seqID1
#$seqID1 = $seqID1."_".$inputfileCount;
# DC - 05/2017 - change value of $inputfile
$inputfile = $seqID1.".fna";
# DC - 05/2017 - To change the way of write sequence
my $seqO = Bio::SeqIO->new(-file=>">$inputfile", -format=>'Fasta'); # DC
$seqO->write_seq($seq); # DC
my $nbrcris = 0;
my $OneSpacerCris_nbr = 0;
# execute the mkvtree function to indexate the input sequence file
my $indexname = $inputfile;
my @indexname = split (/\./,$indexname);
print STDERR "\nSequence number $inputfileCount..";
#print STDERR "\n*** your results files will be in the $outdir-$inputfileCount directory ***\n";
callmkvtree($inputfile,$indexname);
# DC - 05/2017 print
if($quiet){}
else{
print " ( Input file: $inputfile, Sequence ID: $seqID1, Sequence name = $seqDesc )\n"; # DC
}
# execute the first vmatch2 search
# Modification DC - 05/05/2017
my @vmatchoptions = ""; #qw(-l 23 25 60 -e 1 -s leftseq -evalue 1 -absolute -nodist -noevalue -noscore -noidentity -sort ia -best 1000000 -selfun ./sel392v2.so 55); #DC
#my @vmatchoptions = qw(-l 23 25 60 -e 1 -s leftseq -evalue 1 -absolute -nodist -noevalue -noscore -noidentity -sort ia -best 1000000 -selfun sel392v2.so 55);
#my $currentRepositoryForSo = getcwd(); #get current repository to use sel392v2.so
my ($vmatch_hour,$vmatch_min,$vmatch_sec) = Now();
# DC - set Vmatch options in function of parameters ### replace -sort ia by -sort ida
#print " $repeatsQuery exists ?\n";
if(-e $so){
if(!$mismOne){
if(-e $repeatsQuery){
#print " $repeatsQuery exists !!!!\n";
@vmatchoptions = ("-l", $M1, "-s", "leftseq", "-evalue", "1", "-absolute", "-nodist","-noevalue", "-noscore", "-noidentity", "-sort", "ia", "-q", "$repeatsQuery", "-best", 1000000, "-selfun", "$so", $M2);
}
else{
@vmatchoptions = ("-l", $M1, $S1, $S2,"-s", "leftseq", "-evalue", "1", "-absolute", "-nodist","-noevalue", "-noscore", "-noidentity", "-sort", "ia", "-best", 1000000, "-selfun", "$so", $M2);
}
}else{
if(-e $repeatsQuery){
#print " $repeatsQuery exists !!!!\n";
@vmatchoptions = ("-l", $M1, "-e $mismOne -s", "leftseq", "-evalue", "1", "-absolute", "-nodist","-noevalue", "-noscore", "-noidentity", "-sort", "ia", "-q", "$repeatsQuery", "-best", 1000000, "-selfun", "$so", $M2);
}
else{
@vmatchoptions = ("-l", $M1, $S1, $S2,"-e $mismOne -s", "leftseq", "-evalue", "1", "-absolute", "-nodist","-noevalue", "-noscore", "-noidentity", "-sort", "ia", "-best", 1000000, "-selfun", "$so", $M2);
}
}
}
else{
print "The shared object file ($so) must be available in your current directory. Otherwise, you must use option -soFile (or -so)! \n\n";
if($logOption){
print LOG "[$vmatch_hour:$vmatch_min:$vmatch_sec] The shared object file ($so) must be available in your current directory. Otherwise, you must use option -soFile (or -so)! \n\n";
}
exit EX_CONFIG;
}
push(@vmatchoptions,$indexname); # DC - replace
push(@vmatchoptions, " > vmatch_result.txt");
if($logOption){
print LOG "\n[$vmatch_hour:$vmatch_min:$vmatch_sec] vmatch2 @vmatchoptions\n"; # print in Logfile DC replaced vmatch by vmatch2
}
#Modification DC - 05/05/2017
#makesystemcall("./vmatch " . join(' ',@vmatchoptions)); #DC
makesystemcall("vmatch2 " . join(' ',@vmatchoptions)); #LK - DC replaced vmatch by vmatch2
#makesystemcall("cp vmatch_result.txt CHECKvmatch_result.txt");
my @rep = trans_data("vmatch_result.txt");
# fill in tabseq table : table containing the begin and end positions
# of sequences candidates ton contain a crispr
# my @temp = split(/\./, $inputfile);
my $RefSeq = $seqID1; #$temp[0];
if($#rep >=0)
{
($nbrcris, $OneSpacerCris_nbr)= write_clusters($RefSeq,@rep);
create_recap($RefSeq, $nbrcris, $OneSpacerCris_nbr,$ResultDir);
#$totalNumberOfCrisprs+=$OneSpacerCris_nbr; #LK
# DC
#$totalNumberOfCrisprs = $nbrcris;
}
else{ create_recap($RefSeq, 0, 0,$ResultDir); }
#$totalNumberOfCrisprs+=$nbrcris; #LK
#before changing directory/path
my $actual_pathHome = getcwd();
if($logOption){
print LOG "Actual path in Home is: $actual_pathHome\n";
#print "Actual path in Home is: $actual_pathHome\n";
}
chdir ".."; #LK #removed by DC
unlink <..\/crispr_result_*>;
unlink <..\/seq_v*>;
unlink <..\/$inputfile.*> ;
unlink <..\/spacersseq_v*>;
unlink <..\/fastaMuscle_spacersseq_v*>;
unlink '../stdout';
unlink '../vmatch_result.txt';
unlink '../alignDR_Spacer.needle';
unlink '../DR';
unlink <*_CRISPRs> ;
#unlink "../$inputfile"; # DC - 12/05/2017 to keep fasta file until gff and json files are done
chdir "..";
#unlink '*.index';
makesystemcall("rm -f $inputfile.index");
## DC - move makeGFF and make JSON functions
# LK - DC
my ($gffFilename,@idDir)=makeGff($ResultDir,$inputfile,$seqDesc,$nbrcris,$OneSpacerCris_nbr);
# DC - 05/2017 - Make Json file thanks to GFF
#print "$gffFilename\n"; # DC
my $jsonFile = makeJson($gffFilename,$ResultDir,$RefSeq);
# DC - 05/2017 - CRISPRs analysis
if($quiet){}
else{
print "Nb of CRISPRs in this sequence = $nbrcris\n";
}
if($logOption){
print LOG "Nb of CRISPRs in this sequence $RefSeq = $nbrcris\n";
}
my ($analysis, $foundCrisprs) = crisprAnalysis($ResultDir,$RefSeq,$nbrcris,$seqDesc,@idDir);
$allFoundCrisprs += $foundCrisprs; #increment for each found CRISPR in CRISPRdb
$allCrisprs += $nbrcris; #all Crisprs
#report end time for CRISPRs analysis
my $end_runCRISPR = time();
my $runtimeCRISPR = $end_runCRISPR - $start_run;
# DC - 06/2017 - Integration of Cas analysis
my ($casfile, $jsonCAS, @tabCRISPRCasClustersA);
$casfile = "";
my $nbrCas = 0;
if($rcfowce){
if($launchCasFinder and ($nbrcris>0)){
($nbrCas, $casfile, $jsonCAS, @tabCRISPRCasClustersA) = casFinder($ResultDir,$inputfile,$seqDesc,$RefSeq,$nbrcris,$kingdom,$casfinder);
}
}
elsif($launchCasFinder and ($rcfowce == 0)){
($nbrCas, $casfile, $jsonCAS, @tabCRISPRCasClustersA) = casFinder($ResultDir,$inputfile,$seqDesc,$RefSeq,$nbrcris,$kingdom,$casfinder);
}
#print "NBR CAS = $nbrCas\n";
if(! $nbrCas){ $nbrCas = 0; }
if($quiet){}
else{
print "Nb of Cas in this sequence = $nbrCas\n";
}
if($logOption){
print LOG "Nb of Cas in this sequence ($RefSeq) = $nbrCas\n";
}
$nbrAllCas += $nbrCas; # all Cas
#if ($launchCasFinder){
# $casfile = casFinder($ResultDir,$inputfile,$seqDesc,$RefSeq,$nbrcris,$kingdom,$casfinder);
# print "Cas file = $casfile\n";
#}
#HTML page
if($html){
my $htmlPage = "";
$htmlPage = makeHtml($gffFilename,$casfile,$ResultDir,$RefSeq,$seqDesc,$seqLength,$globalAT,$nbrcris,$OneSpacerCris_nbr);
if($quiet){}
else{ print "HTML page ($htmlPage) created/updated\n"; }
}
#create a directory GFF and move all GFFs to this directory
#my $directoryGFFfiles = $ResultDir."/GFF"; # Directory for GFFs
#mkdir $directoryGFFfiles unless -d $directoryGFFfiles;
if(-d $ResultDir."/GFF"){ makesystemcall("mv $gffFilename $ResultDir/GFF "); }
#Move CRISPRproperties, DRs and Spacers to specific directories
my $input_spacers = $ResultDir."/".$RefSeq."/Spacers_"; # Path to the Spacers files
my $input_drs = $ResultDir."/".$RefSeq."/DRs_"; # Path to the DRs files
my $input_properties = $ResultDir."/".$RefSeq."/".$RefSeq; # Path to the properties files
my $directorySpacers = $ResultDir."/".$RefSeq."/Spacers"; # Directory for spacers
my $directoryDRs = $ResultDir."/".$RefSeq."/DRs"; # Directory for DRs
my $directoryCRISPRproperties = $ResultDir."/".$RefSeq."/CRISPRproperties"; # Directory for CRISPRproperties
mkdir $directorySpacers unless -d $directorySpacers;
mkdir $directoryDRs unless -d $directoryDRs;
mkdir $directoryCRISPRproperties unless -d $directoryCRISPRproperties;
if ( (-e $input_spacers."1") and (-e $input_drs."1") ){
makesystemcall("mv $input_spacers* $directorySpacers");
makesystemcall("mv $input_drs* $directoryDRs");
makesystemcall("mv $input_properties* $directoryCRISPRproperties");
}
if(-e $input_properties."_CRISPRs"){
my $reportProperties = $input_properties."_CRISPRs";
makesystemcall("mv $reportProperties $directoryCRISPRproperties");
}
my $directoryProperties = $ResultDir."/".$RefSeq;
#my $newProperties = $directoryProperties."_properties"; #before NV
my $newProperties = ""; #NV
if (-e $ResultDir."/CRISPRFinderProperties" and -d $ResultDir."/CRISPRFinderProperties") {
$newProperties = $ResultDir."/CRISPRFinderProperties/".$RefSeq."_properties";
} #NV
if(-d $directoryProperties){
makesystemcall("mv $directoryProperties $newProperties"); # rename properties for each sequence
}
#create a directory Properties and move all sequenceProperties directories in one
#my $directoryPropertiesFinal = $ResultDir."/CRISPRFinderProperties"; # Directory for Properties
#mkdir $directoryPropertiesFinal unless -d $directoryPropertiesFinal;
#makesystemcall("mv $newProperties $directoryPropertiesFinal");
# End Move properties, DRs and Spacers
# DC - 06/2017 - write sequence information
my $end_run = time();
my $runtime = $end_run - $start_run;
my $runtimeCas = $runtime - $runtimeCRISPR;#get calculation time for Cas analysis
if($logOption){
print LOGSEQ "$seqID1\t$seqDesc\t$seqLength\t$globalAT\t$nbrcris\t$runtime\t$runtimeCRISPR\t$runtimeCas\t$statusLOG[7]\n"; #some statistics
}
#$jsonLineSeq .= "{\n\"Id\":\"".$RefSeq."\",\n\"Description\":\"".$seqDesc."\",\n\"Length\":".$seqLength."\n},\n"; #data to be written in JSONSEQ file
##JSON result
my ($catJSONcrispr, $catJSONcas);
#my $jsonFileCrispr = $ResultDir."/".$RefSeq."_crispr.json";
if($jsonFile){
$catJSONcrispr = $jsonFile;
}
else{
$catJSONcrispr = "[]";
}
#my $jsonFileCas = $ResultDir."/".$RefSeq."_cas.json";
if($jsonCAS){
$catJSONcas = $jsonCAS;
}
else{
$catJSONcas = "[]";
}
if($nbrcris == 0){
$catJSONcrispr = "[]";
}
if ( ! $jsonCAS){
$catJSONcas = "[]";
}
#get information from @tabCRISPRCasClustersA
my $tempSumDataCcC = "";
if ($launchCasFinder){
$tempSumDataCcC = "@tabCRISPRCasClustersA";
}
else{
$tempSumDataCcC = "NA";
}
#$sequenceVersion
$jsonLineRes .= "{\n\"Id\":\"".$RefSeq."\",\n\"Version\":\"".$sequenceVersion."\",\n\"Description\":\"".$seqDesc."\",\n\"AT\":".$globalAT.",\n\"Length\":".$seqLength.",\n\"Summary_CRISPR-Cas\":\"".$tempSumDataCcC."\",\n\"Crisprs\":".$catJSONcrispr.",\n\"Cas\":".$catJSONcas."\n},\n";
#put analyzed sequence in a folder named: "analyzedSequences"
my $analyzedSequences = $ResultDir."/analyzedSequences"; # Directory for FASTAs
mkdir $analyzedSequences unless -d $analyzedSequences;
if(-e $inputfile){
makesystemcall("mv $inputfile $analyzedSequences");
}
} #NV end condition for mss option
} # end while (my $seq = $seqIO->next_seq)
# End and close file JSONRES
$jsonLineRes .= "]\n";
$jsonLineRes =~ s/,\n]/\n]/; # change end of file to better fit JSON format
$jsonLineRes .= "}\n";
#$jsonLineRes =~ s/,/,\n/; # make file more readable
print JSONRES $jsonLineRes;
close (JSONRES);
# move JSONRES in ResultDir
if(-e $jsonResult){
makesystemcall("mv $jsonResult $ResultDir");
}
#create a directory JSONfiles and move all JSONs to this directory
#my $directoryJSONfiles = $ResultDir."/JSON_files"; # Directory for JSONs
#mkdir $directoryJSONfiles unless -d $directoryJSONfiles;
#if(-e $ResultDir."/result.json"){
# makesystemcall("mv $ResultDir/*.json $directoryJSONfiles");
#}
#fullReport
if ($launchCasFinder){
if($writeFullReport){
my $fullRep = fullReport($ResultDir);
#print "CRISPR-CAS vicinity Report = $fullRep\n";
}
}
#OrientationCount
if($dirRepeat){
my $orientationCountFile = countOrientation($ResultDir,$allCrisprs);
if($quiet){}
else{ print "Orientations count file created: $orientationCountFile\n\n"; }
}
#copy files
my $resCCSummary = $ResultDir."/CRISPR-Cas_summary.tsv";
my $resCCSummaryX = $ResultDir."/CRISPR-Cas_summary.xls";
if(-e $resCCSummary){ makesystemcall("cp $resCCSummary $resCCSummaryX"); }
my $resCRISPRs = $ResultDir."/Crisprs_REPORT.tsv";
my $resCRISPRsX = $ResultDir."/Crisprs_REPORT.xls";
if(-e $resCRISPRs){ makesystemcall("cp $resCRISPRs $resCRISPRsX"); }
my $resCas = $ResultDir."/Cas_REPORT.tsv";
my $resCasX = $ResultDir."/Cas_REPORT.xls";
if(-e $resCas){ makesystemcall("cp $resCas $resCasX"); }
my $clustersResults = $ResultDir."/CRISPR-Cas_clusters.tsv";
my $clustersResultsX = $ResultDir."/CRISPR-Cas_clusters.xls";
if(-e $clustersResults){ makesystemcall("cp $clustersResults $clustersResultsX"); }
#create a directory TSVfiles and move all TSVs to this directory
my $directoryTSVfiles = $ResultDir."/TSV"; # Directory for TSVs
mkdir $directoryTSVfiles unless -d $directoryTSVfiles;
if(-e "$ResultDir/Crisprs_REPORT.tsv"){
makesystemcall("mv $ResultDir/*.tsv $directoryTSVfiles");
}
if(-e "$ResultDir/Crisprs_REPORT.xls"){
makesystemcall("mv $ResultDir/*.xls $directoryTSVfiles");
}
#create a directory FASTAfiles and move all rawFASTAs to this directory
my $directoryFASTAfiles = $ResultDir."/rawFASTA"; # Directory for FASTAs
mkdir $directoryFASTAfiles unless -d $directoryFASTAfiles;
#if(-e "$ResultDir/rawCRISPRs.fna"){
#makesystemcall("mv $ResultDir/rawCRISPRs.fna $directoryFASTAfiles");
#}
#if(-e "$ResultDir/rawCas.fna"){
#makesystemcall("mv $ResultDir/rawCas.fna $directoryFASTAfiles");
#}
my $analyzedSequences2 = $ResultDir."/analyzedSequences"; # Directory for analyzed FASTAs
if(-d $analyzedSequences2){
makesystemcall("mv $analyzedSequences2 $directoryFASTAfiles");
}
#create a directory Properties and move all sequenceProperties directories in one
my $directoryProperties = $ResultDir."/CRISPRFinderProperties"; # Directory for Properties
#mkdir $directoryProperties unless -d $directoryProperties; #before NV
#if(-e "$ResultDir/*_properties"){
#makesystemcall("mv $ResultDir/*_properties $directoryProperties"); #before NV
#}
#create a directory GFF and move all GFFs to this directory
#my $directoryGFFfiles = $ResultDir."/GFF"; # Directory for GFFs # DC
#mkdir $directoryGFFfiles unless -d $directoryGFFfiles; # DC
#if(-e "$ResultDir/*.gff"){
#makesystemcall("mv $ResultDir/*.gff $directoryGFFfiles"); #before NV
#makesystemcall("find $ResultDir -name *.gff -exec mv {} $directoryGFFfiles \\;"); #NV DC
#}
## Option Keep to remove (by default) all secondary files or keep them
if($keep){
if($useProkka){
if(! $quiet){print "Secondary folders/files (Prokka, CasFinder, rawFASTA, CRISPRFinderProperties) have been created\n";}
}
else{
if(! $quiet){print "Secondary folders/files (Prodigal, CasFinder, rawFASTA, CRISPRFinderProperties) have been created\n";}
}
}
else{
if(! $quiet){print "Deleting secondary folders/files...\n";}
if($launchCasFinder){
if($useProkka){
makesystemcall("rm -rf $ResultDir/Prokka") if (-d "$ResultDir/Prokka"); #rmdir $ResultDir."/Prokka";
}
else{
makesystemcall("rm -rf $ResultDir/Prodigal") if (-d "$ResultDir/Prodigal"); #rmdir $ResultDir."/Prodigal";
}
makesystemcall("rm -rf $ResultDir/CasFinder") if (-d "$ResultDir/CasFinder"); #rmdir $ResultDir."/CasFinder"; #
}
makesystemcall("rm -rf $ResultDir/CRISPRFinderProperties") if (-d "$ResultDir/CRISPRFinderProperties"); # rmdir $ResultDir."/CRISPRFinderProperties"; #
makesystemcall("rm -rf $ResultDir/rawFASTA") if (-d "$ResultDir/rawFASTA"); #rmdir $ResultDir."/rawFASTA"; #
}
if($crisprdb eq ""){
print "\nAll CRISPRs = $allCrisprs\n";
}
elsif(-e $crisprdb){
print "\nNb CRISPRs matching CRISPRdb / All CRISPRs = $allFoundCrisprs / $allCrisprs\n";
}
#print Cas
if($launchCasFinder){
print "All Cas = $nbrAllCas\n";
}
## Time - end
#my ($end_hour,$end_min,$end_sec) = Now();
my($end_year,$end_month,$end_day, $end_hour,$end_min,$end_sec) = Today_and_Now();
##
my ($D_y,$D_m,$D_d, $Dh,$Dm,$Ds) =
Delta_YMDHMS($start_year,$start_month,$start_day, $start_hour, $start_min, $start_sec,
$end_year, $end_month, $end_day, $end_hour,$end_min,$end_sec);
print "\n[$end_hour:$end_min:$end_sec] Thank you for using $0! Thank you for your patience!\n";
print "\n[$end_hour:$end_min:$end_sec] The script lasted: ".$D_y." year(s) ".$D_m." month(s) ".$D_d." day(s) , ".$Dh." hour(s) ".$Dm." minute(s) ".$Ds." second(s)\n";
if($logOption){
print LOG "\n[$end_hour:$end_min:$end_sec] The script lasted: ".$D_y." year(s) ".$D_m." month(s) ".$D_d." day(s) , ".$Dh." hour(s) ".$Dm." minute(s) ".$Ds." second(s)\n";
close (LOG);
close (LOGSEQ);
}
#close HTML page if exists
if($html){
print HTML "</div>";
print HTML "</body>";
print HTML "</html>";
close (HTML);
}
# Move LOG files in LOGs directory
if($logOption){
my $directoryLog = $ResultDir."/LOGs"; # directory to store LOG files
unless(-d $directoryLog){ mkdir $directoryLog or die "$0: I can not create the folder $directoryLog: $!\n" }
if ( (-e $logfile) and (-e $logSeq) ){
makesystemcall("mv $logfile $directoryLog");
makesystemcall("mv $logSeq $directoryLog");
}
}
# Move index.html and copy crispr.css into Visualization directory
if($html){
my $directoryViz = $ResultDir."/Visualization"; # directory to store basic Visualization files
unless(-d $directoryViz){ mkdir $directoryViz or die "$0: I can not create the folder $directoryViz: $!\n" }
makesystemcall("mv $htmlFile $directoryViz");
if(-e "crispr.css"){
makesystemcall("cp crispr.css $directoryViz");
}
elsif(-e "supplementary_files/crispr.css"){
makesystemcall("cp supplementary_files/crispr.css $directoryViz");
}
elsif(-e $cssFile){