-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathneedleman-wunsch.lisp
executable file
·1454 lines (1200 loc) · 53.1 KB
/
needleman-wunsch.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
(in-package :cl-user)
;; Permission is hereby granted, free of charge, to any person obtaining
;; a copy of this software and associated documentation
;; files (the "Software"), to deal in the Software without restriction,
;; including without limitation the rights to use, copy, modify, merge,
;; publish, distribute, sublicense, and/or sell copies of the Software,
;; and to permit persons to whom the Software is furnished to do so,
;; subject to the following conditions:
;; The above copyright notice and this permission notice shall be
;; included in all copies or substantial portions of the Software.
;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
;; LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
;; OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
;; WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
;;; TO RUN EXECUTE:
;;; (TEST-VERBOSE) to see the algorithm progress.
;;; (TIME-NW 20 25) to have algorithm align sequences of lengths 20,21,22,23,24
;;; (TEST-NW) to test algorithm against simple, one-path, cases.
#|
This algorithm finds a set of global best matches for two sequences.
That is, given A[0...N], and B[0...M]
it finds a set of solutions which consist of
-- The elements of A, possibly interspersed with gaps (denoted by '-')
-- The elements of B, possibly interspersed with gaps (denoted by '*')
-- An alignment of A with gaps over B with gaps, such that for at least
one i and j, A(i) is aligned with some B(j).
An example:
Sequence A: CGAGGAA
Sequence B: ACGTGA
One solution may look like
A: CGAGGAA
|| | |
B: ACGTG*A
So there is a single gap in Sequence B while Sequence A is shifted by
one position with respect to Sequence B.
Another might be
A: AC-GTCA
| | |
B: CGA*GG*AA
(this actually has more gaps and less matching alignments, so is not
likely to rank as high as the first solution, using any kind of
reasonable metric, and therefore would not be a 'best match').
(Note that it is possible to have a gap of more than a single space,
and more than one gap per sequence, although the former is not illustrated
above)
(Note that there may be a large number of reasonable solutions,
especially as the sequences get longer.)
What constitutes a solution depends on how one defines what
constitutes a 'good' match.
There are three parameters generally employed to determine a good match:
-- How much to reward element alignment
-- How much to penalize for the existence of a gap
-- How much to penalize as a gap gets longer
It is possible to specify different rewards for different matching
alignments (e.g., a match of T on top with T on bottom could be given
a higher value than matching G on top with G on bottom). It is also
possible to reward aligments that don't 'match': e.g., having T on top
and T on the bottom could generate a very high score, while T on top
and A on the bottom a somewhat lesser score, and T on top and either C
or G on the bottom a poor score. So TT > TA > TC = TG.
It is theoretically possible to specify different penalties for gaps
and gap lengths in the first sequence vs in the second sequence,
although this code does not provide that functionality.
Determining the reward for element alignment is done by constructing a
SUBSTITUTION MATRIX, and, in this code, the corresponding SUBSTITUTION
FUNCTION, SUBF. For any two sequence elements X and Y, (SUBF X Y) is
used to obtain the reward for X and Y being aligned. See below for
more details.
Determining the penalties for a gap is done by providing a
GAP-PENALTY-FUNCTION or providing parameters that allow construction
of a function which increases linearly with increasing gap length.
Typical gap functions are one of:
-- no gap penalty, i.e, #'(lambda (gaplength) 0.0)
-- constant gap penalty, e.g., #'(lambda (gaplength) -1.0)
-- gap penalty proportional to gap length, e.g.,
#'(lambda (gaplength) (* 0.5 gaplength))
-- one penalty for opening a gap and another linear term for
extending it, e.g.,
#'(lambda (gaplength) (+ 1.0 (* 0.25 (1- gaplength))))
The latter form is typically referred to in the literature as the affine
gap penalty function, and is commonly used.
The algorithm to figure the set of best alignments consists of three parts:
-- Generate and fill an NxM SCORE matrix, where M is the length of Sequence
A and N the length of Sequence B. The matrix values are determined by the
order of the sequence elements and the reward and penalty parameters.
-- Generate and fill an NxM BACKPOINTER matrix. Each element (I,J)
of the BACKPOINTER matrix determines an optimal alignment for the
subsequences A[0...I] and B[0...J].
-- Use the generated matrices to then figure out a set of best alignments.
The first two steps are usually done concurrently, and are done that way
in this code.
Let us from now on denote our sequences as SEQA and SEQB.
Let length(SEQA) = N, and length(SEQB) = M.
Assume that SEQA = CGAGGAA, SEQB = ACGTGA
Then the SCORE matrix initially looks conceptually like this, once it's
elements have been initialized to zero.
SEQB -->
A C G T G A
C 0.0 0.0 0.0
G 0.0 0.0 0.0
S
E A 0.0 0.0 0.0 . . .
Q
A G .
.
| G .
|
\ / A
.
A
Given an index (i,j) into this matrix, we can compute what are called
the RESIDUES, (the symbols above the column and the symbols to the
left of the row) for the (i,j)th element of the SCORE matrix (which
are just the sequence elements of the two sequences at positions I and
J, respectively). So the RESIDUES for position (3,5) are G and A,
while the RESIDUES for (0,0) are C and A.
Note that in all the code and all the examples, the matrix is indexed
using (i = ROW, j = COLUMN). Further, I is always the first index and
J the second index. (In the code SEQA is called 'SEQ-DOWN-ROWS' and
SEQB is named 'SEQ-ACROSS-COLS', for clarity)
The substitution matrix, SUB, for our sequences might look like this:
A C G T
A 1 0 0 0
C 0 1 0 0
G 0 0 1 0
T 0 0 0 1
In this metric, all matches are rewarded equally, and all non-matches
are neutral (neither rewarded nor penalized).
Now to the details of the algorithm.
For any (i,j), SEQA[i] and SEQB[j] give us the RESIDUES (as above) for
position (i,j). Given the residues, we can (conceptually) index into
SUB to determine the score for alignment of the two sequences at
position (i,j). (Of course to index into an actual matrix one needs
numerical indices, so we really need the mapping A -> 0, C -> 1, G ->
2, T -> 3, given our example SUB matrix. This mapping is provided
transparently by the function CREATE-SUBSTITUTION-FUNCTION-AND-MATRIX,
which returns a function which is called with the two residues.)
The algorithm uses the SUBSTITUTION FUNCTION to determine a value when
dealing with position (I,J). First it gets the RESIDUES for I and J,
and then the SUBSTITUTION FUNCTION is called to get the score for
aligning the two residues.
The algorithm must first fill in values for all of the elements of SCORE
according to the following recursion relationship:
For any given element SCORE(i,j), assuming we have already filled in
all elements SCORE(k<i,l<j), the value of SCORE(i,j) will be
A. The MAXIMUM of three values
1. SCORE(i-1,j-1)
2. The maximum over k, of SCORE(k,j-1) + W((i-1)-k), k = 0...i-2
(That is, a maximum over the elements in column J-1, down to
and including row I-2).
3. The maximum over k, of SCORE(i-1,k) + W((j-1)-k), k = 0...j-2
(That is, a maximum over the elements in row I-1, over to
and including column J-2).
B. PLUS the value of SUB(SEQB[i],SEQA[j])
So what is W? W is our penalty function for gaps and W(x) is the
penalty value for having a gap of length x. The value of k determines
how big the gap is. (Starting from the left edge, and going across,
as in 3, the size of the gap is a maximum at the left edge (k=0) and
decreases as k increases, and correspondingly as we move down a column
in 2.)
Given that we are careful to special-case the leftmost column and
uppermost row (since there is no J-1 column or I-1 row in these cases)
this algorithm can then be used to sequentially fill in the rest of
the array.
The BACKPOINTER matrix is filled in by determining which of the cases
from above, 1, 2, or 3, had the maximum score, and setting the backpointer
to point from cell (I,J) back to the cell which held the maximum score.
So for the cell labelled '*' in the below matrix, the recursion formula
looks at all cells labelled '+'
- - - + - -
- - - + - -
+ + + + - -
- - - - * -
- - - - - -
For these cells, we find the cell that has the maximum value according
to the recursion formula, and set the backpointer to point to that
cell. (The code handles the case where two or more cells hold maximal
values).
The SCORE matrix (*SCOREMAT*), and the BACKPOINTER matrix
(*POINTERMAT*) are computed by the GENERATE-NEEDLEMAN-WUNSCH-SCORES
function below.
Note: the recursion relationship described above is one of two such
global alignment recursion formulas found in the biology literature.
For a discussion of the differences, and an implementation of the
other one, see the files match-essay.txt and global-alignment.lisp in
this directory.
Note: why the algorithm works is beyond the scope of this explanation.
See, for example, Gusfield, "Algorithms on Strings, Trees and
Sequences", Chapter 11.)
What do the numbers in the SCORE matrix mean once we compute them?
Basically, the numbers tend to increase as we go down and right, and
the more positive an element (I,J) is, the better it is to align the
two sequences such that SEQA[I] is over SEQB[J].
To determine a best alignment, now all we need to do is follow the
backpointers we have set up. What we do is start from a position with
the highest score in the bottom row or rightmost column, and work our
way up and to the left (towards the origin) via our backpointers,
creating paths through the matrix from this point back to the origin.
The geometry of these paths determine the alignment. (Since there may
be more than one such maximal point along the bottom row or rightmost
column, and there may be more than one backpointer from a given cell,
there may be numerous best alignments).
Given a path through the matrix, how do we determine what sequence
alignment it specifies?
This is how: For any given (I,J) in our path, the sequence lines up
at that point and therefore the RESIDUES at (I,J) constitute the
alignment at that point.
Given point (I,J) in the path, if the previous point in the path is at
(I-1,J-1), this means there is no gap.
Otherwise, there is a gap. If the previous point is above (I-1,J-1),
then the gap is in the sequence which got indexed across the columns,
while if the previous point is to the left of (I-1,J-1), then the gap
is in the sequence which got indexed down the rows.
So suppose we have the following path back through the matrix:
(5,5) -> (3,4) -> (2,3) -> (1,2) -> (0,0),
or graphically:
A C G A C T
A *
G *
A *
A *
A
T *
We start at the bottom right, at (5,5), with T aligned with T as the last
characters of both sequences. The next point is not at (4,4), but rather
at (3,4) which is above (4,4), so there is a gap in the sequence which got
indexed across the columns (which means the element of the sequence indexed
down the rows at position 4, A, is paired with the gap). So we have
(remember, we are going backwards):
AAT
C*T
From (3,4) we go to (2,3), so there is no gap, and once more with no gap,
to (1,2), and we have:
GAAAT
GAC*T
Finally, we go to (0,0) which is to the left of (0,1), so there is a gap
in the sequence indexed down the rows. Our end result is therefore:
A-GAAAT
ACGAC*T
We can produce such alignments for every backpath as we choose.
|#
(defvar *pointermat*)
(defvar *scoremat*)
(defvar *best-matches*)
(defvar *bestmatch*)
(defvar *allpaths*)
(defvar *all-points-paths*)
(defvar *substitution-matrix*)
(defvar *substitution-function*)
(defvar *gap-penalty-function*)
(defvar *nw-verbose* t)
(defparameter *time-nw?* t)
(defparameter *time-bp?* t)
(defparameter *time-new?* nil)
(defparameter *time-old?* t)
(defparameter *testseq-a* '(T T G A C A C C C T C C C A A T T G T A))
(defparameter *testseq-b* '(A C C C C A G G C T T T A C A C A T))
;;; MACROS, ETC
;;; Abstraction/optimization of a 2-dimensional POINT,
;;; which is an index (ROW COL) into a matrix. We pack
;;; ROW and COL into a single fixnum.
(defmacro nwpoint (row col)
`(the fixnum (+ (the fixnum (ash ,col 16)) ,row)))
(defmacro nwpoint-row (p) `(logand (the fixnum ,p) #x0000FFFF))
(defmacro nwpoint-col (p) `(ash (the fixnum ,p) -16))
(defmacro nwpoint-list (p)
(let ((psym (gensym "NWPOINT-")))
`(let ((,psym ,p))
(list (nwpoint-row ,psym) (nwpoint-col ,psym)))))
(deftype nwpoint () 'fixnum)
;;; Assumes BODY being timed returns a single result!
(defmacro maybetime ((time? &key (n 1) (notice nil)) &body body)
`(flet ((do-body () ,@body))
(if ,time?
(let ((result nil) (notice ,notice) (n ,n))
(when notice (format t "~&;; Timing ~A, ~D times~%" notice n))
(time (dotimes (j n) (setq result (do-body))))
result
)
(do-body))))
;;;; TOPLEVEL ROUTINE
;;; Outline:
;;; 1. Generate a substitution matrix function from parameters
;;; 2. Generate a gap penalty function, if not supplied, from parameters
;;; 3. Generate the SCORE matrix.
;;; 4. Find all the points that end the best sequence alignments
;;; and select one.
;;; 5. Generate all the best paths from this point back through the matrix
;;; 6. Pretty print (some of) the alignments corresponding to these
;;; best paths.
(defun run-nw
(&optional
(seq-down-rows *testseq-a*)
(seq-across-cols *testseq-b*)
&key
(constituents nil)
(sub-function nil)
(matchval 1.0)
(nonmatchval 0.0)
(pairwise-sub-values nil)
(pairwise-symmetric? t)
(gap-function nil)
(gap-open-penalty 1.0)
(gap-space-penalty 0.25)
(show-alignments? t)
(max-alignments-to-show 5)
(verbose *nw-verbose*)
)
(setq *substitution-matrix* nil)
(when verbose
(format t "~&~%;; Sequence A: ")
(dolist (a seq-down-rows) (format t "~A " a)) (terpri)
(format t "~%;; Sequence B: ")
(dolist (b seq-across-cols) (format t "~A " b)) (terpri) (terpri)
)
;; Create substitution function
(if sub-function
(setq *substitution-function* sub-function)
(multiple-value-setq (*substitution-function* *substitution-matrix*)
(create-substitution-function-and-matrix
(or constituents
(setq constituents
(remove-duplicates
(union seq-down-rows seq-across-cols))))
:default-matching-value matchval
:default-nonmatching-value nonmatchval
:pairwise-values pairwise-sub-values
:pairwise-symmetric? pairwise-symmetric?
)))
(setq sub-function *substitution-function*)
(setq *gap-penalty-function* gap-function)
;; It is more efficient to have the sequences as vectors than as lists.
(setq seq-down-rows (coerce seq-down-rows 'simple-vector))
(setq seq-across-cols (coerce seq-across-cols 'simple-vector))
;; Generate and fill the SCORE matrix (*SCOREMAT*) and the
;; *POINTERMAT* array
(if gap-function
;; General routine
(generate-needleman-wunsch-scores
seq-down-rows seq-across-cols sub-function gap-function)
;; Optimized routine for linear gap penalty function
(progn
(when verbose
(format t "~%;; Gap penalty formula = P(n) = ~A + ((n-1) * ~A))~%"
gap-open-penalty gap-space-penalty))
(generate-needleman-wunsch-scores-linear-gap-function
seq-down-rows seq-across-cols sub-function
gap-open-penalty gap-space-penalty
)))
(when verbose
(when *substitution-matrix*
(pp-sub-matrix *substitution-matrix* constituents)
(pp-score-matrix seq-down-rows seq-across-cols)))
;; Find a best point along the outer column or last row of SCORE.
(setq *best-matches* (find-best-endpoints))
(setq *bestmatch* (first *best-matches*))
(when verbose
(format t "~&;; ~D Best endpoints.~%" (length *best-matches*))
(format t ";; Best points: ~S~%"
(mapcar #'(lambda (p) (nwpoint-list p)) *best-matches*))
(format t ";; Using first endpoint ~S~%" (nwpoint-list *bestmatch*))
)
;; Find all the best paths from that point.
(setq *all-points-paths*
(get-best-paths
(nwpoint-row *bestmatch*) (nwpoint-col *bestmatch*) *pointermat*))
;; Display some or all of the best alignments
;; anchored on the *bestmatch* point.
(let ((npaths (length *all-points-paths*)))
(when *nw-verbose* (format t "Number of best paths: ~D~%" npaths))
(when show-alignments?
(if (>= max-alignments-to-show npaths)
(dotimes (i npaths)
(format t "~&;; Alignment ~D (of ~D):~%" (1+ i) npaths)
(pp-point-path
(elt *all-points-paths* i) seq-down-rows seq-across-cols))
(let ((already-shown nil))
(do ((i 0 (1+ i)))
((= (length already-shown) max-alignments-to-show))
(let ((index (random npaths)))
(unless (member index already-shown)
(format t "~&;; Alignment ~D (of ~D):~%" (1+ index) npaths)
(pp-point-path
(elt *all-points-paths* index) seq-down-rows seq-across-cols)
(push index already-shown)
)))))))
)
;;; SUBSTITUTION MATRIX UTILITIES
;;; Create a matrix with two different values,
;;; one on the diagonal and one off the diagonal.
(defun float-diagonal-matrix (n exact-match-value non-match-value)
(let ((m (make-array (list n n)
:element-type 'single-float
:initial-element (float non-match-value 0.0))))
(declare (type (simple-array single-float 2) m))
(let ((fval (float exact-match-value 0.0)))
(dotimes (j n) (setf (aref m j j) fval)))
m
))
;;; Given the set of possible elements in the sequences,
;;; return a function of two arguments F(C1,C2). When called
;;; on any two sequence elements, returns the value of the substitution matrix
;;; for those two elements.
;;; So (CREATE-SUBSTITUTION-FUNCTION-AND-MATRIX '(A C G T))
;;; creates a 4x4 floating point identity matrix and returns a function
;;; F, such that (F 'A 'A) -> 1.0, (F 'C 'C) -> 1.0, (F 'C 'G) -> 0.0
;;; The key arguments allow one to customize the substitution matrix.
;;; For example:
#|
(create-substitution-function-and-matrix
'(A C G T) :default-matching-value 2.5
:default-nonmatching-value -1.0
:pairwise-values '((a c 17.0)))
Produces a matrix which looks like
2.5 17.0 -1.0 -1.0
17.0 2.5 -1.0 -1.0
-1.0 -1.0 2.5 -1.0
-1.0 -1.0 -1.0 2.5
|#
;;; and (F 'A 'C) -> 17.0, etc.
(defun create-substitution-function-and-matrix
(ordered-constituent-set
&key (default-matching-value 1.0)
(default-nonmatching-value 0.0)
(pairwise-values nil)
(pairwise-symmetric? t)
)
(dolist (triple pairwise-values)
(unless (and (member (first triple) ordered-constituent-set)
(member (second triple) ordered-constituent-set))
(error "Oops! One of the elements in ~S is not a sequence element"
triple)))
(let* ((matrix-size (length ordered-constituent-set))
(sub (float-diagonal-matrix
matrix-size
default-matching-value
default-nonmatching-value))
(map (make-hash-table :test #'eql))
)
(declare (type (simple-array single-float 2) sub))
;; Create mapping from elements of sequence alphabet to
;; indices of substitution matrix.
(do ((set ordered-constituent-set (cdr set)) (index 0 (1+ index)))
((null set))
(setf (gethash (car set) map) index))
;; Store user-specified special values in substitution matrix.
(dolist (triple pairwise-values)
(let* ((c1 (first triple))
(c2 (second triple))
(val (float (third triple) 0.0))
(down-rows-index (gethash c1 map))
(across-cols-index (gethash c2 map))
)
(setf (aref sub down-rows-index across-cols-index) val)
(when pairwise-symmetric?
(setf (aref sub across-cols-index down-rows-index) val))
))
;; Return substitution function as primary result.
;; It calculates the indices into the substitution matrix
;; (which it has closed over) and arefs the matrix to get the value.
(values
#'(lambda (c1 c2)
(declare (optimize (speed 3) (safety 0) (debug 0)))
(if (eql c1 c2)
(let ((index (gethash c1 map)))
(declare (fixnum index))
(aref sub index index))
(let ((down-rows-index (gethash c1 map))
(across-cols-index (gethash c2 map)))
(declare (fixnum down-rows-index across-cols-index))
(aref sub down-rows-index across-cols-index)
)))
sub
)))
;;;; MATRIX GENERATION
(defparameter *paths-hash-table* (make-hash-table :test #'eql))
(defun adjust-nw-arrays-to-size (nrows ncols)
(setq *scoremat*
(or (and (boundp '*scoremat*)
(arrayp *scoremat*)
(equal (array-dimensions *scoremat*) (list nrows ncols))
*scoremat*)
(make-array (list nrows ncols) :element-type 'single-float)
))
(setq *pointermat*
(or (and (boundp '*pointermat)
(arrayp *pointermat*)
(equal (array-dimensions *pointermat*) (list nrows ncols))
*pointermat*)
(make-array (list nrows ncols))
)))
(defun generate-needleman-wunsch-scores
(seq-down-rows seq-across-cols sub-function gap-function)
(declare (optimize (speed 3) (safety 0) (debug 0)))
(declare (type simple-vector seq-down-rows seq-across-cols))
(let* ((nrows (length seq-down-rows))
(ncols (length seq-across-cols))
(colmax 0.0) (rowmax 0.0) (max 0.0)
(no-gap-score 0.0)
(subval 0.0)
collist rowlist pointlist
)
(declare (single-float colmax rowmax max no-gap-score subval))
(declare (fixnum nrows ncols))
(adjust-nw-arrays-to-size nrows ncols)
;; We are generating new scores, so cached paths will no
;; longer be valid.
(clrhash *paths-hash-table*)
(let ((scoremat *scoremat*) (pointmat *pointermat*))
(declare (type (simple-array t 2) pointmat))
(declare (type (simple-array single-float 2) scoremat))
;; Make sure arrays are initialized.
(dotimes (i nrows)
(dotimes (j ncols)
(setf (aref scoremat i j) 0.0)
(setf (aref pointmat i j) nil)))
;; For each row
(dotimes (i nrows)
(declare (fixnum i))
;; Across each column
(dotimes (j ncols)
(declare (fixnum j))
;; Get the residues at (I,J) and determine their match value
;; via the SUBSTITUTION FUNCTION. This is SUBVAL.
(setq subval (funcall sub-function (aref seq-down-rows i)
(aref seq-across-cols j)))
;; Get the previously computed score at SCORE(I-1,J-1)
;; This is the NO-GAP-SCORE.
(if (and (> i 0) (> j 0))
(setq no-gap-score
(aref scoremat (the fixnum (1- i)) (the fixnum (1- j))))
(setq no-gap-score 0.0))
;; Find the best score in column j-1
(setq colmax -100.0)
(setq collist nil)
(when (> j 0)
;; For all points in column j-1 ranging from row 0 to
;; row i-2, but in reverse (right to left)
(loop for x from 2 to i do
(let* ((gaplen (the fixnum (1- x)))
(currow (the fixnum (- i x)))
(curcol (the fixnum (1- j)))
;; W(k), the penalty function for a gap of length k.
(penalty (funcall gap-function gaplen))
;; The previously computed score for this cell.
(col-point-score (aref scoremat currow curcol))
;; The score for this cell to be maxxed against
;; all the others in the same column
(colscore (+ col-point-score penalty)))
(declare (fixnum x gaplen currow curcol))
(declare (single-float penalty col-point-score colscore))
;; Keep track of all points which have the maximum score,
;; and what the maximum score is.
(when (>= colscore colmax)
(if (= colscore colmax)
(setq collist (cons (nwpoint currow curcol) collist))
(setq collist (list (nwpoint currow curcol))))
(setq colmax colscore)
))))
;; colmax now contains the best score in column j-1.
;; collist contains all points in the column j-1 that equal colmax.
;; Find the best score in row i-1
;; Algorithm is identical to above, but with row and column switched.
(setq rowmax -100.0)
(setq rowlist nil)
(when (> i 0)
(loop for y from 2 to j do
(let* ((gaplen (the fixnum (1- y)))
(currow (the fixnum (1- i)))
(curcol (the fixnum (- j y)))
(penalty (funcall gap-function gaplen))
(row-point-score (aref scoremat currow curcol))
(rowscore (+ row-point-score penalty)))
(declare (fixnum y gaplen currow curcol))
(declare (single-float row-point-score penalty rowscore))
(when (>= rowscore rowmax)
(if (= rowscore rowmax)
(setq rowlist (cons (nwpoint currow curcol) rowlist))
(setq rowlist (list (nwpoint currow curcol))))
(setq rowmax rowscore)
))))
;; rowmax now contains the best score in row i-1.
;; rowlist contains all points in the row i-1 that equal rowmax.
;; Now we have three potential best scores, NO-GAP-SCORE,
;; ROWMAX and COLMAX. One or more of these values may be equal,
;; and each of these values is associated with a set of
;; points that have this value.
;; So figure out THE maximum value, and append together all
;; the points that equal this maximum value. Store this
;; pointlist away at index (I,J) is the POINTMAT array.
;; Finally, set SCORE(I,J) equal to this maximum value + SUB(I,J)
;; as the algorithm dictates.
(setq pointlist nil)
(setq max (max no-gap-score rowmax colmax))
(if (and (= max no-gap-score) (> i 0) (> j 0))
(setq pointlist (cons (nwpoint (1- i) (1- j)) pointlist)))
(if (= max rowmax) (setq pointlist (append pointlist rowlist)))
(if (= max colmax) (setq pointlist (append pointlist collist)))
(setf (aref pointmat i j) pointlist)
(setf (aref scoremat i j) (+ subval max))
)) ; Close loops
)))
;;; GAP-OPEN and GAP-DELTA must be non-negative numbers!
;;; (So they represent the magnitude of the penalty)
;;; Penalty function is - ((N*GAP-DELTA) + GAP-OPEN)
(defun generate-needleman-wunsch-scores-linear-gap-function
(seq-down-rows seq-across-cols sub-function gap-open gap-delta)
(declare (optimize (speed 3) (safety 0) (debug 0)))
(declare (type simple-vector seq-down-rows seq-across-cols))
(let* ((nrows (length seq-down-rows))
(ncols (length seq-across-cols))
(large-negative-float (float most-negative-fixnum 0.0))
(rowmax 0.0) (themax 0.0) (no-gap-score 0.0) (subval 0.0)
(colmax (make-array (list ncols) :element-type 'single-float))
(colmax-points (make-array (list ncols)))
(backpoints nil)
rowmax-points
)
(declare (single-float rowmax themax no-gap-score subval))
(declare (fixnum nrows ncols))
(declare (type (simple-array single-float 1) colmax))
(declare (type simple-vector colmax-points))
(adjust-nw-arrays-to-size nrows ncols)
;; We are generating new scores, so cached paths will no
;; longer be valid.
(clrhash *paths-hash-table*)
(let ((scoremat *scoremat*) (pointmat *pointermat*))
(declare (type (simple-array t 2) pointmat))
(declare (type (simple-array single-float 2) scoremat))
;; Make sure arrays, etc. are initialized.
(dotimes (i nrows)
(dotimes (j ncols)
(setf (aref scoremat i j) 0.0)
(setf (aref pointmat i j) nil)))
(dotimes (j ncols)
(setf (aref colmax j) large-negative-float)
(setf (aref colmax-points j) nil))
;; For each row
(dotimes (i nrows)
(declare (fixnum i))
(setq rowmax large-negative-float)
(setq rowmax-points nil)
;; Across each element of the row
(dotimes (j ncols)
(declare (fixnum j))
;; Get the residues at (I,J) and determine their match value
;; via the SUBSTITUTION FUNCTION. This is SUBVAL.
(setq subval (funcall sub-function (aref seq-down-rows i)
(aref seq-across-cols j)))
;; Get the previously computed score at SCORE(I-1,J-1)
;; This is the NO-GAP-SCORE.
(if (and (> i 0) (> j 0))
(setq no-gap-score
(aref scoremat (the fixnum (1- i)) (the fixnum (1- j))))
(setq no-gap-score 0.0))
;; Find the best score for all elements in column j-1 above
;; row i-1, and keep a list of the points in that column that
;; match that score. The previous best score for all elements
;; in column j-1 above row i-2 is kept in COLMAX[j], and the
;; list of points that equal that score is in COLMAX-POINTS[j].
(unless (or (< i 2) (zerop j))
(let* ((newi (the fixnum (- i 2)))
(newj (the fixnum (- j 1)))
(newpoint (nwpoint newi newj))
(addgapval (- (aref colmax j) gap-delta))
(newspotval (- (aref scoremat newi newj) gap-open)))
(setf (aref colmax j) (max addgapval newspotval))
(cond
((= addgapval newspotval)
(push newpoint (aref colmax-points j)))
((> newspotval addgapval)
(setf (aref colmax-points j) (list newpoint)))
((< newspotval addgapval) nil)
)
(unless (aref colmax-points j) (error "Internal error col"))
))
;; Find the best score for all elements in row i-1 to the
;; left of column j-1, and keep a list of the
;; points in that row that match that score.
;; The best score is either the previous best score for the row
;; minus the penalty for one more space in the gap, or the score for
;; the new element in the row (at j-2) minus the penalty for
;; starting a gap (since if we go back from (i,j) to (i-1,j-2)
;; we've opened a one-space gap)
(unless (or (zerop i) (< j 2))
(let* ((newi (the fixnum (- i 1)))
(newj (the fixnum (- j 2)))
(newpoint (nwpoint newi newj))
(addgapval (- rowmax gap-delta))
(newspotval (- (aref scoremat newi newj) gap-open)))
(setq rowmax (max addgapval newspotval))
(cond
((= addgapval newspotval)
(push newpoint rowmax-points))
((> newspotval addgapval)
(setq rowmax-points (list newpoint)))
((< newspotval addgapval) nil))
(unless rowmax-points (error "Internal error row"))
))
(setf backpoints nil)
(setq themax (max no-gap-score rowmax (aref colmax j)))
(when (and (plusp i) (plusp j))
(when (= themax no-gap-score)
(push (nwpoint (1- i) (1- j)) backpoints))
(when (= themax rowmax)
(setq backpoints (append backpoints rowmax-points)))
(when (= themax (aref colmax j))
(setq backpoints (append backpoints (aref colmax-points j))))
(when (null backpoints) (error "Internal error 27"))
)
(setf (aref pointmat i j) backpoints)
(setf (aref scoremat i j) (+ themax subval))
)) ; Close loops
)))
;;;; FINDING THE BEST PATHS GIVEN THE GENERATED MATRIX.
;;; Returns a list of all the points in last row and/or last column that
;;; have the maximum score.
(defun find-best-endpoints ()
(let* ((s *scoremat*)
(nrows (array-dimension s 0))
(ncols (array-dimension s 1))
(lastrow (1- nrows))
(lastcol (1- ncols))
(bestscore (float most-negative-fixnum 0.0))
(score 0.0)
(pointlist nil))
(declare (fixnum nrows ncols lastrow lastcol))
(declare (single-float bestscore score))
(declare (type (array single-float 2) s))
;; First search the last column for the highest score.
(dotimes (currow nrows)
(setq bestscore (max bestscore (aref s currow lastcol))))
;; Then search the last row.
(dotimes (curcol ncols)
(setq bestscore (max bestscore (aref s lastrow curcol))))
;; Okay, we have the best score. Cons up a list of
;; all points that are equal to this best score.
(dotimes (currow nrows)
(setq score (aref s currow lastcol))
(when (= score bestscore) (push (nwpoint currow lastcol) pointlist)))
;; Careful not to get bottom-right point twice!
(dotimes (curcol (1- ncols))
(setq score (aref s lastrow curcol))
(when (= score bestscore) (push (nwpoint lastrow curcol) pointlist)))
pointlist
))
;; Return a list of all optimal paths ending at (I,J).
;; The paths are lists of NWPOINTs.
;; For any given point (I,J), there is a unique set of best paths
;; that go back from it. If two paths converge on point (I,J), then
;; from that point backwards they will encompass the same set of paths.
;; So once we compute the set of best paths from (I,J), we store that
;; set in a hash table and retrieve it if we ever hit (I,J) again.
;; This saves mucho computation for large matrices.
(defun get-best-paths (i j pointmat)
(setq *all-points-paths*
(mapcar
#'reverse
(get-best-paths-internal i j pointmat)
)))
(defun get-best-paths-internal (i j pointmat)
(declare (fixnum i j))
(declare (type (simple-array t 2) pointmat))
(declare (optimize (speed 3) (safety 0) (debug 0)))
;; If (I,J) is already hashed were done.
(let ((hashval (gethash (nwpoint i j) *paths-hash-table*)))
(when hashval (return-from get-best-paths-internal hashval)))
(let* ((backcol 0) (backrow 0)
(curpoint (nwpoint i j))
(best-paths nil) (result nil))
(declare (fixnum backcol backrow))
(declare (type nwpoint curpoint))
(setq result
(cond
;; The path begins HERE! (since there is no previous point)
((null (aref pointmat i j)) (list (list (nwpoint i j))))
(t
;; For each backpointer, find the best paths from there, then
;; chain the current point on to all those paths.
(loop for x in (aref pointmat i j) do
(setq backrow (nwpoint-row x))
(setq backcol (nwpoint-col x))
(setq best-paths
(get-best-paths-internal backrow backcol pointmat))
append
(mapcar #'(lambda (path) (cons curpoint path)) best-paths)
))))
(setf (gethash curpoint *paths-hash-table*) result)
result
))
;;;; PATHS TO ALIGNMENTS FUNCTIONALITY