-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathadtstralgs.pas.mcp
733 lines (644 loc) · 21.9 KB
/
adtstralgs.pas.mcp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
(* This file is a part of the PascalAdt library, which provides
commonly used algorithms and data structures for the FPC and
Delphi compilers.
Copyright (C) 2004 by Lukasz Czajka
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1 of
the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
02110-1301 USA *)
unit adtstralgs;
{ This unit provides various string algorithms, including the
Knuth-Morris-Pratt, Boyer-Moore and Karp-Miller-Rosenberg algorithms. }
interface
uses
SysUtils, adtdarray;
&include adtdefs.inc
type
{ table built by KmpComputeTable and consumed by KmpFindSubstr }
TKmpTable = array of IndexType;
{ table built by KmpReverseComputeTable and consumed by
KmpReverseFindSubstr }
TKmpReverseTable = array of IndexType;
{ table built by BmComputeTable and consumed by BmFindSubstr }
TBmTable = array of IndexType;
{ result type of KmrFindSubstrings }
TCardinalArray = array of Cardinal;
{ set of characters, as accepted by Replace and ReplaceAll }
TCharSet = set of Char;
{ ------------------------ General algorithms ------------------------------ }
{ reverses the substring [start..finish-1] of <str> so that it becomes its
own 'mirror reflection' and returns the resulting string; e.g. for
<str> = 'abcdef', start = 2 and finish = 6 we get Result = 'aedcbf';
@complexity O(n); }
function Reverse(const str : String; start, finish : IndexType) : String;
{ replaces the first occurrence, at or after <startIndex>, of any of the
characters from <chars> in <str> with <c>; returns the position of the
character replaced or -1 if none was replaced; @complexity O(n) }
function Replace(var str : String; chars : TCharSet; c : Char;
startIndex : IndexType) : IndexType;
{ replaces every occurrence of any of the characters from <chars> in
<str> with <c>; returns the result, <str> itself is not modified;
@complexity O(n) }
function ReplaceAll(const str : String; chars : TCharSet; c : Char) : String;
{ finds <str2> in <str1>, starting from startIndex; returns the index of
the position at which <str2> is found or -1 if it is not found;
@complexity O(n); }
function FindSubstr(const str1, str2 : String;
startIndex : IndexType) : IndexType; overload;
{ the same as above, but assumes startIndex = 1 }
function FindSubstr(const str1, str2 : String) : IndexType; overload;
{ counts the occurrences of <str2> in <str1>; overlapping occurrences
_are_ included; <tab> is passed on as the table argument of
KmpFindSubstr, so it should be the table computed by KmpComputeTable
for <str2>; uses the Knuth-Morris-Pratt algorithm (KmpFindSubstr);
@complexity O(n) }
function CountSubstrings(const str1, str2 : String;
const tab : TKmpTable) : SizeType;
{ ------------------------ Specific algorithms ------------------------------ }
{ 'naive', straightforward substring search algorithm; returns the index
at which <str2> is found in <str1> or -1 if it is not found;
@complexity worst-case O(n^2) }
function NaiveFindSubstr(const str1, str2 : String;
startIndex : IndexType) : IndexType;
{ computes the table needed by KmpFindSubstr; this should be called for the
string searched for (the pattern), and the table may later be reused in
many different searches; returns nil for an empty string;
@complexity O(n) }
function KmpComputeTable(const str : String) : TKmpTable;
{ returns the first index at which <str2> is found in <str1>; starts
searching forward from startIndex; if <str2> is not found then
returns -1; the table parameter should be the table generated by
KmpComputeTable for <str2>; the algorithm used is the
Knuth-Morris-Pratt algorithm; @complexity worst-case O(n) }
function KmpFindSubstr(const str1, str2 : String; startIndex : IndexType;
const table : TKmpTable) : IndexType;
{ computes the table needed by KmpReverseFindSubstr; returns nil for an
empty string; @complexity O(n); }
function KmpReverseComputeTable(const str : String) : TKmpReverseTable;
{ the same as KmpFindSubstr but searches from the back of <str1>; the
first window examined ends at position finishIndex - 1;
@complexity O(n) }
function KmpReverseFindSubstr(const str1, str2 : String; finishIndex : IndexType;
const table : TKmpReverseTable) : IndexType;
{ computes the table needed by the Boyer-Moore algorithm (BmFindSubstr);
@complexity O(n); }
function BmComputeTable(const str2 : String) : TBmTable;
{ uses the Boyer-Moore algorithm to find <str2> in <str1>, searching
forward from startIndex; returns the index at which <str2> is found or
-1 if it is not found; the table parameter should be the table
generated by BmComputeTable for <str2>; although the worst-case
complexity is O(n), in most cases this algorithm is better than the
Knuth-Morris-Pratt algorithm. However, computing the table takes
longer. @complexity O(n) }
function BmFindSubstr(const str1, str2 : String; startIndex : IndexType;
const table : TBmTable) : IndexType;
{ implements the Karp-Miller-Rosenberg algorithm for computing the
equivalence classes of all substrings with length len in str; if the
table returned contains the same number at index i and j, it means
that the substrings of length len starting at i and at j are equal;
only positions from [start,finish-len] in the returned array contain
valid indices !! others are undefined; len is the desired length of
substrings, should be > 1; start and finish designate the range
[start,finish) in which the search is performed;
@complexity O(n*log(n)) }
function KmrFindSubstrings(const str : String; start, finish : IndexType;
len : SizeType) : TCardinalArray;
implementation
uses
adtutils;
const
{ FindSubstr calls NaiveFindSubstr instead of KmpFindSubstr when the
number of characters left in str1 from startIndex exceeds Length(str2)
by at most this many characters }
strMaxLengthForNaiveFind = 10;
{ ------------------------- General algorithms ---------------------------- }
{ Returns a copy of <str> in which the characters at positions
  [start..finish-1] appear in reverse order; all other characters keep
  their place. The indices are not validated. }
function Reverse(const str : String; start, finish : IndexType) : String;
var
  lo, hi : IndexType;
  tmp : Char;
begin
  Result := str;
  lo := start;
  hi := finish - 1; { finish is exclusive }
  { swap the outermost pair and move both ends towards the middle }
  while lo < hi do
  begin
    tmp := Result[hi];
    Result[hi] := Result[lo];
    Result[lo] := tmp;
    Inc(lo);
    Dec(hi);
  end;
end;
{ Overwrites with <c> the first character of <str>, at or after
  <startIndex>, that belongs to <chars>. Returns the position of the
  character that was overwritten, or -1 when no character of <chars>
  occurs in that part of <str>. }
function Replace(var str : String; chars : TCharSet; c : Char;
                 startIndex : IndexType) : IndexType;
var
  idx : IndexType;
begin
  Result := -1;
  for idx := startIndex to Length(str) do
  begin
    if str[idx] in chars then
    begin
      str[idx] := c;
      Result := idx;
      break;
    end;
  end;
end;
{ Returns a copy of <str> in which every character that belongs to
  <chars> has been overwritten with <c>. <str> itself is left untouched. }
function ReplaceAll(const str : String; chars : TCharSet; c : Char) : String;
var
  k : IndexType;
begin
  Result := str;
  for k := 1 to Length(Result) do
  begin
    if Result[k] in chars then
      Result[k] := c;
  end;
end;
{ Finds <str2> in <str1>, searching forward from <startIndex>. Returns
  the index at which <str2> starts, or -1 if it is not found.
  Dispatches to NaiveFindSubstr when only a few alignments are possible
  (at most strMaxLengthForNaiveFind characters of slack) and to
  KmpFindSubstr otherwise. }
function FindSubstr(const str1, str2 : String;
                    startIndex : IndexType) : IndexType;
var
  len : SizeType;
begin
  { number of characters of str1 available from startIndex onwards }
  len := Length(str1) - startIndex + 1;
  if len < Length(str2) then
    Result := -1
  else if len = Length(str2) then
  begin
    { exactly one alignment is possible: str2 against the tail of str1
      that begins at startIndex; compare that tail (not the whole of
      str1) and report its real position }
    if Copy(str1, startIndex, len) = str2 then
      Result := startIndex
    else
      Result := -1;
  end else if len - Length(str2) <= strMaxLengthForNaiveFind then
    Result := NaiveFindSubstr(str1, str2, startIndex)
  else
    Result := KmpFindSubstr(str1, str2, startIndex,
                            KmpComputeTable(str2));
end;
{ Convenience overload: searches for <str2> in the whole of <str1>, i.e.
  calls the three-argument FindSubstr with startIndex = 1. }
function FindSubstr(const str1, str2 : String) : IndexType;
const
  firstIndex = 1;
begin
  Result := FindSubstr(str1, str2, firstIndex);
end;
{ Counts how many times KmpFindSubstr locates <str2> in <str1>. Each new
  search starts one position after the previous hit, so overlapping
  occurrences are counted. <tab> is handed to KmpFindSubstr unchanged. }
function CountSubstrings(const str1, str2 : String;
                         const tab : TKmpTable) : SizeType;
var
  hit : IndexType;
begin
  Result := 0;
  hit := KmpFindSubstr(str1, str2, 1, tab);
  while hit <> -1 do
  begin
    Inc(Result);
    { resume right after the start of the occurrence just found }
    hit := KmpFindSubstr(str1, str2, hit + 1, tab);
  end;
end;
{ ------------------------ Specific algorithms ------------------------------ }
{ Straightforward substring search: tries every alignment of <str2>
  against <str1>, starting at <startIndex>, and compares character by
  character. Returns the index of the first alignment that matches, or
  -1 if there is none. Worst-case O(n^2). }
function NaiveFindSubstr(const str1, str2 : String;
                         startIndex : IndexType) : IndexType;
var
  j, lastStart : IndexType;
begin
  Result := -1;
  { the last alignment at which str2 still fits inside str1; this
    position itself must be tested, otherwise a match at the very end
    of str1 is missed }
  lastStart := Length(str1) - Length(str2) + 1;
  while startIndex <= lastStart do
  begin
    j := 1;
    while (j <= Length(str2)) and (str1[startIndex + j - 1] = str2[j]) do
      Inc(j);
    if j > Length(str2) then
    begin
      { every character of str2 matched }
      Result := startIndex;
      break;
    end;
    Inc(startIndex);
  end; { end while }
end;
{ Builds the table used by KmpFindSubstr for the pattern <str>. The
  table has Length(str) + 1 entries; entry i holds the length of the
  longest proper prefix of str[1..i] that is also a suffix of str[1..i].
  Entries 0 and 1 are 0. Returns nil for an empty <str>. }
function KmpComputeTable(const str : String) : TKmpTable;
var
  pos, border : IndexType;
begin
  if Length(str) = 0 then
  begin
    Result := nil;
    Exit;
  end;
  SetLength(Result, Length(str) + 1);
  Result[0] := 0;
  Result[1] := 0;
  for pos := 2 to Length(str) do
  begin
    { start from the border of the previous prefix and fall back to
      shorter and shorter borders until one can be extended by str[pos] }
    border := Result[pos - 1];
    while (border <> 0) and (str[pos] <> str[border + 1]) do
      border := Result[border];
    if str[pos] = str[border + 1] then
      Result[pos] := border + 1
    else
      Result[pos] := border;
  end;
end;
{ Knuth-Morris-Pratt forward search. Returns the first index, at or
  after <startIndex>, at which <str2> occurs in <str1>, or -1 if there is
  no such index. <table> must be the table produced by KmpComputeTable
  for <str2>. Returns -1 when either string is empty or <table> is nil. }
function KmpFindSubstr(const str1, str2 : String; startIndex : IndexType;
                       const table : TKmpTable) : IndexType;
var
  j, lastStart : IndexType;
begin
  Result := -1;
  if (Length(str1) = 0) or (Length(str2) = 0) or (table = nil) then
  begin
    Exit;
  end;
  { table[i] contains the length of a maximal suffix of
    str2[1..i] that is also a prefix of str2[1..i]; table[0] =
    0; formally: table[i] = max(k in [0,i-1] : str2[1..k] =
    str2[i-k+1..i]) }
  { startIndex - current position within str1 }
  { j - current position within str2 }
  { the last alignment at which str2 still fits inside str1; it has to
    be examined too, otherwise an occurrence at the very end of str1
    is never reported }
  lastStart := Length(str1) - Length(str2) + 1;
  j := 0;
  while startIndex <= lastStart do
  begin
    { the first table[j] characters are already known to match }
    j := table[j] + 1;
    while (j <= Length(str2)) and (str1[startIndex + j - 1] = str2[j]) do
      Inc(j);
    if j > Length(str2) then
    begin
      Result := startIndex;
      break;
    end;
    Dec(j); { now str2[1..j] = str1[startIndex..startIndex+j-1] }
    { shift so that the longest border of the matched part lines up;
      always advance by at least one position }
    if j - table[j] = 0 then
      startIndex := startIndex + 1
    else
      startIndex := startIndex + j - table[j];
  end; { end while }
end;
{ Builds the table used by KmpReverseFindSubstr for the pattern <str>.
  It mirrors KmpComputeTable but is filled from the back to the front:
  entry i (1 <= i <= n, n = Length(str)) holds the length of the longest
  proper prefix of str[i..n] that is also its suffix. The array has
  n + 2 entries; entries n and n + 1 are 0. Returns nil for an empty
  <str>. }
function KmpReverseComputeTable(const str : String) : TKmpReverseTable;
var
  pos, cand, n : IndexType;
begin
  n := Length(str);
  if n = 0 then
  begin
    Result := nil;
    Exit;
  end;
  SetLength(Result, n + 2);
  Result[n + 1] := 0;
  Result[n] := 0;
  for pos := n - 1 downto 1 do
  begin
    { cand walks through the suffixes whose borders are candidates for
      being extended to the left by str[pos]; cand = n + 1 means that no
      candidate is left }
    cand := pos + 1;
    while (cand <> n + 1) and (str[pos] <> str[n - Result[cand]]) do
      cand := n - Result[cand] + 1;
    if cand = n + 1 then
      Result[pos] := 0
    else
      Result[pos] := Result[cand] + 1; { str[pos] = str[n - Result[cand]] }
  end;
end;
{ Knuth-Morris-Pratt search running from the back of <str1> towards its
  front. Only occurrences that end before <finishIndex> are considered
  (the first window examined ends at position finishIndex - 1). Returns
  the index at which the occurrence found starts, or -1 if there is none.
  <table> must be the table produced by KmpReverseComputeTable for
  <str2>. Returns -1 when either string is empty or <table> is nil. }
function KmpReverseFindSubstr(const str1, str2 : String; finishIndex : IndexType;
                              const table : TKmpReverseTable) : IndexType;
var
  i, j : IndexType;
begin
  Result := -1;
  if (table = nil) or (Length(str2) = 0) or (Length(str1) = 0) then
  begin
    Exit;
  end;
  { table[i] contains the length of a maximal prefix of
    str2[i..Length(str2)] that is also its suffix; formally: table[i]
    = max(k in [0,Length(str2)-i] : str2[i..i+k-1] =
    str2[Length(str2)-k+1..Length(str2)]) }
  { i - position in str1 of the last character of the current window }
  { j - number of characters compared, counted from the back of str2 }
  j := 0;
  i := finishIndex - 1;
  { never start beyond the end of str1 }
  if i > Length(str1) then
    i := Length(str1);
  { a window ending at i starts at i - Length(str2) + 1, which must be
    at least 1; once i drops below Length(str2) no occurrence can fit
    and comparing further would read in front of str1[1] }
  while i >= Length(str2) do
  begin
    j := table[Length(str2) + 1 - j] + 1;
    while (j <= Length(str2)) and
          (str1[i - j + 1] = str2[Length(str2) + 1 - j]) do
    begin
      Inc(j);
    end;
    if j > Length(str2) then
    begin
      Result := i - Length(str2) + 1;
      break;
    end;
    Dec(j); { now str1[i-j+1..i] = str2[Length(str2)+1-j..Length(str2)] }
    { shift towards the front; always move by at least one position }
    if j - table[Length(str2) + 1 - j] = 0 then
      i := i - 1
    else
      i := i - j + table[Length(str2) + 1 - j];
  end; { end while }
end;
{ Builds the table that BmFindSubstr adds to the current position after
a mismatch at pattern index j (it reads table[j] for j in
[1,Length(str2)]). The array returned has Length(str2) + 2 entries.
The computation is based on the table returned by
KmpReverseComputeTable for the same string. }
function BmComputeTable(const str2 : String) : TBmTable;
var
tab : TKmpReverseTable;
i, j, k, s : IndexType;
len, filler : SizeType;
begin
tab := KmpReverseComputeTable(str2);
len := Length(str2);
Result := nil;
SetLength(Result, len + 2);
{ filler marks the entries that have not been assigned a value yet }
filler := len + 2;
for i := 0 to len do
begin
Result[i] := filler;
end;
{ there must be two conditions satisfied for result[j] = s: }
{ cond1(s,j) : for each k in (j,len] s >= k or str[k - s] =
str[k]; in other words, the end of the substring (j,len] is
equal to the end of the substring [1,len-s] }
{ graphically: <> <> (<> marks equal distances) }
{ +----------------+ +-----------------+ }
{ |XX| | |XX| or | |XXX| | |XXX| }
{ +----------------+ +-----------------+ }
{ j s j-s s j }
{ The two groups of X's are equal. }
{ cond2(s,j) : s >= j or str[j - s] <> str[j]; }
{ the s chosen is a minimal one that satisfies both conditions
}
{ In the algorithm we use the reversed table of maximal suffixes
that are also prefixes of a given substring, obtained from the
KmpReverseComputeTable routine. We rely on the fact that the
X's marked in the above picture are both suffixes and prefixes
of some substring of str ending at str's last position (they
need not necessarily be maximal). Notice that the larger
the position i at which the first group of X's starts, the
smaller s is. So we begin with i = 1 and increase i until it
reaches len. In each iteration we find all prefixes that are
also suffixes of str[i..len], and for each of them we compute
j and s for this j, if cond2 is satisfied (the situation shown
in the right picture above). Because s decreases as i
increases, we are guaranteed to choose the smallest s for a
given j. However, not all j's are assigned a value in this
computation. Those which are not correspond to the situation
in the left picture above. We handle them in the second loop
by assigning the s computed for the largest suffix of str that
is also its prefix. It guarantees that s is minimal.
Additionally, we require s > j. }
for i := 1 to len do
begin
k := i;
repeat
j := len - tab[k];
s := j - i + 1;
{ cond2(s,j) : note that j - s = i - 1, so the character comparison
below is only evaluated when i > 1; this relies on short-circuit
boolean evaluation }
if (j <= s) or (str2[j - s] <> str2[j]) then
begin
Result[j] := s;
end;
{ get the next prefix that is also a suffix of str[i..len] }
k := len - tab[k] + 1;
until k = len + 1;
end;
{ second pass: give every entry still equal to filler the current value
of s; s is updated to i whenever an already assigned entry is met }
s := len; /// original author's remark: not even used
for i := len downto 1 do
begin
if Result[i] = filler then
Result[i] := s
else
s := i;
end;
end;
{ Boyer-Moore forward search. Returns the first index, at or after
  <startIndex>, at which <str2> occurs in <str1>, or -1 if there is no
  such index. <table> must be the table produced by BmComputeTable for
  <str2>. For an empty <str2> the comparison loop is skipped and
  <startIndex> itself is returned, provided it does not exceed
  Length(str1) + 1. }
function BmFindSubstr(const str1, str2 : String; startIndex : IndexType;
                      const table : TBmTable) : IndexType;
var
  j, lastStart : IndexType;
begin
  Result := -1;
  { the last alignment at which str2 still fits inside str1; it must be
    examined as well, otherwise an occurrence at the very end of str1
    is never reported }
  lastStart := Length(str1) - Length(str2) + 1;
  while startIndex <= lastStart do
  begin
    { compare the pattern against the current window from right to left }
    j := Length(str2);
    while (j > 0) and (str1[startIndex + j - 1] = str2[j]) do
    begin
      Dec(j);
    end;
    if j = 0 then
    begin
      { all characters matched }
      Result := startIndex;
      break;
    end;
    { mismatch at pattern position j: advance by the precomputed shift }
    startIndex := startIndex + table[j];
  end;
end;
{ Karp-Miller-Rosenberg numbering of the substrings of length <len> that
start inside [start,finish) of <str>. The array returned has <finish>
entries; after the call, two positions that were renumbered in the last
pass hold the same number exactly when the substrings of length <len>
starting at them are equal. Returns nil when start = finish.
The routine first numbers single characters by their ordinal value and
then repeatedly doubles the substring length (CreateSubstrings) up to
the largest power of two not exceeding <len>; one more pass combines two
overlapping substrings of that length when <len> is not a power of two. }
function KmrFindSubstrings(const str : String; start, finish : IndexType;
len : SizeType) : TCardinalArray;
const
BITS_IN_CHAR = 8;
MAX_CHARS = 1 shl BITS_IN_CHAR;
MASK = MAX_CHARS - 1;
{ highest bucket index used by the bucket sort }
MAX_TAB_IND = MAX_CHARS - 1;
{ number of key digits (entries of Num) used for each of the three size
groups, see sgroup below }
CHARS_NUM1 = 2;
CHARS_NUM2 = 4;
CHARS_NUM3 = (SizeOf(IndexType) * 2) div SizeOf(Char);
type
PTriple = ^TTriple;
{ one list node per substring: Num is the sort key (the numbers of the
two halves split into digits), Ind is the starting position of the
substring, Next links the nodes into a singly linked list }
TTriple = record
Num : array[0..CHARS_NUM3 - 1] of Char;
Ind : IndexType;
Next : PTriple;
end;
{ a bucket of the bucket sort: a list with head and tail pointers }
TBucket = record
First, Last : PTriple;
end;
var
table : array[0..MAX_TAB_IND] of TBucket; { for bucket sort }
lastNum : array[0..CHARS_NUM3 - 1] of Char;
list, freelist, node, nnode : PTriple;
sgroup : 1..3; { sgroup = 1 : Length(str) < MAX_CHARS; 2 : Length(str) <
MAX_CHARS*MAX_CHARS; 3 : Length(str) >= MAX_CHARS^2 }
p, loglen, slen : SizeType;
{ maxj - index of the last key digit in use for the current sgroup }
maxj, ii : IndexType;
{ Renumbers the substrings: on entry Result[i] identifies the substring
of the current length starting at i; on exit Result[i], for i in
[start,finish-l], identifies the substring of length l starting at i.
The key of a new substring is the pair (Result[i], Result[i + off]);
the keys are sorted digit by digit with a bucket sort and equal keys
receive equal numbers. }
procedure CreateSubstrings(l, off : SizeType);
var
i, j, k : IndexType;
equal : Boolean;
begin
{ Items(p)[i] denotes a substring with length p starting at
str[i] }
{ p is assumed to be the current length of the substrings }
{ l - the new length of the substrings }
{ off - the offset added to the index of a substring to get the next
adjacent substring }
{ create substrings of length l; just merge adjacent
substrings (i.e. their numbers which identify them) }
for i := start to finish - l do
begin
{ reuse a node from freelist when one is available }
if freelist <> nil then
begin
node := freelist;
freelist := freelist^.Next;
end else
New(node); { may raise }
with node^ do
begin
{ split the two numbers into digits of BITS_IN_CHAR bits, least
significant digit first; the number of digits per number depends
on sgroup }
case sgroup of
1:
begin
Num[0] := Char(Result[i]);
Num[1] := Char(Result[i + off]);
end;
2:
begin
Num[0] := Char(Result[i] and MASK);
Num[1] := Char((Result[i] shr BITS_IN_CHAR) and MASK);
Num[2] := Char(Result[i + off] and MASK);
Num[3] := Char((Result[i + off] shr BITS_IN_CHAR) and
MASK);
end;
3:
begin
{ NOTE(review): this branch writes Num[0..7], so it assumes
CHARS_NUM3 >= 8 - verify for the IndexType and Char sizes in use }
Num[0] := Char(Result[i] and MASK);
Num[1] := Char((Result[i] shr BITS_IN_CHAR) and MASK);
Num[2] := Char((Result[i] shr (2*BITS_IN_CHAR)) and MASK);
Num[3] := Char((Result[i] shr (3*BITS_IN_CHAR)) and MASK);
Num[4] := Char(Result[i + off] and MASK);
Num[5] := Char((Result[i + off] shr BITS_IN_CHAR) and
MASK);
Num[6] := Char((Result[i + off] shr (2*BITS_IN_CHAR)) and
MASK);
Num[7] := Char((Result[i + off] shr (3*BITS_IN_CHAR)) and
MASK);
end;
end;
Ind := i;
Next := list;
end;
list := node;
end; { end for }
{ sort list lexicographically according to the Num field }
for i := 0 to MAX_TAB_IND do
begin
with table[i] do
begin
First := nil;
Last := nil;
end;
end;
{ one distribution pass per key digit, from digit maxj down to digit 0;
nodes are appended at the tail of their bucket, so the order produced
by earlier passes is kept among nodes with an equal digit }
j := maxj;
while j >= 0 do
begin
{ distribute: move every node of list to the bucket of its j-th digit }
while list <> nil do
begin
node := list;
list := list^.Next;
k := SizeType(node^.Num[j]);
if table[k].First <> nil then
begin
table[k].Last^.Next := node;
end else
begin
table[k].First := node;
end;
table[k].Last := node;
node^.Next := nil;
end;
{ collect: concatenate the buckets, in index order, back into list and
leave every bucket empty }
node := nil; { will point to the last node in list }
for i := 0 to MAX_TAB_IND do
begin
with table[i] do
begin
if First <> nil then
begin
if node <> nil then
node^.Next := First
else
list := First;
node := Last;
Last := nil;
First := nil;
end;
end;
end;
Dec(j);
end; { end while }
{ assign numbers to the substrings with the length l; nodes with equal
Num fields become equal substrings; }
// a preparation for the loop
if list <> nil then
begin
for j := 0 to maxj do
begin
lastNum[j] := list^.Num[j];
end;
end;
i := 0;
while list <> nil do
begin
{ equal is true if the current node (in list) is equal to
the previous one (i.e. the one analysed in the previous
iteration of the loop). Since nodes are ordered
lexicographically the new substrings will be assigned
equal numbers if and only if they are equal }
equal := true;
for j := 0 to maxj do
begin
if list^.Num[j] <> lastNum[j] then
begin
equal := false;
lastNum[j] := list^.Num[j];
end;
end;
if not equal then
Inc(i); { the next sequence of equal substrings - a new
id needed }
Result[list^.Ind] := i;
{ move the processed node from list to the front of freelist }
node := freelist;
freelist := list;
list := list^.Next;
freelist^.Next := node;
end; { end while list <> nil }
end; { end CreateSubstrings }
begin
Assert((start <= finish) and (finish <= Length(str) + 1) and (len > 1));
if start = finish then
begin
Result := nil;
exit;
end;
SetLength(Result, finish);
{ may raise, but harmless }
{ choose how many digits are needed to represent one substring number }
if Length(str) < MAX_CHARS then
sgroup := 1
else if Length(str) < MAX_CHARS * MAX_CHARS then
sgroup := 2
else
sgroup := 3;
case sgroup of
1:
maxj := CHARS_NUM1 - 1;
2:
maxj := CHARS_NUM2 - 1;
3:
maxj := CHARS_NUM3 - 1;
end;
try
list := nil;
freelist := nil;
p := 1; { the current length of the substrings }
loglen := FloorLog2(len);
{ slen - presumably the largest power of two not exceeding len; this
depends on FloorLog2 from adtutils, which is not shown here }
slen := 1 shl loglen;
{ substrings of length 1 are identified by the character itself }
for ii := start to finish - 1 do
Result[ii] := Ord(str[ii]);
repeat
{ (Result(2*p)[i] = Result(2*p)[j]) <=> (Result(p)[i] =
Result(p)[j]) and (Result(p)[i+p] = Result(p)[j+p]) }
CreateSubstrings(2*p, p);
p := 2*p;
until p = slen;
if slen <> len then
begin
{ (Result(len)[i] = Result(len)[j]) <=> (Result(slen)[i]
= Result(slen)[j]) and (Result(slen)[i+len-slen]
= Result(slen)[j+len-slen]) }
CreateSubstrings(len, len - slen);
end;
finally
{ release the nodes kept for reuse }
{ NOTE(review): only freelist is released here; if New raises inside
CreateSubstrings the nodes already linked into list appear to be left
allocated - confirm }
while freelist <> nil do
begin
nnode := freelist^.Next;
Dispose(freelist);
freelist := nnode;
end;
end;
end;
end.