adtstralgs.pas.mcp

(* This file is a part of the PascalAdt library, which provides
   commonly used algorithms and data structures for the FPC and
   Delphi compilers.

   Copyright (C) 2004 by Lukasz Czajka

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public License
   as published by the Free Software Foundation; either version 2.1 of
   the License, or (at your option) any later version.

   This library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
   02110-1301 USA *)

unit adtstralgs;

{ This unit provides various string algorithms, including the
  Knuth-Morris-Pratt, Boyer-Moore and Karp-Miller-Rosenberg algorithms. }

interface

uses
   SysUtils, adtdarray;

&include adtdefs.inc

type
   { table filled by KmpComputeTable and read by KmpFindSubstr and
     CountSubstrings; NOTE(review): IndexType is declared outside this
     file (presumably in adtdefs.inc) }
   TKmpTable = array of IndexType;
   { table filled by KmpReverseComputeTable and read by
     KmpReverseFindSubstr and BmComputeTable }
   TKmpReverseTable = array of IndexType;
   { shift table filled by BmComputeTable and read by BmFindSubstr }
   TBmTable = array of IndexType;
   { result type of KmrFindSubstrings }
   TCardinalArray = array of Cardinal;
   { set of characters accepted by Replace and ReplaceAll }
   TCharSet = set of Char;

{ ------------------------ General algorithms ------------------------------ }

{ reverses the substring [start..finish-1] of <str> so that it becomes
  its own 'mirror reflection' and returns the resulting string; <str>
  itself is not modified; e.g. for <str> = 'abcdef' and start = 2,
  finish = 6 we get Result = 'aedcbf'; @complexity O(n); }
function Reverse(const str : String; start, finish : IndexType) : String;
{ replaces the first occurrence, at or after <startIndex>, of one of
  the characters from <chars> in <str> with <c>; returns the position
  of the character replaced or -1 if none was replaced; @complexity
  O(n) }
function Replace(var str : String; chars : TCharSet; c : Char;
                 startIndex : IndexType) : IndexType;
{ replaces all occurrences of any of the characters from <chars> in
  <str> with <c>; returns the result; @complexity O(n) }
function ReplaceAll(const str : String; chars : TCharSet; c : Char) : String;
{ finds <str2> in <str1>, starting from startIndex; returns the index of
  the position at which <str2> is found or -1 if it is not found;
  @complexity O(n); }
function FindSubstr(const str1, str2 : String;
                    startIndex : IndexType) : IndexType; overload;
{ the same as above, but assumes startIndex = 1 }
function FindSubstr(const str1, str2 : String) : IndexType; overload;
{ counts the occurrences of <str2> in <str1>; overlapping occurrences
  _are_ included; <tab> is passed on as the table argument of
  KmpFindSubstr, so it should be the table computed by KmpComputeTable
  for <str2>; uses the Knuth-Morris-Pratt algorithm (KmpFindSubstr);
  @complexity O(n) }
function CountSubstrings(const str1, str2 : String;
                         const tab : TKmpTable) : SizeType;

{ ------------------------ Specific algorithms ------------------------------  }

{ 'naive', straightforward substring search algorithm; returns the
  index at which <str2> is found in <str1>, searching forward from
  startIndex, or -1 if it is not found; @complexity worst-case O(n^2) }
function NaiveFindSubstr(const str1, str2 : String;
                         startIndex : IndexType) : IndexType;
{ computes the table needed by KmpFindSubstr; this should be called for the
  string searched for, and may be later used in many different searches;
  returns nil for an empty string; @complexity O(n) }
function KmpComputeTable(const str : String) : TKmpTable;
{ returns the first index at which <str2> is found in str1; starts
  searching forward from startIndex; if <str2> is not found then
  returns -1; the table parameter should be the table generated by
  KmpComputeTable for <str2>; the algorithm used is the
  Knuth-Morris-Pratt algorithm; @complexity worst-case O(n) }
function KmpFindSubstr(const str1, str2 : String; startIndex : IndexType;
                       const table : TKmpTable) : IndexType;
{ computes a table needed by KmpReverseFindSubstr; returns nil for an
  empty string; @complexity O(n); }
function KmpReverseComputeTable(const str : String) : TKmpReverseTable;
{ the same as KmpFindSubstr but searches from the back of <str1>: the
  first candidate occurrence ends at position finishIndex - 1;
  @complexity O(n) }
function KmpReverseFindSubstr(const str1, str2 : String; finishIndex : IndexType;
                              const table : TKmpReverseTable) : IndexType;
{ computes the table needed by the Boyer-Moore algorithm; @complexity
  O(n); }
function BmComputeTable(const str2 : String) : TBmTable;
{ uses the Boyer-Moore algorithm to find <str2> in str1, starting from
  startIndex; returns the index found or -1; <table> should be the
  table computed by BmComputeTable for <str2>; although the worst-case
  complexity is O(n), in most cases this algorithm is better than the
  Knuth-Morris-Pratt algorithm. However, computing the table takes
  longer. @complexity O(n) }
function BmFindSubstr(const str1, str2 : String; startIndex : IndexType;
                      const table : TBmTable) : IndexType;

{ implements the Karp-Miller-Rosenberg algorithm for computing the
  equivalence classes of all substrings with length len in str; if the
  table returned contains the same number at index i and j, it means
  that the substrings of length len starting at i and at j are equal;
  only positions from [start,finish-len) in the returned array contain
  valid indices !! others are undefined (NOTE(review): the
  implementation loops up to and including finish-len, so that
  position appears to be filled as well); len is the desired length of
  substrings, should be > 1; start and finish designate the range in
  which the search is performed; returns nil when start = finish;
  @complexity O(n*log(n)) }
function KmrFindSubstrings(const str : String; start, finish : IndexType;
                           len : SizeType) : TCardinalArray;


implementation

uses
   adtutils;

const
   { FindSubstr delegates to NaiveFindSubstr rather than to
     KmpFindSubstr when the part of the text left to search is longer
     than the pattern by at most this many characters }
   strMaxLengthForNaiveFind = 10;

{ ------------------------- General algorithms ---------------------------- }

{ Returns a copy of <str> in which the characters at positions
  start..finish-1 appear in reverse order. }
function Reverse(const str : String; start, finish : IndexType) : String;
var
   lo, hi : IndexType;
   saved : Char;
begin
   Result := str;
   lo := start;
   hi := finish - 1; { finish is exclusive }
   { swap the outermost pair and move both ends towards the middle }
   while lo < hi do
   begin
      saved := Result[hi];
      Result[hi] := Result[lo];
      Result[lo] := saved;
      Inc(lo);
      Dec(hi);
   end;
end;

{ Overwrites, in place, the first character of <str> at or after
  <startIndex> that belongs to <chars> with <c>. Returns the position
  of that character, or -1 when no character was replaced. }
function Replace(var str : String; chars : TCharSet; c : Char;
                 startIndex : IndexType) : IndexType;
var
   idx : IndexType;
begin
   for idx := startIndex to Length(str) do
   begin
      if str[idx] in chars then
      begin
         str[idx] := c;
         Result := idx;
         Exit;
      end;
   end;
   { scanned to the end without finding a character from <chars> }
   Result := -1;
end;

{ Returns a copy of <str> in which every character that belongs to
  <chars> has been replaced with <c>. }
function ReplaceAll(const str : String; chars : TCharSet; c : Char) : String;
var
   idx : IndexType;
begin
   Result := str;
   { the length never changes, so a counted loop is sufficient }
   for idx := 1 to Length(Result) do
   begin
      if Result[idx] in chars then
         Result[idx] := c;
   end;
end;

{ Finds <str2> in <str1>, searching forward from <startIndex>. Returns
  the index at which <str2> starts or -1 if it is not found.
  Depending on how much of <str1> is left to search, the work is done
  by a direct comparison, by NaiveFindSubstr or by KmpFindSubstr. }
function FindSubstr(const str1, str2 : String;
                    startIndex : IndexType) : IndexType;
var
   len : SizeType;
begin
   { number of characters of str1 in [startIndex..Length(str1)] }
   len := Length(str1) - startIndex + 1;
   if len < Length(str2) then
      Result := -1
   else if len = Length(str2) then
   begin
      { only one alignment is possible: str2 must equal the tail of
        str1 that begins at startIndex (not the whole of str1), and
        the match position is startIndex itself }
      if Copy(str1, startIndex, len) = str2 then
         Result := startIndex
      else
         Result := -1;
   end else if len - Length(str2) <= strMaxLengthForNaiveFind then
      { few candidate positions - not worth building the KMP table }
      Result := NaiveFindSubstr(str1, str2, startIndex)
   else
      Result := KmpFindSubstr(str1, str2, startIndex,
                              KmpComputeTable(str2));
end;

{ Convenience overload: searches the whole of <str1>, i.e. calls the
  three-argument FindSubstr with startIndex = 1. }
function FindSubstr(const str1, str2 : String) : IndexType;
const
   firstPosition = 1;
begin
   Result := FindSubstr(str1, str2, firstPosition);
end;

{ Counts the occurrences of <str2> in <str1> by calling KmpFindSubstr
  repeatedly with <tab> as its table. Each search restarts one
  position after the previous hit, so overlapping occurrences are
  counted as well. }
function CountSubstrings(const str1, str2 : String;
                         const tab : TKmpTable) : SizeType;
var
   hit : IndexType;
begin
   Result := 0;
   hit := KmpFindSubstr(str1, str2, 1, tab);
   while hit <> -1 do
   begin
      Inc(Result);
      { resume just past the start of the occurrence found }
      hit := KmpFindSubstr(str1, str2, hit + 1, tab);
   end;
end;

{ ------------------------ Specific algorithms ------------------------------  }

{ Straightforward substring search: tries every alignment of <str2>
  against <str1>, beginning at <startIndex>, and compares character by
  character. Returns the first index at which <str2> occurs or -1 if
  there is none. }
function NaiveFindSubstr(const str1, str2 : String;
                         startIndex : IndexType) : IndexType;
var
   j, lastStart : IndexType;
begin
   Result := -1;

   { the last position at which str2 still fits inside str1; it has to
     be tried as well, otherwise an occurrence at the very end of str1
     is never reported }
   lastStart := Length(str1) - Length(str2) + 1;

   while startIndex <= lastStart do
   begin
      j := 1;
      while (j <= Length(str2)) and (str1[startIndex + j - 1] = str2[j]) do
         Inc(j);

      { j ran past the end of str2 - all of its characters matched }
      if j > Length(str2) then
      begin
         Result := startIndex;
         break;
      end;

      Inc(startIndex);
   end; { end while }
end;

{ Builds the table used by KmpFindSubstr for the pattern <str>.
  Result[i], for i in 1..Length(str), is the length of the longest
  proper prefix of str[1..i] that is also a suffix of str[1..i];
  Result[0] is 0. Returns nil for an empty pattern. }
function KmpComputeTable(const str : String) : TKmpTable;
var
   pos, border : IndexType;
begin
   Result := nil;
   if Length(str) = 0 then
      Exit;

   SetLength(Result, Length(str) + 1);
   Result[0] := 0;
   Result[1] := 0;

   { on entry to each iteration border = Result[pos - 1] }
   border := 0;
   for pos := 2 to Length(str) do
   begin
      { fall back through ever shorter borders until one of them can
        be extended by str[pos] or none is left }
      while (border <> 0) and (str[pos] <> str[border + 1]) do
         border := Result[border];

      if str[pos] = str[border + 1] then
         Inc(border);

      Result[pos] := border;
   end;
end;

{ Knuth-Morris-Pratt search. Returns the first index, at or after
  <startIndex>, at which <str2> occurs in <str1>, or -1 if there is no
  such index. <table> is expected to be the result of
  KmpComputeTable(str2). An empty text, an empty pattern or a nil
  table yields -1. }
function KmpFindSubstr(const str1, str2 : String; startIndex : IndexType;
                       const table : TKmpTable) : IndexType;
var
   j, lastStart : IndexType;
begin
   Result := -1;

   if (Length(str1) = 0) or (Length(str2) = 0) or (table = nil) then
   begin
      Exit;
   end;

   { table[i] contains the length of a maximal suffix of
     str2[1..i] that is also a prefix of str2[1..i]; table[0] =
     0; formally: table[i] = max(k in [0,i-1] : str2[1..k] =
     str2[i-k+1..i]) }

   { startIndex - current position within str1  }
   { j - current position within str2 }

   { the last position at which str2 still fits inside str1; it must
     be examined too, or an occurrence at the very end of str1 is
     missed }
   lastStart := Length(str1) - Length(str2) + 1;

   j := 0;
   while startIndex <= lastStart do
   begin
      { the first table[j] characters are already known to match }
      j := table[j] + 1;
      while (j <= Length(str2)) and (str1[startIndex + j - 1] = str2[j]) do
         Inc(j);

      if j > Length(str2) then
      begin
         Result := startIndex;
         break;
      end;

      Dec(j); { now str2[1..j] = str1[startIndex..startIndex+j-1] }
      if j - table[j] = 0 then
         { nothing matched (j = 0) - simply try the next position }
         startIndex := startIndex + 1
      else
         { shift so that the longest border of str2[1..j] lines up
           with the text that has just been matched }
         startIndex := startIndex + j - table[j];
   end; { end while }
end;

{ Builds the 'reversed' counterpart of the KmpComputeTable table: the
  entries are computed from the back of <str> to its front. Result[i],
  for i in 1..Length(str), is the length of the longest proper prefix
  of str[i..Length(str)] that is also its suffix;
  Result[Length(str) + 1] is 0. Returns nil for an empty string. }
function KmpReverseComputeTable(const str : String) : TKmpReverseTable;
var
   n, pos, k : IndexType;
begin
   n := Length(str);
   if n = 0 then
   begin
      Result := nil;
      Exit;
   end;

   SetLength(Result, n + 2);
   Result[n + 1] := 0;
   Result[n] := 0;

   for pos := n - 1 downto 1 do
   begin
      { k walks through the start positions of ever shorter suffixes
        whose border might be extended to the left by str[pos];
        k = n + 1 means that no candidate is left }
      k := pos + 1;
      while (k <> n + 1) and (str[pos] <> str[n - Result[k]]) do
         k := n - Result[k] + 1;

      if k = n + 1 then
         Result[pos] := 0
      else
         { here str[pos] = str[n - Result[k]] }
         Result[pos] := Result[k] + 1;
   end;
end;

{ Knuth-Morris-Pratt search performed from the back of <str1>. The
  first candidate occurrence of <str2> is the one that ends at
  position finishIndex - 1; later candidates end further to the left.
  Returns the index at which the occurrence found starts, or -1 if
  there is none. <table> is expected to be the result of
  KmpReverseComputeTable(str2). An empty text, an empty pattern or a
  nil table yields -1.
  NOTE(review): finishIndex is assumed not to exceed Length(str1) + 1;
  this is not checked. }
function KmpReverseFindSubstr(const str1, str2 : String; finishIndex : IndexType;
                              const table : TKmpReverseTable) : IndexType;
var
   i, j : IndexType;
begin
   Result := -1;

   if (table = nil) or (Length(str2) = 0) or (Length(str1) = 0) then
   begin
      Exit;
   end;

   { table[i] contains the length of a maximal prefix of
     str2[i..Length(str2)] that is also its suffix; formally: table[i]
     = max(k in [0,Length(str2)-i] : str2[i..i+k-1] =
     str2[Length(str2)-k+1..Length(str2)]) }

   { i - position in str1 of the last character of the current
         candidate occurrence }
   { j - number of the character of str2, counted from its end, that
         is being compared }

   j := 0;
   i := finishIndex - 1;
   { a candidate ending at i starts at i - Length(str2) + 1, which has
     to be a valid index (>= 1); for a smaller i the pattern does not
     fit any more and str1 would be read at index 0 or below }
   while i >= Length(str2) do
   begin
      { the last table[...] characters are already known to match }
      j := table[Length(str2) + 1 - j] + 1;
      while (j <= Length(str2)) and
               (str1[i - j + 1] = str2[Length(str2) + 1 - j]) do
      begin
         Inc(j);
      end;
      if j > Length(str2) then
      begin
         Result := i - Length(str2) + 1;
         break;
      end;

      Dec(j); { now str1[i-j+1...i] = str2[Length(str2)+1-j..Length(str2)] }
      if j - table[Length(str2) + 1 - j] = 0 then
         { nothing matched (j = 0) - step one position to the left }
         i := i - 1
      else
         i := i - j + table[Length(str2) + 1 - j];

   end; { end while }
end;

{ Builds the shift table used by BmFindSubstr for the pattern <str2>.
  The returned array has Length(str2) + 2 elements; for j in
  1..Length(str2), Result[j] is the amount by which BmFindSubstr moves
  the pattern forward after a mismatch at pattern position j. The
  table is derived from KmpReverseComputeTable(str2). }
function BmComputeTable(const str2 : String) : TBmTable;
var
   tab : TKmpReverseTable;
   i, j, k, s : IndexType;
   len, filler : SizeType;
begin
   tab := KmpReverseComputeTable(str2);

   len := Length(str2);
   Result := nil;
   SetLength(Result, len + 2);
   { filler marks entries that have not been assigned a shift yet; it
     is larger than any shift assigned below }
   filler := len + 2;
   for i := 0 to len do
   begin
      Result[i] := filler;
   end;

   { there must be two conditions satisfied for result[j] = s: }
   { cond1(s,j) : for each k in (j,len] s >= k or str[k - s] =
     str[k]; in other words, the end of the substring (j,len] is
     equal to the end of the substring [1,len-s] }
   { graphically:              <>         <>    (<> - equal distances }
   {  +----------------+      +-----------------+    }
   {  |XX|      |   |XX|  or  |  |XXX|   |  |XXX|    }
   {  +----------------+      +-----------------+    }
   {            j   s           j-s      s  j        }
   { The two groups of X's are equal. }
   { cond2(s,j) : s >= j or str[j - s] <> str[j]; }
   { the s chosen is a minimal one that satisfies both conditions
   }
   { In the algorithm we use the reversed table of maximal suffixes
     that are also prefixes of a given substring, obtained from the
     KmpReverseComputeTable routine. We base on the fact that the
     X's marked in the above picture are both suffixes and prefixes
     of some substring of str ending at str's last position (they
     need not be necessarily maximal). Notice that the larger is
     the position i at which the first group of X's starts, the
     smaller is s. So we begin with i = 1 and increase i until it
     reaches len. In each iteration we find all prefixes that are
     also suffixes of str[i..len], and for each of them we compute
     j and s for this j, if cond2 is satisfied (the situation shown
     in the right picture above). Because with increasing i s is
     decreasing we are guaranteed to choose the smallest s for a
     given j. However, not all j's are assigned a value in this
     computation. Those which are not correspond to the situation
     in the left picture above. We handle them in the second loop
     by assigning the s computed for the largest suffix of str that
     is also its prefix. It guarantees that s is minimal.
     Additionally, we require s > j.  }

   for i := 1 to len do
   begin
      k := i;
      repeat
         { j - the pattern position just before the suffix that
           repeats the current prefix of str[i..len]; s - the distance
           between that prefix and that suffix }
         j := len - tab[k];
         s := j - i + 1;
         { cond2(s,j) : }
         if (j <= s) or (str2[j - s] <> str2[j]) then
         begin
            Result[j] := s;
         end;
         { get the next prefix that is also a suffix of str[i..len] }
         k := len - tab[k] + 1;
      until k = len + 1;
   end;

   { initial value only; NOTE(review): for len >= 1 it appears to be
     overwritten in the first iteration below, because the loop above
     always assigns Result[len] (i = 1 ends with j = s = len) }
   s := len; /// not even used
   { fill the entries left unassigned, walking from the back: each
     receives the index of the nearest assigned entry to its right }
   for i := len downto 1 do
   begin
      if Result[i] = filler then
         Result[i] := s
      else
         s := i;
   end;
end;

{ Boyer-Moore search. Returns the first index, at or after
  <startIndex>, at which <str2> occurs in <str1>, or -1 if there is no
  such index. <table> is expected to be the result of
  BmComputeTable(str2). }
function BmFindSubstr(const str1, str2 : String; startIndex : IndexType;
                      const table : TBmTable) : IndexType;
var
   j, lastStart : IndexType;
begin
   Result := -1;

   { the last position at which str2 still fits inside str1; it must
     be examined too, or an occurrence at the very end of str1 is
     missed }
   lastStart := Length(str1) - Length(str2) + 1;

   while startIndex <= lastStart do
   begin
      { compare the pattern with the text from right to left }
      j := Length(str2);
      while (j > 0) and (str1[startIndex + j - 1] = str2[j]) do
      begin
         Dec(j);
      end;
      if j = 0 then
      begin
         { every character of str2 matched }
         Result := startIndex;
         break;
      end;
      { mismatch at pattern position j - move by the precomputed shift }
      startIndex := startIndex + table[j];
   end;
end;

{ Karp-Miller-Rosenberg algorithm. Assigns a number to every substring
  of length <len> that lies within str[start..finish-1] in such a way
  that two substrings receive the same number exactly when they are
  equal. The number for the substring starting at position i is
  stored in Result[i]; the entries for positions at which no substring
  of length <len> starts within the range hold intermediate values and
  must not be relied upon. Returns nil when start = finish.

  The substring length is doubled in every round: the numbers of two
  adjacent (or, in the last round, overlapping) shorter substrings are
  combined into a key, the keys are radix-sorted and equal keys are
  given equal new numbers.

  Requires start <= finish <= Length(str) + 1 and len > 1 (asserted).
  NOTE(review): the key layout below writes Num[0..7] at most and
  stores one byte per Char, so it presumably assumes
  SizeOf(Char) = 1 and SizeOf(IndexType) = 4 - confirm. }
function KmrFindSubstrings(const str : String; start, finish : IndexType;
                           len : SizeType) : TCardinalArray;
const
   BITS_IN_CHAR = 8;
   MAX_CHARS = 1 shl BITS_IN_CHAR;
   MASK = MAX_CHARS - 1;
   MAX_TAB_IND = MAX_CHARS - 1;
   CHARS_NUM1 = 2;
   CHARS_NUM2 = 4;
   CHARS_NUM3 = (SizeOf(IndexType) * 2) div SizeOf(Char);
type
   PTriple = ^TTriple;
   TTriple = record
      Num : array[0..CHARS_NUM3 - 1] of Char; { the sort key }
      Ind : IndexType; { start position of the substring }
      Next : PTriple;
   end;
   TBucket = record
      First, Last : PTriple;
   end;
var
   table : array[0..MAX_TAB_IND] of TBucket; { for bucket sort }
   lastNum : array[0..CHARS_NUM3 - 1] of Char;
   list, freelist, node : PTriple;
   sgroup : 1..3; { sgroup = 1 : Length(str) < MAX_CHARS; 2 : Length(str) <
                    MAX_CHARS*MAX_CHARS; 3 : Length(str) >= MAX_CHARS^2 }
   p, loglen, slen : SizeType;
   maxj, ii : IndexType;

   procedure CreateSubstrings(l, off : SizeType);
   var
      i, j, k : IndexType;
      equal : Boolean;
   begin
      { Items(p)[i] denotes a substring with length p starting at
        str[i] }
      { p is assumed to be the current length of the substrings }
      { l - the new length of the substrings }
      { off - the offset added to the index of a substring to get the next
        adjacent substring }

      { create substrings of length l; just merge adjacent
        substrings (i.e. their numbers which identify them) }

      for i := start to finish - l do
      begin
         { reuse a node from the free list if there is one }
         if freelist <> nil then
         begin
            node := freelist;
            freelist := freelist^.Next;
         end else
            New(node); { may raise }

         with node^ do
         begin
            { the key is the number of the substring at i followed by
              the number of the substring at i + off, each split into
              as many bytes as sgroup requires, lowest byte first }
            case sgroup of
               1:
                  begin
                     Num[0] := Char(Result[i]);
                     Num[1] := Char(Result[i + off]);
                  end;
               2:
                  begin
                     Num[0] := Char(Result[i] and MASK);
                     Num[1] := Char((Result[i] shr BITS_IN_CHAR) and MASK);
                     Num[2] := Char(Result[i + off] and MASK);
                     Num[3] := Char((Result[i + off] shr BITS_IN_CHAR) and
                                       MASK);
                  end;
               3:
                  begin
                     Num[0] := Char(Result[i] and MASK);
                     Num[1] := Char((Result[i] shr BITS_IN_CHAR) and MASK);
                     Num[2] := Char((Result[i] shr (2*BITS_IN_CHAR)) and MASK);
                     Num[3] := Char((Result[i] shr (3*BITS_IN_CHAR)) and MASK);

                     Num[4] := Char(Result[i + off] and MASK);
                     Num[5] := Char((Result[i + off] shr BITS_IN_CHAR) and
                                       MASK);
                     Num[6] := Char((Result[i + off] shr (2*BITS_IN_CHAR)) and
                                       MASK);
                     Num[7] := Char((Result[i + off] shr (3*BITS_IN_CHAR)) and
                                       MASK);
                  end;
            end;
            Ind := i;
            Next := list;
         end;
         list := node;
      end; { end for }

      { sort list lexicographically according to the Num field }

      for i := 0 to MAX_TAB_IND do
      begin
         with table[i] do
         begin
            First := nil;
            Last := nil;
         end;
      end;

      { one distribution pass per key byte, from the last byte to the
        first; nodes are appended to their buckets, so the order
        established by the previous passes is preserved }
      j := maxj;
      while j >= 0 do
      begin

         while list <> nil do
         begin
            node := list;
            list := list^.Next;
            k := SizeType(node^.Num[j]);
            if table[k].First <> nil then
            begin
               table[k].Last^.Next := node;
            end else
            begin
               table[k].First := node;
            end;
            table[k].Last := node;
            node^.Next := nil;
         end;

         { concatenate the buckets back into a single list and empty
           them for the next pass }
         node := nil; { will point to the last node in list }
         for i := 0 to MAX_TAB_IND do
         begin
            with table[i] do
            begin
               if First <> nil then
               begin
                  if node <> nil then
                     node^.Next := First
                  else
                     list := First;
                  node := Last;
                  Last := nil;
                  First := nil;
               end;
            end;
         end;

         Dec(j);
      end; { end while }

      { assign numbers to the substrings with the length l; nodes with equal
        Num fields become equal substrings; }

      { a preparation for the loop: make the first node compare equal
        to its 'predecessor', so that it receives the number 0 }
      if list <> nil then
      begin
         for j := 0 to maxj do
         begin
            lastNum[j] := list^.Num[j];
         end;
      end;

      i := 0;
      while list <> nil do
      begin
         { equal is true if the current node (in list) is equal to
           the previous one (i.e. the one analysed in the previous
           iteration of the loop). Since nodes are ordered
           lexicographically the new substrings will be assigned
           equal numbers if and only if they are equal }
         equal := true;
         for j := 0 to maxj do
         begin
            if list^.Num[j] <> lastNum[j] then
            begin
               equal := false;
               lastNum[j] := list^.Num[j];
            end;
         end;
         if not equal then
            Inc(i); { the next sequence of equal substrings - a new
                      id needed }

         Result[list^.Ind] := i;

         { move the node just processed to the free list }
         node := freelist;
         freelist := list;
         list := list^.Next;
         freelist^.Next := node;
      end; { end while list <> nil }
   end; { end CreateSubstrings }

   { disposes of every node of the list starting at head and sets
     head to nil }
   procedure DisposeNodes(var head : PTriple);
   var
      following : PTriple;
   begin
      while head <> nil do
      begin
         following := head^.Next;
         Dispose(head);
         head := following;
      end;
   end;

begin
   Assert((start <= finish) and (finish <= Length(str) + 1) and (len > 1));

   if start = finish then
   begin
      Result := nil;
      exit;
   end;

   SetLength(Result, finish);
   { may raise, but harmless }

   { choose how many bytes are needed to store one substring number }
   if Length(str) < MAX_CHARS then
      sgroup := 1
   else if Length(str) < MAX_CHARS * MAX_CHARS then
      sgroup := 2
   else
      sgroup := 3;

   { maxj - index of the last key byte in use }
   case sgroup of
      1:
         maxj := CHARS_NUM1 - 1;
      2:
         maxj := CHARS_NUM2 - 1;
      3:
         maxj := CHARS_NUM3 - 1;
   end;

   list := nil;
   freelist := nil;
   try
      p := 1; { the current length of the substrings }
      loglen := FloorLog2(len);
      slen := 1 shl loglen; { the largest power of two not above len }
      { substrings of length 1 are identified by their character code }
      for ii := start to finish - 1 do
         Result[ii] := Ord(str[ii]);

      repeat
         { (Result(2*p)[i] = Result(2*p)[j]) <=> (Result(p)[i] =
           Result(p)[j]) and (Result(p)[i+p] = Result(p)[j+p]) }
         CreateSubstrings(2*p, p);
         p := 2*p;
      until p = slen;

      if slen <> len then
      begin
         { (Result(len)[i] = Result(len)[j]) <=> (Result(slen)[i]
           = Result(slen)[j]) and (Result(slen)[i+len-slen]
           = Result(slen)[j+len-slen]) }
         CreateSubstrings(len, len - slen);
      end;

   finally
      { the recycled nodes are always released here; list is empty
        after a normal run, but it still owns nodes when New raised
        inside CreateSubstrings - release those too, so that they do
        not leak }
      DisposeNodes(freelist);
      DisposeNodes(list);
   end;
end;


end.