Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added the Aho-Corasick algorithm #145

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
165 changes: 165 additions & 0 deletions Algorithms/Strings/AhoCorasick.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
using System.Collections.Generic;

namespace Algorithms.Strings
{
/// <summary>
/// Aho-Corasick multi-pattern search: finds every occurrence of every registered
/// pattern inside a given string in a single left-to-right pass.
/// Patterns are stored in a trie; suffix links and automaton transitions are
/// computed lazily on first use and memoised on the vertices.
/// </summary>
public class AhoCorasick
{
    /// <summary>
    /// The trie, stored as a flat list of vertices. A vertex is identified by its index
    /// in this list; the root always has index 0 and represents the empty string.
    /// </summary>
    private readonly List<AhoCorasickVertex> _tree = new List<AhoCorasickVertex>();

    /// <summary>
    /// True once a search may have memoised suffix links or transitions on the vertices.
    /// Those memoised values depend on the set of patterns, so they must be discarded
    /// before the trie is modified again.
    /// </summary>
    private bool _cachesPopulated;

    /// <summary>
    /// Creates an automaton that contains no patterns (only the root vertex).
    /// </summary>
    public AhoCorasick()
    {
        // Add root vertex.
        _tree.Add(new AhoCorasickVertex(0, '$'));
    }

    /// <summary>
    /// Registers a pattern to search for. May be called before or after searches.
    /// An empty pattern is accepted (it marks the root) but is never reported by
    /// <see cref="FindAllOccurrences"/>, because matching stops at the root.
    /// </summary>
    /// <param name="pattern">The pattern to add.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="pattern"/> is null.</exception>
    public void AddPattern(string pattern)
    {
        if (pattern == null)
        {
            throw new System.ArgumentNullException("pattern");
        }

        // Links and transitions memoised by an earlier search describe the old pattern
        // set; keeping them would make later searches miss the new pattern.
        InvalidateCaches();

        int current = 0;

        foreach (char ch in pattern)
        {
            int next;
            if (!_tree[current].NextVertex.TryGetValue(ch, out next))
            {
                // No edge labelled ch yet: append a new vertex and link it to its parent.
                next = _tree.Count;
                _tree.Add(new AhoCorasickVertex(current, ch));
                _tree[current].NextVertex.Add(ch, next);
            }

            current = next;
        }

        _tree[current].IsPattern = true;
        _tree[current].Str = pattern;
    }

    /// <summary>
    /// Removes all patterns, leaving only a fresh root vertex.
    /// </summary>
    public void ClearPatterns()
    {
        _tree.Clear();
        // Add root vertex.
        _tree.Add(new AhoCorasickVertex(0, '$'));
        _cachesPopulated = false;
    }

    /// <summary>
    /// Checks whether exactly this string has been registered as a pattern.
    /// </summary>
    /// <param name="pattern">The string to look up.</param>
    /// <returns>
    /// True if the trie path spelled by <paramref name="pattern"/> exists and ends on a
    /// vertex marked as a pattern; false otherwise (including when it is only a prefix
    /// of a registered pattern).
    /// </returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="pattern"/> is null.</exception>
    public bool Exist(string pattern)
    {
        if (pattern == null)
        {
            throw new System.ArgumentNullException("pattern");
        }

        int current = 0;
        foreach (char ch in pattern)
        {
            int next;
            if (!_tree[current].NextVertex.TryGetValue(ch, out next))
            {
                return false;
            }

            current = next;
        }

        return _tree[current].IsPattern;
    }

    /// <summary>
    /// Search for all patterns in a string.
    /// </summary>
    /// <param name="line">Line in which the search occurs.</param>
    /// <returns>
    /// One entry per occurrence found, in order of the position where the occurrence
    /// ends; patterns ending at the same position are listed longest first.
    /// </returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="line"/> is null.</exception>
    public List<string> FindAllOccurrences(string line)
    {
        if (line == null)
        {
            throw new System.ArgumentNullException("line");
        }

        // From here on vertices may carry memoised links/transitions.
        _cachesPopulated = true;

        List<string> occurrences = new List<string>();
        int state = 0;

        foreach (char ch in line)
        {
            state = GetAutoMove(state, ch);
            occurrences.AddRange(Check(state));
        }

        return occurrences;
    }

    /// <summary>
    /// Discards every memoised suffix link and transition, if any search has run since
    /// the last reset. Does nothing (and costs nothing) while patterns are being added
    /// back to back.
    /// </summary>
    private void InvalidateCaches()
    {
        if (!_cachesPopulated)
        {
            return;
        }

        foreach (AhoCorasickVertex vertex in _tree)
        {
            vertex.AutoMove.Clear();
            vertex.SuffLink = -1;
            vertex.GoodSuffLink = -1;
        }

        _cachesPopulated = false;
    }

    /// <summary>
    /// Returns the suffix link of a vertex, computing and memoising it on first use.
    /// </summary>
    /// <param name="index">Vertex index.</param>
    /// <returns>Index of the vertex the suffix link points to.</returns>
    private int GetSuffLink(int index)
    {
        AhoCorasickVertex node = _tree[index];
        if (node.SuffLink == -1)
        {
            // The root and its direct children link to the root. For any other vertex,
            // start from the parent's suffix link and take the transition by this
            // vertex's own symbol.
            node.SuffLink = (index == 0 || node.Parent == 0)
                ? 0
                : GetAutoMove(GetSuffLink(node.Parent), node.Symbol);
        }

        return node.SuffLink;
    }

    /// <summary>
    /// Returns the automaton state reached from a vertex by a symbol, computing and
    /// memoising the transition on first use.
    /// </summary>
    /// <param name="index">Vertex index.</param>
    /// <param name="ch">Transition symbol.</param>
    /// <returns>Index of the target vertex.</returns>
    private int GetAutoMove(int index, char ch)
    {
        AhoCorasickVertex node = _tree[index];

        int target;
        if (node.AutoMove.TryGetValue(ch, out target))
        {
            return target;
        }

        // If there is a trie edge labelled ch from this vertex, follow it; otherwise
        // retry from the suffix link (the root falls back to itself).
        if (!node.NextVertex.TryGetValue(ch, out target))
        {
            target = (index == 0) ? 0 : GetAutoMove(GetSuffLink(index), ch);
        }

        node.AutoMove.Add(ch, target);
        return target;
    }

    /// <summary>
    /// Returns the "good" suffix link of a vertex: the nearest vertex on its suffix-link
    /// chain that is marked as a pattern, or the root if there is none. Computed and
    /// memoised on first use.
    /// </summary>
    /// <param name="index">Vertex index.</param>
    /// <returns>Index of the vertex the good suffix link points to.</returns>
    private int GetGoodSuffLink(int index)
    {
        AhoCorasickVertex node = _tree[index];
        if (node.GoodSuffLink == -1)
        {
            int suffixLink = GetSuffLink(index);

            if (suffixLink == 0)
            {
                // Suffix link is the root vertex: the chain ends here.
                node.GoodSuffLink = 0;
            }
            else
            {
                // Use the suffix-link vertex if it is a pattern; otherwise continue the
                // search from that vertex.
                node.GoodSuffLink = _tree[suffixLink].IsPattern
                    ? suffixLink
                    : GetGoodSuffLink(suffixLink);
            }
        }

        return node.GoodSuffLink;
    }

    /// <summary>
    /// Collects every pattern that ends at the current automaton state by walking the
    /// "good" suffix links until the root is reached.
    /// </summary>
    /// <param name="index">Current position of the automaton.</param>
    /// <returns>The patterns ending at this state, longest first.</returns>
    private List<string> Check(int index)
    {
        List<string> patterns = new List<string>();
        while (index != 0)
        {
            AhoCorasickVertex node = _tree[index];
            if (node.IsPattern)
            {
                patterns.Add(node.Str);
            }

            index = GetGoodSuffLink(index);
        }

        return patterns;
    }
}
}
68 changes: 68 additions & 0 deletions Algorithms/Strings/AhoCorasickVertex.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
using System.Collections.Generic;

namespace Algorithms.Strings
{
/// <summary>
/// One vertex of the pattern trie. Vertices refer to each other by integer index
/// rather than by object reference.
/// </summary>
internal class AhoCorasickVertex
{
    /// <summary>
    /// Index of the parent vertex in the tree.
    /// </summary>
    public readonly int Parent;

    /// <summary>
    /// Symbol on the edge leading from the parent to this vertex.
    /// </summary>
    public readonly char Symbol;

    /// <summary>
    /// Tree edges: maps a symbol (Key) to the index (Value) of the child vertex reached by it.
    /// </summary>
    public readonly SortedDictionary<char, int> NextVertex = new SortedDictionary<char, int>();

    /// <summary>
    /// Memoised automaton transitions: maps a symbol (Key) to the index (Value) of the
    /// vertex the automaton moves to from here.
    /// </summary>
    public readonly SortedDictionary<char, int> AutoMove = new SortedDictionary<char, int>();

    /// <summary>
    /// True when the string spelled by the path to this vertex is one of the patterns.
    /// </summary>
    public bool IsPattern;

    /// <summary>
    /// The pattern text associated with this vertex; null until one is assigned.
    /// </summary>
    public string Str;

    /// <summary>
    /// The suffix link of the vertex X is a pointer to the vertex Y such that the string Y
    /// is the longest proper suffix of the string X, or, if there is no such vertex in the
    /// tree, a pointer to the root. In particular, the link from the root leads to the root.
    /// Holds -1 while the link has not been computed.
    /// </summary>
    public int SuffLink = -1;

    /// <summary>
    /// "Good" suffix link. Holds -1 while the link has not been computed.
    /// </summary>
    public int GoodSuffLink = -1;

    /// <summary>
    /// Creates a vertex with no outgoing edges, no cached transitions and no computed
    /// links, attached to the given parent by the given symbol.
    /// </summary>
    /// <param name="parent">Index of the parent vertex.</param>
    /// <param name="symbol">Symbol on the vertex in the tree.</param>
    public AhoCorasickVertex(int parent, char symbol)
    {
        Parent = parent;
        Symbol = symbol;
    }
}
}
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,8 @@ If you wish to contribute to C# ALGORITHMS, then please make sure you check out
* [Permutations and Anagrams](Algorithms/Strings/Permutations.cs)
* [Edit Distance](Algorithms/Strings/EditDistance.cs)
+ Uses a generic custom class for passing costs: [EditDistanceCostsMap\<T\>](Algorithms/Strings/EditDistanceCostsMap.cs)
* [Aho-Corasick](Algorithms/Strings/AhoCorasick.cs)
+ Uses a class to store information about vertices and transitions between them: [AhoCorasickVertex](Algorithms/Strings/AhoCorasickVertex.cs)

#### Numeric:

Expand Down
50 changes: 50 additions & 0 deletions UnitTest/AlgorithmsTests/StringAhoCorasickTest.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
using Algorithms.Strings;

using System.Collections.Generic;
using System.Linq;

using Xunit;

namespace UnitTest.AlgorithmsTests
{
/// <summary>
/// Tests for <see cref="AhoCorasick"/>: multi-pattern search, pattern lookup and clearing.
/// </summary>
public static class StringAhoCorasickTest
{
    /// <summary>
    /// Searches two texts with two different pattern sets (separated by ClearPatterns)
    /// and checks the total number of occurrences and the count per pattern.
    /// </summary>
    [Fact]
    public static void DoTest()
    {
        AhoCorasick alg = new AhoCorasick();

        // Initialize patterns

        alg.AddPattern("a");
        alg.AddPattern("b");
        alg.AddPattern("c");
        alg.AddPattern("d");
        alg.AddPattern("aa");

        List<string> foundPatterns = alg.FindAllOccurrences("caaab");

        // Assert.Equal reports expected and actual values on failure.
        Assert.Equal(7, foundPatterns.Count);
        Assert.Equal(1, CountOf(foundPatterns, "c"));
        Assert.Equal(3, CountOf(foundPatterns, "a"));
        Assert.Equal(2, CountOf(foundPatterns, "aa"));
        Assert.Equal(1, CountOf(foundPatterns, "b"));
        alg.ClearPatterns();

        alg.AddPattern("test1");
        alg.AddPattern("test2");
        alg.AddPattern("test3");
        alg.AddPattern("test33");
        alg.AddPattern("verybigtest");

        foundPatterns = alg.FindAllOccurrences("testtest1test1122test22test3549798test3656test333354654sdjkhbfabvdskhjfbashjdvbfjhksdbahjfvhusgdabvfhjsdvfgsdkhjvkverybigtesthdsagfhkgasdkhfverybigtestsdhgfjhkgsdfgk");

        Assert.Equal(9, foundPatterns.Count);
        Assert.Equal(2, CountOf(foundPatterns, "test1"));
        Assert.Equal(1, CountOf(foundPatterns, "test2"));
        Assert.Equal(3, CountOf(foundPatterns, "test3"));
        Assert.Equal(1, CountOf(foundPatterns, "test33"));
        Assert.Equal(2, CountOf(foundPatterns, "verybigtest"));
    }

    /// <summary>
    /// Exist must report only whole registered patterns, and ClearPatterns must forget them.
    /// </summary>
    [Fact]
    public static void ExistAndClearPatterns()
    {
        AhoCorasick alg = new AhoCorasick();
        alg.AddPattern("test");

        Assert.True(alg.Exist("test"));
        Assert.False(alg.Exist("tes"));   // only a prefix of a registered pattern
        Assert.False(alg.Exist("tests")); // runs past the end of the stored path

        alg.ClearPatterns();

        Assert.False(alg.Exist("test"));
    }

    /// <summary>
    /// Counts how many entries of <paramref name="found"/> are equal to <paramref name="pattern"/>.
    /// </summary>
    private static int CountOf(List<string> found, string pattern)
    {
        return found.Count(q => q.Equals(pattern));
    }
}
}