Trie of all Suffixes


From the text, we can generate all of its suffixes and store them in a tree structure (a trie). Every pattern that occurs in the text must be a prefix of one of the text's suffixes. So, once the trie of all suffixes has been built, any substring can be found in time linear in the length of the pattern. Every suffix ends with the string-terminating symbol. The search starts at the root and, for each character of the pattern, follows the matching edge if one exists; if there is no such edge, it reports that the pattern was not found.

For this algorithm, the time complexity of a search is O(m+k), where m is the length of the pattern and k is the number of occurrences of the pattern in the text.

Input and Output

Input:
Main String: “ABAAABCDBBABCDDEBCABC”. Pattern “ABC”
Output:
Pattern found at position: 4
Pattern found at position: 10
Pattern found at position: 18

Algorithm

In this algorithm, we will use a special node, called a trie node. Each trie node holds a list of suffix indexes and the addresses of its child trie nodes as links.

createTrie(root: trieNode, text)

Input: A root node of type trieNode.

Output: The suffix trie built from the main string

Begin
   for i := 0 to length of text - 1, do
      take the substring from the ith position to the end as a suffix, and add it to the trie with index i.
   done
End

findPat(pattern, node)

Input: pattern to find and node, which is used to check in its suffix subtrees

Output − The index list where the pattern was found

Begin
   if pattern size is 0, then
      return suffIndex of node
   if node.suff[pattern[0]] ≠φ, then
      return node.suff[pattern[0]].findPat(substring from 1 to end of pattern)
   else
      return φ
End

searchPat(pattern)

Input − The pattern which will be searched

Output − The list of indexes in the text where the pattern was found

Begin
   define res as list.
   res := findPat(pattern)

   if res ≠φ, then
      patLen := length of pattern
      for i := 0 to end of res list, do
         print all indexes where pattern was found
      done
End

Example

#include<iostream>
#include<list>
#include<string>
#include<vector>
#define MAXCHAR 256
using namespace std;

class trieNode {      //node to hold all suffixes
   private:
      trieNode *suff[MAXCHAR];
      list<int> *suffIndex;
   public:
      trieNode() {
         suffIndex = new list<int>;
         for (int i = 0; i < MAXCHAR; i++)
            suff[i] = NULL;       //no child initially
      }

      void addSuffix(string suffix, int sIndex);
      list<int>* searchPattern(string pat);
};

void trieNode::addSuffix(string suffix, int sIndex) {
   suffIndex->push_back(sIndex);        //store index initially

   if (suffix.size() > 0) {
      char cIndex = suffix[0];
      if (suff[cIndex] == NULL)        //if no sub tree present for this character
         suff[cIndex] = new trieNode();     //create new node
      suff[cIndex]->addSuffix(suffix.substr(1), sIndex+1);      //for next suffix
   }
}

list<int>* trieNode::searchPattern(string pattern) {
   if (pattern.size() == 0)
      return suffIndex;
   if (suff[pattern[0]] != NULL)
      return (suff[pattern[0]])->searchPattern(pattern.substr(1));    //follow to next node
   else
      return NULL;       //when no node are there to jump
}

class trieSuffix {      //trie for all suffixes
   trieNode root;
   public:
      trieSuffix(string mainString) {       //add suffixes and make trie
         for (int i = 0; i < mainString.length(); i++)
            root.addSuffix(mainString.substr(i), i);
      }

   void searchPat(string pattern, int *locArray, int *index);
};

// Looks pattern up in the trie and appends one entry to locArray for every
// index the trie returns, each reduced by the pattern length.
//
// *index is the position of the last slot already written in locArray; it is
// incremented before each write, so the caller starts it at -1 and reads the
// last filled position back from it afterwards. Neither locArray nor *index
// is touched when the trie search returns NULL.
void trieSuffix::searchPat(string pattern, int *locArray, int *index) {
   list<int> *found = root.searchPattern(pattern);
   if (found == NULL)
      return;     // search returned no list: nothing to report

   const list<int> &ends = *found;
   const int patLen = pattern.length();
   for (list<int>::const_iterator pos = ends.begin(); pos != ends.end(); ++pos) {
      ++(*index);
      locArray[*index] = *pos - patLen;
   }
}

int main() {
   string mainString = "ABAAABCDBBABCDDEBCABC";
   string pattern = "ABC";
   int locArray[mainString.size()];
   int index = -1;

   trieSuffix trie(mainString);
   trie.searchPat(pattern, locArray, &index);

   for(int i = 0; i <= index; i++) {
      cout << "Pattern found at position: " << locArray[i]<<endl;
   }

}

Output

Pattern found at position: 4
Pattern found at position: 10
Pattern found at position: 18

karthikeya Boyini
karthikeya Boyini

I love programming (: That's all I know

Updated on: 15-Jun-2020

366 Views

Kickstart Your Career

Get certified by completing the course

Get Started
Advertisements