Suffix Array Algorithm



A suffix array is a data structure that stores all suffixes of a given string in the lexicographic order. It is useful for various string processing problems, such as pattern matching, searching, finding longest common prefix and so on. This array can quickly find the location of a pattern within a larger text.

How Suffix Array works?

Suppose the text is "Carpet". To construct its suffix array, follow the below steps −

  • Generate all the suffixes of the given text. In this case, possible suffixes could be "carpet", "arpet", "rpet", "pet", "et", "t".

  • Sort all the suffixes. All suffixes in sorted order are "arpet", "carpet", "et", "pet", "rpet", and "t".

  • Hence, the suffix array is as follows: [1, 0, 4, 3, 2, 5].

Suffix Array

To use this suffix array for pattern matching, we can perform a binary search on the sorted suffixes to find the range of suffixes that start with the pattern. For instance, let's take the above string "Carpet" and we want to find the pattern "ar", we can compare it with the middle suffix in the suffix array, which is "pet".

Since "ar" is smaller than this suffix, we can discard the right half of the suffix array and continue the binary search on the left half. Eventually, we will find that the position of suffix that starts with "ar" is "1" in the original string.

Let's see the input-output scenario for Suffix Array −

Input:
string: "AABABCEDBABCDEB" 
Output:
Pattern found at index: 3
Pattern found at index: 9
Pattern found at index: 1 

Example

Following is the example illustrating the working of suffix array in pattern matching.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Structure of suffixes
struct Suffix {
   int index;
   char suff[100];
};
// Function to compare two suffixes for sorting
int strCompare(const void* a, const void* b) {
   struct Suffix* s1 = (struct Suffix*)a;
   struct Suffix* s2 = (struct Suffix*)b;
   return strcmp(s1->suff, s2->suff);
}
// Function to fill the suffix array
int* fillSuffixArray(char* txt, int n) {
   struct Suffix* suffixes = (struct Suffix*) malloc(n * sizeof(struct Suffix));
   // Store suffixes and their indexes in an array 
   for (int i = 0; i < n; i++) {
      suffixes[i].index = i;
      strncpy(suffixes[i].suff, &(txt[i]), n - i);
      suffixes[i].suff[n-i] = '\0';
   }
   // Sort the suffixes 
   qsort(suffixes, n, sizeof(struct Suffix), strCompare);
   // Store indexes of all sorted suffixes in the suffix array
   int* suffixArr = (int*) malloc(n * sizeof(int));
   for (int i = 0; i < n; i++)
      suffixArr[i] = suffixes[i].index;

   // Deallocate the dynamic memory
   free(suffixes);
   return suffixArr;
}
// binary search on suffix array and find all occurrences of pattern
void suffixArraySearch(char* pat, char* txt, int* suffixArr, int n) {
   int m = strlen(pat);
   // binary search for pattern in text using the built suffix array
   int l = 0, r = n - 1;
   while (l <= r) {
      int mid = l + (r - l) / 2;
      char substr[100];
      strncpy(substr, &(txt[suffixArr[mid]]), m);
      substr[m] = '\0';
      int res = strncmp(pat, substr, m);
      if (res == 0) {
         printf("Pattern found at index: %d\n", suffixArr[mid]);
         // Move to the left of mid
         int temp = mid - 1;
         while (temp >= 0 && strncmp(pat, &(txt[suffixArr[temp]]), m) == 0) {
            printf("Pattern found at index: %d\n", suffixArr[temp]);
            temp--;
         }
         // Move to the right of mid
         temp = mid + 1;
         while (temp < n && strncmp(pat, &(txt[suffixArr[temp]]), m) == 0) {
            printf("Pattern found at index: %d\n", suffixArr[temp]);
            temp++;
         }
         return;
      }
      if (res < 0) r = mid - 1;
      else l = mid + 1;
    }
    printf("Pattern not found\n");
}
int main() {
   char txt[] = "AAAABCAEAAABCBDDAAAABC";
   // pattern to be searched
   char pat[] = "AAABC";
   int n = strlen(txt);
   int* suffixArr = fillSuffixArray(txt, n);
   suffixArraySearch(pat, txt, suffixArr, n);
   free(suffixArr);  
   return 0;
}
#include <iostream>
#include <algorithm>
#include <cstring>
using namespace std;
// Structure of suffixes
struct Suffix {
   int index;
   string suff;
};
// Function to compare two suffixes for sorting
bool strCompare(Suffix a, Suffix b) {
   return a.suff < b.suff;
}
// Function to fill the suffix array
int* fillSuffixArray(string txt, int n) {
   Suffix* suffixes = new Suffix[n];
   // Storing suffixes and indexes in an array 
   for (int i = 0; i < n; i++) {
      suffixes[i].index = i;
      suffixes[i].suff = txt.substr(i, n - i);
   }
   // Sorting the suffixes 
   sort(suffixes, suffixes+n, strCompare);
   // Store indexes of all sorted suffixes in suffix array
   int* suffixArr = new int[n];
   for (int i = 0; i < n; i++)
      suffixArr[i] = suffixes[i].index;
   // Deallocate the dynamic memory
   delete[] suffixes;
   return suffixArr;
}
// binary search on the suffix array and find all occurrences of pattern
void suffixArraySearch(string pat, string txt, int* suffixArr, int n) {
   int m = pat.length();
   // binary search for the pattern 
   int l = 0, r = n - 1;
   while (l <= r) {
      int mid = l + (r - l) / 2;
      string substr = txt.substr(suffixArr[mid], m);
      if (pat == substr) {
         cout << "Pattern found at index: " << suffixArr[mid] << endl;
         // Move to the left of mid
         int temp = mid - 1;
         while (temp >= 0 && txt.substr(suffixArr[temp], m) == pat) {
            cout << "Pattern found at index: " << suffixArr[temp] << endl;
            temp--;
         }
         // Move to the right of mid
         temp = mid + 1;
         while (temp < n && txt.substr(suffixArr[temp], m) == pat) {
            cout << "Pattern found at index: " << suffixArr[temp] << endl;
            temp++;
         }
         return;
      }
      if (pat < substr) r = mid - 1;
      else l = mid + 1;
   }
   cout << "Pattern not found" << endl;
}
int main() {
   string txt = "AAAABCAEAAABCBDDAAAABC";
   // pattern to be searched
   string pat = "AAABC";
   int n = txt.length();
   int* suffixArr = fillSuffixArray(txt, n);
   suffixArraySearch(pat, txt, suffixArr, n);
   delete[] suffixArr;  
   return 0;
} 
import java.util.Arrays;
public class Main {
   // Structure of suffixes
   static class SuffixCmpr implements Comparable<SuffixCmpr> {
      int index;
      String suff;
      // Constructor
      public SuffixCmpr(int index, String suff) {
         this.index = index;
         this.suff = suff;
      }
      // to sort suffixes alphabetically
      public int compareTo(SuffixCmpr other) {
         return this.suff.compareTo(other.suff);
      }
   }
   // method to build a suffix array
   public static int[] fillsuffixArray(String s) {
      int n = s.length();
      SuffixCmpr[] suffixes = new SuffixCmpr[n];
      // Create and sort suffixes
      for (int i = 0; i < n; i++) {
         suffixes[i] = new SuffixCmpr(i, s.substring(i));
      }
      Arrays.sort(suffixes);
      // Store indexes of all sorted suffixes
      int[] fillsuffixArray = new int[n];
      for (int i = 0; i < n; i++) {
         fillsuffixArray[i] = suffixes[i].index;
      }
      return fillsuffixArray;
   }
   // method to search a pattern in a text using suffix array
   public static void suffixArraySearch(String pattern, String txt, int[] suffArr) {
      int n = txt.length();
      int m = pattern.length();
      // binary search for the pattern in the text using suffix array
      int l = 0, r = n - 1;
      while (l <= r) {
         int mid = l + (r - l) / 2;
         int res = pattern.compareTo(txt.substring(suffArr[mid], Math.min(suffArr[mid] + m, n)));
         if (res == 0) {
            System.out.println("Pattern found at index: " + suffArr[mid]);
            // Move to previous suffix in the sorted array
            int temp = mid - 1;
            while (temp >= 0 && txt.substring(suffArr[temp], Math.min(suffArr[temp] + m, n)).equals(pattern)) {
                System.out.println("Pattern found at index: " + suffArr[temp]);
                temp--;
            }
            // Move to next suffix in the sorted array
            temp = mid + 1;
            while (temp < n && txt.substring(suffArr[temp], Math.min(suffArr[temp] + m, n)).equals(pattern)) {
                System.out.println("Pattern found at index: " + suffArr[temp]);
                temp++;
            }
            return;
         }
         if (res < 0) r = mid - 1;
         else l = mid + 1;
      }
      System.out.println("Pattern not found");
   }
   public static void main(String[] args) {
      String txt = "AAAABCAEAAABCBDDAAAABC";
      String pattern = "AAABC";
      // Filling the suffix array
      int[] suffArr = fillsuffixArray(txt);
      // Calling method to Search pattern in suffix array
      suffixArraySearch(pattern, txt, suffArr);
   }
}
def fill_suffix_array(txt):
    # Array of tuples, each tuple stores the index and suffix
    suffixes = [(i, txt[i:]) for i in range(len(txt))]
    # Sort the suffixes
    suffixes.sort(key=lambda x: x[1])
    # Return the list of indices after sorting
    return [suff[0] for suff in suffixes]

def suffixArraySearch(pat, txt, suffix_arr):
    n = len(txt)
    m = len(pat)
    # Iterate over the suffix array
    for i in range(n):
        if txt[suffix_arr[i]:min(suffix_arr[i] + m, n)] == pat:
            print(f"Pattern found at index: {suffix_arr[i]}")

def main():
    txt = "AAAABCAEAAABCBDDAAAABC"
    pat = "AAABC"
    suffix_arr = fill_suffix_array(txt)
    suffixArraySearch(pat, txt, suffix_arr)

if __name__ == "__main__":
    main()

Output

Pattern found at index: 8
Pattern found at index: 1
Pattern found at index: 17

Complexity of Suffix Array

The advantage of using a suffix array for pattern matching is that it requires only O(m log n) time, where m is the length of the pattern and n is the length of the string. The disadvantage is that it requires O(n log n) time and O(n) space to construct the suffix array in the first place.

Advertisements