programming-examples/java/Data_Structures/SuffixArray.java

113 lines
3.8 KiB
Java
Raw Normal View History

2019-11-15 12:59:38 +01:00
import java.util.*;
import java.util.stream.IntStream;
// https://en.wikipedia.org/wiki/Suffix_array
public class SuffixArray {
// sort suffixes of S in O(n*log(n))
public static int[] suffixArray(CharSequence S) {
int n = S.length();
// stable sort of characters
int[] sa = IntStream.range(0, n).mapToObj(i -> n - 1 - i).
sorted((a, b) -> Character.compare(S.charAt(a), S.charAt(b))).mapToInt(Integer::intValue).toArray();
int[] classes = S.chars().toArray();
// sa[i] - suffix on i'th position after sorting by first len characters
// classes[i] - equivalence class of the i'th suffix after sorting by first len characters
for (int len = 1; len < n; len *= 2) {
int[] c = classes.clone();
for (int i = 0; i < n; i++) {
// condition sa[i - 1] + len < n simulates 0-symbol at the end of the string
// a separate class is created for each suffix followed by simulated 0-symbol
classes[sa[i]] = i > 0 && c[sa[i - 1]] == c[sa[i]] && sa[i - 1] + len < n && c[sa[i - 1] + len / 2] == c[sa[i] + len / 2] ? classes[sa[i - 1]] : i;
}
// Suffixes are already sorted by first len characters
// Now sort suffixes by first len * 2 characters
int[] cnt = IntStream.range(0, n).toArray();
int[] s = sa.clone();
for (int i = 0; i < n; i++) {
// s[i] - order of suffixes sorted by first len characters
// (s[i] - len) - order of suffixes sorted only by second len characters
int s1 = s[i] - len;
// sort only suffixes of length > len, others are already sorted
if (s1 >= 0)
sa[cnt[classes[s1]]++] = s1;
}
}
return sa;
}
// sort rotations of S in O(n*log(n))
public static int[] rotationArray(CharSequence S) {
int n = S.length();
int[] sa = IntStream.range(0, n).mapToObj(Integer::valueOf).
sorted((a, b) -> Character.compare(S.charAt(a), S.charAt(b))).mapToInt(Integer::intValue).toArray();
int[] classes = S.chars().toArray();
for (int len = 1; len < n; len *= 2) {
int[] c = classes.clone();
for (int i = 0; i < n; i++)
classes[sa[i]] = i > 0 && c[sa[i - 1]] == c[sa[i]] && c[(sa[i - 1] + len / 2) % n] == c[(sa[i] + len / 2) % n] ? classes[sa[i - 1]] : i;
int[] cnt = IntStream.range(0, n).toArray();
int[] s = sa.clone();
for (int i = 0; i < n; i++) {
int s1 = (s[i] - len + n) % n;
sa[cnt[classes[s1]]++] = s1;
}
}
return sa;
}
// longest common prefixes array in O(n)
public static int[] lcp(int[] sa, CharSequence s) {
int n = sa.length;
int[] rank = new int[n];
for (int i = 0; i < n; i++)
rank[sa[i]] = i;
int[] lcp = new int[n - 1];
for (int i = 0, h = 0; i < n; i++) {
if (rank[i] < n - 1) {
for (int j = sa[rank[i] + 1]; Math.max(i, j) + h < s.length() && s.charAt(i + h) == s.charAt(j + h); ++h)
;
lcp[rank[i]] = h;
if (h > 0)
--h;
}
}
return lcp;
}
// Usage example
public static void main(String[] args) {
String s1 = "abcab";
int[] sa1 = suffixArray(s1);
// print suffixes in lexicographic order
for (int p : sa1)
System.out.println(s1.substring(p));
System.out.println("lcp = " + Arrays.toString(lcp(sa1, s1)));
// random test
Random rnd = new Random(1);
for (int step = 0; step < 100000; step++) {
int n = rnd.nextInt(100) + 1;
StringBuilder s = rnd.ints(n, 0, 10).collect(StringBuilder::new, (sb, i) -> sb.append((char) ('\1' + i)), StringBuilder::append);
int[] sa = suffixArray(s);
int[] ra = rotationArray(s.toString() + '\0');
int[] lcp = lcp(sa, s);
for (int i = 0; i + 1 < n; i++) {
String a = s.substring(sa[i]);
String b = s.substring(sa[i + 1]);
if (a.compareTo(b) >= 0
|| !a.substring(0, lcp[i]).equals(b.substring(0, lcp[i]))
|| (a + " ").charAt(lcp[i]) == (b + " ").charAt(lcp[i])
|| sa[i] != ra[i + 1])
throw new RuntimeException();
}
}
System.out.println("Test passed");
}
}