programming-examples/c++/Basic/SuffixArrayLcp.cpp
2019-11-18 14:44:36 +01:00

163 lines
5.2 KiB
C++

#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>
using namespace std;
// Single-bit masks, most significant bit first: mask[k] selects bit k of a byte.
// Read-only lookup table, hence const.
const unsigned char mask[] = { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };

// The helpers below are macros on purpose: they read variables that must be
// in scope at the point of use -- `t` (packed bit array), `s` (the text) and
// `cs` (size of one character in bytes). Every macro parameter is
// parenthesized so that compound arguments such as `a ? b : c` expand safely.
// Indices are expected to be non-negative.

// tget(i): value (0 or 1) of bit i in the packed array t.
// SA_IS stores 1 for an S-type suffix and 0 for an L-type suffix.
#define tget(i) ( (t[(i)/8] & mask[(i)%8]) ? 1 : 0 )

// tset(i, b): set bit i of t when b is non-zero, clear it otherwise.
// Wrapped in parentheses so the whole macro is a single expression.
#define tset(i, b) ( t[(i)/8] = (b) ? (mask[(i)%8] | t[(i)/8]) : ((~mask[(i)%8]) & t[(i)/8]) )

// chr(i): character i of s, read as an int when cs == sizeof(int)
// and as an unsigned char otherwise.
#define chr(i) ( cs == sizeof(int) ? ((int*)s)[(i)] : ((unsigned char *)s)[(i)] )

// isLMS(i): true when i > 0, bit i of t is set and bit i-1 is clear,
// i.e. position i is a left-most S-type position.
#define isLMS(i) ( (i) > 0 && tget(i) && !tget((i)-1) )
// Compute bucket boundaries for the text s[0..n-1] into bkt[0..K].
//   s   - text; characters are fetched through chr(), so they are ints when
//         cs == sizeof(int) and unsigned chars otherwise
//   bkt - output array with K + 1 entries, indexed by character value
//   n   - number of characters in s
//   K   - largest index written in bkt
//   cs  - size of one character in bytes (consumed by chr())
//   end - true: bkt[c] becomes the position one past the last slot of bucket c
//         false: bkt[c] becomes the first slot of bucket c
void getBuckets(unsigned char *s, int *bkt, int n, int K, int cs, bool end) {
    // Start from empty buckets.
    for (int c = 0; c <= K; ++c) {
        bkt[c] = 0;
    }

    // Histogram: how many times does each character occur?
    for (int pos = 0; pos < n; ++pos) {
        ++bkt[chr(pos)];
    }

    // Turn the counts into running offsets.
    int running = 0;
    for (int c = 0; c <= K; ++c) {
        const int first = running;  // offset before this bucket's characters
        running += bkt[c];          // offset just past this bucket
        if (end) {
            bkt[c] = running;
        } else {
            bkt[c] = first;
        }
    }
}
// Left-to-right induction pass over SA (the "SAl" step).
// For every entry SA[i] whose predecessor position SA[i] - 1 exists and has
// its type bit clear in t, the predecessor is appended at the current front
// of the bucket of its character.
//   t   - packed type bits, read through tget()
//   SA  - suffix array under construction; read and written in place
//   s   - text, read through chr()
//   bkt - scratch bucket array with K + 1 entries, rebuilt here
//   end - forwarded to getBuckets(); the callers in this file pass false so
//         that bkt holds bucket starts
void induceSAl(unsigned char *t, int *SA, unsigned char *s, int *bkt, int n, int K, int cs, bool end) {
    getBuckets(s, bkt, n, K, cs, end);

    for (int idx = 0; idx < n; ++idx) {
        const int pred = SA[idx] - 1;
        // Skip empty slots / position 0 (no predecessor) and set type bits.
        if (pred < 0 || tget(pred)) {
            continue;
        }
        const int c = chr(pred);
        SA[bkt[c]] = pred;  // take the next free slot at the bucket front
        ++bkt[c];
    }
}
// Right-to-left induction pass over SA (the "SAs" step).
// For every entry SA[i] whose predecessor position SA[i] - 1 exists and has
// its type bit set in t, the predecessor is placed at the current back of the
// bucket of its character.
//   t   - packed type bits, read through tget()
//   SA  - suffix array under construction; read and written in place
//   s   - text, read through chr()
//   bkt - scratch bucket array with K + 1 entries, rebuilt here
//   end - forwarded to getBuckets(); the callers in this file pass true so
//         that bkt holds bucket ends
void induceSAs(unsigned char *t, int *SA, unsigned char *s, int *bkt, int n, int K, int cs, bool end) {
    getBuckets(s, bkt, n, K, cs, end);

    int idx = n;
    while (idx-- > 0) {  // visits n-1, n-2, ..., 0
        const int pred = SA[idx] - 1;
        if (pred >= 0 && tget(pred)) {
            const int c = chr(pred);
            --bkt[c];           // claim the last free slot of the bucket
            SA[bkt[c]] = pred;
        }
    }
}
// Build the suffix array SA of s[0..n-1] with the SA-IS induced-sorting scheme.
//   s  - text; read through chr(), i.e. as ints when cs == sizeof(int) and as
//        unsigned chars otherwise. s[n-1] must be 0 (the sentinel).
//   SA - output array of n ints; it is also used as working storage (the
//        reduced string and its suffix array live inside it).
//   n  - number of characters including the sentinel. n == 1 (sentinel only)
//        yields SA[0] = 0; n <= 0 is a no-op.
//   K  - largest character value; bucket arrays get K + 1 entries.
//   cs - size of one character in bytes.
// The original note states a working space (excluding s and SA) of at most
// 2.25n+O(1) for a constant alphabet.
// Scratch storage is held in std::vector, so it is zero-initialised, released
// on every exit path, and an allocation failure surfaces as std::bad_alloc
// instead of a null-pointer dereference.
void SA_IS(unsigned char *s, int *SA, int n, int K, int cs) {
    // Degenerate inputs: the classification below touches positions n-2 and
    // n-1, which do not both exist for n < 2.
    if (n <= 0)
        return;
    if (n == 1) {
        SA[0] = 0;  // the sentinel is the only suffix
        return;
    }

    int i, j;
    std::vector<unsigned char> typeBits(n / 8 + 1, 0);  // LS-type array in bits
    unsigned char *t = typeBits.data();  // tget/tset/isLMS operate on `t`

    // Classify the type of each character (bit 1 = S-type, bit 0 = L-type).
    tset(n-2, 0);
    tset(n-1, 1);  // the sentinel must be in s1, important!!!
    for (i = n - 3; i >= 0; i--)
        tset(i, (chr(i)<chr(i+1) || (chr(i)==chr(i+1) && tget(i+1)==1))?1:0);

    // stage 1: reduce the problem by at least 1/2
    // sort all the S-substrings
    {
        std::vector<int> bucket(K + 1);  // bucket array
        int *bkt = bucket.data();
        getBuckets(s, bkt, n, K, cs, true);  // find ends of buckets
        for (i = 0; i < n; i++)
            SA[i] = -1;
        for (i = 1; i < n; i++)
            if (isLMS(i))
                SA[--bkt[chr(i)]] = i;
        induceSAl(t, SA, s, bkt, n, K, cs, false);
        induceSAs(t, SA, s, bkt, n, K, cs, true);
    }  // bucket array released here, before any recursion

    // compact all the sorted substrings into the first n1 items of SA
    // 2*n1 must be not larger than n (provable)
    int n1 = 0;
    for (i = 0; i < n; i++)
        if (isLMS(SA[i]))
            SA[n1++] = SA[i];

    // find the lexicographic names of all substrings
    for (i = n1; i < n; i++)
        SA[i] = -1;  // init the name array buffer
    int name = 0, prev = -1;
    for (i = 0; i < n1; i++) {
        int pos = SA[i];
        bool diff = false;
        // Compare the LMS substring at pos with the previously named one,
        // character by character and type by type.
        for (int d = 0; d < n; d++)
            if (prev == -1 || chr(pos+d) != chr(prev+d) || tget(pos+d) != tget(prev+d)) {
                diff = true;
                break;
            } else if (d > 0 && (isLMS(pos+d) || isLMS(prev+d)))
                break;  // reached the end of a substring without a difference
        if (diff) {
            name++;
            prev = pos;
        }
        // pos > 0 for every LMS position, so pos / 2 is the floor of the half;
        // it gives each substring its own slot in the name buffer.
        pos = pos / 2;
        SA[n1 + pos] = name - 1;
    }
    // Pack the assigned names to the end of SA, keeping their text order.
    for (i = n - 1, j = n - 1; i >= n1; i--)
        if (SA[i] >= 0)
            SA[j--] = SA[i];

    // stage 2: solve the reduced problem
    // recurse if names are not yet unique
    int *SA1 = SA, *s1 = SA + n - n1;
    if (name < n1)
        SA_IS((unsigned char*) s1, SA1, n1, name - 1, sizeof(int));
    else
        // generate the suffix array of s1 directly
        for (i = 0; i < n1; i++)
            SA1[s1[i]] = i;

    // stage 3: induce the result for the original problem
    std::vector<int> bucket(K + 1);  // bucket array
    int *bkt = bucket.data();
    // put all left-most S characters into their buckets
    getBuckets(s, bkt, n, K, cs, true);  // find ends of buckets
    for (i = 1, j = 0; i < n; i++)
        if (isLMS(i))
            s1[j++] = i;  // get p1
    for (i = 0; i < n1; i++)
        SA1[i] = s1[SA1[i]];  // get index in s
    for (i = n1; i < n; i++)
        SA[i] = -1;  // init SA[n1..n-1]
    // Walk backwards so each sorted LMS suffix lands at the end of its bucket
    // without overwriting an entry that has not been moved yet.
    for (i = n1 - 1; i >= 0; i--) {
        j = SA[i];
        SA[i] = -1;
        SA[--bkt[chr(j)]] = j;
    }
    induceSAl(t, SA, s, bkt, n, K, cs, false);
    induceSAs(t, SA, s, bkt, n, K, cs, true);
}
// Capacity of the global work arrays; the text length plus one (for the
// sentinel) must not exceed it.
const int maxn = 200000;
int sa[maxn];    // suffix array of s[0..n], sentinel suffix included
int lcp[maxn];   // lcp[k] = longest common prefix of suffixes sa[k] and sa[k+1]
int Rank[maxn];  // inverse of sa: Rank[sa[k]] == k
unsigned char *s;  // text; s[n] is the terminating 0 used as sentinel
int n;             // text length, NOT counting the sentinel

// Fill lcp[0..n-1] from s and sa (Kasai-style linear scan).
// sa must hold all n + 1 suffixes of s[0..n], as produced by
// SA_IS(s, sa, n + 1, ...), and s[n] must be a 0 byte that occurs nowhere
// else in the text: that unique sentinel is what stops the character
// comparison loop.
// lcp[k] is the length of the common prefix of the suffixes at sa[k] and
// sa[k + 1]; the last suffix in sa has no successor and gets no entry.
void calc_lcp() {
    // The suffix array contains the sentinel suffix as well, so there are
    // n + 1 entries to rank and visit, not n.
    const int total = n + 1;
    for (int i = 0; i < total; i++)
        Rank[sa[i]] = i;
    // h carries over between consecutive text positions: dropping the first
    // character of a suffix shortens its LCP with its successor by at most 1.
    for (int i = 0, h = 0; i < total; i++) {
        if (Rank[i] < total - 1) {
            for (int j = sa[Rank[i] + 1]; s[i + h] == s[j + h]; ++h)
                ;
            lcp[Rank[i]] = h;
            if (h > 0)
                --h;
        }
    }
}
// Demo driver: builds the suffix array and LCP values of a fixed sample
// string and prints every suffix in sorted order, followed (for all but the
// last line) by the LCP it shares with the next suffix.
int main() {
    std::string text("abcab");
    n = text.size();
    s = (unsigned char*) text.c_str();

    // n + 1 characters: the string's terminating '\0' serves as the sentinel.
    SA_IS(s, sa, n + 1, 256, 1);
    calc_lcp();

    // sa[0] is the sentinel-only suffix, so the real suffixes are sa[1..n].
    for (int rank = 1; rank <= n; ++rank) {
        std::cout << text.substr(sa[rank]);
        if (rank != n) {
            std::cout << " " << lcp[rank];
        }
        std::cout << std::endl;
    }
    return 0;
}