/*
 * This is a Java program to implement a suffix tree. A suffix tree is a
 * compressed trie containing all the suffixes of the given text as their keys
 * and positions in the text as their values. A suffix tree allows a
 * particularly fast implementation of many important string operations. The
 * construction of such a tree for the string S takes time and space linear in
 * the length of S. Once constructed, several operations can be performed
 * quickly, for instance locating a substring in S, locating a substring if a
 * certain number of mistakes are allowed, locating matches for a regular
 * expression pattern, etc. Suffix trees also provided one of the first
 * linear-time solutions for the longest common substring problem. These
 * speedups come at a cost: storing a string’s suffix tree typically requires
 * significantly more space than storing the string itself. This program is
 * based on Mark Nelson’s implementation of Ukkonen’s algorithm.
 */

/*
 * Java Program to Implement Suffix Tree
 */

import java.io.*;

/** Class Node **/
/**
 * A suffix tree node. The only per-node state is the suffix link, stored as
 * an index into the owning tree's node array.
 */
class Node {
    /** Index of the node this node's suffix link points to, or -1 while no link is set. */
    public int suffix_node;

    /**
     * Next node number, kept for backward compatibility. SuffixTree numbers
     * its nodes per tree instance and only mirrors its own counter into this
     * field after each allocation, so building several trees no longer makes
     * their node numbers run on from one another.
     */
    public static int Count = 1;

    /** Creates a node without a suffix link. */
    public Node() {
        suffix_node = -1;
    }
}

/**
 * Suffix tree over a text of at most MAX_LENGTH characters.
 *
 * Nodes are identified by integers (0 is the root) and edges are stored in an
 * open-addressing hash table with linear probing, keyed by the edge's start
 * node and the first character of its label. Edge labels are kept as index
 * ranges into the text T.
 */
class SuffixTree {
    private static final int MAX_LENGTH = 1000;
    private static final int HASH_TABLE_SIZE = 2179;

    /** The indexed text. */
    private char[] T = new char[MAX_LENGTH];
    /** Index of the last character of T (length - 1). */
    private int N;
    /** Edge hash table; a slot whose start_node is -1 is empty. */
    private Edge[] Edges;
    /** Node table, indexed by node number. */
    private Node[] Nodes;

    /** The active point used while prefixes are being added. */
    private Suffix active;

    /** Number the next edge created in THIS tree gives to its end node; 0 is the root. */
    private int nodeCount = 1;

    /**
     * A position in the tree: start at origin_node and follow the characters
     * T[first_char_index .. last_char_index]. When that range is empty the
     * position is exactly on origin_node.
     */
    class Suffix {
        public int origin_node;
        public int first_char_index;
        public int last_char_index;

        /**
         * @param node  node the position is measured from
         * @param start index in T of the first character below node
         * @param stop  index in T of the last character below node
         */
        public Suffix(int node, int start, int stop) {
            origin_node = node;
            first_char_index = start;
            last_char_index = stop;
        }

        /**
         * @return true when at least one character remains below origin_node,
         *         i.e. the position is not exactly on a node; always the
         *         opposite of {@link #Explicit()}
         */
        public boolean Implicit() {
            return last_char_index >= first_char_index;
        }

        /** @return true when the character range is empty, i.e. the position is on origin_node */
        public boolean Explicit() {
            return first_char_index > last_char_index;
        }

        /**
         * Moves origin_node down the tree as far as the character range
         * allows: while the range covers the whole edge leaving origin_node,
         * that edge is consumed and its end node becomes the new origin_node.
         * Does nothing when the position is already on a node.
         */
        public void Canonize() {
            if (Explicit()) {
                return;
            }
            Edge edge = Find(origin_node, T[first_char_index]);
            int edge_span = edge.last_char_index - edge.first_char_index;
            while (edge_span <= (last_char_index - first_char_index)) {
                first_char_index = first_char_index + edge_span + 1;
                origin_node = edge.end_node;
                // Only look up the next edge while characters remain; once the
                // range is empty the loop condition fails on the stale span.
                if (first_char_index <= last_char_index) {
                    edge = Find(edge.end_node, T[first_char_index]);
                    edge_span = edge.last_char_index - edge.first_char_index;
                }
            }
        }
    }

    /**
     * An edge from start_node to end_node labelled
     * T[first_char_index .. last_char_index]. An edge whose start_node is -1
     * represents an empty hash table slot.
     */
    class Edge {
        public int first_char_index;
        public int last_char_index;
        public int end_node;
        public int start_node;

        /** Creates an empty-slot marker (start_node is -1). */
        public Edge() {
            start_node = -1;
        }

        /**
         * Creates an edge and allocates a new node number for its end.
         *
         * @param init_first  index in T of the first label character
         * @param init_last   index in T of the last label character
         * @param parent_node node the edge starts at
         */
        public Edge(int init_first, int init_last, int parent_node) {
            first_char_index = init_first;
            last_char_index = init_last;
            start_node = parent_node;
            end_node = nodeCount++;
            // Keep the legacy global counter in step for code that reads it.
            Node.Count = nodeCount;
        }

        /**
         * Stores this edge in the hash table. On a collision the table is
         * probed linearly (with wrap-around) for the first empty slot.
         */
        public void Insert() {
            int i = Hash(start_node, T[first_char_index]);
            while (Edges[i].start_node != -1) {
                i = (i + 1) % HASH_TABLE_SIZE;
            }
            Edges[i] = this;
        }

        /**
         * Splits this edge at the point described by s. A new edge carrying
         * the first part of the label is inserted from s.origin_node to a new
         * node; this edge keeps the remainder and is re-inserted starting at
         * that new node. The new node's suffix link is initialised to
         * s.origin_node.
         *
         * @param s the position inside this edge at which to split
         * @return the number of the newly created node
         */
        public int SplitEdge(Suffix s) {
            // The hash key (start node, first char) is about to change, so the
            // edge has to leave the table before it is modified.
            Remove();
            int prefix_span = s.last_char_index - s.first_char_index;
            Edge new_edge = new Edge(first_char_index, first_char_index + prefix_span, s.origin_node);
            new_edge.Insert();
            Nodes[new_edge.end_node].suffix_node = s.origin_node;
            first_char_index += prefix_span + 1;
            start_node = new_edge.end_node;
            Insert();
            return new_edge.end_node;
        }

        /**
         * Removes this edge from the hash table and closes the gap it leaves
         * in its probe sequence, so that later entries stay reachable.
         *
         * Slots are vacated by storing a fresh empty Edge rather than by
         * clearing start_node on the object in the slot: table slots hold
         * references, so clearing the object would also blank an edge that
         * has just been relocated to an earlier slot (and this edge itself).
         */
        public void Remove() {
            int i = Hash(start_node, T[first_char_index]);
            while (Edges[i].start_node != start_node
                    || Edges[i].first_char_index != first_char_index) {
                i = (i + 1) % HASH_TABLE_SIZE;
            }
            for ( ; ; ) {
                Edges[i] = new Edge();
                int j = i; // j is the hole that may need filling
                for ( ; ; ) {
                    i = (i + 1) % HASH_TABLE_SIZE;
                    if (Edges[i].start_node == -1) {
                        return; // end of the probe cluster: nothing left to move
                    }
                    int r = Hash(Edges[i].start_node, T[Edges[i].first_char_index]);
                    // An entry whose home slot r lies cyclically in (j, i] is
                    // still reachable from its home and stays where it is.
                    if (i >= r && r > j) {
                        continue;
                    }
                    if (r > j && j > i) {
                        continue;
                    }
                    if (j > i && i >= r) {
                        continue;
                    }
                    break;
                }
                // Entry i probes through the hole: move it back and vacate slot i next.
                Edges[j] = Edges[i];
            }
        }
    }

    /** Creates an empty tree: all hash slots empty, no suffix links, active point on the root. */
    public SuffixTree() {
        Edges = new Edge[HASH_TABLE_SIZE];
        for (int i = 0; i < HASH_TABLE_SIZE; i++) {
            Edges[i] = new Edge();
        }
        Nodes = new Node[MAX_LENGTH * 2];
        for (int i = 0; i < MAX_LENGTH * 2; i++) {
            Nodes[i] = new Node();
        }
        active = new Suffix(0, 0, -1);
    }

    /**
     * Looks up the edge leaving node whose label starts with c.
     *
     * @param node start node of the wanted edge
     * @param c    first label character of the wanted edge
     * @return the matching edge, or the empty slot (start_node == -1) that
     *         ended the probe when there is no such edge
     */
    public Edge Find(int node, int c) {
        int i = Hash(node, c);
        for ( ; ; ) {
            Edge candidate = Edges[i];
            if (candidate.start_node == node && c == T[candidate.first_char_index]) {
                return candidate;
            }
            if (candidate.start_node == -1) {
                return candidate;
            }
            i = (i + 1) % HASH_TABLE_SIZE;
        }
    }

    /**
     * Hash function of the edge table.
     *
     * @param node start node of the edge
     * @param c    first label character of the edge
     * @return the home slot for that key
     */
    public static int Hash(int node, int c) {
        return ((node << 8) + c) % HASH_TABLE_SIZE;
    }

    /**
     * Extends the tree by the character T[last_char_index]; called once per
     * text position, in increasing order.
     *
     * @param active          the active point, updated in place
     * @param last_char_index index in T of the character being added
     */
    public void AddPrefix(Suffix active, int last_char_index) {
        int parent_node;
        int last_parent_node = -1;
        for ( ; ; ) {
            Edge edge;
            parent_node = active.origin_node;
            if (active.Explicit()) {
                // On a node: done as soon as an edge for the new character exists.
                edge = Find(active.origin_node, T[last_char_index]);
                if (edge.start_node != -1) {
                    break;
                }
            } else {
                // Inside an edge: done if the next label character already
                // matches, otherwise split the edge to get a node to branch from.
                edge = Find(active.origin_node, T[active.first_char_index]);
                int span = active.last_char_index - active.first_char_index;
                if (T[edge.first_char_index + span + 1] == T[last_char_index]) {
                    break;
                }
                parent_node = edge.SplitEdge(active);
            }
            Edge new_edge = new Edge(last_char_index, N, parent_node);
            new_edge.Insert();
            if (last_parent_node > 0) {
                Nodes[last_parent_node].suffix_node = parent_node;
            }
            last_parent_node = parent_node;
            // Step to the next shorter suffix.
            if (active.origin_node == 0) {
                active.first_char_index++;
            } else {
                active.origin_node = Nodes[active.origin_node].suffix_node;
            }
            active.Canonize();
        }
        if (last_parent_node > 0) {
            Nodes[last_parent_node].suffix_node = parent_node;
        }
        active.last_char_index++;
        active.Canonize();
    }

    /**
     * Prints every edge in hash table order: start node, end node, suffix
     * link of the end node, label index range and the label text.
     *
     * @param current_n index of the last text character to print; labels
     *                  reaching further are cut off there
     */
    public void dump_edges(int current_n) {
        System.out.println(" Start End Suf First Last String\n");
        for (int j = 0; j < HASH_TABLE_SIZE; j++) {
            Edge s = Edges[j];
            if (s.start_node == -1) {
                continue;
            }
            System.out.printf("%5d %5d %3d %5d %6d ", s.start_node, s.end_node, Nodes[s.end_node].suffix_node, s.first_char_index, s.last_char_index);
            int top = Math.min(current_n, s.last_char_index);
            for (int l = s.first_char_index; l <= top; l++) {
                System.out.print(T[l]);
            }
            System.out.println();
        }
    }

    /**
     * Indexes text in this tree. Meant to be called once, on a freshly
     * constructed tree.
     *
     * @param text the text to index
     * @throws IllegalArgumentException if text is longer than MAX_LENGTH; the
     *         node array and the edge hash table are sized for that limit, and
     *         a full hash table would make the probing loops run forever
     */
    private void build(String text) {
        if (text.length() > MAX_LENGTH) {
            throw new IllegalArgumentException("Input has " + text.length()
                    + " characters; at most " + MAX_LENGTH + " are supported");
        }
        T = text.toCharArray();
        N = T.length - 1;
        for (int i = 0; i <= N; i++) {
            AddPrefix(active, i);
        }
    }

    /**
     * Reads one line from standard input, builds its suffix tree and prints
     * the edges. End of input is treated as an empty line; an over-long line
     * is reported on standard error and nothing is built.
     *
     * @param args unused
     * @throws IOException if reading standard input fails
     */
    public static void main(String[] args) throws IOException {
        BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
        System.out.println("Suffix Tree Test\n");
        System.out.println("Enter string\n");
        String str = br.readLine();
        if (str == null) {
            str = ""; // readLine() returns null at end of input
        }

        SuffixTree st = new SuffixTree();
        try {
            st.build(str);
        } catch (IllegalArgumentException e) {
            System.err.println(e.getMessage());
            return;
        }
        st.dump_edges(st.N);
    }
}

/*

Suffix Tree Test

Enter string

 Start End Suf First Last String

    0     2  -1     1      9 anfoundry
    0     9  -1     7      9 dry
    0     4  -1     3      9 foundry
    0     7   0     2      2 n
    0     5  -1     4      9 oundry
    0    10  -1     8      9 ry
    0     1  -1     0      9 sanfoundry
    0     6  -1     5      9 undry
    0    11  -1     9      9 y
    7     8  -1     7      9 dry
    7     3  -1     3      9 foundry