programming-examples/java/Graph_Problems_Algorithms/Java Program to Implement Suffix Tree.java

304 lines
11 KiB
Java
Raw Normal View History

2019-11-15 12:59:38 +01:00
/*This is a Java Program to implement Suffix Tree. A suffix tree is a compressed trie containing all the suffixes of the given text as their keys and positions in the text as their values. Suffix tree allows a particularly fast implementation of many important string operations. The construction of such a tree for the string S takes time and space linear in the length of S. Once constructed, several operations can be performed quickly, for instance locating a substring in S, locating a substring if a certain number of mistakes are allowed, locating matches for a regular expression pattern etc. Suffix trees also provided one of the first linear-time solutions for the longest common substring problem. These speedups come at a cost: storing a strings suffix tree typically requires significantly more space than storing the string itself. This program is based on Mark Nelsons implementation of Ukkonens algorithm.*/
/*
* Java Program to Implement Suffix Tree
*/
import java.io.*;
/** Class Node **/
class Node
{
public int suffix_node;
public static int Count = 1;
/** Constructor **/
public Node()
{
suffix_node = -1;
}
}
/** Class Suffix Tree **/
class SuffixTree
{
private static final int MAX_LENGTH = 1000;
private static final int HASH_TABLE_SIZE = 2179;
private char[] T = new char[ MAX_LENGTH ];
private int N;
private Edge[] Edges ;
private Node[] Nodes ;
private Suffix active;
/** Class Suffix **/
class Suffix
{
public int origin_node;
public int first_char_index;
public int last_char_index;
/** Constructor **/
public Suffix(int node, int start, int stop )
{
origin_node = node ;
first_char_index = start ;
last_char_index = stop;
}
/** Function Implicit **/
public boolean Implicit()
{
return first_char_index > last_char_index;
}
/** Function Explicit **/
public boolean Explicit()
{
return first_char_index > last_char_index;
}
/** Function Canonize()
* A suffix in the tree is denoted by a Suffix structure
* that denotes its last character. The canonical
* representation of a suffix for this algorithm requires
* that the origin_node by the closest node to the end
* of the tree. To force this to be true, we have to
* slide down every edge in our current path until we
* reach the final node
**/
public void Canonize()
{
if (!Explicit() )
{
Edge edge = Find( origin_node, T[ first_char_index ] );
int edge_span = edge.last_char_index - edge.first_char_index;
while ( edge_span <= ( last_char_index - first_char_index ) )
{
first_char_index = first_char_index + edge_span + 1;
origin_node = edge.end_node;
if ( first_char_index <= last_char_index )
{
edge = Find( edge.end_node, T[ first_char_index ] );
edge_span = edge.last_char_index - edge.first_char_index;
}
}
}
}
}
/** Class Edge **/
class Edge
{
public int first_char_index;
public int last_char_index;
public int end_node;
public int start_node;
/** Constructor **/
public Edge()
{
start_node = -1;
}
/** Constructor **/
public Edge( int init_first, int init_last, int parent_node )
{
first_char_index = init_first;
last_char_index = init_last;
start_node = parent_node;
end_node = Node.Count++;
}
/** function Insert ()
* A given edge gets a copy of itself inserted into the table
* with this function. It uses a linear probe technique, which
* means in the case of a collision, we just step forward through
* the table until we find the first unused slot.
**/
public void Insert()
{
int i = Hash( start_node, T[ first_char_index ] );
while ( Edges[ i ].start_node != -1 )
i = ++i % HASH_TABLE_SIZE;
Edges[ i ] = this;
}
/** function SplitEdge ()
* This function is called
* to split an edge at the point defined by the Suffix argument
**/
public int SplitEdge( Suffix s )
{
Remove();
Edge new_edge = new Edge( first_char_index, first_char_index + s.last_char_index - s.first_char_index, s.origin_node );
new_edge.Insert();
Nodes[ new_edge.end_node ].suffix_node = s.origin_node;
first_char_index += s.last_char_index - s.first_char_index + 1;
start_node = new_edge.end_node;
Insert();
return new_edge.end_node;
}
/** function Remove ()
* This function is called to remove an edge from hash table
**/
public void Remove()
{
int i = Hash( start_node, T[ first_char_index ] );
while ( Edges[ i ].start_node != start_node ||
Edges[ i ].first_char_index != first_char_index )
i = ++i % HASH_TABLE_SIZE;
for ( ; ; )
{
Edges[ i ].start_node = -1;
int j = i;
for ( ; ; )
{
i = ++i % HASH_TABLE_SIZE;
if ( Edges[ i ].start_node == -1 )
return;
int r = Hash( Edges[ i ].start_node, T[ Edges[ i ].first_char_index ] );
if ( i >= r && r > j )
continue;
if ( r > j && j > i )
continue;
if ( j > i && i >= r )
continue;
break;
}
Edges[ j ] = Edges[ i ];
}
}
}
/** Constructor */
public SuffixTree()
{
Edges = new Edge[ HASH_TABLE_SIZE ];
for (int i = 0; i < HASH_TABLE_SIZE; i++)
Edges[i] = new Edge();
Nodes = new Node[ MAX_LENGTH * 2 ];
for (int i = 0; i < MAX_LENGTH * 2 ; i++)
Nodes[i] = new Node();
active = new Suffix( 0, 0, -1 );
}
/** Function Find() - function to find an edge **/
public Edge Find( int node, int c )
{
int i = Hash( node, c );
for ( ; ; )
{
if ( Edges[ i ].start_node == node )
if ( c == T[ Edges[ i ].first_char_index ] )
return Edges[ i ];
if ( Edges[ i ].start_node == -1 )
return Edges[ i ];
i = ++i % HASH_TABLE_SIZE;
}
}
/** Function Hash() - edges are inserted into the hash table using this hashing function **/
public static int Hash( int node, int c )
{
return (( node << 8 ) + c ) % HASH_TABLE_SIZE;
}
/** Function AddPrefix() - called repetitively, once for each of the prefixes of the input string **/
public void AddPrefix( Suffix active, int last_char_index )
{
int parent_node;
int last_parent_node = -1;
for ( ; ; )
{
Edge edge;
parent_node = active.origin_node;
if ( active.Explicit() )
{
edge = Find( active.origin_node, T[ last_char_index ] );
if ( edge.start_node != -1 )
break;
}
else
{
edge = Find( active.origin_node, T[ active.first_char_index ] );
int span = active.last_char_index - active.first_char_index;
if ( T[ edge.first_char_index + span + 1 ] == T[ last_char_index ] )
break;
parent_node = edge.SplitEdge( active );
}
Edge new_edge = new Edge( last_char_index, N, parent_node );
new_edge.Insert();
if ( last_parent_node > 0 )
Nodes[ last_parent_node ].suffix_node = parent_node;
last_parent_node = parent_node;
if ( active.origin_node == 0 )
active.first_char_index++;
else
active.origin_node = Nodes[ active.origin_node ].suffix_node;
active.Canonize();
}
if ( last_parent_node > 0 )
Nodes[ last_parent_node ].suffix_node = parent_node;
active.last_char_index++;
active.Canonize();
}
/** Function to print all contents and details of suffix tree **/
public void dump_edges(int current_n )
{
System.out.println(" Start End Suf First Last String\n");
for ( int j = 0 ; j < HASH_TABLE_SIZE ; j++ )
{
Edge s = Edges[j];
if ( s.start_node == -1 )
continue;
System.out.printf("%5d %5d %3d %5d %6d ", s.start_node, s.end_node, Nodes[ s.end_node ].suffix_node, s.first_char_index, s.last_char_index);
int top;
if ( current_n > s.last_char_index )
top = s.last_char_index;
else
top = current_n;
for ( int l = s.first_char_index ; l <= top; l++)
System.out.print( T[ l ]);
System.out.println();
}
}
/** Main Function **/
public static void main(String[] args) throws IOException
{
BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
System.out.println("Suffix Tree Test\n");
System.out.println("Enter string\n");
String str = br.readLine();
/** Construct Suffix Tree **/
SuffixTree st = new SuffixTree();
st.T = str.toCharArray();
st.N = st.T.length - 1;
for (int i = 0 ; i <= st.N ; i++ )
st.AddPrefix( st.active, i );
st.dump_edges( st.N );
}
}
/*
Suffix Tree Test
Enter string
Start End Suf First Last String
0 2 -1 1 9 anfoundry
0 9 -1 7 9 dry
0 4 -1 3 9 foundry
0 7 0 2 2 n
0 5 -1 4 9 oundry
0 10 -1 8 9 ry
0 1 -1 0 9 sanfoundry
0 6 -1 5 9 undry
0 11 -1 9 9 y
7 8 -1 7 9 dry
7 3 -1 3 9 foundry