programming-examples/java/Graph_Problems_Algorithms/Java Program to Implement Suffix Tree.java
2019-11-15 12:59:38 +01:00

304 lines
11 KiB
Java
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*This is a Java Program to implement Suffix Tree. A suffix tree is a compressed trie containing all the suffixes of the given text as their keys and positions in the text as their values. Suffix tree allows a particularly fast implementation of many important string operations. The construction of such a tree for the string S takes time and space linear in the length of S. Once constructed, several operations can be performed quickly, for instance locating a substring in S, locating a substring if a certain number of mistakes are allowed, locating matches for a regular expression pattern etc. Suffix trees also provided one of the first linear-time solutions for the longest common substring problem. These speedups come at a cost: storing a string's suffix tree typically requires significantly more space than storing the string itself. This program is based on Mark Nelson's implementation of Ukkonen's algorithm.*/
/*
* Java Program to Implement Suffix Tree
*/
import java.io.*;
/**
 * A node of the suffix tree.
 *
 * Nodes are identified by their index in the owning tree's node table; the
 * only state stored per node is its suffix link.
 **/
class Node
{
    /** Index of the node this node's suffix link points to, or -1 if none is set. **/
    public int suffix_node;

    /**
     * Legacy process-wide node counter, kept so existing references still
     * compile. SuffixTree numbers nodes per tree instance and does not read
     * or write this field.
     **/
    public static int Count = 1;

    /** Constructor: creates a node without a suffix link. **/
    public Node()
    {
        suffix_node = -1;
    }
}

/**
 * Suffix tree over the character array T, built one prefix at a time by
 * AddPrefix().
 *
 * Edges live in a fixed-size, linearly probed hash table keyed by
 * (start node, first character of the edge label). Nodes are plain integer
 * ids; node 0 is the root and Nodes[i] stores the suffix link of node i.
 * The tables are sized for texts of at most MAX_LENGTH characters.
 **/
class SuffixTree
{
    private static final int MAX_LENGTH = 1000;
    private static final int HASH_TABLE_SIZE = 2179;
    /** The text being indexed. **/
    private char[] T = new char[ MAX_LENGTH ];
    /** Index of the last character of T (T.length - 1). **/
    private int N;
    /** Hash table of edges; a slot whose edge has start_node == -1 is empty. **/
    private Edge[] Edges;
    /** Suffix-link table, indexed by node id. **/
    private Node[] Nodes;
    /** The active point carried from one AddPrefix() call to the next. **/
    private Suffix active;
    /** Next unused node id for this tree; id 0 is the root. **/
    private int nodeCount = 1;

    /**
     * Class Suffix: a position in the tree, given as an origin node plus the
     * text range T[first_char_index..last_char_index] that still has to be
     * walked from that node.
     **/
    class Suffix
    {
        public int origin_node;
        public int first_char_index;
        public int last_char_index;

        /** Constructor **/
        public Suffix( int node, int start, int stop )
        {
            origin_node = node;
            first_char_index = start;
            last_char_index = stop;
        }

        /**
         * Function Implicit
         * @return true when the position lies inside an edge, i.e. the
         *         remaining character range is non-empty
         **/
        public boolean Implicit()
        {
            // Exact negation of Explicit(); previously both returned the same value.
            return last_char_index >= first_char_index;
        }

        /**
         * Function Explicit
         * @return true when the position is exactly at origin_node, i.e. the
         *         remaining character range is empty
         **/
        public boolean Explicit()
        {
            return first_char_index > last_char_index;
        }

        /**
         * Function Canonize()
         * Slides origin_node down the tree for as long as the remaining
         * character range covers a whole edge, so that afterwards the range
         * is shorter than the edge leaving origin_node (or empty).
         **/
        public void Canonize()
        {
            if ( Explicit() )
            {
                return;
            }
            Edge edge = Find( origin_node, T[ first_char_index ] );
            int edge_span = edge.last_char_index - edge.first_char_index;
            while ( edge_span <= ( last_char_index - first_char_index ) )
            {
                // The whole edge is covered: consume it and move to its end node.
                first_char_index = first_char_index + edge_span + 1;
                origin_node = edge.end_node;
                if ( first_char_index <= last_char_index )
                {
                    edge = Find( edge.end_node, T[ first_char_index ] );
                    edge_span = edge.last_char_index - edge.first_char_index;
                }
            }
        }
    }

    /**
     * Class Edge: an edge from start_node to end_node labelled with
     * T[first_char_index..last_char_index]. An edge whose start_node is -1
     * marks an empty hash-table slot.
     **/
    class Edge
    {
        public int first_char_index;
        public int last_char_index;
        public int end_node;
        public int start_node;

        /** Constructor: creates an empty-slot marker (start_node == -1). **/
        public Edge()
        {
            start_node = -1;
        }

        /**
         * Constructor: creates an edge leaving parent_node and allocates a
         * new node id, from the enclosing tree, for its end node.
         **/
        public Edge( int init_first, int init_last, int parent_node )
        {
            first_char_index = init_first;
            last_char_index = init_last;
            start_node = parent_node;
            // Per-tree counter: ids restart at 1 for every SuffixTree instance.
            end_node = nodeCount++;
        }

        /**
         * function Insert ()
         * Stores this edge in the hash table. Uses linear probing: on a
         * collision, step forward until the first unused slot is found.
         **/
        public void Insert()
        {
            int i = Hash( start_node, T[ first_char_index ] );
            while ( Edges[ i ].start_node != -1 )
            {
                i = ( i + 1 ) % HASH_TABLE_SIZE;
            }
            Edges[ i ] = this;
        }

        /**
         * function SplitEdge ()
         * Splits this edge at the point defined by the Suffix argument: a new
         * edge from s.origin_node to a new node takes the first part of the
         * label, and this edge is re-inserted as the remainder below it.
         * @return the id of the newly created interior node
         **/
        public int SplitEdge( Suffix s )
        {
            Remove();
            Edge new_edge = new Edge( first_char_index, first_char_index + s.last_char_index - s.first_char_index, s.origin_node );
            new_edge.Insert();
            Nodes[ new_edge.end_node ].suffix_node = s.origin_node;
            first_char_index += s.last_char_index - s.first_char_index + 1;
            start_node = new_edge.end_node;
            Insert();
            return new_edge.end_node;
        }

        /**
         * function Remove ()
         * Removes this edge from the hash table and closes the gap in the
         * probe sequence by moving later entries back where needed. The
         * removed edge is left with start_node == -1. Does nothing if no edge
         * with this key is in the table.
         **/
        public void Remove()
        {
            int i = Hash( start_node, T[ first_char_index ] );
            while ( Edges[ i ].start_node != start_node ||
                    Edges[ i ].first_char_index != first_char_index )
            {
                if ( Edges[ i ].start_node == -1 )
                {
                    // Reached an empty slot: the edge is not stored, nothing to remove.
                    return;
                }
                i = ( i + 1 ) % HASH_TABLE_SIZE;
            }
            // Keep the original post-condition: a removed edge reads as unused.
            start_node = -1;
            for ( ; ; )
            {
                // Vacate the slot with a fresh marker object. Marking the stored
                // object itself would also wipe an edge that was just moved into
                // an earlier slot, because both slots share one reference.
                Edges[ i ] = new Edge();
                int j = i;
                for ( ; ; )
                {
                    i = ( i + 1 ) % HASH_TABLE_SIZE;
                    if ( Edges[ i ].start_node == -1 )
                    {
                        return;
                    }
                    int r = Hash( Edges[ i ].start_node, T[ Edges[ i ].first_char_index ] );
                    // Skip entries whose home slot r lies cyclically in (j, i]:
                    // they are still reachable with slot j empty.
                    if ( i >= r && r > j )
                    {
                        continue;
                    }
                    if ( r > j && j > i )
                    {
                        continue;
                    }
                    if ( j > i && i >= r )
                    {
                        continue;
                    }
                    break;
                }
                // Entry at i would become unreachable; move it into the hole.
                Edges[ j ] = Edges[ i ];
            }
        }
    }

    /** Constructor: allocates empty edge and node tables and an active point at the root. */
    public SuffixTree()
    {
        Edges = new Edge[ HASH_TABLE_SIZE ];
        for ( int i = 0; i < HASH_TABLE_SIZE; i++ )
        {
            Edges[ i ] = new Edge();
        }
        Nodes = new Node[ MAX_LENGTH * 2 ];
        for ( int i = 0; i < MAX_LENGTH * 2; i++ )
        {
            Nodes[ i ] = new Node();
        }
        active = new Suffix( 0, 0, -1 );
    }

    /**
     * Function Find() - function to find an edge
     * @param node id of the node the edge leaves
     * @param c first character of the edge label
     * @return the matching edge, or an empty-slot marker (start_node == -1)
     *         when the node has no edge starting with c
     **/
    public Edge Find( int node, int c )
    {
        int i = Hash( node, c );
        for ( ; ; )
        {
            Edge candidate = Edges[ i ];
            if ( candidate.start_node == node && c == T[ candidate.first_char_index ] )
            {
                return candidate;
            }
            if ( candidate.start_node == -1 )
            {
                return candidate;
            }
            i = ( i + 1 ) % HASH_TABLE_SIZE;
        }
    }

    /**
     * Function Hash() - edges are inserted into the hash table using this hashing function
     * @param node id of the edge's start node
     * @param c first character of the edge label
     * @return the home slot, (node * 256 + c) modulo the table size
     **/
    public static int Hash( int node, int c )
    {
        return ( ( node << 8 ) + c ) % HASH_TABLE_SIZE;
    }

    /**
     * Function AddPrefix() - called repetitively, once for each of the prefixes of the input string
     *
     * Starting at the active point, adds a leaf edge for T[last_char_index]
     * at every position that does not yet continue with that character,
     * splitting edges where the position is inside one, and following suffix
     * links until a position is found that already continues with it.
     * @param active the active point; updated in place
     * @param last_char_index index in T of the character being added
     **/
    public void AddPrefix( Suffix active, int last_char_index )
    {
        int parent_node;
        int last_parent_node = -1;
        for ( ; ; )
        {
            Edge edge;
            parent_node = active.origin_node;
            if ( active.Explicit() )
            {
                edge = Find( active.origin_node, T[ last_char_index ] );
                if ( edge.start_node != -1 )
                {
                    break; // the character is already present below this node
                }
            }
            else
            {
                edge = Find( active.origin_node, T[ active.first_char_index ] );
                int span = active.last_char_index - active.first_char_index;
                if ( T[ edge.first_char_index + span + 1 ] == T[ last_char_index ] )
                {
                    break; // the edge already continues with the character
                }
                parent_node = edge.SplitEdge( active );
            }
            // New leaf edge; its label runs to the end of the text.
            Edge new_edge = new Edge( last_char_index, N, parent_node );
            new_edge.Insert();
            if ( last_parent_node > 0 )
            {
                Nodes[ last_parent_node ].suffix_node = parent_node;
            }
            last_parent_node = parent_node;
            // Move to the next shorter suffix.
            if ( active.origin_node == 0 )
            {
                active.first_char_index++;
            }
            else
            {
                active.origin_node = Nodes[ active.origin_node ].suffix_node;
            }
            active.Canonize();
        }
        if ( last_parent_node > 0 )
        {
            Nodes[ last_parent_node ].suffix_node = parent_node;
        }
        active.last_char_index++;
        active.Canonize();
    }

    /**
     * Function to print all contents and details of suffix tree
     *
     * Prints one line per stored edge, in hash-table order: start node, end
     * node, suffix link of the end node, first and last label index, and the
     * label text truncated at current_n.
     * @param current_n highest text index to print
     **/
    public void dump_edges( int current_n )
    {
        System.out.println(" Start End Suf First Last String\n");
        for ( int j = 0; j < HASH_TABLE_SIZE; j++ )
        {
            Edge s = Edges[ j ];
            if ( s.start_node == -1 )
            {
                continue;
            }
            System.out.printf("%5d %5d %3d %5d %6d ", s.start_node, s.end_node, Nodes[ s.end_node ].suffix_node, s.first_char_index, s.last_char_index);
            int top = ( current_n > s.last_char_index ) ? s.last_char_index : current_n;
            for ( int l = s.first_char_index; l <= top; l++ )
            {
                System.out.print( T[ l ] );
            }
            System.out.println();
        }
    }

    /**
     * Main Function: reads one line from standard input, builds its suffix
     * tree and prints the edge table.
     **/
    public static void main( String[] args ) throws IOException
    {
        // System.in is deliberately left open; it is not owned by this method.
        BufferedReader br = new BufferedReader( new InputStreamReader( System.in ) );
        System.out.println("Suffix Tree Test\n");
        System.out.println("Enter string\n");
        String str = br.readLine();
        if ( str == null )
        {
            System.err.println("No input string supplied");
            return;
        }
        if ( str.length() > MAX_LENGTH )
        {
            // The node and edge tables are sized for MAX_LENGTH characters.
            System.err.println("Input string must not be longer than " + MAX_LENGTH + " characters");
            return;
        }
        /** Construct Suffix Tree **/
        SuffixTree st = new SuffixTree();
        st.T = str.toCharArray();
        st.N = st.T.length - 1;
        for ( int i = 0; i <= st.N; i++ )
        {
            st.AddPrefix( st.active, i );
        }
        st.dump_edges( st.N );
    }
}
/*
Suffix Tree Test
Enter string
Start End Suf First Last String
0 2 -1 1 9 anfoundry
0 9 -1 7 9 dry
0 4 -1 3 9 foundry
0 7 0 2 2 n
0 5 -1 4 9 oundry
0 10 -1 8 9 ry
0 1 -1 0 9 sanfoundry
0 6 -1 5 9 undry
0 11 -1 9 9 y
7 8 -1 7 9 dry
7 3 -1 3 9 foundry