programming-examples/java/Graph_Problems_Algorithms/Java Program to Implement Suffix Tree.java

/*This is a Java Program to implement Suffix Tree. A suffix tree is a compressed trie containing all the suffixes of the given text as their keys and positions in the text as their values. Suffix tree allows a particularly fast implementation of many important string operations. The construction of such a tree for the string S takes time and space linear in the length of S. Once constructed, several operations can be performed quickly, for instance locating a substring in S, locating a substring if a certain number of mistakes are allowed, locating matches for a regular expression pattern etc. Suffix trees also provided one of the first linear-time solutions for the longest common substring problem. These speedups come at a cost: storing a string’s suffix tree typically requires significantly more space than storing the string itself. This program is based on Mark Nelson’s implementation of Ukkonen’s algorithm.*/

/*
 *      Java Program to Implement Suffix Tree
 */

import java.io.*;

/** Class Node **/
class Node
{
    public int suffix_node;
    public static int Count = 1;

    /** Constructor **/
    public Node()
    {
        suffix_node = -1;
    }
}

/** Class Suffix Tree **/
class SuffixTree
{
    private static final int MAX_LENGTH = 1000;
    private static final int HASH_TABLE_SIZE = 2179;

    private char[] T = new char[ MAX_LENGTH ];
    private int N;
    private Edge[] Edges ;
    private Node[] Nodes ;

    private Suffix active;

    /** Class Suffix **/
    class Suffix
    {
        public int origin_node;
        public int first_char_index;
        public int last_char_index;

        /** Constructor **/
        public Suffix(int node, int start, int stop )
        {
            origin_node = node ;
            first_char_index = start ;
            last_char_index = stop;
        }

        /** Function Implicit  **/
        public boolean Implicit()
        {
            return first_char_index > last_char_index;
        }

        /** Function Explicit  **/
        public boolean Explicit()
        {
            return first_char_index > last_char_index;
        }

        /** Function Canonize()
         * A suffix in the tree is denoted by a Suffix structure
         * that denotes its last character.  The canonical
         * representation of a suffix for this algorithm requires
         * that the origin_node by the closest node to the end
         * of the tree.  To force this to be true, we have to
         * slide down every edge in our current path until we
         * reach the final node
         **/
        public void Canonize()
        {
            if (!Explicit() )
                {
                    Edge edge = Find( origin_node, T[ first_char_index ] );
                    int edge_span = edge.last_char_index - edge.first_char_index;
                    while ( edge_span <= ( last_char_index - first_char_index ) )
                        {
                            first_char_index = first_char_index + edge_span + 1;
                            origin_node = edge.end_node;
                            if ( first_char_index <= last_char_index )
                                {
                                    edge = Find( edge.end_node, T[ first_char_index ] );
                                    edge_span = edge.last_char_index - edge.first_char_index;
                                }
                        }
                }
        }
    }

    /** Class Edge **/
    class Edge
    {
        public int first_char_index;
        public int last_char_index;
        public int end_node;
        public int start_node;

        /** Constructor **/
        public Edge()
        {
            start_node = -1;
        }

        /** Constructor **/
        public Edge( int init_first, int init_last, int parent_node )
        {
            first_char_index = init_first;
            last_char_index = init_last;
            start_node = parent_node;
            end_node = Node.Count++;
        }

        /** function Insert ()
         *  A given edge gets a copy of itself inserted into the table
         *  with this function.  It uses a linear probe technique, which
         *  means in the case of a collision, we just step forward through
         *  the table until we find the first unused slot.
         **/
        public void Insert()
        {
            int i = Hash( start_node, T[ first_char_index ] );
            while ( Edges[ i ].start_node != -1 )
                i = ++i % HASH_TABLE_SIZE;
            Edges[ i ] = this;
        }

        /** function SplitEdge ()
         *  This function is called
         *  to split an edge at the point defined by the Suffix argument
         **/
        public int SplitEdge( Suffix s )
        {
            Remove();
            Edge new_edge =  new Edge( first_char_index, first_char_index + s.last_char_index - s.first_char_index, s.origin_node );
            new_edge.Insert();
            Nodes[ new_edge.end_node ].suffix_node = s.origin_node;
            first_char_index += s.last_char_index - s.first_char_index + 1;
            start_node = new_edge.end_node;
            Insert();
            return new_edge.end_node;
        }

        /** function Remove ()
         *  This function is called to remove an edge from hash table
         **/
        public void Remove()
        {
            int i = Hash( start_node, T[ first_char_index ] );
            while ( Edges[ i ].start_node != start_node ||
                    Edges[ i ].first_char_index != first_char_index )
                i = ++i % HASH_TABLE_SIZE;
            for ( ; ; )
                {
                    Edges[ i ].start_node = -1;
                    int j = i;
                    for ( ; ; )
                        {
                            i = ++i % HASH_TABLE_SIZE;
                            if ( Edges[ i ].start_node == -1 )
                                return;
                            int r = Hash( Edges[ i ].start_node, T[ Edges[ i ].first_char_index ] );
                            if ( i >= r && r > j )
                                continue;
                            if ( r > j && j > i )
                                continue;
                            if ( j > i && i >= r )
                                continue;
                            break;
                        }
                    Edges[ j ] = Edges[ i ];
                }
        }
    }

    /** Constructor */
    public SuffixTree()
    {
        Edges = new Edge[ HASH_TABLE_SIZE ];
        for (int i = 0; i < HASH_TABLE_SIZE; i++)
            Edges[i] = new Edge();
        Nodes = new Node[ MAX_LENGTH * 2 ];
        for (int i = 0; i < MAX_LENGTH * 2 ; i++)
            Nodes[i] = new Node();
        active = new Suffix( 0, 0, -1 );
    }

    /** Function Find() - function to find an edge **/
    public Edge Find( int node, int c )
    {
        int i = Hash( node, c );
        for ( ; ; )
            {
                if ( Edges[ i ].start_node == node )
                    if ( c == T[ Edges[ i ].first_char_index ] )
                        return Edges[ i ];
                if ( Edges[ i ].start_node == -1 )
                    return Edges[ i ];
                i = ++i % HASH_TABLE_SIZE;
            }
    }

    /** Function Hash() - edges are inserted into the hash table using this hashing function **/
    public static int Hash( int node, int c )
    {
        return (( node << 8 ) + c ) % HASH_TABLE_SIZE;
    }

    /** Function AddPrefix() - called repetitively, once for each of the prefixes of the input string **/
    public void AddPrefix( Suffix active, int last_char_index )
    {
        int parent_node;
        int last_parent_node = -1;
        for ( ; ; )
            {
                Edge edge;
                parent_node = active.origin_node;
                if ( active.Explicit() )
                    {
                        edge = Find( active.origin_node, T[ last_char_index ] );
                        if ( edge.start_node != -1 )
                            break;
                    }
                else
                    {
                        edge = Find( active.origin_node, T[ active.first_char_index ] );
                        int span = active.last_char_index - active.first_char_index;
                        if ( T[ edge.first_char_index + span + 1 ] == T[ last_char_index ] )
                            break;
                        parent_node = edge.SplitEdge( active );
                    }
                Edge new_edge = new Edge( last_char_index, N, parent_node );
                new_edge.Insert();
                if ( last_parent_node > 0 )
                    Nodes[ last_parent_node ].suffix_node = parent_node;
                last_parent_node = parent_node;
                if ( active.origin_node == 0 )
                    active.first_char_index++;
                else
                    active.origin_node = Nodes[ active.origin_node ].suffix_node;
                active.Canonize();
            }
        if ( last_parent_node > 0 )
            Nodes[ last_parent_node ].suffix_node = parent_node;
        active.last_char_index++;
        active.Canonize();
    }

    /** Function to print all contents and details of suffix tree **/
    public void dump_edges(int current_n )
    {
        System.out.println(" Start  End  Suf  First Last  String\n");
        for ( int j = 0 ; j < HASH_TABLE_SIZE ; j++ )
            {
                Edge s = Edges[j];
                if ( s.start_node == -1 )
                    continue;
                System.out.printf("%5d %5d %3d %5d %6d   ", s.start_node, s.end_node, Nodes[ s.end_node ].suffix_node, s.first_char_index, s.last_char_index);
                int top;
                if ( current_n > s.last_char_index )
                    top = s.last_char_index;
                else
                    top = current_n;
                for ( int l = s.first_char_index ; l <= top; l++)
                    System.out.print( T[ l ]);
                System.out.println();
            }
    }
    /** Main Function **/
    public static void main(String[] args) throws IOException
    {
        BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
        System.out.println("Suffix Tree Test\n");
        System.out.println("Enter string\n");
        String str = br.readLine();
        /** Construct Suffix Tree **/
        SuffixTree st = new SuffixTree();
        st.T = str.toCharArray();
        st.N = st.T.length - 1;
        for (int i = 0 ; i <= st.N ; i++ )
            st.AddPrefix( st.active, i );
        st.dump_edges( st.N );
    }
}

/*

Suffix Tree Test

Enter string

Start  End  Suf  First Last  String

    0     2  -1     1      9   anfoundry
    0     9  -1     7      9   dry
    0     4  -1     3      9   foundry
    0     7   0     2      2   n
    0     5  -1     4      9   oundry
    0    10  -1     8      9   ry
    0     1  -1     0      9   sanfoundry
    0     6  -1     5      9   undry
    0    11  -1     9      9   y
    7     8  -1     7      9   dry
    7     3  -1     3      9   foundry