String matching in linear-time #include #include #include #include typedef std::vector int_vec; class string_search { int_vec shifts; void compute_shifts(const std::string &pattern); public: int find_first(const std::string &text, const std::string &pattern); int_vec find_all(const std::string &text, const std::string &pattern); }; // create the shift-lookup-table void string_search::compute_shifts(const std::string &pattern) { int next_shift = 0; shifts.clear(); shifts.push_back(0); // shift to the first character // start with the second character, since the shift to the first is always 0 for(int i = 1; i < pattern.length(); i++) { while(next_shift > 0 && pattern[next_shift] != pattern[i]) next_shift = shifts[next_shift]; if(pattern[next_shift] == pattern[i]) next_shift++; shifts.push_back(next_shift); } } // search the string and return when the first occurrence is found int string_search::find_first(const std::string &text, const std::string &pattern) { int next_shift = 0; compute_shifts(pattern); for(int i = 0; i < text.length(); i++) { while(next_shift > 0 && pattern[next_shift] != text[i]) next_shift = shifts[next_shift - 1]; if(pattern[next_shift] == text[i]) next_shift++; if(next_shift == pattern.length()) return i - (pattern.length() - 1); // found the first so return } return -1; } // search the string and put every occurence in a vector int_vec string_search::find_all(const std::string &text, const std::string &pattern) { int next_shift = 0; int_vec positions; compute_shifts(pattern); for(int i = 0; i < text.length(); i++) { while(next_shift > 0 && pattern[next_shift] != text[i]) next_shift = shifts[next_shift - 1]; if(pattern[next_shift] == text[i]) next_shift++; if(next_shift == pattern.length()) { positions.push_back(i - (pattern.length() - 1)); // found one, put in list next_shift = shifts[next_shift - 1]; // restart pattern with last shift } } return positions; } int main(int argc, char **argv) { if(argc <= 2){ cout << "Usage: " << argv[0] << " filename searchpattern" << endl; return 0; } std::string pattern = argv[2]; // read the file. Since the file is read like this all white-characters // are eaten, so a search including white-characters will fail... fstream fs; std::string text, temp; fs.open(argv[1], ios::in); while(!fs.eof()){ fs >> temp; text += temp; } fs.close(); // search the file string_search search; int_vec pos_list = search.find_all(text, pattern); // print out result std::vector::iterator it; cout << "Found " << pos_list.size() << " occurrences" << endl; for(it = pos_list.begin(); it != pos_list.end(); it++){ temp = text.substr(*it, pattern.length()); cout << "Pos=" << *it << " == " << temp << endl; } }