libmoost
/home/mhx/git/github/libmoost/include/moost/string/levenshtein.hpp
Go to the documentation of this file.
00001 /* vim:set ts=3 sw=3 sts=3 et: */
00028 #ifndef MOOST_STRING_LEVENSHTEIN_HPP__
00029 #define MOOST_STRING_LEVENSHTEIN_HPP__
00030 
00031 #include <vector>
00032 #include <string>
00033 #include <stdexcept>
00034 #include <climits>
00035 
00036 namespace moost { namespace string {
00037 
00045 template <class RandomAccessIterator>
00046 int levenshtein(RandomAccessIterator first1,
00047                 RandomAccessIterator last1,
00048                 RandomAccessIterator first2,
00049                 RandomAccessIterator last2,
00050                 std::vector< std::vector<int> > & d)
00051 {
00052   const int m = last1 - first1; // length of 1
00053   const int n = last2 - first2; // length of 2
00054   int i, j, cost; // cost, iteration
00055 
00056   d.resize(m + 1);
00057   for (std::vector< std::vector< int> >::iterator it = d.begin(); it != d.end(); ++it)
00058     it->resize(n + 1);
00059 
00060   for (i = 0; i <= m; ++i)
00061     d[i][0] = i;
00062   for (j = 0; j <= n; ++j)
00063     d[0][j] = j;
00064 
00065   for (i = 1; i <= m; ++i)
00066   {
00067     for (j = 1; j <= n; ++j)
00068     {
00069       cost = (*(first1 + i - 1) == *(first2 + j - 1) ? 0 : 1);
00070       d[i][j] = std::min(d[i - 1][j] + 1, // deletion
00071                 std::min(d[i][j - 1] + 1, // insertion
00072                 d[i - 1][j - 1] +   cost)); // substitution
00073 
00074       if (   i > 1
00075           && j > 1
00076           && *(first1 + i - 1) == *(first2 + j - 2)
00077           && *(first1 + i - 2) == *(first2 + j - 1))
00078         d[i][j] = std::min(d[i][j], d[i - 2][j - 2] + cost); // transposition
00079     }
00080   }
00081 
00082   return d[m][n];
00083 }
00084 
00090 template <class RandomAccessIterator>
00091 int levenshtein(RandomAccessIterator first1,
00092                 RandomAccessIterator last1,
00093                 RandomAccessIterator first2,
00094                 RandomAccessIterator last2)
00095 {
00096   std::vector< std::vector<int> > d;
00097 
00098   return levenshtein(first1, last1, first2, last2, d);
00099 }
00100 
00106 int levenshtein(const std::string & first,
00107                 const std::string & second,
00108                 std::vector< std::vector<int> > & d)
00109 {
00110   return levenshtein(first.begin(), first.end(), second.begin(), second.end(), d);
00111 }
00112 
00118 int levenshtein(const std::string & first,
00119                 const std::string & second)
00120 {
00121   std::vector< std::vector<int> > d;
00122   return levenshtein(first.begin(), first.end(), second.begin(), second.end(), d);
00123 }
00124 
00135 template <class RandomAccessIterator>
00136 int fast_levenshtein(RandomAccessIterator first1,
00137                 RandomAccessIterator last1,
00138                 RandomAccessIterator first2,
00139                 RandomAccessIterator last2,
00140                 int thresh = INT_MAX)
00141 {
00142    const int n = last1 - first1; // length of 1
00143    const int m = last2 - first2; // length of 2
00144 
00145    // check precondition not violated
00146    if (n > m)
00147       throw std::runtime_error("fast_levenshtein called with first string longer than second!");
00148 
00149    if ((m - n) > thresh)    // abort, edit dist has to be more than thresh
00150       return m - n;
00151 
00152    std::vector<int> current(n + 1);
00153    std::vector<int> previous(n + 1);
00154    for (int i = 0; i <= n; ++i)
00155       current[i] = i;
00156 
00157    for (int i = 1; i <= m; ++i)
00158    {
00159       std::copy(current.begin(), current.end(), previous.begin());
00160       std::fill(current.begin(), current.end(), 0);
00161       current[0] = i;
00162 
00163       for (int j = 1; j <= n; ++j)
00164       {
00165          int substitute = previous[j - 1];
00166          if (*(first1 + j - 1) != *(first2 + i - 1))
00167             ++substitute;
00168 
00169          // Get minimum of insert, delete and substitute
00170          current[j] = std::min(std::min(previous[j] + 1, current[j - 1] + 1) , substitute);
00171       }
00172 
00173       // bail out if already over thresh
00174       int min_current = current[0];
00175       for (int j = 1; j <= n; ++j)
00176          if (current[j] < min_current)
00177             min_current = current[j];
00178       if (min_current > thresh)
00179          return min_current;
00180    }
00181 
00182    return current[n];
00183 }
00184 
00194 int fast_levenshtein(const std::string & first,
00195                 const std::string & second,
00196                 int thresh = INT_MAX)
00197 {
00198   if (first.length() <= second.length())
00199      return fast_levenshtein(first.begin(), first.end(), second.begin(), second.end(), thresh);
00200   else
00201      return fast_levenshtein(second.begin(), second.end(), first.begin(), first.end(), thresh);
00202 }
00203 
00204 }} // moost::string
00205 
00206 #endif // MOOST_STRING_LEVENSHTEIN_HPP__