libmoost
|
00001 /* vim:set ts=3 sw=3 sts=3 et: */ 00028 #ifndef MOOST_STRING_LEVENSHTEIN_HPP__ 00029 #define MOOST_STRING_LEVENSHTEIN_HPP__ 00030 00031 #include <vector> 00032 #include <string> 00033 #include <stdexcept> 00034 #include <climits> 00035 00036 namespace moost { namespace string { 00037 00045 template <class RandomAccessIterator> 00046 int levenshtein(RandomAccessIterator first1, 00047 RandomAccessIterator last1, 00048 RandomAccessIterator first2, 00049 RandomAccessIterator last2, 00050 std::vector< std::vector<int> > & d) 00051 { 00052 const int m = last1 - first1; // length of 1 00053 const int n = last2 - first2; // length of 2 00054 int i, j, cost; // cost, iteration 00055 00056 d.resize(m + 1); 00057 for (std::vector< std::vector< int> >::iterator it = d.begin(); it != d.end(); ++it) 00058 it->resize(n + 1); 00059 00060 for (i = 0; i <= m; ++i) 00061 d[i][0] = i; 00062 for (j = 0; j <= n; ++j) 00063 d[0][j] = j; 00064 00065 for (i = 1; i <= m; ++i) 00066 { 00067 for (j = 1; j <= n; ++j) 00068 { 00069 cost = (*(first1 + i - 1) == *(first2 + j - 1) ? 0 : 1); 00070 d[i][j] = std::min(d[i - 1][j] + 1, // deletion 00071 std::min(d[i][j - 1] + 1, // insertion 00072 d[i - 1][j - 1] + cost)); // substitution 00073 00074 if ( i > 1 00075 && j > 1 00076 && *(first1 + i - 1) == *(first2 + j - 2) 00077 && *(first1 + i - 2) == *(first2 + j - 1)) 00078 d[i][j] = std::min(d[i][j], d[i - 2][j - 2] + cost); // transposition 00079 } 00080 } 00081 00082 return d[m][n]; 00083 } 00084 00090 template <class RandomAccessIterator> 00091 int levenshtein(RandomAccessIterator first1, 00092 RandomAccessIterator last1, 00093 RandomAccessIterator first2, 00094 RandomAccessIterator last2) 00095 { 00096 std::vector< std::vector<int> > d; 00097 00098 return levenshtein(first1, last1, first2, last2, d); 00099 } 00100 00106 int levenshtein(const std::string & first, 00107 const std::string & second, 00108 std::vector< std::vector<int> > & d) 00109 { 00110 return levenshtein(first.begin(), first.end(), second.begin(), second.end(), d); 00111 } 00112 00118 int levenshtein(const std::string & first, 00119 const std::string & second) 00120 { 00121 std::vector< std::vector<int> > d; 00122 return levenshtein(first.begin(), first.end(), second.begin(), second.end(), d); 00123 } 00124 00135 template <class RandomAccessIterator> 00136 int fast_levenshtein(RandomAccessIterator first1, 00137 RandomAccessIterator last1, 00138 RandomAccessIterator first2, 00139 RandomAccessIterator last2, 00140 int thresh = INT_MAX) 00141 { 00142 const int n = last1 - first1; // length of 1 00143 const int m = last2 - first2; // length of 2 00144 00145 // check precondition not violated 00146 if (n > m) 00147 throw std::runtime_error("fast_levenshtein called with first string longer than second!"); 00148 00149 if ((m - n) > thresh) // abort, edit dist has to be more than thresh 00150 return m - n; 00151 00152 std::vector<int> current(n + 1); 00153 std::vector<int> previous(n + 1); 00154 for (int i = 0; i <= n; ++i) 00155 current[i] = i; 00156 00157 for (int i = 1; i <= m; ++i) 00158 { 00159 std::copy(current.begin(), current.end(), previous.begin()); 00160 std::fill(current.begin(), current.end(), 0); 00161 current[0] = i; 00162 00163 for (int j = 1; j <= n; ++j) 00164 { 00165 int substitute = previous[j - 1]; 00166 if (*(first1 + j - 1) != *(first2 + i - 1)) 00167 ++substitute; 00168 00169 // Get minimum of insert, delete and substitute 00170 current[j] = std::min(std::min(previous[j] + 1, current[j - 1] + 1) , substitute); 00171 } 00172 00173 // bail out if already over thresh 00174 int min_current = current[0]; 00175 for (int j = 1; j <= n; ++j) 00176 if (current[j] < min_current) 00177 min_current = current[j]; 00178 if (min_current > thresh) 00179 return min_current; 00180 } 00181 00182 return current[n]; 00183 } 00184 00194 int fast_levenshtein(const std::string & first, 00195 const std::string & second, 00196 int thresh = INT_MAX) 00197 { 00198 if (first.length() <= second.length()) 00199 return fast_levenshtein(first.begin(), first.end(), second.begin(), second.end(), thresh); 00200 else 00201 return fast_levenshtein(second.begin(), second.end(), first.begin(), first.end(), thresh); 00202 } 00203 00204 }} // moost::string 00205 00206 #endif // MOOST_STRING_LEVENSHTEIN_HPP__