// libmoost
00001 /* vim:set ts=3 sw=3 sts=3 et: */ 00028 #ifndef MOOST_CONTAINER_POLICIES_READERS_HPP 00029 #define MOOST_CONTAINER_POLICIES_READERS_HPP 00030 00031 #include <vector> 00032 #include <string> 00033 #include <fstream> 00034 #include <iostream> 00035 #include <sstream> 00036 #include <algorithm> 00037 #include <cstdio> 00038 00039 #include <boost/cstdint.hpp> 00040 #include <boost/lexical_cast.hpp> 00041 00042 #include "../../which.hpp" 00043 00044 namespace moost { namespace container { namespace policies { 00045 00046 /* 00047 * some simple readers to parse vector data 00048 * from text files in various formats 00049 * for use with simple_multi_map 00050 * 00051 * K is the key type and must support: 00052 * K k; istream >> k; 00053 */ 00054 00055 /* 00056 * first some little traits classes 00057 * so we can use sscanf to speed up 00058 * reading from file 00059 */ 00060 template<typename T> 00061 struct reader_value_traits; 00062 00063 template<> 00064 struct reader_value_traits<float> 00065 { 00066 typedef float value_type; 00067 static inline std::string format_string() { return "%f"; } 00068 }; 00069 00070 template<> 00071 struct reader_value_traits<double> 00072 { 00073 typedef double value_type; 00074 static inline std::string format_string() { return "%lg"; } 00075 }; 00076 00077 template<> 00078 struct reader_value_traits<int> 00079 { 00080 typedef int value_type; 00081 static inline std::string format_string() { return "%d"; } 00082 }; 00083 00084 template<> 00085 struct reader_value_traits<long> 00086 { 00087 typedef long value_type; 00088 static inline std::string format_string() { return "%l"; } 00089 }; 00090 00091 #if defined(__GNUC__) && ULONG_MAX == 0xFFFFFFFF 00092 00093 template<> 00094 struct reader_value_traits<long long> 00095 { 00096 typedef long long value_type; 00097 static inline std::string format_string() { return "%ll"; } 00098 }; 00099 00100 template<> 00101 struct reader_value_traits<unsigned long long> 00102 { 00103 typedef 
unsigned long long value_type; 00104 static inline std::string format_string() { return "%llu"; } 00105 }; 00106 00107 #endif 00108 00109 /* 00110 * now the readers themselves 00111 */ 00112 00113 // this expects 00114 // id idx val idx val idx val... 00115 template<typename K, typename T> 00116 class tsv_sparsevec_reader 00117 { 00118 public: 00119 typedef std::vector<std::pair<int, T> > sparsevec_t; 00120 00121 tsv_sparsevec_reader(std::istream& is) 00122 : m_is(is) { } 00123 00124 tsv_sparsevec_reader(const std::string& fileName) 00125 : m_is(m_ifs) 00126 { 00127 m_ifs.open(fileName.c_str()); 00128 if ( !m_ifs.is_open() ) 00129 throw std::runtime_error("Cannot open file <" + fileName + ">!"); 00130 } 00131 00132 bool read(K& key, sparsevec_t& vec, bool sort_by_value) 00133 { 00134 if (m_is.eof()) 00135 return false; 00136 std::string line; 00137 getline(m_is, line); 00138 if (line.empty()) 00139 return false; 00140 00141 vec.clear(); 00142 00143 std::istringstream iss(line); 00144 iss >> key; 00145 // now read the remaining values into a vector 00146 int idx; 00147 T val; 00148 while (iss >> idx >> val) 00149 { 00150 vec.push_back(std::make_pair(idx, val)); 00151 } 00152 00153 if (sort_by_value) // sort by value desc 00154 std::stable_sort(vec.begin(), vec.end(), moost::which<2>::comparer<std::greater>()); 00155 else // sort by idx 00156 std::stable_sort(vec.begin(), vec.end(), moost::which<1>::comparer<std::less>()); 00157 00158 return true; 00159 } 00160 00161 void clear() 00162 { 00163 m_is.clear(); 00164 m_is.seekg(0, std::ios::beg); 00165 } 00166 00167 private: 00168 std::ifstream m_ifs; 00169 std::istream& m_is; 00170 00171 }; 00172 00173 // this expects 00174 // id (idx, val) (idx, val) (idx, val)... 
00175 // we sometimes get this format back from dumbo 00176 template<typename K, typename T> 00177 class python_sparsevec_reader 00178 { 00179 public: 00180 typedef std::vector<std::pair<int, T> > sparsevec_t; 00181 00182 python_sparsevec_reader(std::istream& is) 00183 : m_is(is), m_format_string("%*2c%d%*2c" + reader_value_traits<T>::format_string()) { } 00184 00185 python_sparsevec_reader(const std::string fileName) 00186 : m_is(m_ifs), m_format_string("%*2c%d%*2c" + reader_value_traits<T>::format_string()) 00187 { 00188 m_ifs.open(fileName.c_str()); 00189 if ( !m_ifs.is_open() ) 00190 throw std::runtime_error("Cannot open file <" + fileName + ">!"); 00191 } 00192 00193 bool read(K& key, sparsevec_t& vec, bool sort_by_value) 00194 { 00195 if (m_is.eof()) 00196 return false; 00197 std::string line; 00198 getline(m_is, line); 00199 if (line.empty()) 00200 return false; 00201 00202 vec.clear(); 00203 00204 std::istringstream iss(line); 00205 iss >> key; 00206 // now read the remaining values into a vector 00207 std::string s; 00208 int idx; 00209 T val; 00210 while (getline(iss, s, ')')){ 00211 sscanf(s.c_str(), m_format_string.data(), &idx, &val); 00212 vec.push_back(std::make_pair(idx, val)); 00213 } 00214 00215 if (sort_by_value) // sort by value desc 00216 std::stable_sort(vec.begin(), vec.end(), moost::which<2>::comparer<std::greater>()); 00217 else // sort by idx 00218 std::stable_sort(vec.begin(), vec.end(), moost::which<1>::comparer<std::less>()); 00219 00220 return true; 00221 } 00222 00223 void clear() 00224 { 00225 m_is.clear(); 00226 m_is.seekg(0, std::ios::beg); 00227 } 00228 00229 private: 00230 std::ifstream m_ifs; 00231 std::istream& m_is; 00232 const std::string m_format_string; 00233 }; 00234 00235 // this expects CF format i.e. 
00236 // 10 200 3.0 00237 // 10 300 2.0 00238 // 20 100 1.0 00239 // 20 300 4.0 00240 // means 00241 // 10 -> (200,3.0), (300,2.0) 00242 // 20 -> (100,1.0), (300,4.0) 00243 template<typename K, typename T> 00244 class cf_sparsevec_reader 00245 { 00246 public: 00247 typedef std::vector<std::pair<int, T> > sparsevec_t; 00248 00249 cf_sparsevec_reader(std::istream& is) 00250 : m_is(is), m_eof(false) 00251 { 00252 cache_first_line(); 00253 } 00254 00255 cf_sparsevec_reader(const std::string fileName) 00256 : m_is(m_ifs), m_eof(false) 00257 { 00258 m_ifs.open(fileName.c_str()); 00259 if ( !m_ifs.is_open() ) 00260 throw std::runtime_error("Cannot open file <" + fileName + ">!"); 00261 00262 cache_first_line(); 00263 } 00264 void cache_first_line() 00265 { 00266 getline(m_is, m_line); 00267 std::istringstream iss(m_line); 00268 iss >> m_currentid; 00269 } 00270 00271 bool parseline(sparsevec_t& vec) 00272 { 00273 if (m_line.empty()) 00274 { 00275 m_eof = true; 00276 return false; 00277 } 00278 00279 // returns false if id doesn't match 00280 std::istringstream iss(m_line); 00281 K id; 00282 int idx; 00283 T value; 00284 iss >> id >> idx >> value; 00285 if (id != m_currentid) 00286 { 00287 m_currentid = id; 00288 return false; 00289 } 00290 vec.push_back(std::make_pair(idx, value)); 00291 return true; 00292 } 00293 00294 bool read(K& key, sparsevec_t& vec, bool sort_by_value) 00295 { 00296 if (m_eof) 00297 return false; 00298 00299 vec.clear(); 00300 key = m_currentid; 00301 00302 // build up the vector 00303 while (parseline(vec) && !m_is.eof()) 00304 getline(m_is, m_line); 00305 00306 if (sort_by_value) // sort by value desc 00307 std::stable_sort(vec.begin(), vec.end(), moost::which<2>::comparer<std::greater>()); 00308 else // sort by idx 00309 std::stable_sort(vec.begin(), vec.end(), moost::which<1>::comparer<std::less>()); 00310 00311 return true; 00312 } 00313 00314 void clear() 00315 { 00316 m_is.clear(); 00317 m_is.seekg(0, std::ios::beg); 00318 } 00319 00320 
private: 00321 std::ifstream m_ifs; 00322 std::istream& m_is; 00323 bool m_eof; 00324 std::string m_line; 00325 K m_currentid; 00326 }; 00327 00328 // this expects 00329 // id idx idx idx... 00330 template<typename K, typename T> 00331 class tsv_vec_reader 00332 { 00333 public: 00334 typedef std::vector<T> vec_t; 00335 00336 tsv_vec_reader(std::istream& is) 00337 : m_is(is) { } 00338 00339 tsv_vec_reader(const std::string fileName) 00340 : m_is(m_ifs) 00341 { 00342 m_ifs.open(fileName.c_str()); 00343 if ( !m_ifs.is_open() ) 00344 throw std::runtime_error("Cannot open file <" + fileName + ">!"); 00345 } 00346 00347 // sort_by_value is ignored 00348 bool read(K& key, vec_t& vec, bool /*sort_by_value*/) 00349 { 00350 if (m_is.eof()) 00351 return false; 00352 std::string line; 00353 getline(m_is, line); 00354 if (line.empty()) 00355 return false; 00356 00357 vec.clear(); 00358 00359 std::istringstream iss(line); 00360 iss >> key; 00361 T val; 00362 while (iss >> val) 00363 { 00364 vec.push_back(val); 00365 } 00366 00367 return true; 00368 } 00369 00370 void clear() 00371 { 00372 m_is.clear(); 00373 m_is.seekg(0, std::ios::beg); 00374 } 00375 00376 private: 00377 std::ifstream m_ifs; 00378 std::istream& m_is; 00379 }; 00380 00381 // this expects 00382 // id [idx, idx, idx,... 
] 00383 // sometimes we can get dumbo output back in this format 00384 template<typename K, typename T> 00385 class python_vec_reader 00386 { 00387 public: 00388 typedef std::vector<T> vec_t; 00389 00390 python_vec_reader(std::istream& is) 00391 : m_is(is) { } 00392 00393 python_vec_reader(const std::string fileName) 00394 : m_is(m_ifs) 00395 { 00396 m_ifs.open(fileName.c_str()); 00397 if ( !m_ifs.is_open() ) 00398 throw std::runtime_error("Cannot open file <" + fileName + ">!"); 00399 } 00400 00401 // sort_by_value is ignored 00402 bool read(K& key, vec_t& vec, bool /*sort_by_value*/) 00403 { 00404 if (m_is.eof()) 00405 return false; 00406 m_is >> key; 00407 std::string line; 00408 getline(m_is, line); 00409 if (line.length() < 3) 00410 return false; 00411 00412 vec.clear(); 00413 00414 std::istringstream iss(line); 00415 // now read the python list into a vector 00416 std::string val; 00417 iss >> val; 00418 // trim leading [ and trailing , or ] from the first val 00419 vec.push_back(boost::lexical_cast<T>(val.substr(1, val.length() - 2))); 00420 // trim trailing , or ] from the rest 00421 while (iss >> val) 00422 { 00423 vec.push_back(boost::lexical_cast<T>(val.substr(0, val.length() - 1))); 00424 } 00425 00426 return true; 00427 } 00428 00429 void clear() 00430 { 00431 m_is.clear(); 00432 m_is.seekg(0, std::ios::beg); 00433 } 00434 00435 private: 00436 std::ifstream m_ifs; 00437 std::istream& m_is; 00438 }; 00439 00440 }}} 00441 00442 #endif