libmoost
/home/mhx/git/github/libmoost/include/moost/container/policies/readers.hpp
Go to the documentation of this file.
00001 /* vim:set ts=3 sw=3 sts=3 et: */
00028 #ifndef MOOST_CONTAINER_POLICIES_READERS_HPP
00029 #define MOOST_CONTAINER_POLICIES_READERS_HPP
00030 
#include <algorithm>
#include <climits>
#include <cstdio>
#include <fstream>
#include <functional>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

#include <boost/cstdint.hpp>
#include <boost/lexical_cast.hpp>

#include "../../which.hpp"
00043 
00044 namespace moost { namespace container { namespace policies {
00045 
00046 /*
00047 *  some simple readers to parse vector data
00048 *  from text files in various formats
00049 *  for use with simple_multi_map
00050 *
00051 *  K is the key type and must support:
00052 *  K k; istream >> k;
00053 */
00054 
00055 /*
00056 * first some little traits classes
00057 * so we can use sscanf to speed up
00058 * reading from file
00059 */
// Maps a value type T to the scanf conversion specification used to
// parse one value of that type. Only the specializations below exist;
// using any other T is a compile-time error (the primary template is
// declared but never defined).
template<typename T>
struct reader_value_traits;

template<>
struct reader_value_traits<float>
{
   typedef float value_type;
   static inline std::string format_string() { return "%f"; }
};

template<>
struct reader_value_traits<double>
{
   typedef double value_type;
   static inline std::string format_string() { return "%lg"; }
};

template<>
struct reader_value_traits<int>
{
   typedef int value_type;
   static inline std::string format_string() { return "%d"; }
};

template<>
struct reader_value_traits<long>
{
   typedef long value_type;
   // "%l" on its own is an incomplete conversion specification;
   // the length modifier needs the 'd' conversion to read a long.
   static inline std::string format_string() { return "%ld"; }
};

// NOTE: this guard depends on ULONG_MAX (from <climits>); if the macro
// is not defined at this point the condition silently evaluates to false.
#if defined(__GNUC__) && ULONG_MAX == 0xFFFFFFFF

template<>
struct reader_value_traits<long long>
{
   typedef long long value_type;
   // "%ll" alone is incomplete as well; "%lld" reads a signed long long.
   static inline std::string format_string() { return "%lld"; }
};

template<>
struct reader_value_traits<unsigned long long>
{
   typedef unsigned long long value_type;
   static inline std::string format_string() { return "%llu"; }
};

#endif
00108 
00109 /*
00110 * now the readers themselves
00111 */
00112 
00113 // this expects
00114 // id idx val idx val idx val...
00115 template<typename K, typename T>
00116 class tsv_sparsevec_reader
00117 {
00118 public:
00119    typedef std::vector<std::pair<int, T> > sparsevec_t;
00120 
00121    tsv_sparsevec_reader(std::istream& is)
00122       : m_is(is) { }
00123 
00124    tsv_sparsevec_reader(const std::string& fileName)
00125       : m_is(m_ifs)
00126    {
00127       m_ifs.open(fileName.c_str());
00128       if ( !m_ifs.is_open() )
00129          throw std::runtime_error("Cannot open file <" + fileName + ">!");
00130    }
00131 
00132    bool read(K& key, sparsevec_t& vec, bool sort_by_value)
00133    {
00134       if (m_is.eof())
00135          return false;
00136       std::string line;
00137       getline(m_is, line);
00138       if (line.empty())
00139          return false;
00140 
00141       vec.clear();
00142 
00143       std::istringstream iss(line);
00144       iss >> key;
00145       // now read the remaining values into a vector
00146       int idx;
00147       T val;
00148       while (iss >> idx >> val)
00149       {
00150          vec.push_back(std::make_pair(idx, val));
00151       }
00152 
00153       if (sort_by_value)   // sort by value desc
00154          std::stable_sort(vec.begin(), vec.end(), moost::which<2>::comparer<std::greater>());
00155       else // sort by idx
00156          std::stable_sort(vec.begin(), vec.end(), moost::which<1>::comparer<std::less>());
00157 
00158       return true;
00159    }
00160 
00161    void clear()
00162    {
00163       m_is.clear();
00164       m_is.seekg(0, std::ios::beg);
00165    }
00166 
00167 private:
00168    std::ifstream m_ifs;
00169    std::istream& m_is;
00170 
00171 };
00172 
00173 // this expects
00174 // id (idx, val) (idx, val) (idx, val)...
00175 // we sometimes get this format back from dumbo
00176 template<typename K, typename T>
00177 class python_sparsevec_reader
00178 {
00179 public:
00180    typedef std::vector<std::pair<int, T> > sparsevec_t;
00181 
00182    python_sparsevec_reader(std::istream& is)
00183       : m_is(is), m_format_string("%*2c%d%*2c" + reader_value_traits<T>::format_string()) { }
00184 
00185    python_sparsevec_reader(const std::string fileName)
00186       : m_is(m_ifs),  m_format_string("%*2c%d%*2c" + reader_value_traits<T>::format_string())
00187    {
00188       m_ifs.open(fileName.c_str());
00189       if ( !m_ifs.is_open() )
00190          throw std::runtime_error("Cannot open file <" + fileName + ">!");
00191    }
00192 
00193    bool read(K& key, sparsevec_t& vec, bool sort_by_value)
00194    {
00195       if (m_is.eof())
00196          return false;
00197       std::string line;
00198       getline(m_is, line);
00199       if (line.empty())
00200          return false;
00201 
00202       vec.clear();
00203 
00204       std::istringstream iss(line);
00205       iss >> key;
00206       // now read the remaining values into a vector
00207       std::string s;
00208       int idx;
00209       T val;
00210       while (getline(iss, s, ')')){
00211          sscanf(s.c_str(), m_format_string.data(), &idx, &val);
00212          vec.push_back(std::make_pair(idx, val));
00213       }
00214 
00215       if (sort_by_value)   // sort by value desc
00216          std::stable_sort(vec.begin(), vec.end(), moost::which<2>::comparer<std::greater>());
00217       else // sort by idx
00218          std::stable_sort(vec.begin(), vec.end(), moost::which<1>::comparer<std::less>());
00219 
00220       return true;
00221    }
00222 
00223    void clear()
00224    {
00225       m_is.clear();
00226       m_is.seekg(0, std::ios::beg);
00227    }
00228 
00229 private:
00230    std::ifstream m_ifs;
00231    std::istream& m_is;
00232    const std::string m_format_string;
00233 };
00234 
00235 // this expects CF format i.e.
00236 // 10 200 3.0
00237 // 10 300 2.0
00238 // 20 100 1.0
00239 // 20 300 4.0
00240 // means
00241 // 10 -> (200,3.0), (300,2.0)
00242 // 20 -> (100,1.0), (300,4.0)
00243 template<typename K, typename T>
00244 class cf_sparsevec_reader
00245 {
00246 public:
00247    typedef std::vector<std::pair<int, T> > sparsevec_t;
00248 
00249    cf_sparsevec_reader(std::istream& is)
00250       : m_is(is), m_eof(false)
00251    {
00252       cache_first_line();
00253    }
00254 
00255    cf_sparsevec_reader(const std::string fileName)
00256       : m_is(m_ifs), m_eof(false)
00257    {
00258       m_ifs.open(fileName.c_str());
00259       if ( !m_ifs.is_open() )
00260          throw std::runtime_error("Cannot open file <" + fileName + ">!");
00261 
00262       cache_first_line();
00263    }
00264    void cache_first_line()
00265    {
00266       getline(m_is, m_line);
00267       std::istringstream iss(m_line);
00268       iss >> m_currentid;
00269    }
00270 
00271    bool parseline(sparsevec_t& vec)
00272    {
00273       if (m_line.empty())
00274       {
00275          m_eof = true;
00276          return false;
00277       }
00278 
00279       // returns false if id doesn't match
00280       std::istringstream iss(m_line);
00281       K id;
00282       int idx;
00283       T value;
00284       iss >> id >> idx >> value;
00285       if (id != m_currentid)
00286       {
00287          m_currentid = id;
00288          return false;
00289       }
00290       vec.push_back(std::make_pair(idx, value));
00291       return true;
00292    }
00293 
00294    bool read(K& key, sparsevec_t& vec, bool sort_by_value)
00295    {
00296       if (m_eof)
00297          return false;
00298 
00299       vec.clear();
00300       key = m_currentid;
00301 
00302       // build up the vector
00303       while (parseline(vec) && !m_is.eof())
00304          getline(m_is, m_line);
00305 
00306       if (sort_by_value)   // sort by value desc
00307          std::stable_sort(vec.begin(), vec.end(), moost::which<2>::comparer<std::greater>());
00308       else // sort by idx
00309          std::stable_sort(vec.begin(), vec.end(), moost::which<1>::comparer<std::less>());
00310 
00311       return true;
00312    }
00313 
00314    void clear()
00315    {
00316       m_is.clear();
00317       m_is.seekg(0, std::ios::beg);
00318    }
00319 
00320 private:
00321    std::ifstream m_ifs;
00322    std::istream& m_is;
00323    bool m_eof;
00324    std::string m_line;
00325    K m_currentid;
00326 };
00327 
00328 // this expects
00329 // id idx idx idx...
template<typename K, typename T>
class tsv_vec_reader
{
public:
   typedef std::vector<T> vec_t;

   // Read records from a stream owned by the caller.
   tsv_vec_reader(std::istream& is)
      : m_is(is) { }

   // Read records from the named file; throws std::runtime_error
   // when the file cannot be opened.
   tsv_vec_reader(const std::string& fileName)
      : m_is(m_ifs)
   {
      m_ifs.open(fileName.c_str());
      if ( !m_ifs.is_open() )
         throw std::runtime_error("Cannot open file <" + fileName + ">!");
   }

   // Read the next line: the leading field goes into key, every following
   // whitespace-separated field is appended to vec in input order.
   // Returns false when the stream is at eof or the line read is empty.
   // sort_by_value is ignored
   bool read(K& key, vec_t& vec, bool /*sort_by_value*/)
   {
      if (m_is.eof())
         return false;
      std::string line;
      getline(m_is, line);
      if (line.empty())
         return false;

      vec.clear();

      std::istringstream iss(line);
      iss >> key;
      T val;
      while (iss >> val)
      {
         vec.push_back(val);
      }

      return true;
   }

   // Reset the stream state flags and seek back to the beginning.
   void clear()
   {
      m_is.clear();
      m_is.seekg(0, std::ios::beg);
   }

private:
   std::ifstream m_ifs;   // only opened by the file-name constructor
   std::istream& m_is;    // the stream actually read from (may refer to m_ifs)
};
00380 
00381 // this expects
00382 // id [idx, idx, idx,... ]
00383 // sometimes we can get dumbo output back in this format
00384 template<typename K, typename T>
00385 class python_vec_reader
00386 {
00387 public:
00388    typedef std::vector<T> vec_t;
00389 
00390    python_vec_reader(std::istream& is)
00391       : m_is(is) { }
00392 
00393    python_vec_reader(const std::string fileName)
00394       : m_is(m_ifs)
00395    {
00396       m_ifs.open(fileName.c_str());
00397       if ( !m_ifs.is_open() )
00398          throw std::runtime_error("Cannot open file <" + fileName + ">!");
00399    }
00400 
00401    // sort_by_value is ignored
00402    bool read(K& key, vec_t& vec, bool /*sort_by_value*/)
00403    {
00404       if (m_is.eof())
00405          return false;
00406       m_is >> key;
00407       std::string line;
00408       getline(m_is, line);
00409       if (line.length() < 3)
00410          return false;
00411 
00412       vec.clear();
00413 
00414       std::istringstream iss(line);
00415       // now read the python list into a vector
00416       std::string val;
00417       iss >> val;
00418       // trim leading [ and trailing , or ] from the first val
00419       vec.push_back(boost::lexical_cast<T>(val.substr(1, val.length() - 2)));
00420       // trim trailing , or ] from the rest
00421       while (iss >> val)
00422       {
00423          vec.push_back(boost::lexical_cast<T>(val.substr(0, val.length() - 1)));
00424       }
00425 
00426       return true;
00427    }
00428 
00429    void clear()
00430    {
00431       m_is.clear();
00432       m_is.seekg(0, std::ios::beg);
00433    }
00434 
00435 private:
00436    std::ifstream m_ifs;
00437    std::istream& m_is;
00438 };
00439 
00440 }}}
00441 
00442 #endif