libmoost
/home/mhx/git/github/libmoost/include/moost/container/simple_multi_map.hpp
Go to the documentation of this file.
00001 /* vim:set ts=3 sw=3 sts=3 et: */
00028 #ifndef __SIMPLE_MULTI_MAP_CONTAINER_H
00029 #define __SIMPLE_MULTI_MAP_CONTAINER_H
00030 
00031 #include <string>
00032 #include <vector>
00033 #include <limits>
00034 #include <fstream>
00035 #include <iostream>
00036 #include <stdexcept>
00037 
00038 #include "multi_map.hpp"
00039 
00040 #include "dense_hash_map.hpp"
00041 
00042 namespace moost { namespace container {
00043 
00044 template <typename TVal>
00045 struct IdentityTransform
00046 {
00047    inline void operator()(TVal& /*val*/) const { }
00048 };
00049 
00050 /*
00051 *  simple_multi_map is an alternative to neigh_multi_map that
00052 *  allows you to load data from arbitrary file formats
00053 *  by specifying a Reader policy class to create_map()
00054 *
00055 *  some Readers for various simple text formats are defined in
00056 *  policies/readers.hpp, or you can write your own satisfying this interface:
00057 *     Reader::Reader(std::istream&)
00058 *     bool Reader::read(int id, std::vector<TVal>& vec, bool sort_by_value)
00059 *     1. read() should return false when there is no more data to read
00060 *     2. it should sort vec in accordance with sort_by_value
00061 *     3. sort_by_value is meaningful for sparse vectors i.e. when TVal is
00062 *        an <idx,value> pair, otherwise it can be ignored by the Reader
00063 *
00064 *  simple usage to load vectors of ints:
00065 *     typedef simple_multi_map<int, int> mmap_t;
00066 *     mmap_t myMap;
00067 *     int maxEntriesPerVec = 5;
00068 *     // use a ready-made reader for tsv data
00069 *     myMap.create_map<tsv_vec_reader<int, int> >(myFile, maxEntriesPerVec, false);
00070 *     // now access the data like this:
00071 *     mmap_t::range r = myMap[3];
00072 *     mmap_t::range_iterator mapIt;
00073 *     for ( mapIt = r.begin(); mapIt != r.end(); ++mapIt )
00074 *        cout << *mapIt << endl;
00075 *
00076 *  note that files of sparse vectors of floats (i.e. TVal = std::pair<int, float>)
00077 *  can be loaded *much* more quickly by converting to "neigh binary" format and
00078 *  using neigh_multi_map
00079 *
00080 */
00081 template < typename TKey = int,
00082            typename TVal = std::pair<int, float>,
00083            typename TLocMap = moost::container::dense_hash_map<TKey, multimap_value_type>
00084          >
00085 class simple_multi_map : public multi_map< TKey, TVal, TLocMap >
00086 {
00087 public:
00088 
00089    typedef typename multi_map<TKey, TVal, TLocMap>::loc_map_policy_type loc_map_policy_type;
00090 
00091    // the default policy uses dense hash map, and the default empty key is 0,
00092    // so if your data has zeroes for keys you have to make sure
00093    // you are using a different policy, i.e.
00094    // simple_multi_map<> m( simple_multi_map<>::loc_map_policy_type(-1) );
00095    simple_multi_map( const loc_map_policy_type& locHandlerPolicy = loc_map_policy_type() )
00096       : multi_map<TKey, TVal, TLocMap>(locHandlerPolicy)
00097    {}
00098 
00099    // entries can be sorted by value if requested
00100    template <typename Reader>
00101    void create_map( const std::string& dataFileName,
00102                     int maxEntriesPerVec = std::numeric_limits<int>::max(),
00103                     bool sortByValue = false )
00104    {
00105       Reader reader(dataFileName);
00106       IdentityTransform<TVal> identityTransform;
00107       create_map( reader, identityTransform,
00108                   maxEntriesPerVec, sortByValue);
00109    }
00110 
00111    // entries can be sorted by value if requested
00112    template <typename Reader, typename ValueTransform>
00113    void create_map( const std::string& dataFileName,
00114                     const ValueTransform& valueTransformPolicy,
00115                     int maxEntriesPerVec = std::numeric_limits<int>::max(),
00116                     bool sortByValue = false )
00117    {
00118       Reader reader(dataFileName);
00119       create_map( reader,
00120                   valueTransformPolicy,
00121                   maxEntriesPerVec,
00122                   sortByValue );
00123    }
00124 
00125    // entries can be sorted by value if requested
00126    template <typename Reader, typename ValueTransform>
00127    void create_map( Reader& reader,
00128                     const ValueTransform& valueTransformPolicy,
00129                     int maxEntriesPerVec = std::numeric_limits<int>::max(),
00130                     bool sortByValue = false );
00131 
00132    inline void create_map_from_vector( std::vector<std::pair<TKey, TVal> >& i2i )
00133    {
00134       multi_map<TKey, TVal, TLocMap>::template create_map<1>(i2i.begin(), i2i.end());
00135    }
00136 
00137 private:
00138 
00139    using multi_map< TKey, TVal, TLocMap >::m_data;
00140    using multi_map< TKey, TVal, TLocMap >::m_locations;
00141    using multi_map< TKey, TVal, TLocMap >::m_locHandlerPolicy;
00142 };
00143 
00144 
00145 template <typename TKey, typename TVal, typename TLocMap>
00146 template <typename Reader, typename ValueTransform>
00147 void simple_multi_map<TKey, TVal, TLocMap>::create_map( Reader& reader,
00148                                                         const ValueTransform& valueTransformPolicy,
00149                                                         int maxEntriesPerVec /*= (std::numeric_limits<int>::max)() */,
00150                                                         bool sortByValue /* = false */ )
00151 {
00152    TKey tmpID;
00153    std::vector<TVal> vec;
00154 
00155    int numKeys = 0;
00156    size_t totEntries = 0;
00157 
00158    std::cerr << "scanning...";
00159 
00160    while (reader.read(tmpID, vec, sortByValue))
00161    {
00162       ++numKeys;
00163       totEntries += std::min(vec.size(), static_cast<size_t>(maxEntriesPerVec));
00164 
00165       if ((numKeys & (numKeys - 1)) == 0)
00166          std::cerr << numKeys << "...";
00167    }
00168 
00169    if (totEntries == 0)
00170       throw std::runtime_error("Empty source!");
00171 
00172    // reserve/allocate memory
00173    this->m_data.reserve(totEntries);
00174    m_locHandlerPolicy.resize(this->m_locations, numKeys);
00175 
00176    reader.clear();
00177 
00178    // now load the data
00179    size_t entryPos = 0;
00180    size_t numToRead;
00181    int i = 0;
00182 
00183    std::cerr << "reading...";
00184 
00185    while (reader.read(tmpID, vec, sortByValue))
00186    {
00187       if (vec.size() > static_cast<size_t>(maxEntriesPerVec))
00188          vec.resize(maxEntriesPerVec);
00189 
00190       // transform values with the supplied policy
00191       for (typename std::vector<TVal>::iterator it = vec.begin(); it != vec.end(); ++it)
00192          valueTransformPolicy(*it);
00193 
00194       numToRead = vec.size();
00195 
00196       if (entryPos + numToRead > totEntries )
00197          throw std::runtime_error("There were more entries than what was found during scan!");
00198 
00199       for (size_t j = 0; j < numToRead; ++j)
00200          m_data.push_back(vec[j]);
00201       m_locations[tmpID] = std::make_pair(entryPos, numToRead);
00202 
00203       entryPos += numToRead;
00204 
00205       if ((i & (i - 1)) == 0)
00206          std::cerr << i << "...";
00207       ++i;
00208    }
00209 
00210    std::cerr << "done" << std::endl;
00211 }
00212 
00213 // -----------------------------------------------------------------------------
00214 
00215 }}
00216 
00217 #endif // __SIMPLE_MULTI_MAP_CONTAINER_H
00218 
00219 // -----------------------------------------------------------------------------