libmoost
|
00001 /* vim:set ts=3 sw=3 sts=3 et: */ 00028 #ifndef __SIMPLE_MULTI_MAP_CONTAINER_H 00029 #define __SIMPLE_MULTI_MAP_CONTAINER_H 00030 00031 #include <string> 00032 #include <vector> 00033 #include <limits> 00034 #include <fstream> 00035 #include <iostream> 00036 #include <stdexcept> 00037 00038 #include "multi_map.hpp" 00039 00040 #include "dense_hash_map.hpp" 00041 00042 namespace moost { namespace container { 00043 00044 template <typename TVal> 00045 struct IdentityTransform 00046 { 00047 inline void operator()(TVal& /*val*/) const { } 00048 }; 00049 00050 /* 00051 * simple_multi_map is an alternative to neigh_multi_map that 00052 * allows you to load data from arbitrary file formats 00053 * by specifying a Reader policy class to create_map() 00054 * 00055 * some Readers for various simple text formats are defined in 00056 * policies/readers.hpp, or you can write your own satisfying this interface: 00057 * Reader::Reader(std::istream&) 00058 * bool Reader::read(int id, std::vector<TVal>& vec, bool sort_by_value) 00059 * 1. read() should return false when there is no more data to read 00060 * 2. it should sort vec in accordance with sort_by_value 00061 * 3. sort_by_value is meaningful for sparse vectors i.e. when TVal is 00062 * an <idx,value> pair, otherwise it can be ignored by the Reader 00063 * 00064 * simple usage to load vectors of ints: 00065 * typedef simple_multi_map<int, int> mmap_t; 00066 * mmap_t myMap; 00067 * int maxEntriesPerVec = 5; 00068 * // use a ready-made reader for tsv data 00069 * myMap.create_map<tsv_vec_reader<int, int> >(myFile, maxEntriesPerVec, false); 00070 * // now access the data like this: 00071 * mmap_t::range r = myMap[3]; 00072 * mmap_t::range_iterator mapIt; 00073 * for ( mapIt = r.begin(); mapIt != r.end(); ++mapIt ) 00074 * cout << *mapIt << endl; 00075 * 00076 * note that files of sparse vectors of floats (i.e. TVal = std::pair<int, float>) 00077 * can be loaded *much* more quickly by converting to "neigh binary" format and 00078 * using neigh_multi_map 00079 * 00080 */ 00081 template < typename TKey = int, 00082 typename TVal = std::pair<int, float>, 00083 typename TLocMap = moost::container::dense_hash_map<TKey, multimap_value_type> 00084 > 00085 class simple_multi_map : public multi_map< TKey, TVal, TLocMap > 00086 { 00087 public: 00088 00089 typedef typename multi_map<TKey, TVal, TLocMap>::loc_map_policy_type loc_map_policy_type; 00090 00091 // the default policy uses dense hash map, and the default empty key is 0, 00092 // so if your data has zeroes for keys you have to make sure 00093 // you are using a different policy, i.e. 00094 // simple_multi_map<> m( simple_multi_map<>::loc_map_policy_type(-1) ); 00095 simple_multi_map( const loc_map_policy_type& locHandlerPolicy = loc_map_policy_type() ) 00096 : multi_map<TKey, TVal, TLocMap>(locHandlerPolicy) 00097 {} 00098 00099 // entries can be sorted by value if requested 00100 template <typename Reader> 00101 void create_map( const std::string& dataFileName, 00102 int maxEntriesPerVec = std::numeric_limits<int>::max(), 00103 bool sortByValue = false ) 00104 { 00105 Reader reader(dataFileName); 00106 IdentityTransform<TVal> identityTransform; 00107 create_map( reader, identityTransform, 00108 maxEntriesPerVec, sortByValue); 00109 } 00110 00111 // entries can be sorted by value if requested 00112 template <typename Reader, typename ValueTransform> 00113 void create_map( const std::string& dataFileName, 00114 const ValueTransform& valueTransformPolicy, 00115 int maxEntriesPerVec = std::numeric_limits<int>::max(), 00116 bool sortByValue = false ) 00117 { 00118 Reader reader(dataFileName); 00119 create_map( reader, 00120 valueTransformPolicy, 00121 maxEntriesPerVec, 00122 sortByValue ); 00123 } 00124 00125 // entries can be sorted by value if requested 00126 template <typename Reader, typename ValueTransform> 00127 void create_map( Reader& reader, 00128 const ValueTransform& valueTransformPolicy, 00129 int maxEntriesPerVec = std::numeric_limits<int>::max(), 00130 bool sortByValue = false ); 00131 00132 inline void create_map_from_vector( std::vector<std::pair<TKey, TVal> >& i2i ) 00133 { 00134 multi_map<TKey, TVal, TLocMap>::template create_map<1>(i2i.begin(), i2i.end()); 00135 } 00136 00137 private: 00138 00139 using multi_map< TKey, TVal, TLocMap >::m_data; 00140 using multi_map< TKey, TVal, TLocMap >::m_locations; 00141 using multi_map< TKey, TVal, TLocMap >::m_locHandlerPolicy; 00142 }; 00143 00144 00145 template <typename TKey, typename TVal, typename TLocMap> 00146 template <typename Reader, typename ValueTransform> 00147 void simple_multi_map<TKey, TVal, TLocMap>::create_map( Reader& reader, 00148 const ValueTransform& valueTransformPolicy, 00149 int maxEntriesPerVec /*= (std::numeric_limits<int>::max)() */, 00150 bool sortByValue /* = false */ ) 00151 { 00152 TKey tmpID; 00153 std::vector<TVal> vec; 00154 00155 int numKeys = 0; 00156 size_t totEntries = 0; 00157 00158 std::cerr << "scanning..."; 00159 00160 while (reader.read(tmpID, vec, sortByValue)) 00161 { 00162 ++numKeys; 00163 totEntries += std::min(vec.size(), static_cast<size_t>(maxEntriesPerVec)); 00164 00165 if ((numKeys & (numKeys - 1)) == 0) 00166 std::cerr << numKeys << "..."; 00167 } 00168 00169 if (totEntries == 0) 00170 throw std::runtime_error("Empty source!"); 00171 00172 // reserve/allocate memory 00173 this->m_data.reserve(totEntries); 00174 m_locHandlerPolicy.resize(this->m_locations, numKeys); 00175 00176 reader.clear(); 00177 00178 // now load the data 00179 size_t entryPos = 0; 00180 size_t numToRead; 00181 int i = 0; 00182 00183 std::cerr << "reading..."; 00184 00185 while (reader.read(tmpID, vec, sortByValue)) 00186 { 00187 if (vec.size() > static_cast<size_t>(maxEntriesPerVec)) 00188 vec.resize(maxEntriesPerVec); 00189 00190 // transform values with the supplied policy 00191 for (typename std::vector<TVal>::iterator it = vec.begin(); it != vec.end(); ++it) 00192 valueTransformPolicy(*it); 00193 00194 numToRead = vec.size(); 00195 00196 if (entryPos + numToRead > totEntries ) 00197 throw std::runtime_error("There were more entries than what was found during scan!"); 00198 00199 for (size_t j = 0; j < numToRead; ++j) 00200 m_data.push_back(vec[j]); 00201 m_locations[tmpID] = std::make_pair(entryPos, numToRead); 00202 00203 entryPos += numToRead; 00204 00205 if ((i & (i - 1)) == 0) 00206 std::cerr << i << "..."; 00207 ++i; 00208 } 00209 00210 std::cerr << "done" << std::endl; 00211 } 00212 00213 // ----------------------------------------------------------------------------- 00214 00215 }} 00216 00217 #endif // __SIMPLE_MULTI_MAP_CONTAINER_H 00218 00219 // -----------------------------------------------------------------------------