libmoost
|
00001 /* vim:set ts=3 sw=3 sts=3 et: */ 00028 #ifndef MOOST_CONTAINER_MEMORY_MAPPED_DATASET_DENSE_HASH_MAP_HPP__ 00029 #define MOOST_CONTAINER_MEMORY_MAPPED_DATASET_DENSE_HASH_MAP_HPP__ 00030 00031 #include <string> 00032 #include <vector> 00033 #include <stdexcept> 00034 #include <iterator> 00035 00036 #include <boost/type_traits/is_pod.hpp> 00037 #include <boost/noncopyable.hpp> 00038 00039 #include "section_writer_base.hpp" 00040 #include "pod_pair.hpp" 00041 00042 namespace moost { namespace container { 00043 00057 template <typename Key, typename T, class HashFcn = MMD_DEFAULT_HASH_FCN<Key> > 00058 class mmd_dense_hash_map : public boost::noncopyable 00059 { 00060 BOOST_STATIC_ASSERT_MSG(boost::is_pod<Key>::value, "mmd_dense_hash_map<> template can only handle POD key types"); 00061 BOOST_STATIC_ASSERT_MSG(boost::is_pod<T>::value, "mmd_dense_hash_map<> template can only handle POD value types"); 00062 00063 friend class const_iterator; 00064 00065 public: 00066 static const size_t MMD_HASH_ALIGNMENT = 16; 00067 00068 static float MAX_POPULATION_RATIO() 00069 { 00070 // lower values result in faster lookups and higher memory usage 00071 return 0.8; 00072 } 00073 00074 typedef Key key_type; 00075 typedef T mapped_type; 00076 typedef pod_pair<Key, T> value_type; 00077 00078 typedef size_t size_type; 00079 00080 typedef const value_type& const_reference; 00081 typedef const value_type* const_pointer; 00082 00083 typedef std::forward_iterator_tag iterator_category; 00084 00085 class const_iterator 00086 { 00087 friend class mmd_dense_hash_map; 00088 00089 public: 00090 const_reference operator* () const 00091 { 00092 return *m_it; 00093 } 00094 00095 const_pointer operator-> () const 00096 { 00097 return &(operator*()); 00098 } 00099 00100 const_iterator& operator++ () 00101 { 00102 ++m_it; 00103 skip(); 00104 return *this; 00105 } 00106 00107 const_iterator operator++ (int) 00108 { 00109 const_iterator tmp(*this); 00110 ++*this; 00111 return tmp; 00112 } 00113 00114 bool operator== (const const_iterator& it) const 00115 { 00116 return m_it == it.m_it; 00117 } 00118 00119 bool operator!= (const const_iterator& it) const 00120 { 00121 return !(*this == it); 00122 } 00123 00124 private: 00125 const_iterator(const mmd_dense_hash_map& map, const value_type *it, const value_type *end) 00126 : m_it(it) 00127 , m_end(end) 00128 , m_map(map) 00129 { 00130 skip(); 00131 } 00132 00133 void skip() 00134 { 00135 while (m_it != m_end && m_it->first == m_map.empty_key()) 00136 { 00137 ++m_it; 00138 } 00139 } 00140 00141 const value_type *m_it; 00142 const value_type *m_end; 00143 const mmd_dense_hash_map& m_map; 00144 }; 00145 00146 struct HashingPolicy 00147 { 00148 HashingPolicy(const key_type& k, size_type size) 00149 : m_size(size) 00150 , m_hash_mask(size - 1) 00151 , m_index(HashFcn()(k) & m_hash_mask) 00152 , m_iter(0) 00153 { 00154 } 00155 00156 HashingPolicy& operator++() 00157 { 00158 if (++m_iter < m_size) 00159 { 00160 m_index = (m_index + m_iter) & m_hash_mask; 00161 } 00162 else 00163 { 00164 m_index = m_size; 00165 } 00166 00167 return *this; 00168 } 00169 00170 size_type operator() () const 00171 { 00172 return m_index; 00173 } 00174 00175 size_type iter() const 00176 { 00177 return m_iter; 00178 } 00179 00180 private: 00181 const size_type m_size; 00182 const size_type m_hash_mask; 00183 size_type m_index; 00184 size_type m_iter; 00185 }; 00186 00187 public: 00188 class writer : public mmd_section_writer_base 00189 { 00190 public: 00191 writer(memory_mapped_dataset::writer& wr, const std::string& name, const key_type& empty_key, float max_population_ratio = MAX_POPULATION_RATIO(), size_t alignment = MMD_HASH_ALIGNMENT) 00192 : mmd_section_writer_base(wr, name, "mmd_dense_hash_map", alignment) 00193 , m_empty_key(empty_key) 00194 , m_max_pop_ratio(max_population_ratio) 00195 { 00196 if (max_population_ratio < 0.009999 || max_population_ratio > 0.990001) 00197 { 00198 rollback(); 00199 throw std::runtime_error("invalid max_population_ratio (must be in [0.01, 0.99])"); 00200 } 00201 setattr("key_size", sizeof(key_type)); 00202 setattr("mapped_size", sizeof(mapped_type)); 00203 setattr("elem_size", sizeof(value_type)); 00204 } 00205 00206 writer& operator<< (const value_type& e) 00207 { 00208 insert(e); 00209 return *this; 00210 } 00211 00212 writer& operator<< (const std::pair<Key, T>& e) 00213 { 00214 insert(e); 00215 return *this; 00216 } 00217 00218 void insert(const value_type& e) 00219 { 00220 if (e.first == m_empty_key) 00221 { 00222 throw std::runtime_error("attempt to insert empty key"); 00223 } 00224 m_values.push_back(e); 00225 } 00226 00227 void insert(const std::pair<Key, T>& e) 00228 { 00229 value_type v; 00230 v.first = e.first; 00231 v.second = e.second; 00232 insert(v); 00233 } 00234 00235 size_type size() const 00236 { 00237 return m_values.size(); 00238 } 00239 00240 protected: 00241 void pre_commit() // all the writing actually happens here 00242 { 00243 setattr("population", size()); 00244 00245 std::vector<value_type> map; 00246 build_dense_hash_map(map); 00247 write(map); 00248 00249 value_type empty; 00250 empty_value(empty); 00251 write(empty); 00252 00253 setattr("size", map.size()); 00254 } 00255 00256 private: 00257 size_type get_optimum_table_size(size_type pop) const 00258 { 00259 if (pop <= 1) 00260 { 00261 return pop; 00262 } 00263 00264 pop /= m_max_pop_ratio; 00265 size_type opt = 1; 00266 00267 while (opt < pop) 00268 { 00269 opt <<= 1; 00270 } 00271 00272 return opt; 00273 } 00274 00275 size_type find(const key_type& key, const value_type *begin, size_type size) const 00276 { 00277 HashingPolicy hash(key, size); 00278 00279 for (;;) 00280 { 00281 const value_type& target = begin[hash()]; 00282 00283 if (target.first == m_empty_key) 00284 { 00285 break; 00286 } 00287 00288 if (target.first == key) 00289 { 00290 return size; 00291 } 00292 00293 ++hash; 00294 } 00295 00296 return hash(); 00297 } 00298 00299 void empty_value(value_type& val) const 00300 { 00301 std::memset(&val, 0, sizeof(value_type)); 00302 val.first = m_empty_key; 00303 } 00304 00305 void build_dense_hash_map(std::vector<value_type>& target) const 00306 { 00307 // TODO: priority sorting; if we insert high priority items first, we will find those again much faster 00308 00309 std::vector<value_type> rv; 00310 00311 size_type size = get_optimum_table_size(this->size()); 00312 00313 rv.resize(size); 00314 value_type empty; 00315 empty_value(empty); 00316 std::fill(rv.begin(), rv.end(), empty); 00317 00318 for (typename std::vector<value_type>::const_iterator it = m_values.begin(); it != m_values.end(); ++it) 00319 { 00320 size_type index = find(it->first, &rv[0], rv.size()); 00321 00322 if (index == rv.size()) 00323 { 00324 throw std::runtime_error("duplicate key detected"); 00325 } 00326 00327 rv[index] = *it; 00328 } 00329 00330 target.swap(rv); 00331 } 00332 00333 key_type m_empty_key; 00334 float m_max_pop_ratio; 00335 std::vector<value_type> m_values; 00336 }; 00337 00338 mmd_dense_hash_map() 00339 : m_size(0) 00340 , m_population(0) 00341 , m_begin(0) 00342 { 00343 } 00344 00345 mmd_dense_hash_map(const memory_mapped_dataset& mmd, const std::string& name) 00346 { 00347 set(mmd, name); 00348 } 00349 00350 void set(const memory_mapped_dataset& mmd, const std::string& name) 00351 { 00352 const memory_mapped_dataset::section_info& info = mmd.find(name, "mmd_dense_hash_map"); 00353 00354 if (info.getattr<size_t>("key_size") != sizeof(key_type)) 00355 { 00356 throw std::runtime_error("wrong key size for dense_hash_map " + name + " in dataset " + mmd.description()); 00357 } 00358 00359 if (info.getattr<size_t>("mapped_size") != sizeof(mapped_type)) 00360 { 00361 throw std::runtime_error("wrong mapped size for dense_hash_map " + name + " in dataset " + mmd.description()); 00362 } 00363 00364 if (info.getattr<size_t>("elem_size") != sizeof(value_type)) 00365 { 00366 // shouldn't happen unless we run into an alignment mismatch 00367 throw std::runtime_error("wrong element size for dense_hash_map " + name + " in dataset " + mmd.description()); 00368 } 00369 00370 m_size = info.getattr<size_type>("size"); 00371 m_population = info.getattr<size_type>("population"); 00372 m_begin = mmd.data<value_type>(info.offset(), m_size + 1); 00373 m_empty_key = m_begin[m_size].first; 00374 } 00375 00376 void warm_cache() const 00377 { 00378 memory_mapped_dataset::warm_cache(m_begin, m_begin + m_size); 00379 } 00380 00381 const_iterator begin() const 00382 { 00383 return const_iterator(*this, m_begin, m_begin + m_size); 00384 } 00385 00386 const_iterator end() const 00387 { 00388 return const_iterator(*this, m_begin + m_size, m_begin + m_size); 00389 } 00390 00391 size_type size() const 00392 { 00393 return m_population; 00394 } 00395 00396 size_type capacity() const 00397 { 00398 return m_size; 00399 } 00400 00401 bool empty() const 00402 { 00403 return size() == 0; 00404 } 00405 00406 const key_type& empty_key() const 00407 { 00408 return m_empty_key; 00409 } 00410 00411 const_iterator find(const key_type& key) const 00412 { 00413 return const_iterator(*this, m_begin + find(key, m_begin, m_size), m_begin + m_size); 00414 } 00415 00416 const mapped_type& operator[] (const key_type& key) const 00417 { 00418 size_type index = find(key, m_begin, m_size); 00419 00420 if (index < m_size) 00421 { 00422 return m_begin[index].second; 00423 } 00424 00425 throw std::runtime_error("no such key"); 00426 } 00427 00428 private: 00429 size_type find(const key_type& key, const value_type *begin, size_type size) const 00430 { 00431 HashingPolicy hash(key, size); 00432 00433 for (;;) 00434 { 00435 const value_type& target = begin[hash()]; 00436 00437 if (target.first == key) 00438 { 00439 break; 00440 } 00441 00442 if (target.first == m_empty_key) 00443 { 00444 return size; 00445 } 00446 00447 ++hash; 00448 } 00449 00450 return hash(); 00451 } 00452 00453 size_type m_size; 00454 size_type m_population; 00455 const value_type *m_begin; 00456 key_type m_empty_key; 00457 }; 00458 00459 }} 00460 00461 #endif