libmoost
/home/mhx/git/github/libmoost/include/moost/container/memory_mapped_dataset/dense_hash_map.hpp
Go to the documentation of this file.
00001 /* vim:set ts=3 sw=3 sts=3 et: */
00028 #ifndef MOOST_CONTAINER_MEMORY_MAPPED_DATASET_DENSE_HASH_MAP_HPP__
00029 #define MOOST_CONTAINER_MEMORY_MAPPED_DATASET_DENSE_HASH_MAP_HPP__
00030 
00031 #include <string>
00032 #include <vector>
00033 #include <stdexcept>
00034 #include <iterator>
00035 
00036 #include <boost/type_traits/is_pod.hpp>
00037 #include <boost/noncopyable.hpp>
00038 
00039 #include "section_writer_base.hpp"
00040 #include "pod_pair.hpp"
00041 
00042 namespace moost { namespace container {
00043 
00057 template <typename Key, typename T, class HashFcn = MMD_DEFAULT_HASH_FCN<Key> >
00058 class mmd_dense_hash_map : public boost::noncopyable
00059 {
00060    BOOST_STATIC_ASSERT_MSG(boost::is_pod<Key>::value, "mmd_dense_hash_map<> template can only handle POD key types");
00061    BOOST_STATIC_ASSERT_MSG(boost::is_pod<T>::value, "mmd_dense_hash_map<> template can only handle POD value types");
00062 
00063    friend class const_iterator;
00064 
00065 public:
00066    static const size_t MMD_HASH_ALIGNMENT = 16;
00067 
00068    static float MAX_POPULATION_RATIO()
00069    {
00070       // lower values result in faster lookups and higher memory usage
00071       return 0.8;
00072    }
00073 
00074    typedef Key key_type;
00075    typedef T mapped_type;
00076    typedef pod_pair<Key, T> value_type;
00077 
00078    typedef size_t size_type;
00079 
00080    typedef const value_type& const_reference;
00081    typedef const value_type* const_pointer;
00082 
00083    typedef std::forward_iterator_tag iterator_category;
00084 
00085    class const_iterator
00086    {
00087       friend class mmd_dense_hash_map;
00088 
00089    public:
00090       const_reference operator* () const
00091       {
00092          return *m_it;
00093       }
00094 
00095       const_pointer operator-> () const
00096       {
00097          return &(operator*());
00098       }
00099 
00100       const_iterator& operator++ ()
00101       {
00102          ++m_it;
00103          skip();
00104          return *this;
00105       }
00106 
00107       const_iterator operator++ (int)
00108       {
00109          const_iterator tmp(*this);
00110          ++*this;
00111          return tmp;
00112       }
00113 
00114       bool operator== (const const_iterator& it) const
00115       {
00116          return m_it == it.m_it;
00117       }
00118 
00119       bool operator!= (const const_iterator& it) const
00120       {
00121          return !(*this == it);
00122       }
00123 
00124    private:
00125       const_iterator(const mmd_dense_hash_map& map, const value_type *it, const value_type *end)
00126          : m_it(it)
00127          , m_end(end)
00128          , m_map(map)
00129       {
00130          skip();
00131       }
00132 
00133       void skip()
00134       {
00135          while (m_it != m_end && m_it->first == m_map.empty_key())
00136          {
00137             ++m_it;
00138          }
00139       }
00140 
00141       const value_type *m_it;
00142       const value_type *m_end;
00143       const mmd_dense_hash_map& m_map;
00144    };
00145 
00146    struct HashingPolicy
00147    {
00148       HashingPolicy(const key_type& k, size_type size)
00149          : m_size(size)
00150          , m_hash_mask(size - 1)
00151          , m_index(HashFcn()(k) & m_hash_mask)
00152          , m_iter(0)
00153       {
00154       }
00155 
00156       HashingPolicy& operator++()
00157       {
00158          if (++m_iter < m_size)
00159          {
00160             m_index = (m_index + m_iter) & m_hash_mask;
00161          }
00162          else
00163          {
00164             m_index = m_size;
00165          }
00166 
00167          return *this;
00168       }
00169 
00170       size_type operator() () const
00171       {
00172          return m_index;
00173       }
00174 
00175       size_type iter() const
00176       {
00177          return m_iter;
00178       }
00179 
00180    private:
00181       const size_type m_size;
00182       const size_type m_hash_mask;
00183       size_type m_index;
00184       size_type m_iter;
00185    };
00186 
00187 public:
00188    class writer : public mmd_section_writer_base
00189    {
00190    public:
00191       writer(memory_mapped_dataset::writer& wr, const std::string& name, const key_type& empty_key, float max_population_ratio = MAX_POPULATION_RATIO(), size_t alignment = MMD_HASH_ALIGNMENT)
00192          : mmd_section_writer_base(wr, name, "mmd_dense_hash_map", alignment)
00193          , m_empty_key(empty_key)
00194          , m_max_pop_ratio(max_population_ratio)
00195       {
00196          if (max_population_ratio < 0.009999 || max_population_ratio > 0.990001)
00197          {
00198             rollback();
00199             throw std::runtime_error("invalid max_population_ratio (must be in [0.01, 0.99])");
00200          }
00201          setattr("key_size", sizeof(key_type));
00202          setattr("mapped_size", sizeof(mapped_type));
00203          setattr("elem_size", sizeof(value_type));
00204       }
00205 
00206       writer& operator<< (const value_type& e)
00207       {
00208          insert(e);
00209          return *this;
00210       }
00211 
00212       writer& operator<< (const std::pair<Key, T>& e)
00213       {
00214          insert(e);
00215          return *this;
00216       }
00217 
00218       void insert(const value_type& e)
00219       {
00220          if (e.first == m_empty_key)
00221          {
00222             throw std::runtime_error("attempt to insert empty key");
00223          }
00224          m_values.push_back(e);
00225       }
00226 
00227       void insert(const std::pair<Key, T>& e)
00228       {
00229          value_type v;
00230          v.first = e.first;
00231          v.second = e.second;
00232          insert(v);
00233       }
00234 
00235       size_type size() const
00236       {
00237          return m_values.size();
00238       }
00239 
00240    protected:
00241       void pre_commit()      // all the writing actually happens here
00242       {
00243          setattr("population", size());
00244 
00245          std::vector<value_type> map;
00246          build_dense_hash_map(map);
00247          write(map);
00248 
00249          value_type empty;
00250          empty_value(empty);
00251          write(empty);
00252 
00253          setattr("size", map.size());
00254       }
00255 
00256    private:
00257       size_type get_optimum_table_size(size_type pop) const
00258       {
00259          if (pop <= 1)
00260          {
00261             return pop;
00262          }
00263 
00264          pop /= m_max_pop_ratio;
00265          size_type opt = 1;
00266 
00267          while (opt < pop)
00268          {
00269             opt <<= 1;
00270          }
00271 
00272          return opt;
00273       }
00274 
00275       size_type find(const key_type& key, const value_type *begin, size_type size) const
00276       {
00277          HashingPolicy hash(key, size);
00278 
00279          for (;;)
00280          {
00281             const value_type& target = begin[hash()];
00282 
00283             if (target.first == m_empty_key)
00284             {
00285                break;
00286             }
00287 
00288             if (target.first == key)
00289             {
00290                return size;
00291             }
00292 
00293             ++hash;
00294          }
00295 
00296          return hash();
00297       }
00298 
00299       void empty_value(value_type& val) const
00300       {
00301          std::memset(&val, 0, sizeof(value_type));
00302          val.first = m_empty_key;
00303       }
00304 
00305       void build_dense_hash_map(std::vector<value_type>& target) const
00306       {
00307          // TODO: priority sorting; if we insert high priority items first, we will find those again much faster
00308 
00309          std::vector<value_type> rv;
00310 
00311          size_type size = get_optimum_table_size(this->size());
00312 
00313          rv.resize(size);
00314          value_type empty;
00315          empty_value(empty);
00316          std::fill(rv.begin(), rv.end(), empty);
00317 
00318          for (typename std::vector<value_type>::const_iterator it = m_values.begin(); it != m_values.end(); ++it)
00319          {
00320             size_type index = find(it->first, &rv[0], rv.size());
00321 
00322             if (index == rv.size())
00323             {
00324                throw std::runtime_error("duplicate key detected");
00325             }
00326 
00327             rv[index] = *it;
00328          }
00329 
00330          target.swap(rv);
00331       }
00332 
00333       key_type m_empty_key;
00334       float m_max_pop_ratio;
00335       std::vector<value_type> m_values;
00336    };
00337 
00338    mmd_dense_hash_map()
00339       : m_size(0)
00340       , m_population(0)
00341       , m_begin(0)
00342    {
00343    }
00344 
00345    mmd_dense_hash_map(const memory_mapped_dataset& mmd, const std::string& name)
00346    {
00347       set(mmd, name);
00348    }
00349 
00350    void set(const memory_mapped_dataset& mmd, const std::string& name)
00351    {
00352       const memory_mapped_dataset::section_info& info = mmd.find(name, "mmd_dense_hash_map");
00353 
00354       if (info.getattr<size_t>("key_size") != sizeof(key_type))
00355       {
00356          throw std::runtime_error("wrong key size for dense_hash_map " + name + " in dataset " + mmd.description());
00357       }
00358 
00359       if (info.getattr<size_t>("mapped_size") != sizeof(mapped_type))
00360       {
00361          throw std::runtime_error("wrong mapped size for dense_hash_map " + name + " in dataset " + mmd.description());
00362       }
00363 
00364       if (info.getattr<size_t>("elem_size") != sizeof(value_type))
00365       {
00366          // shouldn't happen unless we run into an alignment mismatch
00367          throw std::runtime_error("wrong element size for dense_hash_map " + name + " in dataset " + mmd.description());
00368       }
00369 
00370       m_size = info.getattr<size_type>("size");
00371       m_population = info.getattr<size_type>("population");
00372       m_begin = mmd.data<value_type>(info.offset(), m_size + 1);
00373       m_empty_key = m_begin[m_size].first;
00374    }
00375 
00376    void warm_cache() const
00377    {
00378       memory_mapped_dataset::warm_cache(m_begin, m_begin + m_size);
00379    }
00380 
00381    const_iterator begin() const
00382    {
00383       return const_iterator(*this, m_begin, m_begin + m_size);
00384    }
00385 
00386    const_iterator end() const
00387    {
00388       return const_iterator(*this, m_begin + m_size, m_begin + m_size);
00389    }
00390 
00391    size_type size() const
00392    {
00393       return m_population;
00394    }
00395 
00396    size_type capacity() const
00397    {
00398       return m_size;
00399    }
00400 
00401    bool empty() const
00402    {
00403       return size() == 0;
00404    }
00405 
00406    const key_type& empty_key() const
00407    {
00408       return m_empty_key;
00409    }
00410 
00411    const_iterator find(const key_type& key) const
00412    {
00413       return const_iterator(*this, m_begin + find(key, m_begin, m_size), m_begin + m_size);
00414    }
00415 
00416    const mapped_type& operator[] (const key_type& key) const
00417    {
00418       size_type index = find(key, m_begin, m_size);
00419 
00420       if (index < m_size)
00421       {
00422          return m_begin[index].second;
00423       }
00424 
00425       throw std::runtime_error("no such key");
00426    }
00427 
00428 private:
00429    size_type find(const key_type& key, const value_type *begin, size_type size) const
00430    {
00431       HashingPolicy hash(key, size);
00432 
00433       for (;;)
00434       {
00435          const value_type& target = begin[hash()];
00436 
00437          if (target.first == key)
00438          {
00439             break;
00440          }
00441 
00442          if (target.first == m_empty_key)
00443          {
00444             return size;
00445          }
00446 
00447          ++hash;
00448       }
00449 
00450       return hash();
00451    }
00452 
00453    size_type m_size;
00454    size_type m_population;
00455    const value_type *m_begin;
00456    key_type m_empty_key;
00457 };
00458 
00459 }}
00460 
00461 #endif