libmoost
/home/mhx/git/github/libmoost/include/moost/container/memory_mapped_dataset/dataset.hpp
Go to the documentation of this file.
00001 /* vim:set ts=3 sw=3 sts=3 et: */
00028 #ifndef MOOST_CONTAINER_MEMORY_MAPPED_DATASET_DATASET_HPP__
00029 #define MOOST_CONTAINER_MEMORY_MAPPED_DATASET_DATASET_HPP__
00030 
00031 #include <string>
00032 #include <stdexcept>
00033 #include <fstream>
00034 #include <sstream>
00035 #include <algorithm>
00036 #include <cstring>
00037 
00038 #include <boost/archive/text_iarchive.hpp>
00039 #include <boost/archive/text_oarchive.hpp>
00040 #include <boost/serialization/string.hpp>
00041 #include <boost/serialization/map.hpp>
00042 #include <boost/iostreams/device/mapped_file.hpp>
00043 #include <boost/type_traits/is_pod.hpp>
00044 #include <boost/lexical_cast.hpp>
00045 #include <boost/noncopyable.hpp>
00046 #include <boost/cstdint.hpp>
00047 
00048 #include "config.hpp"
00049 
00050 namespace moost { namespace container {
00051 
00063 class memory_mapped_dataset : public boost::noncopyable
00064 {
00065 private:
/// Magic number stored in the first header field of every dataset file.
/// The writer stores it and the reader rejects files whose value differs.
/// The bytes 4C 61 73 74 read "Last" when the value is laid out little-endian.
static const boost::uint32_t MMD_MAGIC = 0x7473614C;
/// Version of the container layout; written by the writer and required to
/// match exactly when a file is opened for reading.
static const boost::uint32_t MMD_VERSION = 1;

/// Granularity, in bytes, at which warm_cache() walks a memory range.
/// NOTE(review): presumably meant to match the OS page size -- confirm.
static const size_t MAP_PAGE_SIZE = 4096;
00070 
/**
 * \brief Fixed-size header located at offset 0 of a dataset file.
 *
 * The writer dumps this struct byte-for-byte and the reader reinterprets
 * the start of the mapping as this struct, so the order and the types of
 * the members define the on-disk layout and must not be changed. No
 * byte-order conversion is applied in this file; values are stored in the
 * representation of the writing host.
 */
struct mmd_header // must be POD
{
   boost::uint32_t mmd_magic;              ///< expected to equal MMD_MAGIC
   boost::uint32_t mmd_version;            ///< expected to equal MMD_VERSION
   boost::uint64_t index_offset;           ///< file offset of the serialized index; 0 until the writer is closed
   boost::uint64_t index_length;           ///< length of the serialized index in bytes; 0 until the writer is closed
};
00086 
00087 public:
00101    class section_info
00102    {
00103    private:
00104       typedef std::map<std::string, std::string> attribute_map_type;
00105 
00106    public:
00107       section_info()
00108          : m_offset(0)
00109          , m_alignment(0)
00110       {
00111       }
00112 
00113       section_info(const std::string& type, size_t alignment)
00114          : m_type(type)
00115          , m_offset(0)
00116          , m_alignment(alignment)
00117       {
00118       }
00119 
00120       boost::uint64_t offset() const
00121       {
00122          return m_offset;
00123       }
00124 
00125       size_t alignment() const
00126       {
00127          return m_alignment;
00128       }
00129 
00130       void set_offset(boost::uint64_t offset)
00131       {
00132          m_offset = offset;
00133       }
00134 
00135       template <typename T>
00136       void setattr(const std::string& name, const T& value)
00137       {
00138          m_attributes[name] = boost::lexical_cast<std::string>(value);
00139       }
00140 
00141       template <typename T>
00142       const T getattr(const std::string& name) const
00143       {
00144          attribute_map_type::const_iterator it = m_attributes.find(name);
00145 
00146          if (it == m_attributes.end())
00147          {
00148             throw std::runtime_error("no such attribute " + name);
00149          }
00150 
00151          return boost::lexical_cast<T>(it->second);
00152       }
00153 
00154       const std::string& type() const
00155       {
00156          return m_type;
00157       }
00158 
00159    private:
00160       friend class boost::serialization::access;
00161 
00162       template <class Archive>
00163       void serialize(Archive & ar, const unsigned int /* version */)
00164       {
00165          ar & m_type & m_offset & m_attributes;
00166       }
00167 
00168       std::string m_type;
00169       boost::uint64_t m_offset;
00170       const size_t m_alignment;
00171       attribute_map_type m_attributes;
00172    };
00173 
00174    typedef std::map<std::string, section_info> section_map_type;
00175 
00187    class writer
00188    {
00189    public:
00201       writer(const std::string& map_file_name, const std::string& dataset_name, boost::uint32_t format_version)
00202          : m_ofs(map_file_name.c_str(), std::ios::binary | std::ios::trunc)
00203          , m_dataset_name(dataset_name)
00204          , m_format_version(format_version)
00205       {
00206          if (!m_ofs)
00207          {
00208             throw std::runtime_error("failed to open file " + map_file_name);
00209          }
00210 
00211          if (dataset_name.empty())
00212          {
00213             throw std::runtime_error("empty dataset name");
00214          }
00215 
00216          m_header.mmd_magic = MMD_MAGIC;
00217          m_header.mmd_version = MMD_VERSION;
00218          m_header.index_offset = 0;
00219          m_header.index_length = 0;
00220 
00221          write(m_header);
00222       }
00223 
00224       ~writer()
00225       {
00226          try
00227          {
00228             close();
00229          }
00230          catch (...)
00231          {
00232          }
00233       }
00234 
00241       void close()
00242       {
00243          if (m_ofs.is_open())
00244          {
00245             m_header.index_offset = m_ofs.tellp();
00246             boost::archive::text_oarchive oa(m_ofs);
00247             oa << m_dataset_name << m_format_version << m_section_map;
00248             m_header.index_length = static_cast<boost::uint64_t>(m_ofs.tellp()) - m_header.index_offset;
00249             m_ofs.seekp(0);
00250             write(m_header);
00251             m_ofs.close();
00252          }
00253       }
00254 
00255       void create_section(const std::string& name, const std::string& type, size_t alignment)
00256       {
00257          if (name.empty())
00258          {
00259             throw std::runtime_error("invalid empty section name");
00260          }
00261 
00262          if (type.empty())
00263          {
00264             throw std::runtime_error("invalid empty section type");
00265          }
00266 
00267          if (alignment == 0 || (alignment & (alignment - 1)) != 0)
00268          {
00269             throw std::runtime_error("alignment must be a power of 2");
00270          }
00271 
00272          std::pair<section_map_type::iterator, bool> rv = m_section_map.insert(std::make_pair(name, section_info(type, alignment)));
00273 
00274          if (!rv.second)
00275          {
00276             throw std::runtime_error("attempt to create duplicate section " + name);
00277          }
00278       }
00279 
00280       void uncreate_section(const std::string& name)
00281       {
00282          if (name.empty())
00283          {
00284             throw std::runtime_error("invalid empty section name");
00285          }
00286 
00287          section_map_type::iterator it = m_section_map.find(name);
00288 
00289          if (it == m_section_map.end())
00290          {
00291             throw std::runtime_error("attempt to uncreate non-existent section " + name);
00292          }
00293 
00294          if (it->second.offset())
00295          {
00296             throw std::runtime_error("cannot uncreate section " + name + " that has already been written to");
00297          }
00298 
00299          m_section_map.erase(it);
00300       }
00301 
00302       void write(const std::string& section, const char *data, size_t size)
00303       {
00304          set_active_section(section);
00305          m_ofs.write(data, size);
00306       }
00307 
00308       void commit_section(const std::string& section)
00309       {
00310          set_active_section(section);
00311       }
00312 
00313       template <typename T>
00314       void setattr(const std::string& section, const std::string& attr, const T& value)
00315       {
00316          find(section).setattr(attr, value);
00317       }
00318 
00319    private:
00320       section_info& find(const std::string& section)
00321       {
00322          section_map_type::iterator it = m_section_map.find(section);
00323 
00324          if (it == m_section_map.end())
00325          {
00326             throw std::runtime_error("no such section " + section);
00327          }
00328 
00329          return it->second;
00330       }
00331 
00332       template <typename T>
00333       void write(const T& data)
00334       {
00335          m_ofs.write(reinterpret_cast<const char *>(&data), sizeof(data));
00336       }
00337 
00338       void set_active_section(const std::string& section)
00339       {
00340          // Yeah, this is slightly inefficient, but it's only being used at
00341          // dataset creation time.
00342          if (section != m_active_section)
00343          {
00344             section_info& sec = find(section);
00345 
00346             // We have to ensure that only one section is written at a time.
00347             if (sec.offset() > 0)
00348             {
00349                throw std::runtime_error("interleaved write access to section " + section);
00350             }
00351 
00352             align_stream(sec.alignment());
00353             sec.set_offset(m_ofs.tellp());
00354             m_active_section = section;
00355          }
00356       }
00357 
00358       void align_stream(size_t alignment)
00359       {
00360          while (m_ofs.tellp() % alignment)
00361          {
00362             m_ofs.put(0);
00363          }
00364       }
00365 
00366       section_map_type m_section_map;
00367       std::string m_active_section;
00368       std::ofstream m_ofs;
00369       const std::string m_dataset_name;
00370       const boost::uint32_t m_format_version;
00371       mmd_header m_header;
00372    };
00373 
00385    memory_mapped_dataset(const std::string& map_file_name,
00386                          const std::string& dataset_name,
00387                          boost::uint32_t format_version)
00388       : m_file(map_file_name)
00389       , m_format(dataset_name)
00390    {
00391       try
00392       {
00393          m_map.open(m_file, boost::iostreams::mapped_file::readonly);
00394       }
00395       catch (const BOOST_IOSTREAMS_FAILURE& fail)
00396       {
00397          // otherwise it's a real pain to figure out which file it's actually complaining about
00398          throw BOOST_IOSTREAMS_FAILURE(m_file + ": " + fail.what());
00399       }
00400 
00401       const mmd_header *hdr = data<mmd_header>();
00402 
00403       if (hdr->mmd_magic != MMD_MAGIC)
00404       {
00405          throw std::runtime_error(m_file + ": invalid magic");
00406       }
00407 
00408       if (hdr->mmd_version != MMD_VERSION)
00409       {
00410          throw std::runtime_error(m_file + ": unsupported version");
00411       }
00412 
00413       if (hdr->index_offset == 0 || hdr->index_length == 0)
00414       {
00415          throw std::runtime_error(m_file + ": corrupted file");
00416       }
00417 
00418       std::string indexstr(data<char>(hdr->index_offset, hdr->index_length), hdr->index_length);
00419       std::istringstream iss(indexstr);
00420 
00421       std::string dset_name;
00422       boost::uint32_t fmt_version;
00423 
00424       boost::archive::text_iarchive ia(iss);
00425       ia >> dset_name >> fmt_version >> m_section_map;
00426 
00427       if (dset_name != dataset_name)
00428       {
00429          throw std::runtime_error(m_file + ": unexpected format name: " + dset_name + " (expected " + dataset_name + ")");
00430       }
00431 
00432       if (fmt_version != format_version)
00433       {
00434          std::ostringstream oss;
00435          oss << m_file << ": unsupported format version: " << fmt_version << " (expected " << format_version << ")";
00436          throw std::runtime_error(oss.str());
00437       }
00438    }
00439 
00440    std::string description() const
00441    {
00442       return m_format + " (" + m_file + ")";
00443    }
00444 
00455    const section_info& find(const std::string& section, const std::string& type) const
00456    {
00457       section_map_type::const_iterator it = m_section_map.find(section);
00458 
00459       if (it == m_section_map.end())
00460       {
00461          throw std::runtime_error(m_file + ": no such section " + section);
00462       }
00463 
00464       if (it->second.offset() == 0)
00465       {
00466          throw std::runtime_error(m_file + ": corrupt section " + section);
00467       }
00468 
00469       if (it->second.type() != type)
00470       {
00471          throw std::runtime_error(m_file + ": invalid section type " + it->second.type() + " (expected " + type + ")");
00472       }
00473 
00474       return it->second;
00475    }
00476 
00488    template <typename T>
00489    const T *data(size_t offset = 0, size_t count = 1) const
00490    {
00491       BOOST_STATIC_ASSERT_MSG(boost::is_pod<T>::value, "data<>() called on non-POD type");
00492 
00493       if (offset + count*sizeof(T) > m_map.size())
00494       {
00495          throw std::runtime_error(m_file + ": potential attempt to access data beyond end of mapping");
00496       }
00497 
00498       return reinterpret_cast<const T *>(m_map.const_data() + offset);
00499    }
00500 
00504    const std::string& filename() const
00505    {
00506       return m_file;
00507    }
00508 
00518    static void warm_cache(const void *beg, const void *end)
00519    {
00520       const char *b = reinterpret_cast<const char *>(beg);
00521       const char *e = reinterpret_cast<const char *>(end);
00522       char buf[MAP_PAGE_SIZE];
00523       size_t page_off = static_cast<size_t>(b - static_cast<const char *>(0))%sizeof(buf);
00524 
00525       if (page_off)
00526       {
00527          size_t len = std::min(sizeof(buf) - page_off, static_cast<size_t>(e - b));
00528          std::memcpy(buf, beg, len);
00529          b += len;
00530       }
00531 
00532       while (b < e)
00533       {
00534          size_t len = std::min(sizeof(buf), static_cast<size_t>(e - b));
00535          std::memcpy(buf, b, len);
00536          b += len;
00537       }
00538    }
00539 
00540 private:
/// Path of the dataset file; also used as the prefix of error messages.
const std::string m_file;
/// Dataset name passed to the constructor; reported by description().
const std::string m_format;
/// Read-only mapping of the dataset file, opened in the constructor.
boost::iostreams::mapped_file m_map;
/// Section index deserialized from the file in the constructor.
section_map_type m_section_map;
00545 };
00546 
00547 }}
00548 
00549 #endif