cdbpp.h

00001 /*
00002  *      C++ implementation of Constant Database (CDB++)
00003  *
00004  * Copyright (c) 2008,2009, Naoaki Okazaki
00005  * All rights reserved.
00006  *
00007  * Redistribution and use in source and binary forms, with or without
00008  * modification, are permitted provided that the following conditions are met:
00009  *     * Redistributions of source code must retain the above copyright
00010  *       notice, this list of conditions and the following disclaimer.
00011  *     * Redistributions in binary form must reproduce the above copyright
00012  *       notice, this list of conditions and the following disclaimer in the
00013  *       documentation and/or other materials provided with the distribution.
00014  *     * Neither the name of the authors nor the names of its contributors may
00015  *       be used to endorse or promote products derived from this software
00016  *       without specific prior written permission.
00017  *
00018  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00019  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00020  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
00021  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
00022  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00023  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00024  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00025  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00026  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00027  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00028  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00029  */
00030 
00031 /* $Id: cdbpp.h 14 2009-07-13 16:39:27Z naoaki $ */
00032 
00033 #ifndef __CDBPP_H__
00034 #define __CDBPP_H__
00035 
00036 #include <cstring>
00037 #include <fstream>
00038 #include <functional>
00039 #include <iostream>
00040 #include <vector>
00041 #include <stdint.h>
00042 #include <stdexcept>
00043 
00044 namespace cdbpp
00045 {
00046 
00054 // Global constants.
00055 enum {
00056     // Version number.
00057     VERSION = 1,
00058     // The number of hash tables.
00059     NUM_TABLES = 256,
00060     // A constant for byte-order checking.
00061     BYTEORDER_CHECK = 0x62445371,
00062 };
00063 
00064 
00065 
00066 
00080 class murmurhash2 :
00081     public std::binary_function<const void *, size_t, uint32_t>
00082 {
00083 protected:
00084     inline static uint32_t get32bits(const char *d)
00085     {
00086         return *reinterpret_cast<const uint32_t*>(d);
00087     }
00088 
00089 public:
00090     inline uint32_t operator() (const void *key, size_t size) const
00091     {
00092         // 'm' and 'r' are mixing constants generated offline.
00093         // They're not really 'magic', they just happen to work well.
00094 
00095         const uint32_t m = 0x5bd1e995;
00096         const int32_t r = 24;
00097 
00098         // Initialize the hash to a 'random' value
00099 
00100         const uint32_t seed = 0x87654321;
00101         uint32_t h = seed ^ size;
00102 
00103         // Mix 4 bytes at a time into the hash
00104 
00105         const char * data = (const char *)key;
00106 
00107         while (size >= 4)
00108         {
00109             uint32_t k = get32bits(data);
00110 
00111             k *= m; 
00112             k ^= k >> r; 
00113             k *= m; 
00114             
00115             h *= m; 
00116             h ^= k;
00117 
00118             data += 4;
00119             size -= 4;
00120         }
00121         
00122         // Handle the last few bytes of the input array
00123 
00124         switch (size)
00125         {
00126         case 3: h ^= data[2] << 16;
00127         case 2: h ^= data[1] << 8;
00128         case 1: h ^= data[0];
00129                 h *= m;
00130         };
00131 
00132         // Do a few final mixes of the hash to ensure the last few
00133         // bytes are well-incorporated.
00134 
00135         h ^= h >> 13;
00136         h *= m;
00137         h ^= h >> 15;
00138 
00139         return h;
00140     } 
00141 };
00142 
00143 
00144 
00145 
00146 struct tableref_t
00147 {
00148     uint32_t    offset;     // Offset to a hash table.
00149     uint32_t    num;        // Number of elements in the hash table.
00150 };
00151 
00152 
00153 static uint32_t get_data_begin()
00154 {
00155     return (16 + sizeof(tableref_t) * NUM_TABLES);
00156 }
00157 
00158 
00159 
00163 class builder_exception : public std::invalid_argument
00164 {
00165 public:
00166     builder_exception(const std::string& msg)
00167         : std::invalid_argument(msg)
00168     {
00169     }
00170 };
00171 
00172 
00173 
00177 template <typename hash_function>
00178 class builder_base
00179 {
00180 protected:
00181     // A bucket structure.
00182     struct bucket
00183     {
00184         uint32_t    hash;       // Hash value of the record.
00185         uint32_t    offset;     // Offset address to the actual record.
00186 
00187         bucket() : hash(0), offset(0)
00188         {
00189         }
00190 
00191         bucket(uint32_t h, uint32_t o) : hash(h), offset(o)
00192         {
00193         }
00194     };
00195 
00196     // A hash table is a vector of buckets.
00197     typedef std::vector<bucket> hashtable;
00198 
00199 protected:
00200     std::ofstream&  m_os;               // Output stream.
00201     uint32_t        m_begin;
00202     uint32_t        m_cur;
00203     hashtable       m_ht[NUM_TABLES];   // Hash tables.
00204 
00205 public:
00212     builder_base(std::ofstream& os) : m_os(os)
00213     {
00214         m_begin = m_os.tellp();
00215         m_cur = get_data_begin();
00216         m_os.seekp(m_begin + m_cur);
00217     }
00218 
00222     virtual ~builder_base()
00223     {
00224         this->close();
00225     }
00226 
00236     template <class key_t, class value_t>
00237     void put(const key_t *key, size_t ksize, const value_t *value, size_t vsize)
00238     {
00239         // Write out the current record.
00240         write_uint32((uint32_t)ksize);
00241         m_os.write(reinterpret_cast<const char *>(key), ksize);
00242         write_uint32((uint32_t)vsize);
00243         m_os.write(reinterpret_cast<const char *>(value), vsize);
00244 
00245         // Compute the hash value and choose a hash table.
00246         uint32_t hv = hash_function()(static_cast<const void *>(key), ksize);
00247         hashtable& ht = m_ht[hv % NUM_TABLES];
00248 
00249         // Store the hash value and offset to the hash table.
00250         ht.push_back(bucket(hv, m_cur));
00251 
00252         // Increment the current position.
00253         m_cur += sizeof(uint32_t) + ksize + sizeof(uint32_t) + vsize;
00254     }
00255 
00256 protected:
00257     void close()
00258     {
00259         // Check the consistency of the stream offset.
00260         if (m_begin + m_cur != (uint32_t)m_os.tellp()) {
00261             throw builder_exception("Inconsistent stream offset");
00262         }
00263 
00264         // Store the hash tables. At this moment, the file pointer refers to
00265         // the offset succeeding the last key/value pair.
00266         for (size_t i = 0;i < NUM_TABLES;++i) {
00267             hashtable& ht = m_ht[i];
00268 
00269             // Do not write an empty hash table.
00270             if (!ht.empty()) {
00271                 // An actual table will have the double size; half elements
00272                 // in the table are kept empty.
00273                 int n = ht.size() * 2;
00274 
00275                 // Allocate the actual table.
00276                 bucket* dst = new bucket[n];
00277 
00278                 // Put hash elements to the table with the open-address method.
00279                 typename hashtable::const_iterator it;
00280                 for (it = ht.begin();it != ht.end();++it) {
00281                     int k = (it->hash >> 8) % n;
00282 
00283                     // Find a vacant element.
00284                     while (dst[k].offset != 0) {
00285                         k = (k+1) % n;
00286                     }
00287 
00288                     // Store the hash element.
00289                     dst[k].hash = it->hash;
00290                     dst[k].offset = it->offset;
00291                 }
00292 
00293                 // Write out the new table.
00294                 for (int k = 0;k < n;++k) {
00295                     write_uint32(dst[k].hash);
00296                     write_uint32(dst[k].offset);
00297                 }
00298 
00299                 // Free the table.
00300                 delete[] dst;
00301             }
00302         }
00303 
00304         // Store the current position.
00305         uint32_t offset = (uint32_t)m_os.tellp();
00306 
00307         // Rewind the stream position to the beginning.
00308         m_os.seekp(m_begin);
00309 
00310         // Write the file header.
00311         char chunkid[4] = {'C','D','B','+'};
00312         m_os.write(chunkid, 4);
00313         write_uint32(offset - m_begin);
00314         write_uint32(VERSION);
00315         write_uint32(BYTEORDER_CHECK);
00316 
00317         // Write references to hash tables. At this moment, dbw->cur points
00318         // to the offset succeeding the last key/data pair. 
00319         for (size_t i = 0;i < NUM_TABLES;++i) {
00320             // Offset to the hash table (or zero for non-existent tables).
00321             write_uint32(m_ht[i].empty() ? 0 : m_cur);
00322             // Bucket size is double to the number of elements.
00323             write_uint32(m_ht[i].size() * 2);
00324             // Advance the offset counter.
00325             m_cur += sizeof(uint32_t) * 2 * m_ht[i].size() * 2;
00326         }
00327 
00328         // Seek to the last position.
00329         m_os.seekp(offset);
00330     }
00331 
00332     inline void write_uint32(uint32_t value)
00333     {
00334         m_os.write(reinterpret_cast<const char *>(&value), sizeof(value));
00335     }
00336 };
00337 
00338 
00339 
00343 class cdbpp_exception : public std::invalid_argument
00344 {
00345 public:
00346     cdbpp_exception(const std::string& msg)
00347         : std::invalid_argument(msg)
00348     {
00349     }
00350 };
00351 
00355 template <typename hash_function>
00356 class cdbpp_base
00357 {
00358 protected:
00359     struct bucket_t
00360     {
00361         uint32_t    hash;           // Hash value of the record.
00362         uint32_t    offset;         // Offset address to the actual record.
00363     };
00364 
00365 
00366     struct hashtable_t
00367     {
00368         uint32_t        num;            // Number of elements in the table.
00369         const bucket_t* buckets;        // Buckets (array of bucket).
00370     };
00371 
00372 
00373 protected:
00374     const uint8_t*  m_buffer;           // Pointer to the memory block.
00375     size_t          m_size;             // Size of the memory block.
00376     bool            m_own;              // 
00377 
00378     hashtable_t     m_ht[NUM_TABLES];   // Hash tables.
00379     size_t          m_n;
00380 
00381 public:
00385     cdbpp_base()
00386         : m_buffer(NULL), m_size(0), m_own(false), m_n(0)
00387     {
00388     }
00389 
00397     cdbpp_base(const void *buffer, size_t size, bool own)
00398         : m_buffer(NULL), m_size(0), m_own(false), m_n(0)
00399     {
00400         this->open(buffer, size, own);
00401     }
00402 
00408     cdbpp_base(std::ifstream& ifs)
00409         : m_buffer(NULL), m_size(0), m_own(false), m_n(0)
00410     {
00411         this->open(ifs);
00412     }
00413 
00417     virtual ~cdbpp_base()
00418     {
00419         close();
00420     }
00421 
00427     bool is_open() const
00428     {
00429         return (m_buffer != NULL);
00430     }
00431 
00436     size_t size() const
00437     {
00438         return m_n;
00439     }
00440 
00446     bool empty() const
00447     {
00448         return (m_n == 0);
00449     }
00450 
00456     size_t open(std::ifstream& ifs)
00457     {
00458         char chunk[4], size[4];
00459         std::istream::pos_type offset = ifs.tellg();
00460 
00461         do {
00462             // Read a chunk identifier.
00463             ifs.read(chunk, 4);
00464             if (ifs.fail()) {
00465                 break;
00466             }
00467 
00468             // Check the chunk identifier.
00469             if (std::strncmp(chunk, "CDB+", 4) != 0) {
00470                 break;
00471             }
00472 
00473             // Read the size of the chunk.
00474             ifs.read(size, 4);
00475             if (ifs.fail()) {
00476                 break;
00477             }
00478 
00479             // Allocate a memory block for the chunk.
00480             uint32_t chunk_size = read_uint32(reinterpret_cast<uint8_t*>(size));
00481             uint8_t* block = new uint8_t[chunk_size];
00482 
00483             // Read the memory image from the stream.
00484             ifs.seekg(0, std::ios_base::beg);
00485             if (ifs.fail()) {
00486                 break;
00487             }
00488             ifs.read(reinterpret_cast<char*>(block), chunk_size);
00489             if (ifs.fail()) {
00490                 break;
00491             }
00492 
00493             return this->open(block, chunk_size, true);
00494 
00495         } while (0);
00496 
00497         ifs.seekg(offset, std::ios::beg);
00498         return 0;
00499     }
00500 
00508     size_t open(const void *buffer, size_t size, bool own = false)
00509     {
00510         const uint8_t *p = reinterpret_cast<const uint8_t*>(buffer);
00511 
00512         // Make sure that the size of the chunk is larger than the minimum size.
00513         if (size < get_data_begin()) {
00514             throw cdbpp_exception("The memory image is smaller than a chunk header.");
00515         }
00516 
00517         // Check the chunk identifier.
00518         if (memcmp(p, "CDB+", 4) != 0) {
00519             throw cdbpp_exception("Incorrect chunk header");
00520         }
00521         p += 4;
00522 
00523         // Read the chunk header.
00524         uint32_t csize = read_uint32(p);
00525         p += sizeof(uint32_t);
00526         uint32_t version = read_uint32(p);
00527         p += sizeof(uint32_t);
00528         uint32_t byteorder = read_uint32(p);
00529         p += sizeof(uint32_t);
00530 
00531         // Check the byte-order consistency.
00532         if (byteorder != BYTEORDER_CHECK) {
00533             throw cdbpp_exception("Inconsistent byte order");
00534         }
00535         // Check the chunk size.
00536         if (size < csize) {
00537             throw cdbpp_exception("The memory image is smaller than a chunk size.");
00538         }
00539 
00540         // Set memory block and size.
00541         m_buffer = reinterpret_cast<const uint8_t*>(buffer);
00542         m_size = size;
00543         m_own = own;
00544 
00545         // Set pointers to the hash tables.
00546         m_n = 0;
00547         const tableref_t* ref = reinterpret_cast<const tableref_t*>(p);
00548         for (size_t i = 0;i < NUM_TABLES;++i) {
00549             if (ref[i].offset) {
00550                 // Set the buckets.
00551                 m_ht[i].buckets = reinterpret_cast<const bucket_t*>(m_buffer + ref[i].offset);
00552                 m_ht[i].num = ref[i].num;
00553             } else {
00554                 // An empty hash table.
00555                 m_ht[i].buckets = NULL;
00556                 m_ht[i].num = 0;
00557             }
00558 
00559             // The number of records is the half of the table size.
00560             m_n += (ref[i].num / 2);
00561         }
00562 
00563         return (size_t)csize;
00564     }
00565 
00569     void close()
00570     {
00571         if (m_own && m_buffer != NULL) {
00572             delete[] m_buffer;
00573         }
00574         m_buffer = NULL;
00575         m_size = 0;
00576         m_n = 0;
00577     }
00578 
00587     const void* get(const void *key, size_t ksize, size_t* vsize) const
00588     {
00589         uint32_t hv = hash_function()(key, ksize);
00590         const hashtable_t* ht = &m_ht[hv % NUM_TABLES];
00591 
00592         if (ht->num && ht->buckets != NULL) {
00593             int n = ht->num;
00594             int k = (hv >> 8) % n;
00595             const bucket_t* p = NULL;
00596 
00597             while (p = &ht->buckets[k], p->offset) {
00598                 if (p->hash == hv) {
00599                     const uint8_t *q = m_buffer + p->offset;
00600                     if (read_uint32(q) == ksize &&
00601                         memcmp(key, q + sizeof(uint32_t), ksize) == 0) {
00602                         q += sizeof(uint32_t) + ksize;
00603                         if (vsize != NULL) {
00604                             *vsize = read_uint32(q);
00605                         }
00606                         return q + sizeof(uint32_t);
00607                     }
00608                 }
00609                 k = (k+1) % n;
00610             }
00611         }
00612 
00613         if (vsize != NULL) {
00614             *vsize = 0;
00615         }
00616         return NULL;
00617     }
00618 
00619 protected:
00620     inline uint32_t read_uint32(const uint8_t* p) const
00621     {
00622         return *reinterpret_cast<const uint32_t*>(p);
00623     }
00624 };
00625 
00627 typedef builder_base<murmurhash2> builder;
00629 typedef cdbpp_base<murmurhash2> cdbpp;
00630 
00631 
00632 };
00633 
00732 #endif/*__CDBPP_H__*/

Copyright (c) 2002-2009 by Naoaki Okazaki
Tue Jul 14 01:35:35 2009