about summary refs log tree commit homepage
path: root/ext/tdb/murmur2.c
diff options
context:
space:
mode:
Diffstat (limited to 'ext/tdb/murmur2.c')
-rw-r--r--  ext/tdb/murmur2.c  290
1 file changed, 290 insertions, 0 deletions
diff --git a/ext/tdb/murmur2.c b/ext/tdb/murmur2.c
new file mode 100644
index 0000000..6b6f8a6
--- /dev/null
+++ b/ext/tdb/murmur2.c
@@ -0,0 +1,290 @@
+#include "rbtdb.h"
+/*
+ * https://sites.google.com/site/murmurhash/
+ *
+ * Public Domain hash functions by Austin Appleby.
+ *
+ * Trivially adapted for use with Ruby TDB by Eric Wong.
+ */
+
+/*
+ * 'm' and 'r' are mixing constants generated offline.
+ * They're not really 'magic', they just happen to work well.
+ */
+
+static const unsigned int m = 0x5bd1e995;
+static const int r = 24;
+/*
+ * Seed shared by every hash variant below.  It has no initializer, so
+ * static storage duration guarantees it is zero-initialized; all four
+ * functions therefore hash with seed == 0.
+ */
+static const unsigned int seed;
+
+/*
+ * Classic 32-bit MurmurHash2 of key->dptr (key->dsize bytes).
+ *
+ * NOTE(review): the bulk loop reads the buffer through an unaligned
+ * (const unsigned int *) cast, so the result is byte-order dependent
+ * and the load may fault on strict-alignment CPUs -- that is the
+ * reason the _aligned and _neutral variants below exist.
+ */
+unsigned int rbtdb_murmur2(TDB_DATA * key)
+{
+        const unsigned char *data = key->dptr;
+        int len = (int)key->dsize;
+        /* Initialize the hash to a 'random' value */
+        unsigned int h = seed ^ len;
+
+        /* Mix 4 bytes at a time into the hash */
+        while (len >= 4) {
+                unsigned int k = *(const unsigned int *)data;
+
+                k *= m;
+                k ^= k >> r;
+                k *= m;
+
+                h *= m;
+                h ^= k;
+
+                data += 4;
+                len -= 4;
+        }
+
+        /* Handle the last few bytes of the input array */
+        switch (len) {
+        case 3:
+                h ^= data[2] << 16;
+                /* fallthrough */
+        case 2:
+                h ^= data[1] << 8;
+                /* fallthrough */
+        case 1:
+                h ^= data[0];
+                h *= m;
+        };
+
+        /*
+         * Do a few final mixes of the hash to ensure the last few
+         * bytes are well-incorporated.
+         */
+
+        h ^= h >> 13;
+        h *= m;
+        h ^= h >> 15;
+
+        return h;
+}
+
+/*
+ * This is a variant of MurmurHash2 modified to use the Merkle-Damgard
+ * construction. Bulk speed should be identical to Murmur2, small-key speed
+ * will be 10%-20% slower due to the added overhead at the end of the hash.
+ *
+ * This variant fixes a minor issue where null keys were more likely to
+ * collide with each other than expected, and also makes the algorithm
+ * more amenable to incremental implementations. All other caveats from
+ * MurmurHash2 still apply.
+ */
+
+/* NOTE: mmix() modifies BOTH of its arguments (h and k) and evaluates
+ * them more than once -- only pass plain lvalues, never expressions
+ * with side effects. */
+#define mmix(h,k) do { k *= m; k ^= k >> r; k *= m; h *= m; h ^= k; } while (0)
+
+unsigned int rbtdb_murmur2a(TDB_DATA * key)
+{
+        const unsigned char *data = key->dptr;
+        int len = (int)key->dsize;
+        /* 'l' preserves the total length so it can be mixed in at the
+         * end (the Merkle-Damgard-style length padding). */
+        unsigned int l = (unsigned int)len;
+        unsigned int h = seed;
+        unsigned int t = 0;
+
+        /* Bulk: same unaligned 4-byte loads as rbtdb_murmur2(). */
+        while (len >= 4) {
+                unsigned int k = *(const unsigned int *)data;
+
+                mmix(h, k);
+
+                data += 4;
+                len -= 4;
+        }
+
+        /* Gather the 0-3 trailing bytes into t; mixed in below. */
+        switch (len) {
+        case 3:
+                t ^= data[2] << 16;
+                /* fallthrough */
+        case 2:
+                t ^= data[1] << 8;
+                /* fallthrough */
+        case 1:
+                t ^= data[0];
+        };
+
+        mmix(h, t);
+        mmix(h, l);
+
+        h ^= h >> 13;
+        h *= m;
+        h ^= h >> 15;
+
+        return h;
+}
+
+/*
+ * Same algorithm as MurmurHash2, but only does aligned reads - should be safer
+ * on certain platforms
+ *
+ * Performance will be lower than MurmurHash2
+ */
+
+/* NOTE: MIX() modifies both h and k and evaluates its arguments more
+ * than once -- pass only side-effect-free lvalues. */
+#define MIX(h,k,m) do { k *= m; k ^= k >> r; k *= m; h *= m; h ^= k; } while (0)
+
+unsigned int rbtdb_murmur2_aligned(TDB_DATA * key)
+{
+        const unsigned char *data = key->dptr;
+        int len = (int)key->dsize;
+        unsigned int h = seed ^ len;
+        /* Union punning to get the pointer's low bits without a
+         * pointer-to-integer cast warning. */
+        union { const unsigned char *byte; int integer; } cast = { data };
+        int align = cast.integer & 3;
+
+        if (align && (len >= 4)) {
+                /* Pre-load the temp registers */
+                unsigned int t = 0, d = 0;
+                int sl, sr;
+
+                /* Read the (4 - align) leading bytes one at a time so
+                 * 'data' can be advanced to a 4-byte boundary.  All
+                 * cases deliberately fall through. */
+                switch (align) {
+                case 1:
+                        t |= data[2] << 16;
+                        /* fallthrough */
+                case 2:
+                        t |= data[1] << 8;
+                        /* fallthrough */
+                case 3:
+                        t |= data[0];
+                }
+
+                t <<= (8 * align);
+
+                data += 4 - align;
+                len -= 4 - align;
+
+                /* Shift amounts to splice two aligned words into one
+                 * logical unaligned word: sl + sr == 32. */
+                sl = 8 * (4 - align);
+                sr = 8 * align;
+
+                /* Mix */
+                while (len >= 4) {
+                        unsigned int k;
+
+                        /* Aligned load, then combine with the carried
+                         * remainder from the previous word. */
+                        d = *(const unsigned int *)data;
+                        t = (t >> sr) | (d << sl);
+
+                        k = t;
+
+                        MIX(h, k, m);
+
+                        t = d;
+
+                        data += 4;
+                        len -= 4;
+                }
+
+                /* Handle leftover data in temp registers */
+                d = 0;
+                if (len >= align) {
+                        unsigned int k;
+
+                        /* Enough bytes remain to complete the carried
+                         * word; cases fall through. */
+                        switch (align) {
+                        case 3:
+                                d |= data[2] << 16;
+                                /* fallthrough */
+                        case 2:
+                                d |= data[1] << 8;
+                                /* fallthrough */
+                        case 1:
+                                d |= data[0];
+                        }
+
+                        k = (t >> sr) | (d << sl);
+                        MIX(h, k, m);
+
+                        data += align;
+                        len -= align;
+
+                        /* Handle tail bytes */
+                        switch (len) {
+                        case 3:
+                                h ^= data[2] << 16;
+                                /* fallthrough */
+                        case 2:
+                                h ^= data[1] << 8;
+                                /* fallthrough */
+                        case 1:
+                                h ^= data[0];
+                                h *= m;
+                        };
+                } else {
+                        /* Fewer than 'align' bytes left: fold what
+                         * remains plus the carried bytes in one step.
+                         * Cases fall through into case 0. */
+                        switch (len) {
+                        case 3:
+                                d |= data[2] << 16;
+                                /* fallthrough */
+                        case 2:
+                                d |= data[1] << 8;
+                                /* fallthrough */
+                        case 1:
+                                d |= data[0];
+                                /* fallthrough */
+                        case 0:
+                                h ^= (t >> sr) | (d << sl);
+                                h *= m;
+                        }
+                }
+
+                h ^= h >> 13;
+                h *= m;
+                h ^= h >> 15;
+
+                return h;
+        } else {
+                /* Pointer already aligned (or input < 4 bytes): plain
+                 * MurmurHash2 loop with aligned word loads. */
+                while (len >= 4) {
+                        unsigned int k = *(const unsigned int *)data;
+
+                        MIX(h, k, m);
+
+                        data += 4;
+                        len -= 4;
+                }
+
+                /* Handle tail bytes */
+                switch (len) {
+                case 3:
+                        h ^= data[2] << 16;
+                        /* fallthrough */
+                case 2:
+                        h ^= data[1] << 8;
+                        /* fallthrough */
+                case 1:
+                        h ^= data[0];
+                        h *= m;
+                };
+
+                h ^= h >> 13;
+                h *= m;
+                h ^= h >> 15;
+
+                return h;
+        }
+}
+
+/*
+ * Same as MurmurHash2, but endian- and alignment-neutral.
+ * Half the speed though, alas.
+ */
+unsigned int rbtdb_murmur2_neutral(TDB_DATA * key)
+{
+        const unsigned char *data = key->dptr;
+        int len = (int)key->dsize;
+        unsigned int h = seed ^ len;
+
+        while (len >= 4) {
+                unsigned int k;
+
+                /* Assemble the word byte-by-byte (little-endian order)
+                 * so the result matches rbtdb_murmur2() on LE hosts
+                 * and is identical on any CPU/alignment. */
+                k = data[0];
+                k |= data[1] << 8;
+                k |= data[2] << 16;
+                k |= data[3] << 24;
+
+                k *= m;
+                k ^= k >> r;
+                k *= m;
+
+                h *= m;
+                h ^= k;
+
+                data += 4;
+                len -= 4;
+        }
+
+        /* Tail: 0-3 remaining bytes; cases fall through. */
+        switch (len) {
+        case 3:
+                h ^= data[2] << 16;
+                /* fallthrough */
+        case 2:
+                h ^= data[1] << 8;
+                /* fallthrough */
+        case 1:
+                h ^= data[0];
+                h *= m;
+        };
+
+        h ^= h >> 13;
+        h *= m;
+        h ^= h >> 15;
+
+        return h;
+}