From 28385505e0d3ccf23a331f003ce3f6cfa66667a5 Mon Sep 17 00:00:00 2001
From: Yao Yue <yao@twitter.com>
Date: Tue, 26 Oct 2021 18:32:41 -0700
Subject: [PATCH] partial impl of poc APIs for hashmap

---
 src/data_structure/hashmap/CMakeLists.txt |   1 +
 src/data_structure/hashmap/hashmap.c      | 221 ++++++++++++++++++++++
 src/data_structure/hashmap/hashmap.h      |  97 ++++++++++
 src/rust/.gitignore                       |   1 +
 4 files changed, 320 insertions(+)
 create mode 100644 src/data_structure/hashmap/CMakeLists.txt
 create mode 100644 src/data_structure/hashmap/hashmap.c
 create mode 100644 src/data_structure/hashmap/hashmap.h
 create mode 100644 src/rust/.gitignore

diff --git a/src/data_structure/hashmap/CMakeLists.txt b/src/data_structure/hashmap/CMakeLists.txt
new file mode 100644
index 000000000..162692890
--- /dev/null
+++ b/src/data_structure/hashmap/CMakeLists.txt
@@ -0,0 +1 @@
+add_library(ds_hashmap hashmap.c)
diff --git a/src/data_structure/hashmap/hashmap.c b/src/data_structure/hashmap/hashmap.c
new file mode 100644
index 000000000..0d4ca0780
--- /dev/null
+++ b/src/data_structure/hashmap/hashmap.c
@@ -0,0 +1,221 @@
+#include "hashmap.h"
+
+#include <cc_debug.h>
+
+
+#define HM_BODY(_hm) ((char *)(_hm) + HASHMAP_HEADER_SIZE)  /* first byte of the body */
+#define HM_END(_hm) (HM_BODY(_hm) + HM_NBODY(_hm))          /* one past the last body byte */
+
+
+static inline uint8_t
+_entry_klen(char *entry)
+{
+    return ((uint8_t *)entry)[0]; /* entry header byte 0: key length */
+}
+
+static inline uint8_t
+_entry_vlen(char *entry)
+{
+    return ((uint8_t *)entry)[1]; /* entry header byte 1: value length */
+}
+
+static inline uint32_t
+_entry_nbyte(char *entry)
+{
+    uint32_t payload = (uint32_t)_entry_klen(entry) + _entry_vlen(entry);
+    return payload + (uint32_t)HASHMAP_ENTRY_HEADER_SIZE; /* header + key + value */
+}
+
+static inline char *
+_entry_key(char *entry)
+{
+    return &entry[HASHMAP_ENTRY_HEADER_SIZE]; /* key bytes start right after the entry header */
+}
+
+static inline char *
+_entry_val(char *entry)
+{
+    return _entry_key(entry) + _entry_klen(entry); /* value bytes follow the key bytes */
+}
+
+static inline void
+_entry_set(char *entry, const struct bstring *key, const struct bstring *val)
+{
+    ((uint8_t *)entry)[0] = key->len; /* header byte 0: key length */
+    ((uint8_t *)entry)[1] = val->len; /* header byte 1: value length */
+    cc_memcpy(_entry_key(entry), key->data, key->len);
+    cc_memcpy(_entry_key(entry) + key->len, val->data, val->len);
+}
+
+/*
+ * Return the position right behind entry: the entry header, the key bytes and
+ * the value bytes are skipped. While scanning this is the next entry; for
+ * the last entry it is the end of the body, which holds no entry and must not
+ * be read.
+ */
+static inline char *
+_next_entry(char *entry)
+{
+    return entry + HASHMAP_ENTRY_HEADER_SIZE + _entry_klen(entry) + _entry_vlen(entry);
+}
+
+
+/* Scans up to nentry entries, starting at entry, for an exact match of key.
+ * Returns true if an exact match is found, false otherwise.
+ * If a match is found, the position of the entry element is stored in pos;
+ * otherwise, pos contains the position of the insertion spot, which is the end
+ * of the body when no scanned entry stops the scan. Entries are skipped
+ * while key compares smaller, so the body is kept in descending key order.
+ * *idx is the index of the entry at `entry`; callers set it (0 for a full scan).
+ */
+static inline bool
+_locate(char **pos, uint32_t *idx, const char *entry, uint32_t nentry, const struct bstring *key)
+{
+    int bcmp_sgn;
+    int eklen_sgn;
+
+    ASSERT(pos != NULL && idx != NULL);
+
+    /* no early return for nentry == 0: *pos must always be set for the caller */
+    for (*pos = (char *)entry; *idx < nentry; *idx += 1) {
+        uint8_t eklen = _entry_klen(*pos);
+        bcmp_sgn = cc_bcmp(key->data, _entry_key(*pos), MIN(key->len, eklen));
+        if (bcmp_sgn > 0) { /* no match, insert position found */
+            return false;
+        }
+        if (bcmp_sgn < 0) { /* no match, next entry */
+            *pos = _next_entry(*pos);
+            continue;
+        }
+
+        /* bcmp_sgn == 0, may need to look at length. A shorter key is smaller */
+
+        /* -1: key is shorter than eklen; 0: equal length; 1: key is longer */
+        eklen_sgn = (key->len > eklen) - (eklen > key->len);
+        if (eklen_sgn > 0) { /* no match, insert position found */
+            return false;
+        }
+        if (eklen_sgn < 0) { /* no match, next entry */
+            *pos = _next_entry(*pos);
+            continue;
+        }
+
+        return true; /* match iff bcmp_sgn and eklen_sgn are both 0 */
+    }
+
+    return false; /* *pos points to the end of body */
+}
+
+
+/* Reset hm to an empty hashmap: zero entries and a zero-length body. */
+hashmap_rstatus_e
+hashmap_init(hashmap_p hm)
+{
+    if (hm == NULL) {
+        log_debug("NULL pointer encountered for hm");
+        return HASHMAP_ERROR;
+    }
+
+    HM_NENTRY(hm) = 0;
+    HM_NBODY(hm) = 0; /* HM_NBODY is the body-size field macro from hashmap.h */
+
+    return HASHMAP_OK;
+}
+
+
+/* Look up key in hm. On a hit val references the value bytes inside hm (no
+ * copy is made) and HASHMAP_OK is returned; on a miss val is emptied and
+ * HASHMAP_ENOTFOUND is returned. Any NULL argument yields HASHMAP_ERROR. */
+hashmap_rstatus_e
+hashmap_get(struct bstring *val, const hashmap_p hm, const struct bstring *key)
+{
+    uint32_t idx = 0; /* _locate() scans from entry number *idx */
+    char *entry;
+
+    if (key == NULL || val == NULL || hm == NULL) {
+        log_debug("NULL pointer encountered for hm %p, key %p, or val %p", hm,
+                key, val);
+        return HASHMAP_ERROR;
+    }
+
+    if (!_locate(&entry, &idx, HM_BODY(hm), hashmap_nentry(hm), key)) {
+        val->len = 0;
+        val->data = NULL;
+        return HASHMAP_ENOTFOUND;
+    }
+
+    val->len = _entry_vlen(entry);
+    val->data = _entry_val(entry);
+    return HASHMAP_OK;
+}
+
+
+/* Look up cardinality keys in one call: val[k] gets the value for key[k] and is
+ * emptied on a miss. Returns the number of keys found (0 on a NULL argument).
+ * Keys given in the hashmap's own order need one pass; others force a rescan. */
+uint32_t
+hashmap_multiget(struct bstring *val[], const hashmap_p hm, const struct bstring *key[], uint32_t cardinality)
+{
+    uint32_t k, idx = 0, nentry, nfound = 0;
+    char *entry; /* invariant: entry number idx, or the end of the body */
+    bool found;
+
+    if (key == NULL || val == NULL || hm == NULL) {
+        log_debug("NULL pointer encountered for hm %p, key %p, or val %p", hm,
+                key, val);
+        return 0; /* the result is a count, not a hashmap_rstatus_e */
+    }
+
+    for (k = 0, entry = HM_BODY(hm), nentry = hashmap_nentry(hm); k < cardinality; k++) {
+        found = _locate(&entry, &idx, entry, nentry, key[k]);
+        if (!found && idx > 0) { /* key[k] may sort before the cursor */
+            idx = 0;
+            found = _locate(&entry, &idx, HM_BODY(hm), nentry, key[k]);
+        }
+        val[k]->len = found ? _entry_vlen(entry) : 0;
+        val[k]->data = found ? _entry_val(entry) : NULL;
+        nfound += found;
+    }
+    return nfound;
+}
+
+
+/* Insert the key/val pair at its sorted position, moving later entries back.
+ * Returns HASHMAP_EDUP if key is already present, HASHMAP_EINVALID if either
+ * length exceeds UINT8_MAX and HASHMAP_ERROR on a NULL argument. Nothing here
+ * checks capacity: the buffer behind hm must have room for the new entry. */
+hashmap_rstatus_e
+hashmap_insert(hashmap_p hm, const struct bstring *key, const struct bstring *val)
+{
+    char *entry, *end;
+    uint32_t idx = 0; /* _locate() scans from entry number *idx */
+
+    if (hm == NULL || key == NULL || val == NULL) {
+        log_debug("NULL pointer encountered for hm, key, or val");
+        return HASHMAP_ERROR;
+    }
+
+    if (key->len > UINT8_MAX || val->len > UINT8_MAX) {
+        log_debug("key / value size too big for current hashmap implementation:"
+                "key size: %"PRIu32", val size: %"PRIu32". (Allowed: %"PRIu8")",
+                key->len, val->len, UINT8_MAX);
+        return HASHMAP_EINVALID;
+    }
+
+    entry = HM_BODY(hm); /* insertion spot when the hashmap is empty */
+    if (_locate(&entry, &idx, entry, hashmap_nentry(hm), key)) { /* found */
+        return HASHMAP_EDUP;
+    }
+
+    end = HM_BODY(hm) + HM_NBODY(hm);
+    if (entry < end) { /* open a gap: slide the tail back by the new entry's size */
+        cc_memmove(entry + HASHMAP_ENTRY_HEADER_SIZE + key->len + val->len,
+                entry, end - entry);
+    }
+    _entry_set(entry, key, val);
+
+    HM_NENTRY(hm) += 1;
+    HM_NBODY(hm) += _entry_nbyte(entry);
+
+    return HASHMAP_OK;
+}
diff --git a/src/data_structure/hashmap/hashmap.h b/src/data_structure/hashmap/hashmap.h
new file mode 100644
index 000000000..620f3fbc3
--- /dev/null
+++ b/src/data_structure/hashmap/hashmap.h
@@ -0,0 +1,97 @@
+#pragma once
+
+/* This is an implementation of hashmaps with bounded but flexible entry size
+ * with binary field keys. The size of both field key and value are limited to
+ * 255 bytes in this POC.
+ *
+ * The fields are sorted but not indexed. This makes bulk lookup faster when the
+ * field (keys) are also sorted.
+ *
+ * ----------------------------------------------------------------------------
+ *
+ * HASHMAP OVERALL LAYOUT
+ * =====================
+ *
+ * The general layout of the hashmap is as follows:
+ *                         entry
+ *                ╭------------------------╮
+ * <nentry><nbody><eklen><evlen><ekey><eval> ... <eklen><evlen><ekey><eval>
+ * ╰-------------╯╰-------------------------------------------------------╯
+ *     header                             body
+ *
+ * Overhead: 8 bytes (two 32-bit integers)
+ *
+ * <uint32_t nentry> is the number of entries.
+ * <uint32_t nbody> is the number of bytes in the body (not including header).
+ *
+ *
+ *
+ * HASHMAP ENTRIES
+ * ==============
+ *
+ * For each entry:
+ * <uint8_t eklen> is the size of hash field in each entry (entry key)
+ * <uint8_t evlen> is the size of hash value in each entry (entry value)
+ *
+ * The rest of the entry is a tuple of a binary string (non-empty byte array)
+ * for field and a byte array for value.
+ *
+ * RUNTIME
+ * =======
+ *
+ * Entry lookup takes O(N) where N is the number of entries in the hashmap.
+ *
+ * Insertion and removal of entries involve scan-based lookup, as well as
+ * shifting data. So in addition to the considerations above, the amount of
+ * data being moved for updates will affect performance. Updates near the "fixed
+ * end" of the hashmap (currently the beginning) require moving more data and
+ * therefore will be slower. Overall, it is cheapest to perform updates at the
+ * end of the array due to zero data movement.
+ *
+ */
+
+#include <cc_bstring.h>
+
+#include <stdint.h>
+
+#define HASHMAP_HEADER_SIZE (sizeof(uint32_t) + sizeof(uint32_t))      /* nentry + nbody: 8 bytes */
+#define HASHMAP_ENTRY_HEADER_SIZE (sizeof(uint8_t) + sizeof(uint8_t))  /* eklen + evlen: 2 bytes */
+
+typedef char * hashmap_p; /* points at the first header byte (nentry) of a hashmap buffer */
+
+typedef enum {
+    HASHMAP_OK,         /* operation succeeded */
+    HASHMAP_ENOTFOUND,  /* value not found error */
+    HASHMAP_EINVALID,   /* invalid (entry) data error */
+    HASHMAP_EDUP,       /* duplicate entry found */
+    HASHMAP_ERROR,      /* other failure, e.g. a NULL argument */
+    HASHMAP_SENTINEL    /* presumably an end marker rather than a real status -- TODO confirm */
+} hashmap_rstatus_e;
+
+#define HM_NENTRY(_hm) (*((uint32_t *)(_hm)))                             /* 1st header field: entry count */
+#define HM_NBODY(_hm) (*((uint32_t *)((char *)(_hm) + sizeof(uint32_t)))) /* 2nd header field: body bytes */
+
+static inline uint32_t
+hashmap_nentry(const hashmap_p hm)
+{
+    return ((uint32_t *)hm)[0]; /* entry count is the first header field */
+}
+
+static inline uint32_t
+hashmap_size(const hashmap_p hm)
+{
+    return HM_NBODY(hm) + (uint32_t)HASHMAP_HEADER_SIZE; /* body bytes plus the fixed header */
+}
+
+/* initialize an empty hashmap (no entries, empty body) in the buffer hm points to */
+hashmap_rstatus_e hashmap_init(hashmap_p hm);
+
+/* hashmap APIs: seek -- get looks up one key, multiget looks up cardinality keys */
+hashmap_rstatus_e hashmap_get(struct bstring *val, const hashmap_p hm, const struct bstring *key);
+uint32_t hashmap_multiget(struct bstring *val[], const hashmap_p hm, const struct bstring *key[], uint32_t cardinality);
+
+/* hashmap APIs: modify */
+/* NOTE: hashmap_bulk_insert and hashmap_remove are declared here but not yet defined in hashmap.c */
+hashmap_rstatus_e hashmap_insert(hashmap_p hm, const struct bstring *key, const struct bstring *val);
+hashmap_rstatus_e hashmap_bulk_insert(hashmap_p hm, const struct bstring *key, const struct bstring *val);
+hashmap_rstatus_e hashmap_remove(hashmap_p hm, const struct bstring *key);
diff --git a/src/rust/.gitignore b/src/rust/.gitignore
new file mode 100644
index 000000000..1d74e2196
--- /dev/null
+++ b/src/rust/.gitignore
@@ -0,0 +1 @@
+.vscode/