From 28385505e0d3ccf23a331f003ce3f6cfa66667a5 Mon Sep 17 00:00:00 2001
From: Yao Yue
Date: Tue, 26 Oct 2021 18:32:41 -0700
Subject: [PATCH] partial impl of poc APIs for hashmap

---
 src/data_structure/hashmap/CMakeLists.txt |   1 +
 src/data_structure/hashmap/hashmap.c      | 270 ++++++++++++++++++++++
 src/data_structure/hashmap/hashmap.h      | 115 +++++++++
 src/rust/.gitignore                       |   1 +
 4 files changed, 387 insertions(+)
 create mode 100644 src/data_structure/hashmap/CMakeLists.txt
 create mode 100644 src/data_structure/hashmap/hashmap.c
 create mode 100644 src/data_structure/hashmap/hashmap.h
 create mode 100644 src/rust/.gitignore

diff --git a/src/data_structure/hashmap/CMakeLists.txt b/src/data_structure/hashmap/CMakeLists.txt
new file mode 100644
index 000000000..162692890
--- /dev/null
+++ b/src/data_structure/hashmap/CMakeLists.txt
@@ -0,0 +1 @@
+add_library(ds_hashmap hashmap.c)
diff --git a/src/data_structure/hashmap/hashmap.c b/src/data_structure/hashmap/hashmap.c
new file mode 100644
--- /dev/null
+++ b/src/data_structure/hashmap/hashmap.c
@@ -0,0 +1,270 @@
+#include "hashmap.h"
+
+#include <cc_debug.h>
+#include <cc_util.h>
+
+#include <inttypes.h>
+#include <stdbool.h>
+
+
+#define HM_BODY(_hm) ((char *)(_hm) + HASHMAP_HEADER_SIZE)
+#define HM_END(_hm)  ((char *)(_hm) + HASHMAP_HEADER_SIZE + HM_NBODY(_hm))
+
+
+/* length of the key, stored in the first byte of the entry header */
+static inline uint8_t
+_entry_klen(const char *entry)
+{
+    return *((const uint8_t *)entry);
+}
+
+/* length of the value, stored in the second byte of the entry header */
+static inline uint8_t
+_entry_vlen(const char *entry)
+{
+    return *((const uint8_t *)entry + sizeof(uint8_t));
+}
+
+/* total size of the entry in bytes: entry header + key + value */
+static inline uint32_t
+_entry_nbyte(const char *entry)
+{
+    return (uint32_t)HASHMAP_ENTRY_HEADER_SIZE + _entry_klen(entry) +
+        _entry_vlen(entry);
+}
+
+/* start of the key bytes, right after the entry header */
+static inline char *
+_entry_key(char *entry)
+{
+    return entry + HASHMAP_ENTRY_HEADER_SIZE;
+}
+
+/* start of the value bytes, right after the key */
+static inline char *
+_entry_val(char *entry)
+{
+    return entry + HASHMAP_ENTRY_HEADER_SIZE + _entry_klen(entry);
+}
+
+/* position right after the entry: the next entry, or the end of body */
+static inline char *
+_next_entry(char *entry)
+{
+    return entry + _entry_nbyte(entry);
+}
+
+/* write a complete entry (header, key, value) at entry. The caller must have
+ * checked that both key->len and val->len fit in a uint8_t.
+ */
+static inline void
+_entry_set(char *entry, const struct bstring *key, const struct bstring *val)
+{
+    *((uint8_t *)entry) = (uint8_t)key->len;
+    *((uint8_t *)entry + sizeof(uint8_t)) = (uint8_t)val->len;
+
+    /* an empty bstring may carry a NULL data pointer, so skip empty copies */
+    if (key->len > 0) {
+        cc_memcpy(_entry_key(entry), key->data, key->len);
+    }
+    if (val->len > 0) {
+        cc_memcpy(_entry_val(entry), val->data, val->len);
+    }
+}
+
+/* three-way comparison of two keys: bytes are compared first, and if one key
+ * is a prefix of the other, the shorter key is the smaller one.
+ *
+ * returns a negative value, 0, or a positive value if k1 is respectively
+ * smaller than, equal to, or greater than k2.
+ */
+static inline int
+_key_cmp(const char *k1, uint32_t n1, const char *k2, uint32_t n2)
+{
+    uint32_t n = MIN(n1, n2);
+    int sgn = (n > 0) ? cc_bcmp(k1, k2, n) : 0;
+
+    if (sgn != 0) {
+        return sgn;
+    }
+
+    return (n1 > n2) - (n1 < n2);
+}
+
+
+/* scan for key among the entries, which are kept in descending key order.
+ *
+ * entry is where the scan starts and *idx must be the index of that entry; for
+ * a full scan pass the start of body and set *idx to 0. nentry is the total
+ * number of entries in the hashmap.
+ *
+ * returns true if an exact match is found, false otherwise.
+ * If a match is found, the position of the matching entry is stored in pos;
+ * otherwise, pos contains the position of the insertion spot (the end of body
+ * if key is smaller than every entry). Either way *idx ends up being the index
+ * of the entry at *pos.
+ */
+static inline bool
+_locate(char **pos, uint32_t *idx, char *entry, uint32_t nentry,
+        const struct bstring *key)
+{
+    ASSERT(pos != NULL && idx != NULL);
+
+    /* *pos is set before anything else so that it is valid even when there
+     * are no entries to scan
+     */
+    for (*pos = entry; *idx < nentry; *pos = _next_entry(*pos), *idx += 1) {
+        int sgn = _key_cmp(key->data, key->len, _entry_key(*pos),
+                _entry_klen(*pos));
+
+        if (sgn == 0) { /* match */
+            return true;
+        }
+        if (sgn > 0) { /* key is greater: it belongs before this entry */
+            return false;
+        }
+        /* key is smaller than this entry: keep scanning */
+    }
+
+    return false; /* *pos points to the end of body */
+}
+
+
+hashmap_rstatus_e
+hashmap_init(hashmap_p hm)
+{
+    if (hm == NULL) {
+        log_debug("NULL pointer encountered for hm");
+
+        return HASHMAP_ERROR;
+    }
+
+    HM_NENTRY(hm) = 0;
+    HM_NBODY(hm) = 0;
+
+    return HASHMAP_OK;
+}
+
+
+hashmap_rstatus_e
+hashmap_get(struct bstring *val, const hashmap_p hm, const struct bstring *key)
+{
+    uint32_t idx = 0;
+    char *entry;
+
+    if (key == NULL || val == NULL || hm == NULL) {
+        log_debug("NULL pointer encountered for hm %p, key %p, or val %p",
+                (void *)hm, (const void *)key, (void *)val);
+
+        return HASHMAP_ERROR;
+    }
+
+    if (_locate(&entry, &idx, HM_BODY(hm), hashmap_nentry(hm), key)) {
+        /* found: val points into the hashmap, nothing is copied */
+        val->len = _entry_vlen(entry);
+        val->data = _entry_val(entry);
+
+        return HASHMAP_OK;
+    }
+
+    val->len = 0;
+    val->data = NULL;
+
+    return HASHMAP_ENOTFOUND;
+}
+
+
+uint32_t
+hashmap_multiget(struct bstring *val[], const hashmap_p hm,
+        const struct bstring *key[], uint32_t cardinality)
+{
+    uint32_t k, idx, nentry, nfound;
+    char *curr;
+    const struct bstring *prev;
+
+    if (key == NULL || val == NULL || hm == NULL) {
+        log_debug("NULL pointer encountered for hm %p, key %p, or val %p",
+                (void *)hm, (const void *)key, (void *)val);
+
+        return 0; /* the return value is a count, not a status code */
+    }
+
+    nentry = hashmap_nentry(hm);
+    curr = HM_BODY(hm);
+    idx = 0;
+    prev = NULL;
+    nfound = 0;
+
+    for (k = 0; k < cardinality; k++) {
+        if (key[k] == NULL || val[k] == NULL) {
+            continue;
+        }
+
+        /* the scan may resume from where the previous lookup stopped only if
+         * this key is smaller than the previous one, i.e. it comes later in
+         * the descending storage order; otherwise start over
+         */
+        if (prev != NULL && _key_cmp(key[k]->data, key[k]->len, prev->data,
+                    prev->len) >= 0) {
+            curr = HM_BODY(hm);
+            idx = 0;
+        }
+        prev = key[k];
+
+        if (_locate(&curr, &idx, curr, nentry, key[k])) { /* found */
+            val[k]->len = _entry_vlen(curr);
+            val[k]->data = _entry_val(curr);
+            nfound++;
+        } else {
+            val[k]->len = 0;
+            val[k]->data = NULL;
+        }
+    }
+
+    return nfound;
+}
+
+
+hashmap_rstatus_e
+hashmap_insert(hashmap_p hm, const struct bstring *key,
+        const struct bstring *val)
+{
+    char *entry;
+    uint32_t idx = 0;
+    uint32_t nbyte;
+
+    if (hm == NULL || key == NULL || val == NULL) {
+        log_debug("NULL pointer encountered for hm %p, key %p, or val %p",
+                (void *)hm, (const void *)key, (const void *)val);
+
+        return HASHMAP_ERROR;
+    }
+
+    if (key->len > UINT8_MAX || val->len > UINT8_MAX) {
+        log_debug("key / value size too big for current hashmap "
+                "implementation: key size: %"PRIu32", val size: %"PRIu32". "
+                "(Allowed: %d)", key->len, val->len, UINT8_MAX);
+
+        return HASHMAP_EINVALID;
+    }
+
+    if (_locate(&entry, &idx, HM_BODY(hm), hashmap_nentry(hm), key)) {
+        return HASHMAP_EDUP;
+    }
+
+    /* entry is the insertion spot. Unless the new entry goes at the end of
+     * body, everything from there on is shifted to make room for it. The
+     * hashmap does not know how large its buffer is, so it is up to the caller
+     * to guarantee there is room for nbyte more bytes.
+     */
+    nbyte = (uint32_t)HASHMAP_ENTRY_HEADER_SIZE + key->len + val->len;
+    if (entry < HM_END(hm)) {
+        cc_memmove(entry + nbyte, entry, HM_END(hm) - entry);
+    }
+    _entry_set(entry, key, val);
+
+    HM_NENTRY(hm) += 1;
+    HM_NBODY(hm) += nbyte;
+
+    return HASHMAP_OK;
+}
diff --git a/src/data_structure/hashmap/hashmap.h b/src/data_structure/hashmap/hashmap.h
new file mode 100644
--- /dev/null
+++ b/src/data_structure/hashmap/hashmap.h
@@ -0,0 +1,115 @@
+#pragma once
+
+/* This is an implementation of hashmaps with bounded but flexible entry size
+ * with binary field keys. The size of both field key and value are limited to
+ * 255 bytes in this POC.
+ *
+ * The fields are sorted but not indexed. This makes bulk lookup faster when the
+ * field (keys) are also sorted.
+ *
+ * ----------------------------------------------------------------------------
+ *
+ * HASHMAP OVERALL LAYOUT
+ * ======================
+ *
+ * The general layout of the hashmap is as follows:
+ *                          entry
+ *                ╭------------------------╮
+ * <nentry><nbody><klen><vlen><field><value> ... <klen><vlen><field><value>
+ * ╰-------------╯╰-------------------------------------------------------╯
+ *     header                               body
+ *
+ * Overhead: 8 bytes (two 32-bit integers)
+ *
+ * <nentry> is the number of entries.
+ * <nbody> is the number of bytes in the body (not including header).
+ *
+ *
+ * HASHMAP ENTRIES
+ * ===============
+ *
+ * For each entry:
+ * <klen> is the size of hash field in each entry (entry key)
+ * <vlen> is the size of hash value in each entry (entry value)
+ *
+ * The rest of the entry is a tuple of a binary string (non-empty byte array)
+ * for field and a byte array for value.
+ *
+ * Entries are stored in descending order of their field: fields are compared
+ * byte by byte, and a field that is a prefix of another one is the smaller of
+ * the two. Bulk lookups are therefore fastest when the fields being looked up
+ * are given in descending order as well.
+ *
+ * RUNTIME
+ * =======
+ *
+ * Entry lookup takes O(N) where N is the number of entries in the hashmap.
+ *
+ * Insertion and removal of entries involve scan-based lookup, as well as
+ * shifting data. So in addition to the considerations above, the amount of
+ * data being moved for updates will affect performance. Updates near the "fixed
+ * end" of the hashmap (currently the beginning) require moving more data and
+ * therefore will be slower. Overall, it is cheapest to perform updates at the
+ * end of the array due to zero data movement.
+ *
+ */
+
+#include <cc_bstring.h>
+
+#include <stdint.h>
+
+#define HASHMAP_HEADER_SIZE (sizeof(uint32_t) + sizeof(uint32_t)) /* 8 */
+#define HASHMAP_ENTRY_HEADER_SIZE (sizeof(uint8_t) + sizeof(uint8_t)) /* 2 */
+
+typedef char * hashmap_p;
+
+typedef enum {
+    HASHMAP_OK,
+    HASHMAP_ENOTFOUND,  /* value not found error */
+    HASHMAP_EINVALID,   /* invalid (entry) data error */
+    HASHMAP_EDUP,       /* duplicate entry found */
+    HASHMAP_ERROR,
+    HASHMAP_SENTINEL
+} hashmap_rstatus_e;
+
+#define HM_NENTRY(_hm) (*((uint32_t *)(_hm)))
+#define HM_NBODY(_hm) (*((uint32_t *)((_hm) + sizeof(uint32_t))))
+
+static inline uint32_t
+hashmap_nentry(const hashmap_p hm)
+{
+    return HM_NENTRY(hm);
+}
+
+static inline uint32_t
+hashmap_size(const hashmap_p hm)
+{
+    return (uint32_t)HASHMAP_HEADER_SIZE + HM_NBODY(hm);
+}
+
+/* initialize an empty hashmap in the buffer hm points to, which must be at
+ * least HASHMAP_HEADER_SIZE bytes long
+ */
+hashmap_rstatus_e hashmap_init(hashmap_p hm);
+
+/* hashmap APIs: seek */
+
+/* on success val points into the hashmap, the value is not copied */
+hashmap_rstatus_e hashmap_get(struct bstring *val, const hashmap_p hm,
+        const struct bstring *key);
+/* look up cardinality keys, filling val[k] for key[k]; returns the number of
+ * keys found
+ */
+uint32_t hashmap_multiget(struct bstring *val[], const hashmap_p hm,
+        const struct bstring *key[], uint32_t cardinality);
+
+/* hashmap APIs: modify */
+
+/* the buffer holding the hashmap must have room for the new entry, i.e.
+ * HASHMAP_ENTRY_HEADER_SIZE + key->len + val->len more bytes
+ */
+hashmap_rstatus_e hashmap_insert(hashmap_p hm, const struct bstring *key,
+        const struct bstring *val);
+hashmap_rstatus_e hashmap_bulk_insert(hashmap_p hm, const struct bstring *key,
+        const struct bstring *val);
+hashmap_rstatus_e hashmap_remove(hashmap_p hm, const struct bstring *key);
diff --git a/src/rust/.gitignore b/src/rust/.gitignore
new file mode 100644
index 000000000..1d74e2196
--- /dev/null
+++ b/src/rust/.gitignore
@@ -0,0 +1 @@
+.vscode/