Skip to content

Commit

Permalink
DAOS-10877 vos: gang allocation for huge SV (#14790)
Browse files Browse the repository at this point in the history
To avoid allocation failures on a fragmented system, a huge SV allocation is
split into multiple smaller allocations, each capped at 8MB (the DMA chunk
size, which also avoids huge DMA buffer allocations).

The address of such a scattered SV payload is represented by a 'gang address'.

Removed the io_allocbuf_failure() VOS unit test, since it is no longer
applicable in gang SV mode.

Signed-off-by: Niu Yawei <[email protected]>
  • Loading branch information
NiuYawei authored Sep 9, 2024
1 parent 1101699 commit b95ef01
Show file tree
Hide file tree
Showing 16 changed files with 473 additions and 202 deletions.
6 changes: 1 addition & 5 deletions src/bio/bio_buffer.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,6 @@ dma_alloc_chunk(unsigned int cnt)

D_ASSERT(bytes > 0);

if (DAOS_FAIL_CHECK(DAOS_NVME_ALLOCBUF_ERR)) {
D_ERROR("Injected DMA buffer allocation error.\n");
return NULL;
}

D_ALLOC_PTR(chunk);
if (chunk == NULL) {
return NULL;
Expand Down Expand Up @@ -848,6 +843,7 @@ dma_map_one(struct bio_desc *biod, struct bio_iov *biov, void *arg)
bio_iov_set_raw_buf(biov, NULL);
return 0;
}
D_ASSERT(!BIO_ADDR_IS_GANG(&biov->bi_addr));

if (direct_scm_access(biod, biov)) {
struct umem_instance *umem = biod->bd_umem;
Expand Down
3 changes: 2 additions & 1 deletion src/bio/bio_bulk.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2021-2022 Intel Corporation.
* (C) Copyright 2021-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -640,6 +640,7 @@ bulk_map_one(struct bio_desc *biod, struct bio_iov *biov, void *data)
goto done;
}
D_ASSERT(!BIO_ADDR_IS_DEDUP(&biov->bi_addr));
D_ASSERT(!BIO_ADDR_IS_GANG(&biov->bi_addr));

hdl = bulk_get_hdl(biod, biov, roundup_pgs(pg_cnt), pg_off, arg);
if (hdl == NULL) {
Expand Down
3 changes: 1 addition & 2 deletions src/bio/bio_xstream.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
/* SPDK blob parameters */
#define DAOS_BS_CLUSTER_SZ (1ULL << 25) /* 32MB */
/* DMA buffer parameters */
#define DAOS_DMA_CHUNK_MB 8 /* 8MB DMA chunks */
#define DAOS_DMA_CHUNK_CNT_INIT 24 /* Per-xstream init chunks, 192MB */
#define DAOS_DMA_CHUNK_CNT_MAX 128 /* Per-xstream max chunks, 1GB */
#define DAOS_DMA_CHUNK_CNT_MIN 32 /* Per-xstream min chunks, 256MB */
Expand Down Expand Up @@ -207,7 +206,7 @@ bio_nvme_init(const char *nvme_conf, int numa_node, unsigned int mem_size,
{
char *env;
int rc, fd;
unsigned int size_mb = DAOS_DMA_CHUNK_MB;
unsigned int size_mb = BIO_DMA_CHUNK_MB;

if (tgt_nr <= 0) {
D_ERROR("tgt_nr: %u should be > 0\n", tgt_nr);
Expand Down
2 changes: 1 addition & 1 deletion src/include/daos/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -851,7 +851,7 @@ enum {
#define DAOS_NVME_FAULTY (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x50)
#define DAOS_NVME_WRITE_ERR (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x51)
#define DAOS_NVME_READ_ERR (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x52)
#define DAOS_NVME_ALLOCBUF_ERR (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x53)
#define DAOS_NVME_ALLOCBUF_ERR (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x53) /* deprecated */
#define DAOS_NVME_WAL_TX_LOST (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x54)

#define DAOS_POOL_CREATE_FAIL_CORPC (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x60)
Expand Down
84 changes: 78 additions & 6 deletions src/include/daos_srv/bio.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,30 +29,47 @@
((addr)->ba_flags &= ~(BIO_FLAG_DEDUP_BUF))
#define BIO_ADDR_IS_CORRUPTED(addr) ((addr)->ba_flags & BIO_FLAG_CORRUPTED)
#define BIO_ADDR_SET_CORRUPTED(addr) ((addr)->ba_flags |= BIO_FLAG_CORRUPTED)
#define BIO_ADDR_IS_GANG(addr) ((addr)->ba_flags & BIO_FLAG_GANG)
#define BIO_ADDR_SET_GANG(addr) ((addr)->ba_flags |= BIO_FLAG_GANG)

/* Can support up to 16 flags for a BIO address */
enum BIO_FLAG {
/* The address is a hole */
BIO_FLAG_HOLE = (1 << 0),
/* The address is a deduped extent */
/* The address is a deduped extent, transient only flag */
BIO_FLAG_DEDUP = (1 << 1),
/* The address is a buffer for dedup verify */
/* The address is a buffer for dedup verify, transient only flag */
BIO_FLAG_DEDUP_BUF = (1 << 2),
/* The data located on the address is marked as corrupted */
BIO_FLAG_CORRUPTED = (1 << 3),
/* The address is a gang address */
BIO_FLAG_GANG = (1 << 4),
};

#define BIO_DMA_CHUNK_MB 8 /* 8MB DMA chunks */

/**
* It's used to represent an address on SCM, or an address on NVMe, or a gang address.
*
* A gang address consists of N addresses from scattered allocations. The scattered
* allocations may differ in size and media type; they are compactly stored on SCM at
* the location pointed to by 'ba_off', laid out as follows:
*
* N 64bits offsets, N 32bits sizes, N 8bits media types
*/
typedef struct {
/*
* Byte offset within PMDK pmemobj pool for SCM;
* Byte offset within PMDK pmemobj pool for SCM or gang address;
* Byte offset within SPDK blob for NVMe.
*/
uint64_t ba_off;
/* DAOS_MEDIA_SCM or DAOS_MEDIA_NVME */
uint8_t ba_type;
uint8_t ba_pad1;
/* Number of addresses when BIO_FLAG_GANG is set */
uint8_t ba_gang_nr;
/* See BIO_FLAG enum */
uint16_t ba_flags;
uint32_t ba_pad2;
uint32_t ba_pad;
} bio_addr_t;

struct sys_db;
Expand Down Expand Up @@ -127,8 +144,63 @@ enum bio_bs_state {
BIO_BS_STATE_SETUP,
};

/**
 * Number of bytes needed to store the component table of a gang address.
 *
 * Every component contributes one 64-bit offset, one 32-bit size and one
 * 8-bit media type; the total is rounded up to a multiple of sizeof(uint64_t).
 *
 * \param[in] gang_nr	Number of component addresses.
 *
 * \return		Table size in bytes, or 0 when \a gang_nr is 0.
 */
static inline unsigned int
bio_gaddr_size(uint8_t gang_nr)
{
	const unsigned int per_entry = sizeof(uint64_t) + sizeof(uint32_t) + sizeof(uint8_t);

	return gang_nr ? roundup(per_entry * gang_nr, sizeof(uint64_t)) : 0;
}

/**
 * Store the \a i-th component (offset, size, media type) of a gang address.
 *
 * The component table is located through 'ba_off' of \a gaddr and is laid out
 * as ba_gang_nr 64-bit offsets, followed by ba_gang_nr 32-bit sizes, followed
 * by ba_gang_nr 8-bit media types.
 *
 * \param[in] umm	umem instance used to translate 'ba_off' into a pointer.
 * \param[in] gaddr	Gang address, BIO_FLAG_GANG must be set.
 * \param[in] i		Component index, must be in [0, ba_gang_nr).
 * \param[in] type	Media type of the component.
 * \param[in] len	Size of the component.
 * \param[in] off	Offset of the component.
 */
static inline void
bio_gaddr_set(struct umem_instance *umm, bio_addr_t *gaddr, int i,
	      uint8_t type, uint32_t len, uint64_t off)
{
	uint8_t		*base;
	uint64_t	*offs;
	uint32_t	*lens;
	uint8_t		*types;

	D_ASSERT(BIO_ADDR_IS_GANG(gaddr));
	/* 'i' is signed: a negative index would address memory outside the table */
	D_ASSERT(i >= 0 && i < gaddr->ba_gang_nr);

	base  = umem_off2ptr(umm, gaddr->ba_off);
	/* Three consecutive arrays, each holding ba_gang_nr entries */
	offs  = (uint64_t *)base;
	lens  = (uint32_t *)(offs + gaddr->ba_gang_nr);
	types = (uint8_t *)(lens + gaddr->ba_gang_nr);

	offs[i]  = off;
	lens[i]  = len;
	types[i] = type;
}

/**
 * Load the \a i-th component (offset, size, media type) of a gang address.
 *
 * The component table is located through 'ba_off' of \a gaddr and is laid out
 * as ba_gang_nr 64-bit offsets, followed by ba_gang_nr 32-bit sizes, followed
 * by ba_gang_nr 8-bit media types.
 *
 * \param[in]  umm	umem instance used to translate 'ba_off' into a pointer.
 * \param[in]  gaddr	Gang address, BIO_FLAG_GANG must be set.
 * \param[in]  i	Component index, must be in [0, ba_gang_nr).
 * \param[out] type	Media type of the component.
 * \param[out] len	Size of the component.
 * \param[out] off	Offset of the component.
 */
static inline void
bio_gaddr_get(struct umem_instance *umm, bio_addr_t *gaddr, int i,
	      uint8_t *type, uint32_t *len, uint64_t *off)
{
	uint8_t		*base;
	uint64_t	*offs;
	uint32_t	*lens;
	uint8_t		*types;

	D_ASSERT(BIO_ADDR_IS_GANG(gaddr));
	/* 'i' is signed: a negative index would address memory outside the table */
	D_ASSERT(i >= 0 && i < gaddr->ba_gang_nr);

	base  = umem_off2ptr(umm, gaddr->ba_off);
	/* Three consecutive arrays, each holding ba_gang_nr entries */
	offs  = (uint64_t *)base;
	lens  = (uint32_t *)(offs + gaddr->ba_gang_nr);
	types = (uint8_t *)(lens + gaddr->ba_gang_nr);

	*off  = offs[i];
	*len  = lens[i];
	*type = types[i];
}

static inline void
bio_addr_set(bio_addr_t *addr, uint16_t type, uint64_t off)
bio_addr_set(bio_addr_t *addr, uint8_t type, uint64_t off)
{
addr->ba_type = type;
addr->ba_off = umem_off2offset(off);
Expand Down
3 changes: 3 additions & 0 deletions src/include/daos_srv/vos_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#define VOS_POOL_DF_2_2 24
#define VOS_POOL_DF_2_4 25
#define VOS_POOL_DF_2_6 26
#define VOS_POOL_DF_2_8 28

struct dtx_rsrvd_uint {
void *dru_scm;
Expand Down Expand Up @@ -299,6 +300,8 @@ enum {
VOS_POOL_FEAT_EMBED_FIRST = (1ULL << 3),
/** Flat DKEY support enabled */
VOS_POOL_FEAT_FLAT_DKEY = (1ULL << 4),
/** Gang address for SV support */
VOS_POOL_FEAT_GANG_SV = (1ULL << 5),
};

/** Mask for any conditionals passed to the fetch */
Expand Down
5 changes: 3 additions & 2 deletions src/object/srv_enum.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* (C) Copyright 2018-2022 Intel Corporation.
* (C) Copyright 2018-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -617,7 +617,8 @@ fill_rec(daos_handle_t ih, vos_iter_entry_t *key_ent, struct ds_obj_enum_arg *ar
* enum pack implementation doesn't support yield & re-probe.
*/
if (arg->inline_thres > 0 && data_size <= arg->inline_thres &&
data_size > 0 && bio_iov2media(&key_ent->ie_biov) != DAOS_MEDIA_NVME) {
data_size > 0 && bio_iov2media(&key_ent->ie_biov) != DAOS_MEDIA_NVME &&
!BIO_ADDR_IS_GANG(&key_ent->ie_biov.bi_addr)) {
inline_data = true;
size += data_size;
}
Expand Down
118 changes: 65 additions & 53 deletions src/vos/tests/vts_io.c
Original file line number Diff line number Diff line change
Expand Up @@ -2955,78 +2955,90 @@ io_query_key_negative(void **state)
assert_rc_equal(rc, -DER_INVAL);
}

static inline int
dummy_bulk_create(void *ctxt, d_sg_list_t *sgl, unsigned int perm, void **bulk_hdl)
static int
gang_sv_io(struct io_test_args *arg, daos_epoch_t epoch, char *dkey_buf, char *akey_buf,
char *update_buf, char *fetch_buf, daos_size_t rsize)
{
return 0;
}
daos_iod_t iod = { 0 };
daos_key_t dkey, akey;
d_iov_t val_iov;
d_sg_list_t sgl = { 0 };
int rc;

set_iov(&dkey, dkey_buf, is_daos_obj_type_set(arg->otype, DAOS_OT_DKEY_UINT64));
set_iov(&akey, akey_buf, is_daos_obj_type_set(arg->otype, DAOS_OT_AKEY_UINT64));

iod.iod_name = akey;
iod.iod_type = DAOS_IOD_SINGLE;
iod.iod_size = rsize;
iod.iod_nr = 1;

dts_buf_render(update_buf, rsize);
d_iov_set(&val_iov, update_buf, rsize);
sgl.sg_nr = 1;
sgl.sg_iovs = &val_iov;

rc = io_test_obj_update(arg, epoch, 0, &dkey, &iod, &sgl, NULL, true);
if (rc)
return rc;

memset(fetch_buf, 0, rsize);
d_iov_set(&val_iov, fetch_buf, rsize);
iod.iod_size = DAOS_REC_ANY;

rc = io_test_obj_fetch(arg, epoch, 0, &dkey, &iod, &sgl, true);
if (rc)
return rc;

/* Verify */
assert_int_equal(iod.iod_size, rsize);
assert_memory_equal(update_buf, fetch_buf, rsize);

static inline int
dummy_bulk_free(void *bulk_hdl)
{
return 0;
}

/* Verify the fix of DAOS-10748 */
static void
io_allocbuf_failure(void **state)
gang_sv_test(void **state)
{
struct io_test_args *arg = *state;
char dkey_buf[UPDATE_DKEY_SIZE] = { 0 };
char akey_buf[UPDATE_AKEY_SIZE] = { 0 };
daos_iod_t iod = { 0 };
d_sg_list_t sgl = { 0 };
daos_key_t dkey_iov, akey_iov;
daos_epoch_t epoch = 1;
char *buf;
daos_handle_t ioh;
int fake_ctxt;
daos_size_t buf_len = (40UL << 20); /* 40MB, larger than DMA chunk size */
int rc;
char dkey_buf[UPDATE_DKEY_SIZE], akey_buf[UPDATE_AKEY_SIZE];
char *update_buf, *fetch_buf;
daos_size_t rsize = (27UL << 20); /* 27MB */
daos_epoch_t epoch = 1;
int rc;

D_ALLOC(update_buf, rsize);
assert_non_null(update_buf);

FAULT_INJECTION_REQUIRED();
D_ALLOC(fetch_buf, rsize);
assert_non_null(fetch_buf);

vts_key_gen(&dkey_buf[0], arg->dkey_size, true, arg);
vts_key_gen(&akey_buf[0], arg->akey_size, false, arg);
set_iov(&dkey_iov, &dkey_buf[0], is_daos_obj_type_set(arg->otype, DAOS_OT_DKEY_UINT64));
set_iov(&akey_iov, &akey_buf[0], is_daos_obj_type_set(arg->otype, DAOS_OT_AKEY_UINT64));

rc = d_sgl_init(&sgl, 1);
print_message("Gang SV update/fetch.\n");
rc = gang_sv_io(arg, epoch, dkey_buf, akey_buf, update_buf, fetch_buf, rsize);
assert_rc_equal(rc, 0);

D_ALLOC(buf, buf_len);
assert_non_null(buf);

sgl.sg_iovs[0].iov_buf = buf;
sgl.sg_iovs[0].iov_buf_len = buf_len;
sgl.sg_iovs[0].iov_len = buf_len;

iod.iod_name = akey_iov;
iod.iod_nr = 1;
iod.iod_type = DAOS_IOD_SINGLE;
iod.iod_size = buf_len;
iod.iod_recxs = NULL;

print_message("Gang SV ZC update/fetch.\n");
epoch++;
arg->ta_flags |= TF_ZERO_COPY;

bio_register_bulk_ops(dummy_bulk_create, dummy_bulk_free);
daos_fail_loc_set(DAOS_NVME_ALLOCBUF_ERR | DAOS_FAIL_ONCE);

rc = vos_update_begin(arg->ctx.tc_co_hdl, arg->oid, epoch, 0, &dkey_iov,
1, &iod, NULL, 0, &ioh, NULL);
rc = gang_sv_io(arg, epoch, dkey_buf, akey_buf, update_buf, fetch_buf, rsize);
assert_rc_equal(rc, 0);

rc = bio_iod_prep(vos_ioh2desc(ioh), BIO_CHK_TYPE_IO, (void *)&fake_ctxt, 0);
assert_rc_equal(rc, -DER_NOMEM);
daos_fail_loc_set(0);
bio_register_bulk_ops(NULL, NULL);
print_message("Gang SV update/fetch with CSUM.\n");
epoch++;
arg->ta_flags &= ~TF_ZERO_COPY;
arg->ta_flags |= TF_USE_CSUMS;
rc = gang_sv_io(arg, epoch, dkey_buf, akey_buf, update_buf, fetch_buf, rsize);
assert_rc_equal(rc, 0);

rc = vos_update_end(ioh, 0, &dkey_iov, rc, NULL, NULL);
assert_rc_equal(rc, -DER_NOMEM);
print_message("Gang SV overwrite with CSUM.\n");
rc = gang_sv_io(arg, epoch, dkey_buf, akey_buf, update_buf, fetch_buf, rsize);
assert_rc_equal(rc, 0);

d_sgl_fini(&sgl, false);
D_FREE(buf);
arg->ta_flags &= ~TF_ZERO_COPY;
D_FREE(update_buf);
D_FREE(fetch_buf);
}

static const struct CMUnitTest iterator_tests[] = {
Expand Down Expand Up @@ -3074,7 +3086,7 @@ static const struct CMUnitTest int_tests[] = {
NULL},
{"VOS300.2: Key query test", io_query_key, NULL, NULL},
{"VOS300.3: Key query negative test", io_query_key_negative, NULL, NULL},
{"VOS300.4: Return error on DMA buffer allocation failure", io_allocbuf_failure, NULL, NULL},
{"VOS300.4: Gang SV update/fetch test", gang_sv_test, NULL, NULL},
};

static int
Expand Down
1 change: 1 addition & 0 deletions src/vos/vos_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ vos_bio_addr_free(struct vos_pool *pool, bio_addr_t *addr, daos_size_t nob)
if (bio_addr_is_hole(addr))
return 0;

D_ASSERT(!BIO_ADDR_IS_GANG(addr));
if (addr->ba_type == DAOS_MEDIA_SCM) {
rc = umem_free(&pool->vp_umm, addr->ba_off);
} else {
Expand Down
Loading

0 comments on commit b95ef01

Please sign in to comment.