Skip to content

Commit de63cae

Browse files
tristan957 and nabeelmmd
committed
Implement vblock split write amp reduction algo
During a vblock split, we can reduce write amplification by using mpool_mblock_punch() to punch a hole (FALLOC_FL_PUNCH_HOLE) in a portion of the mblock for both the left-hand-side and right-hand-side destination mblocks. Signed-off-by: Tristan Partin <[email protected]> Co-authored-by: Nabeel Meeramohideen Mohamed <[email protected]>
1 parent b64501d commit de63cae

File tree

1 file changed

+227
-53
lines changed

1 file changed

+227
-53
lines changed

lib/cn/kvset_split.c

Lines changed: 227 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -3,22 +3,20 @@
33
* Copyright (C) 2022 Micron Technology, Inc. All rights reserved.
44
*/
55

6+
#include <stdint.h>
67
#include <sys/mman.h>
78

8-
#include <hse/util/event_counter.h>
9-
#include <hse/util/assert.h>
10-
#include <hse/util/keycmp.h>
11-
#include <hse/logging/logging.h>
12-
#include <hse/util/perfc.h>
13-
14-
#include <hse/limits.h>
15-
169
#include <hse/ikvdb/cn.h>
1710
#include <hse/ikvdb/ikvdb.h>
1811
#include <hse/ikvdb/cndb.h>
1912
#include <hse/ikvdb/kvset_builder.h>
20-
13+
#include <hse/limits.h>
14+
#include <hse/logging/logging.h>
2115
#include <hse/mpool/mpool.h>
16+
#include <hse/util/event_counter.h>
17+
#include <hse/util/assert.h>
18+
#include <hse/util/keycmp.h>
19+
#include <hse/util/perfc.h>
2220

2321
#include "kvs_mblk_desc.h"
2422
#include "kvset.h"
@@ -399,6 +397,13 @@ kblocks_split(
399397
return err;
400398
}
401399

400+
/**
 * struct vgroup_split_metadata - per-vgroup bookkeeping for a vblock split
 *
 * One element exists per vgroup of the source kvset (vblocks_split() allocates
 * an array of nvgroups of these with calloc(), so every field starts as zero).
 *
 * @overlaps: written by vblocks_split(): the overlap result reported by
 *            get_vblk_split_index(), or false when all vblocks move to a
 *            single side
 * @accessed: set by mark_vgroup_accesses() the first time a VTYPE_UCVAL or
 *            VTYPE_CVAL value reference resolving to this vgroup is seen
 * @vblk_idx: split vblock index chosen for this vgroup by vblocks_split()
 * @offset:   vblock offset of that first recorded value reference; only
 *            meaningful when @accessed is true
 */
struct vgroup_split_metadata {
401+
bool overlaps;
402+
bool accessed;
403+
uint16_t vblk_idx;
404+
uint offset;
405+
};
406+
402407
/**
403408
* Return a split vblock index for the specified range of vblocks [start, end] by comparing
404409
* the min/max keys stored in a vblock footer against the split key.
@@ -451,6 +456,114 @@ get_vblk_split_index(
451456
return v;
452457
}
453458

459+
/**
 * Raise @max_key to the largest max key found among the overlapping vblocks.
 *
 * For every vgroup whose metadata has @overlaps set, a candidate key object is
 * built from the vblock descriptor at that vgroup's @vblk_idx: the key bytes
 * start at vbd_max_koff within the mblock mapping (map_base) and are
 * vbd_max_klen bytes long. The candidate replaces *@max_key when *@max_key
 * compares less than it. Vgroups without an overlap are skipped, so with no
 * overlapping vgroup *@max_key is left untouched.
 *
 * @nvgroups:  number of elements in @metadatav
 * @metadatav: per-vgroup split metadata; only read here
 * @ks:        kvset whose vblock descriptors are consulted
 * @max_key:   in/out; it is compared before it is ever assigned, so it must
 *             hold a valid key on entry
 *
 * On return *@max_key may reference bytes inside a vblock's mapping rather
 * than a private copy.
 *
 * NOTE(review): only ko_sfx/ko_sfx_len of the candidate are filled in, the
 * other fields stay zero-initialized - presumably the footer stores the whole
 * key as one buffer; confirm.
 */
static void
460+
find_max_key_among_overlapping_vblocks(
461+
const uint32_t nvgroups,
462+
const struct vgroup_split_metadata *metadatav,
463+
struct kvset *const ks,
464+
struct key_obj *max_key)
465+
{
466+
for (uint32_t i = 0; i < nvgroups; i++) {
467+
struct key_obj curr_key = { 0 };
468+
const struct vblock_desc *vbd;
469+
const struct vgroup_split_metadata *metadata = metadatav + i;
470+
471+
if (!metadata->overlaps)
472+
continue;
473+
474+
vbd = kvset_get_nth_vblock_desc(ks, metadata->vblk_idx);
475+
476+
curr_key.ko_sfx = vbd->vbd_mblkdesc->map_base + vbd->vbd_max_koff;
477+
curr_key.ko_sfx_len = vbd->vbd_max_klen;
478+
479+
if (key_obj_cmp(max_key, &curr_key) < 0)
480+
*max_key = curr_key;
481+
}
482+
}
483+
484+
/**
 * Record, per vgroup, the vblock offset of the first value referenced by a
 * key that follows @split_key and does not exceed @max_key.
 *
 * An mmap-flagged iterator is created over @ks and positioned with a seek to
 * @split_key. Keys are then read in iterator order:
 *   - the first key returned is skipped when it equals @split_key;
 *   - the walk ends when the iterator reports EOF or a key compares greater
 *     than @max_key;
 *   - for each value reference of type VTYPE_UCVAL or VTYPE_CVAL the vgroup
 *     index is read from the referenced vblock's descriptor and, if that
 *     vgroup is not yet marked, @accessed is set and @offset takes the
 *     reference's vblock offset. Later references to the same vgroup do not
 *     change @offset. All other value types are ignored.
 *
 * @nvgroups:  number of elements in @metadatav
 * @metadatav: per-vgroup split metadata; @accessed and @offset are updated
 * @ks:        kvset to iterate
 * @split_key: key to seek to; must not compare greater than @max_key
 *             (enforced through INVARIANT())
 * @max_key:   inclusive upper bound of the walk
 *
 * Return: the error from kvset_iter_create(), kvset_iter_seek() or
 * kvset_iter_next_key() if one of them fails; otherwise the status of the
 * last such call (presumably 0). The iterator is released on every path
 * once it has been created.
 *
 * NOTE(review): the vgroup index is used as a 1-based index
 * (metadatav + vgidx - 1) but only the upper bound is asserted; a value of 0
 * would address the element before the array - confirm vbd_vgidx is never 0.
 * NOTE(review): @offset is taken from the first matching reference without
 * checking that vbidx equals the vgroup's split vblock (@vblk_idx) - confirm
 * the first reference always lands in that vblock.
 * NOTE(review): only ko_sfx/ko_sfx_len of @split_key are handed to the seek;
 * confirm the split key never carries a separate prefix component.
 */
static merr_t
485+
mark_vgroup_accesses(
486+
const uint32_t nvgroups,
487+
struct vgroup_split_metadata *metadatav,
488+
struct kvset *const ks,
489+
const struct key_obj *split_key,
490+
const struct key_obj *max_key)
491+
{
492+
merr_t err;
493+
bool first = true;
494+
struct kv_iterator *iter;
495+
496+
INVARIANT(key_obj_cmp(split_key, max_key) <= 0);
497+
498+
err = kvset_iter_create(ks, NULL, NULL, NULL, kvset_iter_flag_mmap, &iter);
499+
if (ev(err))
500+
return err;
501+
502+
err = kvset_iter_seek(iter, split_key->ko_sfx, split_key->ko_sfx_len, NULL);
503+
if (ev(err))
504+
goto out;
505+
506+
while (true) {
507+
uint vlen;
508+
uint vbidx;
509+
uint vboff;
510+
uint complen;
511+
uint64_t seqno;
512+
const void *vdata;
513+
enum kmd_vtype vtype;
514+
struct key_obj curr_key;
515+
struct kvset_iter_vctx vc;
516+
517+
if (iter->kvi_eof)
518+
break;
519+
520+
err = kvset_iter_next_key(iter, &curr_key, &vc);
521+
if (ev(err))
522+
goto out;
523+
524+
/* In the event that this kvset contains the split key, skip it. */
525+
if (first) {
526+
first = false;
527+
if (key_obj_cmp(&curr_key, split_key) == 0)
528+
continue;
529+
}
530+
531+
if (key_obj_cmp(&curr_key, max_key) > 0)
532+
break;
533+
534+
while (kvset_iter_next_vref(iter, &vc, &seqno, &vtype, &vbidx, &vboff, &vdata, &vlen,
535+
&complen)) {
536+
uint64_t vgidx;
537+
const struct vblock_desc *vbd;
538+
struct vgroup_split_metadata *metadata;
539+
540+
switch (vtype) {
541+
case VTYPE_UCVAL:
542+
case VTYPE_CVAL:
543+
vbd = kvset_get_nth_vblock_desc(ks, vbidx);
544+
vgidx = atomic_read(&vbd->vbd_vgidx);
545+
assert(vgidx <= nvgroups);
546+
metadata = metadatav + vgidx - 1;
547+
if (!metadata->accessed) {
548+
metadata->accessed = true;
549+
metadata->offset = vboff;
550+
}
551+
/* fallthrough */
552+
case VTYPE_IVAL:
553+
case VTYPE_ZVAL:
554+
case VTYPE_TOMB:
555+
case VTYPE_PTOMB:
556+
continue;
557+
}
558+
}
559+
}
560+
561+
out:
562+
kvset_iter_release(iter);
563+
564+
return err;
565+
}
566+
454567
/**
455568
* @vbidx_left, @vbidx_right - tracks vblock index for the left and right kvsets
456569
* @vgidx_left, @vgidx_right - tracks vgroup index for the left and right kvsets
@@ -464,6 +577,8 @@ vblocks_split(
464577
struct perfc_set *pc,
465578
struct kvset_split_res *result)
466579
{
580+
struct key_obj max_key;
581+
struct vgroup_split_metadata *metadatav;
467582
struct vgmap *vgmap_src = ks->ks_vgmap;
468583
struct vgmap *vgmap_left = work[LEFT].vgmap;
469584
struct vgmap *vgmap_right = work[RIGHT].vgmap;
@@ -472,8 +587,8 @@ vblocks_split(
472587
uint16_t vbidx_left = 0, vbidx_right = 0;
473588
uint32_t vgidx_left = 0, vgidx_right = 0;
474589
uint32_t nvgroups = kvset_get_vgroups(ks), perfc_rwc = 0;
475-
bool move_left = (blks_right->kblks.idc == 0);
476-
bool move_right = (blks_left->kblks.idc == 0);
590+
const bool move_left = (blks_right->kblks.idc == 0);
591+
const bool move_right = (blks_left->kblks.idc == 0);
477592
uint64_t perfc_rwb = 0;
478593
merr_t err;
479594

@@ -482,87 +597,143 @@ vblocks_split(
482597
return 0;
483598
}
484599

600+
metadatav = calloc(nvgroups, sizeof(*metadatav));
601+
if (ev(!metadatav))
602+
return merr(ENOMEM);
603+
485604
for (uint32_t i = 0; i < nvgroups; i++) {
486-
uint16_t src_start, src_end, src_split, end;
487-
uint32_t vbcnt = 0;
488-
bool overlap = false;
605+
uint16_t start, end;
606+
struct vgroup_split_metadata *metadata = metadatav + i;
489607

490608
/* Per vgroup start and end output vblock index in the source kvset
491609
*/
492-
src_start = vgmap_vbidx_out_start(ks, i);
493-
src_end = vgmap_vbidx_out_end(ks, i);
610+
start = vgmap_vbidx_out_start(ks, i);
611+
end = vgmap_vbidx_out_end(ks, i);
494612

495613
if (move_left || move_right) {
496-
/* If all the kblocks are on one side then all the vblocks can be safely moved
497-
* to the same side
498-
*/
499-
src_split = move_right ? src_start : src_end + 1;
500-
assert(!overlap);
614+
metadata->vblk_idx = move_right ? start : end + 1;
615+
metadata->overlaps = false;
501616
} else {
502-
src_split = get_vblk_split_index(ks, src_start, src_end, split_key, &overlap);
617+
metadata->vblk_idx = get_vblk_split_index(ks, start, end, split_key,
618+
&metadata->overlaps);
503619
}
504-
assert(src_split >= src_start && src_split <= src_end + 1);
620+
assert(metadata->vblk_idx >= start && metadata->vblk_idx <= end + 1);
621+
}
622+
623+
max_key = *split_key;
624+
find_max_key_among_overlapping_vblocks(nvgroups, metadatav, ks, &max_key);
505625

506-
/* Add vblocks in [src_start, end - 1] to the left kvset
626+
err = mark_vgroup_accesses(nvgroups, metadatav, ks, split_key, &max_key);
627+
if (ev(err))
628+
goto out;
629+
630+
for (uint32_t i = 0; i < nvgroups; i++) {
631+
uint32_t vbcnt = 0;
632+
uint16_t split, start, end, last;
633+
const struct vgroup_split_metadata *metadata = metadatav + i;
634+
635+
/* Per vgroup start and end output vblock index in the source kvset
507636
*/
508-
end = overlap ? src_split + 1 : src_split;
509-
for (uint16_t j = src_start; j < end; j++) {
510-
err = blk_list_append(&blks_left->vblks, kvset_get_nth_vblock_id(ks, j));
511-
if (err)
512-
return err;
637+
start = vgmap_vbidx_out_start(ks, i);
638+
end = vgmap_vbidx_out_end(ks, i);
639+
split = metadata->vblk_idx;
640+
641+
/* Add vblocks in [start, last - 1] to the left kvset
642+
*/
643+
last = metadata->overlaps ? split + 1 : split;
644+
for (uint16_t j = start; j < last; j++) {
645+
uint64_t mbid;
646+
647+
mbid = kvset_get_nth_vblock_id(ks, j);
648+
649+
err = blk_list_append(&blks_left->vblks, mbid);
650+
if (ev(err))
651+
goto out;
652+
653+
if (j == last - 1 && metadata->overlaps && metadata->accessed) {
654+
off_t off;
655+
uint32_t len;
656+
struct mblock_props props;
657+
658+
/* Offset must be page aligned. Punching the rest of the vblock
659+
* from the page aligned offset up to the vblock footer.
660+
*/
661+
off = roundup(metadata->offset, PAGE_SIZE);
662+
len = kvset_get_nth_vblock_len(ks, j) - off;
663+
664+
err = mpool_mblock_punch(ks->ks_mp, mbid, off, len);
665+
if (ev(err))
666+
goto out;
667+
668+
log_debug("Punched mblock (0x%" PRIx64 ") starting at offset %u for %u bytes",
669+
mbid, metadata->offset, len);
670+
671+
err = mpool_mblock_props_get(ks->ks_mp, mbid, &props);
672+
if (ev(err))
673+
goto out;
674+
675+
blks_right->bl_vused -= props.mpr_write_len - props.mpr_alloc_cap;
676+
blks_left->bl_vtotal += props.mpr_alloc_cap - VBLOCK_FOOTER_LEN;
677+
} else {
678+
blks_left->bl_vtotal += kvset_get_nth_vblock_len(ks, j);
679+
}
513680

514681
vbcnt++;
515-
blks_left->bl_vtotal += kvset_get_nth_vblock_len(ks, j);
516682
}
517683

518684
if (vbcnt > 0) {
519685
vbidx_left += vbcnt;
520686

521-
err = vgmap_vbidx_set(vgmap_src, end - 1, vgmap_left, vbidx_left - 1, vgidx_left);
687+
err = vgmap_vbidx_set(vgmap_src, last - 1, vgmap_left, vbidx_left - 1, vgidx_left);
522688
if (err)
523-
return err;
689+
goto out;
524690

525691
vgidx_left++;
526692
}
527693

528694
vbcnt = 0; /* reset vbcnt for the right kvset */
529-
if (overlap) {
695+
if (metadata->overlaps) {
530696
/* Append a clone of the overlapping vblock to the right kvset */
531-
const uint64_t src_mbid = kvset_get_nth_vblock_id(ks, src_split);
697+
off_t off;
532698
uint64_t clone_mbid;
699+
struct mblock_props props;
700+
const uint64_t src_mbid = kvset_get_nth_vblock_id(ks, split);
701+
702+
off = metadata->accessed ? metadata->offset : 0;
703+
off = off < PAGE_SIZE ? 0 : roundup(off - PAGE_SIZE, PAGE_SIZE);
533704

534-
err = mpool_mblock_clone(ks->ks_mp, src_mbid, 0, 0, &clone_mbid);
705+
err = mpool_mblock_clone(ks->ks_mp, src_mbid, off, 0, &clone_mbid);
535706
if (!err) {
536707
err = blk_list_append(&blks_right->vblks, clone_mbid);
537708
if (!err)
538709
err = blk_list_append(result->ks[RIGHT].blks_commit, clone_mbid);
539710
}
540711

541712
if (err)
542-
return err;
713+
goto out;
543714

544-
perfc_rwc++;
545-
if (perfc_ison(pc, PERFC_RA_CNCOMP_RBYTES) || perfc_ison(pc, PERFC_RA_CNCOMP_WBYTES)) {
546-
struct mblock_props props;
715+
log_debug("Cloned mblock (0x%" PRIx64 ") starting at offset %ld", src_mbid, off);
547716

548-
err = mpool_mblock_props_get(ks->ks_mp, src_mbid, &props);
549-
if (!ev(err))
550-
perfc_rwb += props.mpr_write_len;
551-
else
552-
err = 0;
553-
}
717+
err = mpool_mblock_props_get(ks->ks_mp, clone_mbid, &props);
718+
if (ev(err))
719+
goto out;
720+
721+
perfc_rwc++;
722+
if (perfc_ison(pc, PERFC_RA_CNCOMP_RBYTES) || perfc_ison(pc, PERFC_RA_CNCOMP_WBYTES))
723+
perfc_rwb += props.mpr_write_len - off;
554724

555725
vbcnt++;
556-
blks_right->bl_vtotal += kvset_get_nth_vblock_len(ks, src_split);
557-
src_split++;
726+
blks_right->bl_vused -= props.mpr_write_len - props.mpr_alloc_cap;
727+
blks_right->bl_vtotal += props.mpr_alloc_cap - VBLOCK_FOOTER_LEN;
728+
split++;
558729
}
559730

560-
/* Add the remaining vblocks in [src_split, src_end] to the right kvset
731+
/* Add the remaining vblocks in [split, end] to the right kvset
561732
*/
562-
for (uint16_t j = src_split; j <= src_end; j++) {
733+
for (uint16_t j = split; j <= end; j++) {
563734
err = blk_list_append(&blks_right->vblks, kvset_get_nth_vblock_id(ks, j));
564735
if (err)
565-
return err;
736+
goto out;
566737

567738
vbcnt++;
568739
blks_right->bl_vtotal += kvset_get_nth_vblock_len(ks, j);
@@ -571,9 +742,9 @@ vblocks_split(
571742
if (vbcnt > 0) {
572743
vbidx_right += vbcnt;
573744

574-
err = vgmap_vbidx_set(vgmap_src, src_end, vgmap_right, vbidx_right - 1, vgidx_right);
745+
err = vgmap_vbidx_set(vgmap_src, end, vgmap_right, vbidx_right - 1, vgidx_right);
575746
if (err)
576-
return err;
747+
goto out;
577748

578749
vgidx_right++;
579750
}
@@ -593,7 +764,10 @@ vblocks_split(
593764
perfc_add2(pc, PERFC_RA_CNCOMP_WREQS, perfc_rwc, PERFC_RA_CNCOMP_WBYTES, perfc_rwb);
594765
}
595766

596-
return 0;
767+
out:
768+
free(metadatav);
769+
770+
return err;
597771
}
598772

599773
/**

0 commit comments

Comments
 (0)