  * Copyright (C) 2022 Micron Technology, Inc. All rights reserved.
  */
 
+#include <stdint.h>
 #include <sys/mman.h>
 
-#include <hse/util/event_counter.h>
-#include <hse/util/assert.h>
-#include <hse/util/keycmp.h>
-#include <hse/logging/logging.h>
-#include <hse/util/perfc.h>
-
-#include <hse/limits.h>
-
 #include <hse/ikvdb/cn.h>
 #include <hse/ikvdb/ikvdb.h>
 #include <hse/ikvdb/cndb.h>
 #include <hse/ikvdb/kvset_builder.h>
-
+#include <hse/limits.h>
+#include <hse/logging/logging.h>
 #include <hse/mpool/mpool.h>
+#include <hse/util/event_counter.h>
+#include <hse/util/assert.h>
+#include <hse/util/keycmp.h>
+#include <hse/util/perfc.h>
 
 #include "kvs_mblk_desc.h"
 #include "kvset.h"
@@ -399,6 +397,13 @@ kblocks_split(
     return err;
 }
 
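+/**
+ * struct vgroup_split_metadata - per-vgroup state used to split vblocks
+ * @overlaps: true if a vblock in this vgroup straddles the split key
+ * @accessed: true if a key at or after the split key references the overlapping vblock
+ * @vblk_idx: split vblock index for this vgroup
+ * @offset:   vblock offset of the first value referenced at or after the split key
+ */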
+struct vgroup_split_metadata {
+    bool overlaps;
+    bool accessed;
+    uint16_t vblk_idx;
+    uint offset;
+};
+
 /**
  * Return a split vblock index for the specified range of vblocks [start, end] by comparing
  * the min/max keys stored in a vblock footer against the split key.
@@ -451,6 +456,114 @@ get_vblk_split_index(
     return v;
 }
 
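+/* Raise *max_key to the largest max-key recorded in the footers of the
+ * overlapping vblocks across all vgroups. The result bounds how far past
+ * the split key mark_vgroup_accesses() must scan.
+ */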
+static void
+find_max_key_among_overlapping_vblocks(
+    const uint32_t nvgroups,
+    const struct vgroup_split_metadata *metadatav,
+    struct kvset * const ks,
+    struct key_obj *max_key)
+{
+    for (uint32_t i = 0; i < nvgroups; i++) {
+        struct key_obj curr_key = { 0 };
+        const struct vblock_desc *vbd;
+        const struct vgroup_split_metadata *metadata = metadatav + i;
+
+        if (!metadata->overlaps)
+            continue;
+
+        vbd = kvset_get_nth_vblock_desc(ks, metadata->vblk_idx);
+
+        curr_key.ko_sfx = vbd->vbd_mblkdesc->map_base + vbd->vbd_max_koff;
+        curr_key.ko_sfx_len = vbd->vbd_max_klen;
+
+        if (key_obj_cmp(max_key, &curr_key) < 0)
+            *max_key = curr_key;
+    }
+}
+
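+/* Iterate keys from the split key up to max_key and record, for each vgroup,
+ * the offset of the first vblock value referenced on the right side of the
+ * split (metadata->accessed and metadata->offset).
+ */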
+static merr_t
+mark_vgroup_accesses(
+    const uint32_t nvgroups,
+    struct vgroup_split_metadata *metadatav,
+    struct kvset * const ks,
+    const struct key_obj *split_key,
+    const struct key_obj *max_key)
+{
+    merr_t err;
+    bool first = true;
+    struct kv_iterator *iter;
+
+    INVARIANT(key_obj_cmp(split_key, max_key) <= 0);
+
+    err = kvset_iter_create(ks, NULL, NULL, NULL, kvset_iter_flag_mmap, &iter);
+    if (ev(err))
+        return err;
+
+    err = kvset_iter_seek(iter, split_key->ko_sfx, split_key->ko_sfx_len, NULL);
+    if (ev(err))
+        goto out;
+
+    while (true) {
+        uint vlen;
+        uint vbidx;
+        uint vboff;
+        uint complen;
+        uint64_t seqno;
+        const void *vdata;
+        enum kmd_vtype vtype;
+        struct key_obj curr_key;
+        struct kvset_iter_vctx vc;
+
+        if (iter->kvi_eof)
+            break;
+
+        err = kvset_iter_next_key(iter, &curr_key, &vc);
+        if (ev(err))
+            goto out;
+
+        /* In the event that this kvset contains the split key, skip it. */
+        if (first) {
+            first = false;
+            if (key_obj_cmp(&curr_key, split_key) == 0)
+                continue;
+        }
+
+        if (key_obj_cmp(&curr_key, max_key) > 0)
+            break;
+
+        while (kvset_iter_next_vref(iter, &vc, &seqno, &vtype, &vbidx, &vboff, &vdata, &vlen,
+                                    &complen)) {
+            uint64_t vgidx;
+            const struct vblock_desc *vbd;
+            struct vgroup_split_metadata *metadata;
+
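+            /* Only values stored in vblocks (compressed or uncompressed) can
+             * mark a vgroup as accessed; inline values and tombstones carry
+             * no vblock reference.
+             */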
+            switch (vtype) {
+            case VTYPE_UCVAL:
+            case VTYPE_CVAL:
+                vbd = kvset_get_nth_vblock_desc(ks, vbidx);
+                vgidx = atomic_read(&vbd->vbd_vgidx);
+                assert(vgidx <= nvgroups);
+                metadata = metadatav + vgidx - 1;
+                if (!metadata->accessed) {
+                    metadata->accessed = true;
+                    metadata->offset = vboff;
+                }
+                /* fallthrough */
+            case VTYPE_IVAL:
+            case VTYPE_ZVAL:
+            case VTYPE_TOMB:
+            case VTYPE_PTOMB:
+                continue;
+            }
+        }
+    }
+
+out:
+    kvset_iter_release(iter);
+
+    return err;
+}
+
 /**
  * @vbidx_left, @vbidx_right - tracks vblock index for the left and right kvsets
  * @vgidx_left, @vgidx_right - tracks vgroup index for the left and right kvsets
@@ -464,6 +577,8 @@ vblocks_split(
     struct perfc_set *pc,
     struct kvset_split_res *result)
 {
+    struct key_obj max_key;
+    struct vgroup_split_metadata *metadatav;
     struct vgmap *vgmap_src = ks->ks_vgmap;
     struct vgmap *vgmap_left = work[LEFT].vgmap;
     struct vgmap *vgmap_right = work[RIGHT].vgmap;
@@ -472,8 +587,8 @@ vblocks_split(
     uint16_t vbidx_left = 0, vbidx_right = 0;
     uint32_t vgidx_left = 0, vgidx_right = 0;
     uint32_t nvgroups = kvset_get_vgroups(ks), perfc_rwc = 0;
-    bool move_left = (blks_right->kblks.idc == 0);
-    bool move_right = (blks_left->kblks.idc == 0);
+    const bool move_left = (blks_right->kblks.idc == 0);
+    const bool move_right = (blks_left->kblks.idc == 0);
     uint64_t perfc_rwb = 0;
     merr_t err;
@@ -482,87 +597,143 @@ vblocks_split(
         return 0;
     }
 
+    metadatav = calloc(nvgroups, sizeof(*metadatav));
+    if (ev(!metadatav))
+        return merr(ENOMEM);
+
     for (uint32_t i = 0; i < nvgroups; i++) {
-        uint16_t src_start, src_end, src_split, end;
-        uint32_t vbcnt = 0;
-        bool overlap = false;
+        uint16_t start, end;
+        struct vgroup_split_metadata *metadata = metadatav + i;
 
         /* Per vgroup start and end output vblock index in the source kvset
          */
-        src_start = vgmap_vbidx_out_start(ks, i);
-        src_end = vgmap_vbidx_out_end(ks, i);
+        start = vgmap_vbidx_out_start(ks, i);
+        end = vgmap_vbidx_out_end(ks, i);
 
         if (move_left || move_right) {
-            /* If all the kblocks are on one side then all the vblocks can be safely moved
-             * to the same side
-             */
-            src_split = move_right ? src_start : src_end + 1;
-            assert(!overlap);
+            metadata->vblk_idx = move_right ? start : end + 1;
+            metadata->overlaps = false;
        } else {
-            src_split = get_vblk_split_index(ks, src_start, src_end, split_key, &overlap);
+            metadata->vblk_idx = get_vblk_split_index(ks, start, end, split_key,
+                                                      &metadata->overlaps);
        }
-        assert(src_split >= src_start && src_split <= src_end + 1);
+        assert(metadata->vblk_idx >= start && metadata->vblk_idx <= end + 1);
+    }
+
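+    /* Find the largest max-key recorded by the overlapping vblocks, then
+     * record where right-side accesses begin within each overlapping vgroup.
+     */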
+    max_key = *split_key;
+    find_max_key_among_overlapping_vblocks(nvgroups, metadatav, ks, &max_key);
 
-        /* Add vblocks in [src_start, end - 1] to the left kvset
+    err = mark_vgroup_accesses(nvgroups, metadatav, ks, split_key, &max_key);
+    if (ev(err))
+        goto out;
+
+    for (uint32_t i = 0; i < nvgroups; i++) {
+        uint32_t vbcnt = 0;
+        uint16_t split, start, end, last;
+        const struct vgroup_split_metadata *metadata = metadatav + i;
+
+        /* Per vgroup start and end output vblock index in the source kvset
         */
-        end = overlap ? src_split + 1 : src_split;
-        for (uint16_t j = src_start; j < end; j++) {
-            err = blk_list_append(&blks_left->vblks, kvset_get_nth_vblock_id(ks, j));
-            if (err)
-                return err;
+        start = vgmap_vbidx_out_start(ks, i);
+        end = vgmap_vbidx_out_end(ks, i);
+        split = metadata->vblk_idx;
+
+        /* Add vblocks in [start, last - 1] to the left kvset
+         */
+        last = metadata->overlaps ? split + 1 : split;
+        for (uint16_t j = start; j < last; j++) {
+            uint64_t mbid;
+
+            mbid = kvset_get_nth_vblock_id(ks, j);
+
+            err = blk_list_append(&blks_left->vblks, mbid);
+            if (ev(err))
+                goto out;
+
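+            /* The overlapping vblock is also cloned for the right kvset below,
+             * so the tail of the left copy, referenced only by keys at or
+             * after the split key, can be punched to release space.
+             */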
+            if (j == last - 1 && metadata->overlaps && metadata->accessed) {
+                off_t off;
+                uint32_t len;
+                struct mblock_props props;
+
+                /* Offset must be page aligned. Punching the rest of the vblock
+                 * from the page aligned offset up to the vblock footer.
+                 */
+                off = roundup(metadata->offset, PAGE_SIZE);
+                len = kvset_get_nth_vblock_len(ks, j) - off;
+
+                err = mpool_mblock_punch(ks->ks_mp, mbid, off, len);
+                if (ev(err))
+                    goto out;
+
+                log_debug("Punched mblock (0x%" PRIx64 ") starting at offset %u for %u bytes",
+                          mbid, metadata->offset, len);
+
+                err = mpool_mblock_props_get(ks->ks_mp, mbid, &props);
+                if (ev(err))
+                    goto out;
+
+                blks_right->bl_vused -= props.mpr_write_len - props.mpr_alloc_cap;
+                blks_left->bl_vtotal += props.mpr_alloc_cap - VBLOCK_FOOTER_LEN;
+            } else {
+                blks_left->bl_vtotal += kvset_get_nth_vblock_len(ks, j);
+            }
 
             vbcnt++;
-            blks_left->bl_vtotal += kvset_get_nth_vblock_len(ks, j);
         }
 
         if (vbcnt > 0) {
             vbidx_left += vbcnt;
 
-            err = vgmap_vbidx_set(vgmap_src, end - 1, vgmap_left, vbidx_left - 1, vgidx_left);
+            err = vgmap_vbidx_set(vgmap_src, last - 1, vgmap_left, vbidx_left - 1, vgidx_left);
             if (err)
-                return err;
+                goto out;
 
             vgidx_left++;
         }
 
         vbcnt = 0; /* reset vbcnt for the right kvset */
-        if (overlap) {
+        if (metadata->overlaps) {
             /* Append a clone of the overlapping vblock to the right kvset */
-            const uint64_t src_mbid = kvset_get_nth_vblock_id(ks, src_split);
+            off_t off;
             uint64_t clone_mbid;
+            struct mblock_props props;
+            const uint64_t src_mbid = kvset_get_nth_vblock_id(ks, split);
+
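+            /* Clone from a page-aligned offset at or below the first value
+             * accessed on the right side of the split; data below this offset
+             * belongs to keys left of the split and is not copied.
+             */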
+            off = metadata->accessed ? metadata->offset : 0;
+            off = off < PAGE_SIZE ? 0 : roundup(off - PAGE_SIZE, PAGE_SIZE);
 
-            err = mpool_mblock_clone(ks->ks_mp, src_mbid, 0, 0, &clone_mbid);
+            err = mpool_mblock_clone(ks->ks_mp, src_mbid, off, 0, &clone_mbid);
             if (!err) {
                 err = blk_list_append(&blks_right->vblks, clone_mbid);
                 if (!err)
                     err = blk_list_append(result->ks[RIGHT].blks_commit, clone_mbid);
             }
 
             if (err)
-                return err;
+                goto out;
 
-            perfc_rwc++;
-            if (perfc_ison(pc, PERFC_RA_CNCOMP_RBYTES) || perfc_ison(pc, PERFC_RA_CNCOMP_WBYTES)) {
-                struct mblock_props props;
+            log_debug("Cloned mblock (0x%" PRIx64 ") starting at offset %ld", src_mbid, off);
 
-                err = mpool_mblock_props_get(ks->ks_mp, src_mbid, &props);
-                if (!ev(err))
-                    perfc_rwb += props.mpr_write_len;
-                else
-                    err = 0;
-            }
+            err = mpool_mblock_props_get(ks->ks_mp, clone_mbid, &props);
+            if (ev(err))
+                goto out;
+
+            perfc_rwc++;
+            if (perfc_ison(pc, PERFC_RA_CNCOMP_RBYTES) || perfc_ison(pc, PERFC_RA_CNCOMP_WBYTES))
+                perfc_rwb += props.mpr_write_len - off;
 
             vbcnt++;
-            blks_right->bl_vtotal += kvset_get_nth_vblock_len(ks, src_split);
-            src_split++;
+            blks_right->bl_vused -= props.mpr_write_len - props.mpr_alloc_cap;
+            blks_right->bl_vtotal += props.mpr_alloc_cap - VBLOCK_FOOTER_LEN;
+            split++;
        }
 
-        /* Add the remaining vblocks in [src_split, src_end] to the right kvset
+        /* Add the remaining vblocks in [split, end] to the right kvset
         */
-        for (uint16_t j = src_split; j <= src_end; j++) {
+        for (uint16_t j = split; j <= end; j++) {
             err = blk_list_append(&blks_right->vblks, kvset_get_nth_vblock_id(ks, j));
             if (err)
-                return err;
+                goto out;
 
             vbcnt++;
             blks_right->bl_vtotal += kvset_get_nth_vblock_len(ks, j);
@@ -571,9 +742,9 @@ vblocks_split(
         if (vbcnt > 0) {
             vbidx_right += vbcnt;
 
-            err = vgmap_vbidx_set(vgmap_src, src_end, vgmap_right, vbidx_right - 1, vgidx_right);
+            err = vgmap_vbidx_set(vgmap_src, end, vgmap_right, vbidx_right - 1, vgidx_right);
             if (err)
-                return err;
+                goto out;
 
             vgidx_right++;
         }
@@ -593,7 +764,10 @@ vblocks_split(
         perfc_add2(pc, PERFC_RA_CNCOMP_WREQS, perfc_rwc, PERFC_RA_CNCOMP_WBYTES, perfc_rwb);
     }
 
-    return 0;
+out:
+    free(metadatav);
+
+    return err;
 }
 
 /**