diff --git a/usr/src/lib/libfakekernel/common/mapfile-vers b/usr/src/lib/libfakekernel/common/mapfile-vers index 40ae056e59af..deb8b61edca5 100644 --- a/usr/src/lib/libfakekernel/common/mapfile-vers +++ b/usr/src/lib/libfakekernel/common/mapfile-vers @@ -238,8 +238,8 @@ SYMBOL_VERSION SUNWprivate_1.1 { taskq_dispatch; taskq_dispatch_ent; taskq_empty; + taskq_init_ent; taskq_member; - taskq_empty; taskq_wait; taskq_wait_id; diff --git a/usr/src/lib/libfakekernel/common/sys/taskq_impl.h b/usr/src/lib/libfakekernel/common/sys/taskq_impl.h index 1920034b041e..8660f27d97d9 100644 --- a/usr/src/lib/libfakekernel/common/sys/taskq_impl.h +++ b/usr/src/lib/libfakekernel/common/sys/taskq_impl.h @@ -51,6 +51,7 @@ typedef struct taskq_ent { /* Special form of taskq dispatch that uses preallocated entries. */ void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, taskq_ent_t *); +void taskq_init_ent(taskq_ent_t *); #ifdef __cplusplus } diff --git a/usr/src/lib/libfakekernel/common/taskq.c b/usr/src/lib/libfakekernel/common/taskq.c index 6fe2b995a0e9..714322482ec2 100644 --- a/usr/src/lib/libfakekernel/common/taskq.c +++ b/usr/src/lib/libfakekernel/common/taskq.c @@ -182,6 +182,12 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags) return (1); } +void +taskq_init_ent(taskq_ent_t *tqe) +{ + tqe->tqent_flags = 0; +} + void taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, taskq_ent_t *t) diff --git a/usr/src/lib/libzpool/common/sys/zfs_context.h b/usr/src/lib/libzpool/common/sys/zfs_context.h index 2f5d3ff142e6..5b28a4710db5 100644 --- a/usr/src/lib/libzpool/common/sys/zfs_context.h +++ b/usr/src/lib/libzpool/common/sys/zfs_context.h @@ -285,6 +285,7 @@ extern vnode_t *rootdir; #define minclsyspri 60 #define maxclsyspri 99 +#define defclsyspri minclsyspri #if (GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__) #define _zfs_expect(expr, value) (__builtin_expect((expr), (value))) diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index 0f0a9835f0de..175f63599a86 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -26,6 +26,7 @@ * Copyright 2017 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2019, Delphix. All rights reserved. * Copyright (c) 2020, George Amanakis. All rights reserved. + * Copyright (c) 2019, 2024, 2025, Klara, Inc. * Copyright (c) 2020, The FreeBSD Foundation [1] * Copyright 2024 Bill Sommerfeld * @@ -321,13 +322,18 @@ static zthr_t *arc_reap_zthr; /* * This thread's job is to keep arc_size under arc_c, by calling - * arc_adjust(), which improves arc_is_overflowing(). + * arc_evict(), which improves arc_is_overflowing(). */ -static zthr_t *arc_adjust_zthr; +static zthr_t *arc_evict_zthr; +static arc_buf_hdr_t **arc_state_evict_markers; +static int arc_state_evict_marker_count; -static kmutex_t arc_adjust_lock; -static kcondvar_t arc_adjust_waiters_cv; -static boolean_t arc_adjust_needed = B_FALSE; +static kmutex_t arc_evict_lock; +static kcondvar_t arc_evict_waiters_cv; +static boolean_t arc_evict_needed = B_FALSE; + +static taskq_t *arc_evict_taskq; +static struct evict_arg *arc_evict_arg; uint_t arc_reduce_dnlc_percent = 3; @@ -416,6 +422,18 @@ uint_t zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */ boolean_t zfs_compressed_arc_enabled = B_TRUE; +/* + * Controls the number of ARC eviction threads to dispatch sublists to. + * + * Possible values: + * 0 (auto) compute the number of threads using a logarithmic formula. 
+ * 1 (disabled) one thread - parallel eviction is disabled. + * 2+ (manual) set the number manually. + * + * See arc_evict_thread_init() for how "auto" is computed. + */ +static uint_t zfs_arc_evict_threads = 0; + /* The 6 states: */ static arc_state_t ARC_anon; static arc_state_t ARC_mru; @@ -582,12 +600,6 @@ arc_stats_t arc_stats = { } while (0) kstat_t *arc_ksp; -static arc_state_t *arc_anon; -static arc_state_t *arc_mru; -static arc_state_t *arc_mru_ghost; -static arc_state_t *arc_mfu; -static arc_state_t *arc_mfu_ghost; -static arc_state_t *arc_l2c_only; /* * There are also some ARC variables that we want to export, but that are @@ -1008,6 +1020,7 @@ hdr_full_cons(void *vbuf, void *unused, int kmflag) zfs_refcount_create(&hdr->b_l1hdr.b_refcnt); mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); multilist_link_init(&hdr->b_l1hdr.b_arc_node); + list_link_init(&hdr->b_l2hdr.b_l2node); arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); return (0); @@ -2078,7 +2091,6 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) return; } - ASSERT(!GHOST_STATE(state)); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); @@ -2118,7 +2130,6 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) return; } - ASSERT(!GHOST_STATE(state)); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_remove_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); @@ -2158,7 +2169,7 @@ add_reference(arc_buf_hdr_t *hdr, void *tag) (state != arc_anon)) { /* We don't use the L2-only state list. */ if (state != arc_l2c_only) { - multilist_remove(state->arcs_list[arc_buf_type(hdr)], + multilist_remove(&state->arcs_list[arc_buf_type(hdr)], hdr); arc_evictable_space_decrement(hdr, state); } @@ -2192,7 +2203,7 @@ remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) */ if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { - multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr); + multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr); ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); arc_evictable_space_increment(hdr, state); } @@ -2247,7 +2258,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, if (refcnt == 0) { if (old_state != arc_anon && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); - multilist_remove(old_state->arcs_list[buftype], hdr); + multilist_remove(&old_state->arcs_list[buftype], hdr); if (GHOST_STATE(old_state)) { ASSERT0(bufcnt); @@ -2265,7 +2276,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, * beforehand. */ ASSERT(HDR_HAS_L1HDR(hdr)); - multilist_insert(new_state->arcs_list[buftype], hdr); + multilist_insert(&new_state->arcs_list[buftype], hdr); if (GHOST_STATE(new_state)) { ASSERT0(bufcnt); @@ -2412,13 +2423,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, l2arc_hdr_arcstats_increment_state(hdr); } } - - /* - * L2 headers should never be on the L2 state list since they don't - * have L1 headers allocated. 
- */ - ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && - multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); } void @@ -3907,23 +3911,21 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) static uint64_t arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, - uint64_t spa, int64_t bytes) + uint64_t spa, uint64_t bytes) { multilist_sublist_t *mls; uint64_t bytes_evicted = 0; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; - int evict_count = 0; + int evict_count = zfs_arc_evict_batch_limit; ASSERT3P(marker, !=, NULL); - IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); mls = multilist_sublist_lock(ml, idx); - for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; + for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL); hdr = multilist_sublist_prev(mls, marker)) { - if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || - (evict_count >= zfs_arc_evict_batch_limit)) + if ((evict_count <= 0) || (bytes_evicted >= bytes)) break; /* @@ -3982,7 +3984,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, * evict_count in this case. */ if (evicted != 0) - evict_count++; + evict_count--; /* * If arc_size isn't overflowing, signal any @@ -4000,13 +4002,13 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, * * If threads are left sleeping, due to not * using cv_broadcast here, they will be woken - * up via cv_broadcast in arc_adjust_cb() just - * before arc_adjust_zthr sleeps. + * up via cv_broadcast in arc_evict_cb() just + * before arc_evict_zthr sleeps. */ - mutex_enter(&arc_adjust_lock); + mutex_enter(&arc_evict_lock); if (!arc_is_overflowing()) - cv_signal(&arc_adjust_waiters_cv); - mutex_exit(&arc_adjust_lock); + cv_signal(&arc_evict_waiters_cv); + mutex_exit(&arc_evict_lock); } else { ARCSTAT_BUMP(arcstat_mutex_miss); } @@ -4017,6 +4019,94 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, return (bytes_evicted); } +/* + * Allocate an array of buffer headers used as placeholders during arc state + * eviction. + */ +static arc_buf_hdr_t ** +arc_state_alloc_markers(int count) +{ + arc_buf_hdr_t **markers; + + markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP); + for (int i = 0; i < count; i++) { + markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); + + /* + * A b_spa of 0 is used to indicate that this header is + * a marker. This fact is used in arc_evict_type() and + * arc_evict_state_impl(). + */ + markers[i]->b_spa = 0; + + } + return (markers); +} + +static void +arc_state_free_markers(arc_buf_hdr_t **markers, int count) +{ + for (int i = 0; i < count; i++) + kmem_cache_free(hdr_full_cache, markers[i]); + kmem_free(markers, sizeof (*markers) * count); +} + +typedef struct evict_arg { + taskq_ent_t eva_tqent; + multilist_t *eva_ml; + arc_buf_hdr_t *eva_marker; + int eva_idx; + uint64_t eva_spa; + uint64_t eva_bytes; + uint64_t eva_evicted; +} evict_arg_t; + +static void +arc_evict_task(void *arg) +{ + evict_arg_t *eva = arg; + eva->eva_evicted = arc_evict_state_impl(eva->eva_ml, eva->eva_idx, + eva->eva_marker, eva->eva_spa, eva->eva_bytes); +} + +static void +arc_evict_thread_init(void) +{ + if (zfs_arc_evict_threads == 0) { + /* + * Compute number of threads we want to use for eviction. + * + * Normally, it's log2(ncpus) + ncpus/32, which gets us to the + * default max of 16 threads at ~256 CPUs. 
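+ * For example, plugging CPU counts into that formula: 8 CPUs gives + * 3 threads, 32 CPUs gives 5 + 1 = 6, 128 CPUs gives 7 + 4 = 11, and + * 256 CPUs gives 8 + 8 = 16.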
+ * + * However, that formula goes to two threads at 4 CPUs, which + * is still rather too low to be really useful, so we just go + * with 1 thread at fewer than 6 cores. + */ + if (max_ncpus < 6) + zfs_arc_evict_threads = 1; + else + zfs_arc_evict_threads = + (highbit64(max_ncpus) - 1) + max_ncpus / 32; + } else if (zfs_arc_evict_threads > max_ncpus) + zfs_arc_evict_threads = max_ncpus; + + if (zfs_arc_evict_threads > 1) { + arc_evict_taskq = taskq_create("arc_evict", + zfs_arc_evict_threads, defclsyspri, 0, INT_MAX, + TASKQ_PREPOPULATE); + arc_evict_arg = kmem_zalloc( + sizeof (evict_arg_t) * zfs_arc_evict_threads, KM_SLEEP); + } +} + +/* + * The minimum number of bytes we can evict at once is a block size. + * So, SPA_MAXBLOCKSIZE is a reasonable minimal value per eviction task. + * We use this value to compute a scaling factor for the eviction tasks. + */ +#define MIN_EVICT_SIZE (SPA_MAXBLOCKSIZE) /* * Evict buffers from the given arc state, until we've removed the * specified number of bytes. Move the removed buffers to the @@ -4031,18 +4121,19 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, * the given arc state; which is used by arc_flush(). */ static uint64_t -arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, +arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes, arc_buf_contents_t type) { uint64_t total_evicted = 0; - multilist_t *ml = state->arcs_list[type]; + multilist_t *ml = &state->arcs_list[type]; int num_sublists; arc_buf_hdr_t **markers; - - IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); + evict_arg_t *eva = NULL; num_sublists = multilist_get_num_sublists(ml); + boolean_t use_evcttq = zfs_arc_evict_threads > 1; + /* * If we've tried to evict from each sublist, made some * progress, but still have not hit the target number of bytes @@ -4050,44 +4141,104 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, * pick up where we left off for each individual sublist, rather * than starting from the tail each time. */ - markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); + if (zthr_iscurthread(arc_evict_zthr)) { + markers = arc_state_evict_markers; + ASSERT3S(num_sublists, <=, arc_state_evict_marker_count); + } else { + markers = arc_state_alloc_markers(num_sublists); + } for (int i = 0; i < num_sublists; i++) { - markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); - - /* - * A b_spa of 0 is used to indicate that this header is - * a marker. This fact is used in arc_adjust_type() and - * arc_evict_state_impl(). - */ - markers[i]->b_spa = 0; - multilist_sublist_t *mls = multilist_sublist_lock(ml, i); multilist_sublist_insert_tail(mls, markers[i]); multilist_sublist_unlock(mls); } + if (use_evcttq) { + if (zthr_iscurthread(arc_evict_zthr)) + eva = arc_evict_arg; + else + eva = kmem_alloc(sizeof (evict_arg_t) * + zfs_arc_evict_threads, KM_NOSLEEP); + if (eva) { + for (int i = 0; i < zfs_arc_evict_threads; i++) { + taskq_init_ent(&eva[i].eva_tqent); + eva[i].eva_ml = ml; + eva[i].eva_spa = spa; + } + } else { + /* + * Fall back to the regular single evict if it is not + * possible to allocate memory for the taskq entries. + */ + use_evcttq = B_FALSE; + } + } + + /* + * Start eviction using a randomly selected sublist; this is to try and + * evenly balance eviction across all sublists. Always starting at the + * same sublist (e.g. index 0) would cause evictions to favor certain + * sublists over others.
+ */ + uint64_t scan_evicted = 0; + int sublists_left = num_sublists; + int sublist_idx = multilist_get_random_index(ml); + /* * While we haven't hit our target number of bytes to evict, or * we're evicting all available buffers. */ - while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { - /* - * Start eviction using a randomly selected sublist, - * this is to try and evenly balance eviction across all - * sublists. Always starting at the same sublist - * (e.g. index 0) would cause evictions to favor certain - * sublists over others. - */ - int sublist_idx = multilist_get_random_index(ml); - uint64_t scan_evicted = 0; + while (total_evicted < bytes) { + uint64_t evict = MIN_EVICT_SIZE; + uint_t ntasks = zfs_arc_evict_threads; - for (int i = 0; i < num_sublists; i++) { + if (use_evcttq) { + if (sublists_left < ntasks) + ntasks = sublists_left; + + if (ntasks < 2) + use_evcttq = B_FALSE; + } + + if (use_evcttq) { + uint64_t left = bytes - total_evicted; + + if (bytes == ARC_EVICT_ALL) { + evict = bytes; + } else if (left > ntasks * MIN_EVICT_SIZE) { + evict = DIV_ROUND_UP(left, ntasks); + } else { + ntasks = DIV_ROUND_UP(left, MIN_EVICT_SIZE); + if (ntasks == 1) + use_evcttq = B_FALSE; + } + } + + for (int i = 0; sublists_left > 0; i++, sublist_idx++, + sublists_left--) { uint64_t bytes_remaining; uint64_t bytes_evicted; - if (bytes == ARC_EVICT_ALL) - bytes_remaining = ARC_EVICT_ALL; - else if (total_evicted < bytes) + /* we've reached the end, wrap to the beginning */ + if (sublist_idx >= num_sublists) + sublist_idx = 0; + + if (use_evcttq) { + if (i == ntasks) + break; + + eva[i].eva_marker = markers[sublist_idx]; + eva[i].eva_idx = sublist_idx; + eva[i].eva_bytes = evict; + + taskq_dispatch_ent(arc_evict_taskq, + arc_evict_task, &eva[i], 0, + &eva[i].eva_tqent); + + continue; + } + + if (total_evicted < bytes) bytes_remaining = bytes - total_evicted; else break; @@ -4097,18 +4248,23 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, scan_evicted += bytes_evicted; total_evicted += bytes_evicted; + } - /* we've reached the end, wrap to the beginning */ - if (++sublist_idx >= num_sublists) - sublist_idx = 0; + if (use_evcttq) { + taskq_wait(arc_evict_taskq); + + for (int i = 0; i < ntasks; i++) { + scan_evicted += eva[i].eva_evicted; + total_evicted += eva[i].eva_evicted; + } } /* - * If we didn't evict anything during this scan, we have - * no reason to believe we'll evict more during another + * If we scanned all sublists and didn't evict anything, we + * have no reason to believe we'll evict more during another * scan, so break the loop. */ - if (scan_evicted == 0) { + if (scan_evicted == 0 && sublists_left == 0) { /* This isn't possible, let's make that obvious */ ASSERT3S(bytes, !=, 0); @@ -4125,16 +4281,35 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, break; } + + /* + * If we scanned all sublists but still have more to do, + * reset the counts so we can go around again. + */ + if (sublists_left == 0) { + sublists_left = num_sublists; + sublist_idx = multilist_get_random_index(ml); + scan_evicted = 0; + + /* + * Since we're about to reconsider all sublists, + * re-enable use of the evict threads if available. 
+ */ + use_evcttq = (zfs_arc_evict_threads > 1 && eva != NULL); + } } + if (eva != NULL && eva != arc_evict_arg) + kmem_free(eva, sizeof (evict_arg_t) * zfs_arc_evict_threads); + for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls = multilist_sublist_lock(ml, i); multilist_sublist_remove(mls, markers[i]); multilist_sublist_unlock(mls); - - kmem_cache_free(hdr_full_cache, markers[i]); } - kmem_free(markers, sizeof (*markers) * num_sublists); + + if (markers != arc_state_evict_markers) + arc_state_free_markers(markers, num_sublists); return (total_evicted); } @@ -4179,10 +4354,10 @@ arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, * evict everything it can, when passed a negative value for "bytes". */ static uint64_t -arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, +arc_evict_impl(arc_state_t *state, uint64_t spa, int64_t bytes, arc_buf_contents_t type) { - int64_t delta; + uint64_t delta; if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) { delta = MIN(zfs_refcount_count(&state->arcs_esize[type]), @@ -4198,7 +4373,7 @@ arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, * capped by the arc_meta_limit tunable. */ static uint64_t -arc_adjust_meta(uint64_t meta_used) +arc_evict_meta(uint64_t meta_used) { uint64_t total_evicted = 0; int64_t target; @@ -4214,7 +4389,7 @@ arc_adjust_meta(uint64_t meta_used) (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + zfs_refcount_count(&arc_mru->arcs_size) - arc_p)); - total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); + total_evicted += arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); /* * Similar to the above, we want to evict enough bytes to get us @@ -4225,7 +4400,7 @@ arc_adjust_meta(uint64_t meta_used) (int64_t)(zfs_refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p))); - total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); + total_evicted += arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); return (total_evicted); } @@ -4239,10 +4414,10 @@ arc_adjust_meta(uint64_t meta_used) * returned. */ static arc_buf_contents_t -arc_adjust_type(arc_state_t *state) +arc_evict_type(arc_state_t *state) { - multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA]; - multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA]; + multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA]; + multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA]; int data_idx = multilist_get_random_index(data_ml); int meta_idx = multilist_get_random_index(meta_ml); multilist_sublist_t *data_mls; @@ -4309,7 +4484,7 @@ arc_adjust_type(arc_state_t *state) * Evict buffers from the cache, such that arc_size is capped by arc_c. */ static uint64_t -arc_adjust(void) +arc_evict(void) { uint64_t total_evicted = 0; uint64_t bytes; @@ -4321,7 +4496,7 @@ arc_adjust(void) * If we're over arc_meta_limit, we want to correct that before * potentially evicting data buffers below. */ - total_evicted += arc_adjust_meta(ameta); + total_evicted += arc_evict_meta(ameta); /* * Adjust MRU size @@ -4345,9 +4520,9 @@ arc_adjust(void) * type. If we cannot satisfy the number of bytes from this * type, spill over into the next type. 
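+ * For example, if the target is 512K but only 128K of the preferred + * type is evictable, the remaining 384K is taken from the other type.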
*/ - if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA && + if (arc_evict_type(arc_mru) == ARC_BUFC_METADATA && ameta > arc_meta_min) { - bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); + bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); total_evicted += bytes; /* @@ -4357,9 +4532,9 @@ arc_adjust(void) target -= bytes; total_evicted += - arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); + arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA); } else { - bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); + bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA); total_evicted += bytes; /* @@ -4369,7 +4544,7 @@ arc_adjust(void) target -= bytes; total_evicted += - arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); + arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); } /* @@ -4381,9 +4556,9 @@ arc_adjust(void) */ target = asize - arc_c; - if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA && + if (arc_evict_type(arc_mfu) == ARC_BUFC_METADATA && ameta > arc_meta_min) { - bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); + bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); total_evicted += bytes; /* @@ -4393,9 +4568,9 @@ arc_adjust(void) target -= bytes; total_evicted += - arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); + arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA); } else { - bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); + bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA); total_evicted += bytes; /* @@ -4405,7 +4580,7 @@ arc_adjust(void) target -= bytes; total_evicted += - arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); + arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); } /* @@ -4422,13 +4597,13 @@ arc_adjust(void) target = zfs_refcount_count(&arc_mru->arcs_size) + zfs_refcount_count(&arc_mru_ghost->arcs_size) - arc_c; - bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); + bytes = arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); total_evicted += bytes; target -= bytes; total_evicted += - arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); + arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); /* * We assume the sum of the mru list and mfu list is less than @@ -4441,13 +4616,13 @@ arc_adjust(void) target = zfs_refcount_count(&arc_mru_ghost->arcs_size) + zfs_refcount_count(&arc_mfu_ghost->arcs_size) - arc_c; - bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); + bytes = arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); total_evicted += bytes; target -= bytes; total_evicted += - arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); + arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); return (total_evicted); } @@ -4501,11 +4676,11 @@ arc_reduce_target_size(int64_t to_free) } if (asize > arc_c) { - /* See comment in arc_adjust_cb_check() on why lock+flag */ - mutex_enter(&arc_adjust_lock); - arc_adjust_needed = B_TRUE; - mutex_exit(&arc_adjust_lock); - zthr_wakeup(arc_adjust_zthr); + /* See comment in arc_evict_cb_check() on why lock+flag */ + mutex_enter(&arc_evict_lock); + arc_evict_needed = B_TRUE; + mutex_exit(&arc_evict_lock); + zthr_wakeup(arc_evict_zthr); } } @@ -4686,7 +4861,7 @@ arc_kmem_reap_soon(void) /* ARGSUSED */ static boolean_t -arc_adjust_cb_check(void *arg, zthr_t *zthr) +arc_evict_cb_check(void *arg, zthr_t *zthr) { /* * This is necessary in order for the mdb ::arc dcmd to @@ -4695,7 +4870,7 @@ arc_adjust_cb_check(void *arg, zthr_t *zthr) * this call, the command may show stale stats 
for the * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even * with this change, the data might be up to 1 second - * out of date(the arc_adjust_zthr has a maximum sleep + * out of date(the arc_evict_zthr has a maximum sleep * time of 1 second); but that should suffice. The * arc_state_t structures can be queried directly if more * accurate information is needed. @@ -4707,35 +4882,35 @@ arc_adjust_cb_check(void *arg, zthr_t *zthr) * We have to rely on arc_get_data_impl() to tell us when to adjust, * rather than checking if we are overflowing here, so that we are * sure to not leave arc_get_data_impl() waiting on - * arc_adjust_waiters_cv. If we have become "not overflowing" since + * arc_evict_waiters_cv. If we have become "not overflowing" since * arc_get_data_impl() checked, we need to wake it up. We could * broadcast the CV here, but arc_get_data_impl() may have not yet * gone to sleep. We would need to use a mutex to ensure that this * function doesn't broadcast until arc_get_data_impl() has gone to - * sleep (e.g. the arc_adjust_lock). However, the lock ordering of + * sleep (e.g. the arc_evict_lock). However, the lock ordering of * such a lock would necessarily be incorrect with respect to the * zthr_lock, which is held before this function is called, and is * held by arc_get_data_impl() when it calls zthr_wakeup(). */ - return (arc_adjust_needed); + return (arc_evict_needed); } /* - * Keep arc_size under arc_c by running arc_adjust which evicts data + * Keep arc_size under arc_c by running arc_evict which evicts data * from the ARC. */ /* ARGSUSED */ static void -arc_adjust_cb(void *arg, zthr_t *zthr) +arc_evict_cb(void *arg, zthr_t *zthr) { uint64_t evicted = 0; /* Evict from cache */ - evicted = arc_adjust(); + evicted = arc_evict(); /* * If evicted is zero, we couldn't evict anything - * via arc_adjust(). This could be due to hash lock + * via arc_evict(). This could be due to hash lock * collisions, but more likely due to the majority of * arc buffers being unevictable. Therefore, even if * arc_size is above arc_c, another pass is unlikely to @@ -4744,18 +4919,18 @@ arc_adjust_cb(void *arg, zthr_t *zthr) * checked here so that if the arc is shutting down, the * broadcast will wake any remaining arc adjust waiters. */ - mutex_enter(&arc_adjust_lock); - arc_adjust_needed = !zthr_iscancelled(arc_adjust_zthr) && + mutex_enter(&arc_evict_lock); + arc_evict_needed = !zthr_iscancelled(arc_evict_zthr) && evicted > 0 && aggsum_compare(&arc_size, arc_c) > 0; - if (!arc_adjust_needed) { + if (!arc_evict_needed) { /* * We're either no longer overflowing, or we * can't evict anything more, so we should wake * up any waiters. */ - cv_broadcast(&arc_adjust_waiters_cv); + cv_broadcast(&arc_evict_waiters_cv); } - mutex_exit(&arc_adjust_lock); + mutex_exit(&arc_evict_lock); } /* ARGSUSED */ @@ -4793,7 +4968,7 @@ arc_reap_cb_check(void *arg, zthr_t *zthr) /* * Keep enough free memory in the system by reaping the ARC's kmem * caches. To cause more slabs to be reapable, we may reduce the - * target size of the cache (arc_c), causing the arc_adjust_cb() + * target size of the cache (arc_c), causing the arc_evict_cb() * to free more buffers. */ /* ARGSUSED */ @@ -4991,7 +5166,7 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, * overflowing; thus we don't use a while loop here. 
*/ if (arc_is_overflowing()) { - mutex_enter(&arc_adjust_lock); + mutex_enter(&arc_evict_lock); /* * Now that we've acquired the lock, we may no longer be @@ -5005,12 +5180,12 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, * shouldn't cause any harm. */ if (arc_is_overflowing()) { - arc_adjust_needed = B_TRUE; - zthr_wakeup(arc_adjust_zthr); - (void) cv_wait(&arc_adjust_waiters_cv, - &arc_adjust_lock); + arc_evict_needed = B_TRUE; + zthr_wakeup(arc_evict_zthr); + (void) cv_wait(&arc_evict_waiters_cv, + &arc_evict_lock); } - mutex_exit(&arc_adjust_lock); + mutex_exit(&arc_evict_lock); } VERIFY3U(hdr->b_type, ==, type); @@ -6971,56 +7146,54 @@ arc_state_multilist_index_func(multilist_t *ml, void *obj) multilist_get_num_sublists(ml)); } +static unsigned int +arc_state_l2c_multilist_index_func(multilist_t *ml, void *obj) +{ + panic("Header %p insert into arc_l2c_only %p", obj, ml); +} + +static void +arc_state_multilist_init(multilist_t *ml, + multilist_sublist_index_func_t *index_func, int *maxcountp) +{ + multilist_create(ml, sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), index_func); + *maxcountp = MAX(*maxcountp, multilist_get_num_sublists(ml)); +} + static void arc_state_init(void) { - arc_anon = &ARC_anon; - arc_mru = &ARC_mru; - arc_mru_ghost = &ARC_mru_ghost; - arc_mfu = &ARC_mfu; - arc_mfu_ghost = &ARC_mfu_ghost; - arc_l2c_only = &ARC_l2c_only; - - arc_mru->arcs_list[ARC_BUFC_METADATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mru->arcs_list[ARC_BUFC_DATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mru_ghost->arcs_list[ARC_BUFC_DATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mfu->arcs_list[ARC_BUFC_METADATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mfu->arcs_list[ARC_BUFC_DATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_l2c_only->arcs_list[ARC_BUFC_METADATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_l2c_only->arcs_list[ARC_BUFC_DATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); + int num_sublists = 0; + + arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_METADATA], + arc_state_multilist_index_func, &num_sublists); + arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_DATA], + arc_state_multilist_index_func, &num_sublists); + arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], + arc_state_multilist_index_func, &num_sublists); + arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], + 
arc_state_multilist_index_func, &num_sublists); + arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_METADATA], + arc_state_multilist_index_func, &num_sublists); + arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_DATA], + arc_state_multilist_index_func, &num_sublists); + arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], + arc_state_multilist_index_func, &num_sublists); + arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], + arc_state_multilist_index_func, &num_sublists); + + arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], + arc_state_l2c_multilist_index_func, &num_sublists); + arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], + arc_state_l2c_multilist_index_func, &num_sublists); + + /* + * Keep track of the number of markers needed to reclaim buffers from + * any ARC state. The markers will be pre-allocated so as to minimize + * the number of memory allocations performed by the eviction thread. + */ + arc_state_evict_marker_count = num_sublists; zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); @@ -7081,16 +7254,16 @@ arc_state_fini(void) zfs_refcount_destroy(&arc_mfu_ghost->arcs_size); zfs_refcount_destroy(&arc_l2c_only->arcs_size); - multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); aggsum_fini(&arc_meta_used); aggsum_fini(&arc_size); @@ -7119,8 +7292,8 @@ arc_init(void) #else uint64_t allmem = (physmem * PAGESIZE) / 2; #endif - mutex_init(&arc_adjust_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&arc_adjust_waiters_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&arc_evict_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&arc_evict_waiters_cv, NULL, CV_DEFAULT, NULL); /* * Set the minimum cache size to 1/64 of all memory, with a hard @@ -7218,6 +7391,8 @@ arc_init(void) ASSERT(!arc_initialized); buf_init(); + arc_evict_thread_init(); + arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); @@ -7227,8 +7402,10 @@ arc_init(void) kstat_install(arc_ksp); } - arc_adjust_zthr = zthr_create(arc_adjust_cb_check, - arc_adjust_cb, NULL); + arc_state_evict_markers = + arc_state_alloc_markers(arc_state_evict_marker_count); + arc_evict_zthr = zthr_create(arc_evict_cb_check, + arc_evict_cb, NULL); arc_reap_zthr =
zthr_create_timer(arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1)); @@ -7264,14 +7441,25 @@ arc_fini(void) arc_ksp = NULL; } - (void) zthr_cancel(arc_adjust_zthr); - zthr_destroy(arc_adjust_zthr); + if (arc_evict_taskq != NULL) + taskq_wait(arc_evict_taskq); + + (void) zthr_cancel(arc_evict_zthr); + zthr_destroy(arc_evict_zthr); (void) zthr_cancel(arc_reap_zthr); zthr_destroy(arc_reap_zthr); + arc_state_free_markers(arc_state_evict_markers, + arc_state_evict_marker_count); + + if (arc_evict_taskq != NULL) { + taskq_destroy(arc_evict_taskq); + kmem_free(arc_evict_arg, + sizeof (evict_arg_t) * zfs_arc_evict_threads); + } - mutex_destroy(&arc_adjust_lock); - cv_destroy(&arc_adjust_waiters_cv); + mutex_destroy(&arc_evict_lock); + cv_destroy(&arc_evict_waiters_cv); /* * buf_fini() must precede arc_state_fini() because buf_fini() may @@ -8128,16 +8316,16 @@ l2arc_sublist_lock(int list_num) switch (list_num) { case 0: - ml = arc_mfu->arcs_list[ARC_BUFC_METADATA]; + ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; break; case 1: - ml = arc_mru->arcs_list[ARC_BUFC_METADATA]; + ml = &arc_mru->arcs_list[ARC_BUFC_METADATA]; break; case 2: - ml = arc_mfu->arcs_list[ARC_BUFC_DATA]; + ml = &arc_mfu->arcs_list[ARC_BUFC_DATA]; break; case 3: - ml = arc_mru->arcs_list[ARC_BUFC_DATA]; + ml = &arc_mru->arcs_list[ARC_BUFC_DATA]; break; default: return (NULL); @@ -8922,6 +9110,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; + list_link_init(&adddev->l2ad_node); adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP); mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); diff --git a/usr/src/uts/common/fs/zfs/bpobj.c b/usr/src/uts/common/fs/zfs/bpobj.c index ec0d115cfc42..a2875e84b8b1 100644 --- a/usr/src/uts/common/fs/zfs/bpobj.c +++ b/usr/src/uts/common/fs/zfs/bpobj.c @@ -207,29 +207,73 @@ bpobj_is_empty(bpobj_t *bpo) (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0)); } +/* + * A recursive iteration of the bpobjs would be nice here but we run the risk + * of overflowing function stack space. Instead, find each subobj and add it + * to the head of our list so it can be scanned for subobjs. Like a + * recursive implementation, the "deepest" subobjs will be freed first. + * When a subobj is found to have no additional subobjs, free it. + */ +typedef struct bpobj_info { + bpobj_t *bpi_bpo; + /* + * This object is a subobj of bpi_parent, + * at bpi_index in its subobj array. + */ + struct bpobj_info *bpi_parent; + uint64_t bpi_index; + /* How many of our subobjs are left to process. */ + uint64_t bpi_unprocessed_subobjs; + /* True after having visited this bpo's directly referenced BPs. */ + boolean_t bpi_visited; + list_node_t bpi_node; +} bpobj_info_t; + +static bpobj_info_t * +bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index) +{ + bpobj_info_t *bpi = kmem_zalloc(sizeof (bpobj_info_t), KM_SLEEP); + bpi->bpi_bpo = bpo; + bpi->bpi_parent = parent; + bpi->bpi_index = index; + if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { + bpi->bpi_unprocessed_subobjs = bpo->bpo_phys->bpo_num_subobjs; + } + return (bpi); +} + +/* + * Update bpobj and all of its parents with new space accounting.
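+ * A bpobj's bpo_bytes (and bpo_comp/bpo_uncomp when present) include + * the space of all of its subobjs, so bytes freed from a nested subobj + * must also be subtracted from every ancestor up the bpi_parent chain.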
+ */ +static void +propagate_space_reduction(bpobj_info_t *bpi, uint64_t freed, + uint64_t comp_freed, uint64_t uncomp_freed, dmu_tx_t *tx) +{ + + for (; bpi != NULL; bpi = bpi->bpi_parent) { + bpobj_t *p = bpi->bpi_bpo; + ASSERT(dmu_buf_is_dirty(p->bpo_dbuf, tx)); + p->bpo_phys->bpo_bytes -= freed; + ASSERT3S(p->bpo_phys->bpo_bytes, >=, 0); + if (p->bpo_havecomp) { + p->bpo_phys->bpo_comp -= comp_freed; + p->bpo_phys->bpo_uncomp -= uncomp_freed; + } + } +} + static int -bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, - boolean_t free) +bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, + dmu_tx_t *tx, boolean_t free) { - dmu_object_info_t doi; - int epb; - int64_t i; int err = 0; + uint64_t freed = 0, comp_freed = 0, uncomp_freed = 0; dmu_buf_t *dbuf = NULL; + bpobj_t *bpo = bpi->bpi_bpo; - ASSERT(bpobj_is_open(bpo)); - mutex_enter(&bpo->bpo_lock); - - if (free) - dmu_buf_will_dirty(bpo->bpo_dbuf, tx); - - for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) { - blkptr_t *bparray; - blkptr_t *bp; - uint64_t offset, blkoff; - - offset = i * sizeof (blkptr_t); - blkoff = P2PHASE(i, bpo->bpo_epb); + for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) { + uint64_t offset = i * sizeof (blkptr_t); + uint64_t blkoff = P2PHASE(i, bpo->bpo_epb); if (dbuf == NULL || dbuf->db_offset > offset) { if (dbuf) @@ -243,119 +287,200 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, ASSERT3U(offset, >=, dbuf->db_offset); ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); - bparray = dbuf->db_data; - bp = &bparray[blkoff]; + blkptr_t *bparray = dbuf->db_data; + blkptr_t *bp = &bparray[blkoff]; err = func(arg, bp, tx); if (err) break; + if (free) { - bpo->bpo_phys->bpo_bytes -= - bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); - ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); - if (bpo->bpo_havecomp) { - bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp); - bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp); - } + spa_t *spa = dmu_objset_spa(bpo->bpo_os); + freed += bp_get_dsize_sync(spa, bp); + comp_freed += BP_GET_PSIZE(bp); + uncomp_freed += BP_GET_UCSIZE(bp); + ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx)); bpo->bpo_phys->bpo_num_blkptrs--; ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0); } } + if (free) { + propagate_space_reduction(bpi, freed, comp_freed, + uncomp_freed, tx); + VERIFY0(dmu_free_range(bpo->bpo_os, + bpo->bpo_object, + bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), + DMU_OBJECT_END, tx)); + } if (dbuf) { dmu_buf_rele(dbuf, FTAG); dbuf = NULL; } - if (free) { - VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object, - (i + 1) * sizeof (blkptr_t), DMU_OBJECT_END, tx)); - } - if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0) - goto out; + return (err); +} - ASSERT(bpo->bpo_havecomp); - err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi); - if (err) { - mutex_exit(&bpo->bpo_lock); - return (err); - } - ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ); - epb = doi.doi_data_block_size / sizeof (uint64_t); +/* + * Given an initial bpo, start by freeing the BPs that are directly referenced + * by that bpo. If the bpo has subobjs, read in its last subobj and push the + * subobj to our stack. By popping items off our stack, eventually we will + * encounter a bpo that has no subobjs. We can free its bpobj_info_t, and if + * requested also free the now-empty bpo from disk and decrement + * its parent's subobj count. 
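+ * For example, if the initial bpo A has subobjs [B, C] and C has + * subobj D, we visit A's BPs first, then C's, then D's; D is then + * freed, followed by C, and finally B is visited and freed.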
We continue popping each subobj from our stack, + * visiting its last subobj until they too have no more subobjs, and so on. + */ +static int +bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, + dmu_tx_t *tx, boolean_t free) +{ + list_t stack; + bpobj_info_t *bpi; + int err = 0; - for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { - uint64_t *objarray; - uint64_t offset, blkoff; - bpobj_t sublist; - uint64_t used_before, comp_before, uncomp_before; - uint64_t used_after, comp_after, uncomp_after; + /* + * Create a "stack" for us to work with without worrying about + * stack overflows. Initialize it with the initial_bpo. + */ + list_create(&stack, sizeof (bpobj_info_t), + offsetof(bpobj_info_t, bpi_node)); + mutex_enter(&initial_bpo->bpo_lock); + list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0)); - offset = i * sizeof (uint64_t); - blkoff = P2PHASE(i, epb); + while ((bpi = list_head(&stack)) != NULL) { + bpobj_t *bpo = bpi->bpi_bpo; - if (dbuf == NULL || dbuf->db_offset > offset) { - if (dbuf) - dmu_buf_rele(dbuf, FTAG); - err = dmu_buf_hold(bpo->bpo_os, - bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0); - if (err) + ASSERT3P(bpo, !=, NULL); + ASSERT(MUTEX_HELD(&bpo->bpo_lock)); + ASSERT(bpobj_is_open(bpo)); + + if (free) + dmu_buf_will_dirty(bpo->bpo_dbuf, tx); + + if (bpi->bpi_visited == B_FALSE) { + err = bpobj_iterate_blkptrs(bpi, func, arg, tx, free); + bpi->bpi_visited = B_TRUE; + if (err != 0) break; } + /* + * We've finished with this bpo's directly-referenced BP's and + * it has no more unprocessed subobjs. We can free its + * bpobj_info_t (unless it is the topmost, initial_bpo). + * If we are freeing from disk, we can also do that. + */ + if (bpi->bpi_unprocessed_subobjs == 0) { + /* + * If there are no entries, there should + * be no bytes. + */ + if (bpobj_is_empty(bpo)) { + ASSERT0(bpo->bpo_phys->bpo_bytes); + ASSERT0(bpo->bpo_phys->bpo_comp); + ASSERT0(bpo->bpo_phys->bpo_uncomp); + } - ASSERT3U(offset, >=, dbuf->db_offset); - ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); + /* The initial_bpo has no parent and is not closed. */ + if (bpi->bpi_parent != NULL) { + if (free) { + bpobj_t *p = bpi->bpi_parent->bpi_bpo; + + ASSERT0(bpo->bpo_phys->bpo_num_blkptrs); + ASSERT3U(p->bpo_phys->bpo_num_subobjs, + >, 0); + ASSERT3U(bpi->bpi_index, ==, + p->bpo_phys->bpo_num_subobjs - 1); + ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, + tx)); + + p->bpo_phys->bpo_num_subobjs--; + + VERIFY0(dmu_free_range(p->bpo_os, + p->bpo_phys->bpo_subobjs, + bpi->bpi_index * sizeof (uint64_t), + sizeof (uint64_t), tx)); + + /* eliminate the empty subobj list */ + if (bpo->bpo_havesubobj && + bpo->bpo_phys->bpo_subobjs != 0) { + ASSERT0(bpo->bpo_phys-> + bpo_num_subobjs); + err = dmu_object_free( + bpo->bpo_os, + bpo->bpo_phys->bpo_subobjs, + tx); + if (err) + break; + bpo->bpo_phys->bpo_subobjs = 0; + } + err = dmu_object_free(p->bpo_os, + bpo->bpo_object, tx); + if (err) + break; + } + + mutex_exit(&bpo->bpo_lock); + bpobj_close(bpo); + kmem_free(bpo, sizeof (bpobj_t)); + } else { + mutex_exit(&bpo->bpo_lock); + } - objarray = dbuf->db_data; - err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]); - if (err) - break; - if (free) { - err = bpobj_space(&sublist, - &used_before, &comp_before, &uncomp_before); - if (err != 0) { - bpobj_close(&sublist); + /* + * Finished processing this bpo. Unlock, and free + * our "stack" info. + */ + list_remove_head(&stack); + kmem_free(bpi, sizeof (bpobj_info_t)); + } else { + /* + * We have unprocessed subobjs. 
Process the next one. + */ + ASSERT(bpo->bpo_havecomp); + + /* Add the last subobj to stack. */ + int64_t i = bpi->bpi_unprocessed_subobjs - 1; + uint64_t offset = i * sizeof (uint64_t); + + uint64_t obj_from_sublist; + err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, + offset, sizeof (uint64_t), &obj_from_sublist, + DMU_READ_PREFETCH); + if (err) break; - } - } - err = bpobj_iterate_impl(&sublist, func, arg, tx, free); - if (free) { - VERIFY3U(0, ==, bpobj_space(&sublist, - &used_after, &comp_after, &uncomp_after)); - bpo->bpo_phys->bpo_bytes -= used_before - used_after; - ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); - bpo->bpo_phys->bpo_comp -= comp_before - comp_after; - bpo->bpo_phys->bpo_uncomp -= - uncomp_before - uncomp_after; - } - bpobj_close(&sublist); - if (err) - break; - if (free) { - err = dmu_object_free(bpo->bpo_os, - objarray[blkoff], tx); + bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t), + KM_SLEEP); + + err = bpobj_open(sublist, bpo->bpo_os, + obj_from_sublist); if (err) break; - bpo->bpo_phys->bpo_num_subobjs--; - ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0); + + list_insert_head(&stack, bpi_alloc(sublist, bpi, i)); + mutex_enter(&sublist->bpo_lock); + bpi->bpi_unprocessed_subobjs--; } } - if (dbuf) { - dmu_buf_rele(dbuf, FTAG); - dbuf = NULL; - } - if (free) { - VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, - bpo->bpo_phys->bpo_subobjs, - (i + 1) * sizeof (uint64_t), DMU_OBJECT_END, tx)); - } + /* + * Cleanup anything left on the "stack" after we left the loop. + * Every bpo on the stack is locked so we must remember to undo + * that now (in LIFO order). + */ + while ((bpi = list_remove_head(&stack)) != NULL) { + bpobj_t *bpo = bpi->bpi_bpo; + ASSERT(err != 0); + ASSERT3P(bpo, !=, NULL); -out: - /* If there are no entries, there should be no bytes. */ - if (bpobj_is_empty(bpo)) { - ASSERT0(bpo->bpo_phys->bpo_bytes); - ASSERT0(bpo->bpo_phys->bpo_comp); - ASSERT0(bpo->bpo_phys->bpo_uncomp); + mutex_exit(&bpo->bpo_lock); + + /* do not free the initial_bpo */ + if (bpi->bpi_parent != NULL) { + bpobj_close(bpi->bpi_bpo); + kmem_free(bpi->bpi_bpo, sizeof (bpobj_t)); + } + kmem_free(bpi, sizeof (bpobj_info_t)); } - mutex_exit(&bpo->bpo_lock); + list_destroy(&stack); + return (err); } diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index 38c4a83cb150..648e70d53f8f 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. @@ -53,13 +53,12 @@ static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); +static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr); -#ifndef __lint extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync, dmu_buf_evict_func_t *evict_func_async, dmu_buf_t **clear_on_evict_dbufp); -#endif /* ! __lint */ /* * Global data structures and functions for the dbuf cache. @@ -104,8 +103,8 @@ static boolean_t dbuf_evict_thread_exit; * by those caches' matching enum values (from dbuf_cached_state_t). 
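+ * (The size refcount below is kept on its own cache line via + * __cacheline_aligned, presumably so that frequent size updates do not + * contend with the embedded multilist.)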
*/ typedef struct dbuf_cache { - multilist_t *cache; - zfs_refcount_t size; + multilist_t cache; + zfs_refcount_t size __cacheline_aligned; } dbuf_cache_t; dbuf_cache_t dbuf_caches[DB_CACHE_MAX]; @@ -519,9 +518,9 @@ dbuf_cache_above_lowater(void) static void dbuf_evict_one(void) { - int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache); + int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache); multilist_sublist_t *mls = multilist_sublist_lock( - dbuf_caches[DB_DBUF_CACHE].cache, idx); + &dbuf_caches[DB_DBUF_CACHE].cache, idx); ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); @@ -667,8 +666,8 @@ dbuf_init(void) dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0); for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { - dbuf_caches[dcs].cache = - multilist_create(sizeof (dmu_buf_impl_t), + multilist_create(&dbuf_caches[dcs].cache, + sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_cache_link), dbuf_cache_multilist_index_func); zfs_refcount_create(&dbuf_caches[dcs].size); @@ -706,7 +705,7 @@ dbuf_fini(void) for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { zfs_refcount_destroy(&dbuf_caches[dcs].size); - multilist_destroy(dbuf_caches[dcs].cache); + multilist_destroy(&dbuf_caches[dcs].cache); } } @@ -720,6 +719,7 @@ dbuf_verify(dmu_buf_impl_t *db) { dnode_t *dn; dbuf_dirty_record_t *dr; + uint32_t txg_prev; ASSERT(MUTEX_HELD(&db->db_mtx)); @@ -751,11 +751,16 @@ dbuf_verify(dmu_buf_impl_t *db) ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); } - for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) - ASSERT(dr->dr_dbuf == db); - - for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) + if ((dr = list_head(&db->db_dirty_records)) != NULL) { ASSERT(dr->dr_dbuf == db); + txg_prev = dr->dr_txg; + for (dr = list_next(&db->db_dirty_records, dr); dr != NULL; + dr = list_next(&db->db_dirty_records, dr)) { + ASSERT(dr->dr_dbuf == db); + ASSERT(txg_prev > dr->dr_txg); + txg_prev = dr->dr_txg; + } + } /* * We can't assert that db_size matches dn_datablksz because it @@ -1121,7 +1126,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, } ASSERT3U(bonuslen, <=, db->db.db_size); - db->db.db_data = zio_buf_alloc(max_bonuslen); + db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP); arc_space_consume(max_bonuslen, ARC_SPACE_BONUS); if (bonuslen < max_bonuslen) bzero(db->db.db_data, max_bonuslen); @@ -1255,7 +1260,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, static void dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) { - dbuf_dirty_record_t *dr = db->db_last_dirty; + dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db.db_data != NULL); @@ -1276,10 +1281,9 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) */ ASSERT3U(dr->dr_txg, >=, txg - 2); if (db->db_blkid == DMU_BONUS_BLKID) { - /* Note that the data bufs here are zio_bufs */ dnode_t *dn = DB_DNODE(db); int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); - dr->dt.dl.dr_data = zio_buf_alloc(bonuslen); + dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP); arc_space_consume(bonuslen, ARC_SPACE_BONUS); bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen); } else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) { @@ -1398,8 +1402,17 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) DB_DNODE_EXIT(db); - if (!err && need_wait) - err = zio_wait(zio); + /* + * If we created a zio_root we must execute it to avoid + * leaking it, even if it isn't attached to 
any work due + * to an error in dbuf_read_impl(). + */ + if (need_wait) { + if (err == 0) + err = zio_wait(zio); + else + (void) zio_wait(zio); + } } else { /* * Another reader came in while the dbuf was in flight @@ -1566,9 +1579,10 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, } /* The dbuf is referenced */ - if (db->db_last_dirty != NULL) { - dbuf_dirty_record_t *dr = db->db_last_dirty; + if (!list_is_empty(&db->db_dirty_records)) { + dbuf_dirty_record_t *dr; + dr = list_head(&db->db_dirty_records); if (dr->dr_txg == txg) { /* * This buffer is "in-use", re-adjust the file @@ -1608,6 +1622,7 @@ void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) { arc_buf_t *buf, *obuf; + dbuf_dirty_record_t *dr; int osize = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dnode_t *dn; @@ -1638,10 +1653,12 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) arc_buf_destroy(obuf, db); db->db.db_size = size; - if (db->db_level == 0) { - ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); - db->db_last_dirty->dt.dl.dr_data = buf; - } + dr = list_head(&db->db_dirty_records); + if (db->db_level == 0) + dr->dt.dl.dr_data = buf; + ASSERT3U(dr->dr_txg, ==, tx->tx_txg); + ASSERT3U(dr->dr_accounted, ==, osize); + dr->dr_accounted = size; mutex_exit(&db->db_mtx); dmu_objset_willuse_space(dn->dn_objset, size - osize, tx); @@ -1687,12 +1704,80 @@ dbuf_redirty(dbuf_dirty_record_t *dr) } } +dbuf_dirty_record_t * +dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx) +{ + rw_enter(&dn->dn_struct_rwlock, RW_READER); + IMPLY(dn->dn_objset->os_raw_receive, dn->dn_maxblkid >= blkid); + dnode_new_blkid(dn, blkid, tx, B_TRUE, B_FALSE); + ASSERT(dn->dn_maxblkid >= blkid); + + dbuf_dirty_record_t *dr = kmem_zalloc(sizeof (*dr), KM_SLEEP); + list_link_init(&dr->dr_dirty_node); + list_link_init(&dr->dr_dbuf_node); + dr->dr_dnode = dn; + dr->dr_txg = tx->tx_txg; + dr->dt.dll.dr_blkid = blkid; + dr->dr_accounted = dn->dn_datablksz; + + /* + * There should not be any dbuf for the block that we're dirtying. + * Otherwise the buffer contents could be inconsistent between the + * dbuf and the lightweight dirty record. 
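+ * (The contents of a lightweight-dirtied block travel with the dirty + * record itself, so an attached dbuf would be a second, possibly + * divergent, copy.)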
+ */ + ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid)); + + mutex_enter(&dn->dn_mtx); + int txgoff = tx->tx_txg & TXG_MASK; + if (dn->dn_free_ranges[txgoff] != NULL) { + range_tree_clear(dn->dn_free_ranges[txgoff], blkid, 1); + } + + if (dn->dn_nlevels == 1) { + ASSERT3U(blkid, <, dn->dn_nblkptr); + list_insert_tail(&dn->dn_dirty_records[txgoff], dr); + mutex_exit(&dn->dn_mtx); + rw_exit(&dn->dn_struct_rwlock); + dnode_setdirty(dn, tx); + } else { + mutex_exit(&dn->dn_mtx); + + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + dmu_buf_impl_t *parent_db = dbuf_hold_level(dn, + 1, blkid >> epbs, FTAG); + rw_exit(&dn->dn_struct_rwlock); + if (parent_db == NULL) { + kmem_free(dr, sizeof (*dr)); + return (NULL); + } + int err = dbuf_read(parent_db, NULL, + (DB_RF_NOPREFETCH | DB_RF_CANFAIL)); + if (err != 0) { + dbuf_rele(parent_db, FTAG); + kmem_free(dr, sizeof (*dr)); + return (NULL); + } + + dbuf_dirty_record_t *parent_dr = dbuf_dirty(parent_db, tx); + dbuf_rele(parent_db, FTAG); + mutex_enter(&parent_dr->dt.di.dr_mtx); + ASSERT3U(parent_dr->dr_txg, ==, tx->tx_txg); + list_insert_tail(&parent_dr->dt.di.dr_children, dr); + mutex_exit(&parent_dr->dt.di.dr_mtx); + dr->dr_parent = parent_dr; + } + + dmu_objset_willuse_space(dn->dn_objset, dr->dr_accounted, tx); + + return (dr); +} + dbuf_dirty_record_t * dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn; objset_t *os; - dbuf_dirty_record_t **drp, *dr; + dbuf_dirty_record_t *dr, *dr_next, *dr_head; int txgoff = tx->tx_txg & TXG_MASK; boolean_t drop_struct_rwlock = B_FALSE; @@ -1770,17 +1855,16 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* * If this buffer is already dirty, we're done. */ - drp = &db->db_last_dirty; - ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || + dr_head = list_head(&db->db_dirty_records); + ASSERT(dr_head == NULL || dr_head->dr_txg <= tx->tx_txg || db->db.db_object == DMU_META_DNODE_OBJECT); - while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) - drp = &dr->dr_next; - if (dr && dr->dr_txg == tx->tx_txg) { + dr_next = dbuf_find_dirty_lte(db, tx->tx_txg); + if (dr_next && dr_next->dr_txg == tx->tx_txg) { DB_DNODE_EXIT(db); - dbuf_redirty(dr); + dbuf_redirty(dr_next); mutex_exit(&db->db_mtx); - return (dr); + return (dr_next); } /* @@ -1823,6 +1907,9 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) * transaction group won't leak out when we sync the older txg. */ dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); + list_link_init(&dr->dr_dirty_node); + list_link_init(&dr->dr_dbuf_node); + dr->dr_dnode = dn; if (db->db_level == 0) { void *data_old = db->db_buf; @@ -1853,12 +1940,11 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); } - if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) + if (db->db_blkid != DMU_BONUS_BLKID) dr->dr_accounted = db->db.db_size; dr->dr_dbuf = db; dr->dr_txg = tx->tx_txg; - dr->dr_next = *drp; - *drp = dr; + list_insert_before(&db->db_dirty_records, dr_next, dr); /* * We could have been freed_in_flight between the dbuf_noread @@ -1956,7 +2042,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) * Since we've dropped the mutex, it's possible that * dbuf_undirty() might have changed this out from under us. 
*/ - if (db->db_last_dirty == dr || + if (list_head(&db->db_dirty_records) == dr || dn->dn_object == DMU_META_DNODE_OBJECT) { mutex_enter(&di->dt.di.dr_mtx); ASSERT3U(di->dr_txg, ==, tx->tx_txg); @@ -1983,6 +2069,30 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) return (dr); } +static void +dbuf_undirty_bonus(dbuf_dirty_record_t *dr) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + + if (dr->dt.dl.dr_data != db->db.db_data) { + struct dnode *dn = dr->dr_dnode; + int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); + + kmem_free(dr->dt.dl.dr_data, max_bonuslen); + arc_space_return(max_bonuslen, ARC_SPACE_BONUS); + } + db->db_data_pending = NULL; + ASSERT(list_next(&db->db_dirty_records, dr) == NULL); + list_remove(&db->db_dirty_records, dr); + if (dr->dr_dbuf->db_level != 0) { + mutex_destroy(&dr->dt.di.dr_mtx); + list_destroy(&dr->dt.di.dr_children); + } + kmem_free(dr, sizeof (dbuf_dirty_record_t)); + ASSERT3U(db->db_dirtycnt, >, 0); + db->db_dirtycnt -= 1; +} + /* * Undirty a buffer in the transaction group referenced by the given * transaction. Return whether this evicted the dbuf. @@ -1990,9 +2100,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { - dnode_t *dn; uint64_t txg = tx->tx_txg; - dbuf_dirty_record_t *dr, **drp; ASSERT(txg != 0); @@ -2012,16 +2120,12 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* * If this buffer is not dirty, we're done. */ - for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) - if (dr->dr_txg <= txg) - break; - if (dr == NULL || dr->dr_txg < txg) + dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, txg); + if (dr == NULL) return (B_FALSE); - ASSERT(dr->dr_txg == txg); ASSERT(dr->dr_dbuf == db); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); + dnode_t *dn = dr->dr_dnode; dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); @@ -2030,7 +2134,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset), dr->dr_accounted, txg); - *drp = dr->dr_next; + list_remove(&db->db_dirty_records, dr); /* * Note that there are three places in dbuf_dirty() @@ -2049,7 +2153,6 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); mutex_exit(&dn->dn_mtx); } - DB_DNODE_EXIT(db); if (db->db_state != DB_NOFILL) { dbuf_unoverride(dr); @@ -2089,15 +2192,15 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) * cached). */ mutex_enter(&db->db_mtx); - dbuf_dirty_record_t *dr; - for (dr = db->db_last_dirty; - dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) { + + if (db->db_state == DB_CACHED) { + dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg); /* * It's possible that it is already dirty but not cached, * because there are some calls to dbuf_dirty() that don't * go through dmu_buf_will_dirty(). */ - if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) { + if (dr != NULL) { /* This dbuf is already dirty and cached. 
*/ dbuf_redirty(dr); mutex_exit(&db->db_mtx); @@ -2121,6 +2224,18 @@ dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx); } +boolean_t +dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dbuf_dirty_record_t *dr; + + mutex_enter(&db->db_mtx); + dr = dbuf_find_dirty_eq(db, tx->tx_txg); + mutex_exit(&db->db_mtx); + return (dr != NULL); +} + void dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { @@ -2172,12 +2287,9 @@ dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder, dmu_buf_will_dirty_impl(db_fake, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx); - dr = db->db_last_dirty; - while (dr != NULL && dr->dr_txg > tx->tx_txg) - dr = dr->dr_next; + dr = dbuf_find_dirty_eq(db, tx->tx_txg); ASSERT3P(dr, !=, NULL); - ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dr->dt.dl.dr_has_raw_params = B_TRUE; dr->dt.dl.dr_byteorder = byteorder; @@ -2217,6 +2329,7 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; struct dirty_leaf *dl; dmu_object_type_t type; + dbuf_dirty_record_t *dr; if (etype == BP_EMBEDDED_TYPE_DATA) { ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset), @@ -2232,8 +2345,9 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, dmu_buf_will_not_fill(dbuf, tx); - ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); - dl = &db->db_last_dirty->dt.dl; + dr = list_head(&db->db_dirty_records); + ASSERT3U(dr->dr_txg, ==, tx->tx_txg); + dl = &dr->dt.dl; encode_embedded_bp_compressed(&dl->dr_overridden_by, data, comp, uncompressed_size, compressed_size); BPE_SET_ETYPE(&dl->dr_overridden_by, etype); @@ -2242,7 +2356,7 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); dl->dr_override_state = DR_OVERRIDDEN; - dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg; + dl->dr_overridden_by.blk_birth = dr->dr_txg; } /* @@ -2289,7 +2403,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) xuio_stat_wbuf_nocopy(); if (db->db_state == DB_CACHED) { - dbuf_dirty_record_t *dr = db->db_last_dirty; + dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); ASSERT(db->db_buf != NULL); if (dr != NULL && dr->dr_txg == tx->tx_txg) { @@ -2335,7 +2449,7 @@ dbuf_destroy(dmu_buf_impl_t *db) int slots = DB_DNODE(db)->dn_num_slots; int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); if (db->db.db_data != NULL) { - zio_buf_free(db->db.db_data, bonuslen); + kmem_free(db->db.db_data, bonuslen); arc_space_return(bonuslen, ARC_SPACE_BONUS); db->db_state = DB_UNCACHED; } @@ -2347,7 +2461,7 @@ dbuf_destroy(dmu_buf_impl_t *db) ASSERT(db->db_caching_status == DB_DBUF_CACHE || db->db_caching_status == DB_DBUF_METADATA_CACHE); - multilist_remove(dbuf_caches[db->db_caching_status].cache, db); + multilist_remove(&dbuf_caches[db->db_caching_status].cache, db); (void) zfs_refcount_remove_many( &dbuf_caches[db->db_caching_status].size, db->db.db_size, db); @@ -2528,11 +2642,13 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP); + list_create(&db->db_dirty_records, sizeof (dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, dr_dbuf_node)); + db->db_objset = os; db->db.db_object = dn->dn_object; db->db_level = level; db->db_blkid = blkid; - db->db_last_dirty = NULL; db->db_dirtycnt = 0; db->db_dnode_handle = dn->dn_handle; db->db_parent = parent; @@ -3008,7 +3124,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, 
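The dmu_buf_is_dirty() entry point added above gives DMU consumers a locked, one-shot way to ask whether a held buffer already has a dirty record in an assigned tx's txg. A hypothetical caller (the wrapper function here is invented for illustration; dmu_buf_will_dirty() is already safe to call repeatedly):

static void
dirty_once(dmu_buf_t *db, dmu_tx_t *tx)
{
        /*
         * Only pay the cost of the dirty path (or any extra per-first-dirty
         * bookkeeping a caller might want) when the buffer is not yet
         * dirty in this txg.
         */
        if (!dmu_buf_is_dirty(db, tx))
                dmu_buf_will_dirty(db, tx);
}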
 		ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
 		    db->db_caching_status == DB_DBUF_METADATA_CACHE);
 
-		multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
+		multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
 		(void) zfs_refcount_remove_many(
 		    &dbuf_caches[db->db_caching_status].size,
 		    db->db.db_size, db);
@@ -3241,7 +3357,7 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting)
 			    DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
 			db->db_caching_status = dcs;
 
-			multilist_insert(dbuf_caches[dcs].cache, db);
+			multilist_insert(&dbuf_caches[dcs].cache, db);
 			(void) zfs_refcount_add_many(
 			    &dbuf_caches[dcs].size, db->db.db_size, db);
 			mutex_exit(&db->db_mtx);
@@ -3410,6 +3526,28 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
 	}
 }
 
+static void
+dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = dr->dr_dbuf;
+	void *data = dr->dt.dl.dr_data;
+
+	ASSERT0(db->db_level);
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	ASSERT(db->db_blkid == DMU_BONUS_BLKID);
+	ASSERT(data != NULL);
+
+	dnode_t *dn = dr->dr_dnode;
+	ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,
+	    DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
+	bcopy(data, DN_BONUS(dn->dn_phys), DN_MAX_BONUS_LEN(dn->dn_phys));
+
+	dbuf_sync_leaf_verify_bonus_dnode(dr);
+
+	dbuf_undirty_bonus(dr);
+	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
+}
+
 /*
  * When syncing out blocks of dnodes, adjust the block to deal with
  * encryption. Normally, we make sure the block is decrypted before writing
@@ -3458,8 +3596,7 @@ static void
 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
-	dnode_t *dn;
-	zio_t *zio;
+	dnode_t *dn = dr->dr_dnode;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
@@ -3479,12 +3616,9 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 	ASSERT3U(db->db_state, ==, DB_CACHED);
 	ASSERT(db->db_buf != NULL);
 
-	DB_DNODE_ENTER(db);
-	dn = DB_DNODE(db);
 	/* Indirect block size must match what the dnode thinks it is. */
 	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
 	dbuf_check_blkptr(dn, db);
-	DB_DNODE_EXIT(db);
 
 	/* Provide the pending dirty record to child dbufs */
 	db->db_data_pending = dr;
@@ -3493,7 +3627,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 	dbuf_write(dr, db->db_buf, tx);
 
-	zio = dr->dr_zio;
+	zio_t *zio = dr->dr_zio;
 	mutex_enter(&dr->dt.di.dr_mtx);
 	dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
 	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
@@ -3501,12 +3635,199 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 	zio_nowait(zio);
 }
 
+/*
+ * Verify that the size of the data in our bonus buffer does not exceed
+ * its recorded size.
+ *
+ * The purpose of this verification is to catch any cases in development
+ * where the size of a phys structure (i.e. space_map_phys_t) grows and,
+ * due to incorrect feature management, older pools expect to read more
+ * data even though they didn't actually write it to begin with.
+ *
+ * For example, this would catch an error in the feature logic where we
+ * open an older pool and we expect to write the space map histogram of
+ * a space map with size SPACE_MAP_SIZE_V0.
+ */
+static void
+dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr)
+{
+#ifdef ZFS_DEBUG
+	dnode_t *dn = dr->dr_dnode;
+
+	/*
+	 * Encrypted bonus buffers can have data past their bonuslen.
+	 * Skip the verification of these blocks.
+ */ + if (DMU_OT_IS_ENCRYPTED(dn->dn_bonustype)) + return; + + uint16_t bonuslen = dn->dn_phys->dn_bonuslen; + uint16_t maxbonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); + ASSERT3U(bonuslen, <=, maxbonuslen); + + arc_buf_t *datap = dr->dt.dl.dr_data; + char *datap_end = ((char *)datap) + bonuslen; + char *datap_max = ((char *)datap) + maxbonuslen; + + /* ensure that everything is zero after our data */ + for (; datap_end < datap_max; datap_end++) + ASSERT(*datap_end == 0); +#endif +} + +static blkptr_t * +dbuf_lightweight_bp(dbuf_dirty_record_t *dr) +{ + /* This must be a lightweight dirty record. */ + ASSERT3P(dr->dr_dbuf, ==, NULL); + dnode_t *dn = dr->dr_dnode; + + if (dn->dn_phys->dn_nlevels == 1) { + VERIFY3U(dr->dt.dll.dr_blkid, <, dn->dn_phys->dn_nblkptr); + return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]); + } else { + dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + VERIFY3U(parent_db->db_level, ==, 1); + VERIFY3P(parent_db->db_dnode_handle->dnh_dnode, ==, dn); + VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid); + blkptr_t *bp = parent_db->db.db_data; + return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]); + } +} + +static void +dbuf_lightweight_ready(zio_t *zio) +{ + dbuf_dirty_record_t *dr = zio->io_private; + blkptr_t *bp = zio->io_bp; + + if (zio->io_error != 0) + return; + + dnode_t *dn = dr->dr_dnode; + + blkptr_t *bp_orig = dbuf_lightweight_bp(dr); + spa_t *spa = dmu_objset_spa(dn->dn_objset); + int64_t delta = bp_get_dsize_sync(spa, bp) - + bp_get_dsize_sync(spa, bp_orig); + dnode_diduse_space(dn, delta); + + uint64_t blkid = dr->dt.dll.dr_blkid; + mutex_enter(&dn->dn_mtx); + if (blkid > dn->dn_phys->dn_maxblkid) { + ASSERT0(dn->dn_objset->os_raw_receive); + dn->dn_phys->dn_maxblkid = blkid; + } + mutex_exit(&dn->dn_mtx); + + if (!BP_IS_EMBEDDED(bp)) { + uint64_t fill = BP_IS_HOLE(bp) ? 0 : 1; + BP_SET_FILL(bp, fill); + } + + dmu_buf_impl_t *parent_db; + EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1); + if (dr->dr_parent == NULL) { + parent_db = dn->dn_dbuf; + } else { + parent_db = dr->dr_parent->dr_dbuf; + } + rw_enter(&parent_db->db_rwlock, RW_WRITER); + *bp_orig = *bp; + rw_exit(&parent_db->db_rwlock); +} + +static void +dbuf_lightweight_physdone(zio_t *zio) +{ + dbuf_dirty_record_t *dr = zio->io_private; + dsl_pool_t *dp = spa_get_dsl(zio->io_spa); + ASSERT3U(dr->dr_txg, ==, zio->io_txg); + + /* + * The callback will be called io_phys_children times. Retire one + * portion of our dirty space each time we are called. Any rounding + * error will be cleaned up by dbuf_lightweight_done(). + */ + int delta = dr->dr_accounted / zio->io_phys_children; + dsl_pool_undirty_space(dp, delta, zio->io_txg); +} + +static void +dbuf_lightweight_done(zio_t *zio) +{ + dbuf_dirty_record_t *dr = zio->io_private; + + VERIFY0(zio->io_error); + + objset_t *os = dr->dr_dnode->dn_objset; + dmu_tx_t *tx = os->os_synctx; + + if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { + ASSERT(BP_EQUAL(zio->io_bp, &zio->io_bp_orig)); + } else { + dsl_dataset_t *ds = os->os_dsl_dataset; + (void) dsl_dataset_block_kill(ds, &zio->io_bp_orig, tx, B_TRUE); + dsl_dataset_block_born(ds, zio->io_bp, tx); + } + + /* + * See comment in dbuf_write_done(). 
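Returning to dbuf_lightweight_bp() above: the parent/slot arithmetic is easiest to see with concrete numbers. A worked example with assumed values (dn_indblkshift = 17, i.e. 128K indirect blocks, and SPA_BLKPTRSHIFT = 7 for a 128-byte blkptr_t):

        /* illustration with assumed values, not code from the patch */
        int epbs = 17 - 7;                      /* 1024 blkptrs per indirect */
        uint64_t blkid = 5000;
        uint64_t parent = blkid >> epbs;        /* level-1 block 4 */
        uint64_t slot = blkid & ((1 << epbs) - 1);      /* entry 904 */

So the block pointer for L0 block 5000 lives at entry 904 of level-1 indirect block 4, which is exactly what the VERIFY3U() calls in dbuf_lightweight_bp() check.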
+ */ + if (zio->io_phys_children == 0) { + dsl_pool_undirty_space(dmu_objset_pool(os), + dr->dr_accounted, zio->io_txg); + } else { + dsl_pool_undirty_space(dmu_objset_pool(os), + dr->dr_accounted % zio->io_phys_children, zio->io_txg); + } + + abd_free(dr->dt.dll.dr_abd); + kmem_free(dr, sizeof (*dr)); +} + +static void +dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx) +{ + dnode_t *dn = dr->dr_dnode; + zio_t *pio; + if (dn->dn_phys->dn_nlevels == 1) { + pio = dn->dn_zio; + } else { + pio = dr->dr_parent->dr_zio; + } + + zbookmark_phys_t zb = { + .zb_objset = dmu_objset_id(dn->dn_objset), + .zb_object = dn->dn_object, + .zb_level = 0, + .zb_blkid = dr->dt.dll.dr_blkid, + }; + + /* + * See comment in dbuf_write(). This is so that zio->io_bp_orig + * will have the old BP in dbuf_lightweight_done(). + */ + dr->dr_bp_copy = *dbuf_lightweight_bp(dr); + + dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset), + dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd, + dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd), + &dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL, + dbuf_lightweight_physdone, dbuf_lightweight_done, dr, + ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb); + + zio_nowait(dr->dr_zio); +} + static void dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { arc_buf_t **datap = &dr->dt.dl.dr_data; dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn; + dnode_t *dn = dr->dr_dnode; objset_t *os; uint64_t txg = tx->tx_txg; @@ -3530,9 +3851,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) } DBUF_VERIFY(db); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - if (db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; @@ -3546,33 +3864,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) * be called). */ if (db->db_blkid == DMU_BONUS_BLKID) { - dbuf_dirty_record_t **drp; - - ASSERT(*datap != NULL); - ASSERT0(db->db_level); - ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=, - DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1)); - bcopy(*datap, DN_BONUS(dn->dn_phys), - DN_MAX_BONUS_LEN(dn->dn_phys)); - DB_DNODE_EXIT(db); - - if (*datap != db->db.db_data) { - int slots = DB_DNODE(db)->dn_num_slots; - int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); - zio_buf_free(*datap, bonuslen); - arc_space_return(bonuslen, ARC_SPACE_BONUS); - } - db->db_data_pending = NULL; - drp = &db->db_last_dirty; - while (*drp != dr) - drp = &(*drp)->dr_next; - ASSERT(dr->dr_next == NULL); ASSERT(dr->dr_dbuf == db); - *drp = dr->dr_next; - kmem_free(dr, sizeof (dbuf_dirty_record_t)); - ASSERT(db->db_dirtycnt > 0); - db->db_dirtycnt -= 1; - dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE); + dbuf_sync_bonus(dr, tx); return; } @@ -3652,16 +3945,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT(!list_link_active(&dr->dr_dirty_node)); if (dn->dn_object == DMU_META_DNODE_OBJECT) { list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); - DB_DNODE_EXIT(db); } else { - /* - * Although zio_nowait() does not "wait for an IO", it does - * initiate the IO. If this is an empty write it seems plausible - * that the IO could actually be completed before the nowait - * returns. We need to DB_DNODE_EXIT() first in case - * zio_nowait() invalidates the dbuf. 
- */ - DB_DNODE_EXIT(db); zio_nowait(dr->dr_zio); } } @@ -3684,15 +3968,19 @@ dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) DMU_META_DNODE_OBJECT); break; } - if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && - dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { - VERIFY3U(dr->dr_dbuf->db_level, ==, level); - } list_remove(list, dr); + if (dr->dr_dbuf == NULL) { + dbuf_sync_lightweight(dr, tx); + } else { + if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && + dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { + VERIFY3U(dr->dr_dbuf->db_level, ==, level); + } if (dr->dr_dbuf->db_level > 0) dbuf_sync_indirect(dr, tx); else dbuf_sync_leaf(dr, tx); + } } } @@ -3857,8 +4145,7 @@ dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) /* * The callback will be called io_phys_children times. Retire one * portion of our dirty space each time we are called. Any rounding - * error will be cleaned up by dsl_pool_sync()'s call to - * dsl_pool_undirty_space(). + * error will be cleaned up by dbuf_write_done(). */ delta = dr->dr_accounted / zio->io_phys_children; dsl_pool_undirty_space(dp, delta, zio->io_txg); @@ -3873,7 +4160,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) blkptr_t *bp = db->db_blkptr; objset_t *os = db->db_objset; dmu_tx_t *tx = os->os_synctx; - dbuf_dirty_record_t **drp, *dr; ASSERT0(zio->io_error); ASSERT(db->db_blkptr == bp); @@ -3894,24 +4180,18 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) DBUF_VERIFY(db); - drp = &db->db_last_dirty; - while ((dr = *drp) != db->db_data_pending) - drp = &dr->dr_next; + dbuf_dirty_record_t *dr = db->db_data_pending; + dnode_t *dn = dr->dr_dnode; ASSERT(!list_link_active(&dr->dr_dirty_node)); ASSERT(dr->dr_dbuf == db); - ASSERT(dr->dr_next == NULL); - *drp = dr->dr_next; + ASSERT(list_next(&db->db_dirty_records, dr) == NULL); + list_remove(&db->db_dirty_records, dr); #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); - DB_DNODE_EXIT(db); } #endif @@ -3923,10 +4203,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) arc_buf_destroy(dr->dt.dl.dr_data, db); } } else { - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); if (!BP_IS_HOLE(db->db_blkptr)) { @@ -3937,17 +4213,39 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size); } - DB_DNODE_EXIT(db); mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } - kmem_free(dr, sizeof (dbuf_dirty_record_t)); cv_broadcast(&db->db_changed); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; db->db_data_pending = NULL; dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); + + /* + * If we didn't do a physical write in this ZIO and we + * still ended up here, it means that the space of the + * dbuf that we just released (and undirtied) above hasn't + * been marked as undirtied in the pool's accounting. + * + * Thus, we undirty that space in the pool's view of the + * world here. For physical writes this type of update + * happens in dbuf_write_physdone(). + * + * If we did a physical write, cleanup any rounding errors + * that came up due to writing multiple copies of a block + * on disk [see dbuf_write_physdone()]. 
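Concretely, with assumed numbers: if dr_accounted is 131075 bytes and the zio had two physical children (for example, two DVAs written), each physdone callback retires the truncated per-child share and the done callback retires the remainder:

        /* illustration with assumed values, not code from the patch */
        uint64_t dr_accounted = 131075;
        int io_phys_children = 2;
        uint64_t per_child = dr_accounted / io_phys_children;   /* 65537 */
        uint64_t leftover = dr_accounted % io_phys_children;    /* 1 */

Two dbuf_write_physdone() calls retire 131074 bytes and dbuf_write_done() undirties the final byte; when io_phys_children is 0 (no physical write at all), dbuf_write_done() undirties the whole dr_accounted amount instead.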
+ */ + if (zio->io_phys_children == 0) { + dsl_pool_undirty_space(dmu_objset_pool(os), + dr->dr_accounted, zio->io_txg); + } else { + dsl_pool_undirty_space(dmu_objset_pool(os), + dr->dr_accounted % zio->io_phys_children, zio->io_txg); + } + + kmem_free(dr, sizeof (dbuf_dirty_record_t)); } static void @@ -4136,7 +4434,7 @@ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn; + dnode_t *dn = dr->dr_dnode; objset_t *os; dmu_buf_impl_t *parent = db->db_parent; uint64_t txg = tx->tx_txg; @@ -4147,8 +4445,6 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) ASSERT(dmu_tx_is_syncing(tx)); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); os = dn->dn_objset; if (db->db_state != DB_NOFILL) { @@ -4205,8 +4501,6 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); - DB_DNODE_EXIT(db); - /* * We copy the blkptr now (rather than when we instantiate the dirty * record), because its value can change between open context and diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c index 96cdab913d18..0d687080ded4 100644 --- a/usr/src/uts/common/fs/zfs/dmu.c +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -1703,6 +1703,32 @@ dmu_return_arcbuf(arc_buf_t *buf) arc_buf_destroy(buf, FTAG); } +/* + * A "lightweight" write is faster than a regular write (e.g. + * dmu_write_by_dnode() or dmu_assign_arcbuf_by_dnode()), because it avoids the + * CPU cost of creating a dmu_buf_impl_t and arc_buf_[hdr_]_t. However, the + * data can not be read or overwritten until the transaction's txg has been + * synced. This makes it appropriate for workloads that are known to be + * (temporarily) write-only, like "zfs receive". + * + * A single block is written, starting at the specified offset in bytes. If + * the call is successful, it returns 0 and the provided abd has been + * consumed (the caller should not free it). + */ +int +dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd, + const zio_prop_t *zp, enum zio_flag flags, dmu_tx_t *tx) +{ + dbuf_dirty_record_t *dr = + dbuf_dirty_lightweight(dn, dbuf_whichblock(dn, 0, offset), tx); + if (dr == NULL) + return (SET_ERROR(EIO)); + dr->dt.dll.dr_abd = abd; + dr->dt.dll.dr_props = *zp; + dr->dt.dll.dr_flags = flags; + return (0); +} + void dmu_copy_from_buf(objset_t *os, uint64_t object, uint64_t offset, dmu_buf_t *handle, dmu_tx_t *tx) @@ -1776,8 +1802,8 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, return (SET_ERROR(EIO)); /* - * We can only assign if the offset is aligned, the arc buf is the - * same size as the dbuf, and the dbuf is not metadata. + * We can only assign if the offset is aligned and the arc buf is the + * same size as the dbuf. 
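A hedged sketch of how a consumer drives dmu_lightweight_write_by_dnode() above (the wrapper below is invented for illustration; the real caller is flush_write_batch_impl() in dmu_recv.c, later in this patch). The key contract is abd ownership: consumed on success, still the caller's on failure.

static int
lightweight_write_sketch(dnode_t *dn, uint64_t offset, abd_t *abd,
    dmu_tx_t *tx)
{
        zio_prop_t zp;
        int err;

        dmu_write_policy(dn->dn_objset, dn, 0, 0, &zp);
        err = dmu_lightweight_write_by_dnode(dn, offset, abd, &zp, 0, tx);
        if (err != 0)
                abd_free(abd);          /* not consumed on failure */
        return (err);
}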
*/ if (offset == db->db.db_offset && blksz == db->db.db_size) { dbuf_assign_arcbuf(db, buf, tx); @@ -2030,7 +2056,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; objset_t *os = db->db_objset; dsl_dataset_t *ds = os->os_dsl_dataset; - dbuf_dirty_record_t *dr; + dbuf_dirty_record_t *dr, *dr_next; dmu_sync_arg_t *dsa; zbookmark_phys_t zb; zio_prop_t zp; @@ -2078,9 +2104,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); } - dr = db->db_last_dirty; - while (dr && dr->dr_txg != txg) - dr = dr->dr_next; + dr = dbuf_find_dirty_eq(db, txg); if (dr == NULL) { /* @@ -2091,7 +2115,8 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) return (SET_ERROR(ENOENT)); } - ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg); + dr_next = list_next(&db->db_dirty_records, dr); + ASSERT(dr_next == NULL || dr_next->dr_txg < txg); if (db->db_blkptr != NULL) { /* @@ -2132,7 +2157,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) */ DB_DNODE_ENTER(db); dn = DB_DNODE(db); - if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid)) + if (dr_next != NULL || dnode_block_freed(dn, db->db_blkid)) zp.zp_nopwrite = B_FALSE; DB_DNODE_EXIT(db); diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c index 4bf7458419db..ed3280d9a9c3 100644 --- a/usr/src/uts/common/fs/zfs/dmu_objset.c +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright 2019 Joyent, Inc. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. @@ -605,7 +605,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, os->os_zil = zil_alloc(os, &os->os_zil_header); for (i = 0; i < TXG_SIZE; i++) { - os->os_dirty_dnodes[i] = multilist_create(sizeof (dnode_t), + multilist_create(&os->os_dirty_dnodes[i], sizeof (dnode_t), offsetof(dnode_t, dn_dirty_link[i]), dnode_multilist_index_func); } @@ -991,9 +991,8 @@ dmu_objset_evict_done(objset_t *os) mutex_destroy(&os->os_obj_lock); mutex_destroy(&os->os_user_ptr_lock); mutex_destroy(&os->os_upgrade_lock); - for (int i = 0; i < TXG_SIZE; i++) { - multilist_destroy(os->os_dirty_dnodes[i]); - } + for (int i = 0; i < TXG_SIZE; i++) + multilist_destroy(&os->os_dirty_dnodes[i]); spa_evicting_os_deregister(os->os_spa, os); kmem_free(os, sizeof (objset_t)); } @@ -1214,7 +1213,7 @@ dmu_objset_create_sync(void *arg, dmu_tx_t *tx) need_sync_done = B_TRUE; } VERIFY0(zio_wait(rzio)); - dmu_objset_do_userquota_updates(os, tx); + dmu_objset_sync_done(os, tx); taskq_wait(dp->dp_sync_taskq); if (txg_list_member(&dp->dp_dirty_datasets, ds, tx->tx_txg)) { ASSERT3P(ds->ds_key_mapping, !=, NULL); @@ -1567,23 +1566,13 @@ dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx) multilist_sublist_remove(list, dn); /* - * If we are not doing useraccounting (os_synced_dnodes == NULL) - * we are done with this dnode for this txg. Unset dn_dirty_txg - * if later txgs aren't dirtying it so that future holders do - * not get a stale value. Otherwise, we will do this in - * userquota_updates_task() when processing has completely - * finished for this txg. 
+ * See the comment above dnode_rele_task() for an explanation + * of why this dnode hold is always needed (even when not + * doing user accounting). */ - multilist_t *newlist = dn->dn_objset->os_synced_dnodes; - if (newlist != NULL) { - (void) dnode_add_ref(dn, newlist); - multilist_insert(newlist, dn); - } else { - mutex_enter(&dn->dn_mtx); - if (dn->dn_dirty_txg == tx->tx_txg) - dn->dn_dirty_txg = 0; - mutex_exit(&dn->dn_mtx); - } + multilist_t *newlist = &dn->dn_objset->os_synced_dnodes; + (void) dnode_add_ref(dn, newlist); + multilist_insert(newlist, dn); dnode_sync(dn, tx); } @@ -1672,6 +1661,8 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) zio_t *zio; list_t *list; dbuf_dirty_record_t *dr; + int num_sublists; + multilist_t *ml; blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP); *blkptr_copy = *os->os_rootbp; @@ -1742,28 +1733,28 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) txgoff = tx->tx_txg & TXG_MASK; - if (dmu_objset_userused_enabled(os) && - (!os->os_encrypted || !dmu_objset_is_receiving(os))) { - /* - * We must create the list here because it uses the - * dn_dirty_link[] of this txg. But it may already - * exist because we call dsl_dataset_sync() twice per txg. - */ - if (os->os_synced_dnodes == NULL) { - os->os_synced_dnodes = - multilist_create(sizeof (dnode_t), - offsetof(dnode_t, dn_dirty_link[txgoff]), - dnode_multilist_index_func); - } else { - ASSERT3U(os->os_synced_dnodes->ml_offset, ==, - offsetof(dnode_t, dn_dirty_link[txgoff])); - } + /* + * We must create the list here because it uses the + * dn_dirty_link[] of this txg. But it may already + * exist because we call dsl_dataset_sync() twice per txg. + */ + if (os->os_synced_dnodes.ml_sublists == NULL) { + multilist_create(&os->os_synced_dnodes, + sizeof (dnode_t), + offsetof(dnode_t, dn_dirty_link[txgoff]), + dnode_multilist_index_func); + } else { + ASSERT3U(os->os_synced_dnodes.ml_offset, ==, + offsetof(dnode_t, dn_dirty_link[txgoff])); } - for (int i = 0; - i < multilist_get_num_sublists(os->os_dirty_dnodes[txgoff]); i++) { + ml = &os->os_dirty_dnodes[txgoff]; + num_sublists = multilist_get_num_sublists(ml); + for (int i = 0; i < num_sublists; i++) { + if (multilist_sublist_is_empty_idx(ml, i)) + continue; sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP); - sda->sda_list = os->os_dirty_dnodes[txgoff]; + sda->sda_list = ml; sda->sda_sublist_idx = i; sda->sda_tx = tx; (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, @@ -1797,7 +1788,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg) { - return (!multilist_is_empty(os->os_dirty_dnodes[txg & TXG_MASK])); + return (!multilist_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK])); } static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; @@ -1997,7 +1988,7 @@ userquota_updates_task(void *arg) userquota_cache_t cache = { 0 }; multilist_sublist_t *list = - multilist_sublist_lock(os->os_synced_dnodes, uua->uua_sublist_idx); + multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx); ASSERT(multilist_sublist_head(list) == NULL || dmu_objset_userused_enabled(os)); @@ -2051,23 +2042,54 @@ userquota_updates_task(void *arg) dn->dn_id_flags |= DN_ID_CHKED_BONUS; } dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); - if (dn->dn_dirty_txg == spa_syncing_txg(os->os_spa)) - dn->dn_dirty_txg = 0; mutex_exit(&dn->dn_mtx); multilist_sublist_remove(list, dn); - dnode_rele(dn, os->os_synced_dnodes); + dnode_rele(dn, &os->os_synced_dnodes); } 
do_userquota_cacheflush(os, &cache, tx); multilist_sublist_unlock(list); kmem_free(uua, sizeof (*uua)); } -void -dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) +/* + * Release dnode holds from dmu_objset_sync_dnodes(). When the dnode is being + * synced (i.e. we have issued the zio's for blocks in the dnode), it can't be + * evicted because the block containing the dnode can't be evicted until it is + * written out. However, this hold is necessary to prevent the dnode_t from + * being moved (via dnode_move()) while it's still referenced by + * dbuf_dirty_record_t:dr_dnode. And dr_dnode is needed for + * dirty_lightweight_leaf-type dirty records. + * + * If we are doing user-object accounting, the dnode_rele() happens from + * userquota_updates_task() instead. + */ +static void +dnode_rele_task(void *arg) +{ + userquota_updates_arg_t *uua = arg; + objset_t *os = uua->uua_os; + + multilist_sublist_t *list = + multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx); + + dnode_t *dn; + while ((dn = multilist_sublist_head(list)) != NULL) { + multilist_sublist_remove(list, dn); + dnode_rele(dn, &os->os_synced_dnodes); + } + multilist_sublist_unlock(list); + kmem_free(uua, sizeof (*uua)); +} + +/* + * Return TRUE if userquota updates are needed. + */ +static boolean_t +dmu_objset_do_userquota_updates_prep(objset_t *os, dmu_tx_t *tx) { if (!dmu_objset_userused_enabled(os)) - return; + return (B_FALSE); /* * If this is a raw receive just return and handle accounting @@ -2077,10 +2099,10 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) * used for recovery. */ if (os->os_encrypted && dmu_objset_is_receiving(os)) - return; + return (B_FALSE); if (tx->tx_txg <= os->os_spa->spa_claim_max_txg) - return; + return (B_FALSE); /* Allocate the user/group/project used objects if necessary. */ if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { @@ -2097,21 +2119,37 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) VERIFY0(zap_create_claim(os, DMU_PROJECTUSED_OBJECT, DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); } + return (B_TRUE); +} + +/* + * Dispatch taskq tasks to dp_sync_taskq to update the user accounting, and + * also release the holds on the dnodes from dmu_objset_sync_dnodes(). + * The caller must taskq_wait(dp_sync_taskq). + */ +void +dmu_objset_sync_done(objset_t *os, dmu_tx_t *tx) +{ + boolean_t need_userquota = dmu_objset_do_userquota_updates_prep(os, tx); - for (int i = 0; - i < multilist_get_num_sublists(os->os_synced_dnodes); i++) { + int num_sublists = multilist_get_num_sublists(&os->os_synced_dnodes); + for (int i = 0; i < num_sublists; i++) { + if (multilist_sublist_is_empty_idx(&os->os_synced_dnodes, i)) + continue; userquota_updates_arg_t *uua = kmem_alloc(sizeof (*uua), KM_SLEEP); uua->uua_os = os; uua->uua_sublist_idx = i; uua->uua_tx = tx; - /* note: caller does taskq_wait() */ + (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, - userquota_updates_task, uua, 0); + need_userquota ? 
userquota_updates_task : dnode_rele_task, + uua, 0); /* callback frees uua */ } } + /* * Returns a pointer to data to find uid/gid from * @@ -2122,31 +2160,22 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) static void * dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) { - dbuf_dirty_record_t *dr, **drp; + dbuf_dirty_record_t *dr; void *data; if (db->db_dirtycnt == 0) return (db->db.db_data); /* Nothing is changing */ - for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) - if (dr->dr_txg == tx->tx_txg) - break; + dr = dbuf_find_dirty_eq(db, tx->tx_txg); if (dr == NULL) { data = NULL; } else { - dnode_t *dn; - - DB_DNODE_ENTER(dr->dr_dbuf); - dn = DB_DNODE(dr->dr_dbuf); - - if (dn->dn_bonuslen == 0 && + if (dr->dr_dnode->dn_bonuslen == 0 && dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) data = dr->dt.dl.dr_data->b_data; else data = dr->dt.dl.dr_data; - - DB_DNODE_EXIT(dr->dr_dbuf); } return (data); @@ -2973,9 +3002,17 @@ dmu_fsname(const char *snapname, char *buf) } /* - * Call when we think we're going to write/free space in open context to track - * the amount of dirty data in the open txg, which is also the amount - * of memory that can not be evicted until this txg syncs. + * Call when we think we're going to write/free space in open context + * to track the amount of dirty data in the open txg, which is also the + * amount of memory that can not be evicted until this txg syncs. + * + * Note that there are two conditions where this can be called from + * syncing context: + * + * [1] When we just created the dataset, in which case we go on with + * updating any accounting of dirty data as usual. + * [2] When we are dirtying MOS data, in which case we only update the + * pool's accounting of dirty data. */ void dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx) @@ -2985,6 +3022,6 @@ dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx) if (ds != NULL) { dsl_dir_willuse_space(ds->ds_dir, aspace, tx); - dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx); } + dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx); } diff --git a/usr/src/uts/common/fs/zfs/dmu_recv.c b/usr/src/uts/common/fs/zfs/dmu_recv.c index 4077fdb6ba89..a9658224cb53 100644 --- a/usr/src/uts/common/fs/zfs/dmu_recv.c +++ b/usr/src/uts/common/fs/zfs/dmu_recv.c @@ -57,6 +57,7 @@ #include int zfs_recv_queue_length = SPA_MAXBLOCKSIZE; +int zfs_recv_write_batch_size = 1024 * 1024; static char *dmu_recv_tag = "dmu_recv_tag"; const char *recv_clone_name = "%recv"; @@ -799,10 +800,10 @@ struct receive_record_arg { dmu_replay_record_t header; void *payload; /* Pointer to a buffer containing the payload */ /* - * If the record is a write, pointer to the arc_buf_t containing the + * If the record is a WRITE or SPILL, pointer to the abd containing the * payload. */ - arc_buf_t *arc_buf; + abd_t *abd; int payload_size; uint64_t bytes_read; /* bytes read from stream when record created */ boolean_t eos_marker; /* Marks the end of the stream */ @@ -815,8 +816,8 @@ struct receive_writer_arg { bqueue_t q; /* - * These three args are used to signal to the main thread that we're - * done. + * These three members are used to signal to the main thread when + * we're done. 
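The handshake these members implement is the usual kcondvar pattern. The producer side appears verbatim in receive_writer_thread() later in this patch; the consumer side lives in dmu_recv_stream() (not shown in this hunk) and is assumed to look like the second half of this sketch:

        /* writer thread, once the queue is drained: */
        mutex_enter(&rwa->mutex);
        rwa->done = B_TRUE;
        cv_signal(&rwa->cv);
        mutex_exit(&rwa->mutex);

        /* main thread, waiting for the writer to finish: */
        mutex_enter(&rwa->mutex);
        while (!rwa->done)
                cv_wait(&rwa->cv, &rwa->mutex);
        mutex_exit(&rwa->mutex);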
*/ kmutex_t mutex; kcondvar_t cv; @@ -833,6 +834,8 @@ struct receive_writer_arg { uint64_t max_object; /* highest object ID referenced in stream */ uint64_t bytes_read; /* bytes read when current record created */ + list_t write_batch; + /* Encryption parameters for the last received DRR_OBJECT_RANGE */ boolean_t or_crypt_params_present; uint64_t or_firstobj; @@ -1002,18 +1005,6 @@ byteswap_record(dmu_replay_record_t *drr) DO64(drr_write.drr_key.ddk_prop); DO64(drr_write.drr_compressed_size); break; - case DRR_WRITE_BYREF: - DO64(drr_write_byref.drr_object); - DO64(drr_write_byref.drr_offset); - DO64(drr_write_byref.drr_length); - DO64(drr_write_byref.drr_toguid); - DO64(drr_write_byref.drr_refguid); - DO64(drr_write_byref.drr_refobject); - DO64(drr_write_byref.drr_refoffset); - ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref. - drr_key.ddk_cksum); - DO64(drr_write_byref.drr_key.ddk_prop); - break; case DRR_WRITE_EMBEDDED: DO64(drr_write_embedded.drr_object); DO64(drr_write_embedded.drr_offset); @@ -1498,13 +1489,190 @@ receive_freeobjects(struct receive_writer_arg *rwa, return (0); } +/* + * Note: if this fails, the caller will clean up any records left on the + * rwa->write_batch list. + */ static int -receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, - arc_buf_t *abuf) +flush_write_batch_impl(struct receive_writer_arg *rwa) { - int err; - dmu_tx_t *tx; dnode_t *dn; + int err; + + if (dnode_hold(rwa->os, rwa->last_object, FTAG, &dn) != 0) + return (SET_ERROR(EINVAL)); + + struct receive_record_arg *last_rrd = list_tail(&rwa->write_batch); + struct drr_write *last_drrw = &last_rrd->header.drr_u.drr_write; + + struct receive_record_arg *first_rrd = list_head(&rwa->write_batch); + struct drr_write *first_drrw = &first_rrd->header.drr_u.drr_write; + + ASSERT3U(rwa->last_object, ==, last_drrw->drr_object); + ASSERT3U(rwa->last_offset, ==, last_drrw->drr_offset); + + dmu_tx_t *tx = dmu_tx_create(rwa->os); + dmu_tx_hold_write_by_dnode(tx, dn, first_drrw->drr_offset, + last_drrw->drr_offset - first_drrw->drr_offset + + last_drrw->drr_logical_size); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + dmu_tx_abort(tx); + dnode_rele(dn, FTAG); + return (err); + } + + struct receive_record_arg *rrd; + while ((rrd = list_head(&rwa->write_batch)) != NULL) { + struct drr_write *drrw = &rrd->header.drr_u.drr_write; + abd_t *abd = rrd->abd; + + ASSERT3U(drrw->drr_object, ==, rwa->last_object); + + if (drrw->drr_logical_size != dn->dn_datablksz) { + /* + * The WRITE record is larger than the object's block + * size. We must be receiving an incremental + * large-block stream into a dataset that previously did + * a non-large-block receive. Lightweight writes must + * be exactly one block, so we need to decompress the + * data (if compressed) and do a normal dmu_write(). 
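Stepping back to the top of flush_write_batch_impl(): the single dmu_tx_hold_write_by_dnode() covers the whole batch span in one hold. With assumed numbers, eight contiguous 128K records at offsets 0 through 7 * 128K:

        /* illustration with assumed values, not code from the patch */
        uint64_t first_off = 0;
        uint64_t last_off = 7 * 131072;         /* 917504 */
        uint64_t last_len = 131072;
        uint64_t hold = last_off - first_off + last_len;        /* 1M */

One 1M hold covers all eight queued WRITE records, rather than taking eight separate holds in eight separate transactions.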
+ */ + ASSERT3U(drrw->drr_logical_size, >, dn->dn_datablksz); + if (DRR_WRITE_COMPRESSED(drrw)) { + abd_t *decomp_abd = + abd_alloc_linear(drrw->drr_logical_size, + B_FALSE); + + err = zio_decompress_data( + drrw->drr_compressiontype, + abd, abd_to_buf(decomp_abd), + abd_get_size(abd), + abd_get_size(decomp_abd)); + + if (err == 0) { + dmu_write_by_dnode(dn, + drrw->drr_offset, + drrw->drr_logical_size, + abd_to_buf(decomp_abd), tx); + } + abd_free(decomp_abd); + } else { + dmu_write_by_dnode(dn, + drrw->drr_offset, + drrw->drr_logical_size, + abd_to_buf(abd), tx); + } + if (err == 0) + abd_free(abd); + } else { + zio_prop_t zp; + dmu_write_policy(rwa->os, dn, 0, 0, &zp); + + enum zio_flag zio_flags = 0; + + if (rwa->raw) { + zp.zp_encrypt = B_TRUE; + zp.zp_compress = drrw->drr_compressiontype; + zp.zp_byteorder = ZFS_HOST_BYTEORDER ^ + !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^ + rwa->byteswap; + bcopy(drrw->drr_salt, zp.zp_salt, + ZIO_DATA_SALT_LEN); + bcopy(drrw->drr_iv, zp.zp_iv, + ZIO_DATA_IV_LEN); + bcopy(drrw->drr_mac, zp.zp_mac, + ZIO_DATA_MAC_LEN); + if (DMU_OT_IS_ENCRYPTED(zp.zp_type)) { + zp.zp_nopwrite = B_FALSE; + zp.zp_copies = MIN(zp.zp_copies, + SPA_DVAS_PER_BP - 1); + } + zio_flags |= ZIO_FLAG_RAW; + } else if (DRR_WRITE_COMPRESSED(drrw)) { + ASSERT3U(drrw->drr_compressed_size, >, 0); + ASSERT3U(drrw->drr_logical_size, >=, + drrw->drr_compressed_size); + zp.zp_compress = drrw->drr_compressiontype; + zio_flags |= ZIO_FLAG_RAW_COMPRESS; + } else if (rwa->byteswap) { + /* + * Note: compressed blocks never need to be + * byteswapped, because WRITE records for + * metadata blocks are never compressed. The + * exception is raw streams, which are written + * in the original byteorder, and the byteorder + * bit is preserved in the BP by setting + * zp_byteorder above. + */ + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drrw->drr_type); + dmu_ot_byteswap[byteswap].ob_func( + abd_to_buf(abd), + DRR_WRITE_PAYLOAD_SIZE(drrw)); + } + + /* + * Since this data can't be read until the receive + * completes, we can do a "lightweight" write for + * improved performance. + */ + err = dmu_lightweight_write_by_dnode(dn, + drrw->drr_offset, abd, &zp, zio_flags, tx); + } + + if (err != 0) { + /* + * This rrd is left on the list, so the caller will + * free it (and the abd). + */ + break; + } + + /* + * Note: If the receive fails, we want the resume stream to + * start with the same record that we last successfully + * received (as opposed to the next record), so that we can + * verify that we are resuming from the correct location. 
+ */ + save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx); + + list_remove(&rwa->write_batch, rrd); + kmem_free(rrd, sizeof (*rrd)); + } + + dmu_tx_commit(tx); + dnode_rele(dn, FTAG); + return (err); +} + +static int +flush_write_batch(struct receive_writer_arg *rwa) +{ + if (list_is_empty(&rwa->write_batch)) + return (0); + int err = rwa->err; + if (err == 0) + err = flush_write_batch_impl(rwa); + if (err != 0) { + struct receive_record_arg *rrd; + while ((rrd = list_remove_head(&rwa->write_batch)) != NULL) { + abd_free(rrd->abd); + kmem_free(rrd, sizeof (*rrd)); + } + } + ASSERT(list_is_empty(&rwa->write_batch)); + return (err); +} + +static int +receive_process_write_record(struct receive_writer_arg *rwa, + struct receive_record_arg *rrd) +{ + int err = 0; + + ASSERT3U(rrd->header.drr_type, ==, DRR_WRITE); + struct drr_write *drrw = &rrd->header.drr_u.drr_write; if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset || !DMU_OT_IS_VALID(drrw->drr_type)) @@ -1519,51 +1687,32 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, drrw->drr_offset < rwa->last_offset)) { return (SET_ERROR(EINVAL)); } + + struct receive_record_arg *first_rrd = list_head(&rwa->write_batch); + struct drr_write *first_drrw = &first_rrd->header.drr_u.drr_write; + uint64_t batch_size = + MIN(zfs_recv_write_batch_size, DMU_MAX_ACCESS / 2); + if (first_rrd != NULL && + (drrw->drr_object != first_drrw->drr_object || + drrw->drr_offset >= first_drrw->drr_offset + batch_size)) { + err = flush_write_batch(rwa); + if (err != 0) + return (err); + } + rwa->last_object = drrw->drr_object; rwa->last_offset = drrw->drr_offset; if (rwa->last_object > rwa->max_object) rwa->max_object = rwa->last_object; - if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0) - return (SET_ERROR(EINVAL)); - - tx = dmu_tx_create(rwa->os); - dmu_tx_hold_write(tx, drrw->drr_object, - drrw->drr_offset, drrw->drr_logical_size); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err != 0) { - dmu_tx_abort(tx); - return (err); - } - - if (rwa->byteswap && !arc_is_encrypted(abuf) && - arc_get_compression(abuf) == ZIO_COMPRESS_OFF) { - dmu_object_byteswap_t byteswap = - DMU_OT_BYTESWAP(drrw->drr_type); - dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, - DRR_WRITE_PAYLOAD_SIZE(drrw)); - } - - VERIFY0(dnode_hold(rwa->os, drrw->drr_object, FTAG, &dn)); - err = dmu_assign_arcbuf_by_dnode(dn, drrw->drr_offset, abuf, tx); - if (err != 0) { - dnode_rele(dn, FTAG); - dmu_tx_commit(tx); - return (err); - } - dnode_rele(dn, FTAG); - + list_insert_tail(&rwa->write_batch, rrd); /* - * Note: If the receive fails, we want the resume stream to start - * with the same record that we last successfully received (as opposed - * to the next record), so that we can verify that we are - * resuming from the correct location. + * Return EAGAIN to indicate that we will use this rrd again, + * so the caller should not free it */ - save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx); - dmu_tx_commit(tx); - return (0); + return (EAGAIN); } /* @@ -1688,9 +1837,8 @@ receive_write_embedded(struct receive_writer_arg *rwa, static int receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, - arc_buf_t *abuf) + abd_t *abd) { - dmu_tx_t *tx; dmu_buf_t *db, *db_spill; int err; uint32_t flags = 0; @@ -1706,7 +1854,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, * the DRR_FLAG_SPILL_BLOCK flag. 
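The EAGAIN convention adopted by receive_process_write_record() above inverts the usual ownership rule: a batched record now belongs to rwa->write_batch. A caller-side sketch of the convention, condensed from the receive_writer_thread() changes later in this diff:

        int err = receive_process_write_record(rwa, rrd);
        if (err == EAGAIN) {
                /* rrd was queued on rwa->write_batch; do not free it */
        } else if (err != 0) {
                /* not batched: the caller still owns rrd and rrd->abd */
                abd_free(rrd->abd);
                kmem_free(rrd, sizeof (*rrd));
        }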
*/ if (rwa->spill && DRR_SPILL_IS_UNMODIFIED(drrs->drr_flags)) { - dmu_return_arcbuf(abuf); + abd_free(abd); return (0); } @@ -1732,7 +1880,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, return (err); } - tx = dmu_tx_create(rwa->os); + dmu_tx_t *tx = dmu_tx_create(rwa->os); dmu_tx_hold_spill(tx, db->db_object); @@ -1751,18 +1899,35 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, */ if (db_spill->db_size != drrs->drr_length) { dmu_buf_will_fill(db_spill, tx); - VERIFY(0 == dbuf_spill_set_blksz(db_spill, + VERIFY0(dbuf_spill_set_blksz(db_spill, drrs->drr_length, tx)); } - if (rwa->byteswap && !arc_is_encrypted(abuf) && - arc_get_compression(abuf) == ZIO_COMPRESS_OFF) { - dmu_object_byteswap_t byteswap = - DMU_OT_BYTESWAP(drrs->drr_type); - dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, - DRR_SPILL_PAYLOAD_SIZE(drrs)); + arc_buf_t *abuf; + if (rwa->raw) { + boolean_t byteorder = ZFS_HOST_BYTEORDER ^ + !!DRR_IS_RAW_BYTESWAPPED(drrs->drr_flags) ^ + rwa->byteswap; + + abuf = arc_loan_raw_buf(dmu_objset_spa(rwa->os), + drrs->drr_object, byteorder, drrs->drr_salt, + drrs->drr_iv, drrs->drr_mac, drrs->drr_type, + drrs->drr_compressed_size, drrs->drr_length, + drrs->drr_compressiontype); + } else { + abuf = arc_loan_buf(dmu_objset_spa(rwa->os), + DMU_OT_IS_METADATA(drrs->drr_type), + drrs->drr_length); + if (rwa->byteswap) { + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drrs->drr_type); + dmu_ot_byteswap[byteswap].ob_func(abd_to_buf(abd), + DRR_SPILL_PAYLOAD_SIZE(drrs)); + } } + bcopy(abd_to_buf(abd), abuf->b_data, DRR_SPILL_PAYLOAD_SIZE(drrs)); + abd_free(abd); dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx); dmu_buf_rele(db, FTAG); @@ -2093,53 +2258,19 @@ receive_read_record(struct receive_arg *ra) case DRR_WRITE: { struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write; - arc_buf_t *abuf; - boolean_t is_meta = DMU_OT_IS_METADATA(drrw->drr_type); - - if (ra->raw) { - boolean_t byteorder = ZFS_HOST_BYTEORDER ^ - !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^ - ra->byteswap; - - abuf = arc_loan_raw_buf(dmu_objset_spa(ra->os), - drrw->drr_object, byteorder, drrw->drr_salt, - drrw->drr_iv, drrw->drr_mac, drrw->drr_type, - drrw->drr_compressed_size, drrw->drr_logical_size, - drrw->drr_compressiontype); - } else if (DRR_WRITE_COMPRESSED(drrw)) { - ASSERT3U(drrw->drr_compressed_size, >, 0); - ASSERT3U(drrw->drr_logical_size, >=, - drrw->drr_compressed_size); - ASSERT(!is_meta); - abuf = arc_loan_compressed_buf( - dmu_objset_spa(ra->os), - drrw->drr_compressed_size, drrw->drr_logical_size, - drrw->drr_compressiontype); - } else { - abuf = arc_loan_buf(dmu_objset_spa(ra->os), - is_meta, drrw->drr_logical_size); - } - - err = receive_read_payload_and_next_header(ra, - DRR_WRITE_PAYLOAD_SIZE(drrw), abuf->b_data); + int size = DRR_WRITE_PAYLOAD_SIZE(drrw); + abd_t *abd = abd_alloc_linear(size, B_FALSE); + err = receive_read_payload_and_next_header(ra, size, + abd_to_buf(abd)); if (err != 0) { - dmu_return_arcbuf(abuf); + abd_free(abd); return (err); } - ra->rrd->arc_buf = abuf; + ra->rrd->abd = abd; receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset, drrw->drr_logical_size); return (err); } - case DRR_WRITE_BYREF: - { - struct drr_write_byref *drrwb = - &ra->rrd->header.drr_u.drr_write_byref; - err = receive_read_payload_and_next_header(ra, 0, NULL); - receive_read_prefetch(ra, drrwb->drr_object, drrwb->drr_offset, - drrwb->drr_length); - return (err); - } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded *drrwe = @@ -2176,33 
+2307,14 @@ receive_read_record(struct receive_arg *ra)
 	case DRR_SPILL:
 	{
 		struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill;
-		arc_buf_t *abuf;
-		int len = DRR_SPILL_PAYLOAD_SIZE(drrs);
-
-		/* DRR_SPILL records are either raw or uncompressed */
-		if (ra->raw) {
-			boolean_t byteorder = ZFS_HOST_BYTEORDER ^
-			    !!DRR_IS_RAW_BYTESWAPPED(drrs->drr_flags) ^
-			    ra->byteswap;
-
-			abuf = arc_loan_raw_buf(dmu_objset_spa(ra->os),
-			    dmu_objset_id(ra->os), byteorder, drrs->drr_salt,
-			    drrs->drr_iv, drrs->drr_mac, drrs->drr_type,
-			    drrs->drr_compressed_size, drrs->drr_length,
-			    drrs->drr_compressiontype);
-		} else {
-			abuf = arc_loan_buf(dmu_objset_spa(ra->os),
-			    DMU_OT_IS_METADATA(drrs->drr_type),
-			    drrs->drr_length);
-		}
-
-		err = receive_read_payload_and_next_header(ra, len,
-		    abuf->b_data);
-		if (err != 0) {
-			dmu_return_arcbuf(abuf);
-			return (err);
-		}
-		ra->rrd->arc_buf = abuf;
+		int size = DRR_SPILL_PAYLOAD_SIZE(drrs);
+		abd_t *abd = abd_alloc_linear(size, B_FALSE);
+		err = receive_read_payload_and_next_header(ra, size,
+		    abd_to_buf(abd));
+		if (err != 0)
+			abd_free(abd);
+		else
+			ra->rrd->abd = abd;
 		return (err);
 	}
 	case DRR_OBJECT_RANGE:
@@ -2228,6 +2340,22 @@ receive_process_record(struct receive_writer_arg *rwa,
 	ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read);
 	rwa->bytes_read = rrd->bytes_read;
 
+	if (rrd->header.drr_type != DRR_WRITE) {
+		err = flush_write_batch(rwa);
+		if (err != 0) {
+			if (rrd->abd != NULL) {
+				abd_free(rrd->abd);
+				rrd->abd = NULL;
+				rrd->payload = NULL;
+			} else if (rrd->payload != NULL) {
+				kmem_free(rrd->payload, rrd->payload_size);
+				rrd->payload = NULL;
+			}
+
+			return (err);
+		}
+	}
+
 	switch (rrd->header.drr_type) {
 	case DRR_OBJECT:
 	{
@@ -2245,13 +2373,17 @@ receive_process_record(struct receive_writer_arg *rwa,
 	}
 	case DRR_WRITE:
 	{
-		struct drr_write *drrw = &rrd->header.drr_u.drr_write;
-		err = receive_write(rwa, drrw, rrd->arc_buf);
-		/* if receive_write() is successful, it consumes the arc_buf */
-		if (err != 0)
-			dmu_return_arcbuf(rrd->arc_buf);
-		rrd->arc_buf = NULL;
-		rrd->payload = NULL;
+		err = receive_process_write_record(rwa, rrd);
+		if (err != EAGAIN) {
+			/*
+			 * On success, receive_process_write_record() returns
+			 * EAGAIN to indicate that we do not want to free
+			 * the rrd or abd.
+			 */
+			ASSERT(err != 0);
+			abd_free(rrd->abd);
+			rrd->abd = NULL;
+		}
 		return (err);
 	}
 	case DRR_WRITE_BYREF:
@@ -2277,11 +2409,11 @@ receive_process_record(struct receive_writer_arg *rwa,
 	case DRR_SPILL:
 	{
 		struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
-		err = receive_spill(rwa, drrs, rrd->arc_buf);
-		/* if receive_spill() is successful, it consumes the arc_buf */
+		err = receive_spill(rwa, drrs, rrd->abd);
+		/* if receive_spill() is successful, it consumes the abd */
 		if (err != 0)
-			dmu_return_arcbuf(rrd->arc_buf);
-		rrd->arc_buf = NULL;
+			abd_free(rrd->abd);
+		rrd->abd = NULL;
 		rrd->payload = NULL;
 		return (err);
 	}
@@ -2312,19 +2444,33 @@ receive_writer_thread(void *arg)
 		 * on the queue, but we need to clear everything in it before we
 		 * can exit.
 		 */
+		int err = 0;
 		if (rwa->err == 0) {
-			rwa->err = receive_process_record(rwa, rrd);
-		} else if (rrd->arc_buf != NULL) {
-			dmu_return_arcbuf(rrd->arc_buf);
-			rrd->arc_buf = NULL;
+			err = receive_process_record(rwa, rrd);
+		} else if (rrd->abd != NULL) {
+			abd_free(rrd->abd);
+			rrd->abd = NULL;
 			rrd->payload = NULL;
 		} else if (rrd->payload != NULL) {
 			kmem_free(rrd->payload, rrd->payload_size);
 			rrd->payload = NULL;
 		}
-		kmem_free(rrd, sizeof (*rrd));
+		/*
+		 * EAGAIN indicates that this record has been saved (on
+		 * rwa->write_batch), and will be used again, so we don't
+		 * free it.
+ */ + if (err != EAGAIN) { + rwa->err = err; + kmem_free(rrd, sizeof (*rrd)); + } } kmem_free(rrd, sizeof (*rrd)); + + int err = flush_write_batch(rwa); + if (rwa->err == 0) + rwa->err = err; + mutex_enter(&rwa->mutex); rwa->done = B_TRUE; cv_signal(&rwa->cv); @@ -2514,6 +2660,8 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, rwa.raw = drc->drc_raw; rwa.spill = drc->drc_spill; rwa.os->os_raw_receive = drc->drc_raw; + list_create(&rwa.write_batch, sizeof (struct receive_record_arg), + offsetof(struct receive_record_arg, node.bqn_node)); (void) thread_create(NULL, 0, receive_writer_thread, &rwa, 0, curproc, TS_RUN, minclsyspri); @@ -2603,6 +2751,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, cv_destroy(&rwa.cv); mutex_destroy(&rwa.mutex); bqueue_destroy(&rwa.q); + list_destroy(&rwa.write_batch); if (err == 0) err = rwa.err; diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c index e40be2b71285..7d6de5e93497 100644 --- a/usr/src/uts/common/fs/zfs/dnode.c +++ b/usr/src/uts/common/fs/zfs/dnode.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 RackTop Systems. @@ -389,6 +389,14 @@ dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx) rw_enter(&dn->dn_struct_rwlock, RW_WRITER); ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - (dn->dn_nblkptr-1) * sizeof (blkptr_t)); + + if (newsize < dn->dn_bonuslen) { + /* clear any data after the end of the new size */ + size_t diff = dn->dn_bonuslen - newsize; + char *data_end = ((char *)dn->dn_bonus->db.db_data) + newsize; + bzero(data_end, diff); + } + dn->dn_bonuslen = newsize; if (newsize == 0) dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN; @@ -605,7 +613,6 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ASSERT(dn->dn_type == DMU_OT_NONE); ASSERT0(dn->dn_maxblkid); ASSERT0(dn->dn_allocated_txg); - ASSERT0(dn->dn_dirty_txg); ASSERT0(dn->dn_assigned_txg); ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds)); ASSERT3U(zfs_refcount_count(&dn->dn_holds), <=, 1); @@ -645,6 +652,8 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dn->dn_dirtyctx = 0; dn->dn_free_txg = 0; + dn->dn_dirty_txg = 0; + if (dn->dn_dirtyctx_firstset) { kmem_free(dn->dn_dirtyctx_firstset, 1); dn->dn_dirtyctx_firstset = NULL; @@ -1667,7 +1676,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) */ dmu_objset_userquota_get_ids(dn, B_TRUE, tx); - multilist_t *dirtylist = os->os_dirty_dnodes[txg & TXG_MASK]; + multilist_t *dirtylist = &os->os_dirty_dnodes[txg & TXG_MASK]; multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn); /* @@ -1802,6 +1811,7 @@ dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx) ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); + ASSERT3U(new_nlevels, >, dn->dn_nlevels); dn->dn_nlevels = new_nlevels; ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]); @@ -1819,10 +1829,12 @@ dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx) list = &dn->dn_dirty_records[txgoff]; for (dr = list_head(list); dr; dr = dr_next) { dr_next = list_next(&dn->dn_dirty_records[txgoff], dr); - if (dr->dr_dbuf->db_level != new_nlevels-1 && + + IMPLY(dr->dr_dbuf == NULL, old_nlevels == 1); + 
if (dr->dr_dbuf == NULL || + (dr->dr_dbuf->db_level == old_nlevels - 1 && dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && - dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { - ASSERT(dr->dr_dbuf->db_level == old_nlevels-1); + dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID)) { list_remove(&dn->dn_dirty_records[txgoff], dr); list_insert_tail(&new->dt.di.dr_children, dr); dr->dr_parent = new; @@ -2063,7 +2075,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); /* don't dirty if it isn't on disk and isn't dirty */ - dirty = db->db_last_dirty || + dirty = !list_is_empty(&db->db_dirty_records) || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr)); dmu_buf_unlock_parent(db, dblt, FTAG); if (dirty) { @@ -2106,7 +2118,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) /* don't dirty if not on disk and not dirty */ db_lock_type_t type = dmu_buf_lock_parent(db, RW_READER, FTAG); - dirty = db->db_last_dirty || + dirty = !list_is_empty(&db->db_dirty_records) || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr)); dmu_buf_unlock_parent(db, type, FTAG); if (dirty) { diff --git a/usr/src/uts/common/fs/zfs/dnode_sync.c b/usr/src/uts/common/fs/zfs/dnode_sync.c index 384998b717e8..d3d03a3ca29b 100644 --- a/usr/src/uts/common/fs/zfs/dnode_sync.c +++ b/usr/src/uts/common/fs/zfs/dnode_sync.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2020 Oxide Computer Company */ @@ -208,10 +208,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) continue; ASSERT(err == 0); ASSERT(child->db_level == 0); - dr = child->db_last_dirty; - while (dr && dr->dr_txg > txg) - dr = dr->dr_next; - ASSERT(dr == NULL || dr->dr_txg == txg); + dr = dbuf_find_dirty_eq(child, txg); /* data_old better be zeroed */ if (dr) { @@ -232,7 +229,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) mutex_enter(&child->db_mtx); buf = child->db.db_data; if (buf != NULL && child->db_state != DB_FILL && - child->db_last_dirty == NULL) { + list_is_empty(&child->db_dirty_records)) { for (j = 0; j < child->db.db_size >> 3; j++) { if (buf[j] != 0) { panic("freed data not zero: " @@ -537,8 +534,9 @@ dnode_undirty_dbufs(list_t *list) mutex_enter(&db->db_mtx); /* XXX - use dbuf_undirty()? */ list_remove(list, dr); - ASSERT(db->db_last_dirty == dr); - db->db_last_dirty = NULL; + ASSERT(list_head(&db->db_dirty_records) == dr); + list_remove_head(&db->db_dirty_records); + ASSERT(list_is_empty(&db->db_dirty_records)); db->db_dirtycnt -= 1; if (db->db_level == 0) { ASSERT(db->db_blkid == DMU_BONUS_BLKID || @@ -847,6 +845,8 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) /* * Although we have dropped our reference to the dnode, it * can't be evicted until its written, and we haven't yet - * initiated the IO for the dnode's dbuf. + * initiated the IO for the dnode's dbuf. Additionally, the caller + * has already added a reference to the dnode because it's on the + * os_synced_dnodes list. 
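The zeroing added to dnode_setbonuslen() in dnode.c above exists to keep dbuf_sync_leaf_verify_bonus_dnode() honest: bytes beyond a shrunken bonuslen must read back as zero. A worked example with assumed sizes (db_data stands in for dn->dn_bonus->db.db_data):

        /* illustration with assumed values, not code from the patch */
        int old_bonuslen = 320, newsize = 192;
        size_t diff = old_bonuslen - newsize;           /* 128 bytes */
        char *data_end = (char *)db_data + newsize;
        bzero(data_end, diff);                          /* zero bytes 192..319 */

After the shrink, the ZFS_DEBUG loop in dbuf_sync_leaf_verify_bonus_dnode() that walks from bonuslen up to maxbonuslen finds only zeros, as required.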
*/ } diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c index 8f07c6cef780..6428c76e60bb 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dataset.c +++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c @@ -535,6 +535,7 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, ds->ds_dbuf = dbuf; ds->ds_object = dsobj; ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0; + list_link_init(&ds->ds_synced_link); err = dsl_dir_hold_obj(dp, dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds, &ds->ds_dir); @@ -2001,10 +2002,7 @@ dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx) bplist_iterate(&ds->ds_pending_deadlist, deadlist_enqueue_cb, &ds->ds_deadlist, tx); - if (os->os_synced_dnodes != NULL) { - multilist_destroy(os->os_synced_dnodes); - os->os_synced_dnodes = NULL; - } + multilist_destroy(&os->os_synced_dnodes); if (os->os_encrypted) os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_FALSE; diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c index d3901c6f794a..e5f260edec1d 100644 --- a/usr/src/uts/common/fs/zfs/dsl_pool.c +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Integros [integros.com] @@ -552,6 +552,10 @@ dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx) zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); dmu_objset_sync(dp->dp_meta_objset, zio, tx); VERIFY0(zio_wait(zio)); + dmu_objset_sync_done(dp->dp_meta_objset, tx); + taskq_wait(dp->dp_sync_taskq); + multilist_destroy(&dp->dp_meta_objset->os_synced_dnodes); + dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); } @@ -643,15 +647,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) } VERIFY0(zio_wait(zio)); - /* - * We have written all of the accounted dirty data, so our - * dp_space_towrite should now be zero. However, some seldom-used - * code paths do not adhere to this (e.g. dbuf_undirty(), also - * rounding error in dbuf_write_physdone). - * Shore up the accounting of any dirtied space now. - */ - dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg); - /* * Update the long range free counter after * we're done syncing user data @@ -670,7 +665,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) */ for (ds = list_head(&synced_datasets); ds != NULL; ds = list_next(&synced_datasets, ds)) { - dmu_objset_do_userquota_updates(ds->ds_objset, tx); + dmu_objset_sync_done(ds->ds_objset, tx); } taskq_wait(dp->dp_sync_taskq); @@ -745,6 +740,21 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) dsl_pool_sync_mos(dp, tx); } + /* + * We have written all of the accounted dirty data, so our + * dp_space_towrite should now be zero. However, some seldom-used + * code paths do not adhere to this (e.g. dbuf_undirty()). Shore up + * the accounting of any dirtied space now. + * + * Note that, besides any dirty data from datasets, the amount of + * dirty data in the MOS is also accounted by the pool. Therefore, + * we want to do this cleanup after dsl_pool_sync_mos() so we don't + * attempt to update the accounting for the same dirty data twice. + * (i.e. 
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index 697dbec7d63c..b69ab4053cb1 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -384,7 +384,7 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
 	mc->mc_rotor = NULL;
 	mc->mc_ops = ops;
 	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
-	mc->mc_metaslab_txg_list = multilist_create(sizeof (metaslab_t),
+	multilist_create(&mc->mc_metaslab_txg_list, sizeof (metaslab_t),
 	    offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func);
 	mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
 	    sizeof (zfs_refcount_t), KM_SLEEP);
@@ -412,7 +412,7 @@ metaslab_class_destroy(metaslab_class_t *mc)
 	kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
 	    sizeof (uint64_t));
 	mutex_destroy(&mc->mc_lock);
-	multilist_destroy(mc->mc_metaslab_txg_list);
+	multilist_destroy(&mc->mc_metaslab_txg_list);
 	kmem_free(mc, sizeof (metaslab_class_t));
 }
@@ -606,7 +606,7 @@ metaslab_class_expandable_space(metaslab_class_t *mc)
 void
 metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
 {
-	multilist_t *ml = mc->mc_metaslab_txg_list;
+	multilist_t *ml = &mc->mc_metaslab_txg_list;
 	for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
 		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
 		metaslab_t *msp = multilist_sublist_head(mls);
@@ -1130,7 +1130,7 @@ metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
 		metaslab_class_t *mc = msp->ms_group->mg_class;
 		multilist_sublist_t *mls =
-		    multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+		    multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
 		if (multilist_link_active(&msp->ms_class_txg_node))
 			multilist_sublist_remove(mls, msp);
 		multilist_sublist_unlock(mls);
@@ -2152,20 +2152,20 @@ metaslab_potentially_evict(metaslab_class_t *mc)
 	uint64_t size = kmem_cache_stat(zfs_btree_leaf_cache, "buf_size");
 	int tries = 0;
 	for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size &&
-	    tries < multilist_get_num_sublists(mc->mc_metaslab_txg_list) * 2;
+	    tries < multilist_get_num_sublists(&mc->mc_metaslab_txg_list) * 2;
 	    tries++) {
 		unsigned int idx = multilist_get_random_index(
-		    mc->mc_metaslab_txg_list);
+		    &mc->mc_metaslab_txg_list);
 		multilist_sublist_t *mls =
-		    multilist_sublist_lock(mc->mc_metaslab_txg_list, idx);
+		    multilist_sublist_lock(&mc->mc_metaslab_txg_list, idx);
 		metaslab_t *msp = multilist_sublist_head(mls);
 		multilist_sublist_unlock(mls);
 		while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
 		    inuse * size) {
 			VERIFY3P(mls, ==, multilist_sublist_lock(
-			    mc->mc_metaslab_txg_list, idx));
+			    &mc->mc_metaslab_txg_list, idx));
 			ASSERT3U(idx, ==,
-			    metaslab_idx_func(mc->mc_metaslab_txg_list, msp));
+			    metaslab_idx_func(&mc->mc_metaslab_txg_list, msp));

 			if (!multilist_link_active(&msp->ms_class_txg_node)) {
 				multilist_sublist_unlock(mls);
@@ -2526,7 +2526,7 @@ metaslab_unload(metaslab_t *msp)
 	if (msp->ms_group != NULL) {
 		metaslab_class_t *mc = msp->ms_group->mg_class;
 		multilist_sublist_t *mls =
-		    multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+		    multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
 		if (multilist_link_active(&msp->ms_class_txg_node))
 			multilist_sublist_remove(mls, msp);
 		multilist_sublist_unlock(mls);
@@ -2577,7 +2577,7 @@ metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg)
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	metaslab_class_t *mc = msp->ms_group->mg_class;
 	multilist_sublist_t *mls =
-	    multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+	    multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
 	if (multilist_link_active(&msp->ms_class_txg_node))
 		multilist_sublist_remove(mls, msp);
 	msp->ms_selected_txg = txg;
@@ -5636,7 +5636,7 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
 	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(8) */
 		metaslab_class_t *mc = msp->ms_group->mg_class;
 		multilist_sublist_t *mls =
-		    multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+		    multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
 		if (!multilist_link_active(&msp->ms_class_txg_node)) {
 			msp->ms_selected_txg = txg;
 			multilist_sublist_insert_head(mls, msp);
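The metaslab.c hunks above repeat one locking pattern: hash the object to its sublist with multilist_sublist_lock_obj(), test multilist_link_active(), and only then remove. A condensed sketch of that pattern under the new embedded-multilist layout (function name is illustrative; the body is lifted from metaslab_group_remove()/metaslab_unload() above):

/*
 * Sketch: remove a metaslab from its class's per-txg multilist,
 * exactly as the hunks above do it.
 */
static void
example_remove_from_txg_list(metaslab_class_t *mc, metaslab_t *msp)
{
	multilist_sublist_t *mls =
	    multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
	if (multilist_link_active(&msp->ms_class_txg_node))
		multilist_sublist_remove(mls, msp);
	multilist_sublist_unlock(mls);
}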
diff --git a/usr/src/uts/common/fs/zfs/multilist.c b/usr/src/uts/common/fs/zfs/multilist.c
index 36b9feba84d3..1e3fb00e1091 100644
--- a/usr/src/uts/common/fs/zfs/multilist.c
+++ b/usr/src/uts/common/fs/zfs/multilist.c
@@ -65,8 +65,8 @@ multilist_d2l(multilist_t *ml, void *obj)
  * requirement, but a general rule of thumb in order to garner the
  * best multi-threaded performance out of the data structure.
  */
-static multilist_t *
-multilist_create_impl(size_t size, size_t offset,
+static void
+multilist_create_impl(multilist_t *ml, size_t size, size_t offset,
     unsigned int num, multilist_sublist_index_func_t *index_func)
 {
 	ASSERT3U(size, >, 0);
@@ -74,7 +74,6 @@ multilist_create_impl(size_t size, size_t offset,
 	ASSERT3U(num, >, 0);
 	ASSERT3P(index_func, !=, NULL);

-	multilist_t *ml = kmem_alloc(sizeof (*ml), KM_SLEEP);
 	ml->ml_offset = offset;
 	ml->ml_num_sublists = num;
 	ml->ml_index_func = index_func;
@@ -89,7 +88,6 @@ multilist_create_impl(size_t size, size_t offset,
 		mutex_init(&mls->mls_lock, NULL, MUTEX_DEFAULT, NULL);
 		list_create(&mls->mls_list, size, offset);
 	}
-	return (ml);
 }

 /*
@@ -97,8 +95,8 @@ multilist_create_impl(size_t size, size_t offset,
  * (the number of CPUs, or at least 4, or the tunable
  * zfs_multilist_num_sublists).
  */
-multilist_t *
-multilist_create(size_t size, size_t offset,
+void
+multilist_create(multilist_t *ml, size_t size, size_t offset,
     multilist_sublist_index_func_t *index_func)
 {
 	int num_sublists;
@@ -109,7 +107,7 @@ multilist_create(size_t size, size_t offset,
 		num_sublists = MAX(boot_ncpus, 4);
 	}

-	return (multilist_create_impl(size, offset, num_sublists, index_func));
+	multilist_create_impl(ml, size, offset, num_sublists, index_func);
 }

 /*
@@ -135,7 +133,7 @@ multilist_destroy(multilist_t *ml)
 	ml->ml_num_sublists = 0;
 	ml->ml_offset = 0;
-	kmem_free(ml, sizeof (multilist_t));
+	ml->ml_sublists = NULL;
 }

 /*
@@ -360,6 +358,28 @@ multilist_sublist_remove(multilist_sublist_t *mls, void *obj)
 	list_remove(&mls->mls_list, obj);
 }

+int
+multilist_sublist_is_empty(multilist_sublist_t *mls)
+{
+	ASSERT(MUTEX_HELD(&mls->mls_lock));
+	return (list_is_empty(&mls->mls_list));
+}
+
+int
+multilist_sublist_is_empty_idx(multilist_t *ml, unsigned int sublist_idx)
+{
+	multilist_sublist_t *mls;
+	int empty;
+
+	ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+	mls = &ml->ml_sublists[sublist_idx];
+	ASSERT(!MUTEX_HELD(&mls->mls_lock));
+	mutex_enter(&mls->mls_lock);
+	empty = list_is_empty(&mls->mls_list);
+	mutex_exit(&mls->mls_lock);
+	return (empty);
+}
+
 void *
 multilist_sublist_head(multilist_sublist_t *mls)
 {
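With multilist_create() now filling in caller-provided storage instead of allocating, a multilist can live directly inside its owning structure, and multilist_destroy() tears down only the sublists. A usage sketch under the new signatures (the owner struct, object struct, and index function are all illustrative):

/*
 * Sketch: an embedded multilist, created and destroyed in place.
 * example_obj_t/example_owner_t are illustrative only.
 */
typedef struct example_obj {
	multilist_node_t eo_node;
	uint64_t eo_key;
} example_obj_t;

typedef struct example_owner {
	multilist_t eo_list;	/* embedded, no longer a pointer */
} example_owner_t;

static unsigned int
example_idx_func(multilist_t *ml, void *obj)
{
	return (((example_obj_t *)obj)->eo_key %
	    multilist_get_num_sublists(ml));
}

static void
example_owner_init(example_owner_t *owner)
{
	multilist_create(&owner->eo_list, sizeof (example_obj_t),
	    offsetof(example_obj_t, eo_node), example_idx_func);
}

static void
example_owner_fini(example_owner_t *owner)
{
	/* frees the sublist array only; the multilist_t itself is embedded */
	multilist_destroy(&owner->eo_list);
}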
diff --git a/usr/src/uts/common/fs/zfs/sys/abd.h b/usr/src/uts/common/fs/zfs/sys/abd.h
index 23699c042049..91471b44a3a7 100644
--- a/usr/src/uts/common/fs/zfs/sys/abd.h
+++ b/usr/src/uts/common/fs/zfs/sys/abd.h
@@ -146,6 +146,12 @@ abd_zero(abd_t *abd, size_t size)
 	abd_zero_off(abd, 0, size);
 }

+static inline uint_t
+abd_get_size(abd_t *abd)
+{
+	return (abd->abd_size);
+}
+
 /*
  * Module lifecycle
  */
diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h
index 4590caa95656..925d959d7703 100644
--- a/usr/src/uts/common/fs/zfs/sys/arc.h
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h
@@ -42,7 +42,7 @@ extern "C" {
  * Used by arc_flush() to inform arc_evict_state() that it should evict
  * all available buffers from the arc state being passed in.
  */
-#define	ARC_EVICT_ALL	-1ULL
+#define	ARC_EVICT_ALL	UINT64_MAX

 #define	HDR_SET_LSIZE(hdr, x) do { \
 	ASSERT(IS_P2ALIGNED(x, 1U << SPA_MINBLOCKSHIFT)); \
diff --git a/usr/src/uts/common/fs/zfs/sys/arc_impl.h b/usr/src/uts/common/fs/zfs/sys/arc_impl.h
index d35b7eea2ddf..7fa86ee011f8 100644
--- a/usr/src/uts/common/fs/zfs/sys/arc_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/arc_impl.h
@@ -73,18 +73,20 @@ typedef struct arc_state {
 	/*
 	 * list of evictable buffers
 	 */
-	multilist_t *arcs_list[ARC_BUFC_NUMTYPES];
+	multilist_t arcs_list[ARC_BUFC_NUMTYPES];
+	/*
+	 * supports the "dbufs" kstat
+	 */
+	arc_state_type_t arcs_state;
 	/*
 	 * total amount of evictable data in this state
 	 */
-	zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
+	zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES] __cacheline_aligned;
 	/*
 	 * total amount of data in this state; this includes: evictable,
 	 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
 	 */
 	zfs_refcount_t arcs_size;
-
-	arc_state_type_t arcs_state;
 } arc_state_t;

 typedef struct arc_callback arc_callback_t;
@@ -863,6 +865,13 @@ typedef struct arc_stats {
 /* number of bytes in the arc from arc_buf_t's */
 #define	arc_overhead_size	ARCSTAT(arcstat_overhead_size)

+#define	arc_anon	(&ARC_anon)
+#define	arc_mru		(&ARC_mru)
+#define	arc_mru_ghost	(&ARC_mru_ghost)
+#define	arc_mfu		(&ARC_mfu)
+#define	arc_mfu_ghost	(&ARC_mfu_ghost)
+#define	arc_l2c_only	(&ARC_l2c_only)
+
 extern arc_stats_t arc_stats;	/* used in zdb.c */
diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h
index e543f6ac09dd..9ed5c8735d48 100644
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
@@ -127,8 +127,18 @@ typedef struct dbuf_dirty_record {
 	/* pointer back to our dbuf */
 	struct dmu_buf_impl *dr_dbuf;

-	/* pointer to next dirty record */
-	struct dbuf_dirty_record *dr_next;
+	/* list link for dbuf dirty records */
+	list_node_t dr_dbuf_node;
+
+	/*
+	 * The dnode we are part of. Note that the dnode can not be moved or
+	 * evicted due to the hold that's added by dnode_setdirty() or
+	 * dmu_objset_sync_dnodes(), and released by dnode_rele_task() or
+	 * userquota_updates_task(). This hold is necessary for
+	 * dirty_lightweight_leaf-type dirty records, which don't have a hold
+	 * on a dbuf.
+	 */
+	dnode_t *dr_dnode;

 	/* pointer to parent dirty record */
 	struct dbuf_dirty_record *dr_parent;
@@ -171,6 +181,17 @@ typedef struct dbuf_dirty_record {
 			uint8_t dr_iv[ZIO_DATA_IV_LEN];
 			uint8_t dr_mac[ZIO_DATA_MAC_LEN];
 		} dl;
+		struct dirty_lightweight_leaf {
+			/*
+			 * This dirty record refers to a leaf (level=0)
+			 * block, whose dbuf has not been instantiated for
+			 * performance reasons.
+			 */
+			uint64_t dr_blkid;
+			abd_t *dr_abd;
+			zio_prop_t dr_props;
+			enum zio_flag dr_flags;
+		} dll;
 	} dt;
 } dbuf_dirty_record_t;

@@ -257,8 +278,8 @@ typedef struct dmu_buf_impl {
 	kcondvar_t db_changed;
 	dbuf_dirty_record_t *db_data_pending;

-	/* pointer to most recent dirty record for this buffer */
-	dbuf_dirty_record_t *db_last_dirty;
+	/* List of dirty records for the buffer sorted newest to oldest. */
+	list_t db_dirty_records;

 	/*
 	 * Our link on the owner dnode's dn_dbufs list.
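The dirty_lightweight_leaf record above carries the ABD and write properties itself, so a level-0 write can be dirtied without instantiating a dbuf; dr_dnode supplies the dnode, since no dbuf hold exists. A hedged sketch of how a caller might drive this through dmu_lightweight_write_by_dnode(), declared in the next hunk (the exact caller obligations, alignment and hold requirements, are assumptions here, not spelled out by this patch):

/*
 * Sketch: issue one leaf write without instantiating a dbuf.
 * Assumes dn is held, offset is block-aligned, and abd/zp describe
 * exactly one block; error handling is elided.
 */
static int
example_lightweight_write(dnode_t *dn, uint64_t offset, abd_t *abd,
    const struct zio_prop *zp, dmu_tx_t *tx)
{
	return (dmu_lightweight_write_by_dnode(dn, offset, abd, zp, 0, tx));
}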
@@ -349,11 +370,16 @@ void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
 void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
 dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid,
+    dmu_tx_t *tx);
 arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
 void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
     bp_embedded_type_t etype, enum zio_compress comp,
     int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);

+int dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd,
+    const struct zio_prop *zp, enum zio_flag flags, dmu_tx_t *tx);
+
 void dbuf_destroy(dmu_buf_impl_t *db);

 void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
@@ -381,6 +407,29 @@ void dbuf_fini(void);

 boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);

+static inline dbuf_dirty_record_t *
+dbuf_find_dirty_lte(dmu_buf_impl_t *db, uint64_t txg)
+{
+	dbuf_dirty_record_t *dr;
+
+	for (dr = list_head(&db->db_dirty_records);
+	    dr != NULL && dr->dr_txg > txg;
+	    dr = list_next(&db->db_dirty_records, dr))
+		continue;
+	return (dr);
+}
+
+static inline dbuf_dirty_record_t *
+dbuf_find_dirty_eq(dmu_buf_impl_t *db, uint64_t txg)
+{
+	dbuf_dirty_record_t *dr;
+
+	dr = dbuf_find_dirty_lte(db, txg);
+	if (dr && dr->dr_txg == txg)
+		return (dr);
+	return (NULL);
+}
+
 #define	DBUF_GET_BUFC_TYPE(_db)	\
 	(dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
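dbuf_find_dirty_lte() walks db_dirty_records from the head, relying on the list being sorted newest to oldest, and dbuf_find_dirty_eq() narrows that to an exact txg; free_verify() in dnode_sync.c above already uses it in place of the old dr_next walk. A short usage sketch (locking assumption taken from the free_verify() hunk: db_mtx held):

/*
 * Sketch: test whether this buffer has a dirty record for a txg.
 * Callers are expected to hold db->db_mtx, as free_verify() does.
 */
static boolean_t
example_dirty_in_txg(dmu_buf_impl_t *db, uint64_t txg)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	return (dbuf_find_dirty_eq(db, txg) != NULL);
}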
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
index be834895c886..e63f886ae761 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -752,6 +752,7 @@ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
 * (i.e. you've called dmu_tx_hold_object(tx, db->db_object)).
 */
 void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
+boolean_t dmu_buf_is_dirty(dmu_buf_t *db, dmu_tx_t *tx);

 void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
     const uint8_t *salt, const uint8_t *iv, const uint8_t *mac,
     dmu_tx_t *tx);
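dmu_buf_is_dirty() is the public counterpart of that lookup: from its signature it presumably reports whether the buffer carries a dirty record for the given transaction's txg (the precise semantics are inferred here, not stated by this patch). A minimal, hedged sketch:

/*
 * Sketch: assert a buffer was dirtied in this tx before relying on
 * its in-memory contents.  The semantics of dmu_buf_is_dirty() are
 * inferred from its signature, not stated by this patch.
 */
static void
example_check_dirty(dmu_buf_t *db, dmu_tx_t *tx)
{
	ASSERT(dmu_buf_is_dirty(db, tx));
}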
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
index dac448d16130..10cc0ce69042 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
@@ -147,7 +147,7 @@ struct objset {
 	/* no lock needed: */
 	struct dmu_tx *os_synctx; /* XXX sketchy */
 	zil_header_t os_zil_header;
-	multilist_t *os_synced_dnodes;
+	multilist_t os_synced_dnodes;
 	uint64_t os_flags;
 	uint64_t os_freed_dnodes;
 	boolean_t os_rescan_dnodes;
@@ -166,7 +166,7 @@ struct objset {

 	/* Protected by os_lock */
 	kmutex_t os_lock;
-	multilist_t *os_dirty_dnodes[TXG_SIZE];
+	multilist_t os_dirty_dnodes[TXG_SIZE];
 	list_t os_dnodes;
 	list_t os_downgraded_dbufs;
@@ -239,7 +239,7 @@ objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
 int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
     objset_t **osp);
 void dmu_objset_evict(objset_t *os);
-void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx);
+void dmu_objset_sync_done(objset_t *os, dmu_tx_t *tx);
 void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx);
 boolean_t dmu_objset_userused_enabled(objset_t *os);
 int dmu_objset_userspace_upgrade(objset_t *os);
diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
index 6adc8fa14e5f..6e69ee8538d9 100644
--- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -200,7 +200,7 @@ struct metaslab_class {
 	 * List of all loaded metaslabs in the class, sorted in order of most
 	 * recent use.
 	 */
-	multilist_t *mc_metaslab_txg_list;
+	multilist_t mc_metaslab_txg_list;
 };

 /*
diff --git a/usr/src/uts/common/fs/zfs/sys/multilist.h b/usr/src/uts/common/fs/zfs/sys/multilist.h
index a2031da77daa..248d6252389c 100644
--- a/usr/src/uts/common/fs/zfs/sys/multilist.h
+++ b/usr/src/uts/common/fs/zfs/sys/multilist.h
@@ -72,8 +72,9 @@ struct multilist {
 	multilist_sublist_index_func_t *ml_index_func;
 };

+void multilist_create(multilist_t *, size_t, size_t,
+    multilist_sublist_index_func_t *);
 void multilist_destroy(multilist_t *);
-multilist_t *multilist_create(size_t, size_t, multilist_sublist_index_func_t *);

 void multilist_insert(multilist_t *, void *);
 void multilist_remove(multilist_t *, void *);
@@ -90,6 +91,8 @@ void multilist_sublist_insert_head(multilist_sublist_t *, void *);
 void multilist_sublist_insert_tail(multilist_sublist_t *, void *);
 void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj);
 void multilist_sublist_remove(multilist_sublist_t *, void *);
+int multilist_sublist_is_empty(multilist_sublist_t *);
+int multilist_sublist_is_empty_idx(multilist_t *, unsigned int);

 void *multilist_sublist_head(multilist_sublist_t *);
 void *multilist_sublist_tail(multilist_sublist_t *);
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_context.h b/usr/src/uts/common/fs/zfs/sys/zfs_context.h
index cd5519149882..e891a6ee8e27 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_context.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_context.h
@@ -94,6 +94,7 @@ extern "C" {
 #define	TREE_PCMP(a, b) \
 	(((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))

+#define	defclsyspri	minclsyspri

 #ifdef	__cplusplus
 }
diff --git a/usr/src/uts/common/fs/zfs/sys/zthr.h b/usr/src/uts/common/fs/zfs/sys/zthr.h
index 33c218ec4c7d..e0add6226861 100644
--- a/usr/src/uts/common/fs/zfs/sys/zthr.h
+++ b/usr/src/uts/common/fs/zfs/sys/zthr.h
@@ -35,5 +35,6 @@ extern void zthr_cancel(zthr_t *t);
 extern void zthr_resume(zthr_t *t);

 extern boolean_t zthr_iscancelled(zthr_t *t);
+extern boolean_t zthr_iscurthread(zthr_t *t);

 #endif /* _SYS_ZTHR_H */
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index fcf55b86b61d..90dec33938cd 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -517,6 +517,10 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 	vd->vdev_ops = ops;
 	vd->vdev_state = VDEV_STATE_CLOSED;
 	vd->vdev_ishole = (ops == &vdev_hole_ops);
+
+	list_link_init(&vd->vdev_config_dirty_node);
+	list_link_init(&vd->vdev_state_dirty_node);
+
 	vic->vic_prev_indirect_vdev = UINT64_MAX;

 	rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
diff --git a/usr/src/uts/common/fs/zfs/vdev_indirect_mapping.c b/usr/src/uts/common/fs/zfs/vdev_indirect_mapping.c
index 3d0f1344dd88..f311d264c022 100644
--- a/usr/src/uts/common/fs/zfs/vdev_indirect_mapping.c
+++ b/usr/src/uts/common/fs/zfs/vdev_indirect_mapping.c
@@ -405,9 +405,9 @@ vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim,

 	dmu_buf_will_dirty(vim->vim_dbuf, tx);

-	mapbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
+	mapbuf = kmem_alloc(SPA_OLD_MAXBLOCKSIZE, KM_SLEEP);
 	if (vim->vim_havecounts) {
-		countbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
+		countbuf = kmem_alloc(SPA_OLD_MAXBLOCKSIZE, KM_SLEEP);
 		ASSERT(spa_feature_is_active(vim->vim_objset->os_spa,
 		    SPA_FEATURE_OBSOLETE_COUNTS));
 	}
@@ -462,9 +462,9 @@ vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim,
 		}
 		vim->vim_phys->vimp_num_entries += i;
 	}
-	zio_buf_free(mapbuf, SPA_OLD_MAXBLOCKSIZE);
+	kmem_free(mapbuf, SPA_OLD_MAXBLOCKSIZE);
 	if (vim->vim_havecounts)
-		zio_buf_free(countbuf, SPA_OLD_MAXBLOCKSIZE);
+		kmem_free(countbuf, SPA_OLD_MAXBLOCKSIZE);

 	/*
 	 * Update the entry array to reflect the new entries. First, copy
diff --git a/usr/src/uts/common/fs/zfs/zap_micro.c b/usr/src/uts/common/fs/zfs/zap_micro.c
index ab90742c5405..600b4cdbcf56 100644
--- a/usr/src/uts/common/fs/zfs/zap_micro.c
+++ b/usr/src/uts/common/fs/zfs/zap_micro.c
@@ -603,7 +603,7 @@ mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

 	int sz = zap->zap_dbuf->db_size;
-	mzap_phys_t *mzp = zio_buf_alloc(sz);
+	mzap_phys_t *mzp = kmem_alloc(sz, KM_SLEEP);
 	bcopy(zap->zap_dbuf->db_data, mzp, sz);
 	int nchunks = zap->zap_m.zap_num_chunks;

@@ -611,7 +611,7 @@ mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
 		err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
 		    1ULL << fzap_default_block_shift, 0, tx);
 		if (err != 0) {
-			zio_buf_free(mzp, sz);
+			kmem_free(mzp, sz);
 			return (err);
 		}
 	}
@@ -637,7 +637,7 @@ mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
 		if (err != 0)
 			break;
 	}
-	zio_buf_free(mzp, sz);
+	kmem_free(mzp, sz);
 	*zapp = zap;
 	return (err);
 }
diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c
index 7a15838338ba..1b2d957d52d2 100644
--- a/usr/src/uts/common/fs/zfs/zil.c
+++ b/usr/src/uts/common/fs/zfs/zil.c
@@ -388,8 +388,18 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,

 		error = zil_read_log_block(zilog, decrypt, &blk, &next_blk,
 		    lrbuf, &end);
-		if (error != 0)
+		if (error != 0) {
+			if (claimed) {
+				char name[ZFS_MAX_DATASET_NAME_LEN];
+
+				dmu_objset_name(zilog->zl_os, name);
+
+				cmn_err(CE_WARN, "ZFS read log block error %d, "
+				    "dataset %s, seq 0x%llx\n", error, name,
+				    (u_longlong_t)blk_seq);
+			}
 			break;
+		}

 		for (lrp = lrbuf; lrp < end; lrp += reclen) {
 			lr_t *lr = (lr_t *)lrp;
@@ -413,10 +423,6 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
 	zilog->zl_parse_blk_count = blk_count;
 	zilog->zl_parse_lr_count = lr_count;

-	ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
-	    (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq) ||
-	    (decrypt && error == EIO));
-
 	zil_bp_tree_fini(zilog);
 	zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);
diff --git a/usr/src/uts/common/fs/zfs/zthr.c b/usr/src/uts/common/fs/zfs/zthr.c
index 76a9fa122b26..51da38023b58 100644
--- a/usr/src/uts/common/fs/zfs/zthr.c
+++ b/usr/src/uts/common/fs/zfs/zthr.c
@@ -429,3 +429,9 @@ zthr_iscancelled(zthr_t *t)
 	mutex_exit(&t->zthr_state_lock);
 	return (cancelled);
 }
+
+boolean_t
+zthr_iscurthread(zthr_t *t)
+{
+	return (t->zthr_thread == curthread);
+}
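zthr_iscurthread() lets code that may run either inside or outside a given zthr detect the former case, for example so the zthr's own callback never blocks waiting on itself. A small sketch of that guard (the waiting helper is illustrative, not part of this patch):

/*
 * Sketch: skip a blocking wait when already running in the zthr's
 * own context, to avoid waiting on ourselves.
 * example_wait_for_work() is hypothetical.
 */
extern void example_wait_for_work(void);

static void
example_wait_or_proceed(zthr_t *t)
{
	if (zthr_iscurthread(t))
		return;		/* the zthr itself must not block here */
	example_wait_for_work();
}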
diff --git a/usr/src/uts/common/os/taskq.c b/usr/src/uts/common/os/taskq.c
index 718f5af11956..13928f0ac192 100644
--- a/usr/src/uts/common/os/taskq.c
+++ b/usr/src/uts/common/os/taskq.c
@@ -769,15 +769,13 @@ uint_t taskq_smtbf = UINT_MAX;    /* mean time between injected failures */
 /*
  * Do-nothing task which may be used to prepopulate thread caches.
  */
-/*ARGSUSED*/
 void
-nulltask(void *unused)
+nulltask(void *unused __unused)
 {
 }

-/*ARGSUSED*/
 static int
-taskq_constructor(void *buf, void *cdrarg, int kmflags)
+taskq_constructor(void *buf, void *cdrarg __unused, int kmflags __unused)
 {
 	taskq_t *tq = buf;

@@ -796,9 +794,8 @@ taskq_constructor(void *buf, void *cdrarg, int kmflags)
 	return (0);
 }

-/*ARGSUSED*/
 static void
-taskq_destructor(void *buf, void *cdrarg)
+taskq_destructor(void *buf, void *cdrarg __unused)
 {
 	taskq_t *tq = buf;

@@ -815,21 +812,26 @@ taskq_destructor(void *buf, void *cdrarg)
 	cv_destroy(&tq->tq_maxalloc_cv);
 }

-/*ARGSUSED*/
+void
+taskq_init_ent(taskq_ent_t *tqe)
+{
+	tqe->tqent_un.tqent_flags = 0;
+	tqe->tqent_thread = NULL;
+	cv_init(&tqe->tqent_cv, NULL, CV_DEFAULT, NULL);
+}
+
 static int
-taskq_ent_constructor(void *buf, void *cdrarg, int kmflags)
+taskq_ent_constructor(void *buf, void *cdrarg __unused, int kmflags __unused)
 {
 	taskq_ent_t *tqe = buf;

-	tqe->tqent_thread = NULL;
-	cv_init(&tqe->tqent_cv, NULL, CV_DEFAULT, NULL);
+	taskq_init_ent(tqe);

 	return (0);
 }

-/*ARGSUSED*/
 static void
-taskq_ent_destructor(void *buf, void *cdrarg)
+taskq_ent_destructor(void *buf, void *cdrarg __unused)
 {
 	taskq_ent_t *tqe = buf;
diff --git a/usr/src/uts/common/sys/taskq_impl.h b/usr/src/uts/common/sys/taskq_impl.h
index 59aae4db8f6f..780a06342804 100644
--- a/usr/src/uts/common/sys/taskq_impl.h
+++ b/usr/src/uts/common/sys/taskq_impl.h
@@ -66,7 +66,7 @@ typedef struct tqstat {
 	uint_t		tqs_hits;
 	uint_t		tqs_misses;
 	uint_t		tqs_overflow;	/* no threads to allocate */
-	uint_t		tqs_tcreates;	/* threads created  */
+	uint_t		tqs_tcreates;	/* threads created */
 	uint_t		tqs_tdeaths;	/* threads died */
 	uint_t		tqs_maxthreads;	/* max # of alive threads */
 	uint_t		tqs_disptcreates;
@@ -154,7 +154,7 @@ struct taskq {
 /* Special form of taskq dispatch that uses preallocated entries. */
 void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
     taskq_ent_t *);
-
+void taskq_init_ent(taskq_ent_t *);
 #define	tq_thread tq_thr._tq_thread
 #define	tq_threadlist tq_thr._tq_threadlist
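taskq_init_ent() gives callers of taskq_dispatch_ent() a supported way to initialize an entry they allocated themselves, for example one embedded in a longer-lived structure, instead of depending on the kmem cache constructor having run. A usage sketch under that model (the embedding structure and helpers are illustrative):

/*
 * Sketch: dispatch using a caller-owned, preallocated entry.
 * example_job_t and its helpers are illustrative only.
 */
typedef struct example_job {
	taskq_ent_t ej_ent;	/* embedded, initialized by the caller */
	void *ej_arg;
} example_job_t;

static void
example_job_init(example_job_t *job, void *arg)
{
	/* initialize once, when the job is set up */
	taskq_init_ent(&job->ej_ent);
	job->ej_arg = arg;
}

static void
example_dispatch(taskq_t *tq, task_func_t func, example_job_t *job)
{
	/* cannot fail: the preallocated entry avoids any allocation */
	taskq_dispatch_ent(tq, func, job->ej_arg, 0, &job->ej_ent);
}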