diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 4458b902de31..72aa7148d4c2 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -32,7 +32,7 @@
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2019, loli10K
  * Copyright (c) 2021, Colm Buckley
- * Copyright (c) 2021, 2023, Klara Inc.
+ * Copyright (c) 2021, 2023, 2024, Klara, Inc.
  * Copyright [2021] Hewlett Packard Enterprise Development LP
  */
 
@@ -126,6 +126,7 @@ static int zpool_do_get(int, char **);
 static int zpool_do_set(int, char **);
 
 static int zpool_do_sync(int, char **);
+static int zpool_do_condense(int, char **);
 
 static int zpool_do_version(int, char **);
 
@@ -173,6 +174,7 @@ typedef enum {
 	HELP_CLEAR,
 	HELP_CREATE,
 	HELP_CHECKPOINT,
+	HELP_CONDENSE,
 	HELP_DDT_PRUNE,
 	HELP_DESTROY,
 	HELP_DETACH,
@@ -360,6 +362,16 @@ static const char *vdev_trim_state_str[] = {
 	"COMPLETE"
 };
 
+static const char *condense_type_str[POOL_CONDENSE_TYPES] = {
+	"log spacemap",
+};
+static const char *condense_type_nv_str[POOL_CONDENSE_TYPES] = {
+	"log_spacemap",
+};
+static const char *condense_type_unit_str[POOL_CONDENSE_TYPES] = {
+	"blocks",
+};
+
 #define	ZFS_NICE_TIMESTAMP	100
 
 /*
@@ -416,6 +428,7 @@ static zpool_command_t command_table[] = {
 	{ "resilver",	zpool_do_resilver,	HELP_RESILVER		},
 	{ "scrub",	zpool_do_scrub,		HELP_SCRUB		},
 	{ "trim",	zpool_do_trim,		HELP_TRIM		},
+	{ "condense",	zpool_do_condense,	HELP_CONDENSE		},
 	{ NULL },
 	{ "import",	zpool_do_import,	HELP_IMPORT		},
 	{ "export",	zpool_do_export,	HELP_EXPORT		},
@@ -427,6 +440,7 @@ static zpool_command_t command_table[] = {
 	{ NULL },
 	{ "get",	zpool_do_get,		HELP_GET		},
 	{ "set",	zpool_do_set,		HELP_SET		},
+	{ NULL },
 	{ "sync",	zpool_do_sync,		HELP_SYNC		},
 	{ NULL },
 	{ "wait",	zpool_do_wait,		HELP_WAIT		},
@@ -546,6 +560,8 @@ get_usage(zpool_help_t idx)
 		return (gettext("\treguid [-g guid] <pool>\n"));
 	case HELP_SYNC:
 		return (gettext("\tsync [pool] ...\n"));
+	case HELP_CONDENSE:
+		return (gettext("\tcondense -t <target> [-c | -w] <pool>\n"));
 	case HELP_VERSION:
 		return (gettext("\tversion [-j]\n"));
 	case HELP_WAIT:
@@ -8688,6 +8704,122 @@ zpool_do_trim(int argc, char **argv)
 	return (error);
 }
 
+typedef struct {
+	pool_condense_func_t func;
+	pool_condense_type_t type;
+} condense_cb_t;
+
+static int
+condense_cb(zpool_handle_t *zhp, void *data)
+{
+	condense_cb_t *cb = data;
+	return (zpool_condense(zhp, cb->func, cb->type));
+}
+
+/*
+ * zpool condense -t <target> [-c | -w] <pool>
+ *
+ * -t <target>	What to condense.
+ * -c		Cancel. Ends any in-progress condense.
+ * -w		Wait. Blocks until condense has completed.
+ *
+ * Condense (flush) the log spacemap on the specified pool(s).
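+ *
+ * For example, to flush the log spacemap backlog and wait for it to
+ * finish (the pool name here is purely illustrative):
+ *
+ *	# zpool condense -t log-spacemap -w tank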
+ */ +static int +zpool_do_condense(int argc, char **argv) +{ + struct option long_options[] = { + {"target", required_argument, NULL, 't'}, + {"cancel", no_argument, NULL, 'c'}, + {"wait", no_argument, NULL, 'w'}, + {0, 0, 0, 0} + }; + + struct target_map { + const char *name; + pool_condense_type_t type; + } targets[] = { + {"log-spacemap", POOL_CONDENSE_LOG_SPACEMAP}, + {0, 0} + }; + + condense_cb_t cb = { + .func = POOL_CONDENSE_START, + .type = POOL_CONDENSE_TYPES, + }; + boolean_t wait = B_FALSE; + + int c; + while ((c = getopt_long(argc, argv, "t:cw", long_options, NULL)) + != -1) { + switch (c) { + case 't': { + struct target_map *t; + for (t = targets; t->name != NULL; t++) { + if (strcmp(t->name, optarg) == 0) { + cb.type = t->type; + break; + } + } + if (t->name == NULL) { + (void) fprintf(stderr, + gettext("invalid condense target '%s'\n"), + optarg); + usage(B_FALSE); + } + break; + } + case 'c': + cb.func = POOL_CONDENSE_CANCEL; + break; + case 'w': + wait = B_TRUE; + break; + case '?': + if (optopt != 0) { + (void) fprintf(stderr, + gettext("invalid option '%c'\n"), optopt); + } else { + (void) fprintf(stderr, + gettext("invalid option '%s'\n"), + argv[optind - 1]); + } + usage(B_FALSE); + } + } + + if (cb.type == POOL_CONDENSE_TYPES) { + (void) fprintf(stderr, gettext("missing condense target\n")); + usage(B_FALSE); + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name argument\n")); + usage(B_FALSE); + return (-1); + } + + if (wait && (cb.func != POOL_CONDENSE_START)) { + (void) fprintf(stderr, gettext("-w cannot be used with -c\n")); + usage(B_FALSE); + } + + int error = for_each_pool(argc, argv, B_FALSE, NULL, ZFS_TYPE_POOL, + B_FALSE, condense_cb, &cb); + + if (wait && !error) { + zpool_wait_activity_t act = ZPOOL_WAIT_CONDENSE; + error = for_each_pool(argc, argv, B_FALSE, NULL, ZFS_TYPE_POOL, + B_FALSE, wait_callback, &act); + } + + return (error); +} + + /* * Converts a total number of seconds to a human readable string broken * down in to days/hours/minutes/seconds. 
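Aside: the subcommand above is a thin wrapper over the libzfs/libzfs_core entry points added later in this patch. As a rough sketch, not part of the patch itself (the function name and error handling are illustrative only), the same start-and-wait behaviour could be driven programmatically by pairing the new lzc_condense() with the existing lzc_wait():

    #include <libzfs_core.h>
    #include <sys/fs/zfs.h>

    /*
     * Start a log-spacemap condense on the given pool and block until
     * the kernel reports the activity has completed.
     */
    static int
    condense_and_wait(const char *pool)
    {
            boolean_t waited;
            int err;

            if ((err = libzfs_core_init()) != 0)
                    return (err);

            err = lzc_condense(pool, POOL_CONDENSE_START,
                POOL_CONDENSE_LOG_SPACEMAP);
            if (err == 0)
                    err = lzc_wait(pool, ZPOOL_WAIT_CONDENSE, &waited);

            libzfs_core_fini();
            return (err);
    }

This mirrors what zpool_do_condense() does with -w: issue the start ioctl, then reuse the generic ZPOOL_WAIT machinery rather than polling the stats nvlist.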
@@ -9767,6 +9899,55 @@ removal_status_nvlist(zpool_handle_t *zhp, status_cbdata_t *cb, } } +static void +condense_status_nvlist(nvlist_t *nvroot, status_cbdata_t *cb, nvlist_t *item) +{ + pool_condense_stat_t *pcnsp = NULL; + uint_t c; + + (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CONDENSE_STATS, + (uint64_t **)&pcnsp, &c); + if (pcnsp == NULL || c == 0) + return; + + uint_t n = MIN(POOL_CONDENSE_TYPES, + c / (sizeof (pool_condense_stat_t) / sizeof (uint64_t))); + + nvlist_t *cnv = fnvlist_alloc(); + + for (pool_condense_type_t type = 0; type < n; type++) { + pool_condense_stat_t *pcns = &pcnsp[type]; + if (pcns->pcns_start_time == 0) + continue; + + nvlist_t *nv = fnvlist_alloc(); + + nice_num_str_nvlist(nv, "start_time", + pcns->pcns_start_time, cb->cb_literal, cb->cb_json_as_int, + ZFS_NICE_TIMESTAMP); + if (pcns->pcns_end_time > 0) + nice_num_str_nvlist(nv, "end_time", + pcns->pcns_end_time, cb->cb_literal, + cb->cb_json_as_int, ZFS_NICE_TIMESTAMP); + nice_num_str_nvlist(nv, "processed", + pcns->pcns_processed, cb->cb_literal, cb->cb_json_as_int, + ZFS_NICENUM_1024); + nice_num_str_nvlist(nv, "total", + pcns->pcns_total, cb->cb_literal, cb->cb_json_as_int, + ZFS_NICENUM_1024); + fnvlist_add_string(nv, "unit", condense_type_unit_str[type]); + + fnvlist_add_nvlist(cnv, condense_type_nv_str[type], nv); + fnvlist_free(nv); + } + + if (fnvlist_num_pairs(cnv)) + fnvlist_add_nvlist(item, "condense", cnv); + + fnvlist_free(cnv); +} + + static void scan_status_nvlist(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nvroot, nvlist_t *item) @@ -10213,6 +10394,50 @@ print_checkpoint_status(pool_checkpoint_stat_t *pcs) space_buf); } +static void +print_condense_status(pool_condense_stat_t *pcnsp, uint_t n) +{ + if (pcnsp == NULL || n == 0) + return; + + for (pool_condense_type_t type = 0; type < n; type++) { + pool_condense_stat_t *pcns = &pcnsp[type]; + if (pcns->pcns_start_time == 0) + continue; + + const char *t = (type < POOL_CONDENSE_TYPES) ? + condense_type_str[type] : "[unknown type]"; + const char *u = (type < POOL_CONDENSE_TYPES) ? 
+		    condense_type_unit_str[type] : "items";
+
+		char cur[32], tot[32], elapsed[32];
+		zfs_nicenum(pcns->pcns_processed, cur, sizeof (cur));
+		zfs_nicenum(pcns->pcns_total, tot, sizeof (tot));
+
+		if (pcns->pcns_end_time == 0) {
+			secs_to_dhms(time(NULL) - pcns->pcns_start_time,
+			    elapsed);
+			(void) printf(gettext(
+			    "condense: %s: condensing, %s/%s %s done in %s\n"),
+			    t, cur, tot, u, elapsed);
+		} else if (pcns->pcns_processed < pcns->pcns_total) {
+			secs_to_dhms(
+			    pcns->pcns_end_time - pcns->pcns_start_time,
+			    elapsed);
+			(void) printf(gettext(
+			    "condense: %s: cancelled, %s/%s %s done in %s\n"),
+			    t, cur, tot, u, elapsed);
+		} else {
+			secs_to_dhms(
+			    pcns->pcns_end_time - pcns->pcns_start_time,
+			    elapsed);
+			(void) printf(gettext(
+			    "condense: %s: done, %s %s done in %s\n"),
+			    t, cur, u, elapsed);
+		}
+	}
+}
+
 static void
 print_error_log(zpool_handle_t *zhp)
 {
@@ -10742,6 +10967,7 @@ status_callback_json(zpool_handle_t *zhp, void *data)
 			scan_status_nvlist(zhp, cbp, nvroot, item);
 			removal_status_nvlist(zhp, cbp, nvroot, item);
 			checkpoint_status_nvlist(nvroot, cbp, item);
+			condense_status_nvlist(nvroot, cbp, item);
 			raidz_expand_status_nvlist(zhp, cbp, nvroot, item);
 			vdev_stats_nvlist(zhp, cbp, nvroot, 0, B_FALSE, NULL,
 			    vds);
 			if (cbp->cb_flat_vdevs) {
@@ -10889,6 +11115,12 @@ status_callback(zpool_handle_t *zhp, void *data)
 		    ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t **)&pres, &c);
 		print_raidz_expand_status(zhp, pres);
 
+		pool_condense_stat_t *pcnsp = NULL;
+		(void) nvlist_lookup_uint64_array(nvroot,
+		    ZPOOL_CONFIG_CONDENSE_STATS, (uint64_t **)&pcnsp, &c);
+		print_condense_status(pcnsp,
+		    c / (sizeof (pool_condense_stat_t) / sizeof (uint64_t)));
+
 		cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0,
 		    cbp->cb_name_flags | VDEV_NAME_TYPE_ID);
 		if (cbp->cb_namewidth < 10)
@@ -13099,8 +13331,10 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row)
 	pool_scan_stat_t *pss = NULL;
 	pool_removal_stat_t *prs = NULL;
 	pool_raidz_expand_stat_t *pres = NULL;
+	pool_condense_stat_t *pcns = NULL;
 	const char *const headers[] = {"DISCARD", "FREE", "INITIALIZE",
-	    "REPLACE", "REMOVE", "RESILVER", "SCRUB", "TRIM", "RAIDZ_EXPAND"};
+	    "REPLACE", "REMOVE", "RESILVER", "SCRUB", "TRIM", "RAIDZ_EXPAND",
+	    "CONDENSE"};
 	int col_widths[ZPOOL_WAIT_NUM_ACTIVITIES];
 
 	/* Calculate the width of each column */
@@ -13169,6 +13403,21 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row)
 		bytes_rem[ZPOOL_WAIT_RAIDZ_EXPAND] = rem;
 	}
 
+	/*
+	 * Count each outstanding condense item as a "byte". It's not true,
+	 * but it's a counter, and it'll display nicely.
+	 */
+	(void) nvlist_lookup_uint64_array(nvroot,
+	    ZPOOL_CONFIG_CONDENSE_STATS, (uint64_t **)&pcns, &c);
+	c = c / (sizeof (pool_condense_stat_t) / sizeof (uint64_t));
+	if (pcns != NULL && c > 0) {
+		do {
+			c--;
+			bytes_rem[ZPOOL_WAIT_CONDENSE] +=
+			    (pcns[c].pcns_total - pcns[c].pcns_processed);
+		} while (c > 0);
+	}
+
 	bytes_rem[ZPOOL_WAIT_INITIALIZE] =
 	    vdev_activity_remaining(nvroot, ZPOOL_WAIT_INITIALIZE);
 	bytes_rem[ZPOOL_WAIT_TRIM] =
@@ -13307,7 +13556,7 @@ zpool_do_wait(int argc, char **argv)
 			static const char *const col_opts[] = {
 			    "discard", "free", "initialize", "replace",
 			    "remove", "resilver", "scrub", "trim",
-			    "raidz_expand" };
+			    "raidz_expand", "condense" };
 
 			for (i = 0; i < ARRAY_SIZE(col_opts); ++i)
 				if (strcmp(tok, col_opts[i]) == 0) {
diff --git a/cmd/ztest.c b/cmd/ztest.c
index 4a7959ebfca5..c55aab816750 100644
--- a/cmd/ztest.c
+++ b/cmd/ztest.c
@@ -26,7 +26,7 @@
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2017 Joyent, Inc.
* Copyright (c) 2017, Intel Corporation. - * Copyright (c) 2023, Klara, Inc. + * Copyright (c) 2023, 2024, Klara, Inc. */ /* @@ -449,6 +449,8 @@ ztest_func_t ztest_fletcher_incr; ztest_func_t ztest_verify_dnode_bt; ztest_func_t ztest_pool_prefetch_ddt; ztest_func_t ztest_ddt_prune; +ztest_func_t ztest_spa_log_flushall_start; +ztest_func_t ztest_spa_log_flushall_cancel; static uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ static uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ @@ -506,6 +508,8 @@ static ztest_info_t ztest_info[] = { ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes), ZTI_INIT(ztest_pool_prefetch_ddt, 1, &zopt_rarely), ZTI_INIT(ztest_ddt_prune, 1, &zopt_rarely), + ZTI_INIT(ztest_spa_log_flushall_start, 1, &zopt_rarely), + ZTI_INIT(ztest_spa_log_flushall_cancel, 1, &zopt_rarely), }; #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) @@ -6217,6 +6221,20 @@ ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) } } +void +ztest_spa_log_flushall_start(ztest_ds_t *zd, uint64_t id) +{ + (void) zd, (void) id; + spa_log_flushall_start(ztest_spa, SPA_LOG_FLUSHALL_REQUEST, 0); +} + +void +ztest_spa_log_flushall_cancel(ztest_ds_t *zd, uint64_t id) +{ + (void) zd, (void) id; + spa_log_flushall_cancel(ztest_spa); +} + void ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) { diff --git a/include/libzfs.h b/include/libzfs.h index 01d51999f4eb..be6427338dd8 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -29,6 +29,7 @@ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2021, Colm Buckley + * Copyright (c) 2024, Klara, Inc. */ #ifndef _LIBZFS_H @@ -297,6 +298,8 @@ _LIBZFS_H int zpool_initialize_wait(zpool_handle_t *, pool_initialize_func_t, nvlist_t *); _LIBZFS_H int zpool_trim(zpool_handle_t *, pool_trim_func_t, nvlist_t *, trimflags_t *); +_LIBZFS_H int zpool_condense(zpool_handle_t *, pool_condense_func_t, + pool_condense_type_t); _LIBZFS_H int zpool_clear(zpool_handle_t *, const char *, nvlist_t *); _LIBZFS_H int zpool_reguid(zpool_handle_t *); diff --git a/include/libzfs_core.h b/include/libzfs_core.h index b1d74fbbc8f5..3cbee598dfd5 100644 --- a/include/libzfs_core.h +++ b/include/libzfs_core.h @@ -24,6 +24,7 @@ * Copyright 2017 RackTop Systems. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. + * Copyright (c) 2024, Klara, Inc. 
 */
 
#ifndef	_LIBZFS_CORE_H
@@ -142,6 +143,9 @@ _LIBZFS_CORE_H int lzc_channel_program_nosync(const char *, const char *,
 _LIBZFS_CORE_H int lzc_sync(const char *, nvlist_t *, nvlist_t **);
 _LIBZFS_CORE_H int lzc_reopen(const char *, boolean_t);
 
+_LIBZFS_CORE_H int lzc_condense(const char *,
+    pool_condense_func_t, pool_condense_type_t);
+
 _LIBZFS_CORE_H int lzc_pool_checkpoint(const char *);
 _LIBZFS_CORE_H int lzc_pool_checkpoint_discard(const char *);
 
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index 1676020d04d3..e52617ffaa30 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -745,6 +745,7 @@ typedef struct zpool_load_policy {
 #define	ZPOOL_CONFIG_CHECKPOINT_STATS	"checkpoint_stats" /* not on disk */
 #define	ZPOOL_CONFIG_RAIDZ_EXPAND_STATS	"raidz_expand_stats" /* not on disk */
 #define	ZPOOL_CONFIG_VDEV_STATS	"vdev_stats"	/* not stored on disk */
+#define	ZPOOL_CONFIG_CONDENSE_STATS	"com.klarasystems:condense_stats"
 #define	ZPOOL_CONFIG_INDIRECT_SIZE	"indirect_size"	/* not stored on disk */
 
 /* container nvlist of extended stats */
@@ -1213,6 +1214,20 @@ typedef struct vdev_rebuild_stat {
 	uint64_t vrs_pass_bytes_skipped; /* bytes skipped since start/resume */
 } vdev_rebuild_stat_t;
 
+/*
+ * "Condense" is a general concept for fully writing down an intermediate log,
+ * journal or cache to its final resting place. The stats for condense are
+ * counts of how many things need to be written down and how many have been
+ * done so far, so that 'zpool status' can show progress. How it shows that
+ * depends on what the thing is.
+ */
+typedef struct pool_condense_stat {
+	uint64_t pcns_start_time;	/* time_t */
+	uint64_t pcns_end_time;		/* time_t */
+	uint64_t pcns_processed;	/* items processed */
+	uint64_t pcns_total;		/* total items to process */
+} pool_condense_stat_t;
+
 /*
  * Errata described by https://openzfs.github.io/openzfs-docs/msg/ZFS-8000-ER.
  * The ordering of this enum must be maintained to ensure the errata identifiers
@@ -1355,6 +1370,20 @@ typedef enum pool_trim_func {
 	POOL_TRIM_FUNCS
 } pool_trim_func_t;
 
+/*
+ * Condense functions.
+ */
+typedef enum pool_condense_func {
+	POOL_CONDENSE_START,
+	POOL_CONDENSE_CANCEL,
+	POOL_CONDENSE_FUNCS
+} pool_condense_func_t;
+
+typedef enum pool_condense_type {
+	POOL_CONDENSE_LOG_SPACEMAP,
+	POOL_CONDENSE_TYPES
+} pool_condense_type_t;
+
 /*
  * DDT statistics.  Note: all fields should be 64-bit because this
  * is passed between kernel and userland as an nvlist uint64 array.
@@ -1534,6 +1563,7 @@ typedef enum zfs_ioc {
 	ZFS_IOC_POOL_SCRUB,			/* 0x5a57 */
 	ZFS_IOC_POOL_PREFETCH,			/* 0x5a58 */
 	ZFS_IOC_DDT_PRUNE,			/* 0x5a59 */
+	ZFS_IOC_POOL_CONDENSE,			/* 0x5a5a */
 
 	/*
 	 * Per-platform (Optional) - 8/128 numbers reserved.
@@ -1658,6 +1688,7 @@ typedef enum {
 	ZPOOL_WAIT_SCRUB,
 	ZPOOL_WAIT_TRIM,
 	ZPOOL_WAIT_RAIDZ_EXPAND,
+	ZPOOL_WAIT_CONDENSE,
 	ZPOOL_WAIT_NUM_ACTIVITIES
 } zpool_wait_activity_t;
 
@@ -1745,6 +1776,12 @@ typedef enum {
 #define	ZPOOL_TRIM_RATE		"trim_rate"
 #define	ZPOOL_TRIM_SECURE	"trim_secure"
 
+/*
+ * The following are names used when invoking ZFS_IOC_POOL_CONDENSE.
+ */
+#define	ZPOOL_CONDENSE_COMMAND	"condense_command"
+#define	ZPOOL_CONDENSE_TYPE	"condense_type"
+
 /*
  * The following are names used when invoking ZFS_IOC_POOL_WAIT.
  */
diff --git a/include/sys/spa.h b/include/sys/spa.h
index ca30b60c0af7..1d3e488a2289 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -28,7 +28,7 @@
  * Copyright 2017 Joyent, Inc.
  * Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2019, Allan Jude - * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2024, Klara, Inc. * Copyright (c) 2019, Datto Inc. */ @@ -911,6 +911,7 @@ typedef struct spa_stats { spa_history_kstat_t state; /* pool state */ spa_history_kstat_t guid; /* pool guid */ spa_history_kstat_t iostats; + spa_history_kstat_t log_spacemaps; } spa_stats_t; typedef enum txg_state { @@ -1029,6 +1030,7 @@ typedef enum spa_log_state { extern spa_log_state_t spa_get_log_state(spa_t *spa); extern void spa_set_log_state(spa_t *spa, spa_log_state_t state); extern int spa_reset_logs(spa_t *spa); +extern void spa_log_sm_stats_update(spa_t *spa); /* Log claim callback */ extern void spa_claim_notify(zio_t *zio); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 7811abbb9ce3..d5bb653a5505 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -27,6 +27,7 @@ * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019 Datto Inc. + * Copyright (c) 2024, Klara, Inc. */ #ifndef _SYS_SPA_IMPL_H @@ -358,6 +359,8 @@ struct spa { avl_tree_t spa_metaslabs_by_flushed; spa_unflushed_stats_t spa_unflushed_stats; list_t spa_log_summary; + + spa_log_flushall_mode_t spa_log_flushall_mode; uint64_t spa_log_flushall_txg; zthr_t *spa_livelist_delete_zthr; /* deleting livelists */ @@ -471,6 +474,9 @@ struct spa { uint64_t spa_dedup_dsize; /* cached on-disk size of DDT */ uint64_t spa_dedup_class_full_txg; /* txg dedup class was full */ + /* stats for user-initiated condense operations */ + pool_condense_stat_t spa_condense_stats[POOL_CONDENSE_TYPES]; + /* * spa_refcount & spa_config_lock must be the last elements * because zfs_refcount_t changes size based on compilation options. diff --git a/include/sys/spa_log_spacemap.h b/include/sys/spa_log_spacemap.h index f59e69917833..e9a0826563dc 100644 --- a/include/sys/spa_log_spacemap.h +++ b/include/sys/spa_log_spacemap.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2018, 2019 by Delphix. All rights reserved. + * Copyright (c) 2024, Klara, Inc. 
 */
 
#ifndef	_SYS_SPA_LOG_SPACEMAP_H
@@ -56,6 +57,12 @@ typedef struct spa_log_sm {
 	space_map_t	*sls_sm;	/* space map pointer, if open */
 } spa_log_sm_t;
 
+typedef enum spa_log_flushall_mode {
+	SPA_LOG_FLUSHALL_NONE = 0,	/* flushall inactive */
+	SPA_LOG_FLUSHALL_REQUEST,	/* flushall active by admin request */
+	SPA_LOG_FLUSHALL_EXPORT,	/* flushall active for pool export */
+} spa_log_flushall_mode_t;
+
 int spa_ld_log_spacemaps(spa_t *);
 
 void spa_generate_syncing_log_sm(spa_t *, dmu_tx_t *);
@@ -77,7 +84,10 @@ void spa_log_summary_dirty_flushed_metaslab(spa_t *, uint64_t);
 void spa_log_summary_decrement_mscount(spa_t *, uint64_t, boolean_t);
 void spa_log_summary_decrement_blkcount(spa_t *, uint64_t);
 
-boolean_t spa_flush_all_logs_requested(spa_t *);
+void spa_log_flushall_start(spa_t *spa, spa_log_flushall_mode_t mode,
+    uint64_t txg);
+void spa_log_flushall_done(spa_t *spa);
+void spa_log_flushall_cancel(spa_t *spa);
 
 extern int zfs_keep_log_spacemaps_at_export;
 
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index ac9ae233c72d..ffb9b6eadc6b 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
	[abixml hunks not shown: the XML element content did not survive extraction]
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index f256535e8ea0..63cd41bc4484 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -29,7 +29,7 @@
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2018, loli10K
  * Copyright (c) 2021, Colm Buckley
- * Copyright (c) 2021, 2023, Klara Inc.
+ * Copyright (c) 2021, 2023, 2024, Klara, Inc.
  */
 
 #include
@@ -4404,6 +4404,23 @@ zpool_sync_one(zpool_handle_t *zhp, void *data)
 	return (0);
 }
 
+int
+zpool_condense(zpool_handle_t *zhp,
+    pool_condense_func_t func, pool_condense_type_t type)
+{
+	int ret;
+
+	libzfs_handle_t *hdl = zpool_get_handle(zhp);
+	const char *pool_name = zpool_get_name(zhp);
+
+	if ((ret = lzc_condense(pool_name, func, type)) != 0) {
+		return (zpool_standard_error_fmt(hdl, ret,
+		    dgettext(TEXT_DOMAIN, "condense '%s' failed"), pool_name));
+	}
+
+	return (0);
+}
+
 #define	PATH_BUF_LEN	64
 
 /*
diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi
index 6a9c20a2bb88..915b5be392c2 100644
--- a/lib/libzfs_core/libzfs_core.abi
+++ b/lib/libzfs_core/libzfs_core.abi
	[abixml hunks not shown: the XML element content did not survive extraction]
diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c
index d07fca6cebad..84099530305c 100644
--- a/lib/libzfs_core/libzfs_core.c
+++ b/lib/libzfs_core/libzfs_core.c
@@ -26,6 +26,7 @@
  * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
  * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved.
  * Copyright (c) 2019 Datto Inc.
+ * Copyright (c) 2024, Klara, Inc.
*/ /* @@ -493,6 +494,23 @@ lzc_sync(const char *pool_name, nvlist_t *innvl, nvlist_t **outnvl) return (lzc_ioctl(ZFS_IOC_POOL_SYNC, pool_name, innvl, NULL)); } +int +lzc_condense(const char *pool_name, + pool_condense_func_t func, pool_condense_type_t type) +{ + int error; + + nvlist_t *args = fnvlist_alloc(); + fnvlist_add_uint64(args, ZPOOL_CONDENSE_COMMAND, (uint64_t)func); + fnvlist_add_uint64(args, ZPOOL_CONDENSE_TYPE, (uint64_t)type); + + error = lzc_ioctl(ZFS_IOC_POOL_CONDENSE, pool_name, args, NULL); + + fnvlist_free(args); + + return (error); +} + /* * Create "user holds" on snapshots. If there is a hold on a snapshot, * the snapshot can not be destroyed. (However, it can be marked for deletion diff --git a/man/Makefile.am b/man/Makefile.am index fde704933764..3ccbc22bfeec 100644 --- a/man/Makefile.am +++ b/man/Makefile.am @@ -69,6 +69,7 @@ dist_man_MANS = \ %D%/man8/zpool-attach.8 \ %D%/man8/zpool-checkpoint.8 \ %D%/man8/zpool-clear.8 \ + %D%/man8/zpool-condense.8 \ %D%/man8/zpool-create.8 \ %D%/man8/zpool-destroy.8 \ %D%/man8/zpool-detach.8 \ diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index da027798f962..828a65d6edc8 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -18,7 +18,7 @@ .\" .\" Copyright (c) 2024, Klara, Inc. .\" -.Dd November 1, 2024 +.Dd November 11, 2024 .Dt ZFS 4 .Os . @@ -1778,6 +1778,10 @@ Normally disabled because these datasets may be missing key data. .It Sy zfs_min_metaslabs_to_flush Ns = Ns Sy 1 Pq u64 Minimum number of metaslabs to flush per dirty TXG. . +.It Sy zfs_min_metaslabs_to_flush_all Ns = Ns Sy 5 Pq u64 +Minimum number of metaslabs to flush per dirty TXG when condensing log +spacemaps. +. .It Sy zfs_metaslab_fragmentation_threshold Ns = Ns Sy 70 Ns % Pq uint Allow metaslabs to keep their active state as long as their fragmentation percentage is no more than this value. diff --git a/man/man8/zpool-condense.8 b/man/man8/zpool-condense.8 new file mode 100644 index 000000000000..1c4a38de14cd --- /dev/null +++ b/man/man8/zpool-condense.8 @@ -0,0 +1,62 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2024, Klara, Inc. +.\" +.Dd November 11, 2024 +.Dt ZPOOL-CONDENSE 8 +.Os +. +.Sh NAME +.Nm zpool-condense +.Nd Condense, flush, garbage collect or otherwise clean up pool metadata +.Sh SYNOPSIS +.Nm zpool +.Cm condense +.Fl t Ar target +.Op Fl c | w +.Ar pool +.Sh DESCRIPTION +Many internal pool metadata updates are performed in the background at a rate +chosen to limit the performance impact to normal use of the pool. +Sometimes it is desirable to accelerate these operations, +even if it affects overall performance. 
+.Sy condense +allows an operator to request that a specific background operation be +prioritised to complete as soon as possible. +.Pp +These are the possible values for +.Ar target : +.Bl -tag -compact -offset Ds -width "log-spacemap" +.It Sy log-spacemap +flushing log spacemap entries to their underlying metaslabs +.El +.Bl -tag -width Ds +.It Fl c +Cancel a previous +.Sy condense . +This will return background updates to their normal rate. +.It Fl w +Wait until the condense has completed before returning. +.El +.Sh SEE ALSO +.Xr zpool-status 8 , +.Xr zpool-wait 8 diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index b9b54185d050..11441d0f3c74 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -25,8 +25,9 @@ .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" Copyright (c) 2024, Klara, Inc. .\" -.Dd February 14, 2024 +.Dd November 11, 2024 .Dt ZPOOL-STATUS 8 .Os . @@ -356,6 +357,7 @@ can be used to run a script on each VDEV. .Ed . .Sh SEE ALSO +.Xr zpool-condense 8 , .Xr zpool-events 8 , .Xr zpool-history 8 , .Xr zpool-iostat 8 , diff --git a/man/man8/zpool-wait.8 b/man/man8/zpool-wait.8 index 50f947bab603..7c1a93757ff3 100644 --- a/man/man8/zpool-wait.8 +++ b/man/man8/zpool-wait.8 @@ -26,8 +26,9 @@ .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" Copyright (c) 2024, Klara, Inc. .\" -.Dd May 27, 2021 +.Dd November 11, 2024 .Dt ZPOOL-WAIT 8 .Os . @@ -78,6 +79,8 @@ Scrub to cease Manual trim to cease .It Sy raidz_expand Attaching to a RAID-Z vdev to complete +.It Sy condense +Metadata condense operations to complete .El .Pp If an @@ -109,6 +112,7 @@ See . .Sh SEE ALSO .Xr zpool-checkpoint 8 , +.Xr zpool-condense 8 , .Xr zpool-initialize 8 , .Xr zpool-remove 8 , .Xr zpool-replace 8 , diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index 02a258f66708..a45b0ccd8e82 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -25,8 +25,9 @@ .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" Copyright (c) 2024, Klara, Inc. .\" -.Dd February 14, 2024 +.Dd November 11, 2024 .Dt ZPOOL 8 .Os . @@ -177,6 +178,8 @@ specified. Prefetches specific types of pool data. .It Xr zpool-scrub 8 Begins a scrub or resumes a paused scrub. +.It Xr zpool-condense 8 +Condense, flush, garbage collect or otherwise clean up pool metadata. .It Xr zpool-checkpoint 8 Checkpoints the current state of .Ar pool , diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 6b8c7ee93daa..e015924334a2 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -34,7 +34,7 @@ * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2021, Colm Buckley * Copyright (c) 2023 Hewlett Packard Enterprise Development LP. - * Copyright (c) 2023, 2024, Klara Inc. + * Copyright (c) 2023, 2024, Klara, Inc. 
 */
 
 /*
@@ -1984,8 +1984,8 @@ spa_unload_log_sm_flush_all(spa_t *spa)
 	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 
-	ASSERT3U(spa->spa_log_flushall_txg, ==, 0);
-	spa->spa_log_flushall_txg = dmu_tx_get_txg(tx);
+	spa_log_flushall_start(spa, SPA_LOG_FLUSHALL_EXPORT,
+	    dmu_tx_get_txg(tx));
 	dmu_tx_commit(tx);
 
 	txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg);
@@ -2012,6 +2012,8 @@ spa_unload_log_sm_metadata(spa_t *spa)
 	spa->spa_unflushed_stats.sus_nblocks = 0;
 	spa->spa_unflushed_stats.sus_memused = 0;
 	spa->spa_unflushed_stats.sus_blocklimit = 0;
+
+	spa_log_sm_stats_update(spa);
 }
 
 static void
@@ -2068,6 +2070,8 @@ spa_unload(spa_t *spa)
 	 */
 	if (spa_should_flush_logs_on_unload(spa))
 		spa_unload_log_sm_flush_all(spa);
+	else
+		spa_log_flushall_done(spa);
 
 	/*
 	 * Stop async tasks.
@@ -10255,6 +10259,7 @@ spa_sync(spa_t *spa, uint64_t txg)
 	spa_sync_close_syncing_log_sm(spa);
 
 	spa_update_dspace(spa);
+	spa_log_sm_stats_update(spa);
 
 	if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON)
 		vdev_autotrim_kick(spa);
@@ -10798,6 +10803,20 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
 		*in_progress = (vre != NULL && vre->vre_state == DSS_SCANNING);
 		break;
 	}
+	case ZPOOL_WAIT_CONDENSE: {
+		pool_condense_stat_t *pcns;
+		*in_progress = B_FALSE;
+		for (pool_condense_type_t type = 0;
+		    type < POOL_CONDENSE_TYPES; type++) {
+			pcns = &spa->spa_condense_stats[type];
+			if (pcns->pcns_start_time > 0 &&
+			    pcns->pcns_end_time == 0) {
+				*in_progress = B_TRUE;
+				break;
+			}
+		}
+		break;
+	}
 	default:
 		panic("unrecognized value for activity %d", activity);
 	}
diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c
index f55218e3579b..42a31f4f38f0 100644
--- a/module/zfs/spa_log_spacemap.c
+++ b/module/zfs/spa_log_spacemap.c
@@ -21,6 +21,7 @@
 
 /*
  * Copyright (c) 2018, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2024, Klara, Inc.
  */
 
 #include
@@ -284,6 +285,12 @@ static uint64_t zfs_max_logsm_summary_length = 10;
  */
 static uint64_t zfs_min_metaslabs_to_flush = 1;
 
+/*
+ * Tunable that sets the minimum metaslabs to flush every TXG when the user
+ * has requested flushall (via 'zpool condense').
+ */
+static uint64_t zfs_min_metaslabs_to_flush_all = 5;
+
 /*
  * Tunable that specifies how far in the past do we want to look when trying to
  * estimate the incoming log blocks for the current TXG.
@@ -676,7 +683,9 @@ spa_estimate_metaslabs_to_flush(spa_t *spa)
 	uint64_t total_flushes = 0;
 
 	/* Holds the current maximum of our estimates so far. */
-	uint64_t max_flushes_pertxg = zfs_min_metaslabs_to_flush;
+	uint64_t max_flushes_pertxg =
+	    spa->spa_log_flushall_mode == SPA_LOG_FLUSHALL_REQUEST ?
+	    zfs_min_metaslabs_to_flush_all : zfs_min_metaslabs_to_flush;
 
 	/*
 	 * For our estimations we only look as far in the future
@@ -746,10 +755,87 @@ spa_log_exceeds_memlimit(spa_t *spa)
 	return (B_FALSE);
 }
 
-boolean_t
-spa_flush_all_logs_requested(spa_t *spa)
+void
+spa_log_flushall_start(spa_t *spa, spa_log_flushall_mode_t mode, uint64_t txg)
+{
+	/* Shouldn't happen, but it's not dangerous if it does. */
+	ASSERT3U(mode, !=, SPA_LOG_FLUSHALL_NONE);
+	if (mode == SPA_LOG_FLUSHALL_NONE)
+		return;
+
+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+	if (txg == 0)
+		txg = spa_last_synced_txg(spa);
+
+	if (spa->spa_log_flushall_mode != SPA_LOG_FLUSHALL_EXPORT) {
+		/*
+		 * We can set _REQUEST even if it's already in _REQUEST; this
+		 * has the effect of just pushing out the end txg.
+ */ + spa->spa_log_flushall_mode = mode; + spa->spa_log_flushall_txg = txg; + } + + if (spa->spa_log_flushall_mode == SPA_LOG_FLUSHALL_REQUEST) { + /* Reset stats */ + pool_condense_stat_t *pcns = + &spa->spa_condense_stats[POOL_CONDENSE_LOG_SPACEMAP]; + memset(pcns, 0, sizeof (pool_condense_stat_t)); + pcns->pcns_start_time = gethrestime_sec(); + pcns->pcns_total = spa_log_sm_nblocks(spa); + } + + spa_config_exit(spa, SCL_VDEV, FTAG); +} + +void +spa_log_flushall_done(spa_t *spa) { - return (spa->spa_log_flushall_txg != 0); + if (spa->spa_log_flushall_mode == SPA_LOG_FLUSHALL_NONE) + return; + + IMPLY(spa->spa_log_flushall_mode == SPA_LOG_FLUSHALL_REQUEST, + spa_state(spa) == POOL_STATE_ACTIVE); + IMPLY(spa->spa_log_flushall_mode == SPA_LOG_FLUSHALL_EXPORT, + spa_state(spa) == POOL_STATE_EXPORTED); + ASSERT(spa->spa_log_flushall_txg); + + if (spa->spa_log_flushall_mode == SPA_LOG_FLUSHALL_REQUEST) { + /* + * Finish stats. Note that the flush is by txgs, not blocks, so + * we set the processed to the total just so everything looks + * right for the user even if they're not exactly the same. + */ + pool_condense_stat_t *pcns = + &spa->spa_condense_stats[POOL_CONDENSE_LOG_SPACEMAP]; + pcns->pcns_end_time = gethrestime_sec(); + pcns->pcns_processed = pcns->pcns_total; + } + + spa->spa_log_flushall_mode = SPA_LOG_FLUSHALL_NONE; + spa->spa_log_flushall_txg = 0; + + spa_notify_waiters(spa); +} + +void +spa_log_flushall_cancel(spa_t *spa) +{ + if (spa->spa_log_flushall_mode == SPA_LOG_FLUSHALL_NONE) + return; + + ASSERT(spa->spa_log_flushall_mode == SPA_LOG_FLUSHALL_REQUEST); + + spa->spa_log_flushall_mode = SPA_LOG_FLUSHALL_NONE; + spa->spa_log_flushall_txg = 0; + + /* Finish stats. */ + pool_condense_stat_t *pcns = + &spa->spa_condense_stats[POOL_CONDENSE_LOG_SPACEMAP]; + pcns->pcns_end_time = gethrestime_sec(); + + spa_notify_waiters(spa); } void @@ -785,7 +871,7 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx) */ if (BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg && !dmu_objset_is_dirty(spa_meta_objset(spa), txg) && - !spa_flush_all_logs_requested(spa)) + spa->spa_log_flushall_mode == SPA_LOG_FLUSHALL_NONE) return; /* @@ -809,16 +895,25 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx) * metaslabs and attempt to destroy old log space maps. */ uint64_t want_to_flush; - if (spa_flush_all_logs_requested(spa)) { + if (spa->spa_log_flushall_mode == SPA_LOG_FLUSHALL_EXPORT) { ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED); want_to_flush = UINT64_MAX; } else { want_to_flush = spa_estimate_metaslabs_to_flush(spa); } - /* Used purely for verification purposes */ + /* + * Count of metaslabs we checked this round. Used to know we've + * finished a user-requested flushall, and for verification. + */ uint64_t visited = 0; + /* + * Unflushed blocks at start of loop, so we can report on how many we + * flushed. + */ + uint64_t start_nblocks = spa_log_sm_nblocks(spa); + /* * Ideally we would only iterate through spa_metaslabs_by_flushed * using only one variable (curr). We can't do that because @@ -836,8 +931,30 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx) * If this metaslab has been flushed this txg then we've done * a full circle over the metaslabs. */ - if (metaslab_unflushed_txg(curr) == txg) + uint64_t unflushed_txg = metaslab_unflushed_txg(curr); + if (unflushed_txg == txg) { + spa_log_flushall_done(spa); break; + } + + /* + * If the admin requested flush, skip metaslabs that were + * modified after the flush request. 
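+	 * Such a metaslab was flushed after the request txg, so it holds
+	 * no log entries predating the request; anything dirtied since is
+	 * outside the request's scope and is left to the normal flushing
+	 * heuristics.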
+		 */
+		if (spa->spa_log_flushall_mode == SPA_LOG_FLUSHALL_REQUEST &&
+		    unflushed_txg > spa->spa_log_flushall_txg) {
+			visited++;
+			if (visited <
+			    avl_numnodes(&spa->spa_metaslabs_by_flushed))
+				continue;
+
+			/*
+			 * We visited all metaslabs and they're all dirty after
+			 * the admin requested flush, so all flushing is done.
+			 */
+			spa_log_flushall_done(spa);
+			break;
+		}
 
 		/*
 		 * If we are done flushing for the block heuristic and the
@@ -862,6 +979,22 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
 	ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited);
 
 	spa_log_sm_set_blocklimit(spa);
+
+	if (spa->spa_log_flushall_mode == SPA_LOG_FLUSHALL_REQUEST) {
+		/*
+		 * If the admin requested a flush, then we're only processing
+		 * blocks created before the flush request. The total number of
+		 * unflushed blocks can still go up, but not since we set
+		 * start_nblocks before the metaslab loop above. Therefore,
+		 * there can never be more blocks than there were at the start.
+		 */
+		uint64_t end_nblocks = spa_log_sm_nblocks(spa);
+		ASSERT3U(start_nblocks, >=, end_nblocks);
+
+		pool_condense_stat_t *pcns =
+		    &spa->spa_condense_stats[POOL_CONDENSE_LOG_SPACEMAP];
+		pcns->pcns_processed += start_nblocks - end_nblocks;
+	}
 }
 
 /*
@@ -901,9 +1034,9 @@ spa_sync_close_syncing_log_sm(spa_t *spa)
 	 * so the last few TXGs before closing the pool can be empty
 	 * (e.g. not dirty).
 	 */
-	if (spa_flush_all_logs_requested(spa)) {
+	if (spa->spa_log_flushall_mode == SPA_LOG_FLUSHALL_EXPORT) {
 		ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
-		spa->spa_log_flushall_txg = 0;
+		spa_log_flushall_done(spa);
 	}
 }
 
@@ -1396,6 +1529,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, max_log_walking, U64, ZMOD_RW,
 	"The number of past TXGs that the flushing algorithm of the log "
 	"spacemap feature uses to estimate incoming log blocks");
 
+ZFS_MODULE_PARAM(zfs, zfs_, min_metaslabs_to_flush_all, U64, ZMOD_RW,
+	"Minimum number of metaslabs to flush per TXG when condensing");
+
 ZFS_MODULE_PARAM(zfs, zfs_, keep_log_spacemaps_at_export, INT, ZMOD_RW,
 	"Prevent the log spacemaps from being flushed and destroyed "
 	"during pool export/destroy");
diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c
index 45a2f06263a0..210108a2cb05 100644
--- a/module/zfs/spa_stats.c
+++ b/module/zfs/spa_stats.c
@@ -19,6 +19,10 @@
  * CDDL HEADER END
  */
 
+/*
+ * Copyright (c) 2024, Klara, Inc.
+ */
+
 #include
 #include
 #include
@@ -1034,6 +1038,82 @@ spa_iostats_destroy(spa_t *spa)
 	mutex_destroy(&shk->lock);
 }
 
+/*
+ * Log spacemap stats.
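+ *
+ * These mirror spa_unflushed_stats as a per-pool 'log_spacemaps' kstat
+ * (on Linux this typically appears under
+ * /proc/spl/kstat/zfs/<pool>/log_spacemaps), so the unflushed backlog
+ * is observable from userspace.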
+ */ +typedef struct spa_log_sm_stats { + kstat_named_t unflushed_memused; + kstat_named_t unflushed_blocklimit; + kstat_named_t unflushed_nblocks; +} spa_log_sm_stats_t; + +static spa_log_sm_stats_t spa_log_sm_stats_template = { + { "unflushed_memused", KSTAT_DATA_UINT64 }, + { "unflushed_blocklimit", KSTAT_DATA_UINT64 }, + { "unflushed_nblocks", KSTAT_DATA_UINT64 } +}; + +#define SPA_LOG_SM_STATS_SET(stat, val) \ + atomic_store_64(&log_sm_stats->stat.value.ui64, (val)); + +void +spa_log_sm_stats_update(spa_t *spa) +{ + spa_history_kstat_t *shk = &spa->spa_stats.log_spacemaps; + kstat_t *ksp = shk->kstat; + + if (ksp == NULL) + return; + + spa_log_sm_stats_t *log_sm_stats = ksp->ks_data; + + SPA_LOG_SM_STATS_SET(unflushed_memused, + spa->spa_unflushed_stats.sus_memused); + SPA_LOG_SM_STATS_SET(unflushed_blocklimit, + spa->spa_unflushed_stats.sus_blocklimit); + SPA_LOG_SM_STATS_SET(unflushed_nblocks, + spa->spa_unflushed_stats.sus_nblocks); +} + +static void +spa_log_sm_stats_init(spa_t *spa) +{ + spa_history_kstat_t *shk = &spa->spa_stats.log_spacemaps; + + mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); + + char *name = kmem_asprintf("zfs/%s", spa_name(spa)); + kstat_t *ksp = kstat_create(name, 0, "log_spacemaps", "misc", + KSTAT_TYPE_NAMED, + sizeof (spa_log_sm_stats_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + shk->kstat = ksp; + if (ksp) { + ksp->ks_lock = &shk->lock; + ksp->ks_data = + kmem_alloc(sizeof (spa_log_sm_stats_t), KM_SLEEP); + memcpy(ksp->ks_data, &spa_log_sm_stats_template, + sizeof (spa_log_sm_stats_t)); + kstat_install(ksp); + } + + kmem_strfree(name); +} + +static void +spa_log_sm_stats_destroy(spa_t *spa) +{ + spa_history_kstat_t *shk = &spa->spa_stats.log_spacemaps; + kstat_t *ksp = shk->kstat; + if (ksp) { + kmem_free(ksp->ks_data, sizeof (spa_log_sm_stats_t)); + kstat_delete(ksp); + } + + mutex_destroy(&shk->lock); +} + void spa_stats_init(spa_t *spa) { @@ -1044,11 +1124,13 @@ spa_stats_init(spa_t *spa) spa_state_init(spa); spa_guid_init(spa); spa_iostats_init(spa); + spa_log_sm_stats_init(spa); } void spa_stats_destroy(spa_t *spa) { + spa_log_sm_stats_destroy(spa); spa_iostats_destroy(spa); spa_health_destroy(spa); spa_tx_assign_destroy(spa); diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 9d12bc2eb0a2..b65fc7236a57 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -23,6 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2024, Klara, Inc. */ /* @@ -435,6 +436,14 @@ root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t *)&pres, sizeof (pres) / sizeof (uint64_t)); } + + pool_condense_stat_t pcns[POOL_CONDENSE_TYPES]; + memcpy(pcns, spa->spa_condense_stats, + sizeof (pool_condense_stat_t) * POOL_CONDENSE_TYPES); + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_CONDENSE_STATS, (uint64_t *)pcns, + (sizeof (pool_condense_stat_t) / sizeof (uint64_t)) * + POOL_CONDENSE_TYPES); } static void diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 8188a9e46865..21973b664b3a 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -38,7 +38,7 @@ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. - * Copyright (c) 2019, 2021, 2023, 2024, Klara Inc. 
+ * Copyright (c) 2019, 2021, 2023, 2024, Klara, Inc. * Copyright (c) 2019, Allan Jude * Copyright 2024 Oxide Computer Company */ @@ -7079,6 +7079,70 @@ zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl) return (0); } +static const zfs_ioc_key_t zfs_keys_pool_condense[] = { + {ZPOOL_CONDENSE_COMMAND, DATA_TYPE_UINT64, 0}, + {ZPOOL_CONDENSE_TYPE, DATA_TYPE_UINT64, 0}, +}; + +static int +zfs_ioc_pool_condense(const char *pool, nvlist_t *innvl, nvlist_t *onvl) +{ + spa_t *spa; + int err; + + uint64_t cmd; + if (nvlist_lookup_uint64(innvl, ZPOOL_CONDENSE_COMMAND, &cmd) != 0) + return (SET_ERROR(EINVAL)); + + if (cmd >= POOL_CONDENSE_FUNCS) + return (SET_ERROR(EINVAL)); + + uint64_t type; + if (nvlist_lookup_uint64(innvl, ZPOOL_CONDENSE_TYPE, &type) != 0) + return (SET_ERROR(EINVAL)); + + if (type >= POOL_CONDENSE_TYPES) + return (SET_ERROR(EINVAL)); + + if ((err = spa_open(pool, &spa, FTAG)) != 0) + return (err); + + if (spa_suspended(spa)) { + spa_close(spa, FTAG); + return (SET_ERROR(EAGAIN)); + } + + if (!spa_writeable(spa)) { + spa_close(spa, FTAG); + return (SET_ERROR(EROFS)); + } + + switch (type) { + case POOL_CONDENSE_LOG_SPACEMAP: + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + if (cmd == POOL_CONDENSE_START) + spa_log_flushall_start(spa, + SPA_LOG_FLUSHALL_REQUEST, 0); + else + spa_log_flushall_cancel(spa); + + break; + + default: + /* unreachable */ + spa_close(spa, FTAG); + return (SET_ERROR(EINVAL)); + } + + spa_close(spa, FTAG); + + return (0); +} + /* * Load a user's wrapping key into the kernel. * innvl: { @@ -7426,6 +7490,10 @@ zfs_ioctl_init(void) zfs_ioc_pool_sync, zfs_secpolicy_none, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_pool_sync, ARRAY_SIZE(zfs_keys_pool_sync)); + zfs_ioctl_register("condense", ZFS_IOC_POOL_CONDENSE, + zfs_ioc_pool_condense, zfs_secpolicy_none, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, + zfs_keys_pool_condense, ARRAY_SIZE(zfs_keys_pool_condense)); zfs_ioctl_register("reopen", ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_pool_reopen, ARRAY_SIZE(zfs_keys_pool_reopen)); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index fc4adc42d00a..f22d6f261c46 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -1054,7 +1054,7 @@ tests = ['many_fds', 'libzfs_input'] tags = ['functional', 'libzfs'] [tests/functional/log_spacemap] -tests = ['log_spacemap_import_logs'] +tests = ['log_spacemap_import_logs', 'log_spacemap_flushall'] pre = post = tags = ['functional', 'log_spacemap'] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 7d1551a63f0d..d3b4aa20e94c 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1612,6 +1612,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/longname/longname_002_pos.ksh \ functional/longname/longname_003_pos.ksh \ functional/longname/setup.ksh \ + functional/log_spacemap/log_spacemap_flushall.ksh \ functional/log_spacemap/log_spacemap_import_logs.ksh \ functional/migration/cleanup.ksh \ functional/migration/migration_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/log_spacemap/log_spacemap_flushall.ksh b/tests/zfs-tests/tests/functional/log_spacemap/log_spacemap_flushall.ksh new file mode 100755 index 000000000000..10e4f80c03d3 --- /dev/null +++ 
b/tests/zfs-tests/tests/functional/log_spacemap/log_spacemap_flushall.ksh
@@ -0,0 +1,80 @@
+#! /bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 by Delphix. All rights reserved.
+# Copyright (c) 2024, Klara, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+#
+# This tests the on-demand "flush all spacemap logs" feature. This is the same
+# process as the one triggered at pool export, but instead we trigger it ahead
+# of time via `zpool condense`.
+#
+# This test uses the `log_spacemaps` kstat and `zdb -m` to know how much is
+# waiting to be flushed. All we're looking for is that the flushall function
+# works, not how much it's doing.
+#
+# STRATEGY:
+# 1. Create pool.
+# 2. Write things, which will add to the spacemap logs.
+# 3. Save the counters.
+# 4. Request the spacemap logs be flushed.
+# 5. Compare counters against previous values.
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+	if poolexists $LOGSM_POOL; then
+		log_must zpool destroy -f $LOGSM_POOL
+	fi
+}
+log_onexit cleanup
+
+function get_smp_length {
+	zdb -m $LOGSM_POOL | grep smp_length | \
+	    awk '{ sum += $3 } END { print sum }'
+}
+
+LOGSM_POOL="logsm_flushall"
+read -r TESTDISK _ <<<"$DISKS"
+
+log_must zpool create -o cachefile=none -f -O compression=off \
+    $LOGSM_POOL $TESTDISK
+
+log_must file_write -o create -f /$LOGSM_POOL/f1 -b 131072 -c 32 -d R
+log_must file_write -o create -f /$LOGSM_POOL/f2 -b 131072 -c 32 -d R
+log_must file_write -o create -f /$LOGSM_POOL/f3 -b 131072 -c 32 -d R
+log_must file_write -o create -f /$LOGSM_POOL/f4 -b 131072 -c 32 -d R
+
+sync_all_pools
+
+typeset length_1=$(get_smp_length)
+
+log_must zpool condense -t log-spacemap -w $LOGSM_POOL
+
+typeset length_2=$(get_smp_length)
+
+log_must test $length_1 -gt $length_2
+
+log_pass "Log spacemaps on-demand flushall works"
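To close, a possible end-to-end operator workflow for the feature this patch introduces, using only the commands and flags it adds (the pool name is hypothetical; command output is not shown):

    # begin flushing the log spacemap backlog at the accelerated rate
    zpool condense -t log-spacemap tank

    # progress is reported on the new "condense:" line
    zpool status tank

    # block until the requested flush completes ('-w' at start is equivalent)
    zpool wait -t condense tank

    # or abandon the request and return to the normal background rate
    zpool condense -t log-spacemap -c tank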