diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c index ad6bac38432..0900642edec 100644 --- a/src/dtx/dtx_common.c +++ b/src/dtx/dtx_common.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1438,7 +1439,9 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul D_WARN(DF_UUID": Fail to sync %s commit DTX "DF_DTI": "DF_RC"\n", DP_UUID(cont->sc_uuid), dlh->dlh_coll ? "collective" : "regular", DP_DTI(&dth->dth_xid), DP_RC(rc)); - if (likely(dtx_batched_ult_max != 0)) { + if (likely(dtx_batched_ult_max != 0 && + !DAOS_FAIL_CHECK(DAOS_DTX_PARTIAL_COMMIT_P1) && + !DAOS_FAIL_CHECK(DAOS_DTX_PARTIAL_COMMIT_P2))) { dth->dth_sync = 0; goto cache; } diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c index 20ecb614cc1..8a66652f82f 100644 --- a/src/dtx/dtx_rpc.c +++ b/src/dtx/dtx_rpc.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -411,7 +412,7 @@ dtx_req_list_send(struct dtx_common_args *dca, bool is_reentrance) dca->dca_drr->drr_result = 0; if (unlikely(dra->dra_opc == DTX_COMMIT && dca->dca_i == 0 && - DAOS_FAIL_CHECK(DAOS_DTX_FAIL_COMMIT))) + DAOS_FAIL_CHECK(DAOS_DTX_PARTIAL_COMMIT_P1))) dtx_req_send(dca->dca_drr, 1); else dtx_req_send(dca->dca_drr, dca->dca_epoch); diff --git a/src/dtx/dtx_srv.c b/src/dtx/dtx_srv.c index 8241fe9dfd8..5b3e7fac7c3 100644 --- a/src/dtx/dtx_srv.c +++ b/src/dtx/dtx_srv.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. 
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -189,7 +190,7 @@ dtx_handler(crt_rpc_t *rpc) break; if (unlikely(din->di_epoch == 1)) - D_GOTO(out, rc = -DER_IO); + daos_fail_loc_set(DAOS_DTX_PARTIAL_COMMIT_P2); while (i < din->di_dtx_array.ca_count) { if (i + count > din->di_dtx_array.ca_count) @@ -205,6 +206,9 @@ dtx_handler(crt_rpc_t *rpc) i += count; } + if (unlikely(din->di_epoch == 1)) + D_GOTO(out, rc = -DER_IO); + if (din->di_flags.ca_count > 0) flags = din->di_flags.ca_arrays; diff --git a/src/include/daos/common.h b/src/include/daos/common.h index ad252f12681..448a20c030f 100644 --- a/src/include/daos/common.h +++ b/src/include/daos/common.h @@ -848,11 +848,13 @@ enum { #define DAOS_DTX_RESEND_DELAY1 (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x48) #define DAOS_DTX_UNCERTAIN (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x49) #define DAOS_DTX_RESYNC_DELAY (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x4a) -#define DAOS_DTX_FAIL_COMMIT (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x4b) #define DAOS_OBJ_SYNC_RETRY (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x4c) #define DAOS_OBJ_COLL_SPARSE (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x4d) +#define DAOS_DTX_PARTIAL_COMMIT_P1 (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x4e) +#define DAOS_DTX_PARTIAL_COMMIT_P2 (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x4f) + #define DAOS_NVME_FAULTY (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x50) #define DAOS_NVME_WRITE_ERR (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x51) #define DAOS_NVME_READ_ERR (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x52) diff --git a/src/include/daos_srv/evtree.h b/src/include/daos_srv/evtree.h index 63224259ccc..8ae4f12638e 100644 --- a/src/include/daos_srv/evtree.h +++ b/src/include/daos_srv/evtree.h @@ -1,5 +1,6 @@ /** * (C) Copyright 2017-2023 Intel Corporation. 
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -810,4 +811,16 @@ evt_feats_get(struct evt_root *root) */ int evt_feats_set(struct evt_root *root, struct umem_instance *umm, uint64_t feats); +/** Validate the provided evt. + * + * Note: It is designed for catastrophic recovery. Not to perform at run-time. + * + * \param evt[in] the evt descriptor to validate + * \param dtx_lid[in] local id of the DTX entry the evt is supposed to belong to + * + * \return true if evt is valid. + **/ +bool +evt_desc_is_valid(const struct evt_desc *evt, uint32_t dtx_lid); + #endif /* __DAOS_EV_TREE_H__ */ diff --git a/src/include/daos_srv/vos_types.h b/src/include/daos_srv/vos_types.h index dde07ce7ec3..924479ac866 100644 --- a/src/include/daos_srv/vos_types.h +++ b/src/include/daos_srv/vos_types.h @@ -1,5 +1,6 @@ /** * (C) Copyright 2015-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -57,6 +58,10 @@ enum dtx_entry_flags { * on all yet, need to be re-committed. */ DTE_PARTIAL_COMMITTED = (1 << 5), + /* The DTX epoch is sorted locally. */ + DTE_EPOCH_SORTED = (1 << 6), + /* The active DTX entry is redundant, should be discarded. */ + DTE_REDUN = (1 << 7), }; struct dtx_entry { diff --git a/src/tests/ftest/util/telemetry_utils.py b/src/tests/ftest/util/telemetry_utils.py index 17f1753b100..a075bb0ecdd 100644 --- a/src/tests/ftest/util/telemetry_utils.py +++ b/src/tests/ftest/util/telemetry_utils.py @@ -1,5 +1,7 @@ """ (C) Copyright 2021-2024 Intel Corporation. 
+(C) Copyright 2025 Hewlett Packard Enterprise Development LP +(C) Copyright 2025 Google LLC SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -207,6 +209,8 @@ class TelemetryUtils(): _gen_stats_metrics("engine_io_dtx_committable") ENGINE_IO_DTX_COMMITTED_METRICS = \ _gen_stats_metrics("engine_io_dtx_committed") + ENGINE_IO_DTX_INVALID_METRICS = \ + _gen_stats_metrics("engine_io_dtx_invalid") ENGINE_IO_LATENCY_FETCH_METRICS = \ _gen_stats_metrics("engine_io_latency_fetch") ENGINE_IO_LATENCY_BULK_FETCH_METRICS = \ @@ -310,6 +314,7 @@ class TelemetryUtils(): ENGINE_IO_METRICS = ENGINE_IO_DTX_ASYNC_CMT_LAT_METRICS +\ ENGINE_IO_DTX_COMMITTABLE_METRICS +\ ENGINE_IO_DTX_COMMITTED_METRICS +\ + ENGINE_IO_DTX_INVALID_METRICS +\ ENGINE_IO_LATENCY_FETCH_METRICS +\ ENGINE_IO_LATENCY_BULK_FETCH_METRICS +\ ENGINE_IO_LATENCY_VOS_FETCH_METRICS +\ diff --git a/src/tests/suite/daos_base_tx.c b/src/tests/suite/daos_base_tx.c index 5752ade4fb8..7d24fa53db3 100644 --- a/src/tests/suite/daos_base_tx.c +++ b/src/tests/suite/daos_base_tx.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. 
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -847,41 +848,85 @@ dtx_20(void **state) static void dtx_21(void **state) { - test_arg_t *arg = *state; - char *update_buf; - const char *dkey = dts_dtx_dkey; - const char *akey = dts_dtx_akey; - daos_obj_id_t oid; - struct ioreq req; + test_arg_t *arg = *state; + char *update_buf; + const char *dkey = dts_dtx_dkey; + const char *akey = dts_dtx_akey; + daos_obj_id_t oid; + struct ioreq req; + daos_pool_info_t info_prep = { 0 }; + daos_pool_info_t info_post = { 0 }; + int rc; FAULT_INJECTION_REQUIRED(); - print_message("do not abort partially committed DTX\n"); + print_message("handle partially committed DTX\n"); if (!test_runable(arg, dts_dtx_replica_cnt)) return; - D_ALLOC(update_buf, dts_dtx_iosize); - assert_non_null(update_buf); - dts_buf_render(update_buf, dts_dtx_iosize); - - oid = daos_test_oid_gen(arg->coh, dts_dtx_class, 0, 0, arg->myrank); - ioreq_init(&req, arg->coh, oid, DAOS_IOD_ARRAY, arg); - - dtx_set_fail_loc(arg, DAOS_DTX_FAIL_COMMIT); /* - * The DTX that create the object will trigger synchronous commit. One of - * the replicas will fail commit locally because of DAOS_DTX_FAIL_COMMIT. - * But the other replicas will commit successfully, then related data can - * be accessed. + * The DTX that creates the object will trigger synchronous commit. One non-leader + * replica will commit locally but leave the active DTX entry on disk, then report + * IO failure, which will cause the DTX on leader to be marked as partially committed. + * + * Then restart the system, such DTX will be re-loaded from disk. The subsequent + * DTX resync will re-commit such partially committed DTX. The logic on non-leader + * should be able to detect and handle such re-committed DTX; otherwise, it will + * trigger assertion. + * + * To simplify the test logic, only perform the test on rank_0. 
*/ - insert_single(dkey, akey, 0, update_buf, dts_dtx_iosize, DAOS_TX_NONE, &req); - dtx_set_fail_loc(arg, 0); - dtx_check_replicas(dkey, akey, "update_succ", update_buf, dts_dtx_iosize, &req); + par_barrier(PAR_COMM_WORLD); + if (arg->myrank == 0) { + rc = daos_pool_query(arg->pool.poh, NULL, &info_prep, NULL, NULL); + assert_true(rc == 0); - D_FREE(update_buf); - ioreq_fini(&req); + D_ALLOC(update_buf, dts_dtx_iosize); + assert_non_null(update_buf); + dts_buf_render(update_buf, dts_dtx_iosize); + + oid = daos_test_oid_gen(arg->coh, OC_RP_3GX, 0, 0, arg->myrank); + ioreq_init(&req, arg->coh, oid, DAOS_IOD_ARRAY, arg); + + daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_LOC, + DAOS_DTX_PARTIAL_COMMIT_P1, 0, NULL); + + print_message("Generating partially committed DTX ...\n"); + insert_single(dkey, akey, 0, update_buf, dts_dtx_iosize, DAOS_TX_NONE, &req); + + D_FREE(update_buf); + ioreq_fini(&req); + daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_LOC, 0, 0, NULL); + } + par_barrier(PAR_COMM_WORLD); + + test_teardown_cont_hdl(arg); + daos_pool_disconnect(arg->pool.poh, NULL); + arg->pool.poh = DAOS_HDL_INVAL; + + par_barrier(PAR_COMM_WORLD); + if (arg->myrank == 0) { + print_message("Stopping system ...\n"); + dmg_system_stop_rank(dmg_config_file, CRT_NO_RANK, false); + + print_message("Starting system ...\n"); + dmg_system_start_rank(dmg_config_file, CRT_NO_RANK); + + /* Sleep a while for DTX resync. 
*/ + sleep(30); + + rc = daos_pool_connect(arg->pool.pool_str, arg->group, DAOS_PC_RW, + &arg->pool.poh, NULL, NULL); + assert_true(rc == 0); + + rc = daos_pool_query(arg->pool.poh, NULL, &info_post, NULL, NULL); + assert_true(rc == 0); + + assert_int_equal(info_prep.pi_ndisabled, info_post.pi_ndisabled); + } + par_barrier(PAR_COMM_WORLD); } static int @@ -945,7 +990,7 @@ static const struct CMUnitTest dtx_tests[] = { dtx_19, NULL, test_case_teardown}, {"DTX20: race between DTX refresh and DTX resync", dtx_20, dtx_base_rf1_setup, rebuild_sub_teardown}, - {"DTX21: do not abort partially committed DTX", + {"DTX21: handle partially committed DTX", dtx_21, dtx_base_rf0_setup, rebuild_sub_teardown}, }; diff --git a/src/vos/evt_priv.h b/src/vos/evt_priv.h index e855a9c74b2..bbf94ecd6cf 100644 --- a/src/vos/evt_priv.h +++ b/src/vos/evt_priv.h @@ -1,5 +1,7 @@ /** * (C) Copyright 2017-2022 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -115,7 +117,6 @@ struct evt_context { umem_off2ptr(evt_umm(tcx), offset) #define EVT_NODE_MAGIC 0xf00d -#define EVT_DESC_MAGIC 0xbeefdead /** Convert an offset to a evtree node descriptor * \param[IN] tcx Tree context diff --git a/src/vos/evtree.c b/src/vos/evtree.c index d635453f8b2..883e3820bc6 100644 --- a/src/vos/evtree.c +++ b/src/vos/evtree.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2017-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -4086,3 +4087,12 @@ evt_feats_set(struct evt_root *root, struct umem_instance *umm, uint64_t feats) return rc; } +bool +evt_desc_is_valid(const struct evt_desc *evt, uint32_t dtx_lid) +{ + if (evt == NULL || evt->dc_magic != EVT_DESC_MAGIC) { + return false; + } + + return (evt->dc_dtx == dtx_lid); +} diff --git a/src/vos/ilog.c b/src/vos/ilog.c index 1d1d6508087..86ce9c2afea 100644 --- a/src/vos/ilog.c +++ b/src/vos/ilog.c @@ -1,5 +1,7 @@ /** * (C) Copyright 2019-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -390,16 +392,18 @@ ilog_create(struct umem_instance *umm, struct ilog_df *root) return rc; } -#define ILOG_ASSERT_VALID(root_df) \ - do { \ - struct ilog_root *_root; \ - \ - _root = (struct ilog_root *)(root_df); \ - D_ASSERTF((_root != NULL) && \ - ILOG_MAGIC_VALID(_root->lr_magic), \ - "Invalid ilog root detected %p magic=%#x\n", \ - _root, _root == NULL ? 0 : _root->lr_magic); \ - } while (0) +#define ILOG_CHECK_VALID(root_df) \ + ({ \ + struct ilog_root *_root = NULL; \ + D_ASSERT((root_df) != NULL); \ + _root = (struct ilog_root *)(root_df); \ + if (!ILOG_MAGIC_VALID(_root->lr_magic)) { \ + D_WARN("Invalid ilog root detected %p magic=%#x\n", _root, \ + _root == NULL ? 
0 : _root->lr_magic); \ + _root = NULL; \ + } \ + _root != NULL; \ + }) int ilog_open(struct umem_instance *umm, struct ilog_df *root, @@ -408,7 +412,8 @@ ilog_open(struct umem_instance *umm, struct ilog_df *root, struct ilog_context *lctx; int rc; - ILOG_ASSERT_VALID(root); + if (!ILOG_CHECK_VALID(root)) + return -DER_NONEXIST; rc = ilog_ctx_create(umm, (struct ilog_root *)root, cbs, &lctx); if (rc != 0) @@ -474,7 +479,7 @@ ilog_destroy(struct umem_instance *umm, int rc = 0; struct ilog_array_cache cache = {0}; - ILOG_ASSERT_VALID(root); + D_ASSERT(ILOG_CHECK_VALID(root)); rc = ilog_tx_begin(&lctx); if (rc != 0) { @@ -984,8 +989,12 @@ ilog_modify(daos_handle_t loh, const struct ilog_id *id_in, "%s in incarnation log " DF_X64 " status: rc=" DF_RC " tree_version: %d\n", opc_str[opc], id_in->id_epoch, DP_RC(rc), ilog_mag2ver(lctx->ic_root->lr_magic)); - if (rc == 0 && version != ilog_mag2ver(lctx->ic_root->lr_magic) && - (opc == ILOG_OP_PERSIST || opc == ILOG_OP_ABORT)) { + if (rc == 0 && opc != ILOG_OP_UPDATE) { + if (version == ilog_mag2ver(lctx->ic_root->lr_magic)) { + D_WARN("ilog entry on %s doesn't exist\n", opc_str[opc]); + return -DER_NONEXIST; + } + /** If we persisted or aborted an entry successfully, * invoke the callback, if applicable but without * deregistration @@ -1213,7 +1222,7 @@ ilog_fetch(struct umem_instance *umm, struct ilog_df *root_df, int rc = 0; bool retry; - ILOG_ASSERT_VALID(root_df); + D_ASSERT(ILOG_CHECK_VALID(root_df)); root = (struct ilog_root *)root_df; @@ -1539,7 +1548,7 @@ ilog_aggregate(struct umem_instance *umm, struct ilog_df *ilog, root = lctx->ic_root; - ILOG_ASSERT_VALID(root); + D_ASSERT(ILOG_CHECK_VALID(root)); D_ASSERT(!ilog_empty(root)); /* ilog_fetch should have failed */ diff --git a/src/vos/tests/vts_ilog.c b/src/vos/tests/vts_ilog.c index c696ff0b487..3982fc4c807 100644 --- a/src/vos/tests/vts_ilog.c +++ b/src/vos/tests/vts_ilog.c @@ -1,5 +1,7 @@ /** * (C) Copyright 2019-2024 Intel Corporation. 
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -530,6 +532,12 @@ ilog_test_update(void **state) rc = entries_check(umm, ilog, &ilog_callbacks, NULL, 0, entries); assert_rc_equal(rc, 0); + /* Test non-existent tx */ + id.id_epoch = epoch; + id.id_tx_id = current_tx_id.id_tx_id + 4000; + rc = ilog_persist(loh, &id); + assert_rc_equal(rc, -DER_NONEXIST); + /* Commit the punch ilog. */ id.id_epoch = epoch; id.id_tx_id = current_tx_id.id_tx_id; @@ -668,6 +676,12 @@ ilog_test_abort(void **state) rc = entries_check(umm, ilog, &ilog_callbacks, NULL, 0, entries); assert_rc_equal(rc, 0); + /* Test non-existent tx */ + id = current_tx_id; + id.id_tx_id += 400; + rc = ilog_abort(loh, &id); + assert_rc_equal(rc, -DER_NONEXIST); + id = current_tx_id; rc = ilog_abort(loh, &id); LOG_FAIL(rc, 0, "Failed to abort log entry\n"); @@ -735,6 +749,11 @@ ilog_test_abort(void **state) rc = ilog_destroy(umm, &ilog_callbacks, ilog); assert_rc_equal(rc, 0); + /** Test open of "reallocated" ilog */ + memset(ilog, 0xa1, sizeof(*ilog)); + rc = ilog_open(umm, ilog, &ilog_callbacks, false, &loh); + assert_rc_equal(rc, -DER_NONEXIST); + assert_true(d_list_empty(&fake_tx_list)); ilog_free_root(umm, ilog); } diff --git a/src/vos/vos_common.c b/src/vos/vos_common.c index 011f8a8ccd5..cedda5dc756 100644 --- a/src/vos/vos_common.c +++ b/src/vos/vos_common.c @@ -581,6 +581,12 @@ vos_tls_init(int tags, int xs_id, int tgt_id) D_WARN("Failed to create committed cnt sensor: "DF_RC"\n", DP_RC(rc)); + rc = d_tm_add_metric(&tls->vtl_invalid_dtx, D_TM_STATS_GAUGE, + "Number of invalid active DTX", "entries", + "io/dtx/invalid/tgt_%u", tgt_id); + if (rc) + D_WARN("Failed to create invalid DTX cnt sensor: " DF_RC "\n", DP_RC(rc)); + rc = d_tm_add_metric(&tls->vtl_obj_cnt, D_TM_GAUGE, "Number of cached vos object", "entry", "mem/vos/vos_obj_%u/tgt_%u", diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index 
2cf6ad2ec45..cb14e4c9f60 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -1,6 +1,7 @@ /** * (C) Copyright 2019-2024 Intel Corporation. * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -573,7 +574,7 @@ dtx_ilog_rec_release(struct umem_instance *umm, struct vos_container *cont, ilog_close(loh); - if (rc != 0) + if (rc != 0 && rc != -DER_NONEXIST) D_ERROR("Failed to release ilog rec for "DF_DTI", abort %s: "DF_RC"\n", DP_DTI(&DAE_XID(dae)), abort ? "yes" : "no", DP_RC(rc)); @@ -598,6 +599,12 @@ do_dtx_rec_release(struct umem_instance *umm, struct vos_container *cont, struct vos_irec_df *svt; svt = umem_off2ptr(umm, umem_off2offset(rec)); + + if (!vos_irec_is_valid(svt, DAE_LID(dae))) { + rc = -DER_NONEXIST; + break; + } + if (abort) { if (DAE_INDEX(dae) != DTX_INDEX_INVAL) { rc = umem_tx_add_ptr(umm, &svt->ir_dtx, @@ -621,6 +628,12 @@ do_dtx_rec_release(struct umem_instance *umm, struct vos_container *cont, struct evt_desc *evt; evt = umem_off2ptr(umm, umem_off2offset(rec)); + + if (!evt_desc_is_valid(evt, DAE_LID(dae))) { + rc = -DER_NONEXIST; + break; + } + if (abort) { if (DAE_INDEX(dae) != DTX_INDEX_INVAL) { rc = umem_tx_add_ptr(umm, &evt->dc_dtx, @@ -648,6 +661,13 @@ do_dtx_rec_release(struct umem_instance *umm, struct vos_container *cont, break; } + if (unlikely(rc == -DER_NONEXIST)) { + D_WARN("DTX record no longer exists, may indicate some corruption: " + DF_DTI " type %u, discard\n", + DP_DTI(&DAE_XID(dae)), dtx_umoff_flag2type(rec)); + d_tm_inc_gauge(vos_tls_get(cont->vc_pool->vp_sysdb)->vtl_invalid_dtx, 1); + } + return rc; } @@ -657,6 +677,7 @@ dtx_rec_release(struct vos_container *cont, struct vos_dtx_act_ent *dae, bool ab struct umem_instance *umm = vos_cont2umm(cont); struct vos_dtx_act_ent_df *dae_df; struct vos_dtx_blob_df *dbd; + bool invalid = false; int count; int i; int rc = 0; @@ -685,42 +706,70 @@ dtx_rec_release(struct vos_container 
*cont, struct vos_dtx_act_ent *dae, bool ab abort ? "abort" : "commit", DP_DTI(&DAE_XID(dae)), dbd, DP_UUID(cont->vc_pool->vp_id), DP_UUID(cont->vc_id)); - if (dae->dae_records != NULL) { + /* Handle DTX records as FIFO order to find out potential invalid DTX earlier. */ + + if (DAE_REC_CNT(dae) > DTX_INLINE_REC_CNT) + count = DTX_INLINE_REC_CNT; + else + count = DAE_REC_CNT(dae); + + for (i = 0; i < count; i++) { + rc = do_dtx_rec_release(umm, cont, dae, DAE_REC_INLINE(dae)[i], abort); + if (unlikely(rc == -DER_NONEXIST)) { + invalid = true; + break; + } + if (rc != 0) + return rc; + } + + if (!invalid && dae->dae_records != NULL) { D_ASSERT(DAE_REC_CNT(dae) > DTX_INLINE_REC_CNT); D_ASSERT(!UMOFF_IS_NULL(dae_df->dae_rec_off)); - for (i = DAE_REC_CNT(dae) - DTX_INLINE_REC_CNT - 1; i >= 0; i--) { + for (i = 0; i < DAE_REC_CNT(dae) - DTX_INLINE_REC_CNT; i++) { rc = do_dtx_rec_release(umm, cont, dae, dae->dae_records[i], abort); + if (unlikely(rc == -DER_NONEXIST)) { + invalid = true; + break; + } if (rc != 0) return rc; } + } + + /* + * Inject fault to simulate the case of DTX records are left on disk after commit. + * Then when container re-open (such as for engine restart), it will be re-loaded + * from disk and then trigger DTX re-commit via DTX resync. + */ + if (unlikely(DAOS_FAIL_CHECK(DAOS_DTX_PARTIAL_COMMIT_P2))) { + rc = umem_tx_add_ptr(umm, &dae_df->dae_flags, sizeof(dae_df->dae_flags)); + if (rc != 0) + return rc; + + dae_df->dae_flags |= DTE_REDUN; + return 0; + } + + /* It is expected to detect some invalid DTX records. Otherwise assert for test. 
*/ + if (DAE_REC_CNT(dae) != 0 && unlikely(DAE_FLAGS(dae) & DTE_REDUN)) + D_ASSERT(invalid); + if (!UMOFF_IS_NULL(dae_df->dae_rec_off)) { rc = umem_free(umm, dae_df->dae_rec_off); if (rc != 0) return rc; - if (keep_act) { + if (!invalid && keep_act) { rc = umem_tx_add_ptr(umm, &dae_df->dae_rec_off, sizeof(dae_df->dae_rec_off)); if (rc != 0) return rc; - dae_df->dae_rec_off = UMOFF_NULL; } - - count = DTX_INLINE_REC_CNT; - } else { - D_ASSERT(DAE_REC_CNT(dae) <= DTX_INLINE_REC_CNT); - - count = DAE_REC_CNT(dae); } - for (i = count - 1; i >= 0; i--) { - rc = do_dtx_rec_release(umm, cont, dae, DAE_REC_INLINE(dae)[i], abort); - if (rc != 0) - return rc; - } - - if (keep_act) { + if (!invalid && keep_act) { /* When re-commit partial committed DTX, the count can be zero. */ if (dae_df->dae_rec_cnt > 0) { rc = umem_tx_add_ptr(umm, &dae_df->dae_rec_cnt, @@ -747,6 +796,9 @@ dtx_rec_release(struct vos_container *cont, struct vos_dtx_act_ent *dae, bool ab return 0; } + if (invalid) + rc = 0; + if (!UMOFF_IS_NULL(dae_df->dae_mbs_off)) { /* dae_mbs_off will be invalid via flag DTE_INVALID. 
*/ rc = umem_free(umm, dae_df->dae_mbs_off); @@ -2223,8 +2275,7 @@ vos_dtx_post_handle(struct vos_container *cont, } if (!abort && dces != NULL) { - struct vos_tls *tls = vos_tls_get(false); - int j = 0; + int j = 0; D_ASSERT(cont->vc_pool->vp_sysdb == false); for (i = 0; i < count; i++) { @@ -2235,7 +2286,7 @@ vos_dtx_post_handle(struct vos_container *cont, if (j > 0) { cont->vc_dtx_committed_count += j; cont->vc_pool->vp_dtx_committed_count += j; - d_tm_inc_gauge(tls->vtl_committed, j); + d_tm_inc_gauge(vos_tls_get(cont->vc_pool->vp_sysdb)->vtl_committed, j); } } @@ -2552,7 +2603,6 @@ vos_dtx_set_flags(daos_handle_t coh, struct dtx_id dtis[], int count, uint32_t f int vos_dtx_aggregate(daos_handle_t coh) { - struct vos_tls *tls = vos_tls_get(false); struct vos_container *cont; struct vos_cont_df *cont_df; struct umem_instance *umm; @@ -2671,7 +2721,7 @@ vos_dtx_aggregate(daos_handle_t coh) cont->vc_dtx_committed_count -= count; cont->vc_pool->vp_dtx_committed_count -= count; - d_tm_dec_gauge(tls->vtl_committed, count); + d_tm_dec_gauge(vos_tls_get(cont->vc_pool->vp_sysdb)->vtl_committed, count); } DL_CDEBUG(rc != 0, DLOG_ERR, DB_IO, rc, diff --git a/src/vos/vos_ilog.c b/src/vos/vos_ilog.c index 54abf2f407f..19bf0102e6e 100644 --- a/src/vos/vos_ilog.c +++ b/src/vos/vos_ilog.c @@ -1,5 +1,7 @@ /** * (C) Copyright 2019-2024 Intel Corporation. 
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -427,6 +429,7 @@ int vos_ilog_update_(struct vos_container *cont, struct ilog_df *ilog, vos_ilog_desc_cbs_init(&cbs, vos_cont2hdl(cont)); rc = ilog_open(vos_cont2umm(cont), ilog, &cbs, dth == NULL, &loh); + D_ASSERTF(rc != -DER_NONEXIST, "Uncorrectable incarnation log corruption detected"); if (rc != 0) { D_ERROR("Could not open incarnation log: "DF_RC"\n", DP_RC(rc)); return rc; @@ -522,6 +525,7 @@ vos_ilog_punch_(struct vos_container *cont, struct ilog_df *ilog, punch_log: vos_ilog_desc_cbs_init(&cbs, vos_cont2hdl(cont)); rc = ilog_open(vos_cont2umm(cont), ilog, &cbs, dth == NULL, &loh); + D_ASSERTF(rc != -DER_NONEXIST, "Uncorrectable incarnation log corruption detected"); if (rc != 0) { D_ERROR("Could not open incarnation log: "DF_RC"\n", DP_RC(rc)); return rc; diff --git a/src/vos/vos_internal.h b/src/vos/vos_internal.h index 1e0a219df0b..05a2ff7afd4 100644 --- a/src/vos/vos_internal.h +++ b/src/vos/vos_internal.h @@ -1,6 +1,7 @@ /** * (C) Copyright 2016-2024 Intel Corporation. * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -454,6 +455,8 @@ struct vos_dtx_cmt_ent { #define DCE_EPOCH(dce) ((dce)->dce_base.dce_epoch) #define DCE_CMT_TIME(dce) ((dce)->dce_base.dce_cmt_time) +#define EVT_DESC_MAGIC 0xbeefdead + extern uint64_t vos_evt_feats; /** Flags for internal use - Bit 63 can be used for another purpose so as to @@ -1858,4 +1861,16 @@ vos_io_scm(struct vos_pool *pool, daos_iod_type_t type, daos_size_t size, enum v int vos_insert_oid(struct dtx_handle *dth, struct vos_container *cont, daos_unit_oid_t *oid); +/** Validate the provided svt. + * + * Note: It is designed for catastrophic recovery. Not to perform at run-time. 
+ * + * \param svt[in] record to validate + * \param dtx_lid[in] local id of the DTX entry the svt is supposed to belong to + * + * \return true if svt is valid. + **/ +bool +vos_irec_is_valid(const struct vos_irec_df *svt, uint32_t dtx_lid); + #endif /* __VOS_INTERNAL_H__ */ diff --git a/src/vos/vos_obj_index.c b/src/vos/vos_obj_index.c index b8123672aa4..35ff96872df 100644 --- a/src/vos/vos_obj_index.c +++ b/src/vos/vos_obj_index.c @@ -1,5 +1,7 @@ /** * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -289,6 +291,7 @@ vos_oi_find_alloc(struct vos_container *cont, daos_unit_oid_t oid, goto skip_log; vos_ilog_desc_cbs_init(&cbs, vos_cont2hdl(cont)); rc = ilog_open(vos_cont2umm(cont), &obj->vo_ilog, &cbs, dth == NULL, &loh); + D_ASSERTF(rc != -DER_NONEXIST, "Uncorrectable incarnation log corruption detected"); if (rc != 0) return rc; diff --git a/src/vos/vos_tls.h b/src/vos/vos_tls.h index 2fc328457d0..11f45beef17 100644 --- a/src/vos/vos_tls.h +++ b/src/vos/vos_tls.h @@ -1,5 +1,6 @@ /** * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -63,6 +64,7 @@ struct vos_tls { bool vtl_hash_set; }; struct d_tm_node_t *vtl_committed; + struct d_tm_node_t *vtl_invalid_dtx; struct d_tm_node_t *vtl_obj_cnt; struct d_tm_node_t *vtl_lru_alloc_size; }; diff --git a/src/vos/vos_tree.c b/src/vos/vos_tree.c index c36fcaa88c5..4beeb7e766f 100644 --- a/src/vos/vos_tree.c +++ b/src/vos/vos_tree.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1292,3 +1293,13 @@ obj_tree_find_attr(unsigned tree_class, int flags) return NULL; } } + +bool +vos_irec_is_valid(const struct vos_irec_df *svt, uint32_t dtx_lid) +{ + if (svt == NULL) { + return false; + } + + return svt->ir_dtx == dtx_lid; +}