diff --git a/src/third_party/wiredtiger/src/history/hs_rec.c b/src/third_party/wiredtiger/src/history/hs_rec.c index c11d672c6db..6fa727305d7 100644 --- a/src/third_party/wiredtiger/src/history/hs_rec.c +++ b/src/third_party/wiredtiger/src/history/hs_rec.c @@ -1153,9 +1153,7 @@ __hs_delete_record( { WT_DECL_RET; bool hs_read_committed; -#ifdef HAVE_DIAGNOSTIC WT_TIME_WINDOW *hs_tw; -#endif if (r->hs_cursor == NULL) WT_RET(__wt_curhs_open(session, NULL, &r->hs_cursor)); @@ -1171,11 +1169,22 @@ __hs_delete_record( WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_before(session, r->hs_cursor), true); /* It's possible the value in the history store becomes obsolete concurrently. */ if (ret == WT_NOTFOUND) { + /* + * The history store update may not exist even if there is no tombstone associated with it + * as this update may have already been removed by rollback to stable. + */ WT_ASSERT(session, tombstone != NULL && __wt_txn_upd_visible_all(session, tombstone)); ret = 0; } else { -#ifdef HAVE_DIAGNOSTIC + /* + * We have found a record that is not obsolete. However, we only want to delete a record if + * it has a stop timestamp greater than the start timestamp of the update. + */ __wt_hs_upd_time_window(r->hs_cursor, &hs_tw); + if (hs_tw->stop_ts <= upd->start_ts) + goto done; + +#ifdef HAVE_DIAGNOSTIC WT_ASSERT(session, hs_tw->start_txn == WT_TXN_NONE || hs_tw->start_txn == upd->txnid); WT_ASSERT(session, hs_tw->start_ts == WT_TS_NONE || hs_tw->start_ts == upd->start_ts); WT_ASSERT(session, @@ -1203,7 +1212,7 @@ err: /* * __wt_hs_delete_updates -- - * Delete the updates from the history store + * Delete the updates from the history store. */ int __wt_hs_delete_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r) diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index 30f8e588b03..4415720e7a1 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -156,6 +156,17 @@ __rollback_abort_update(WT_SESSION_IMPL *session, WT_ITEM *key, WT_UPDATE *first F_CLR(stable_upd, WT_UPDATE_HS | WT_UPDATE_TO_DELETE_FROM_HS); if (tombstone != NULL) F_CLR(tombstone, WT_UPDATE_HS | WT_UPDATE_TO_DELETE_FROM_HS); + } else if (WT_IS_HS(session->dhandle) && stable_upd->type != WT_UPDATE_TOMBSTONE) { + /* + * History store will have a combination of both tombstone and update/modify types in + * the update list to represent the time window of an update. When we are aborting the + * tombstone, make sure to remove all of the remaining updates also. In most of the + * scenarios, there will be only one update present except when the data store is a + * prepared commit where it is possible to have more than one update. The existing + * on-disk versions are removed while processing the on-disk entries. + */ + for (; stable_upd != NULL; stable_upd = stable_upd->next) + stable_upd->txnid = WT_TXN_ABORTED; } if (stable_update_found != NULL) *stable_update_found = true; @@ -1516,16 +1527,18 @@ __rollback_to_stable_hs_final_pass(WT_SESSION_IMPL *session, wt_timestamp_t roll WT_ERR(__wt_session_get_dhandle(session, WT_HS_URI, NULL, NULL, 0)); /* - * The rollback operation should be performed on the history store file when the checkpoint - * durable start/stop timestamp is greater than the rollback timestamp. But skip if there is no - * stable timestamp. + * The rollback operation should be skipped if there is no stable timestamp. Otherwise, it + * should be performed if one of the following criteria is satisfied: + * - The history store has dirty content. + * - The checkpoint durable start/stop timestamp is greater than the rollback timestamp. * * Note that the corresponding code in __rollback_to_stable_btree_apply also checks whether * there _are_ timestamped updates by checking max_durable_ts; that check is redundant here for * several reasons, the most immediate being that max_durable_ts cannot be none (zero) because * it's greater than rollback_timestamp, which is itself greater than zero. */ - if (max_durable_ts > rollback_timestamp && rollback_timestamp != WT_TS_NONE) { + if ((S2BT(session)->modified || max_durable_ts > rollback_timestamp) && + rollback_timestamp != WT_TS_NONE) { __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), "tree rolled back with durable timestamp: %s", __wt_timestamp_to_string(max_durable_ts, ts_string[0])); @@ -1654,13 +1667,11 @@ __rollback_to_stable_btree_apply( } /* - * The rollback to stable will skip the tables during recovery and shutdown in the following - * conditions. - * 1. Empty table. - * 2. Table has timestamped updates without a stable timestamp. + * During recovery, a table is skipped by RTS if one of the conditions is met: + * 1. The table is empty or newly-created. + * 2. The table has timestamped updates without a stable timestamp. */ - if ((F_ISSET(S2C(session), WT_CONN_RECOVERING) || - F_ISSET(S2C(session), WT_CONN_CLOSING_CHECKPOINT)) && + if (F_ISSET(S2C(session), WT_CONN_RECOVERING) && (addr_size == 0 || (rollback_timestamp == WT_TS_NONE && max_durable_ts != WT_TS_NONE))) { __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), "skip rollback to stable on file %s because %s", uri, @@ -1812,7 +1823,16 @@ __rollback_to_stable_btree_apply_all(WT_SESSION_IMPL *session, wt_timestamp_t ro } WT_ERR_NOTFOUND_OK(ret, false); - if (F_ISSET(S2C(session), WT_CONN_RECOVERING)) + /* + * Performing eviction in parallel to a checkpoint can lead to a situation where the history + * store has more updates than its corresponding data store. Performing history store cleanup at + * the end can enable the removal of any such unstable updates that are written to the history + * store. + * + * Do not perform the final pass on the history store in an in-memory configuration as it + * doesn't exist. + */ + if (!F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) WT_ERR(__rollback_to_stable_hs_final_pass(session, rollback_timestamp)); err: