diff --git a/jstests/sharding/reshardingStepdownDuringCriticalSection.js b/jstests/sharding/reshardingStepdownDuringCriticalSection.js
new file mode 100644
index 00000000000..b931a4491e5
--- /dev/null
+++ b/jstests/sharding/reshardingStepdownDuringCriticalSection.js
@@ -0,0 +1,67 @@
+//
+// Tests that reshardCollection fails with ReshardingCriticalSectionTimeout when the config server
+// primary is stepped down while the coordinator is paused on pauseAfterEngagingCriticalSection.
+// @tags: [
+//   uses_atclustertime,
+// ]
+//
+
+import {configureFailPoint} from "jstests/libs/fail_point_util.js";
+import {funWithArgs} from "jstests/libs/parallel_shell_helpers.js";
+
+const st = new ShardingTest({mongos: 1, shards: 2});
+const kDbName = 'db';
+const collName = 'foo';
+const ns = kDbName + '.' + collName;
+const kNumInitialDocs = 500;
+
+jsTest.log("Set critical section timeout");
+const criticalSectionTimeoutMS = 10 * 1000; /* 10 seconds */
+st.configRS.nodes.forEach((configNode) => {
+    assert.commandWorked(configNode.getDB("admin").adminCommand(
+        {setParameter: 1, reshardingCriticalSectionTimeoutMillis: criticalSectionTimeoutMS}));
+});
+
+jsTest.log("Create initial collection");
+assert.commandWorked(
+    st.s.adminCommand({enableSharding: kDbName, primaryShard: st.shard0.shardName}));
+assert.commandWorked(st.s.adminCommand({shardCollection: ns, key: {oldKey: 1}}));
+
+const bulk = st.s.getDB(kDbName).getCollection(collName).initializeOrderedBulkOp();
+for (let x = 0; x < kNumInitialDocs; x++) {
+    bulk.insert({oldKey: x, newKey: kNumInitialDocs - x});
+}
+assert.commandWorked(bulk.execute());
+
+// The server-side hook for this failpoint delays the recipient for 120 seconds, well beyond the
+// critical section timeout configured above.
+jsTest.log("Make recipient wait before transitioning to strict");
+const recipientPauseFp =
+    configureFailPoint(st.rs0.getPrimary(), "reshardingPauseBeforeEnteringStrictConsistency");
+
+jsTest.log("Set failpoint");
+const coordinatorPauseFp =
+    configureFailPoint(st.configRS.getPrimary(), "pauseAfterEngagingCriticalSection");
+
+jsTest.log("Launch resharding operation");
+const awaitResult = startParallelShell(
+    funWithArgs(function(ns) {
+        assert.commandFailedWithCode(db.adminCommand({reshardCollection: ns, key: {newKey: 1}}),
+                                     ErrorCodes.ReshardingCriticalSectionTimeout);
+    }, ns), st.s.port);
+
+jsTest.log("Wait for failpoint");
+coordinatorPauseFp.wait();
+
+jsTest.log("Trigger stepdown");
+assert.commandWorked(st.configRS.getPrimary().adminCommand({replSetStepDown: 60, force: 1}));
+
+jsTest.log("Release failpoint");
+coordinatorPauseFp.off();
+
+jsTest.log("Wait for resharding to complete");
+awaitResult();
+
+recipientPauseFp.off();
+
+st.stop();
diff --git a/src/mongo/db/s/resharding/resharding_coordinator_service.cpp b/src/mongo/db/s/resharding/resharding_coordinator_service.cpp
index 733fbe3d655..1ccc07ab124 100644
--- a/src/mongo/db/s/resharding/resharding_coordinator_service.cpp
+++ b/src/mongo/db/s/resharding/resharding_coordinator_service.cpp
@@ -147,6 +147,7 @@ MONGO_FAIL_POINT_DEFINE(reshardingPauseCoordinatorBeforePersistingStateTransitio
 MONGO_FAIL_POINT_DEFINE(pauseBeforeTellDonorToRefresh);
 MONGO_FAIL_POINT_DEFINE(pauseAfterInsertCoordinatorDoc);
 MONGO_FAIL_POINT_DEFINE(pauseBeforeCTHolderInitialization);
+MONGO_FAIL_POINT_DEFINE(pauseAfterEngagingCriticalSection);
 
 const std::string kReshardingCoordinatorActiveIndexName = "ReshardingCoordinatorActiveIndex";
 const Backoff kExponentialBackoff(Seconds(1), Milliseconds::max());
@@ -1486,7 +1487,9 @@ void ReshardingCoordinator::installCoordinatorDoc(
                "oldState"_attr = CoordinatorState_serializer(_coordinatorDoc.getState()),
                logAttrs(doc.getSourceNss()),
                "collectionUUID"_attr = doc.getSourceUUID(),
-               "reshardingUUID"_attr = doc.getReshardingUUID());
+               "reshardingUUID"_attr = doc.getReshardingUUID(),
+               "criticalSectionTimeoutEngaged"_attr =
+                   _criticalSectionTimeoutCbHandle.is_initialized());
 
     const auto previousState = _coordinatorDoc.getState();
     _coordinatorDoc = doc;
@@ -2457,6 +2460,10 @@ ExecutorFuture ReshardingCoordinator::_awaitAllRecipientsFinishedApplying(
             }
 
             _criticalSectionTimeoutCbHandle = swCbHandle.getValue();
+
+            // Test-only hook: lets a test hold the coordinator right after the critical section
+            // timeout callback handle has been stored.
+            pauseAfterEngagingCriticalSection.pauseWhileSet();
         });
 }
 
diff --git a/src/mongo/db/s/resharding/resharding_recipient_service.cpp b/src/mongo/db/s/resharding/resharding_recipient_service.cpp
index 4e013e0b612..a05165c106d 100644
--- a/src/mongo/db/s/resharding/resharding_recipient_service.cpp
+++ b/src/mongo/db/s/resharding/resharding_recipient_service.cpp
@@ -125,6 +125,7 @@ MONGO_FAIL_POINT_DEFINE(reshardingPauseRecipientDuringOplogApplication);
 MONGO_FAIL_POINT_DEFINE(reshardingOpCtxKilledWhileRestoringMetrics);
 MONGO_FAIL_POINT_DEFINE(reshardingRecipientFailsAfterTransitionToCloning);
 MONGO_FAIL_POINT_DEFINE(reshardingPauseRecipientBeforeBuildingIndex);
+MONGO_FAIL_POINT_DEFINE(reshardingPauseBeforeEnteringStrictConsistency);
 
 namespace {
 
@@ -1174,6 +1175,11 @@ void ReshardingRecipientService::RecipientStateMachine::_transitionToApplying(
 
 void ReshardingRecipientService::RecipientStateMachine::_transitionToStrictConsistency(
     const CancelableOperationContextFactory& factory) {
+    if (MONGO_unlikely(reshardingPauseBeforeEnteringStrictConsistency.shouldFail())) {
+        // NOTE(review): this is a fixed 120 second delay rather than a pause; disabling the
+        // failpoint does not wake a thread that is already sleeping here.
+        sleepsecs(120);
+    }
     auto newRecipientCtx = _recipientCtx;
     newRecipientCtx.setState(RecipientStateEnum::kStrictConsistency);
     _transitionState(std::move(newRecipientCtx), boost::none, boost::none, factory);