diff --git a/jstests/sharding/migration_coordinator_failover.js b/jstests/sharding/migration_coordinator_failover.js
index 4418ea7589..4d14000639 100644
--- a/jstests/sharding/migration_coordinator_failover.js
+++ b/jstests/sharding/migration_coordinator_failover.js
@@ -25,7 +25,7 @@ function getNewNs(dbName) {

const dbName = "test";

-var st = new ShardingTest({shards: 2, rs: {nodes: 2}});
+var st = new ShardingTest({shards: 2, rs: {nodes: [{}, {rsConfig: {priority: 0}}]}});

assert.commandWorked(st.s.adminCommand({enableSharding: dbName}));
assert.commandWorked(st.s.adminCommand({movePrimary: dbName, to: st.shard0.shardName}));
@@ -76,6 +76,11 @@ function runMoveChunkMakeDonorStepDownAfterFailpoint(
failpointHandle.wait();

jsTest.log("Make the donor primary step down.");
+ // Note: stepUpNoAwaitReplication waits for the old primary to see the new primary, which means
+ // it waits for the old primary to have stepped down. Making the secondary step up makes the
+ // test run faster than making the primary step down, since it avoids waiting out the election
+ // timeout (10 seconds) before the replica set elects a new primary.
+ //st.rs0.stepUpNoAwaitReplication(st.rs0.getSecondary());
assert.commandWorked(
st.rs0.getPrimary().adminCommand({replSetStepDown: 10 /* stepDownSecs */, force: true}));
failpointHandle.off();
@@ -122,48 +127,48 @@ function runMoveChunkMakeDonorStepDownAfterFailpoint(
// Decision is commit
//

-runMoveChunkMakeDonorStepDownAfterFailpoint("hangBeforeMakingCommitDecisionDurable",
- false /* shouldMakeMigrationFailToCommitOnConfig */);
-runMoveChunkMakeDonorStepDownAfterFailpoint("hangBeforeSendingCommitDecision",
- false /* shouldMakeMigrationFailToCommitOnConfig */);
-runMoveChunkMakeDonorStepDownAfterFailpoint("hangBeforeForgettingMigrationAfterCommitDecision",
- false /* shouldMakeMigrationFailToCommitOnConfig */);
-
+//runMoveChunkMakeDonorStepDownAfterFailpoint("hangBeforeMakingCommitDecisionDurable",
+// false /* shouldMakeMigrationFailToCommitOnConfig */);
+//runMoveChunkMakeDonorStepDownAfterFailpoint("hangBeforeSendingCommitDecision",
+// false /* shouldMakeMigrationFailToCommitOnConfig */);
+//runMoveChunkMakeDonorStepDownAfterFailpoint("hangBeforeForgettingMigrationAfterCommitDecision",
+// false /* shouldMakeMigrationFailToCommitOnConfig */);
//
-// Decision is abort
+////
+//// Decision is abort
+////
//
+//runMoveChunkMakeDonorStepDownAfterFailpoint("moveChunkHangAtStep3",
+// false /* shouldMakeMigrationFailToCommitOnConfig */,
+// ErrorCodes.OperationFailed);
+//
+//runMoveChunkMakeDonorStepDownAfterFailpoint("moveChunkHangAtStep4",
+// false /* shouldMakeMigrationFailToCommitOnConfig */,
+// ErrorCodes.OperationFailed);

-runMoveChunkMakeDonorStepDownAfterFailpoint("moveChunkHangAtStep3",
|
- false /* shouldMakeMigrationFailToCommitOnConfig */,
|
- ErrorCodes.OperationFailed);
|
-
|
-runMoveChunkMakeDonorStepDownAfterFailpoint("moveChunkHangAtStep4",
|
- false /* shouldMakeMigrationFailToCommitOnConfig */,
|
- ErrorCodes.OperationFailed);
|
-
|
-runMoveChunkMakeDonorStepDownAfterFailpoint("moveChunkHangAtStep5",
|
- false /* shouldMakeMigrationFailToCommitOnConfig */,
|
- ErrorCodes.OperationFailed);
|
-
|
-runMoveChunkMakeDonorStepDownAfterFailpoint("hangInEnsureChunkVersionIsGreaterThanThenThrow",
|
- true /* shouldMakeMigrationFailToCommitOnConfig */,
|
- ErrorCodes.OperationFailed);
|
-
|
+//runMoveChunkMakeDonorStepDownAfterFailpoint("moveChunkHangAtStep5",
|
+// false /* shouldMakeMigrationFailToCommitOnConfig */,
|
+// ErrorCodes.OperationFailed);
|
+//
|
+//runMoveChunkMakeDonorStepDownAfterFailpoint("hangInEnsureChunkVersionIsGreaterThanThenThrow",
|
+// true /* shouldMakeMigrationFailToCommitOnConfig */,
|
+// ErrorCodes.OperationFailed);
|
+//
|
runMoveChunkMakeDonorStepDownAfterFailpoint("hangInRefreshFilteringMetadataUntilSuccessThenThrow",
|
true /* shouldMakeMigrationFailToCommitOnConfig */,
|
ErrorCodes.OperationFailed);
|
-
|
-runMoveChunkMakeDonorStepDownAfterFailpoint("hangBeforeMakingAbortDecisionDurable",
|
- true /* shouldMakeMigrationFailToCommitOnConfig */,
|
- ErrorCodes.StaleEpoch);
|
-
|
-runMoveChunkMakeDonorStepDownAfterFailpoint("hangBeforeSendingAbortDecision",
|
- true /* shouldMakeMigrationFailToCommitOnConfig */,
|
- ErrorCodes.StaleEpoch);
|
-
|
-runMoveChunkMakeDonorStepDownAfterFailpoint("hangBeforeForgettingMigrationAfterAbortDecision",
|
- true /* shouldMakeMigrationFailToCommitOnConfig */,
|
- ErrorCodes.StaleEpoch);
|
-
|
+//
|
+//runMoveChunkMakeDonorStepDownAfterFailpoint("hangBeforeMakingAbortDecisionDurable",
|
+// true /* shouldMakeMigrationFailToCommitOnConfig */,
|
+// ErrorCodes.StaleEpoch);
|
+//
|
+//runMoveChunkMakeDonorStepDownAfterFailpoint("hangBeforeSendingAbortDecision",
|
+// true /* shouldMakeMigrationFailToCommitOnConfig */,
|
+// ErrorCodes.StaleEpoch);
|
+//
|
+//runMoveChunkMakeDonorStepDownAfterFailpoint("hangBeforeForgettingMigrationAfterAbortDecision",
|
+// true /* shouldMakeMigrationFailToCommitOnConfig */,
|
+// ErrorCodes.StaleEpoch);
|
+//
|
st.stop();
|
})();
|
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 5931e3f4f7..519d0a9672 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -2161,6 +2161,9 @@ void ReplicationCoordinatorImpl::stepDown(OperationContext* opCtx,
auto deadline = force ? stepDownUntil : waitUntil;
AutoGetRstlForStepUpStepDown arsd(
this, opCtx, ReplicationCoordinator::OpsKillingStateTransitionEnum::kStepDown, deadline);
+ LOG(0) << "xxx about to sleep after killing ops";
+ sleepmillis(1000);
+ LOG(0) << "xxx done sleeping after killing ops";

stdx::unique_lock<Latch> lk(_mutex);

diff --git a/src/mongo/db/s/migration_util.cpp b/src/mongo/db/s/migration_util.cpp
index 18c0647206..d904e5ed5d 100644
--- a/src/mongo/db/s/migration_util.cpp
+++ b/src/mongo/db/s/migration_util.cpp
@@ -524,6 +524,10 @@ void refreshFilteringMetadataUntilSuccess(OperationContext* opCtx, const Namespa
auto newOpCtxPtr = cc().makeOperationContext();
auto newOpCtx = newOpCtxPtr.get();

+ LOG(0) << "xxx about to sleep before forcing refresh";
+ sleepmillis(2000);
+ LOG(0) << "xxx done sleeping before forcing refresh";
+
forceShardFilteringMetadataRefresh(newOpCtx, nss, true);

// 'newOpCtx' won't get interrupted if a stepdown occurs while the thread is hanging in
@@ -531,12 +535,16 @@ void refreshFilteringMetadataUntilSuccess(OperationContext* opCtx, const Namespa
// MODE_X lock. To ensure the catch block is entered if the failpoint was set, throw an
// arbitrary error.
if (hangInRefreshFilteringMetadataUntilSuccessThenThrow.shouldFail()) {
+ LOG(0) << "xxx failpoint is on";
hangInRefreshFilteringMetadataUntilSuccessThenThrow.pauseWhileSet(newOpCtx);
uasserted(ErrorCodes::InternalError,
"simulate an error response for forceShardFilteringMetadataRefresh");
}
+ LOG(0) << "xxx breaking out of loop";
break;
} catch (const DBException& ex) {
+ LOG(0) << "xxx caught exception: " << ex.toStatus();
+
// If the server is already doing a clean shutdown, join the shutdown.
if (globalInShutdownDeprecated()) {
shutdown(waitForShutdown());
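
Note (illustrative sketch, not part of the patch): the hunk above instruments the retry loop in refreshFilteringMetadataUntilSuccess, which keeps retrying the filtering-metadata refresh on a fresh operation context and only breaks out of the loop once an attempt finishes without throwing; the failpoint deliberately hangs and then throws so that the catch path gets exercised. The standalone C++ toy below models only that retry-until-success control flow; attemptRefresh and attemptsBeforeSuccess are made-up stand-ins, not MongoDB APIs.

#include <iostream>
#include <stdexcept>

// Toy stand-in for forceShardFilteringMetadataRefresh plus the failpoint-induced throw:
// pretend the first two attempts fail.
static int attemptsBeforeSuccess = 2;

void attemptRefresh(int attempt) {
    if (attempt < attemptsBeforeSuccess) {
        throw std::runtime_error("simulated refresh failure");
    }
}

int main() {
    // Keep retrying until one attempt gets through without throwing, mirroring the
    // loop/try/catch/break structure in refreshFilteringMetadataUntilSuccess.
    for (int attempt = 0;; ++attempt) {
        try {
            attemptRefresh(attempt);
            std::cout << "refresh succeeded on attempt " << attempt << "\n";
            break;  // corresponds to the 'break;' reached once the refresh succeeds
        } catch (const std::exception& ex) {
            // The real catch block additionally joins a clean shutdown if one is in progress.
            std::cout << "caught: " << ex.what() << ", retrying\n";
        }
    }
    return 0;
}
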
diff --git a/src/mongo/db/s/shard_server_catalog_cache_loader.cpp b/src/mongo/db/s/shard_server_catalog_cache_loader.cpp
index 682954e7b4..335c18c301 100644
--- a/src/mongo/db/s/shard_server_catalog_cache_loader.cpp
+++ b/src/mongo/db/s/shard_server_catalog_cache_loader.cpp
@@ -379,8 +379,10 @@ void ShardServerCatalogCacheLoader::initializeReplicaSetRole(bool isPrimary) {
invariant(_role == ReplicaSetRole::None);

if (isPrimary) {
+ LOG(0) << "xxx initializing role to primary";
_role = ReplicaSetRole::Primary;
} else {
+ LOG(0) << "xxx initializing role to secondary";
_role = ReplicaSetRole::Secondary;
}
}
@@ -390,6 +392,7 @@ void ShardServerCatalogCacheLoader::onStepDown() {
invariant(_role != ReplicaSetRole::None);
_contexts.interrupt(ErrorCodes::PrimarySteppedDown);
++_term;
+ LOG(0) << "xxx setting role to secondary, new term is " << _term;
_role = ReplicaSetRole::Secondary;
}

@@ -397,6 +400,7 @@ void ShardServerCatalogCacheLoader::onStepUp() {
stdx::lock_guard<Latch> lg(_mutex);
invariant(_role != ReplicaSetRole::None);
++_term;
+ LOG(0) << "xxx setting role to primary, new term is " << _term;
_role = ReplicaSetRole::Primary;
}

@@ -432,6 +436,7 @@ std::shared_ptr<Notification<void>> ShardServerCatalogCacheLoader::getChunksSinc
long long term;
std::tie(isPrimary, term) = [&] {
stdx::lock_guard<Latch> lock(_mutex);
+ LOG(0) << "xxx in getChunksSince, _role is " << (_role == ReplicaSetRole::Primary ? "primary" : "secondary") << ", _term is " << _term;
return std::make_tuple(_role == ReplicaSetRole::Primary, _term);
}();

@@ -448,6 +453,7 @@ std::shared_ptr<Notification<void>> ShardServerCatalogCacheLoader::getChunksSinc
// began but before the OperationContext was added to the group. So we'll check
// that we're still in the same _term.
stdx::lock_guard<Latch> lock(_mutex);
+ LOG(0) << "xxx in task for getChunksSince, term is now " << _term;
uassert(ErrorCodes::InterruptedDueToReplStateChange,
"Unable to refresh routing table because replica set state changed or "
"the node is shutting down.",
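
Note (illustrative sketch, not part of the patch): the getChunksSince hunks above log the (_role, _term) snapshot taken under _mutex and the later re-check of _term inside the scheduled task, which gives up with InterruptedDueToReplStateChange if a step-up or step-down bumped the term in between. The standalone C++ toy below models only that snapshot-then-revalidate pattern; currentTerm, onStepUpOrDown and runTask are made-up names using plain std primitives, not the MongoDB implementation.

#include <iostream>
#include <mutex>
#include <stdexcept>

std::mutex stateMutex;
long long currentTerm = 1;  // bumped on every step-up/step-down, like _term

void onStepUpOrDown() {
    std::lock_guard<std::mutex> lk(stateMutex);
    ++currentTerm;  // invalidates any work scheduled under the old term
}

void runTask(long long termAtSchedulingTime) {
    std::lock_guard<std::mutex> lk(stateMutex);
    // Analogue of the uassert: if the term moved since the snapshot was taken,
    // abort rather than refresh under stale role/term assumptions.
    if (currentTerm != termAtSchedulingTime) {
        throw std::runtime_error("replica set state changed since task was scheduled");
    }
    std::cout << "refreshing under term " << currentTerm << "\n";
}

int main() {
    long long snapshot;
    {
        std::lock_guard<std::mutex> lk(stateMutex);
        snapshot = currentTerm;  // like capturing (_role, _term) in the lambda
    }
    onStepUpOrDown();  // a stepdown sneaks in before the task actually runs
    try {
        runTask(snapshot);
    } catch (const std::exception& ex) {
        std::cout << "task aborted: " << ex.what() << "\n";
    }
    return 0;
}
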