From 5f91f9cf19fd24bdd20145086b3c0a1c8a0997c2 Mon Sep 17 00:00:00 2001
From: Jordi Serra Torrens
Date: Thu, 7 Oct 2021 14:45:35 +0000
Subject: [PATCH] SERVER-60521 repro

---
 jstests/sharding/server-60521-repro.js      | 67 +++++++++++++++++++++
 src/mongo/db/s/migration_source_manager.cpp |  3 +
 src/mongo/db/s/move_chunk_command.cpp       |  3 +
 3 files changed, 73 insertions(+)
 create mode 100644 jstests/sharding/server-60521-repro.js

diff --git a/jstests/sharding/server-60521-repro.js b/jstests/sharding/server-60521-repro.js
new file mode 100644
index 0000000000..c3841f2d49
--- /dev/null
+++ b/jstests/sharding/server-60521-repro.js
@@ -0,0 +1,67 @@
+(function() {
+"use strict";
+
+load('jstests/libs/chunk_manipulation_util.js');
+load("jstests/libs/fail_point_util.js");
+
+const dbName = "test";
+const collName = "foo";
+const ns = dbName + "." + collName;
+
+let st = new ShardingTest({shards: 2, rs: {nodes: 1}});
+const staticMongod = MongoRunner.runMongod({});
+
+assert.commandWorked(st.s.adminCommand({enableSharding: dbName}));
+st.ensurePrimaryShard(dbName, st.shard0.shardName);
+assert.commandWorked(st.s.adminCommand({shardCollection: ns, key: {_id: 1}}));
+
+const oldPrimary = st.rs0.getPrimary();
+
+// Launch a migration and make it wait anywhere after the recovery document has already been persisted,
+// so that an upcoming primary will need to recover it on step-up.
+const fp0 = configureFailPoint(oldPrimary, "moveChunkHangAtStep4");
+let joinMoveChunk1 = moveChunkParallel(staticMongod, st.s.host, {_id: 0}, null, ns, st.shard1.shardName, false);
+fp0.wait();
+
+// Launch another migration and make it wait before registering with the ActiveMigrationsRegistry.
+const fp1 = configureFailPoint(oldPrimary, "moveChunkWaitBeforeActiveMigrationsRegistry");
+let joinMoveChunk2 = moveChunkParallel(staticMongod, st.s.host, {_id: 0}, null, ns, st.shard1.shardName);
+fp1.wait();
+
+// Step down the primary.
+assert.commandWorked(oldPrimary.adminCommand({replSetStepDown: 5, force: true}));
+jsTest.log("Stepped down primary");
+
+// Wait for the first migration to fail due to the stepdown.
+fp0.off();
+joinMoveChunk1();
+
+// Let the second migration continue so it registers with the ActiveMigrationsRegistry.
+const fp2 = configureFailPoint(oldPrimary, "waitBeforeRefreshPreMigration");
+fp1.off();
+jsTest.log("Unset fp1");
+
+// Wait for the migration to have been registered with the ActiveMigrationsRegistry.
+fp2.wait();
+jsTest.log("Hit fp2");
+sleep(10 * 1000);  // Wait long enough for the node to attempt stepping up again.
+fp2.off();
+jsTest.log("Unset fp2");
+
+// The former primary should already have been elected again (after the 5 seconds set in the
+// replSetStepDown command above).
+jsTest.log("Waiting for new primary to get elected");
+st.rs0.awaitNodesAgreeOnPrimary();
+jsTest.log("New primary elected");
+
+jsTest.log("Waiting for new primary to become writable");
+assert.soon(() => {
+    return oldPrimary.adminCommand({"hello": 1}).isWritablePrimary;
+});
+jsTest.log("New primary is now writable");  // BUG: Never reached!
+
+joinMoveChunk2();
+
+st.stop();
+MongoRunner.stopMongod(staticMongod);
+})();
diff --git a/src/mongo/db/s/migration_source_manager.cpp b/src/mongo/db/s/migration_source_manager.cpp
index 2fa95e58d2..6fda67a87b 100644
--- a/src/mongo/db/s/migration_source_manager.cpp
+++ b/src/mongo/db/s/migration_source_manager.cpp
@@ -112,6 +112,7 @@ MONGO_FAIL_POINT_DEFINE(failMigrationCommit);
 MONGO_FAIL_POINT_DEFINE(hangBeforeLeavingCriticalSection);
 MONGO_FAIL_POINT_DEFINE(migrationCommitNetworkError);
 MONGO_FAIL_POINT_DEFINE(hangBeforePostMigrationCommitRefresh);
+MONGO_FAIL_POINT_DEFINE(waitBeforeRefreshPreMigration);
 
 MigrationSourceManager* MigrationSourceManager::get(CollectionShardingRuntime* csr,
                                                     CollectionShardingRuntime::CSRLock& csrLock) {
@@ -140,6 +141,8 @@ MigrationSourceManager::MigrationSourceManager(OperationContext* opCtx,
               "requestParameters"_attr = redact(_args.toString()),
               "collectionEpoch"_attr = _args.getVersionEpoch());
 
+    waitBeforeRefreshPreMigration.pauseWhileSet();
+
     // Make sure the latest shard version is recovered as of the time of the invocation of the
     // command.
     onShardVersionMismatch(_opCtx, getNss(), boost::none);
diff --git a/src/mongo/db/s/move_chunk_command.cpp b/src/mongo/db/s/move_chunk_command.cpp
index 8ee27e5636..314def9489 100644
--- a/src/mongo/db/s/move_chunk_command.cpp
+++ b/src/mongo/db/s/move_chunk_command.cpp
@@ -85,6 +85,7 @@ MONGO_FAIL_POINT_DEFINE(moveChunkHangAtStep3);
 MONGO_FAIL_POINT_DEFINE(moveChunkHangAtStep4);
 MONGO_FAIL_POINT_DEFINE(moveChunkHangAtStep5);
 MONGO_FAIL_POINT_DEFINE(moveChunkHangAtStep6);
+MONGO_FAIL_POINT_DEFINE(moveChunkWaitBeforeActiveMigrationsRegistry);
 
 class MoveChunkCommand : public BasicCommand {
 public:
@@ -134,6 +135,8 @@ public:
         // where we might have changed a shard's host by removing/adding a shard with the same name.
         Grid::get(opCtx)->shardRegistry()->reload(opCtx);
 
+        moveChunkWaitBeforeActiveMigrationsRegistry.pauseWhileSet();
+
         auto scopedMigration = uassertStatusOK(
             ActiveMigrationsRegistry::get(opCtx).registerDonateChunk(opCtx, moveChunkRequest));
 
-- 
2.17.1