From 207ddf9bacdc007ef8e28050f36f363e76c15855 Mon Sep 17 00:00:00 2001 From: Jordi Serra Torrens Date: Thu, 22 Sep 2022 09:04:55 +0000 Subject: [PATCH] Repro SERVER-69890 --- jstests/sharding/repro-SERVER-69890.js | 63 +++++++++++++++++++++ src/mongo/db/s/move_primary_coordinator.cpp | 5 +- 2 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 jstests/sharding/repro-SERVER-69890.js diff --git a/jstests/sharding/repro-SERVER-69890.js b/jstests/sharding/repro-SERVER-69890.js new file mode 100644 index 00000000000..c4eb83d0f35 --- /dev/null +++ b/jstests/sharding/repro-SERVER-69890.js @@ -0,0 +1,63 @@ +(function() { +'use strict'; + +load('jstests/libs/fail_point_util.js'); // For configureFailPoint +load('jstests/libs/parallel_shell_helpers.js'); + +const dbName = 'test'; +const collName = 'foo'; +const ns = dbName + '.' + collName; + +const st = new ShardingTest({shards: 2}); + +const db = st.s.getDB(dbName); +const unshardedColl = db['foo']; + +// Create the database, with shard0 as db-primary. +assert.commandWorked( + st.s.adminCommand({enableSharding: dbName, primaryShard: st.shard0.shardName})); + +// Create an unsharded collection on the db. +assert.commandWorked(unshardedColl.insert({x: 0})); +assert.eq(1, unshardedColl.find().itcount()); + +let hangMovePrimaryBeforeCommit = configureFailPoint(st.shard0, 'hangMovePrimaryBeforeCommit'); + +// Start move primary +const awaitMovePrimary = startParallelShell( + funWithArgs(function(dbName, toShardName) { + jsTest.log("Starting movePrimary"); + assert.commandWorked(db.adminCommand({movePrimary: dbName, to: toShardName})); + }, dbName, st.shard1.shardName), st.s.port); + +// Wait for movePrimary to reach the commit step. Block before proceeding. +hangMovePrimaryBeforeCommit.wait(); +jsTest.log("MovePrimary completed cloning and is now ready to commit"); + +// Remove the destination shard. 
+assert.soon(function() {
+    // Inspect the raw reply: wrapping this call in assert.commandWorked() would throw on
+    // ShardNotFound and make the tolerance branch below unreachable.
+    const res = st.s.adminCommand({removeShard: st.shard1.shardName});
+    if (!res.ok && res.code === ErrorCodes.ShardNotFound) {
+        // If the config server primary steps down right after removing the config.shards doc,
+        // mongos retries _configsvrRemoveShard against the new primary, which may no longer find
+        // the shard in its ShardRegistry. ShardNotFound is thus treated as a finished removal.
+        return true;
+    }
+    // Any other failure is unexpected; otherwise poll until the reply reports 'completed'.
+    return assert.commandWorked(res).state === 'completed';
+});
+jsTest.log("Removed shard");
+
+// Let the movePrimary coordinator proceed and wait for it to finish.
+hangMovePrimaryBeforeCommit.off();
+awaitMovePrimary();
+jsTest.log("MovePrimary finished");
+
+// Try to read from the collection
+assert.eq(1, st.s.getDB(dbName)[collName].find().itcount());  // <== Will fail here!
+jsTest.log("Managed to read from collection"); + +st.stop(); +})(); diff --git a/src/mongo/db/s/move_primary_coordinator.cpp b/src/mongo/db/s/move_primary_coordinator.cpp index a19a5ebb286..9575ca28c0a 100644 --- a/src/mongo/db/s/move_primary_coordinator.cpp +++ b/src/mongo/db/s/move_primary_coordinator.cpp @@ -40,12 +40,14 @@ #include "mongo/logv2/log.h" #include "mongo/s/client/shard_registry.h" #include "mongo/s/grid.h" +#include "mongo/util/fail_point.h" #define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kSharding - namespace mongo { +MONGO_FAIL_POINT_DEFINE(hangMovePrimaryBeforeCommit); + void MovePrimaryCoordinator::appendCommandInfo(BSONObjBuilder* cmdInfoBuilder) const { stdx::lock_guard lk{_docMutex}; cmdInfoBuilder->append("request", BSON(_doc.kToShardIdFieldName << _doc.getToShardId())); @@ -107,6 +109,7 @@ ExecutorFuture MovePrimaryCoordinator::_runImpl( MovePrimarySourceManager movePrimarySourceManager( opCtx, movePrimaryRequest, dbName, primaryId, toId); uassertStatusOK(movePrimarySourceManager.clone(opCtx)); + hangMovePrimaryBeforeCommit.pauseWhileSet(); uassertStatusOK(movePrimarySourceManager.enterCriticalSection(opCtx)); uassertStatusOK(movePrimarySourceManager.commitOnConfig(opCtx)); uassertStatusOK(movePrimarySourceManager.cleanStaleData(opCtx)); -- 2.17.1