From de8e8b1198cfb1e52144a355bbe28e9db36dd640 Mon Sep 17 00:00:00 2001
From: Jordi Serra Torrens
Date: Wed, 15 Sep 2021 15:11:41 +0000
Subject: [PATCH] SERVER-59965 repro

---
 jstests/sharding/server-59965-repro.js         | 97 +++++++++++++++++++
 .../db/s/rename_collection_coordinator.cpp     |  5 +
 .../rename_collection_participant_service.cpp  |  8 ++
 3 files changed, 110 insertions(+)
 create mode 100644 jstests/sharding/server-59965-repro.js

diff --git a/jstests/sharding/server-59965-repro.js b/jstests/sharding/server-59965-repro.js
new file mode 100644
index 0000000000..f557b3c48d
--- /dev/null
+++ b/jstests/sharding/server-59965-repro.js
@@ -0,0 +1,97 @@
+(function() {
+'use strict';
+
+load("jstests/libs/fail_point_util.js");
+load('jstests/libs/parallel_shell_helpers.js');
+
+const dbName = 'test';
+const srcCollName = 'foo';
+const dstCollName = 'bar';
+const srcNs = dbName + '.' + srcCollName;
+const dstNs = dbName + '.' + dstCollName;
+
+let st = new ShardingTest({shards: 2});
+
+// Create a collection with two chunks, one on each shard.
+assert.commandWorked(st.s.adminCommand({enableSharding: dbName}));
+st.ensurePrimaryShard(dbName, st.shard0.shardName);
+
+assert.commandWorked(st.s.adminCommand({shardCollection: srcNs, key: {_id: 1}}));
+
+assert.commandWorked(st.s.adminCommand({split: srcNs, middle: {_id: 0}}));
+assert.commandWorked(st.s.adminCommand(
+    {moveChunk: srcNs, find: {_id: 0}, to: st.shard1.shardName, _waitForDelete: true}));
+
+// Pause the RenameCollectionCoordinator after it has called _configsvrAllowMigrations. We need to
+// do this because _configsvrAllowMigrations bumps the minor collection version, and we need the
+// router to have the latest collection version when it later runs the txn on shard0. If not, then
+// the txn statement on shard0 would exit here (https://github.com/mongodb/mongo/blob/a1028f6738bcf0059f8e243b154208a8c2a8b499/src/mongo/db/service_entry_point_common.cpp#L1600-L1602),
+// instead of attempting to refresh the collection (which blocks behind the critSect, which is important for this repro).
+let pauseRenameCoordinatorAfterFreezeMigrationsOnCoordinator =
+    configureFailPoint(st.shard0, "pauseRenameCoordinatorAfterFreezeMigrations");
+
+// On shard0, set the pauseRenameParticipantAfterAcquiringCritSect failpoint so that the
+// RenameCollectionParticipant blocks AFTER acquiring the critical sections.
+let pauseRenameParticipantAfterAcquiringCritSectShard0 =
+    configureFailPoint(st.shard0, "pauseRenameParticipantAfterAcquiringCritSect");
+
+// On shard1, set the pauseRenameParticipantBeforeAcquiringCritSect failpoint so that the
+// RenameCollectionParticipant blocks BEFORE acquiring the critical sections.
+let pauseRenameParticipantBeforeAcquiringCritSectShard1 =
+    configureFailPoint(st.shard1, "pauseRenameParticipantBeforeAcquiringCritSect");
+
+// Run the rename in a parallel shell.
+const awaitRename = startParallelShell(
+    funWithArgs(function(dbName, srcCollName, dstCollName) {
+        assert.commandWorked(
+            db.getSiblingDB(dbName).getCollection(srcCollName).renameCollection(dstCollName));
+    }, dbName, srcCollName, dstCollName), st.s.port);
+
+pauseRenameCoordinatorAfterFreezeMigrationsOnCoordinator.wait();
+// Make the router fetch the latest metadata from the configsvr on the next query.
+assert.commandWorked(st.s.adminCommand({flushRouterConfig: 1}));
+pauseRenameCoordinatorAfterFreezeMigrationsOnCoordinator.off();
+
+// Wait for pauseRenameParticipantAfterAcquiringCritSect to be hit on shard0, then unset it.
+pauseRenameParticipantAfterAcquiringCritSectShard0.wait();
+jsTest.log("Hit pauseRenameParticipantAfterAcquiringCritSect on shard0");
+pauseRenameParticipantAfterAcquiringCritSectShard0.off();
+
+// Wait for pauseRenameParticipantBeforeAcquiringCritSect to be hit on shard1, but don't unset it yet.
+pauseRenameParticipantBeforeAcquiringCritSectShard1.wait();
+jsTest.log("Hit pauseRenameParticipantBeforeAcquiringCritSect on shard1");
+
+// Launch a multi-shard txn.
+const awaitBroadcastTxn = startParallelShell(
+    funWithArgs(function(dbName, collName) {
+        const session = db.getMongo().startSession();
+        const sessionCollection = session.getDatabase(dbName).getCollection(collName);
+
+        session.startTransaction();
+        assert.commandWorked(sessionCollection.update({x: 0}, {$inc: {y: 100}}, {multi: true}));
+        assert.commandWorked(session.commitTransaction_forTesting());
+    }, dbName, srcCollName), st.s.port);
+
+// Wait until the txn statements have been sent to the participant shards.
+sleep(5 * 1000);  // TODO: Find a more formal way to synchronize this.
+
+// Let the RenameCollectionParticipant on shard1 continue.
+pauseRenameParticipantBeforeAcquiringCritSectShard1.off();
+
+// The system is now deadlocked:
+// - On shard0, which had entered the critical section, the transaction statement threw StaleConfig,
+//   which on the way out of the command attempted to refresh the shard version and is now blocked
+//   waiting for the critical section to be released.
+// - On shard1, which had not yet entered the critical section, the txn statement ran, which
+//   acquired the collection lock in MODE_IX and then stashed the locks. The
+//   RenameCollectionParticipant is unable to make progress because it's waiting to acquire the
+//   collection lock in MODE_S. The stashed locks will only be released when the txn statement
+//   manages to finish on shard0; but this can't happen until shard1's RenameCollectionParticipant
+//   makes progress to let the RenameCoordinator continue and eventually tell the participants to
+//   release their critical sections.
+
+awaitRename();
+awaitBroadcastTxn();
+
+st.stop();
+})();
diff --git a/src/mongo/db/s/rename_collection_coordinator.cpp b/src/mongo/db/s/rename_collection_coordinator.cpp
index 61a8365ee3..d53f3dd705 100644
--- a/src/mongo/db/s/rename_collection_coordinator.cpp
+++ b/src/mongo/db/s/rename_collection_coordinator.cpp
@@ -50,10 +50,13 @@
 #include "mongo/s/client/shard_registry.h"
 #include "mongo/s/grid.h"
 #include "mongo/s/request_types/sharded_ddl_commands_gen.h"
+#include "mongo/util/future_util.h"
 
 namespace mongo {
 namespace {
 
+MONGO_FAIL_POINT_DEFINE(pauseRenameCoordinatorAfterFreezeMigrations);
+
 boost::optional<CollectionType> getShardedCollection(OperationContext* opCtx,
                                                      const NamespaceString& nss) {
     try {
@@ -245,6 +248,8 @@ ExecutorFuture<void> RenameCollectionCoordinator::_runImpl(
             if (_doc.getTargetIsSharded()) {
                 sharding_ddl_util::stopMigrations(opCtx, toNss, _doc.getTargetUUID());
             }
+
+            pauseRenameCoordinatorAfterFreezeMigrations.pauseWhileSet();
         }))
         .then(_executePhase(
             Phase::kBlockCrudAndRename,
diff --git a/src/mongo/db/s/rename_collection_participant_service.cpp b/src/mongo/db/s/rename_collection_participant_service.cpp
index f93b7732ec..5317afc3fb 100644
--- a/src/mongo/db/s/rename_collection_participant_service.cpp
+++ b/src/mongo/db/s/rename_collection_participant_service.cpp
@@ -46,11 +46,15 @@
 #include "mongo/logv2/log.h"
 #include "mongo/s/catalog/sharding_catalog_client.h"
 #include "mongo/s/grid.h"
+#include "mongo/util/future_util.h"
 
 namespace mongo {
 namespace {
 
+MONGO_FAIL_POINT_DEFINE(pauseRenameParticipantBeforeAcquiringCritSect);
+MONGO_FAIL_POINT_DEFINE(pauseRenameParticipantAfterAcquiringCritSect);
+
 /*
  * Drop the collection locally and clear stale metadata from cache collections.
  */
@@ -241,6 +245,8 @@ SemiFuture<void> RenameParticipantInstance::run(
             auto opCtxHolder = cc().makeOperationContext();
             auto* opCtx = opCtxHolder.get();
 
+            pauseRenameParticipantBeforeAcquiringCritSect.pauseWhileSet();
+
             // Acquire source/target critical sections
             const auto reason =
                 BSON("command"
@@ -256,6 +262,8 @@
             service->promoteRecoverableCriticalSectionToBlockAlsoReads(
                 opCtx, toNss(), reason, ShardingCatalogClient::kLocalWriteConcern);
 
+            pauseRenameParticipantAfterAcquiringCritSect.pauseWhileSet();
+
             snapshotRangeDeletionsForRename(opCtx, fromNss(), toNss());
         }))
         .then(_executePhase(
-- 
2.17.1
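
A possible direction for the TODO next to sleep(5 * 1000) in the test above (not part of the patch):
instead of sleeping, poll $currentOp on each shard until it reports an operation or session that
belongs to an open multi-document transaction. The sketch below is an assumption-laden
illustration only: the helper name is made up, and whether the "transaction" sub-document is
reported for the blocked statement on shard0 at this exact point of the repro has not been
verified against the server code.

// Sketch: wait until the given shard reports at least one op/session with an open transaction.
function waitForOpenTransaction(shardConn) {
    assert.soon(() => {
        // $currentOp runs as a collectionless aggregate on the admin database;
        // idleSessions: true also surfaces transactions whose statement has already returned.
        return shardConn.getDB("admin")
                   .aggregate([
                       {$currentOp: {allUsers: true, idleSessions: true}},
                       {$match: {transaction: {$exists: true}}}
                   ])
                   .itcount() > 0;
    }, "timed out waiting for an open transaction on " + shardConn.host);
}

// Possible usage in place of the sleep (hypothetical, untested for this repro):
//   waitForOpenTransaction(st.shard0);
//   waitForOpenTransaction(st.shard1);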