From 88ea0457f68fcfb6a737ef85e002bfd31a796c8d Mon Sep 17 00:00:00 2001
From: Jordi Serra Torrens
Date: Tue, 7 May 2024 11:20:38 +0000
Subject: [PATCH] No repro when transitioning to dedicated config server

---
 ...n-transition-to-dedicated-config-server.js | 101 ++++++++++++++++++
 src/mongo/db/commands/txn_cmds.cpp            |   7 ++
 2 files changed, 108 insertions(+)
 create mode 100644 jstests/sharding/no-repro-on-transition-to-dedicated-config-server.js

diff --git a/jstests/sharding/no-repro-on-transition-to-dedicated-config-server.js b/jstests/sharding/no-repro-on-transition-to-dedicated-config-server.js
new file mode 100644
index 00000000000..d13aa662d7c
--- /dev/null
+++ b/jstests/sharding/no-repro-on-transition-to-dedicated-config-server.js
@@ -0,0 +1,101 @@
+import {configureFailPoint} from "jstests/libs/fail_point_util.js";
+import {moveOutSessionChunks, removeShard} from "jstests/sharding/libs/remove_shard_util.js";
+
+let st = ShardingTest({shards: 3, configShard: true});
+
+let dbName = "test";
+let collName = "foo";
+let coll = st.getDB(dbName)[collName];
+
+// Sanity check that shard0 is the "config shard".
+assert.eq("config", st.shard0.shardName);
+
+// Initial placement:
+// - shard0 (configShard): [-inf, 0)
+// - shard1: [0, +inf)
+// - shard2: nothing
+assert.commandWorked(
+    st.s.adminCommand({enableSharding: dbName, primaryShard: st.shard1.shardName}));
+assert.commandWorked(st.s.adminCommand({shardCollection: coll.getFullName(), key: {x: 1}}));
+assert.commandWorked(st.s.adminCommand({split: coll.getFullName(), middle: {x: 0}}));
+assert.commandWorked(
+    st.s.adminCommand({moveChunk: coll.getFullName(), find: {x: -1}, to: st.shard0.shardName}));
+assert.commandWorked(
+    st.s.adminCommand({moveChunk: coll.getFullName(), find: {x: 1}, to: st.shard1.shardName}));
+
+// Insert two documents, one on each chunk.
+assert.commandWorked(coll.insertMany([{x: -1, y: 0}, {x: 1, y: 0}]));
+
+// Set failpoint on shard1 so that it hangs when committing the transaction (after prepare). Also
+// set a failpoint so that commitTransaction will fail with HostUnreachable once we let it continue.
+let fpHangCommitTransactionOnShard1 =
+    configureFailPoint(st.rs1.getPrimary(), "hangBeforeCommitingTxn");
+let fpFailCommitTransactionOnShard1 = configureFailPoint(
+    st.rs1.getPrimary(), "transactionParticipantFailWithNetworkErrorBeforeCommitTransaction");
+
+// Run a transaction that targets shard0 and shard1. Make it so shard0 will be nominated as the
+// TransactionCoordinator. Start the 2PC commit but make it hang on shard1, after it has prepared.
+let awaitTxn = startParallelShell(() => {
+    // Start transaction. First make a write to shard0 so that it will be nominated as the
+    // TransactionCoordinator.
+    let session = db.getMongo().startSession();
+    session.startTransaction();
+    let sessionColl = session.getDatabase("test")["foo"];
+    assert.commandWorked(sessionColl.updateOne({x: -1}, {$set: {y: 1}}));
+
+    // Write to shard1 as well, so this transaction becomes a distributed transaction that will
+    // require 2PC.
+    assert.commandWorked(sessionColl.updateOne({x: 1}, {$set: {y: 1}}));
+
+    // Start the commit.
+    assert.commandWorked(session.commitTransaction_forTesting());
+}, st.s.port);
+
+jsTest.log("--DEBUG-- Waiting for fpHangCommitTransactionOnShard1 to be hit");
+fpHangCommitTransactionOnShard1.wait();
+jsTest.log("--DEBUG-- fpHangCommitTransactionOnShard1 hit");
+
+// Transition to dedicated config server.
+// To do so, first donate its chunk to shard2.
+assert.commandWorked(
+    st.s.adminCommand({moveChunk: coll.getFullName(), find: {x: -1}, to: st.shard2.shardName}));
+moveOutSessionChunks(st, st.shard0.shardName, st.shard2.shardName);
+removeShard(st.s, st.shard0.shardName);
+jsTest.log("--DEBUG-- Transitioned to dedicated config server");
+
+// (optional) Restart the config server replica set, just to test that it indeed recovers ongoing
+// coordinations.
+st.stopAllConfigServers(undefined, true /* forRestart */);
+st.restartAllConfigServers();
+jsTest.log("--DEBUG-- Restarted config server replica set");
+
+// Unblock the transaction participant. The ongoing commit should fail, since the network-error
+// failpoint stays on through the sleep. Then unset it so TransactionCoordinator retries succeed.
+fpHangCommitTransactionOnShard1.off();
+sleep(1000);
+fpFailCommitTransactionOnShard1.off();
+
+// See that the TransactionCoordinator will recover on the now-dedicated configsvr and will
+// eventually complete the 2PC on shard1.
+jsTest.log("--DEBUG-- Checking that transaction commit eventually completes");
+assert.soon(() => {
+    return st.s.getDB("admin")
+               .aggregate([
+                   {$currentOp: {}},
+                   {
+                       $match: {
+                           shard: st.shard1.shardName,
+                           "transaction.timePreparedMicros": {$exists: true}
+                       }
+                   }
+               ])
+               .itcount() === 0;
+});
+
+awaitTxn();
+jsTest.log("--DEBUG-- Transaction commit completed");
+
+// Now writes on the documents affected by the transaction can proceed.
+assert.commandWorked(coll.updateOne({x: 1}, {$set: {y: 2}}));
+
+st.stop();
diff --git a/src/mongo/db/commands/txn_cmds.cpp b/src/mongo/db/commands/txn_cmds.cpp
index eb60dad9b10..55eac83554b 100644
--- a/src/mongo/db/commands/txn_cmds.cpp
+++ b/src/mongo/db/commands/txn_cmds.cpp
@@ -73,6 +73,7 @@ namespace {
 
 MONGO_FAIL_POINT_DEFINE(participantReturnNetworkErrorForAbortAfterExecutingAbortLogic);
 MONGO_FAIL_POINT_DEFINE(participantReturnNetworkErrorForCommitAfterExecutingCommitLogic);
+MONGO_FAIL_POINT_DEFINE(transactionParticipantFailWithNetworkErrorBeforeCommitTransaction);
 MONGO_FAIL_POINT_DEFINE(hangBeforeCommitingTxn);
 MONGO_FAIL_POINT_DEFINE(hangBeforeAbortingTxn);
 // TODO SERVER-39704: Remove this fail point once the router can safely retry within a transaction
@@ -161,6 +162,12 @@ public:
             CurOpFailpointHelpers::waitWhileFailPointEnabled(
                 &hangBeforeCommitingTxn, opCtx, "hangBeforeCommitingTxn");
 
+            if (MONGO_unlikely(transactionParticipantFailWithNetworkErrorBeforeCommitTransaction
+                                   .shouldFail())) {
+                uasserted(ErrorCodes::HostUnreachable,
+                          "returning network error because failpoint is on");
+            }
+
             auto optionalCommitTimestamp = request().getCommitTimestamp();
             if (optionalCommitTimestamp) {
                 // commitPreparedTransaction will throw if the transaction is not prepared.
-- 
2.34.1