diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_kill_primary.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_kill_primary.yml
index 467d9301be..4770de3659 100644
--- a/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_kill_primary.yml
+++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_kill_primary.yml
@@ -158,10 +158,8 @@ selector:
   # ChunkHelper directly talks to the config servers and doesn't support retries for network errors
   - jstests/concurrency/fsm_workloads/cleanupOrphanedWhileMigrating.js
-  - jstests/concurrency/fsm_workloads/sharded_base_partitioned.js
   - jstests/concurrency/fsm_workloads/sharded_mergeChunks_partitioned.js
   - jstests/concurrency/fsm_workloads/sharded_moveChunk_drop_shard_key_index.js
-  - jstests/concurrency/fsm_workloads/sharded_moveChunk_partitioned.js
   - jstests/concurrency/fsm_workloads/sharded_splitChunk_partitioned.js

   # These workloads frequently time out waiting for the distributed lock to drop a sharded
@@ -195,13 +193,7 @@ selector:
   - jstests/concurrency/fsm_workloads/agg_merge_when_not_matched_insert.js
   - jstests/concurrency/fsm_workloads/agg_merge_when_matched_replace_with_new.js

-  # TODO SERVER-40713 moveChunk is not considered retryable by the network retry override.
-  - jstests/concurrency/fsm_workloads/agg_with_chunk_migrations.js
-  - jstests/concurrency/fsm_workloads/random_moveChunk_broadcast_delete_transaction.js
-  - jstests/concurrency/fsm_workloads/random_moveChunk_broadcast_update_transaction.js
-  - jstests/concurrency/fsm_workloads/random_moveChunk_refine_collection_shard_key.js
-  - jstests/concurrency/fsm_workloads/random_moveChunk_refine_collection_shard_key_broadcast_delete_transaction.js
-  - jstests/concurrency/fsm_workloads/random_moveChunk_refine_collection_shard_key_broadcast_update_transaction.js
+  # Uses the collMod command, which isn't considered to be retryable.
   - jstests/concurrency/fsm_workloads/random_moveChunk_index_operations.js

   # JS engine interruptions on mongos return ErrorCodes::Interrupted, which isn't
diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_terminate_primary.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_terminate_primary.yml
index e607314c23..33b0d0bfc9 100644
--- a/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_terminate_primary.yml
+++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_terminate_primary.yml
@@ -158,10 +158,8 @@ selector:
   # ChunkHelper directly talks to the config servers and doesn't support retries for network errors
   - jstests/concurrency/fsm_workloads/cleanupOrphanedWhileMigrating.js
-  - jstests/concurrency/fsm_workloads/sharded_base_partitioned.js
   - jstests/concurrency/fsm_workloads/sharded_mergeChunks_partitioned.js
   - jstests/concurrency/fsm_workloads/sharded_moveChunk_drop_shard_key_index.js
-  - jstests/concurrency/fsm_workloads/sharded_moveChunk_partitioned.js
   - jstests/concurrency/fsm_workloads/sharded_splitChunk_partitioned.js

   # These workloads frequently time out waiting for the distributed lock to drop a sharded
@@ -195,13 +193,7 @@ selector:
   - jstests/concurrency/fsm_workloads/agg_merge_when_not_matched_insert.js
   - jstests/concurrency/fsm_workloads/agg_merge_when_matched_replace_with_new.js

-  # TODO SERVER-40713 moveChunk is not considered retryable by the network retry override.
-  - jstests/concurrency/fsm_workloads/agg_with_chunk_migrations.js
-  - jstests/concurrency/fsm_workloads/random_moveChunk_broadcast_delete_transaction.js
-  - jstests/concurrency/fsm_workloads/random_moveChunk_broadcast_update_transaction.js
-  - jstests/concurrency/fsm_workloads/random_moveChunk_refine_collection_shard_key.js
-  - jstests/concurrency/fsm_workloads/random_moveChunk_refine_collection_shard_key_broadcast_delete_transaction.js
-  - jstests/concurrency/fsm_workloads/random_moveChunk_refine_collection_shard_key_broadcast_update_transaction.js
+  # Uses the collMod command, which isn't considered to be retryable.
   - jstests/concurrency/fsm_workloads/random_moveChunk_index_operations.js

   # JS engine interruptions on mongos return ErrorCodes::Interrupted, which isn't
diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_with_stepdowns.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_with_stepdowns.yml
index 40468ac7d7..56fa4b8def 100644
--- a/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_with_stepdowns.yml
+++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_with_stepdowns.yml
@@ -140,10 +140,8 @@ selector:
   ##
   # ChunkHelper directly talks to the config servers and doesn't support retries for network errors
-  - jstests/concurrency/fsm_workloads/sharded_base_partitioned.js
   - jstests/concurrency/fsm_workloads/sharded_mergeChunks_partitioned.js
   - jstests/concurrency/fsm_workloads/sharded_moveChunk_drop_shard_key_index.js
-  - jstests/concurrency/fsm_workloads/sharded_moveChunk_partitioned.js
   - jstests/concurrency/fsm_workloads/sharded_splitChunk_partitioned.js

   # These workloads frequently time out waiting for the distributed lock to drop a sharded
@@ -176,14 +174,7 @@ selector:
   - jstests/concurrency/fsm_workloads/agg_merge_when_not_matched_insert.js
   - jstests/concurrency/fsm_workloads/agg_merge_when_matched_replace_with_new.js

-
-  # TODO SERVER-40713 moveChunk is not considered retryable by the network retry override.
-  - jstests/concurrency/fsm_workloads/agg_with_chunk_migrations.js
-  - jstests/concurrency/fsm_workloads/random_moveChunk_broadcast_delete_transaction.js
-  - jstests/concurrency/fsm_workloads/random_moveChunk_broadcast_update_transaction.js
-  - jstests/concurrency/fsm_workloads/random_moveChunk_refine_collection_shard_key.js
-  - jstests/concurrency/fsm_workloads/random_moveChunk_refine_collection_shard_key_broadcast_delete_transaction.js
-  - jstests/concurrency/fsm_workloads/random_moveChunk_refine_collection_shard_key_broadcast_update_transaction.js
+  # Uses the collMod command, which isn't considered to be retryable.
   - jstests/concurrency/fsm_workloads/random_moveChunk_index_operations.js

   # JS engine interruptions on mongos return ErrorCodes::Interrupted, which isn't
diff --git a/jstests/concurrency/fsm_libs/fsm.js b/jstests/concurrency/fsm_libs/fsm.js
index 3aa7347742..eb9b4a8f88 100644
--- a/jstests/concurrency/fsm_libs/fsm.js
+++ b/jstests/concurrency/fsm_libs/fsm.js
@@ -1,6 +1,18 @@
 'use strict';

 var fsm = (function() {
+    const kIsRunningInsideTransaction = Symbol('isRunningInsideTransaction');
+
+    function forceRunningOutsideTransaction(data) {
+        if (data[kIsRunningInsideTransaction]) {
+            const err =
+                new Error('Intentionally thrown to stop state function from running inside of a' +
+                          ' multi-statement transaction');
+            err.isNotSupported = true;
+            throw err;
+        }
+    }
+
     // args.data = 'this' object of the state functions
     // args.db = database object
     // args.collName = collection name
@@ -9,6 +21,7 @@ var fsm = (function() {
     // args.startState = name of initial state function
     // args.states = state functions of the form
     //               { stateName: function(db, collName) { ... } }
+    // args.tid = the thread identifier
     // args.transitions = transitions between state functions of the form
     //                    { stateName: { nextState1: probability,
     //                                   nextState2: ... } }
@@ -40,14 +53,41 @@ var fsm = (function() {
                 return conn;
             };

-            connCache = {mongos: [], config: [], shards: {}};
+            const getReplSetName = (conn) => {
+                const res = assert.commandWorked(conn.getDB('admin').runCommand({isMaster: 1}));
+                assert.eq('string',
+                          typeof res.setName,
+                          () => `not connected to a replica set: ${tojson(res)}`);
+                return res.setName;
+            };
+
+            const makeReplSetConnWithExistingSession = (connStrList, replSetName) => {
+                const conn = makeNewConnWithExistingSession(`mongodb://${
+                    connStrList.join(',')}/?appName=tid:${args.tid}&replicaSet=${replSetName}`);
+
+                // We set _isConfigServer=true on the Mongo connection object so
+                // set_read_preference_secondary.js knows to avoid overriding the read preference as
+                // the concurrency suite may be running with a 1-node CSRS.
+                conn._isConfigServer = true;
+
+                return conn;
+            };
+
+            connCache =
+                {mongos: [], config: [], shards: {}, rsConns: {config: undefined, shards: {}}};
             connCache.mongos = args.cluster.mongos.map(makeNewConnWithExistingSession);
             connCache.config = args.cluster.config.map(makeNewConnWithExistingSession);
+            connCache.rsConns.config = makeReplSetConnWithExistingSession(
+                args.cluster.config, getReplSetName(connCache.config[0]));

             var shardNames = Object.keys(args.cluster.shards);

-            shardNames.forEach(name => (connCache.shards[name] = args.cluster.shards[name].map(
-                                            makeNewConnWithExistingSession)));
+            shardNames.forEach(name => {
+                connCache.shards[name] =
+                    args.cluster.shards[name].map(makeNewConnWithExistingSession);
+                connCache.rsConns.shards[name] = makeReplSetConnWithExistingSession(
+                    args.cluster.shards[name], getReplSetName(connCache.shards[name][0]));
+            });
         }

         for (var i = 0; i < args.iterations; ++i) {
@@ -63,8 +103,10 @@ var fsm = (function() {
                 let data;
                 withTxnAndAutoRetry(args.db.getSession(), () => {
                     data = TransactionsUtil.deepCopyObject({}, args.data);
+                    data[kIsRunningInsideTransaction] = true;
                     fn.call(data, args.db, args.collName, connCache);
                 });
+                delete data[kIsRunningInsideTransaction];
                 args.data = data;
             } catch (e) {
                 // Retry state functions that threw OperationNotSupportedInTransaction or
@@ -128,5 +170,9 @@ var fsm = (function() {
         assert(false, 'not reached');
     }

-    return {run: runFSM, _getWeightedRandomChoice: getWeightedRandomChoice};
+    return {
+        forceRunningOutsideTransaction,
+        run: runFSM,
+        _getWeightedRandomChoice: getWeightedRandomChoice,
+    };
 })();
diff --git a/jstests/concurrency/fsm_libs/worker_thread.js b/jstests/concurrency/fsm_libs/worker_thread.js
index 58eeed3e66..7e237e6257 100644
--- a/jstests/concurrency/fsm_libs/worker_thread.js
+++ b/jstests/concurrency/fsm_libs/worker_thread.js
@@ -205,6 +205,7 @@ var workerThread = (function() {
                 passConnectionCache: config.passConnectionCache,
                 startState: config.startState,
                 states: config.states,
+                tid: args.tid,
                 transitions: config.transitions
             };
         });
diff --git a/jstests/concurrency/fsm_workload_helpers/chunks.js b/jstests/concurrency/fsm_workload_helpers/chunks.js
index d4c78b3de5..041667fbe9 100644
--- a/jstests/concurrency/fsm_workload_helpers/chunks.js
+++ b/jstests/concurrency/fsm_workload_helpers/chunks.js
@@ -21,15 +21,10 @@ var ChunkHelper = (function() {
         return Math.min(curSleep, MAX_BACKOFF_SLEEP);
     }

-    function runCommandWithRetries(db, cmd, acceptableErrorCodes) {
+    function runCommandWithRetries(db, cmd, didAcceptableErrorOccurFn) {
         const INITIAL_BACKOFF_SLEEP = 500;  // milliseconds
         const MAX_RETRIES = 5;

-        var acceptableErrorOccurred = function acceptableErrorOccurred(errorCode,
-                                                                       acceptableErrorCodes) {
-            return acceptableErrorCodes.indexOf(errorCode) > -1;
-        };
-
         var res;
         var retries = 0;
         var backoffSleep = INITIAL_BACKOFF_SLEEP;
@@ -41,12 +36,15 @@ var ChunkHelper = (function() {
                 return res;
             }
             // Assert command worked or acceptable error occurred.
-            var msg = tojson({command: cmd, res: res});
-            assertWhenOwnColl(acceptableErrorOccurred(res.code, acceptableErrorCodes), msg);
+            if (didAcceptableErrorOccurFn(res)) {
+                // When an acceptable error occurs, sleep and then retry.
+                sleep(backoffSleep);
+                backoffSleep = getNextBackoffSleep(backoffSleep);
+                continue;
+            }

-            // When an acceptable error occurs, sleep and then retry.
-            sleep(backoffSleep);
-            backoffSleep = getNextBackoffSleep(backoffSleep);
+            // Throw an exception if the command errored for any other reason.
+            assertWhenOwnColl.commandWorked(res, cmd);
         }

         return res;
     }
@@ -54,14 +52,12 @@
     function splitChunkAtPoint(db, collName, splitPoint) {
         var cmd = {split: db[collName].getFullName(), middle: {_id: splitPoint}};
-        var acceptableErrorCodes = [ErrorCodes.LockBusy];
-        return runCommandWithRetries(db, cmd, acceptableErrorCodes);
+        return runCommandWithRetries(db, cmd, res => res.code === ErrorCodes.LockBusy);
     }

     function splitChunkWithBounds(db, collName, bounds) {
         var cmd = {split: db[collName].getFullName(), bounds: bounds};
-        var acceptableErrorCodes = [ErrorCodes.LockBusy];
-        return runCommandWithRetries(db, cmd, acceptableErrorCodes);
+        return runCommandWithRetries(db, cmd, res => res.code === ErrorCodes.LockBusy);
     }
@@ -71,15 +67,24 @@
     function moveChunk(db, collName, bounds, toShard, waitForDelete) {
         var cmd = {
             moveChunk: db[collName].getFullName(),
             bounds: bounds,
             to: toShard,
             _waitForDelete: waitForDelete
         };
-        var acceptableErrorCodes =
-            [ErrorCodes.ConflictingOperationInProgress, ErrorCodes.ChunkRangeCleanupPending];
-        return runCommandWithRetries(db, cmd, acceptableErrorCodes);
+
+        const runningWithStepdowns =
+            TestData.runningWithConfigStepdowns || TestData.runningWithShardStepdowns;
+
+        return runCommandWithRetries(
+            db,
+            cmd,
+            res => (res.code === ErrorCodes.ConflictingOperationInProgress ||
+                    res.code === ErrorCodes.ChunkRangeCleanupPending ||
+                    // The chunk migration has surely been aborted if the startCommit of the
+                    // procedure was interrupted by a stepdown.
+                    (runningWithStepdowns && res.code === ErrorCodes.CommandFailed &&
+                     res.errmsg.includes("startCommit"))));
     }

     function mergeChunks(db, collName, bounds) {
         var cmd = {mergeChunks: db[collName].getFullName(), bounds: bounds};
-        var acceptableErrorCodes = [ErrorCodes.LockBusy];
-        return runCommandWithRetries(db, cmd, acceptableErrorCodes);
+        return runCommandWithRetries(db, cmd, res => res.code === ErrorCodes.LockBusy);
     }

     // Take a set of connections to a shard (replica set or standalone mongod),
@@ -141,11 +146,19 @@ var ChunkHelper = (function() {
         return {shards: shards, explain: res, query: query, shardVersion: shardVersion};
     }

+    function itcount(collection, query) {
+        // We project out all of the fields in order to greatly reduce the likelihood a cursor would
+        // actually be returned. This is acceptable because we're only interested in how many
+        // documents there were and not any of their contents. The network_error_and_txn_override.js
+        // override would throw an exception if we attempted to use the getMore command.
+        return collection.find(query, {_id: 0, nonExistingField: 1}).itcount();
+    }
+
     // Return the number of docs in [lower, upper) as seen by conn.
     function getNumDocs(conn, collName, lower, upper) {
         var coll = conn.getCollection(collName);
         var query = {$and: [{_id: {$gte: lower}}, {_id: {$lt: upper}}]};
-        return coll.find(query).itcount();
+        return itcount(coll, query);
     }

     // Intended for use on config or mongos connections only.
@@ -157,7 +170,7 @@ var ChunkHelper = (function() {
         assert(isString(ns) && ns.indexOf('.') !== -1 && !ns.startsWith('.') && !ns.endsWith('.'),
                ns + ' is not a valid namespace');
         var query = {'ns': ns, 'min._id': {$gte: lower}, 'max._id': {$lte: upper}};
-        return conn.getDB('config').chunks.find(query).itcount();
+        return itcount(conn.getDB('config').chunks, query);
     }

     // Intended for use on config or mongos connections only.
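The contract change in ChunkHelper.runCommandWithRetries is the crux of the chunks.js diff above: callers now pass a predicate over the entire command response instead of a flat list of acceptable error codes, which is what lets moveChunk retry on an errmsg match and not just on res.code. Below is a minimal standalone sketch of that retry shape, assuming only the mongo shell's sleep() and tojson() built-ins; the function name and backoff cap are hypothetical, not part of the patch.

// Sketch only: models the predicate-based retry contract introduced above.
function runWithResponsePredicate(db, cmd, didAcceptableErrorOccurFn) {
    const INITIAL_BACKOFF_SLEEP = 500;  // milliseconds, mirroring the helper above
    const MAX_RETRIES = 5;

    let backoffSleep = INITIAL_BACKOFF_SLEEP;
    let res;
    for (let retries = 0; retries <= MAX_RETRIES; ++retries) {
        res = db.adminCommand(cmd);
        if (res.ok === 1) {
            return res;
        }
        if (didAcceptableErrorOccurFn(res)) {
            // Transient, workload-specific error: back off and retry.
            sleep(backoffSleep);
            backoffSleep = Math.min(backoffSleep * 2, 5000);  // hypothetical cap
            continue;
        }
        // Any other failure surfaces to the caller immediately.
        throw new Error('command failed: ' + tojson({command: cmd, res: res}));
    }
    return res;
}

// Predicates can now inspect the error message as well as the code, e.g.:
// runWithResponsePredicate(db, cmd, res => res.code === ErrorCodes.LockBusy);
// runWithResponsePredicate(db, cmd, res => res.code === ErrorCodes.CommandFailed &&
//                                          res.errmsg.includes("startCommit"));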
diff --git a/jstests/concurrency/fsm_workloads/random_moveChunk_base.js b/jstests/concurrency/fsm_workloads/random_moveChunk_base.js
index 6e74c0e545..bcc5d8b16f 100644
--- a/jstests/concurrency/fsm_workloads/random_moveChunk_base.js
+++ b/jstests/concurrency/fsm_workloads/random_moveChunk_base.js
@@ -70,9 +70,16 @@ var $config = extendWorkload($config, function($config, $super) {
      * acceptable errors, e.g. ConflictingOperationInProgress, and is not guaranteed to succeed.
      */
    $config.states.moveChunk = function moveChunk(db, collName, connCache) {
+        // This state function would eventually be run outside of a multi-statement transaction
+        // anyway, because the moveChunk command cannot be run inside one. Bailing out immediately
+        // avoids a hang in which one worker thread holds an idle transaction on the config server
+        // primary while repeatedly trying to run a moveChunk command, and another worker thread
+        // is blocked committing a moveChunk on those same shards because it cannot acquire the
+        // global X lock.
+        fsm.forceRunningOutsideTransaction(this);
+
         // Choose a random chunk in our partition to move.
-        const chunk =
-            this.getRandomChunkInPartition(collName, ChunkHelper.getPrimary(connCache.config));
+        const chunk = this.getRandomChunkInPartition(collName, connCache.rsConns.config);
         const fromShard = chunk.shard;

         // Choose a random shard to move the chunk to.
diff --git a/jstests/concurrency/fsm_workloads/sharded_base_partitioned.js b/jstests/concurrency/fsm_workloads/sharded_base_partitioned.js
index 2f1f3f4f10..92184d58df 100644
--- a/jstests/concurrency/fsm_workloads/sharded_base_partitioned.js
+++ b/jstests/concurrency/fsm_workloads/sharded_base_partitioned.js
@@ -139,7 +139,7 @@ var $config = (function() {
         Object.freeze(this.partition);

         // Verify that there is exactly 1 chunk in our partition.
-        var config = ChunkHelper.getPrimary(connCache.config);
+        var config = connCache.rsConns.config;
         var numChunks = ChunkHelper.getNumChunks(
             config, ns, this.partition.chunkLower, this.partition.chunkUpper);
         var chunks = ChunkHelper.getChunks(config, ns, MinKey, MaxKey);
diff --git a/jstests/concurrency/fsm_workloads/sharded_moveChunk_partitioned.js b/jstests/concurrency/fsm_workloads/sharded_moveChunk_partitioned.js
index 33dc02c405..54b7eec11b 100644
--- a/jstests/concurrency/fsm_workloads/sharded_moveChunk_partitioned.js
+++ b/jstests/concurrency/fsm_workloads/sharded_moveChunk_partitioned.js
@@ -22,9 +22,16 @@ var $config = extendWorkload($config, function($config, $super) {
     // verify that each node in the cluster affected by the moveChunk operation sees
     // the appropriate after-state regardless of whether the operation succeeded or failed.
     $config.states.moveChunk = function moveChunk(db, collName, connCache) {
-        var dbName = db.getName();
+        // This state function would eventually be run outside of a multi-statement transaction
+        // anyway, because the moveChunk command cannot be run inside one. Bailing out immediately
+        // avoids a hang in which one worker thread holds an idle transaction on the config server
+        // primary while repeatedly trying to run a moveChunk command, and another worker thread
+        // is blocked committing a moveChunk on those same shards because it cannot acquire the
+        // global X lock.
+        fsm.forceRunningOutsideTransaction(this);
+
         var ns = db[collName].getFullName();
-        var config = ChunkHelper.getPrimary(connCache.config);
+        var config = connCache.rsConns.config;

         // Verify that more than one shard exists in the cluster. If only one shard existed,
         // there would be no way to move a chunk from one shard to another.
@@ -73,8 +80,8 @@ var $config = extendWorkload($config, function($config, $super) {

         // Verify that the fromShard and toShard have the correct after-state
         // (see comments below for specifics).
-        var fromShardPrimary = ChunkHelper.getPrimary(connCache.shards[fromShard]);
-        var toShardPrimary = ChunkHelper.getPrimary(connCache.shards[toShard]);
+        var fromShardPrimary = connCache.rsConns.shards[fromShard];
+        var toShardPrimary = connCache.rsConns.shards[toShard];
         var fromShardNumDocsAfter =
             ChunkHelper.getNumDocs(fromShardPrimary, ns, chunk.min._id, chunk.max._id);
         var toShardNumDocsAfter =
@@ -102,35 +109,27 @@ var $config = extendWorkload($config, function($config, $super) {
         }

         // Verify that all config servers have the correct after-state.
-        // (see comments below for specifics).
-        for (var conn of connCache.config) {
-            var res = conn.adminCommand({isMaster: 1});
-            assertAlways.commandWorked(res);
-            if (res.ismaster) {
-                // If the moveChunk operation succeeded, verify that the config updated the chunk's
-                // shard with the toShard. If the operation failed, verify that the config kept
-                // the chunk's shard as the fromShard.
-                var chunkAfter = conn.getDB('config').chunks.findOne({_id: chunk._id});
-                var msg = msgBase + '\nchunkBefore: ' + tojson(chunk) +
-                    '\nchunkAfter: ' + tojson(chunkAfter);
-                if (moveChunkRes.ok) {
-                    msg = "moveChunk succeeded but chunk's shard was not new shard.\n" + msg;
-                    assertWhenOwnColl.eq(chunkAfter.shard, toShard, msg);
-                } else {
-                    msg = "moveChunk failed but chunk's shard was not original shard.\n" + msg;
-                    assertWhenOwnColl.eq(chunkAfter.shard, fromShard, msg);
-                }
-
-                // Regardless of whether the operation succeeded or failed,
-                // verify that the number of chunks in our partition stayed the same.
-                var numChunksAfter = ChunkHelper.getNumChunks(
-                    conn, ns, this.partition.chunkLower, this.partition.chunkUpper);
-                msg = 'Number of chunks in partition seen by config changed with moveChunk.\n' +
-                    msgBase;
-                assertWhenOwnColl.eq(numChunksBefore, numChunksAfter, msg);
-            }
+        // If the moveChunk operation succeeded, verify that the config updated the chunk's shard
+        // with the toShard. If the operation failed, verify that the config kept the chunk's shard
+        // as the fromShard.
+        var chunkAfter = config.getDB('config').chunks.findOne({_id: chunk._id});
+        var msg =
+            msgBase + '\nchunkBefore: ' + tojson(chunk) + '\nchunkAfter: ' + tojson(chunkAfter);
+        if (moveChunkRes.ok) {
+            msg = "moveChunk succeeded but chunk's shard was not new shard.\n" + msg;
+            assertWhenOwnColl.eq(chunkAfter.shard, toShard, msg);
+        } else {
+            msg = "moveChunk failed but chunk's shard was not original shard.\n" + msg;
+            assertWhenOwnColl.eq(chunkAfter.shard, fromShard, msg);
         }

+        // Regardless of whether the operation succeeded or failed, verify that the number of chunks
+        // in our partition stayed the same.
+        var numChunksAfter = ChunkHelper.getNumChunks(
+            config, ns, this.partition.chunkLower, this.partition.chunkUpper);
+        msg = 'Number of chunks in partition seen by config changed with moveChunk.\n' + msgBase;
+        assertWhenOwnColl.eq(numChunksBefore, numChunksAfter, msg);
+
         // Verify that all mongos processes see the correct after-state on the shards and configs.
         // (see comments below for specifics).
         for (var mongos of connCache.mongos) {
diff --git a/jstests/libs/override_methods/network_error_and_txn_override.js b/jstests/libs/override_methods/network_error_and_txn_override.js
index f365e15073..d1e6442bbc 100644
--- a/jstests/libs/override_methods/network_error_and_txn_override.js
+++ b/jstests/libs/override_methods/network_error_and_txn_override.js
@@ -107,7 +107,6 @@ const kNonRetryableCommands = new Set([
     "grantRolesToRole",
     "grantRolesToUser",
     "mapreduce.shardedfinish",
-    "moveChunk",
     "renameCollection",
     "revokePrivilegesFromRole",
     "revokeRolesFromRole",
@@ -126,6 +125,7 @@ const kAcceptableNonRetryableCommands = new Set([
     "drop",
     "dropDatabase",  // Already ignores NamespaceNotFound errors, so not handled below.
     "dropIndexes",
+    "moveChunk",
 ]);

 // Returns if the given failed response is a safe response to ignore when retrying the
@@ -206,6 +206,19 @@ function isRetryableShardCollectionResponse(res) {
         res.code === ErrorCodes.CallbackCanceled;
 }

+// Returns true if the given response could have come from moveChunk being interrupted by a
+// failover.
+function isRetryableMoveChunkResponse(res) {
+    return res.code === ErrorCodes.OperationFailed &&
+        (RetryableWritesUtil.errmsgContainsRetryableCodeName(res.errmsg) ||
+         // The transaction number is bumped by the migration coordinator when its commit or abort
+         // decision is being made durable.
+         res.errmsg.includes("TransactionTooOld") ||
+         // The range deletion task may have been interrupted. This error can occur even when
+         // _waitForDelete=false.
+         res.errmsg.includes("operation was interrupted"));
+}
+
 function hasError(res) {
     return res.ok !== 1 || res.writeErrors;
 }
@@ -833,6 +846,12 @@ function shouldRetryWithNetworkErrorOverride(
         return kContinue;
     }

+    // Check for the retryable error codes from an interrupted moveChunk.
+    if (cmdName === "moveChunk" && isRetryableMoveChunkResponse(res)) {
+        logError("Retrying interrupted moveChunk");
+        return kContinue;
+    }
+
     // In a sharded cluster, drop may bury the original error code in the error message if
     // interrupted.
     if (cmdName === "drop" && RetryableWritesUtil.errmsgContainsRetryableCodeName(res.errmsg)) {
diff --git a/jstests/libs/override_methods/set_read_and_write_concerns.js b/jstests/libs/override_methods/set_read_and_write_concerns.js
index 8508f48b42..9a6ba7ab66 100644
--- a/jstests/libs/override_methods/set_read_and_write_concerns.js
+++ b/jstests/libs/override_methods/set_read_and_write_concerns.js
@@ -152,6 +152,11 @@ function runCommandWithReadAndWriteConcerns(
             throw new Error("Cowardly refusing to override write concern of command: " +
                             tojson(commandObj));
         }
+    } else if (commandName === "moveChunk") {
+        // _secondaryThrottle=true must be specified to use the moveChunk command with a
+        // write concern. Since the caller didn't specify a write concern for the command,
+        // we must also fill in _secondaryThrottle=true ourselves.
+        commandObjUnwrapped._secondaryThrottle = true;
     }

     // We create a copy of the writeConcern object to avoid mutating the parameter the
diff --git a/jstests/libs/override_methods/set_read_preference_secondary.js b/jstests/libs/override_methods/set_read_preference_secondary.js
index 788a53f5f1..a479dac391 100644
--- a/jstests/libs/override_methods/set_read_preference_secondary.js
+++ b/jstests/libs/override_methods/set_read_preference_secondary.js
@@ -114,7 +114,7 @@ function runCommandWithReadPreferenceSecondary(
                !OverrideHelpers.isMapReduceWithInlineOutput(commandName, commandObjUnwrapped)) {
         // A map-reduce operation with non-inline output must be sent to the primary.
         shouldForceReadPreference = false;
-    } else if (conn.isMongos() && kDatabasesOnConfigServers.has(dbName)) {
+    } else if ((conn.isMongos() && kDatabasesOnConfigServers.has(dbName)) || conn._isConfigServer) {
         // Avoid overriding the read preference for config server since there may only be one
         // of them.
         shouldForceReadPreference = false;
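Taken together, the fsm.js pieces of this patch form a small opt-out protocol: the runner tags the state's data object with a private Symbol before invoking the state inside withTxnAndAutoRetry, and a state function that cannot run transactionally (such as moveChunk) throws an error marked isNotSupported, which the runner's catch path treats as a cue to rerun the state outside the transaction. A simplified model of that round trip follows; runStateSketch and its bare try/catch are hypothetical stand-ins for the real withTxnAndAutoRetry machinery and data deep-copying.

// Sketch only: models the transaction opt-out protocol added in fsm.js above.
const kIsRunningInsideTransaction = Symbol('isRunningInsideTransaction');

function forceRunningOutsideTransaction(data) {
    if (data[kIsRunningInsideTransaction]) {
        const err = new Error('Intentionally thrown to stop state function from running inside' +
                              ' of a multi-statement transaction');
        err.isNotSupported = true;
        throw err;
    }
}

// Hypothetical, simplified stand-in for the fsm.js run loop.
function runStateSketch(stateFn, data, db, collName) {
    try {
        // First attempt: inside a transaction (the real code wraps this in
        // withTxnAndAutoRetry and operates on a deep copy of the data object).
        data[kIsRunningInsideTransaction] = true;
        stateFn.call(data, db, collName);
        delete data[kIsRunningInsideTransaction];
    } catch (e) {
        if (!e.isNotSupported) {
            throw e;  // real failures still abort the workload
        }
        // Second attempt: the state opted out, so rerun it outside a transaction.
        delete data[kIsRunningInsideTransaction];
        stateFn.call(data, db, collName);
    }
}

// A workload state that must not run transactionally simply bails out up front:
const states = {
    moveChunk: function(db, collName) {
        forceRunningOutsideTransaction(this);
        // ... issue the moveChunk command here ...
    },
};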