From 2680a93bd2ef5edd421f6b08b752fcd451c0ebdf Mon Sep 17 00:00:00 2001 From: jannaerin Date: Mon, 20 May 2019 11:52:25 -0400 Subject: [PATCH] SERVER-39141 The range deleter should retry on write conflict exceptions --- ...andom_moveChunk_broadcast_delete_transaction.js | 11 ++--------- ...andom_moveChunk_broadcast_update_transaction.js | 14 ++++---------- src/mongo/db/query/plan_executor_impl.cpp | 1 + src/mongo/db/s/collection_range_deleter.cpp | 13 +++++++++---- 4 files changed, 16 insertions(+), 23 deletions(-) diff --git a/jstests/concurrency/fsm_workloads/random_moveChunk_broadcast_delete_transaction.js b/jstests/concurrency/fsm_workloads/random_moveChunk_broadcast_delete_transaction.js index e4a80f533a..cacbafbc1e 100644 --- a/jstests/concurrency/fsm_workloads/random_moveChunk_broadcast_delete_transaction.js +++ b/jstests/concurrency/fsm_workloads/random_moveChunk_broadcast_delete_transaction.js @@ -30,21 +30,14 @@ var $config = extendWorkload($config, function($config, $super) { $config.data.numGroupsWithinThread = $config.data.partitionSize / 5; $config.data.nextGroupId = 0; - // A moveChunk may fail with a WriteConflict when clearing orphans on the destination shard if - // any of them are concurrently written to by a broadcast transaction operation. The error - // message and top-level code may be different depending on where the failure occurs. - // - // TODO SERVER-39141: Don't ignore WriteConflict error message once the range deleter retries on - // write conflict exceptions. - // // Additionally, because deletes don't have a shard filter stage, a migration may fail if a // broadcast delete is operating on orphans from a previous migration in the range being // migrated back in. The particular error code is replaced with a more generic one, so this is // identified by the failed migration's error message. 
$config.data.isMoveChunkErrorAcceptable = (err) => { return err.message && - (err.message.indexOf("WriteConflict") > -1 || - err.message.indexOf("CommandFailed") > -1 || + (err.message.indexOf("CommandFailed") > -1 || err.message.indexOf("Documents in target range may still be in use") > -1); }; diff --git a/jstests/concurrency/fsm_workloads/random_moveChunk_broadcast_update_transaction.js b/jstests/concurrency/fsm_workloads/random_moveChunk_broadcast_update_transaction.js index c3e6fedba5..27fe1d4f35 100644 --- a/jstests/concurrency/fsm_workloads/random_moveChunk_broadcast_update_transaction.js +++ b/jstests/concurrency/fsm_workloads/random_moveChunk_broadcast_update_transaction.js @@ -24,21 +24,13 @@ var $config = extendWorkload($config, function($config, $super) { // applied. $config.data.expectedCounters = {}; - // A moveChunk may fail with a WriteConflict when clearing orphans on the destination shard if - // any of them are concurrently written to by a broadcast transaction operation. The error - // message and top-level code may be different depending on where the failure occurs. - // - // TODO SERVER-39141: Don't ignore WriteConflict error message once the range deleter retries on - // write conflict exceptions. - // - // Additionally, because updates don't have a shard filter stage, a migration may fail if a + // Because updates don't have a shard filter stage, a migration may fail if a // broadcast update is operating on orphans from a previous migration in the range being // migrated back in. The particular error code is replaced with a more generic one, so this is // identified by the failed migration's error message. 
$config.data.isMoveChunkErrorAcceptable = (err) => { return err.message && - (err.message.indexOf("WriteConflict") > -1 || - err.message.indexOf("CommandFailed") > -1 || + (err.message.indexOf("CommandFailed") > -1 || err.message.indexOf("Documents in target range may still be in use") > -1); }; diff --git a/src/mongo/db/s/collection_range_deleter.cpp b/src/mongo/db/s/collection_range_deleter.cpp index f8cb571a14..00c09663d1 100644 --- a/src/mongo/db/s/collection_range_deleter.cpp +++ b/src/mongo/db/s/collection_range_deleter.cpp @@ -396,9 +396,11 @@ StatusWith CollectionRangeDeleter::_doDeletion(OperationContext* opCtx, int numDeleted = 0; do { BSONObj deletedObj; - PlanExecutor::ExecState state = exec->getNext(&deletedObj, nullptr); - + // Retry the fetch under writeConflictRetry so a concurrent write to an orphaned + // document does not abort the whole range deletion batch (SERVER-39141). + PlanExecutor::ExecState state = writeConflictRetry( + opCtx, "delete range", nss.ns(), [&] { return exec->getNext(&deletedObj, nullptr); }); if (state == PlanExecutor::IS_EOF) { break; } @@ -412,7 +414,6 @@ StatusWith CollectionRangeDeleter::_doDeletion(OperationContext* opCtx, invariant(PlanExecutor::ADVANCED == state); ShardingStatistics::get(opCtx).countDocsDeletedOnDonor.addAndFetch(1); - 
} while (++numDeleted < maxToDelete); return numDeleted; -- 2.17.1