Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-56638

Fix flushReshardingStateChanges critical section race

    XML | Word | Printable | JSON

Details

    • Fully Compatible
    • ALL
    • v5.0
    • Hide

      resmoke --suites=sharding jstests/sharding/resharding_fails_on_nonempty_stash.js
      

      diff --git a/jstests/sharding/resharding_fails_on_nonempty_stash.js b/jstests/sharding/resharding_fails_on_nonempty_stash.js
      index b627d3b186..029b3cee53 100644
      --- a/jstests/sharding/resharding_fails_on_nonempty_stash.js
      +++ b/jstests/sharding/resharding_fails_on_nonempty_stash.js
      @@ -34,6 +34,9 @@ const recipient1Conn = new Mongo(topology.shards[recipientShardNames[1]].primary
       const removeRecipientDocumentFailpoint =
           configureFailPoint(recipient1Conn, "removeRecipientDocFailpoint");
       
      +const recipient0Conn = new Mongo(topology.shards[recipientShardNames[0]].primary);
      +const pauseCriticalSectionFP = configureFailPoint(recipient0Conn, "reshardingPauseCriticalSection");
      +
       reshardingTest.withReshardingInBackground(
           {
               newShardKeyPattern: {newKey: 1},
      diff --git a/src/mongo/db/s/flush_resharding_state_change_command.cpp b/src/mongo/db/s/flush_resharding_state_change_command.cpp
      index 956387907e..8c88411db1 100644
      --- a/src/mongo/db/s/flush_resharding_state_change_command.cpp
      +++ b/src/mongo/db/s/flush_resharding_state_change_command.cpp
      @@ -51,6 +51,8 @@
       #include "mongo/s/request_types/flush_resharding_state_change_gen.h"
       
       namespace mongo {
      +
      +MONGO_FAIL_POINT_DEFINE(reshardingPauseCriticalSection);
       namespace {
       
       void refreshShardVersion(OperationContext* opCtx, const NamespaceString& nss) {
      @@ -94,12 +96,25 @@ void refreshShardVersion(OperationContext* opCtx, const NamespaceString& nss) {
       
                   collLock.reset();
                   dbLock.reset();
      -
                   inRecoverOrRefresh->get(opCtx);
               } else {
                   collLock.reset();
                   dbLock.reset();
      -
      +            if (MONGO_unlikely(reshardingPauseCriticalSection.shouldFail())) {
      +                sleepsecs(2);
      +            }
      +            {
      +                boost::optional<Lock::DBLock> dbLock2;
      +                dbLock2.emplace(opCtx, nss.db(), MODE_IS);
      +
      +                boost::optional<Lock::CollectionLock> collLock2;
      +                collLock2.emplace(opCtx, nss, MODE_IS);
      +
      +                const auto csr = CollectionShardingRuntime::get(opCtx, nss);
      +                auto critSec2 =
      +                    csr->getCriticalSectionSignal(opCtx, ShardingMigrationCriticalSection::kWrite);
      +                invariant(!critSec2);
      +            }
                   onShardVersionMismatch(opCtx, nss, boost::none);
               }
           }
      

      Show
      resmoke --suites=sharding jstests/sharding/resharding_fails_on_nonempty_stash.js diff --git a/jstests/sharding/resharding_fails_on_nonempty_stash.js b/jstests/sharding/resharding_fails_on_nonempty_stash.js index b627d3b186..029b3cee53 100644 --- a/jstests/sharding/resharding_fails_on_nonempty_stash.js +++ b/jstests/sharding/resharding_fails_on_nonempty_stash.js @@ -34,6 +34,9 @@ const recipient1Conn = new Mongo(topology.shards[recipientShardNames[1]].primary const removeRecipientDocumentFailpoint = configureFailPoint(recipient1Conn, "removeRecipientDocFailpoint");   +const recipient0Conn = new Mongo(topology.shards[recipientShardNames[0]].primary); +const pauseCriticalSectionFP = configureFailPoint(recipient0Conn, "reshardingPauseCriticalSection"); + reshardingTest.withReshardingInBackground( { newShardKeyPattern: {newKey: 1}, diff --git a/src/mongo/db/s/flush_resharding_state_change_command.cpp b/src/mongo/db/s/flush_resharding_state_change_command.cpp index 956387907e..8c88411db1 100644 --- a/src/mongo/db/s/flush_resharding_state_change_command.cpp +++ b/src/mongo/db/s/flush_resharding_state_change_command.cpp @@ -51,6 +51,8 @@ #include "mongo/s/request_types/flush_resharding_state_change_gen.h"   namespace mongo { + +MONGO_FAIL_POINT_DEFINE(reshardingPauseCriticalSection); namespace {   void refreshShardVersion(OperationContext* opCtx, const NamespaceString& nss) { @@ -94,12 +96,25 @@ void refreshShardVersion(OperationContext* opCtx, const NamespaceString& nss) {   collLock.reset(); dbLock.reset(); - inRecoverOrRefresh->get(opCtx); } else { collLock.reset(); dbLock.reset(); - + if (MONGO_unlikely(reshardingPauseCriticalSection.shouldFail())) { + sleepsecs(2); + } + { + boost::optional<Lock::DBLock> dbLock2; + dbLock2.emplace(opCtx, nss.db(), MODE_IS); + + boost::optional<Lock::CollectionLock> collLock2; + collLock2.emplace(opCtx, nss, MODE_IS); + + const auto csr = CollectionShardingRuntime::get(opCtx, nss); + auto critSec2 = + 
csr->getCriticalSectionSignal(opCtx, ShardingMigrationCriticalSection::kWrite); + invariant(!critSec2); + } onShardVersionMismatch(opCtx, nss, boost::none); } }
    • Sharding 2021-05-17, Sharding 2021-05-31
    • 2

    Description

      Summary: A resharding refresh can hang on a recipient if the recipient acquires the critical section in the window after flushReshardingStateChanges has acquired the locks, checked the critical section, and released the locks, but before it calls onShardVersionMismatch.

      Scenario of recipient0 causing a hang:

      • recipient1 errors, transitions from steady-state to error

        [js_test:resharding_fails_on_nonempty_stash] d20026| 2021-05-04T00:56:36.146+00:00 I  RESHARD  5279506 [ReshardingRecipientService-2] "Transitioned resharding recipient state","attr":{"newState":"error","oldState":"steady-state","namespace":"reshardingDb.coll","collectionUUID":{"uuid":{"$uuid":"23d3b803-bbd6-4766-beea-80d7d50e884a"}},"reshardingUUID":{"uuid":{"$uuid":"43ab79c0-6627-4c12-ac92-a4b315575c00"}}}
        

      • the coordinator transitions to error

        [js_test:resharding_fails_on_nonempty_stash] c20028| 2021-05-04T00:56:36.159+00:00 I  RESHARD  5343001 [ReshardingCoordinatorService-1] "Transitioned resharding coordinator state","attr":{"newState":"error","oldState":"blocking-writes","namespace":"reshardingDb.coll","collectionUUID":{"uuid":{"$uuid":"23d3b803-bbd6-4766-beea-80d7d50e884a"}},"reshardingUUID":{"uuid":{"$uuid":"43ab79c0-6627-4c12-ac92-a4b315575c00"}}}
        

      • recipient0 transitions to strict-consistency after flushReshardingStateChanges has acquired the locks, checked the critical section, and released the locks, but before flushReshardingStateChanges calls onShardVersionMismatch

        [js_test:resharding_fails_on_nonempty_stash] d20024| 2021-05-04T00:56:36.160+00:00 I  RESHARD  5279506 [ReshardingRecipientService-0] "Transitioned resharding recipient state","attr":{"newState":"strict-consistency","oldState":"steady-state","namespace":"reshardingDb.coll","collectionUUID":{"uuid":{"$uuid":"23d3b803-bbd6-4766-beea-80d7d50e884a"}},"reshardingUUID":{"uuid":{"$uuid":"43ab79c0-6627-4c12-ac92-a4b315575c00"}}}
        

      Attachments

        Issue Links

          Activity

            People

              randolph@mongodb.com Randolph Tan
              haley.connelly@mongodb.com Haley Connelly
              Votes:
              0 Vote for this issue
              Watchers:
              2 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved: