Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-42621

3 way deadlock can happen between hybrid index build, prepared transactions and stepdown thread.

    • Type: Icon: Bug Bug
    • Resolution: Fixed
    • Priority: Icon: Major - P3 Major - P3
    • 4.3.1
    • Affects Version/s: None
    • Component/s: Replication
    • Labels:
      None
    • Fully Compatible
    • ALL
    • v4.2
    • Hide
      /*
       * 3 way deadlock - Hybrid index builder, prepared txn and step down thread.
       *
       * Outline of the steps performed below:
       *   1. Start a two-node replica set and insert one document with majority write concern.
       *   2. Turn on the "hangAfterIndexBuildDumpsInsertsFromBulk" failpoint and start a
       *      createIndex on {x: 1} from a parallel shell.
       *   3. Start and prepare a transaction that inserts into the same collection.
       *   4. Issue a forced replSetStepDown from a second parallel shell, then turn the
       *      failpoint off and join both parallel shells.
       *   5. Step the original primary back up and commit the prepared transaction.
       */
      load("jstests/libs/check_log.js");
      load("jstests/core/txns/libs/prepare_helpers.js");
      
      (function() {
      
      "use strict";
      
      const testName = "hybridIndexBuildRepro";
      const dbName = "test";
      const collName = "coll";
      
      // Two-node replica set; the second node is configured with priority 0 (presumably so
      // that the first node is the only one that becomes primary -- confirm).
      var rst = new ReplSetTest({name: testName, nodes: [{}, {rsConfig: {priority: 0}}]});
      rst.startSet();
      rst.initiate();
      
      var primary = rst.getPrimary();
      const primaryDB = primary.getDB(dbName);
      const primaryAdmin = primary.getDB("admin");
      const primaryColl = primaryDB[collName];
      // NOTE(review): collNss is not referenced anywhere else in this script.
      const collNss = primaryColl.getFullName();
      
      // The function passed to startParallelShell below reads the database and collection
      // names from TestData rather than from the local constants.
      TestData.dbName = dbName;
      TestData.collName = collName;
      
      jsTestLog("1. Do a document write");
      assert.writeOK(
              primaryColl.insert({_id: 1, x:1}, {"writeConcern": {"w": "majority"}}));
      rst.awaitReplication();
      
      // Clear the log, so that the checkLog.contains() calls below only see messages logged
      // after this point.
      assert.commandWorked(primaryAdmin.runCommand({clearLog: 'global'}));
      
      // Enable the failpoint which makes the hybrid index build hang.
      assert.commandWorked(primaryAdmin.runCommand(
              {configureFailPoint: "hangAfterIndexBuildDumpsInsertsFromBulk", mode: "alwaysOn"}));
      
      // Run createIndex from a separate shell connected to the primary's port. The command is
      // asserted to fail (presumably because the step down issued later interrupts it --
      // see the issue description).
      const indexThread = startParallelShell(() => {
          jsTestLog("Create index");
          var primaryDB = db.getSiblingDB(TestData.dbName);
          assert.commandFailed(primaryDB[TestData.collName].createIndex({"x": 1}));
      }, primary.port);
      
      rst.awaitReplication();
      
      // Wait for the index build to reach the hangAfterIndexBuildDumpsInsertsFromBulk failpoint,
      // detected through the message it writes to the primary's log.
      checkLog.contains(primary, "Hanging after dumping inserts from bulk builder");
      
      jsTestLog("Start txn");
      let session = primary.startSession({causalConsistency: false});
      let sessionDB = session.getDatabase(dbName);
      const sessionColl = sessionDB.getCollection(collName);
      session.startTransaction();
      // NOTE(review): the second argument to insert() is {$set: {y: 1}}, which looks like an
      // update modifier rather than an insert options document -- confirm this is intended.
      assert.commandWorked(sessionColl.insert({x: 1}, {$set: {y: 1}}));
      
      jsTestLog("Prepare txn");
      // The returned timestamp is passed to PrepareHelpers.commitTransaction() at the end.
      let prepareTimestamp = PrepareHelpers.prepareTransaction(session);
      
      // Issue a forced step down from a separate shell connected to the primary's port, with a
      // replSetStepDown value of 60*60 (presumably seconds, i.e. one hour -- confirm).
      const stepDownThread = startParallelShell(() => {
          jsTestLog("Make primary to step down");
          assert.commandWorked(db.adminCommand({"replSetStepDown": 60*60, "force": true}));
      }, primary.port);
      
      // Do not release the index build until the primary has logged that it is killing user
      // operations.
      checkLog.contains(primary, "Starting to kill user operations");
      
      // Turn the failpoint off so the hung index build can proceed.
      assert.commandWorked(primaryAdmin.runCommand(
              {configureFailPoint: "hangAfterIndexBuildDumpsInsertsFromBulk", mode: "off"}));
      
      // Wait for threads to join (the values returned by startParallelShell are called here).
      indexThread();
      stepDownThread();
      
      rst.waitForState(primary, ReplSetTest.State.SECONDARY);
      // Unfreeze the original primary so that it can stand for election again for the next test.
      assert.commandWorked(primary.adminCommand({replSetFreeze: 0}));
      // Make the original primary get re-elected.
      assert.commandWorked(primary.adminCommand({replSetStepUp: 1}));
      primary = rst.getPrimary();
      
      jsTestLog("commit txn");
      // Commit the transaction that was prepared before the step down, using its prepare
      // timestamp.
      assert.commandWorked(PrepareHelpers.commitTransaction(session, prepareTimestamp));
      
      
      rst.stopSet();
      })();
      
      Show
      /* * 3 way deadlock - Hybrid index builder, prepared txn and step down thread. */ load("jstests/libs/check_log.js"); load("jstests/core/txns/libs/prepare_helpers.js"); (function() { "use strict"; const testName = "hybridIndexBuildRepro"; const dbName = "test"; const collName = "coll"; var rst = new ReplSetTest({name: testName, nodes: [{}, {rsConfig: {priority: 0}}]}); rst.startSet(); rst.initiate(); var primary = rst.getPrimary(); const primaryDB = primary.getDB(dbName); const primaryAdmin = primary.getDB("admin"); const primaryColl = primaryDB[collName]; const collNss = primaryColl.getFullName(); TestData.dbName = dbName; TestData.collName = collName; jsTestLog("1. Do a document write"); assert.writeOK( primaryColl.insert({_id: 1, x:1}, {"writeConcern": {"w": "majority"}})); rst.awaitReplication(); // Clear the log. assert.commandWorked(primaryAdmin.runCommand({clearLog: 'global'})); // Enable failpoint which makes hybrid index build build to hang. assert.commandWorked(primaryAdmin.runCommand( {configureFailPoint: "hangAfterIndexBuildDumpsInsertsFromBulk", mode: "alwaysOn"})); const indexThread = startParallelShell(() => { jsTestLog("Create index"); var primaryDB = db.getSiblingDB(TestData.dbName); assert.commandFailed(primaryDB[TestData.collName].createIndex({"x": 1})); }, primary.port); rst.awaitReplication(); // Wait for hangAfterIndexBuildDumpsInsertsFromBulk failpoint to reach. 
checkLog.contains(primary, "Hanging after dumping inserts from bulk builder"); jsTestLog("Start txn"); let session = primary.startSession({causalConsistency: false}); let sessionDB = session.getDatabase(dbName); const sessionColl = sessionDB.getCollection(collName); session.startTransaction(); assert.commandWorked(sessionColl.insert({x: 1}, {$set: {y: 1}})); jsTestLog("Prepare txn"); let prepareTimestamp = PrepareHelpers.prepareTransaction(session); const stepDownThread = startParallelShell(() => { jsTestLog("Make primary to step down"); assert.commandWorked(db.adminCommand({"replSetStepDown": 60*60, "force": true})); }, primary.port); checkLog.contains(primary, "Starting to kill user operations"); assert.commandWorked(primaryAdmin.runCommand( {configureFailPoint: "hangAfterIndexBuildDumpsInsertsFromBulk", mode: "off"})); // Wait for threads to join. indexThread(); stepDownThread(); rst.waitForState(primary, ReplSetTest.State.SECONDARY); // Unfreeze the original primary so that it can stand for election again for the next test. assert.commandWorked(primary.adminCommand({replSetFreeze: 0})); // Make the primary to reelect again. assert.commandWorked(primary.adminCommand({replSetStepUp: 1})); primary = rst.getPrimary(); jsTestLog("commit txn"); assert.commandWorked(PrepareHelpers.commitTransaction(session, prepareTimestamp)); rst.stopSet(); })();
    • Execution Team 2019-08-12, Repl 2019-08-26, Repl 2019-09-09
    • 7

      Currently, we can see a 3-way deadlock between the hybrid index builder, a prepared transaction, and the step down thread for the above repro. The problem arises when the step down thread kills the "createIndex" command thread. As part of the index teardown step on the primary, MultiIndexBlock::cleanUpAfterBuild is called with the RSTL held in IX mode; it then tries to acquire an X lock on the user collection under an uninterruptible lock guard, but gets blocked behind the prepared transaction due to a collection lock conflict. Since createIndex is holding the RSTL in IX mode, it blocks the step down thread. The commitTransaction command, waiting to acquire the RSTL in IX mode, in turn gets blocked behind the step down thread, because the step down thread has enqueued an RSTL request in X mode.

            Assignee:
            suganthi.mani@mongodb.com Suganthi Mani
            Reporter:
            suganthi.mani@mongodb.com Suganthi Mani
            Votes:
            0 Vote for this issue
            Watchers:
            9 Start watching this issue

              Created:
              Updated:
              Resolved: