Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-40594

Range deleter in prepare conflict retry loop blocks step down

    • Type: Bug
    • Resolution: Fixed
    • Priority: Major - P3
    • 4.1.12
    • Affects Version/s: None
    • Component/s: Replication, Sharding
    • None
    • Fully Compatible
    • ALL
    • Hide

      Note that this repro relies on sleeps to have the collection range deleter run after the transaction is prepared and before the step down attempt, so it may need to be repeated to trigger the hang.

      (function() {
          "use strict";
      
          // NOTE(review): presumably turns off the test harness's cluster-wide collection UUID
          // consistency check for this run -- confirm it is still required for this repro.
          TestData.skipCheckingUUIDsConsistentAcrossCluster = true;
      
          /**
           * Builds a transaction command by adding the generic transaction fields to `cmd`.
           *
           * The result is a new object: `cmd` itself is left untouched, so the same command
           * literal can safely be reused by the caller. Keys of `cmd` come first in the result
           * (the command name therefore stays the first key), followed by the transaction fields.
           *
           * @param {Object} cmd - The base command, e.g. `{prepareTransaction: 1}`.
           * @param {Object} lsid - Logical session id document, e.g. `{id: UUID()}`.
           * @param {number} txnNumber - Transaction number; sent as a NumberLong.
           * @returns {Object} Copy of `cmd` with `lsid`, `txnNumber`, `stmtId: NumberInt(0)` and
           *     `autocommit: false` set.
           */
          function addTxnFieldsToCmd(cmd, lsid, txnNumber) {
              const txnFields = {
                  lsid,
                  txnNumber: NumberLong(txnNumber),
                  stmtId: NumberInt(0),
                  autocommit: false,
              };
              // Merge into a fresh object instead of into `cmd` so the caller's object is not
              // modified as a side effect.
              return Object.assign({}, cmd, txnFields);
          }
      
          const dbName = "test";
          const collName = "foo";
          const ns = `${dbName}.${collName}`;

          // This repro is timing based: the range deleter has to run after the transaction is
          // prepared and before the step down attempt, so the wait below may not always be enough.
          const rangeDeleterWaitMS = 1000;
          const stepDownSecs = 5;

          const st = new ShardingTest({shards: 2, config: 1});

          // Everything after the cluster is up runs under try/finally so that a failed assertion
          // (likely when the timing race is not hit) still shuts the cluster down.
          try {
              // Set up sharded collection with two chunks - [-inf, 0), [0, inf)
              assert.commandWorked(st.s.adminCommand({enableSharding: dbName}));
              assert.commandWorked(
                  st.s.adminCommand({movePrimary: dbName, to: st.shard0.shardName}));
              assert.commandWorked(st.s.adminCommand({shardCollection: ns, key: {_id: 1}}));
              assert.commandWorked(st.s.adminCommand({split: ns, middle: {_id: 0}}));

              // Move a chunk away from Shard0 (the donor) so its range deleter will asynchronously
              // delete the chunk's range. Flush its metadata to avoid StaleConfig during the later
              // transaction.
              assert.commandWorked(
                  st.s.adminCommand({moveChunk: ns, find: {_id: 10}, to: st.shard1.shardName}));
              const donorPrimary = st.rs0.getPrimary();
              assert.commandWorked(donorPrimary.adminCommand({_flushRoutingTableCacheUpdates: ns}));

              // Insert a doc into the chunk still owned by the donor shard in a transaction then
              // prepare the transaction so readers of that doc will enter a prepare conflict retry
              // loop.
              const lsid = {id: UUID()};
              const txnNumber = 0;
              assert.commandWorked(st.s.getDB(dbName).runCommand(addTxnFieldsToCmd(
                  {insert: collName, documents: [{_id: -5}], startTransaction: true},
                  lsid,
                  txnNumber)));

              assert.commandWorked(donorPrimary.adminCommand(
                  addTxnFieldsToCmd({prepareTransaction: 1}, lsid, txnNumber)));

              // From here on a prepared transaction exists; it must be aborted even if the
              // assertion below fails, otherwise the sharding test cannot shut down.
              try {
                  // Wait for range deleter to run. It should get stuck in a prepare conflict retry
                  // loop.
                  sleep(rangeDeleterWaitMS);

                  // Attempt to step down the primary. As in the description, this will fail with a
                  // lock timeout if the range deleter ran after the above transaction was prepared.
                  assert.commandFailedWithCode(
                      donorPrimary.adminCommand({replSetStepDown: stepDownSecs, force: true}),
                      ErrorCodes.LockTimeout);
              } finally {
                  // Clean up the transaction so the sharding test can shut down. The primary is
                  // looked up again rather than reusing donorPrimary: if the step down unexpectedly
                  // succeeded, that node may no longer be the primary.
                  assert.commandWorked(st.rs0.getPrimary().adminCommand(
                      addTxnFieldsToCmd({abortTransaction: 1}, lsid, txnNumber)));
              }
          } finally {
              st.stop();
          }
      })();
      
      Show
      Note that this repro relies on sleeps to have the collection range deleter run after the transaction is prepared and before the step down attempt, so it may need to be repeated to trigger the hang. (function() { "use strict" ; TestData.skipCheckingUUIDsConsistentAcrossCluster = true ; // Helper to add generic txn fields to a command. function addTxnFieldsToCmd(cmd, lsid, txnNumber) { return Object .extend( cmd, {lsid, txnNumber: NumberLong(txnNumber), stmtId: NumberInt(0), autocommit: false }); } const dbName = "test" ; const collName = "foo" ; const ns = dbName + "." + collName; const st = new ShardingTest({shards: 2, config: 1}); // Set up sharded collection with two chunks - [-inf, 0), [0, inf) assert .commandWorked(st.s.adminCommand({enableSharding: dbName})); assert .commandWorked(st.s.adminCommand({movePrimary: dbName, to: st.shard0.shardName})); assert .commandWorked(st.s.adminCommand({shardCollection: ns, key: {_id: 1}})); assert .commandWorked(st.s.adminCommand({split: ns, middle: {_id: 0}})); // Move a chunk away from Shard0 (the donor) so its range deleter will asynchronously delete the // chunk's range. Flush its metadata to avoid StaleConfig during the later transaction. assert .commandWorked( st.s.adminCommand({moveChunk: ns, find: {_id: 10}, to: st.shard1.shardName})); assert .commandWorked(st.rs0.getPrimary().adminCommand({_flushRoutingTableCacheUpdates: ns})); // Insert a doc into the chunk still owned by the donor shard in a transaction then prepare the // transaction so readers of that doc will enter a prepare conflict retry loop. const lsid = {id: UUID()}; const txnNumber = 0; assert .commandWorked(st.s.getDB(dbName).runCommand(addTxnFieldsToCmd( {insert: collName, documents: [{_id: -5}], startTransaction: true }, lsid, txnNumber))); assert .commandWorked(st.rs0.getPrimary().adminCommand( addTxnFieldsToCmd({prepareTransaction: 1}, lsid, txnNumber))); // Wait for range deleter to run. It should get stuck in a prepare conflict retry loop. 
sleep(1000); // Attempt to step down the primary. As in the description, this will fail with a lock timeout // if the range deleter ran after the above transaction was prepared. assert .commandFailedWithCode( st.rs0.getPrimary().adminCommand({replSetStepDown: 5, force: true }), ErrorCodes.LockTimeout); // Cleanup the transaction so the sharding test can shut down. assert .commandWorked(st.rs0.getPrimary().adminCommand( addTxnFieldsToCmd({abortTransaction: 1}, lsid, txnNumber))); st.stop(); })();
    • Sharding 2019-05-06, Sharding 2019-05-20, Sharding 2019-06-03
    • 19

      Replication step down requires the ReplicationStateTransitionLock in MODE_X and kills user operations, but it doesn't kill internal operations, like those run by the collection range deleter. If the range deleter runs and enters a prepare conflict retry loop (which waits without yielding locks), it will hang until the prepared transaction modifying the data it is reading commits or aborts. The RSTL can't be taken in exclusive mode until the range deleter operation finishes, so during this time all step down attempts will time out waiting for the RSTL.

      This should also be a problem for step up (and other operations that require the RSTL) and may be triggered by other internal operations that can read prepared data, but I've only seen this so far with step down and the range deleter. The step up case might be worse, because a prepared transaction can't commit or abort and unblock an internal operation if there's no primary.

            Assignee:
            matthew.saltz@mongodb.com Matthew Saltz (Inactive)
            Reporter:
            jack.mulrow@mongodb.com Jack Mulrow
            Votes:
            0 Vote for this issue
            Watchers:
            11 Start watching this issue

              Created:
              Updated:
              Resolved: