Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-31485

Race between move chunks and dropIndex may lead to IndexNotFound error

    XMLWordPrintable

    Details

    • Type: Bug
    • Status: Closed
    • Priority: Major - P3
    • Resolution: Fixed
    • Affects Version/s: 3.5.13
    • Fix Version/s: 3.6.0-rc3
    • Component/s: Sharding
    • Labels:
      None
    • Backwards Compatibility:
      Fully Compatible
    • Operating System:
      ALL
    • Steps To Reproduce:
      Hide

      Use the following patch:

      diff --git a/src/mongo/db/s/migration_destination_manager.cpp b/src/mongo/db/s/migration_destination_manager.cpp
      index 0dd4967..14944f0 100644
      --- a/src/mongo/db/s/migration_destination_manager.cpp
      +++ b/src/mongo/db/s/migration_destination_manager.cpp
      @@ -207,6 +207,7 @@ MONGO_FP_DECLARE(migrateThreadHangAtStep3);
       MONGO_FP_DECLARE(migrateThreadHangAtStep4);
       MONGO_FP_DECLARE(migrateThreadHangAtStep5);
       MONGO_FP_DECLARE(migrateThreadHangAtStep6);
      +MONGO_FP_DECLARE(moveChunkDropIndex);
       
       MONGO_FP_DECLARE(failMigrationReceivedOutOfRangeOperation);
       
      @@ -589,6 +590,8 @@ void MigrationDestinationManager::_migrateDriver(OperationContext* opCtx,
               }
           }
       
      +    MONGO_FAIL_POINT_PAUSE_WHILE_SET(moveChunkDropIndex);
      +
           {
               // 1. copy indexes
      

      Then run the following file using: ./mongo --nodb repro_bf6752.js

      repro_bf6752.js

      /**
       * Reproduces the race between moveChunk and dropIndex in a sharded cluster
       * (SERVER-31485): dropIndex reaches the recipient shard after the collection
       * is created but before its indexes are copied, yielding IndexNotFound.
       */
      (function() {
          "use strict";

          load('jstests/libs/parallelTester.js');

          // Runs in a separate thread. The moveChunk triggers collection creation
          // (and, later, index creation) on the recipient shard.
          function moveChunk_dropIndex(host) {
              const conn = new Mongo(host);
              const db = conn.getDB("test");

              assert.commandWorked(db.adminCommand({
                  moveChunk: "test.mycoll",
                  find: {moveChunk_dropIndex_field: 1},
                  to: "moveChunkDropIndex-rs1"
              }));
          }

          const st = new ShardingTest(
              {name: "moveChunkDropIndex", mongos: 1, config: 1, shards: 2, rs: {nodes: 1}});
          const db = st.s.getDB("test");
          const coll = db.getCollection("mycoll");
          const mongos = st.s0;
          const shard1DB = st.shard1.getDB("test");

          assert.commandWorked(mongos.adminCommand({enableSharding: "test"}));
          assert.commandWorked(
              db.adminCommand({shardCollection: "test.mycoll", key: {moveChunk_dropIndex_field: 1}}));

          assert.writeOK(coll.insert({moveChunk_dropIndex_field: 1}));

          // Pause the migration on the recipient shard right after the collection is
          // created with its options, but before its indexes are copied over.
          assert.commandWorked(
              shard1DB.adminCommand({configureFailPoint: "moveChunkDropIndex", mode: 'alwaysOn'}));

          // Kick off moveChunk from a separate thread so this thread can race it
          // with a dropIndex issued through mongos.
          const dropIndexThread = new ScopedThread(moveChunk_dropIndex, mongos.host);
          dropIndexThread.start();

          // Sync point: wait until the collection exists on the recipient shard.
          assert.soon(function() {
              return shard1DB.getCollectionInfos({name: "mycoll"}).length === 1;
          });

          // With the bug present, the recipient shard answers IndexNotFound, so this
          // assertion trips and demonstrates the race.
          assert.commandWorked(coll.dropIndex({moveChunk_dropIndex_field: 1}));

          // Release the migration and let the moveChunk thread finish.
          assert.commandWorked(
              shard1DB.adminCommand({configureFailPoint: "moveChunkDropIndex", mode: "off"}));

          dropIndexThread.join();
          st.stop();
      })();
      

      Show
      Use the following patch: diff --git a/src/mongo/db/s/migration_destination_manager.cpp b/src/mongo/db/s/migration_destination_manager.cpp index 0dd4967..14944f0 100644 --- a/src/mongo/db/s/migration_destination_manager.cpp +++ b/src/mongo/db/s/migration_destination_manager.cpp @@ -207,6 +207,7 @@ MONGO_FP_DECLARE(migrateThreadHangAtStep3); MONGO_FP_DECLARE(migrateThreadHangAtStep4); MONGO_FP_DECLARE(migrateThreadHangAtStep5); MONGO_FP_DECLARE(migrateThreadHangAtStep6); +MONGO_FP_DECLARE(moveChunkDropIndex); MONGO_FP_DECLARE(failMigrationReceivedOutOfRangeOperation); @@ -589,6 +590,8 @@ void MigrationDestinationManager::_migrateDriver(OperationContext* opCtx, } } + MONGO_FAIL_POINT_PAUSE_WHILE_SET(moveChunkDropIndex); + { // 1. copy indexes Then run the following file using: ./mongo --nodb repro_bf6752.js repro_bf6752.js /** * Test race condition between moveChunk and dropIndex in a sharded cluster. */ ( function () { "use strict" ;   load( 'jstests/libs/parallelTester.js' );   // moveChunk will initiate collection creation on secondary shard. function moveChunk_dropIndex(host) {   const conn = new Mongo(host); const db = conn.getDB( "test" );   assert.commandWorked(db.adminCommand({ moveChunk : "test.mycoll" , find : {moveChunk_dropIndex_field: 1}, to : "moveChunkDropIndex-rs1" }));   }   const st = new ShardingTest( {name: "moveChunkDropIndex" , mongos: 1, config: 1, shards: 2, rs: {nodes: 1}}); const db = st.s.getDB( "test" ); const coll = db.getCollection( "mycoll" ); const mongos = st.s0; const shard1DB = st.shard1.getDB( "test" );   assert.commandWorked(mongos.adminCommand({enableSharding: "test" })); assert.commandWorked( db.adminCommand({shardCollection: "test.mycoll" , key: {moveChunk_dropIndex_field: 1}}));   assert.writeOK(coll.insert({moveChunk_dropIndex_field: 1}));   // Enable fail point on secondary shard. This will stop execution of moveChunk right after // collection is created with options, but before indexes are created. 
assert.commandWorked( shard1DB.adminCommand({ "configureFailPoint" : "moveChunkDropIndex" , "mode" : 'alwaysOn' })); // Create separate thread to run 'dropIndex' command. const dropIndexThread = new ScopedThread(moveChunk_dropIndex, mongos.host); dropIndexThread.start();   // This is used as a sync point. Wait until collection is created on secondary shard. assert.soon( function () { return shard1DB.getCollectionInfos({name: "mycoll" }).length === 1; });   // Drop index command should fail with IndexNotFound. assert.commandWorked(coll.dropIndex({moveChunk_dropIndex_field: 1}));   // Disable failpoint and wait for dropIndexThread to exit. assert.commandWorked( shard1DB.adminCommand({configureFailPoint : "moveChunkDropIndex" , "mode" : "off" }));   dropIndexThread.join(); st.stop(); })();
    • Sprint:
      Sharding 2017-10-23, Sharding 2017-11-13
    • Linked BF Score:
      0

      Description

      There is a race condition between the migration of chunks to the recipient shard (and the accompanying index creation) and the dropIndexes command. With help from Max Hirschhorn we theorize the following scenario.
      1. A client runs an index build. Mongos broadcasts it to all shards.
      2. On shard 1, build index completes.
      3. On shard 2, with no data present for the collection, no index is created, nor is the collection implicitly created.
      4. At a later time, a move chunk is initiated from shard 1 to shard 2.
      5. At shard 2, chunk is migrated and hence collection exists, but before indexes are created.
      6. Drop index is broadcast to both shards. It completes successfully on shard 1, but on shard 2 it returns IndexNotFound.
      7. Indexes are created on collection on shard 2.
      A thought on possible solution.
      1. Instead of returning IndexNotFound, return another error code that would allow the mongos to trigger a retry in this scenario.
      2. Have dropIndexes block until collection "cloning" completes on secondary shard.
      Backlog - Sharding Team I'm going to pass this on to you guys to have a look at the possible solutions.

        Attachments

          Issue Links

            Activity

              People

              • Votes:
                0 Vote for this issue
                Watchers:
                4 Start watching this issue

                Dates

                • Created:
                  Updated:
                  Resolved: