Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-40129

DropPendingCollectionReaper::dropCollectionsOlderThan() forever skips a drop-pending collection if dropCollection() fails

    • Type: Bug
    • Resolution: Won't Fix
    • Priority: Major - P3
    • None
    • Affects Version/s: None
    • Component/s: Storage
    • Labels:
      None
    • ALL
    • Hide

      This issue came up on the "majority reads off" build variant after changes to the rollback fuzzer to set the failNonIntentLocksIfWaitNeeded failpoint when running commands against the primary.

      python buildscripts/resmoke.py --suites=replica_sets repro_drop_pending_collection_skipped_forever.js --majorityReadConcern=off
      
      repro_drop_pending_collection_skipped_forever.js
      /**
       * Tests that even if dropping a drop-pending collection namespace fails, the drop is eventually
       * retried and the collection is removed.
       *
       * Outline of the reproduction:
       *   1. Pause the drop-pending collection reaper and drop "test1.to_be_dropped1" so that it is
       *      left behind as a "test1.system.drop.*" collection.
       *   2. Hold a transaction open on "test1.other_collection" with the
       *      "failNonIntentLocksIfWaitNeeded" failpoint enabled so the reaper's attempt to drop the
       *      drop-pending collection fails.
       *   3. Release the transaction and the failpoint, then trigger the reaper again by dropping a
       *      collection in "test2".
       *   4. Verify that no drop-pending collections remain in either database.
       */
      (function() {
          "use strict";

          load("jstests/libs/check_log.js");

          const rst = new ReplSetTest({nodes: 2});
          rst.startSet();
          rst.initiate();

          const primary = rst.getPrimary();

          assert.commandWorked(primary.getDB("test1").runCommand({create: "to_be_dropped1"}));
          assert.commandWorked(primary.getDB("test1").runCommand({create: "other_collection"}));
          assert.commandWorked(primary.getDB("test2").runCommand({create: "to_be_dropped2"}));

          // We drop the "test1.to_be_dropped1" collection with the "dropPendingCollectionReaperHang"
          // failpoint set so the collection is renamed to "test1.system.drop.*.to_be_dropped1" but the
          // drop-pending collection won't be dropped by the background thread.
          assert.commandWorked(primary.adminCommand(
              {configureFailPoint: "dropPendingCollectionReaperHang", mode: "alwaysOn"}));

          assert.commandWorked(primary.getDB("test1").runCommand({drop: "to_be_dropped1"}));

          // We then enable the "failNonIntentLocksIfWaitNeeded" failpoint and start a transaction on the
          // "test1.other_collection" namespace in order to cause the database X lock acquisition of the
          // drop-pending collection background thread to fail with a LockTimeout.
          assert.commandWorked(primary.adminCommand(
              {configureFailPoint: "failNonIntentLocksIfWaitNeeded", mode: "alwaysOn"}));

          const session = primary.startSession({causalConsistency: false});
          const sessionDB = session.getDatabase("test1");
          session.startTransaction();

          // The insert makes the transaction take its locks on "test1" and keeps them held until the
          // transaction is aborted below.
          assert.commandWorked(sessionDB.other_collection.insert({}));

          // We then re-enable the drop-pending collection background thread and wait for it to report
          // failing to drop the drop-pending collection on the "test1" database.
          assert.commandWorked(
              primary.adminCommand({configureFailPoint: "dropPendingCollectionReaperHang", mode: "off"}));

          checkLog.contains(primary, "Failed to remove drop-pending collection test1.system.drop");

          // We then abort the transaction to release the database IX lock and disable the
          // "failNonIntentLocksIfWaitNeeded" failpoint in order to allow the database X lock acquisition
          // of the drop-pending collection background thread to later succeed.
          session.abortTransaction();
          assert.commandWorked(
              primary.adminCommand({configureFailPoint: "failNonIntentLocksIfWaitNeeded", mode: "off"}));

          // We then drop a collection on the "test2" database and wait for the drop-pending collection
          // background thread to actually finish the drop of the "test2.system.drop.*.to_be_dropped2"
          // collection.
          assert.commandWorked(primary.getDB("test2").runCommand({drop: "to_be_dropped2"}));
          checkLog.contains(primary, "Finishing collection drop for test2.system.drop");

          // XXX: We should probably augment the following explicit check of the listCollections command
          // response with checkLog.contains(primary, "Finishing collection drop for test1.system.drop")
          // beforehand once the server issue is actually fixed. There likely isn't meant to be a real
          // guarantee about the order in which the drop-pending collection background thread processes
          // the namespaces.
          let collInfos = primary.getDB("test2")
                              .runCommand({listCollections: 1, includePendingDrops: true})
                              .cursor.firstBatch;
          assert.eq([], collInfos);

          // "other_collection" was never dropped, so it is the only collection allowed to remain in
          // "test1"; a leftover "system.drop.*" entry here means the failed drop was never retried.
          collInfos = primary.getDB("test1")
                          .runCommand({listCollections: 1, includePendingDrops: true})
                          .cursor.firstBatch;
          assert.eq([], collInfos.filter(collInfo => collInfo.name !== "other_collection"));

          rst.stopSet();
      })();
      
      Show
      This issue came up on the "majority reads off" build variant after changes to the rollback fuzzer to set the failNonIntentLocksIfWaitNeeded failpoint when running commands against the primary. python buildscripts/resmoke.py --suites=replica_sets repro_drop_pending_collection_skipped_forever.js --majorityReadConcern=off repro_drop_pending_collection_skipped_forever.js /** * Tests that even if dropping a drop-pending collection namespaces fails that it is eventually * retried and dropped. */ ( function () { "use strict" ; load( "jstests/libs/check_log.js" ); const rst = new ReplSetTest({nodes: 2}); rst.startSet(); rst.initiate(); const primary = rst.getPrimary(); const primaryDB = primary.getDB( "test" ); assert.commandWorked(primary.getDB( "test1" ).runCommand({create: "to_be_dropped1" })); assert.commandWorked(primary.getDB( "test1" ).runCommand({create: "other_collection" })); assert.commandWorked(primary.getDB( "test2" ).runCommand({create: "to_be_dropped2" })); // We drop the "test1.to_be_dropped1" collection with the "dropPendingCollectionReaperHang" // failpoint set so the collection is renamed to "test1.system.drop.*.to_be_dropped1" but the // drop-pending collection won't be dropped by the background thread. assert.commandWorked(primary.adminCommand( {configureFailPoint: "dropPendingCollectionReaperHang" , mode: "alwaysOn" })); assert.commandWorked(primary.getDB( "test1" ).runCommand({drop: "to_be_dropped1" })); // We then enable the "failNonIntentLocksIfWaitNeeded" failpoint and start a transaction on the // "test1.other_collection" namespace in order to cause the database X lock acquisition of the // drop-pending collection background thread to fail with a LockTimeout. 
assert.commandWorked(primary.adminCommand( {configureFailPoint: "failNonIntentLocksIfWaitNeeded" , mode: "alwaysOn" })); const session = primary.startSession({causalConsistency: false }); const sessionDB = session.getDatabase( "test1" ); session.startTransaction(); assert.commandWorked(sessionDB.other_collection.insert({})); // We then re-enable the drop-pending collection background thread and wait for it to report // failing to drop the drop-pending collection on the "test1" database. assert.commandWorked( primary.adminCommand({configureFailPoint: "dropPendingCollectionReaperHang" , mode: "off" })); checkLog.contains(primary, "Failed to remove drop-pending collection test1.system.drop" ); // We then abort the transaction to release the database IX lock and disable the // "failNonIntentLocksIfWaitNeeded" failpoint in order to allow the database X lock acquisition // of the drop-pending collection background thread to later succeed. session.abortTransaction(); assert.commandWorked( primary.adminCommand({configureFailPoint: "failNonIntentLocksIfWaitNeeded" , mode: "off" })); // We then drop a collection on the "test2" database and wait for drop-pending collection // background thread to actual finish the drop of the "test2.system.drop.*.to_be_dropped2" // collection. assert.commandWorked(primary.getDB( "test2" ).runCommand({drop: "to_be_dropped2" })); checkLog.contains(primary, "Finishing collection drop for test2.system.drop" ); // XXX: We should probably augment the following explicit check of the listCollections command // response with checkLog.contains(primary, "Finishing collection drop for test1.system.drop" ) // beforehand once the server issue is actually fixed. There likely isn't meant to be a real // guarantee about the order in which the drop-pending collection background thread processes // the namespaces. 
let collInfos = primary.getDB( "test2" ) .runCommand({listCollections: 1, includePendingDrops: true }) .cursor.firstBatch; assert.eq([], collInfos); collInfos = primary.getDB( "test1" ) .runCommand({listCollections: 1, includePendingDrops: true }) .cursor.firstBatch; assert.eq([], collInfos.filter(collInfo => collInfo.name !== "other_collection" )); rst.stopSet(); })();
    • Storage NYC 2019-04-08

      If StorageInterface::dropCollection() throws an exception, then a warning message is logged saying the collection wasn't dropped, but the namespace is still removed from _dropPendingNamespaces so we'll never attempt to drop it again later.

            Assignee:
            gregory.wlodarek@mongodb.com Gregory Wlodarek
            Reporter:
            max.hirschhorn@mongodb.com Max Hirschhorn
            Votes:
            0 Vote for this issue
            Watchers:
            3 Start watching this issue

              Created:
              Updated:
              Resolved: