Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-54087

After aborted tenant migration the recipient fails to delete the DB and the retry migration fails

    XMLWordPrintableJSON

Details

    • Replication
    • ALL
    • Hide

      The test will be committed as part of https://jira.mongodb.org/browse/SERVER-53457.

      /**
       * Stress test verifies that non-idempotent multi updates made during tenant migration
       * were not retried on migration abort, which would create duplicate updates. Partially
       * updated collection where each update is applied no more than once is still an expected result.
       *
       * @tags: [requires_fcv_47, requires_majority_read_concern, incompatible_with_eft]
       */
       
      (function() {
      "use strict";
       
      load("jstests/libs/fail_point_util.js");
      load("jstests/libs/parallelTester.js");
      load("jstests/libs/uuid_util.js");
      load("jstests/replsets/libs/tenant_migration_test.js");
      load("jstests/replsets/libs/tenant_migration_util.js");
       
      const tenantMigrationTest = new TenantMigrationTest({name: jsTestName()});
      if (!tenantMigrationTest.isFeatureFlagEnabled()) {
          jsTestLog("Skipping test because the tenant migrations feature flag is disabled");
          return;
      }
       
      const donorRst = tenantMigrationTest.getDonorRst();
       
      const primary = donorRst.getPrimary();
       
      const kTenantIdPrefix = "testTenantId";
      const kCollName = "testColl";
      const kTenantDefinedDbName = "0";
      const kRecords = 1000;
       
      function prepareDatabase(dbName) {
          let db = primary.getDB(dbName);
          assert.commandWorked(db.runCommand({create: kCollName}));
          for (let i = 0; i < kRecords;) {
              let bulk = db[kCollName].initializeUnorderedBulkOp();
              for (let j = 0; j < 100 && i < kRecords; ++j, ++i) {
                  bulk.insert({_id: i, x: -1, a: [1, 2]});
              }
              assert.commandWorked(bulk.execute());
          }
      }
       
      function doMultiUpdate(primaryHost, dbName, collName, records) {
          const primary = new Mongo(primaryHost);
          const cycles = 400;
          let db = primary.getDB(dbName);
          let bulkRes;
          let completeCycles;
       
          for (completeCycles = 0; completeCycles < cycles; ++completeCycles) {
              var bulk = db[collName].initializeUnorderedBulkOp();
              try {
                  bulk.find({a: 2}).update({$inc: {x: 1}});
                  bulkRes = bulk.execute({w: "majority"}).toSingleResult();
                  if (bulkRes["nModified"] != records && bulkRes["nModified"] > 0) {
                      jsTest.log('Detected partial update');
                      jsTest.log(bulkRes);
                      break;
                  }
              } catch (err) {
                  jsTest.log(err);
                  if (err["nModified"] != records && err["nModified"] > 0) {
                      break;  // The final consistency analysis allows only one failure.
                  }
                  jsTest.log('Continue the test');
              }
          }
       
          // Verify the data after update.
          let result = assert.commandWorked(db.runCommand({
              find: collName,
          }));
          for (let j = 0; j < result.length; ++j) {
              if (result[j]['x'] != completeCycles && result[j]['x'] != completeCycles - 1) {
                  assert(false, `Unexpected result ${result}`);
              }
          }
          jsTest.log('All updates completed without consistency error');
      }
       
      function testMultiWritesWhileInBlockingState() {
          const donorRst = tenantMigrationTest.getDonorRst();
          const donorPrimary = donorRst.getPrimary();
       
          const tenantId = `${kTenantIdPrefix}-multiWrites`;
          const dbName = tenantMigrationTest.tenantDB(tenantId, kTenantDefinedDbName);
       
          let db = primary.getDB(dbName);
          prepareDatabase(dbName);
       
          // Start non-idempotent writes in a thread.
          let writesThread = new Thread(doMultiUpdate, primary.host, dbName, kCollName, kRecords);
          writesThread.start();
       
          for (let i = 0; i < 10; ++i) {
              const migrationId = UUID();
              const migrationOpts = {
                  migrationIdString: extractUUIDFromObject(migrationId),
                  tenantId,
                  recipientConnString: tenantMigrationTest.getRecipientConnString(),
              };
              let abortFp = configureFailPoint(db, "abortTenantMigrationAfterBlockingStarts", {
                  blockTimeMS: Math.floor(Math.random() * 10),
              });
              assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts));
       
              const stateRes = assert.commandWorked(tenantMigrationTest.waitForMigrationToComplete(
                  migrationOpts, false /* retryOnRetryableErrors */));
              assert.eq(stateRes.state, TenantMigrationTest.State.kAborted);
       
              abortFp.wait();
              abortFp.off();
       
              assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString));
              jsTest.log('Migration cycle completed.');
          }
       
          writesThread.join();
      }
       
      jsTest.log("Test sending multi write while in migration blocking state");
      testMultiWritesWhileInBlockingState();
       
      tenantMigrationTest.stop();
      })();
      
      

      Show
      The test will be committed as part of https://jira.mongodb.org/browse/SERVER-53457 . /** * Stress test verifies that non-idempotent multi updates made during tenant migration * were not retried on migration abort, which would create duplicate updates. Partially * updated collection where each update is applied no more than once is still an expected result. * * @tags: [requires_fcv_47, requires_majority_read_concern, incompatible_with_eft] */   ( function () { "use strict" ;   load( "jstests/libs/fail_point_util.js" ); load( "jstests/libs/parallelTester.js" ); load( "jstests/libs/uuid_util.js" ); load( "jstests/replsets/libs/tenant_migration_test.js" ); load( "jstests/replsets/libs/tenant_migration_util.js" );   const tenantMigrationTest = new TenantMigrationTest({name: jsTestName()}); if (!tenantMigrationTest.isFeatureFlagEnabled()) { jsTestLog( "Skipping test because the tenant migrations feature flag is disabled" ); return ; }   const donorRst = tenantMigrationTest.getDonorRst();   const primary = donorRst.getPrimary();   const kTenantIdPrefix = "testTenantId" ; const kCollName = "testColl" ; const kTenantDefinedDbName = "0" ; const kRecords = 1000;   function prepareDatabase(dbName) { let db = primary.getDB(dbName); assert.commandWorked(db.runCommand({create: kCollName})); for (let i = 0; i < kRecords;) { let bulk = db[kCollName].initializeUnorderedBulkOp(); for (let j = 0; j < 100 && i < kRecords; ++j, ++i) { bulk.insert({_id: i, x: -1, a: [1, 2]}); } assert.commandWorked(bulk.execute()); } }   function doMultiUpdate(primaryHost, dbName, collName, records) { const primary = new Mongo(primaryHost); const cycles = 400; let db = primary.getDB(dbName); let bulkRes; let completeCycles;   for (completeCycles = 0; completeCycles < cycles; ++completeCycles) { var bulk = db[collName].initializeUnorderedBulkOp(); try { bulk.find({a: 2}).update({$inc: {x: 1}}); bulkRes = bulk.execute({w: "majority" }).toSingleResult(); if (bulkRes[ "nModified" ] != records && bulkRes[ "nModified" ] > 0) { jsTest.log( 'Detected partial update' ); jsTest.log(bulkRes); break ; } } catch (err) { jsTest.log(err); if (err[ "nModified" ] != records && err[ "nModified" ] > 0) { break ; // The final consistency analysis allows only one failure. } jsTest.log( 'Continue the test' ); } }   // Verify the data after update. let result = assert.commandWorked(db.runCommand({ find: collName, })); for (let j = 0; j < result.length; ++j) { if (result[j][ 'x' ] != completeCycles && result[j][ 'x' ] != completeCycles - 1) { assert( false , `Unexpected result ${result}`); } } jsTest.log( 'All updates completed without consistency error' ); }   function testMultiWritesWhileInBlockingState() { const donorRst = tenantMigrationTest.getDonorRst(); const donorPrimary = donorRst.getPrimary();   const tenantId = `${kTenantIdPrefix}-multiWrites`; const dbName = tenantMigrationTest.tenantDB(tenantId, kTenantDefinedDbName);   let db = primary.getDB(dbName); prepareDatabase(dbName);   // Start non-idempotent writes in a thread. let writesThread = new Thread(doMultiUpdate, primary.host, dbName, kCollName, kRecords); writesThread.start();   for (let i = 0; i < 10; ++i) { const migrationId = UUID(); const migrationOpts = { migrationIdString: extractUUIDFromObject(migrationId), tenantId, recipientConnString: tenantMigrationTest.getRecipientConnString(), }; let abortFp = configureFailPoint(db, "abortTenantMigrationAfterBlockingStarts" , { blockTimeMS: Math.floor(Math.random() * 10), }); assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts));   const stateRes = assert.commandWorked(tenantMigrationTest.waitForMigrationToComplete( migrationOpts, false /* retryOnRetryableErrors */ )); assert.eq(stateRes.state, TenantMigrationTest.State.kAborted);   abortFp.wait(); abortFp.off();   assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString)); jsTest.log( 'Migration cycle completed.' ); }   writesThread.join(); }   jsTest.log( "Test sending multi write while in migration blocking state" ); testMultiWritesWhileInBlockingState();   tenantMigrationTest.stop(); })();

    Description

      I've made a stress test for https://jira.mongodb.org/browse/SERVER-53457 and it hits a problem that aborted migration cannot be repeated because after the first cycle the receiver now contains the database that was migrated and fails the retry. The integration test is attached.

       

      The stress test is running 2 threads: 1st thread is repeatedly sending a non-idempotent multi-update to the donor, 2nd thread is looping through repeated tenant migrations, aborting each migration using fail point `abortTenantMigrationAfterBlockingStarts`.

      After the first migration is aborted the donor sends the recipient the abort request:

       

      [js_test:tenant_migration_multi_writes] 2021-01-27T21:43:42.353+0000 d20022| {"t":{"$date":"2021-01-27T21:43:42.353+00:00"},"s":"I",  "c":"REPL",     "id":4881400, "ctx":"conn15","msg":"Forgetting migration due to recipientForgetMigration command","attr":{"migrationId":{"uuid":{"$uuid":"172ff6b5-0611-4a63-bed5-0a887c5a9095"}},"tenantId":"testTenantId-multiWrites"}}
       
      [js_test:tenant_migration_multi_writes] 2021-01-27T21:43:42.356+0000 d20022| {"t":{"$date":"2021-01-27T21:43:42.356+00:00"},"s":"I",  "c":"REPL",     "id":4881401, "ctx":"TenantMigrationRecipientService-0","msg":"Migration marked to be garbage collectable due to recipientForgetMigration command","attr":{"migrationId":{"uuid":{"$uuid":"172ff6b5-0611-4a63-bed5-0a887c5a9095"}},"tenantId":"testTenantId-multiWrites","expireAt":{"$date":"2021-01-29T21:43:42.353Z"}}}
      

      On the next loop the donor tries again but the recipient fails with:

      [js_test:tenant_migration_multi_writes] 2021-01-27T21:43:42.898+0000 d20022| {"t":{"$date":"2021-01-27T21:43:42.898+00:00"},"s":"D2", "c":"TENANT_M", "id":5271500, "ctx":"TenantMigrationRecipientService-1","msg":"listExistingDatabases entry","attr":{"migrationId":{"uuid":{"$uuid":"9d949264-9b87-4f78-bf11-7952dde38a4f"}},"tenantId":"testTenantId-multiWrites","db":{"name":"testTenantId-multiWrites_0"}}}
      [js_test:tenant_migration_multi_writes] 2021-01-27T21:43:42.898+0000 d20022| {"t":{"$date":"2021-01-27T21:43:42.898+00:00"},"s":"I",  "c":"TENANT_M", "id":21077,   "ctx":"TenantMigrationRecipientService-1","msg":"Non-retryable error occurred during cloner stage","attr":{"cloner":"TenantAllDatabaseCloner","stage":"listExistingDatabases","error":{"code":48,"codeName":"NamespaceExists","errmsg":"Tenant 'testTenantId-multiWrites': databases already exist prior to data sync"}}}
      [js_test:tenant_migration_multi_writes] 2021-01-27T21:43:42.898+0000 d20022| {"t":{"$date":"2021-01-27T21:43:42.898+00:00"},"s":"I",  "c":"REPL",     "id":4878501, "ctx":"TenantMigrationRecipientService-1","msg":"Tenant migration recipient instance: Data sync completed.","attr":{"tenantId":"testTenantId-multiWrites","migrationId":{"uuid":{"$uuid":"9d949264-9b87-4f78-bf11-7952dde38a4f"}},"error":{"code":48,"codeName":"NamespaceExists","errmsg":"Tenant 'testTenantId-multiWrites': databases already exist prior to data sync"}}}
      

      the donor receives the error, using my custom log:

      [js_test:tenant_migration_multi_writes] 2021-01-27T21:43:42.898+0000 d20020| !!!! migration p4, error NamespaceExists Tenant migration recipient command failed :: caused by :: Tenant 'testTenantId-multiWrites': databases already exist prior to data sync, threads !!!!! idle threads 3 for pool TenantMigrationDonorServiceThreadPool
      

      Unfortunately this error is not properly logged by default at donor, which I will fix in my CR.

       

      Attachments

        Activity

          People

            backlog-server-repl Backlog - Replication Team
            andrew.shuvalov@mongodb.com Andrew Shuvalov (Inactive)
            Votes:
            0 Vote for this issue
            Watchers:
            4 Start watching this issue

            Dates

              Created:
              Updated:
              Resolved: