[SERVER-54087] After aborted tenant migration the recipient fails to delete the DB and the retry migration fails Created: 27/Jan/21  Updated: 27/Oct/23  Resolved: 01/Feb/21

Status: Closed
Project: Core Server
Component/s: Replication
Affects Version/s: None
Fix Version/s: None

Type: Bug Priority: Major - P3
Reporter: Andrew Shuvalov (Inactive) Assignee: Backlog - Replication Team
Resolution: Works as Designed Votes: 0
Labels: pm-1791_non-cloud-blocking
Remaining Estimate: Not Specified
Time Spent: Not Specified
Original Estimate: Not Specified

Assigned Teams:
Replication
Operating System: ALL
Steps To Reproduce:

The test will be committed as part of https://jira.mongodb.org/browse/SERVER-53457.

/**
 * Stress test verifies that non-idempotent multi updates made during tenant migration
 * were not retried on migration abort, which would create duplicate updates. Partially
 * updated collection where each update is applied no more than once is still an expected result.
 *
 * @tags: [requires_fcv_47, requires_majority_read_concern, incompatible_with_eft]
 */
 
(function() {
"use strict";
 
load("jstests/libs/fail_point_util.js");
load("jstests/libs/parallelTester.js");
load("jstests/libs/uuid_util.js");
load("jstests/replsets/libs/tenant_migration_test.js");
load("jstests/replsets/libs/tenant_migration_util.js");
 
const tenantMigrationTest = new TenantMigrationTest({name: jsTestName()});
if (!tenantMigrationTest.isFeatureFlagEnabled()) {
    jsTestLog("Skipping test because the tenant migrations feature flag is disabled");
    return;
}
 
const donorRst = tenantMigrationTest.getDonorRst();
 
const primary = donorRst.getPrimary();
 
const kTenantIdPrefix = "testTenantId";
const kCollName = "testColl";
const kTenantDefinedDbName = "0";
const kRecords = 1000;
 
function prepareDatabase(dbName) {
    let db = primary.getDB(dbName);
    assert.commandWorked(db.runCommand({create: kCollName}));
    for (let i = 0; i < kRecords;) {
        let bulk = db[kCollName].initializeUnorderedBulkOp();
        for (let j = 0; j < 100 && i < kRecords; ++j, ++i) {
            bulk.insert({_id: i, x: -1, a: [1, 2]});
        }
        assert.commandWorked(bulk.execute());
    }
}
 
function doMultiUpdate(primaryHost, dbName, collName, records) {
    const primary = new Mongo(primaryHost);
    const cycles = 400;
    let db = primary.getDB(dbName);
    let bulkRes;
    let completeCycles;
 
    for (completeCycles = 0; completeCycles < cycles; ++completeCycles) {
        var bulk = db[collName].initializeUnorderedBulkOp();
        try {
            bulk.find({a: 2}).update({$inc: {x: 1}});
            bulkRes = bulk.execute({w: "majority"}).toSingleResult();
            if (bulkRes["nModified"] != records && bulkRes["nModified"] > 0) {
                jsTest.log('Detected partial update');
                jsTest.log(bulkRes);
                break;
            }
        } catch (err) {
            jsTest.log(err);
            if (err["nModified"] != records && err["nModified"] > 0) {
                break;  // The final consistency analysis allows only one failure.
            }
            jsTest.log('Continue the test');
        }
    }
 
    // Verify the data after update.
    let result = assert.commandWorked(db.runCommand({
        find: collName,
    }));
    for (let j = 0; j < result.length; ++j) {
        if (result[j]['x'] != completeCycles && result[j]['x'] != completeCycles - 1) {
            assert(false, `Unexpected result ${result}`);
        }
    }
    jsTest.log('All updates completed without consistency error');
}
 
function testMultiWritesWhileInBlockingState() {
    const donorRst = tenantMigrationTest.getDonorRst();
    const donorPrimary = donorRst.getPrimary();
 
    const tenantId = `${kTenantIdPrefix}-multiWrites`;
    const dbName = tenantMigrationTest.tenantDB(tenantId, kTenantDefinedDbName);
 
    let db = primary.getDB(dbName);
    prepareDatabase(dbName);
 
    // Start non-idempotent writes in a thread.
    let writesThread = new Thread(doMultiUpdate, primary.host, dbName, kCollName, kRecords);
    writesThread.start();
 
    for (let i = 0; i < 10; ++i) {
        const migrationId = UUID();
        const migrationOpts = {
            migrationIdString: extractUUIDFromObject(migrationId),
            tenantId,
            recipientConnString: tenantMigrationTest.getRecipientConnString(),
        };
        let abortFp = configureFailPoint(db, "abortTenantMigrationAfterBlockingStarts", {
            blockTimeMS: Math.floor(Math.random() * 10),
        });
        assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts));
 
        const stateRes = assert.commandWorked(tenantMigrationTest.waitForMigrationToComplete(
            migrationOpts, false /* retryOnRetryableErrors */));
        assert.eq(stateRes.state, TenantMigrationTest.State.kAborted);
 
        abortFp.wait();
        abortFp.off();
 
        assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString));
        jsTest.log('Migration cycle completed.');
    }
 
    writesThread.join();
}
 
jsTest.log("Test sending multi write while in migration blocking state");
testMultiWritesWhileInBlockingState();
 
tenantMigrationTest.stop();
})();

Participants:

 Description   

I've made a stress test for https://jira.mongodb.org/browse/SERVER-53457 and it hits a problem that aborted migration cannot be repeated because after the first cycle the receiver now contains the database that was migrated and fails the retry. The integration test is attached.

 

The stress test is running 2 threads: 1st thread is repeatedly sending a non-idempotent multi-update to the donor, 2nd thread is looping through repeated tenant migrations, aborting each migration using fail point `abortTenantMigrationAfterBlockingStarts`.

After the first migration is aborted the donor sends the recipient the abort request:

 

[js_test:tenant_migration_multi_writes] 2021-01-27T21:43:42.353+0000 d20022| {"t":{"$date":"2021-01-27T21:43:42.353+00:00"},"s":"I",  "c":"REPL",     "id":4881400, "ctx":"conn15","msg":"Forgetting migration due to recipientForgetMigration command","attr":{"migrationId":{"uuid":{"$uuid":"172ff6b5-0611-4a63-bed5-0a887c5a9095"}},"tenantId":"testTenantId-multiWrites"}}
 
[js_test:tenant_migration_multi_writes] 2021-01-27T21:43:42.356+0000 d20022| {"t":{"$date":"2021-01-27T21:43:42.356+00:00"},"s":"I",  "c":"REPL",     "id":4881401, "ctx":"TenantMigrationRecipientService-0","msg":"Migration marked to be garbage collectable due to recipientForgetMigration command","attr":{"migrationId":{"uuid":{"$uuid":"172ff6b5-0611-4a63-bed5-0a887c5a9095"}},"tenantId":"testTenantId-multiWrites","expireAt":{"$date":"2021-01-29T21:43:42.353Z"}}}

On the next loop the donor tries again but the recipient fails with:

[js_test:tenant_migration_multi_writes] 2021-01-27T21:43:42.898+0000 d20022| {"t":{"$date":"2021-01-27T21:43:42.898+00:00"},"s":"D2", "c":"TENANT_M", "id":5271500, "ctx":"TenantMigrationRecipientService-1","msg":"listExistingDatabases entry","attr":{"migrationId":{"uuid":{"$uuid":"9d949264-9b87-4f78-bf11-7952dde38a4f"}},"tenantId":"testTenantId-multiWrites","db":{"name":"testTenantId-multiWrites_0"}}}
[js_test:tenant_migration_multi_writes] 2021-01-27T21:43:42.898+0000 d20022| {"t":{"$date":"2021-01-27T21:43:42.898+00:00"},"s":"I",  "c":"TENANT_M", "id":21077,   "ctx":"TenantMigrationRecipientService-1","msg":"Non-retryable error occurred during cloner stage","attr":{"cloner":"TenantAllDatabaseCloner","stage":"listExistingDatabases","error":{"code":48,"codeName":"NamespaceExists","errmsg":"Tenant 'testTenantId-multiWrites': databases already exist prior to data sync"}}}
[js_test:tenant_migration_multi_writes] 2021-01-27T21:43:42.898+0000 d20022| {"t":{"$date":"2021-01-27T21:43:42.898+00:00"},"s":"I",  "c":"REPL",     "id":4878501, "ctx":"TenantMigrationRecipientService-1","msg":"Tenant migration recipient instance: Data sync completed.","attr":{"tenantId":"testTenantId-multiWrites","migrationId":{"uuid":{"$uuid":"9d949264-9b87-4f78-bf11-7952dde38a4f"}},"error":{"code":48,"codeName":"NamespaceExists","errmsg":"Tenant 'testTenantId-multiWrites': databases already exist prior to data sync"}}}

the donor receives the error, using my custom log:

[js_test:tenant_migration_multi_writes] 2021-01-27T21:43:42.898+0000 d20020| !!!! migration p4, error NamespaceExists Tenant migration recipient command failed :: caused by :: Tenant 'testTenantId-multiWrites': databases already exist prior to data sync, threads !!!!! idle threads 3 for pool TenantMigrationDonorServiceThreadPool

Unfortunately this error is not properly logged by default at donor, which I will fix in my CR.

 



 Comments   
Comment by Lingzhi Deng [ 01/Feb/21 ]

The recipient is not expected to clean up tenant’s data on failing a migration. Instead, we rely on “Atlas tells Recipient Proxy to delete the tenant’s databases from the recipient”.

Generated at Thu Feb 08 05:32:38 UTC 2024 using Jira 9.7.1#970001-sha1:2222b88b221c4928ef0de3161136cc90c8356a66.