In catchup.js, node 20030 is running last-lts (4.4), while nodes 20031 and 20032 are running latest (5.0). Initially, 20030 is the primary, but at the start of case 2, 20031 steps up. It appears that when 20031 steps up, it creates the tenant migration collections and builds their indexes:
{noformat}
[js_test:catchup] [jsTest] ReplSetTest stepUp: Finished stepping up ip-10-122-3-187.ec2.internal:20021
[js_test:catchup] d20031| 2021-08-04T17:20:29.427+00:00 I STORAGE 20320 [TenantMigrationRecipientService-0] "createCollection","attr":{"namespace":"config.tenantMigrationRecipients","uuidDisposition":"generated","uuid":{"uuid":{"$uuid":"d77f67d4-a498-4582-8ad9-e38b32597898"}},"options":{}}
[js_test:catchup] d20031| 2021-08-04T17:20:29.427+00:00 I STORAGE 20320 [TenantMigrationDonorService-0] "createCollection","attr":{"namespace":"config.tenantMigrationDonors","uuidDisposition":"generated","uuid":{"uuid":{"$uuid":"8355fd88-fb91-486e-b2d1-71fb4671b237"}},"options":{}}
[js_test:catchup] d20031| 2021-08-04T17:20:29.953+00:00 I INDEX 20345 [TenantMigrationDonorService-0] "Index build: done building","attr":{"buildUUID":null,"namespace":"config.tenantMigrationDonors","index":"_id_","commitTimestamp":{"$timestamp":{"t":1628097629,"i":4}}}
[js_test:catchup] d20031| 2021-08-04T17:20:29.953+00:00 I INDEX 20345 [TenantMigrationDonorService-0] "Index build: done building","attr":{"buildUUID":null,"namespace":"config.tenantMigrationDonors","index":"TenantMigrationDonorTTLIndex","commitTimestamp":{"$timestamp":{"t":1628097629,"i":4}}}
{noformat}
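For reference, the topology looks roughly like the sketch below. This is only an illustrative mixed-version ReplSetTest, not the actual catchup.js code; the node comments map to the ports in this failure.

{code:javascript}
// Illustrative sketch of a comparable mixed-version topology (assumption: the
// standard ReplSetTest/binVersion options used by multiversion suites).
const rst = new ReplSetTest({
    nodes: [
        {binVersion: "last-lts"},  // analogous to 20030 (4.4)
        {binVersion: "latest"},    // analogous to 20031 (5.0)
        {binVersion: "latest"},    // analogous to 20032 (5.0)
    ],
});
rst.startSet();
rst.initiate();

// Stepping up a "latest" node; on 5.0 binaries this is when the tenant
// migration primary-only services create their config.* collections and
// build the TTL indexes seen in the log above.
const newPrimary = rst.getSecondaries()[0];
rst.stepUp(newPrimary);
{code}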
Afterwards, we try to stop replication on the two secondaries (20030 and 20032) in stopReplicationAndEnforceNewPrimaryToCatchUp. In the window between 20030 stopping replication and 20032 stopping replication, 20032 is still able to apply the oplog entries for the creation of the tenant migration collections/indexes:
{noformat}
[js_test:catchup] d20030| 2021-08-04T17:20:29.960+00:00 W CONTROL 23829 [conn2] "Set failpoint","attr":{"failPointName":"stopReplProducer","failPoint":{"mode":1,"data":{},"timesEntered":0}}
[js_test:catchup] d20030| 2021-08-04T17:20:31.083+00:00 I REPL 21079 [BackgroundSync] "bgsync - stopReplProducer fail point enabled. Blocking until fail point is disabled."
[js_test:catchup] d20032| 2021-08-04T17:20:31.123+00:00 I REPL 51801 [ReplWriterWorker-0] "Applied op","attr":{"command":{"op":"c","ns":"config.$cmd","ui":{"$uuid":"d77f67d4-a498-4582-8ad9-e38b32597898"},"o":{"createIndexes":"tenantMigrationRecipients","v":2,"key":{"expireAt":1},"name":"TenantMigrationRecipientTTLIndex","expireAfterSeconds":0},"ts":{"$timestamp":{"t":1628097629,"i":3}},"t":2,"v":2,"wall":{"$date":"2021-08-04T17:20:29.694Z"}},"durationMillis":19}
[js_test:catchup] d20032| 2021-08-04T17:20:31.143+00:00 I REPL 51801 [ReplWriterWorker-0] "Applied op","attr":{"command":{"op":"c","ns":"config.$cmd","ui":{"$uuid":"8355fd88-fb91-486e-b2d1-71fb4671b237"},"o":{"createIndexes":"tenantMigrationDonors","v":2,"key":{"expireAt":1},"name":"TenantMigrationDonorTTLIndex","expireAfterSeconds":0},"ts":{"$timestamp":{"t":1628097629,"i":4}},"t":2,"v":2,"wall":{"$date":"2021-08-04T17:20:29.787Z"}},"durationMillis":17}
[js_test:catchup] d20032| 2021-08-04T17:20:31.159+00:00 I REPL 51801 [ReplWriterWorker-0] "Applied op","attr":{"command":{"op":"c","ns":"config.$cmd","ui":{"$uuid":"22df4715-b918-4c3c-a651-30652da4e5c5"},"o":{"create":"external_validation_keys","idIndex":{"v":2,"key":{"_id":1},"name":"_id_"}},"ts":{"$timestamp":{"t":1628097629,"i":5}},"t":2,"v":2,"wall":{"$date":"2021-08-04T17:20:30.060Z"}},"durationMillis":14}
[js_test:catchup] d20030| 2021-08-04T17:20:31.167+00:00 I COMMAND 51803 [conn2] "Slow query","attr":{"type":"command","ns":"admin.$cmd","appName":"MongoDB Shell","command":{"waitForFailPoint":"stopReplProducer","timesEntered":1,"maxTimeMS":300000,"lsid":{"id":{"$uuid":"7a82b3a9-876c-4445-b220-2541ddcb3878"}},"$clusterTime":{"clusterTime":{"$timestamp":{"t":1628097629,"i":4}},"signature":{"hash":{"$binary":{"base64":"AAAAAAAAAAAAAAAAAAAAAAAAAAA=","subType":"0"}},"keyId":0}},"$readPreference":{"mode":"secondaryPreferred"},"$db":"admin"},"numYields":0,"reslen":163,"locks":{},"protocol":"op_msg","durationMillis":1205}
[js_test:catchup] d20032| 2021-08-04T17:20:31.169+00:00 W CONTROL 23829 [conn2] "Set failpoint","attr":{"failPointName":"stopReplProducer","failPoint":{"mode":1,"data":{},"timesEntered":0}}
[js_test:catchup] d20032| 2021-08-04T17:20:31.179+00:00 I REPL 51801 [ReplWriterWorker-0] "Applied op","attr":{"command":{"op":"c","ns":"config.$cmd","ui":{"$uuid":"22df4715-b918-4c3c-a651-30652da4e5c5"},"o":{"createIndexes":"external_validation_keys","v":2,"key":{"ttlExpiresAt":1},"name":"ExternalKeysTTLIndex","expireAfterSeconds":0},"ts":{"$timestamp":{"t":1628097630,"i":1}},"t":2,"v":2,"wall":{"$date":"2021-08-04T17:20:30.060Z"}},"durationMillis":17}
[js_test:catchup] d20032| 2021-08-04T17:20:31.181+00:00 I REPL 21079 [BackgroundSync] "bgsync - stopReplProducer fail point enabled. Blocking until fail point is disabled."
[js_test:catchup] d20032| 2021-08-04T17:20:31.271+00:00 I COMMAND 51803 [conn2] "Slow query","attr":{"type":"command","ns":"admin.$cmd","appName":"MongoDB Shell","command":{"waitForFailPoint":"stopReplProducer","timesEntered":1,"maxTimeMS":300000,"lsid":{"id":{"$uuid":"0109b3d6-2f06-44cc-890b-019b64b715c0"}},"$clusterTime":{"clusterTime":{"$timestamp":{"t":1628097630,"i":1}},"signature":{"hash":{"$binary":{"base64":"AAAAAAAAAAAAAAAAAAAAAAAAAAA=","subType":"0"}},"keyId":0}},"$readPreference":{"mode":"secondaryPreferred"},"$db":"admin"},"numYields":0,"reslen":163,"locks":{},"remote":"10.122.3.187:58868","protocol":"op_msg","durationMillis":100}
{noformat}
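In other words, this is an ordering problem. Below is a minimal sketch of the racy sequence, assuming the helper freezes each secondary in turn via stopServerReplication() from jstests/libs/write_concern_util.js, which is what sets the stopReplProducer failpoint seen in the log:

{code:javascript}
load("jstests/libs/write_concern_util.js");  // for stopServerReplication()

// The two old secondaries; in this failure they correspond to 20030 and 20032.
const oldSecondaries = rst.getSecondaries();

// Replication is frozen on the first secondary (20030) here...
stopServerReplication(oldSecondaries[0]);

// ...but until this second call takes effect, the other secondary (20032) can
// keep applying the createCollection/createIndexes oplog entries generated by
// the tenant migration services on the new primary.
stopServerReplication(oldSecondaries[1]);

// Result: 20030's last applied optime is now strictly behind 20032's.
{code}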
Because of this, 20030 ends up behind both 20031 and 20032, so it is unable to step up and the test fails.
We should call awaitReplication before stopping replication in stopReplicationAndEnforceNewPrimaryToCatchUp, so that the tenant migration ops are fully replicated to all nodes before replication is stopped; see the sketch below.
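A minimal sketch of the proposed ordering (illustrative only; the real change would live inside stopReplicationAndEnforceNewPrimaryToCatchUp):

{code:javascript}
// Wait for the tenant migration collection/index ops generated by the step-up
// to reach every node, then freeze replication on the secondaries.
rst.awaitReplication();
rst.getSecondaries().forEach(node => stopServerReplication(node));
{code}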
Related to: SERVER-81591 Ensure that the secondary being stepped up in stopReplicationAndEnforceNewPrimaryToCatchUp is not stale (Closed)