Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-59796

Wait for replication before stopping replication on secondaries in catchup.js

    • Type: Icon: Bug Bug
    • Resolution: Fixed
    • Priority: Icon: Major - P3 Major - P3
    • 5.0.4, 5.1.0-rc0
    • Affects Version/s: None
    • Component/s: None
    • None
    • Fully Compatible
    • ALL
    • Repl 2021-09-06, Repl 2021-09-20, Repl 2021-10-04
    • 19

      In catchup.js, we have node 20030 which is running last-lts (4.4), while node 20031 and node 20032 are running latest (5.0). Initially, 20030 is primary, but at the start of case 2, 20031 steps up. It seems like when it steps up, it creates collections for tenant migrations. We also create indexes.

      [js_test:catchup] [jsTest] ReplSetTest stepUp: Finished stepping up ip-10-122-3-187.ec2.internal:20021
      
      [js_test:catchup] d20031| 2021-08-04T17:20:29.427+00:00 I  STORAGE  20320   [TenantMigrationRecipientService-0] "createCollection","attr":{"namespace":"config.tenantMigrationRecipients","uuidDisposition":"generated","uuid":{"uuid":{"$uuid":"d77f67d4-a498-4582-8ad9-e38b32597898"}},"options":{}}[js_test:catchup] d20031| 2021-08-04T17:20:29.427+00:00 I  STORAGE  20320   [TenantMigrationDonorService-0] "createCollection","attr":{"namespace":"config.tenantMigrationDonors","uuidDisposition":"generated","uuid":{"uuid":{"$uuid":"8355fd88-fb91-486e-b2d1-71fb4671b237"}},"options":{}} 
      
      [js_test:catchup] d20031| 2021-08-04T17:20:29.953+00:00 I  INDEX    20345   [TenantMigrationDonorService-0] "Index build: done building","attr":{"buildUUID":null,"namespace":"config.tenantMigrationDonors","index":"_id_","commitTimestamp":{"$timestamp":{"t":1628097629,"i":4}}}
      [js_test:catchup] d20031| 2021-08-04T17:20:29.953+00:00 I  INDEX    20345   [TenantMigrationDonorService-0] "Index build: done building","attr":{"buildUUID":null,"namespace":"config.tenantMigrationDonors","index":"TenantMigrationDonorTTLIndex","commitTimestamp":{"$timestamp":{"t":1628097629,"i":4}}}
      
      

      Afterwards, we try to stop replication on the two secondaries (20030 and 20032) in stopReplicationAndEnforceNewPrimaryToStepUp. In between when 20030 stops replication and when 20032 stops replication, 20032 is able to apply ops for the creation of the tenant migration collections/indexes.

      [js_test:catchup] d20030| 2021-08-04T17:20:29.960+00:00 W  CONTROL  23829   [conn2] "Set failpoint","attr":{"failPointName":"stopReplProducer","failPoint":{"mode":1,"data":{},"timesEntered":0}}
      
      [js_test:catchup] d20030| 2021-08-04T17:20:31.083+00:00 I  REPL     21079   [BackgroundSync] "bgsync - stopReplProducer fail point enabled. Blocking until fail point is disabled."
      
      [js_test:catchup] d20032| 2021-08-04T17:20:31.123+00:00 I  REPL     51801   [ReplWriterWorker-0] "Applied op","attr":{"command":{"op":"c","ns":"config.$cmd","ui":{"$uuid":"d77f67d4-a498-4582-8ad9-e38b32597898"},"o":{"createIndexes":"tenantMigrationRecipients","v":2,"key":{"expireAt":1},"name":"TenantMigrationRecipientTTLIndex","expireAfterSeconds":0},"ts":{"$timestamp":{"t":1628097629,"i":3}},"t":2,"v":2,"wall":{"$date":"2021-08-04T17:20:29.694Z"}},"durationMillis":19}
      [js_test:catchup] d20032| 2021-08-04T17:20:31.143+00:00 I  REPL     51801   [ReplWriterWorker-0] "Applied op","attr":{"command":{"op":"c","ns":"config.$cmd","ui":{"$uuid":"8355fd88-fb91-486e-b2d1-71fb4671b237"},"o":{"createIndexes":"tenantMigrationDonors","v":2,"key":{"expireAt":1},"name":"TenantMigrationDonorTTLIndex","expireAfterSeconds":0},"ts":{"$timestamp":{"t":1628097629,"i":4}},"t":2,"v":2,"wall":{"$date":"2021-08-04T17:20:29.787Z"}},"durationMillis":17}
      [js_test:catchup] d20032| 2021-08-04T17:20:31.159+00:00 I  REPL     51801   [ReplWriterWorker-0] "Applied op","attr":{"command":{"op":"c","ns":"config.$cmd","ui":{"$uuid":"22df4715-b918-4c3c-a651-30652da4e5c5"},"o":{"create":"external_validation_keys","idIndex":{"v":2,"key":{"_id":1},"name":"_id_"}},"ts":{"$timestamp":{"t":1628097629,"i":5}},"t":2,"v":2,"wall":{"$date":"2021-08-04T17:20:30.060Z"}},"durationMillis":14}
      
      
      
      [js_test:catchup] d20030| 2021-08-04T17:20:31.167+00:00 I  COMMAND  51803   [conn2] "Slow query","attr":{"type":"command","ns":"admin.$cmd","appName":"MongoDB Shell","command":{"waitForFailPoint":"stopReplProducer","timesEntered":1,"maxTimeMS":300000,"lsid":{"id":{"$uuid":"7a82b3a9-876c-4445-b220-2541ddcb3878"}},"$clusterTime":{"clusterTime":{"$timestamp":{"t":1628097629,"i":4}},"signature":{"hash":{"$binary":{"base64":"AAAAAAAAAAAAAAAAAAAAAAAAAAA=","subType":"0"}},"keyId":0}},"$readPreference":{"mode":"secondaryPreferred"},"$db":"admin"},"numYields":0,"reslen":163,"locks":{},"protocol":"op_msg","durationMillis":1205}
      [js_test:catchup] d20032| 2021-08-04T17:20:31.169+00:00 W  CONTROL  23829   [conn2] "Set failpoint","attr":{"failPointName":"stopReplProducer","failPoint":{"mode":1,"data":{},"timesEntered":0}}
      
      [js_test:catchup] d20032| 2021-08-04T17:20:31.179+00:00 I  REPL     51801   [ReplWriterWorker-0] "Applied op","attr":{"command":{"op":"c","ns":"config.$cmd","ui":{"$uuid":"22df4715-b918-4c3c-a651-30652da4e5c5"},"o":{"createIndexes":"external_validation_keys","v":2,"key":{"ttlExpiresAt":1},"name":"ExternalKeysTTLIndex","expireAfterSeconds":0},"ts":{"$timestamp":{"t":1628097630,"i":1}},"t":2,"v":2,"wall":{"$date":"2021-08-04T17:20:30.060Z"}},"durationMillis":17}
      
      [js_test:catchup] d20032| 2021-08-04T17:20:31.181+00:00 I  REPL     21079   [BackgroundSync] "bgsync - stopReplProducer fail point enabled. Blocking until fail point is disabled."
      [js_test:catchup] d20032| 2021-08-04T17:20:31.271+00:00 I  COMMAND  51803   [conn2] "Slow query","attr":{"type":"command","ns":"admin.$cmd","appName":"MongoDB Shell","command":{"waitForFailPoint":"stopReplProducer","timesEntered":1,"maxTimeMS":300000,"lsid":{"id":{"$uuid":"0109b3d6-2f06-44cc-890b-019b64b715c0"}},"$clusterTime":{"clusterTime":{"$timestamp":{"t":1628097630,"i":1}},"signature":{"hash":{"$binary":{"base64":"AAAAAAAAAAAAAAAAAAAAAAAAAAA=","subType":"0"}},"keyId":0}},"$readPreference":{"mode":"secondaryPreferred"},"$db":"admin"},"numYields":0,"reslen":163,"locks":{},"remote":"10.122.3.187:58868","protocol":"op_msg","durationMillis":100}
      

      Because of this, afterwards, 20030 is behind both 20031 and 20032, so it is unable to step up, and the test fails.

      We should awaitReplication before stopping replication in stopReplicationAndEnforceNewPrimaryToCatchUp so that the tenant migration ops are fully replicated before stopping replication.

            Assignee:
            huayu.ouyang@mongodb.com Huayu Ouyang
            Reporter:
            huayu.ouyang@mongodb.com Huayu Ouyang
            Votes:
            0 Vote for this issue
            Watchers:
            2 Start watching this issue

              Created:
              Updated:
              Resolved: