Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-59796

Wait for replication before stopping replication on secondaries in catchup.js

    XMLWordPrintable

    Details

    • Type: Bug
    • Status: Closed
    • Priority: Major - P3
    • Resolution: Fixed
    • Affects Version/s: None
    • Fix Version/s: 5.0.4, 5.1.0-rc0
    • Component/s: None
    • Labels:
      None
    • Backwards Compatibility:
      Fully Compatible
    • Operating System:
      ALL
    • Sprint:
      Repl 2021-09-06, Repl 2021-09-20, Repl 2021-10-04
    • Linked BF Score:
      19

      Description

      In catchup.js, we have node 20030 which is running last-lts (4.4), while node 20031 and node 20032 are running latest (5.0). Initially, 20030 is primary, but at the start of case 2, 20031 steps up. It seems like when it steps up, it creates collections for tenant migrations. We also create indexes.

      [js_test:catchup] [jsTest] ReplSetTest stepUp: Finished stepping up ip-10-122-3-187.ec2.internal:20021
       
      [js_test:catchup] d20031| 2021-08-04T17:20:29.427+00:00 I  STORAGE  20320   [TenantMigrationRecipientService-0] "createCollection","attr":{"namespace":"config.tenantMigrationRecipients","uuidDisposition":"generated","uuid":{"uuid":{"$uuid":"d77f67d4-a498-4582-8ad9-e38b32597898"}},"options":{}}[js_test:catchup] d20031| 2021-08-04T17:20:29.427+00:00 I  STORAGE  20320   [TenantMigrationDonorService-0] "createCollection","attr":{"namespace":"config.tenantMigrationDonors","uuidDisposition":"generated","uuid":{"uuid":{"$uuid":"8355fd88-fb91-486e-b2d1-71fb4671b237"}},"options":{}} 
       
      [js_test:catchup] d20031| 2021-08-04T17:20:29.953+00:00 I  INDEX    20345   [TenantMigrationDonorService-0] "Index build: done building","attr":{"buildUUID":null,"namespace":"config.tenantMigrationDonors","index":"_id_","commitTimestamp":{"$timestamp":{"t":1628097629,"i":4}}}
      [js_test:catchup] d20031| 2021-08-04T17:20:29.953+00:00 I  INDEX    20345   [TenantMigrationDonorService-0] "Index build: done building","attr":{"buildUUID":null,"namespace":"config.tenantMigrationDonors","index":"TenantMigrationDonorTTLIndex","commitTimestamp":{"$timestamp":{"t":1628097629,"i":4}}}
      
      

      Afterwards, we try to stop replication on the two secondaries (20030 and 20032) in stopReplicationAndEnforceNewPrimaryToStepUp. In between when 20030 stops replication and when 20032 stops replication, 20032 is able to apply ops for the creation of the tenant migration collections/indexes.

      [js_test:catchup] d20030| 2021-08-04T17:20:29.960+00:00 W  CONTROL  23829   [conn2] "Set failpoint","attr":{"failPointName":"stopReplProducer","failPoint":{"mode":1,"data":{},"timesEntered":0}}
       
      [js_test:catchup] d20030| 2021-08-04T17:20:31.083+00:00 I  REPL     21079   [BackgroundSync] "bgsync - stopReplProducer fail point enabled. Blocking until fail point is disabled."
       
      [js_test:catchup] d20032| 2021-08-04T17:20:31.123+00:00 I  REPL     51801   [ReplWriterWorker-0] "Applied op","attr":{"command":{"op":"c","ns":"config.$cmd","ui":{"$uuid":"d77f67d4-a498-4582-8ad9-e38b32597898"},"o":{"createIndexes":"tenantMigrationRecipients","v":2,"key":{"expireAt":1},"name":"TenantMigrationRecipientTTLIndex","expireAfterSeconds":0},"ts":{"$timestamp":{"t":1628097629,"i":3}},"t":2,"v":2,"wall":{"$date":"2021-08-04T17:20:29.694Z"}},"durationMillis":19}
      [js_test:catchup] d20032| 2021-08-04T17:20:31.143+00:00 I  REPL     51801   [ReplWriterWorker-0] "Applied op","attr":{"command":{"op":"c","ns":"config.$cmd","ui":{"$uuid":"8355fd88-fb91-486e-b2d1-71fb4671b237"},"o":{"createIndexes":"tenantMigrationDonors","v":2,"key":{"expireAt":1},"name":"TenantMigrationDonorTTLIndex","expireAfterSeconds":0},"ts":{"$timestamp":{"t":1628097629,"i":4}},"t":2,"v":2,"wall":{"$date":"2021-08-04T17:20:29.787Z"}},"durationMillis":17}
      [js_test:catchup] d20032| 2021-08-04T17:20:31.159+00:00 I  REPL     51801   [ReplWriterWorker-0] "Applied op","attr":{"command":{"op":"c","ns":"config.$cmd","ui":{"$uuid":"22df4715-b918-4c3c-a651-30652da4e5c5"},"o":{"create":"external_validation_keys","idIndex":{"v":2,"key":{"_id":1},"name":"_id_"}},"ts":{"$timestamp":{"t":1628097629,"i":5}},"t":2,"v":2,"wall":{"$date":"2021-08-04T17:20:30.060Z"}},"durationMillis":14}
       
       
       
      [js_test:catchup] d20030| 2021-08-04T17:20:31.167+00:00 I  COMMAND  51803   [conn2] "Slow query","attr":{"type":"command","ns":"admin.$cmd","appName":"MongoDB Shell","command":{"waitForFailPoint":"stopReplProducer","timesEntered":1,"maxTimeMS":300000,"lsid":{"id":{"$uuid":"7a82b3a9-876c-4445-b220-2541ddcb3878"}},"$clusterTime":{"clusterTime":{"$timestamp":{"t":1628097629,"i":4}},"signature":{"hash":{"$binary":{"base64":"AAAAAAAAAAAAAAAAAAAAAAAAAAA=","subType":"0"}},"keyId":0}},"$readPreference":{"mode":"secondaryPreferred"},"$db":"admin"},"numYields":0,"reslen":163,"locks":{},"protocol":"op_msg","durationMillis":1205}
      [js_test:catchup] d20032| 2021-08-04T17:20:31.169+00:00 W  CONTROL  23829   [conn2] "Set failpoint","attr":{"failPointName":"stopReplProducer","failPoint":{"mode":1,"data":{},"timesEntered":0}}
       
      [js_test:catchup] d20032| 2021-08-04T17:20:31.179+00:00 I  REPL     51801   [ReplWriterWorker-0] "Applied op","attr":{"command":{"op":"c","ns":"config.$cmd","ui":{"$uuid":"22df4715-b918-4c3c-a651-30652da4e5c5"},"o":{"createIndexes":"external_validation_keys","v":2,"key":{"ttlExpiresAt":1},"name":"ExternalKeysTTLIndex","expireAfterSeconds":0},"ts":{"$timestamp":{"t":1628097630,"i":1}},"t":2,"v":2,"wall":{"$date":"2021-08-04T17:20:30.060Z"}},"durationMillis":17}
       
      [js_test:catchup] d20032| 2021-08-04T17:20:31.181+00:00 I  REPL     21079   [BackgroundSync] "bgsync - stopReplProducer fail point enabled. Blocking until fail point is disabled."
      [js_test:catchup] d20032| 2021-08-04T17:20:31.271+00:00 I  COMMAND  51803   [conn2] "Slow query","attr":{"type":"command","ns":"admin.$cmd","appName":"MongoDB Shell","command":{"waitForFailPoint":"stopReplProducer","timesEntered":1,"maxTimeMS":300000,"lsid":{"id":{"$uuid":"0109b3d6-2f06-44cc-890b-019b64b715c0"}},"$clusterTime":{"clusterTime":{"$timestamp":{"t":1628097630,"i":1}},"signature":{"hash":{"$binary":{"base64":"AAAAAAAAAAAAAAAAAAAAAAAAAAA=","subType":"0"}},"keyId":0}},"$readPreference":{"mode":"secondaryPreferred"},"$db":"admin"},"numYields":0,"reslen":163,"locks":{},"remote":"10.122.3.187:58868","protocol":"op_msg","durationMillis":100}
      

      Because of this, afterwards, 20030 is behind both 20031 and 20032, so it is unable to step up, and the test fails.

      We should awaitReplication before stopping replication in stopReplicationAndEnforceNewPrimaryToCatchUp so that the tenant migration ops are fully replicated before stopping replication.

        Attachments

          Activity

            People

            Assignee:
            huayu.ouyang Huayu Ouyang
            Reporter:
            huayu.ouyang Huayu Ouyang
            Participants:
            Votes:
            0 Vote for this issue
            Watchers:
            2 Start watching this issue

              Dates

              Created:
              Updated:
              Resolved: