Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-47948

Replica set reconfig quorum check should compare configs based on version and term

    • Fully Compatible
    • ALL
    • v4.4
    • Hide
      // Reproduction script: drives a 3-node replica set into a state where the stale primary
      // (node 0) holds a config with the same version as the new primary's (node 2) config but a
      // lower term, then runs a reconfig on node 2 whose quorum check needs node 0.
      load("jstests/libs/fail_point_util.js");
      
      // Test quorum check when the target node has same config version but lower config term.
      
      let rst = new ReplSetTest({nodes: 3, useBridge: true});
      rst.startSet();
      rst.initiate();
      
      let primary = rst.getPrimary();
      // NOTE(review): `coll` is not referenced anywhere else in this script.
      let coll = primary.getDB("test")["test"];
      let config = rst.getReplSetConfigFromNode(0);
      // Remember the starting version; both reconfigs below target origVersion + 1.
      let origVersion = config.version;
      
      // Isolate the current primary (node 0) and block reconfigs from completing on node 1.
      jsTestLog("Isolating the old primary.");
      primary.disconnect([rst.nodes[2]]);
      let fp1 = configureFailPoint(rst.nodes[1], "blockHeartbeatReconfigFinish");
      
      // Do a no-op reconfig on the stale primary to advance the config version. We expect this will
      // time out while waiting for the config to commit since the primary is isolated and node 1 is
      // not accepting new configs. Node 1 needs to be connected to node 0 for the quorum check to
      // pass, though.
      config.version = origVersion + 1;
      jsTestLog("Reconfig on old primary to advance config version to: " + config.version);
      let res = primary.adminCommand({replSetReconfig: config, maxTimeMS: 2000});
      assert.commandFailedWithCode(res, ErrorCodes.MaxTimeMSExpired);
      
      // Now disconnect the primary from node 1 so its config does not propagate to it. Node 1's
      // ability to receive new configs is re-enabled further down (fp1.off()), after node 2 has
      // stepped up.
      primary.disconnect([rst.nodes[1]]);
      
      // Block node 0 from accepting new configs via heartbeat.
      let fp2 = configureFailPoint(rst.nodes[0], "blockHeartbeatReconfigFinish");
      
      // Step up node 2 so it writes a new config with an incremented term but the same version.
      jsTestLog("Stepping up node 2.");
      assert.soonNoExcept(() => {
          assert.commandWorked(rst.nodes[2].adminCommand({replSetStepUp: 1}));
          return true;
      });
      // Wait until primary is writable.
      assert.soonNoExcept(() => {return rst.nodes[2].adminCommand({isMaster: 1}).ismaster});
      
      // Let node 1 now install the original new config from the old primary.
      fp1.off();
      
      // Wait until the config that was written on step up is committed.
      // NOTE(review): isConfigCommitted is not defined in this script and the only load() above is
      // fail_point_util.js; presumably it comes from jstests/replsets/rslib.js -- confirm it is in
      // scope when running this.
      assert.soon(() => isConfigCommitted(rst.nodes[2]));
      
      // Disconnect node 1 from node 2 so it cannot satisfy quorum check.
      rst.nodes[1].disconnect(rst.nodes[2]);
      
      // Connect the current primary to the stale primary so it needs it for the quorum check.
      jsTestLog("Reconnecting old primary to new primary");
      primary.reconnect([rst.nodes[2]]);
      
      // Reconfig on the new primary using the same version number the stale primary already used.
      jsTestLog("Doing a reconfig on node 2");
      config = rst.getReplSetConfigFromNode(2);
      config.version = origVersion + 1;
      
      // The reconfig is asserted to succeed. Afterwards, restore connectivity from node 0 to the
      // other nodes and release node 0's failpoint before shutting the set down.
      assert.commandWorked(rst.nodes[2].adminCommand({replSetReconfig: config}));
      primary.reconnect([rst.nodes[1], rst.nodes[2]]);
      fp2.off();
      rst.stopSet();
      
      Show
      // Reproduction script: drives a 3-node replica set into a state where the stale primary
      // (node 0) holds a config with the same version as the new primary's (node 2) config but a
      // lower term, then runs a reconfig on node 2 whose quorum check needs node 0.
      //
      // Line breaks are restored here: with the whole script on one physical line, the first `//`
      // comment swallowed every statement that followed it.
      load("jstests/libs/fail_point_util.js");

      // Test quorum check when the target node has same config version but lower config term.

      let rst = new ReplSetTest({nodes: 3, useBridge: true});
      rst.startSet();
      rst.initiate();

      let primary = rst.getPrimary();
      // NOTE(review): `coll` is not referenced anywhere else in this script.
      let coll = primary.getDB("test")["test"];
      let config = rst.getReplSetConfigFromNode(0);
      // Remember the starting version; both reconfigs below target origVersion + 1.
      let origVersion = config.version;

      // Isolate the current primary (node 0) and block reconfigs from completing on node 1.
      jsTestLog("Isolating the old primary.");
      primary.disconnect([rst.nodes[2]]);
      let fp1 = configureFailPoint(rst.nodes[1], "blockHeartbeatReconfigFinish");

      // Do a no-op reconfig on the stale primary to advance the config version. We expect this will
      // time out while waiting for the config to commit since the primary is isolated and node 1 is
      // not accepting new configs. Node 1 needs to be connected to node 0 for the quorum check to
      // pass, though.
      config.version = origVersion + 1;
      jsTestLog("Reconfig on old primary to advance config version to: " + config.version);
      let res = primary.adminCommand({replSetReconfig: config, maxTimeMS: 2000});
      assert.commandFailedWithCode(res, ErrorCodes.MaxTimeMSExpired);

      // Now disconnect the primary from node 1 so its config does not propagate to it. Node 1's
      // ability to receive new configs is re-enabled further down (fp1.off()), after node 2 has
      // stepped up.
      primary.disconnect([rst.nodes[1]]);

      // Block node 0 from accepting new configs via heartbeat.
      let fp2 = configureFailPoint(rst.nodes[0], "blockHeartbeatReconfigFinish");

      // Step up node 2 so it writes a new config with an incremented term but the same version.
      jsTestLog("Stepping up node 2.");
      assert.soonNoExcept(() => {
          assert.commandWorked(rst.nodes[2].adminCommand({replSetStepUp: 1}));
          return true;
      });
      // Wait until primary is writable.
      assert.soonNoExcept(() => {return rst.nodes[2].adminCommand({isMaster: 1}).ismaster});

      // Let node 1 now install the original new config from the old primary.
      fp1.off();

      // Wait until the config that was written on step up is committed.
      // NOTE(review): isConfigCommitted is not defined in this script and the only load() above is
      // fail_point_util.js; presumably it comes from jstests/replsets/rslib.js -- confirm it is in
      // scope when running this.
      assert.soon(() => isConfigCommitted(rst.nodes[2]));

      // Disconnect node 1 from node 2 so it cannot satisfy quorum check.
      rst.nodes[1].disconnect(rst.nodes[2]);

      // Connect the current primary to the stale primary so it needs it for the quorum check.
      jsTestLog("Reconnecting old primary to new primary");
      primary.reconnect([rst.nodes[2]]);

      // Reconfig on the new primary using the same version number the stale primary already used.
      jsTestLog("Doing a reconfig on node 2");
      config = rst.getReplSetConfigFromNode(2);
      config.version = origVersion + 1;

      // The reconfig is asserted to succeed. Afterwards, restore connectivity from node 0 to the
      // other nodes and release node 0's failpoint before shutting the set down.
      assert.commandWorked(rst.nodes[2].adminCommand({replSetReconfig: config}));
      primary.reconnect([rst.nodes[1], rst.nodes[2]]);
      fp2.off();
      rst.stopSet();
    • Repl 2020-05-18
    • 32

      Currently, when executing the quorum check for reconfig, we compare only config versions to determine whether the sender's config is newer than the receiver's config. This can cause the quorum check to return a spurious error when the sender's config is actually newer based on term, even though it has the same version. We should update this comparison to consider both config version and term.

            Assignee:
            siyuan.zhou@mongodb.com Siyuan Zhou
            Reporter:
            william.schultz@mongodb.com William Schultz (Inactive)
            Votes:
            0 Vote for this issue
            Watchers:
            3 Start watching this issue

              Created:
              Updated:
              Resolved: