Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-34962

addShard can overwrite primary shard in config.databases of an existing cluster database

    • Catalog and Routing
    • ALL
    • Hide

      Apply this diff on base commit e86788afb38a2418f24af8e25c0cca178cde8689

      diff --git a/src/mongo/db/s/config/sharding_catalog_manager_shard_operations.cpp b/src/mongo/db/s/config/sharding_catalog_manager_shard_operations.cpp
      index 310e22c..9e55ec3 100644
      --- a/src/mongo/db/s/config/sharding_catalog_manager_shard_operations.cpp
      +++ b/src/mongo/db/s/config/sharding_catalog_manager_shard_operations.cpp
      @@ -78,6 +78,8 @@
       namespace mongo {
       namespace {+MONGO_FP_DECLARE(hangAddShardAfterCheckingDatabases);
      +
       using CallbackHandle = executor::TaskExecutor::CallbackHandle;
       using CallbackArgs = executor::TaskExecutor::CallbackArgs;
       using RemoteCommandCallbackArgs = executor::TaskExecutor::RemoteCommandCallbackArgs;
      @@ -620,6 +622,15 @@ StatusWith<std::string> ShardingCatalogManager::addShard(
               }
           }
      +    bool firstLoop = true;
      +    while (MONGO_FAIL_POINT(hangAddShardAfterCheckingDatabases)) {
      +        if (firstLoop) {
      +            log() << "Hit hangAddShardAfterCheckingDatabase failpoint";
      +            firstLoop = false;
      +        }
      +        sleepmillis(100);
      +    }
      +
           // Check that the shard candidate does not have a local config.system.sessions collection
           auto res = _dropSessionsCollection(opCtx, targeter);
       

      Then run this repro script:

      (function() {
          "use strict";    load("jstests/libs/check_log.js");    const st = new ShardingTest({ shards: 1, verbose: 2 });    jsTest.log("about to create database 'test' on 'otherShard'");
          let otherShard = MongoRunner.runMongod({'shardsvr': ''});
          assert.commandWorked(otherShard.getDB("test").getCollection("foo").insert({ x: 1 }));    const configPrimary = st.configRS.getPrimary();    jsTest.log("about to start addShard");
          assert.commandWorked(configPrimary.adminCommand({ configureFailPoint: "hangAddShardAfterCheckingDatabases", mode: "alwaysOn" }));
          let addShardHandle = startParallelShell('assert.commandWorked(db.adminCommand({ addShard: "' + otherShard.host + '" }));', st.s.port);    jsTest.log("wait for failpoint in addShard to be hit");
          checkLog.contains(configPrimary, "Hit hangAddShardAfterCheckingDatabase failpoint");    jsTest.log("about to create database 'test' on cluster");
          assert.commandWorked(st.s.adminCommand({ enableSharding: "test" }));
          assert.eq(st.shard0.shardName, st.s.getDB("config").getCollection("databases").findOne({ _id: "test" }).primary);    jsTest.log("about to allow addShard to complete");
          assert.commandWorked(st.configRS.getPrimary().adminCommand({ configureFailPoint: "hangAddShardAfterCheckingDatabases", mode: "off" }));
          addShardHandle();    assert.eq(st.shard0.shardName, st.s.getDB("config").getCollection("databases").findOne({ _id: "test" }).primary);  // expect fail    MongoRunner.stopMongod(otherShard);
          st.stop();})(); 
      Show
      Apply this diff on base commit e86788afb38a2418f24af8e25c0cca178cde8689 diff --git a/src/mongo/db/s/config/sharding_catalog_manager_shard_operations.cpp b/src/mongo/db/s/config/sharding_catalog_manager_shard_operations.cpp index 310e22c..9e55ec3 100644 --- a/src/mongo/db/s/config/sharding_catalog_manager_shard_operations.cpp +++ b/src/mongo/db/s/config/sharding_catalog_manager_shard_operations.cpp @@ -78,6 +78,8 @@ namespace mongo { namespace {+MONGO_FP_DECLARE(hangAddShardAfterCheckingDatabases); + using CallbackHandle = executor::TaskExecutor::CallbackHandle; using CallbackArgs = executor::TaskExecutor::CallbackArgs; using RemoteCommandCallbackArgs = executor::TaskExecutor::RemoteCommandCallbackArgs; @@ -620,6 +622,15 @@ StatusWith<std::string> ShardingCatalogManager::addShard( } } + bool firstLoop = true; + while (MONGO_FAIL_POINT(hangAddShardAfterCheckingDatabases)) { + if (firstLoop) { + log() << "Hit hangAddShardAfterCheckingDatabase failpoint"; + firstLoop = false; + } + sleepmillis(100); + } + // Check that the shard candidate does not have a local config.system.sessions collection auto res = _dropSessionsCollection(opCtx, targeter); Then run this repro script: (function() { "use strict"; load("jstests/libs/check_log.js"); const st = new ShardingTest({ shards: 1, verbose: 2 }); jsTest.log("about to create database 'test' on 'otherShard'"); let otherShard = MongoRunner.runMongod({'shardsvr': ''}); assert.commandWorked(otherShard.getDB("test").getCollection("foo").insert({ x: 1 })); const configPrimary = st.configRS.getPrimary(); jsTest.log("about to start addShard"); assert.commandWorked(configPrimary.adminCommand({ configureFailPoint: "hangAddShardAfterCheckingDatabases", mode: "alwaysOn" })); let addShardHandle = startParallelShell('assert.commandWorked(db.adminCommand({ addShard: "' + otherShard.host + '" }));', st.s.port); jsTest.log("wait for failpoint in addShard to be hit"); checkLog.contains(configPrimary, "Hit hangAddShardAfterCheckingDatabase failpoint"); jsTest.log("about to create database 'test' on cluster"); assert.commandWorked(st.s.adminCommand({ enableSharding: "test" })); assert.eq(st.shard0.shardName, st.s.getDB("config").getCollection("databases").findOne({ _id: "test" }).primary); jsTest.log("about to allow addShard to complete"); assert.commandWorked(st.configRS.getPrimary().adminCommand({ configureFailPoint: "hangAddShardAfterCheckingDatabases", mode: "off" })); addShardHandle(); assert.eq(st.shard0.shardName, st.s.getDB("config").getCollection("databases").findOne({ _id: "test" }).primary); // expect fail MongoRunner.stopMongod(otherShard); st.stop();})();

      addShard attempts to absorb any databases on a shard being added by writing them to config.databases.

      It chooses the shard being added as their primary shard, then uses ShardingCatalogClient::updateDatabase() to write the database entries. Since updateDatabases() uses  an update rather than an insert, it can overwrite the fields of an existing document in config.databases.

      Either addShard should use an insert, or a database distlock should be held by addShard across checking if any databases conflict with the ones being added and actually adding them.

            Assignee:
            backlog-server-catalog-and-routing [DO NOT USE] Backlog - Catalog and Routing
            Reporter:
            esha.maharishi@mongodb.com Esha Maharishi (Inactive)
            Votes:
            0 Vote for this issue
            Watchers:
            6 Start watching this issue

              Created:
              Updated: