[SERVER-8883] failed upgrade that causes config servers to go out of sync not detected by stale mongos causing config data to be mutable Created: 06/Mar/13  Updated: 10/Dec/14  Resolved: 07/Mar/14

Status: Closed
Project: Core Server
Component/s: Sharding
Affects Version/s: None
Fix Version/s: None

Type: Bug Priority: Minor - P4
Reporter: Michael O'Brien Assignee: Greg Studer
Resolution: Won't Fix Votes: 0
Labels: None
Remaining Estimate: Not Specified
Time Spent: Not Specified
Original Estimate: Not Specified

Issue Links:
Related
Operating System: ALL
Participants:

 Description   

overview of steps:

  • run a 2.0 cluster with two mongos instances (s0 and s1)
  • create a bunch of collections (sharded) with old metadata format
  • upgrade everything to 2.2.3
  • create some more collections, from s0
  • run the mongos 2.4 --upgrade with the failpoint which causes the upgrade to stop in the critical section, so the config servers become out of sync (upgradeV3ToV4CriticalResyncFail) with data: {unclean:true}
  • try to modify the metadata from the existing mongos instances, s0 and s1, by doing chunk splits.

// Key file location on the reporter's machine; it is not referenced again in the visible script.
var keyfilepath = "/Users/mike/keyfile.key";
setVerboseShell(false);

// Options passed to ShardingTest through `other`. Every process starts on
// binVersion 2.0.0 so that the collections created first are written by 2.0 binaries.
var options = {
    mongosOptions : {
        // One version entry per mongos (two mongos are requested below).
        binVersion : MongoRunner.versionIterator(["2.0.0", "2.0.0"]),
        logpath : "/tmp/stest"
    },
    configOptions : {
        binVersion : "2.0.0",
        logpath : "/tmp/stest-config"
    },
    shardOptions : {
        binVersion : "2.0.0",
        logpath : "/tmp/sttest-shards"
    },
    // Earlier variant, kept for reference:
    //rsOptions:{binVersion:allversions, logpath:"/tmp/sttest-rs"},
    rsOptions : {
        binVersion : MongoRunner.versionIterator(["2.0.0", "2.0.0"]),
        logpath : "/tmp/sttest-rs",
        nopreallocj : ""
    },
    // NOTE(review): presumably these flags request separate, synced config servers
    // and replica-set shards - confirm against the ShardingTest harness.
    separateConfig : true,
    sync : true,
    rs : true
};

// Cluster with two shards and two mongos routers (used below as st.s0 and st.s1).
var st = new ShardingTest({ shards : 2, mongos : 2, other : options });

// Shard documents read through the first mongos; createShardedCollection uses
// shards[0]._id and shards[1]._id as movePrimary / moveChunk targets.
shards = st.s0.getDB("config").shards.find().toArray();
 
/**
 * Shards one collection on { _id : 1 } and spreads it over the first two shards,
 * printing the reply of every admin command it issues.
 *
 * Commands sent, in order: enableSharding, movePrimary (to shards[0]),
 * shardCollection, split at _id 0, moveChunk of the chunk containing _id 0
 * (to shards[1]), then splits at _id -300, -200, -100, 100, 200 and 300.
 *
 * @param admin  handle whose runCommand() executes admin commands
 * @param coll   collection to shard; coll.getDB() and coll are converted to
 *               strings to obtain the database name and the namespace
 */
createShardedCollection = function(admin, coll){
    var dbName = coll.getDB() + "";
    var ns = coll + "";

    // Send a single admin command and print whatever comes back.
    var run = function(cmd){
        printjson(admin.runCommand(cmd));
    };

    run({ enableSharding : dbName });
    run({ movePrimary : dbName, to : shards[0]._id });
    run({ shardCollection : ns, key : { _id : 1 } });
    run({ split : ns, middle : { _id : 0 } });
    run({ moveChunk : ns, find : { _id : 0 }, to : shards[1]._id });

    // Remaining split points, three on each side of the chunk boundary at 0.
    var splitPoints = [-300, -200, -100, 100, 200, 300];
    for (var k = 0; k < splitPoints.length; k++) {
        run({ split : ns, middle : { _id : splitPoints[k] } });
    }
};
 
// Phase 1: while the whole cluster still runs 2.0.0, shard foo0.bar .. foo9.bar
// through the first mongos (these get the old metadata format).
for (var i = 0; i < 10; i++) {
    createShardedCollection(st.s0.getDB("admin"), st.s0.getDB("foo" + i).bar);
}

// Upgrade every process to 2.2.3, then pause before continuing
// (presumably to let the restarted processes settle).
st.upgradeCluster("2.2.3");
sleep(30000);

// Phase 2: shard foo10.bar .. foo19.bar from s0 on the upgraded cluster.
// The loop starts at 10: starting at 11 left foo10.bar uncreated even though the
// split loop later in this script iterates over foo0 .. foo19.
for (var i = 10; i < 20; i++) {
    createShardedCollection(st.s0.getDB("admin"), st.s0.getDB("foo" + i).bar);
}
 
// Stop the balancer before deliberately breaking the config upgrade.
st.stopBalancer();
printjson("trying to start up a new mongos - going to fail uncleanly");
// Start an extra mongos with upgrade:"" and the upgradeV3ToV4CriticalResyncFail
// failpoint always on with data {unclean:true}. Per the overview above, this stops
// the upgrade inside its critical section so the config servers go out of sync.
// NOTE(review): binVersion "g2.4" may be a typo for "2.4" - confirm.
var newmongos = MongoRunner.runMongos({configdb:st._configDB, binVersion:"g2.4", setParameter:"enableTestCommands=1", upgrade:"", setFailPoint:"upgradeV3ToV4CriticalResyncFail={mode:'alwaysOn',data:{unclean:true}}"})
// Fixed pause before inspecting the config servers.
sleep(10000)
 
 
// dbhash md5 of the "config" database on each of the three config servers,
// taken before any split is attempted.
printjson("here are the hashes of hte config servers");
var hash0 = st.c0.getDB("config").runCommand({dbhash:1}).md5
var hash1 = st.c1.getDB("config").runCommand({dbhash:1}).md5
var hash2 = st.c2.getDB("config").runCommand({dbhash:1}).md5
print(hash0, hash1, hash2)
 
 
// Attempt a split on every fooN.bar namespace through both pre-existing mongos
// instances (s0 at _id -24, s1 at _id -88) and print each reply.
printjson("trying to do a bunch of metadata ops");
for(var i=0;i<20;i++){
    printjson(st.s0.getDB("admin").runCommand({ split : "foo"+i+".bar", middle : { _id : -24 } }));
    printjson(st.s1.getDB("admin").runCommand({ split : "foo"+i+".bar", middle : { _id : -88 } }));
}
 
// Re-read the config database hashes after the split attempts so they can be
// compared with the values printed earlier.
hash0 = st.c0.getDB("config").runCommand({dbhash:1}).md5
hash1 = st.c1.getDB("config").runCommand({dbhash:1}).md5
hash2 = st.c2.getDB("config").runCommand({dbhash:1}).md5
print(hash0, hash1, hash2)

So one of the mongos instances (s1) correctly refuses to run the split commands, but the other one (s0) will still run the split commands without complaining.

Also, trying to start an additional mongos (any version) at this point will fail with this message:

 m27003| Wed Mar  6 18:49:56.732 [mongosMain] ERROR: config servers not in sync! config servers mikes-MacBook-Pro.local:29000 and mikes-MacBook-Pro.local:29002 differ
 m27003| chunks: "bd332c8ddd0be0167ce2bf9f56f14b6c"     chunks: "1ecde1726b6e799447dcd48314969317"
 m27003| databases: "01ba08e3f7efafba520270cb16529211"  databases: "01ba08e3f7efafba520270cb16529211"

However, leaving up the one mongos that is still capable of modifying metadata is dangerous.



 Comments   
Comment by Greg Studer [ 07/Mar/14 ]

This is a 2.2->2.4 upgrade issue that has been proven unnecessary.

Comment by Greg Studer [ 11/Mar/13 ]

The failing mongos is actually failing due to reasons unrelated to the config upgrade, just because it is stale and out-of-date. There is no way (currently) to prevent a user from performing manual metadata operations after a config upgrade critical section failure aside from doc'ing that this is unsupported.

One thing that could make this slightly better is to fail in the critical section without releasing the distributed collection locks - these would eventually be forced, but would prevent users from immediately trying metadata operations to "fix" the upgrade.

Generated at Thu Feb 08 03:18:43 UTC 2024 using Jira 9.7.1#970001-sha1:2222b88b221c4928ef0de3161136cc90c8356a66.