Details
Type: Question
Status: Closed
Priority: Critical - P2
Resolution: Gone away
Affects Version/s: 2.4.10, 2.4.11
Fix Version/s: None
Component/s: Sharding
Labels: None
Description
Hi,
Our MongoDB cluster has 2 shards, both of which are replica sets. I switched the replica sets over to new nodes by adding the new members and removing the old ones, and then migrated all config servers to 3 new servers that keep the same hostnames. Since then the balancer has not been working.
MongoDB cluster = 18 x mongos + 2 x shards (replica sets) + 3 x config servers
The mongod/mongos/config server version is 2.4.11.
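For reference, the member swap on each shard replica set was done roughly as in the sketch below (run against each shard's primary; the hostnames are placeholders, not the real node names):
// Hypothetical example of the member swap described above (2.4 mongo shell).
// Hostnames are placeholders; run against each shard's primary.
rs.add("new-a.example.net:27017")      // add a new member and let it finish initial sync
rs.status()                            // wait until the new member shows as SECONDARY
rs.remove("old-a.example.net:27017")   // then remove the corresponding old member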
From the mongos shell, trying to start the balancer:
mongos> sh.getBalancerState()
false
mongos> sh.startBalancer()
assert.soon failed: function (){
    var lock = db.getSisterDB( "config" ).locks.findOne({ _id : lockId })

    if( state == false ) return ! lock || lock.state == 0
    if( state == true ) return lock && lock.state == 2
    if( state == undefined ) return (beginTS == undefined && lock) ||
                             (beginTS != undefined && ( !lock || lock.ts + "" != beginTS + "" ) )
    else return lock && lock.state == state
}, msg:Waited too long for lock balancer to change to state undefined
Error: Printing Stack Trace
    at printStackTrace (src/mongo/shell/utils.js:37:15)
    at doassert (src/mongo/shell/assert.js:6:5)
    at Function.assert.soon (src/mongo/shell/assert.js:110:60)
    at Function.sh.waitForDLock (src/mongo/shell/utils_sh.js:156:12)
    at Function.sh.waitForBalancer (src/mongo/shell/utils_sh.js:250:12)
    at Function.sh.startBalancer (src/mongo/shell/utils_sh.js:131:8)
    at (shell):1:4
Sat Oct 11 00:30:46.841 assert.soon failed: function (){
    var lock = db.getSisterDB( "config" ).locks.findOne({ _id : lockId })

    if( state == false ) return ! lock || lock.state == 0
    if( state == true ) return lock && lock.state == 2
    if( state == undefined ) return (beginTS == undefined && lock) ||
                             (beginTS != undefined && ( !lock || lock.ts + "" != beginTS + "" ) )
    else return lock && lock.state == state
}, msg:Waited too long for lock balancer to change to state undefined at src/mongo/shell/assert.js:7
mongos>
mongos> sh.getBalancerState()
true
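If I read src/mongo/shell/utils_sh.js correctly, sh.startBalancer() first flips the stopped flag in config.settings and then waits for the balancer lock to be taken, so the assert.soon failure above is only that wait timing out; that would explain why sh.getBalancerState() still returns true afterwards. The flag change on its own is roughly:
// My understanding of the flag change performed by sh.startBalancer(), without the wait (2.4 shell):
sh.setBalancerState(true)                                      // sets { stopped: false } on the balancer settings document
db.getSisterDB("config").settings.find({ _id: "balancer" })   // which is why this now shows "stopped" : false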
But the balancer still does not run; the last recorded migration is from September:
mongos> sh._lastMigration()
{
    "_id" : "np1-2014-09-13T17:03:27-541478df0ef1637eaeaee578",
    "server" : "np1",
    "clientAddr" : "10.124.57.210:41684",
    "time" : ISODate("2014-09-13T17:03:27.697Z"),
    "what" : "moveChunk.from",
    "ns" : "wnkyjknp2ahihwx5p9xwcjdtosnuvo4n8z2n43tmbe4v875r.PlayerData",
    "details" : {
        "min" : {
            "deviceId" : "8859ABE4-5CAA-460F-BC8F-3EC00B4D3124"
        },
        "max" : {
            "deviceId" : "8895206B-F8BD-4EB2-AE84-2401B3BA0BC0"
        },
        "step1 of 6" : 0,
        "step2 of 6" : 256,
        "step3 of 6" : 5955,
        "step4 of 6" : 26640,
        "step5 of 6" : 1357,
        "step6 of 6" : 0
    }
}
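To double-check that September really is the last migration, the config changelog can also be queried directly; a small example:
// Most recent chunk-migration entries in the config changelog
use config
db.changelog.find({ what: /moveChunk/ }).sort({ time: -1 }).limit(5).pretty()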
Balancer settings:
mongos> db.settings.find()
{ "_id" : "chunksize", "value" : 64 }
{ "_id" : "balancer", "activeWindow" : { "start" : "1:00", "stop" : "5:00" }, "stopped" : false }
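The activeWindow above means the balancer is only allowed to run between 1:00 and 5:00, local time on the mongos. To rule the window out while testing, it can be removed (and re-added later with $set):
// Remove the balancing window so the balancer may run at any time
use config
db.settings.update({ _id: "balancer" }, { $unset: { activeWindow: true } })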
Some other info:
mongos> db.locks.find( { _id : "balancer" } ).pretty()
{
    "_id" : "balancer",
    "process" : "app1:27020:1418872912:1804289383",
    "state" : 0,
    "ts" : ObjectId("54aeefd05d996bd55e5ba6d4"),
    "when" : ISODate("2015-01-08T21:00:00.485Z"),
    "who" : "app1:27020:1418872912:1804289383:Balancer:846930886",
    "why" : "doing balance round"
}
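The lock above is state 0, i.e. not currently held. Some extra checks I can run from the same mongos, in case they are useful:
// Any distributed lock currently held? (state > 0 means taken)
db.locks.find({ state: { $gt: 0 } }).pretty()
// Most recent lock pings from the mongos/mongod processes
db.lockpings.find().sort({ ping: -1 }).limit(5).pretty()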
Here is the log of the mongos on app1 around that time (the server's local timezone is GMT+8):
Fri Jan 9 01:02:29.998 [Balancer] distributed lock 'balancer/app1:27020:1418872912:1804289383' acquired, ts : 54aeb8255d996bd55e5ba2b7
Fri Jan 9 01:02:30.271 [Balancer] distributed lock 'balancer/app1:27020:1418872912:1804289383' unlocked.
Fri Jan 9 01:02:48.465 [Balancer] warning: distributed lock 'balancer/app1:27020:1418872912:1804289383 did not propagate properly. :: caused by :: 8017 update not consistent ns: config.locks query: { _id: "balancer", state: 0, ts: ObjectId('54aeb837a3c711e134a45a99') } update: { $set: { state: 1, who: "app1:27020:1418872912:1804289383:Balancer:846930886", process: "app1:27020:1418872912:1804289383", when: new Date(1420736568376), why: "doing balance round", ts: ObjectId('54aeb8385d996bd55e5ba2b8') } } gle1: { updatedExisting: true, n: 1, lastOp: Timestamp 1420736568000|6, connectionId: 2525, waited: 13, err: null, ok: 1.0 } gle2: { updatedExisting: false, n: 0, lastOp: Timestamp 1420736550000|3, connectionId: 6503, waited: 1, err: null, ok: 1.0 }
Fri Jan 9 01:02:48.469 [Balancer] lock update won, completing lock propagation for 'balancer/app1:27020:1418872912:1804289383'
Fri Jan 9 01:02:48.534 [Balancer] distributed lock 'balancer/app1:27020:1418872912:1804289383' acquired, ts : 54aeb8385d996bd55e5ba2b8
Fri Jan 9 01:02:48.808 [Balancer] distributed lock 'balancer/app1:27020:1418872912:1804289383' unlocked.
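Since the warning says the lock update "did not propagate properly" (gle1 applied, gle2 did not), I assume the balancer lock document can end up different on each config server. A quick way to compare them directly (hostnames as in the dbHash output below):
// Compare the balancer lock document on each config server directly
["mgcfg1:27019", "mgcfg2:27019", "mgcfg3:27019"].forEach(function (host) {
    var lock = new Mongo(host).getDB("config").locks.findOne({ _id: "balancer" });
    print(host + " -> state: " + lock.state + ", ts: " + lock.ts);
});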
I also ran dbHash on all config servers:
# server 1
configsvr> db.runCommand({dbHash:1})
{
    "numCollections" : 13,
    "host" : "mgcfg1:27019",
    "collections" : {
        "changelog" : "375ecc22738a993cd372979de468318c",
        "chunks" : "7c226ccb8703980fb2435af7752fab6b",
        "collections" : "e8b0c16d75a6eadec3298e452f5107a7",
        "databases" : "ef13a57ab08f66b4b5c424ce80382a79",
        "lockpings" : "e9a28df83cfefcf4a85b18b43427cd91",
        "locks" : "84c4b019ffbee93a834a510d61f993ff",
        "mongos" : "0b6d8e7277be46909e413c411b257741",
        "settings" : "d4d9768de4d9c3b5168bbab7681201c1",
        "shards" : "24c99c3de50fd10349ec93df4ed5e3fa",
        "tags" : "d41d8cd98f00b204e9800998ecf8427e",
        "version" : "f53c462a5f204958b665e49f27da5a14"
    },
    "md5" : "51bf3433e738fbb7a049e6b6337dd48e",
    "timeMillis" : 0,
    "fromCache" : [
        "config.changelog",
        "config.chunks",
        "config.collections",
        "config.databases",
        "config.locks",
        "config.settings",
        "config.shards",
        "config.tags",
        "config.version"
    ],
    "ok" : 1
}
# server 2
configsvr> db.runCommand({dbHash:1})
{
    "numCollections" : 13,
    "host" : "mgcfg2:27019",
    "collections" : {
        "changelog" : "375ecc22738a993cd372979de468318c",
        "chunks" : "7c226ccb8703980fb2435af7752fab6b",
        "collections" : "e8b0c16d75a6eadec3298e452f5107a7",
        "databases" : "ef13a57ab08f66b4b5c424ce80382a79",
        "lockpings" : "8f0240a565553a35152e3ad5ce1bad36",
        "locks" : "84c4b019ffbee93a834a510d61f993ff",
        "mongos" : "e63914c568c08ea4dd1261e71cf1e125",
        "settings" : "d4d9768de4d9c3b5168bbab7681201c1",
        "shards" : "24c99c3de50fd10349ec93df4ed5e3fa",
        "tags" : "d41d8cd98f00b204e9800998ecf8427e",
        "version" : "f53c462a5f204958b665e49f27da5a14"
    },
    "md5" : "5201c1a258315e583dbcbfc04d99f5d2",
    "timeMillis" : 0,
    "fromCache" : [
        "config.changelog",
        "config.chunks",
        "config.collections",
        "config.databases",
        "config.locks",
        "config.settings",
        "config.shards",
        "config.tags",
        "config.version"
    ],
    "ok" : 1
}
# server 3
configsvr> db.runCommand({dbHash:1})
{
    "numCollections" : 13,
    "host" : "mgcfg3:27019",
    "collections" : {
        "changelog" : "375ecc22738a993cd372979de468318c",
        "chunks" : "7c226ccb8703980fb2435af7752fab6b",
        "collections" : "e8b0c16d75a6eadec3298e452f5107a7",
        "databases" : "ef13a57ab08f66b4b5c424ce80382a79",
        "lockpings" : "26e5c9c4bf4a3cb541e42fb10e6f7bec",
        "locks" : "84c4b019ffbee93a834a510d61f993ff",
        "mongos" : "f3410089e23f3c9dde90db27427f9260",
        "settings" : "d4d9768de4d9c3b5168bbab7681201c1",
        "shards" : "24c99c3de50fd10349ec93df4ed5e3fa",
        "tags" : "d41d8cd98f00b204e9800998ecf8427e",
        "version" : "f53c462a5f204958b665e49f27da5a14"
    },
    "md5" : "e6f192b490340f5562efa423049607f8",
    "timeMillis" : 0,
    "fromCache" : [
        "config.changelog",
        "config.chunks",
        "config.collections",
        "config.databases",
        "config.locks",
        "config.settings",
        "config.shards",
        "config.tags",
        "config.version"
    ],
    "ok" : 1
}
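To make the comparison above easier to read, here is a small script that prints which config collections hash the same on all three config servers and which differ (in my output only lockpings and mongos differ):
// Compare config.* collection hashes across the three config servers
var hosts = ["mgcfg1:27019", "mgcfg2:27019", "mgcfg3:27019"];
var hashes = hosts.map(function (host) {
    return new Mongo(host).getDB("config").runCommand({ dbHash: 1 }).collections;
});
Object.keys(hashes[0]).forEach(function (coll) {
    var values = hosts.map(function (host, i) { return hashes[i][coll]; });
    var same = values.every(function (v) { return v === values[0]; });
    print((same ? "same   " : "DIFFER ") + coll + " : " + values.join(" / "));
});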
I have attached the mongos and lockpings collections, in case they help.
I don't know what has gone wrong or how to fix it; any reply would be appreciated.