Details

Type: Bug
Resolution: Done
Priority: Major - P3
Fix Version/s: None
Affects Version/s: 2.4.4
Component/s: None
Environment: Ubuntu
Operating System: Linux
Description
Setup: 3 config servers, 3 mongod and 3 mongos.
Last night I sharded 3 collections (hashed), one after the other. Documents seemed to move from the primary shard to the others as time passed. This morning I checked whether the chunk migrations had finished; it looked like the first two had, but the third was stuck. I looked at sh.status(), and it showed this:
abc.ankara_extractor
    shard key: { "_id" : "hashed" }
    chunks:
        shard0000  35
        shard0001  31
        shard0002  32

abc.ankara_parser
    shard key: { "_id" : "hashed" }
    chunks:
        shard0000  49
        shard0002  39
        shard0001  39
Not perfectly balanced, but whatever.
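For completeness, each of these collections was sharded on a hashed _id key; the commands were essentially the following (a sketch reconstructed from the shard keys above, not pasted from shell history):

    sh.enableSharding("abc")
    sh.shardCollection("abc.ankara_extractor", { _id: "hashed" })
    sh.shardCollection("abc.ankara_parser", { _id: "hashed" })
    sh.shardCollection("abc.jakarta_companies", { _id: "hashed" })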
Moving on to the third collection, nothing changed:
abc.jakarta_companies
    shard key: { "_id" : "hashed" }
    chunks:
        shard0000  143
At this point I stopped all mongos instances, and started just one. It showed this in the log:
Fri Sep 13 08:38:59.567 [Balancer] ns: abc.ankara_parser going to move { _id: "abc.ankara_parser-_id_MinKey", lastmod: Timestamp 78000|1, lastmodEpoch: ObjectId('523220e66f82e5ddc385722d'), ns: "abc.ankara_parser", min: { _id: MinKey }, max: { _id: -9077305884563950534 }, shard: "shard0000" } from: shard0000 to: shard0001 tag []
Fri Sep 13 08:38:59.570 [Balancer] ns: abc.jakarta_companies going to move { _id: "abc.jakarta_companies-_id_MinKey", lastmod: Timestamp 1000|0, lastmodEpoch: ObjectId('523222126f82e5ddc3857253'), ns: "abc.jakarta_companies", min: { _id: MinKey }, max: { _id: -9094728535907602479 }, shard: "shard0000" } from: shard0000 to: shard0001 tag []
Fri Sep 13 08:38:59.571 [Balancer] moving chunk ns: abc.ankara_parser moving ( ns:abc.ankara_parsershard: shard0000:mongod1.abc.com:27018lastmod: 78|1||000000000000000000000000min: { _id: MinKey }max: { _id: -9077305884563950534 }) shard0000:mongod1.abc.com:27018 -> shard0001:mongod2.abc.com:27018
Fri Sep 13 08:38:59.656 [Balancer] moveChunk result: { who: { _id: "abc.ankara_parser", process: "ip-10-151-124-58:27018:1378994641:1905410576", state: 2, ts: ObjectId('5232610423a6193a37ce7a0a'), when: new Date(1379033348967), who: "ip-10-151-124-58:27018:1378994641:1905410576:conn751:1336024233", why: "migrate-{ _id: 7965667869890547469 }" }, ok: 0.0, errmsg: "the collection metadata could not be locked with lock migrate-{ _id: MinKey }" }
Fri Sep 13 08:38:59.657 [Balancer] balancer move failed: { who: { _id: "abc.ankara_parser", process: "ip-10-151-124-58:27018:1378994641:1905410576", state: 2, ts: ObjectId('5232610423a6193a37ce7a0a'), when: new Date(1379033348967), who: "ip-10-151-124-58:27018:1378994641:1905410576:conn751:1336024233", why: "migrate-{ _id: 7965667869890547469 }" }, ok: 0.0, errmsg: "the collection metadata could not be locked with lock migrate-{ _id: MinKey }" } from: shard0000 to: shard0001 chunk: min: { _id: MinKey } max: { _id: -9077305884563950534 }
Fri Sep 13 08:38:59.657 [Balancer] moving chunk ns: abc.jakarta_companies moving ( ns:abc.jakarta_companiesshard: shard0000:mongod1.abc.com:27018lastmod: 1|0||000000000000000000000000min: { _id: MinKey }max: { _id: -9094728535907602479 }) shard0000:mongod1.abc.com:27018 -> shard0001:mongod2.abc.com:27018
Looking at jakarta_companies.count() on each of the 3 mongods shows:
shard0000: 3895500
shard0001: 27257
shard0002: collection doesn't exist
and it's been stuck like this since then.
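Those counts come from running count() against each shard's mongod directly; a sketch of the check (connection details assumed — only the mongod1/mongod2 hostnames appear in the logs above):

    // connected directly to a shard's mongod, e.g. mongo mongod1.abc.com:27018
    use abc
    db.jakarta_companies.count()   // repeat on each shard's mongod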
Other info that might be useful:
mongos> db.currentOp()
{
    "inprog" : [
        {
            "opid" : "shard0000:88205450",
            "active" : true,
            "secs_running" : 4510,
            "op" : "query",
            "ns" : "abc.jakarta_companies",
            "query" : {
                "moveChunk" : "abc.jakarta_companies",
                "from" : "mongod1.abc.com:27018",
                "to" : "mongod2.abc.com:27018",
                "fromShard" : "shard0000",
                "toShard" : "shard0001",
                "min" : {
                    "_id" : { "$minKey" : 1 }
                },
                "max" : {
                    "_id" : NumberLong("-9094728535907602479")
                },
                "maxChunkSizeBytes" : NumberLong(67108864),
                "shardId" : "abc.jakarta_companies-_id_MinKey",
                "configdb" : "mongoc1.abc.com:27019,mongoc2.abc.com:27019,mongoc3.abc.com:27019",
                "secondaryThrottle" : true,
                "waitForDelete" : false
            },
            "client_s" : "10.151.90.78:33638",
            "desc" : "conn755",
            "threadId" : "0x7e3fbb5ca700",
            "connectionId" : 755,
            "waitingForLock" : false,
            "msg" : "step3 of 6",
            "numYields" : 0,
            "lockStats" : {
                "timeLockedMicros" : {
                    "r" : NumberLong(241859),
                    "w" : NumberLong(0)
                },
                "timeAcquiringMicros" : {
                    "r" : NumberLong(6),
                    "w" : NumberLong(0)
                }
            }
        }
    ]
}
configsvr> db.locks.find()
{ "_id" : "configUpgrade", "process" : "ip-10-36-21-61:27017:1372080743:1804289383", "state" : 0, "ts" : ObjectId("51c84a67f095c43a2590802d"), "when" : ISODate("2013-06-24T13:32:23.851Z"), "who" : "ip-10-36-21-61:27017:1372080743:1804289383:mongosMain:846930886", "why" : "upgrading config database to new format v4" }
{ "_id" : "abc.scraped_website_resource", "process" : "ip-10-151-25-52:27018:1372842489:2092651466", "state" : 0, "ts" : ObjectId("51fe0eb2bd9a06e052152f3c"), "when" : ISODate("2013-08-04T08:20:02.526Z"), "who" : "ip-10-151-25-52:27018:1372842489:2092651466:conn15229:2046301240", "why" : "migrate-{ _id: MinKey }" }
{ "_id" : "abc.parsed_website_resource", "process" : "ip-10-190-202-133:27018:1378999289:1601254270", "state" : 0, "ts" : ObjectId("52321fdf0fe09cb3ce334466"), "when" : ISODate("2013-09-12T20:11:11.607Z"), "who" : "ip-10-190-202-133:27018:1378999289:1601254270:conn672:1819358977", "why" : "migrate-{ _id: -9221392181608325863 }" }
{ "_id" : "abc.website_catalog", "process" : "ip-10-190-202-133:27018:1372842483:2063823080", "state" : 0, "ts" : ObjectId("51f64ff6615675a179f8fbb1"), "when" : ISODate("2013-07-29T11:20:22.627Z"), "who" : "ip-10-190-202-133:27018:1372842483:2063823080:conn22035:1126107449", "why" : "migrate-{ _id: MinKey }" }
{ "_id" : "balancer", "process" : "ip-10-151-90-78:27017:1379061538:1804289383", "state" : 2, "ts" : ObjectId("5232cf22e84b73ca42025328"), "when" : ISODate("2013-09-13T08:38:58.442Z"), "who" : "ip-10-151-90-78:27017:1379061538:1804289383:Balancer:846930886", "why" : "doing balance round" }
{ "_id" : "abc.santiago_extractor", "process" : "ip-10-151-124-58:27018:1372842478:1247094076", "state" : 0, "ts" : ObjectId("51d97b45de466c4e9210a0d3"), "when" : ISODate("2013-07-07T14:29:25.296Z"), "who" : "ip-10-151-124-58:27018:1372842478:1247094076:conn16061:1645155834", "why" : "migrate-{ _id: MinKey }" }
{ "_id" : "abc.santiago_parser", "process" : "ip-10-190-202-133:27018:1372842483:2063823080", "state" : 0, "ts" : ObjectId("51d3ec4d615675a179f8ed66"), "when" : ISODate("2013-07-03T09:18:05.062Z"), "who" : "ip-10-190-202-133:27018:1372842483:2063823080:conn15192:1397128088", "why" : "split-{ _id: 6148914691236517204 }" }
{ "_id" : "abc.jakarta_contacts", "process" : "ip-10-151-124-58:27018:1372842478:1247094076", "state" : 0, "ts" : ObjectId("51fe51f1de466c4e9210c20a"), "when" : ISODate("2013-08-04T13:06:57.282Z"), "who" : "ip-10-151-124-58:27018:1372842478:1247094076:conn290924:508144576", "why" : "migrate-{ data.companyId: 2958941370527843639 }" }
{ "_id" : "abc.feature_results_test", "process" : "ip-10-151-2-120:27017:1377505487:1804289383", "state" : 0, "ts" : ObjectId("523217c68ff15d76d1012d9c"), "when" : ISODate("2013-09-12T19:36:38.624Z"), "who" : "ip-10-151-2-120:27017:1377505487:1804289383:conn412:1714636915", "why" : "drop" }
{ "_id" : "abc.ankara_extractor", "process" : "ip-10-151-124-58:27018:1378994641:1905410576", "state" : 0, "ts" : ObjectId("52325fbd23a6193a37ce7a02"), "when" : ISODate("2013-09-13T00:43:41.655Z"), "who" : "ip-10-151-124-58:27018:1378994641:1905410576:conn744:621698639", "why" : "migrate-{ _id: 9200307451568432088 }" }
{ "_id" : "abc.ankara_parser", "process" : "ip-10-151-124-58:27018:1378994641:1905410576", "state" : 0, "ts" : ObjectId("5232610423a6193a37ce7a0a"), "when" : ISODate("2013-09-13T00:49:08.967Z"), "who" : "ip-10-151-124-58:27018:1378994641:1905410576:conn751:1336024233", "why" : "migrate-{ _id: 7965667869890547469 }" }
{ "_id" : "abc.jakarta_companies", "process" : "ip-10-151-124-58:27018:1378994641:1905410576", "state" : 2, "ts" : ObjectId("5232cf1923a6193a37ce7a0f"), "when" : ISODate("2013-09-13T08:38:49.804Z"), "who" : "ip-10-151-124-58:27018:1378994641:1905410576:conn755:78850494", "why" : "migrate-{ _id: MinKey }" }
Since then I've tried restarting the mongos instances, restarting the config servers, and stopping/starting the balancer (sh.stopBalancer()/sh.startBalancer()). Nothing causes the migration to resume.
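Concretely, the balancer stop/start and the lock check were along these lines (a sketch of the standard commands, not a verbatim transcript):

    // from a mongos shell
    sh.stopBalancer()        // disable balancing
    sh.getBalancerState()    // confirm it now reports false
    sh.startBalancer()       // re-enable balancing

    // on the config database, list locks that are still held (state > 0)
    use config
    db.locks.find({ state: { $gt: 0 } })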
How do I get out of this jam?