$ mongo localhost:31200
MongoDB shell version: 2.5.0-pre-
connecting to: localhost:31200/test
Server has startup warnings:
Wed Apr 10 18:43:07.479 [initandlisten]
Wed Apr 10 18:43:07.479 [initandlisten] ** NOTE: This is a development version (2.5.0-pre-) of MongoDB.
Wed Apr 10 18:43:07.479 [initandlisten] ** Not recommended for production.
Wed Apr 10 18:43:07.479 [initandlisten]

anden(mongod-2.5.0-pre-)[PRIMARY/SHARDING-rs1] test> rs.config()
{
    "_id": "SHARDING-rs1",
    "version": 4,
    "members": [
        {
            "_id": 0,
            "host": "anden.local:31200",
            "tags": {
                "dc": "ny",
                "server": "0"
            }
        },
        {
            "_id": 1,
            "host": "anden.local:31201",
            "tags": {
                "dc": "hobo",
                "server": "1",
                "theOtherShard": "doesntHaveThisTag"
            }
        },
        {
            "_id": 2,
            "host": "anden.local:31202",
            "priority": 0,
            "tags": {
                "dc": "sf",
                "server": "2"
            }
        }
    ],
    "settings": {
        "getLastErrorModes": {
            "ALL": {
                "server": 3
            },
            "AllDC": {
                "dc": 2
            },
            "AnyDC": {
                "dc": 1
            },
            "Broken": {
                "theOtherShard": 1
            }
        }
    }
}

anden(mongod-2.5.0-pre-)[PRIMARY/SHARDING-rs1] test> db.test.insert({foo: "bar"})
Inserted 1 record(s) in 1ms

anden(mongod-2.5.0-pre-)[PRIMARY/SHARDING-rs1] test> db.runCommand({getLastError:1,w:"AllDC",wtimeout: 1000});
{
    "n": 0,
    "lastOp": Timestamp(1365655128, 1),
    "connectionId": 417,
    "wtimeout": true,
    "waited": 1000,
    "writtenTo": [
        {
            "_id": 0,
            "host": "anden.local:31200",
            "tags": {
                "dc": "ny",
                "server": "0"
            }
        },
        {
            "_id": 1,
            "host": "anden.local:31201",
            "tags": {
                "dc": "hobo",
                "server": "1",
                "theOtherShard": "doesntHaveThisTag"
            }
        },
        {
            "_id": 2,
            "host": "anden.local:31202",
            "priority": 0,
            "tags": {
                "dc": "sf",
                "server": "2"
            }
        }
    ],
    "err": "timeout",
    "ok": 1
}

anden(mongod-2.5.0-pre-)[PRIMARY/SHARDING-rs1] test>

Note that using the "AnyDC" tag works fine.
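For contrast, the two modes can be compared against the same write. The commands below are only a sketch: the AnyDC result is asserted from the note above and the AllDC timeout from the transcript, not from a captured session.

db.test.insert({foo: "bar"});
db.runCommand({getLastError: 1, w: "AnyDC", wtimeout: 1000});   // satisfied: only one tagged dc needs the write
db.runCommand({getLastError: 1, w: "AllDC", wtimeout: 1000});   // times out, even though writtenTo lists all three members
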
Author: Eric Milkie <milkie@10gen.com>
Date: 2013-05-01T14:37:40Z
Message: SERVER-9333 wait long enough for slow builders like Windows Debug
Branch: v2.4
https://github.com/mongodb/mongo/commit/e9483034118e5ffd7665889642c3c32e51022ecd

Author: Eric Milkie <milkie@10gen.com>
Date: 2013-05-01T13:25:03Z
Message: SERVER-9333 correct reference counting of GhostSlave members
The _ghostCache contains shared pointers to GhostSlave members.
When we clear this cache, we decrement all the reference counts to these objects.
Because we were not keeping a shared_ptr to a GhostSlave object in percolate(),
but instead were using a bare pointer, this caused the bare pointer to be deleted
out from under us when we cleared the cache. This in turn caused the internal
objects like the OplogReader to be deleted, which caused segfaults in percolate()
when we attempted to advance the oplogreader cursor.
Branch: v2.4
https://github.com/mongodb/mongo/commit/4dc1cd907aa84733c3d5f68eb8c4a52beef229a3

Author: Eric Milkie <milkie@10gen.com>
Date: 2013-04-26T15:55:36Z
Message: SERVER-9333 clear ghostcache at reconfig time
Branch: v2.4
https://github.com/mongodb/mongo/commit/bd9e9d04bc0f0d35b19e97421bb7f28fc349443a

Author: Eric Milkie <milkie@10gen.com>
Date: 2013-05-01T14:37:40Z
Message: SERVER-9333 wait long enough for slow builders like Windows Debug
Branch: master
https://github.com/mongodb/mongo/commit/1cbde06d282f8b380f2cc33cddc345686c9398ca

Author: Eric Milkie <milkie@10gen.com>
Date: 2013-05-01T13:25:03Z
Message: SERVER-9333 correct reference counting of GhostSlave members
The _ghostCache contains shared pointers to GhostSlave members.
When we clear this cache, we decrement all the reference counts to these objects.
Because we were not keeping a shared_ptr to a GhostSlave object in percolate(),
but instead were using a bare pointer, this caused the bare pointer to be deleted
out from under us when we cleared the cache. This in turn caused the internal
objects like the OplogReader to be deleted, which caused segfaults in percolate()
when we attempted to advance the oplogreader cursor.
Branch: master
https://github.com/mongodb/mongo/commit/aa2f7bbbce08723d93d9e5eeb9b1f2c343d2155a

Author: Eric Milkie <milkie@10gen.com>
Date: 2013-04-26T15:55:36Z
Message: SERVER-9333 clear ghostcache at reconfig time
Branch: master
https://github.com/mongodb/mongo/commit/bc7aee5b23c1b467638d8e1cb2171a2d0d2af994

I can readily reproduce this issue. It seems that as soon as you add a new tag and reconfig, the "AllDC" tag set breaks. It has something to do with TagClause and TagSubGroups and how they are parsed and set up. I'm continuing to debug.
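A rough shell sketch of the trigger being described here; the member index and tag name are placeholders, not taken from an actual session:

cfg = rs.config();
cfg.members[1].tags.newTag = "x";    // add any new tag to one member
cfg.version += 1;                    // the rs.reconfig() helper may bump this itself
rs.reconfig(cfg);
db.test.insert({foo: "bar"});
db.runCommand({getLastError: 1, w: "AllDC", wtimeout: 1000});   // now times out, until the server is restarted
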
I can reproduce this issue with a 3-node replica set; I have attached the jstest. In summary, the issue appears after we add or remove a tag from the replica set and reconfigure it. (The jstest covers adding a tag; I have verified manually that the same behaviour occurs when deleting a tag.)
What I also see, but have not added to the jstest yet, is that when I restart the server the issue is gone.
I've tested on 2.4.1 and 2.2.1; both failed.

function initShard(mongoscount, rsOptions, rsSettings) {
    mongoscount = typeof mongoscount !== 'undefined' ? mongoscount : 3;
    rsOptions = typeof rsOptions !== 'undefined' ? rsOptions : [];

    rs = {
        "nodes": 3,
        "logpath": "/dev/null",
        "oplogSize": 10
    };

    shardTest = new ShardingTest({
        "name": "SHARDING",
        "shards": 2,
        "rs": rs,
        "numReplicas": 2,
        "nopreallocj": true,
        "mongos": mongoscount,
        "other": {
            "mongosOptions": {
                "logpath": "/dev/null"
            }
        }
    });

    if (typeof rsOptions !== 'undefined') {
        cfg = shardTest.rs0.getReplSetConfig();
        for (var i = 0; i < rsOptions[0].length; i++) {
            cfg.members[i] = Object.extend(cfg.members[i], rsOptions[0][i]);
        }
        cfg.settings = rsSettings[0];
        cfg.version = 3;
        try {
            shardTest.rs0.getMaster().getDB("admin")._adminCommand({ replSetReconfig : cfg });
            /* Will close all the open connections, we don't care */
        } catch (ex) {}

        cfg = shardTest.rs1.getReplSetConfig();
        for (var i = 0; i < rsOptions[1].length; i++) {
            cfg.members[i] = Object.extend(cfg.members[i], rsOptions[1][i]);
        }
        cfg.settings = rsSettings[1];
        cfg.version = 3;
        try {
            shardTest.rs1.getMaster().getDB("admin")._adminCommand({ replSetReconfig : cfg });
        } catch (ex) {}
    }

    ReplSetTest.awaitRSClientHosts(shardTest.s, shardTest.rs0.getSecondaries(), { ok : true, secondary : true });
    ReplSetTest.awaitRSClientHosts(shardTest.s, shardTest.rs1.getSecondaries(), { ok : true, secondary : true });
    ReplSetTest.awaitRSClientHosts(shardTest.s, shardTest.rs0.getPrimary(), { ok : true, ismaster : true });
    ReplSetTest.awaitRSClientHosts(shardTest.s, shardTest.rs1.getPrimary(), { ok : true, ismaster : true });
    shardTest.rs1.awaitReplication();
    shardTest.rs0.awaitReplication();

    print("Doing the insert now");
    shardTest.rs1.getMaster().getDB("test").test.insert({foo: "bar"});
    printjson(shardTest.rs1.getMaster().getDB("local").system.replset.findOne());
    printjson(shardTest.rs1.getMaster().getDB("test").runCommand({getLastError:1,w:"AllDC",wtimeout: 1000}));
    print("Shouldn't have failed..");

    return shardTest;
}

initShard(2,
    [[{"tags": {"server": "0", "dc": "ny"}}, {"tags": {"server": "1", "dc": "ny"}}, {"tags": {"server": "2", "dc": "sf"}, "priority": 0}],
     [{"tags": {"server": "0", "dc": "ny"}}, {"tags": {"server": "1", "dc": "ny", "theOtherShard": "doesntHaveThisTag"}}, {"tags": {"server": "2", "dc": "sf"}, "priority": 0}]],
    [{"getLastErrorModes": {"AnyDC": {"dc": 1}, "AllDC": {"dc": 2}, "ALL": {"server": 3}}},
     {"getLastErrorModes": {"AnyDC": {"dc": 1}, "AllDC": {"dc": 2}, "ALL": {"server": 3}, "Broken": {"theOtherShard": 1}}}]);
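To run the attached script, something along these lines should work, assuming a mongo shell whose mongod/mongos binaries come from the same build and are on the PATH (the file name is only a placeholder):

./mongo --nodb repro_server9333.js
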
This is what we are doing, and it showcases the bug.

Can you put together a small jstest for this?

var rt = new ReplSetTest({nodes: 3});
rt.startSet();
rt.initiate({...});
...

You can find more examples in the jstests dir.
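For reference, a minimal non-sharded skeleton along the requested lines might look like the sketch below. It is only a sketch: the tag and mode names are assumed from the rs.config() shown above, and the "extra" tag added at reconfig time is a placeholder.

var rt = new ReplSetTest({nodes: 3});
rt.startSet();

var cfg = rt.getReplSetConfig();
cfg.members[0].tags = {dc: "ny"};
cfg.members[1].tags = {dc: "ny"};
cfg.members[2].tags = {dc: "sf"};
cfg.settings = {getLastErrorModes: {AnyDC: {dc: 1}, AllDC: {dc: 2}}};
rt.initiate(cfg);

var master = rt.getMaster();
master.getDB("test").test.insert({foo: "bar"});
// Expected to succeed before any reconfig.
printjson(master.getDB("test").runCommand({getLastError: 1, w: "AllDC", wtimeout: 1000}));

// Add a new tag to one member and reconfig; per the reports above, AllDC then times out.
cfg = master.getDB("local").system.replset.findOne();
cfg.members[1].tags.extra = "tag";
cfg.version += 1;
try {
    master.getDB("admin").runCommand({replSetReconfig: cfg});
} catch (e) {
    // the reconfig drops open connections; ignore
}
master = rt.getMaster();
master.getDB("test").test.insert({foo: "baz"});
printjson(master.getDB("test").runCommand({getLastError: 1, w: "AllDC", wtimeout: 1000}));
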
Note that this replica set is a member of a sharded cluster. It does not seem to matter whether I do this through mongos or directly against the replica set primary.

Whenever I try to dig deeper into what exactly is going on, it magically starts working.
But then when I spin everything down and start from scratch, it stops working.