Details
- Type: Question
- Resolution: Works as Designed
- Priority: Major - P3
Description
max.hirschhorn and I encountered an interesting scenario in one of the tests produced by the fuzzer:
We're running a sharded cluster with two shards. Each shard is a two-node replica set. The client is using causal consistency.
First, one of the nodes in shard1's replica set goes down. Since the replica set has only two nodes, the surviving node cannot form a majority and there can be no primary, so shard1 is left with a single node stuck as a secondary.
Then, the client performs a write on shard0, which advances the client's local copy of clusterTime.
Finally, the client performs a read on shard1. The read carries a clusterTime and an afterClusterTime that are both greater than the current time in shard1's oplog. shard1 then tries to do a no-op write to advance its oplog time so that it can satisfy the read (described in this paragraph of the design doc). Since shard1 has no primary, the no-op write cannot happen, and the client's read hangs.
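For reference, the read that ends up waiting on shard1 is shaped roughly like the sketch below (the timestamp is made up for illustration; the real values come from the client's gossiped clusterTime). The afterClusterTime in the readConcern is what the lone secondary waits for, and with no primary to do the no-op write, that time never appears in its oplog:

// Illustrative only: a causally consistent find with an afterClusterTime that
// is ahead of anything shard1 has applied. The secondary blocks waiting for
// its oplog to reach Timestamp(10, 1), which never happens.
db.getSiblingDB("testDB").runCommand({
    find: "testColl",
    filter: {a: 21},
    readConcern: {afterClusterTime: Timestamp(10, 1)},
    $readPreference: {mode: "secondary"}
});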
Here's an example that reproduces this behavior:
buildscripts/resmoke.py --suites=no_server hanging-read.js
hanging-read.js

(function() {
    const st = new ShardingTest({mongos: 1, shards: 2, rs: {nodes: 2}, config: 1});
    const mongos = st.s0;
    const dbName = "testDB";
    const collName = 'testColl';
    const ns = dbName + "." + collName;
    const coll = mongos.getCollection(ns);
    const db = mongos.getDB(dbName);

    jsTest.log('Sharding the collection and pre-splitting into two chunks...');
    assert.commandWorked(mongos.adminCommand({enableSharding: dbName}));
    st.ensurePrimaryShard(dbName, st.shard0.shardName);
    assert.commandWorked(mongos.adminCommand({shardCollection: ns, key: {a: 1}}));
    assert.commandWorked(mongos.adminCommand({split: ns, middle: {a: 20}}));

    jsTest.log('Inserting 6 docs into donor shard, 3 in each chunk...');
    for (let i = 0; i < 3; ++i)
        assert.writeOK(coll.insert({a: i}));
    for (let i = 20; i < 23; ++i)
        assert.writeOK(coll.insert({a: i}));
    st.rs0.awaitReplication();
    assert.eq(6, coll.find().itcount());

    jsTest.log('Moving 1 (of 2) chunks to recipient shard...');
    assert.commandWorked(mongos.adminCommand({
        moveChunk: ns,
        find: {a: 30},
        to: st.shard1.shardName,
        _secondaryThrottle: true,
        writeConcern: {w: 2},
        _waitForDelete: true,
    }));

    jsTest.log("Shard distribution is...");
    jsTest.log(coll.getShardDistribution());

    // Kill one of the nodes in shard 1.
    st.rs1.stop(1);

    // The primary should then step down.
    st.rs1.awaitNoPrimary();

    // Insert some more documents to the original "donor" shard to advance the operationTime of
    // the shard.
    jsTest.log('Insert 3 documents into donor shard...');
    for (let i = 10; i < 13; ++i)
        assert.writeOK(coll.insert({a: i}));
    jsTest.log("Shard distribution is...");
    jsTest.log(coll.getShardDistribution());

    jsTest.log('Waiting for replication on rs0...');
    st.rs0.awaitReplication();

    // Perform a read on shard1 while waiting for an operationTime beyond what it has ever applied.
    // It should hang.
    jsTest.log('Attempting to read from rs1...');
    db.getMongo().setCausalConsistency(true);
    assert(db.getMongo().isCausalConsistency());
    const res = coll.runCommand({
        find: collName,
        filter: {a: 21},
        // There is no primary.
        $readPreference: {mode: "secondary"}
    });

    jsTest.log("UNREACHABLE");
    jsTest.log("Result was: " + tojson(res));
    throw new Error("Shouldn't reach here");

    // st.stop();
}());
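Not part of the repro, but for tests that would rather fail fast than hang: as far as I understand, the afterClusterTime wait respects the operation's deadline, so bounding the read with maxTimeMS should turn the indefinite hang into a timely error. A sketch, reusing the variables from the script above:

// Hypothetical variation of the final read: bound the wait with maxTimeMS so
// the command fails instead of blocking forever while shard1 has no primary.
const boundedRes = coll.runCommand({
    find: collName,
    filter: {a: 21},
    maxTimeMS: 30 * 1000,
    $readPreference: {mode: "secondary"}
});
assert.commandFailed(boundedRes);  // expected to time out rather than hang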
Also tagging misha.tyulenev on this.