[SERVER-35156] secondary reads return cluster time as the operation time Created: 22/May/18  Updated: 23/Sep/19  Resolved: 31/May/18

Status: Closed
Project: Core Server
Component/s: Sharding
Affects Version/s: 4.0.0
Fix Version/s: None

Type: Bug Priority: Major - P3
Reporter: Xiangyu Yao (Inactive) Assignee: Misha Tyulenev
Resolution: Duplicate Votes: 0
Labels: todo_in_code
Remaining Estimate: Not Specified
Time Spent: Not Specified
Original Estimate: Not Specified

Issue Links:
Duplicate
duplicates SERVER-35377 Operations on new clients get latest ... Closed
Related
related to SERVER-31887 clusterTime advanced on primary witho... Closed
related to SERVER-43470 Complete TODO listed in SERVER-35156 Closed
is related to SERVER-34866 Blacklist/unblacklist tests in second... Closed
is related to SERVER-32883 Enhanced FSM testing for reading from... Closed
Operating System: ALL
Steps To Reproduce:

python buildscripts/resmoke.py --suites=no_server repro_server35156.js --storageEngine=wiredTiger

repro_server35156.js

(function() {
    "use strict";
 
    const rst = new ReplSetTest({nodes: 1, nodeOptions: {setParameter: {writePeriodicNoops: 0}}});
    rst.startSet();
    rst.initiate();
 
    const primary = rst.getPrimary();
    const session = new Mongo(rst.getURL()).startSession();
    const db = session.getDatabase("test");
 
    function logClusterTime(prefix, res) {
        jsTest.log(
            prefix + ": " +
            tojson({operationTime: res.operationTime, clusterTime: res.$clusterTime.clusterTime}));
    }
 
    let res = assert.commandWorked(db.runCommand({insert: "mycoll", documents: [{_id: 0}]}));
    logClusterTime("After doing insert", res);
    assert.eq(1, res.n, "insert should have succeeded");
 
    res = assert.commandWorked(db.runCommand({insert: "mycoll", documents: [{_id: 0}]}));
    logClusterTime("After triggering duplicate key error", res);
    assert(res.hasOwnProperty("writeErrors"), "insert should have failed");
    assert.eq(1, res.writeErrors.length, "insert should have failed");
    assert.eq(ErrorCodes.DuplicateKey,
              res.writeErrors[0].code,
              "insert should have failed due to duplicate key error");
 
    res = db.runCommand({find: "mycoll", $readPreference: {mode: "secondary"}});
    logClusterTime("After running a read command", res);
 
    // This "find" command hangs because the server returned a clusterTime for which there is no
    // corresponding entry in the oplog. This means we must wait until some other client (e.g. the
    // periodic no-op writer) comes in and writes an entry to the oplog.
    assert.commandWorked(db.runCommand({find: "mycoll", maxTimeMS: 10 * 1000}));
 
    session.endSession();
    rst.stopSet();
})();

Output

[js_test:repro_server31887] 2017-11-09T01:30:06.643-0500 ----
[js_test:repro_server31887] 2017-11-09T01:30:06.643-0500 After doing insert: {
[js_test:repro_server31887] 2017-11-09T01:30:06.643-0500 	"operationTime" : Timestamp(1510209006, 7),
[js_test:repro_server31887] 2017-11-09T01:30:06.643-0500 	"clusterTime" : Timestamp(1510209006, 7)
[js_test:repro_server31887] 2017-11-09T01:30:06.644-0500 }
[js_test:repro_server31887] 2017-11-09T01:30:06.644-0500 ----
...
[js_test:repro_server31887] 2017-11-09T01:30:06.644-0500 ----
[js_test:repro_server31887] 2017-11-09T01:30:06.645-0500 After triggering duplicate key error: {
[js_test:repro_server31887] 2017-11-09T01:30:06.645-0500 	"operationTime" : Timestamp(1510209006, 7),
[js_test:repro_server31887] 2017-11-09T01:30:06.645-0500 	"clusterTime" : Timestamp(1510209006, 8)
[js_test:repro_server31887] 2017-11-09T01:30:06.645-0500 }
[js_test:repro_server31887] 2017-11-09T01:30:06.645-0500 ----
...
[js_test:repro_server31887] 2017-11-09T01:30:06.646-0500 ----
[js_test:repro_server31887] 2017-11-09T01:30:06.646-0500 After running a read command: {
[js_test:repro_server31887] 2017-11-09T01:30:06.647-0500 	"operationTime" : Timestamp(1510209006, 8),
[js_test:repro_server31887] 2017-11-09T01:30:06.647-0500 	"clusterTime" : Timestamp(1510209006, 8)
[js_test:repro_server31887] 2017-11-09T01:30:06.647-0500 }
[js_test:repro_server31887] 2017-11-09T01:30:06.647-0500 ----
...
[js_test:repro_server31887] 2017-11-09T01:30:16.659-0500 2017-11-09T01:30:16.655-0500 E QUERY    [thread1] Error: command failed: {
[js_test:repro_server31887] 2017-11-09T01:30:16.659-0500 	"ok" : 0,
[js_test:repro_server31887] 2017-11-09T01:30:16.659-0500 	"errmsg" : "operation exceeded time limit",
[js_test:repro_server31887] 2017-11-09T01:30:16.659-0500 	"code" : 50,
[js_test:repro_server31887] 2017-11-09T01:30:16.659-0500 	"codeName" : "ExceededTimeLimit",
[js_test:repro_server31887] 2017-11-09T01:30:16.660-0500 	"$clusterTime" : {
[js_test:repro_server31887] 2017-11-09T01:30:16.660-0500 		"clusterTime" : Timestamp(1510209006, 8),
[js_test:repro_server31887] 2017-11-09T01:30:16.660-0500 		"signature" : {
[js_test:repro_server31887] 2017-11-09T01:30:16.660-0500 			"hash" : BinData(0,"AAAAAAAAAAAAAAAAAAAAAAAAAAA="),
[js_test:repro_server31887] 2017-11-09T01:30:16.660-0500 			"keyId" : NumberLong(0)
[js_test:repro_server31887] 2017-11-09T01:30:16.660-0500 		}
[js_test:repro_server31887] 2017-11-09T01:30:16.661-0500 	}
[js_test:repro_server31887] 2017-11-09T01:30:16.661-0500 } : undefined :
[js_test:repro_server31887] 2017-11-09T01:30:16.661-0500 _getErrorWithCode@src/mongo/shell/utils.js:25:13
[js_test:repro_server35156] 2017-11-09T01:30:16.661-0500 doassert@src/mongo/shell/assert.js:16:14
[js_test:repro_server35156] 2017-11-09T01:30:16.661-0500 assert.commandWorked@src/mongo/shell/assert.js:403:5
[js_test:repro_server35156] 2017-11-09T01:30:16.661-0500 @repro_server31887.js:36:1
[js_test:repro_server35156] 2017-11-09T01:30:16.662-0500 @repro_server31887.js:1:2
[js_test:repro_server35156] 2017-11-09T01:30:16.662-0500 failed to load: repro_server31887.js

Participants:

 Description   

As SERVER-31887 pointed out, a failed command advances the cluster time but not the operation time. If we perform a secondary read right after the failed command, the operation time being returned is actually the cluster time. It's problematic because there is no oplog entry at the operation time being returned.



 Comments   
Comment by Jack Mulrow [ 04/Jun/18 ]

This particular behavior is caused by the refactoring of operationTime and $clusterTime computing from SERVER-34843. SERVER-35377 also tracks this issue, so marking this ticket as a duplicate of that one instead.

Comment by Xiangyu Yao (Inactive) [ 22/May/18 ]

spencer Yes. I think there might be a special case that hits this code path.

Comment by Spencer Brody (Inactive) [ 22/May/18 ]

Reads should probably use the lastAppliedOpTime as the operation time, not the cluster time, as the cluster time can be ahead of any ops that this specific replica set actually has.

Generated at Thu Feb 08 04:39:00 UTC 2024 using Jira 9.7.1#970001-sha1:2222b88b221c4928ef0de3161136cc90c8356a66.