[SERVER-31887] clusterTime advanced on primary without anything being written to oplog Created: 09/Nov/17  Updated: 27/Oct/23  Resolved: 20/Jun/19

Status: Closed
Project: Core Server
Component/s: Sharding
Affects Version/s: 3.6.0-rc2
Fix Version/s: None

Type: Bug Priority: Major - P3
Reporter: Max Hirschhorn Assignee: Misha Tyulenev
Resolution: Works as Designed Votes: 0
Labels: None
Remaining Estimate: Not Specified
Time Spent: Not Specified
Original Estimate: Not Specified

Issue Links:
related to SERVER-31888 Re-enable periodic no-op writer for a... Closed
is related to SERVER-31306 $clusterTime not included in response... Closed
is related to SERVER-35156 secondary reads return cluster time a... Closed
is related to SERVER-35377 Operations on new clients get latest ... Closed
Operating System: ALL
Steps To Reproduce:

python buildscripts/resmoke.py --suites=no_server repro_server31887.js --storageEngine=wiredTiger

repro_server31887.js

(function() {
    "use strict";
 
    const rst = new ReplSetTest({nodes: 1, nodeOptions: {setParameter: {writePeriodicNoops: 0}}});
    rst.startSet();
    rst.initiate();
 
    const primary = rst.getPrimary();
    const session = primary.startSession({causalConsistency: true});
    const db = session.getDatabase("test");
 
    function logClusterTime(prefix, res) {
        jsTest.log(
            prefix + ": " +
            tojson({operationTime: res.operationTime, clusterTime: res.$clusterTime.clusterTime}));
    }
 
    let res = assert.commandWorked(db.runCommand({insert: "mycoll", documents: [{_id: 0}]}));
    logClusterTime("After doing insert", res);
    assert.eq(1, res.n, "insert should have succeeded");
 
    res = assert.commandWorked(db.runCommand({insert: "mycoll", documents: [{_id: 0}]}));
    logClusterTime("After triggering duplicate key error", res);
    assert(res.hasOwnProperty("writeErrors"), "insert should have failed");
    assert.eq(1, res.writeErrors.length, "insert should have failed");
    assert.eq(ErrorCodes.DuplicateKey,
              res.writeErrors[0].code,
              "insert should have failed due to duplicate key error");
 
    res = assert.commandFailed(db.runCommand({fakeCommand: 1}));
    logClusterTime("After running a bad command", res);
 
    // This "find" command hangs because the server returned a clusterTime for which there is no
    // corresponding entry in the oplog. This means we must wait until some other client (e.g. the
    // periodic no-op writer) comes in and writes an entry to the oplog.
    assert.commandWorked(db.runCommand({find: "mycoll", maxTimeMS: 10 * 1000}));
 
    session.endSession();
    rst.stopSet();
})();

Output

[js_test:repro_server31887] 2017-11-09T01:30:06.643-0500 ----
[js_test:repro_server31887] 2017-11-09T01:30:06.643-0500 After doing insert: {
[js_test:repro_server31887] 2017-11-09T01:30:06.643-0500 	"operationTime" : Timestamp(1510209006, 7),
[js_test:repro_server31887] 2017-11-09T01:30:06.643-0500 	"clusterTime" : Timestamp(1510209006, 7)
[js_test:repro_server31887] 2017-11-09T01:30:06.644-0500 }
[js_test:repro_server31887] 2017-11-09T01:30:06.644-0500 ----
...
[js_test:repro_server31887] 2017-11-09T01:30:06.644-0500 ----
[js_test:repro_server31887] 2017-11-09T01:30:06.645-0500 After triggering duplicate key error: {
[js_test:repro_server31887] 2017-11-09T01:30:06.645-0500 	"operationTime" : Timestamp(1510209006, 7),
[js_test:repro_server31887] 2017-11-09T01:30:06.645-0500 	"clusterTime" : Timestamp(1510209006, 8)
[js_test:repro_server31887] 2017-11-09T01:30:06.645-0500 }
[js_test:repro_server31887] 2017-11-09T01:30:06.645-0500 ----
...
[js_test:repro_server31887] 2017-11-09T01:30:06.646-0500 ----
[js_test:repro_server31887] 2017-11-09T01:30:06.646-0500 After running a bad command: {
[js_test:repro_server31887] 2017-11-09T01:30:06.647-0500 	"operationTime" : Timestamp(1510209006, 8),
[js_test:repro_server31887] 2017-11-09T01:30:06.647-0500 	"clusterTime" : Timestamp(1510209006, 8)
[js_test:repro_server31887] 2017-11-09T01:30:06.647-0500 }
[js_test:repro_server31887] 2017-11-09T01:30:06.647-0500 ----
...
[js_test:repro_server31887] 2017-11-09T01:30:16.659-0500 2017-11-09T01:30:16.655-0500 E QUERY    [thread1] Error: command failed: {
[js_test:repro_server31887] 2017-11-09T01:30:16.659-0500 	"ok" : 0,
[js_test:repro_server31887] 2017-11-09T01:30:16.659-0500 	"errmsg" : "operation exceeded time limit",
[js_test:repro_server31887] 2017-11-09T01:30:16.659-0500 	"code" : 50,
[js_test:repro_server31887] 2017-11-09T01:30:16.659-0500 	"codeName" : "ExceededTimeLimit",
[js_test:repro_server31887] 2017-11-09T01:30:16.660-0500 	"$clusterTime" : {
[js_test:repro_server31887] 2017-11-09T01:30:16.660-0500 		"clusterTime" : Timestamp(1510209006, 8),
[js_test:repro_server31887] 2017-11-09T01:30:16.660-0500 		"signature" : {
[js_test:repro_server31887] 2017-11-09T01:30:16.660-0500 			"hash" : BinData(0,"AAAAAAAAAAAAAAAAAAAAAAAAAAA="),
[js_test:repro_server31887] 2017-11-09T01:30:16.660-0500 			"keyId" : NumberLong(0)
[js_test:repro_server31887] 2017-11-09T01:30:16.660-0500 		}
[js_test:repro_server31887] 2017-11-09T01:30:16.661-0500 	}
[js_test:repro_server31887] 2017-11-09T01:30:16.661-0500 } : undefined :
[js_test:repro_server31887] 2017-11-09T01:30:16.661-0500 _getErrorWithCode@src/mongo/shell/utils.js:25:13
[js_test:repro_server31887] 2017-11-09T01:30:16.661-0500 doassert@src/mongo/shell/assert.js:16:14
[js_test:repro_server31887] 2017-11-09T01:30:16.661-0500 assert.commandWorked@src/mongo/shell/assert.js:403:5
[js_test:repro_server31887] 2017-11-09T01:30:16.661-0500 @repro_server31887.js:36:1
[js_test:repro_server31887] 2017-11-09T01:30:16.662-0500 @repro_server31887.js:1:2
[js_test:repro_server31887] 2017-11-09T01:30:16.662-0500 failed to load: repro_server31887.js

Sprint: Sharding 2019-07-01
Linked BF Score: 60

 Description   

Command errors such as duplicate key errors still cause the clusterTime to be advanced; however, no oplog entry is written for those failed operations. We then use the current clusterTime as the operationTime in the server's error responses for other error cases (as part of the changes from SERVER-31306), leading a client to wait for a time at which no operation actually happened.
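A minimal shell sketch of the mismatch, reusing the primary and db handles from the repro below: after a failed insert, the reported clusterTime is newer than anything actually present in the oplog.

// After the duplicate key error, the gossiped clusterTime has advanced past the
// newest oplog entry, so nothing exists at the time a causal reader would wait for.
const errRes = db.runCommand({insert: "mycoll", documents: [{_id: 0}]});  // duplicate key
const newest = primary.getDB("local").oplog.rs.find().sort({$natural: -1}).limit(1).next();
// On 3.6.0-rc2 this prints a clusterTime strictly greater than newest.ts.
print("reported: " + tojson(errRes.$clusterTime.clusterTime) +
      ", newest oplog ts: " + tojson(newest.ts));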

The behavior of advancing the clusterTime prior to RecoveryUnit::Change::commit() (e.g. in cases where the operation fails on WiredTiger transaction commit) may have other negative effects on causal consistency that I haven't thought about. The impact of this bug, insofar as I have observed failures in Evergreen, is that we no longer have a guarantee of making forward progress when waiting for afterClusterTime unless the periodic no-op writer is running.
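For contrast, a sketch of the repro's replica set started with the no-op writer left enabled (the default), under which the final find makes progress; periodicNoopIntervalSecs is assumed here to be the startup parameter controlling the writer's interval.

// With writePeriodicNoops enabled (the default), the no-op writer periodically pads
// the oplog, so a reader waiting on afterClusterTime is eventually unblocked.
const rst = new ReplSetTest({
    nodes: 1,
    nodeOptions: {setParameter: {writePeriodicNoops: true, periodicNoopIntervalSecs: 10}}
});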



 Comments   
Comment by Misha Tyulenev [ 20/Jun/19 ]

The reported behavior, i.e. not incrementing the optime on error, persists and works as designed, but the command no longer hangs.

Comment by Misha Tyulenev [ 22/Nov/17 ]

I moved this ticket to our backlog. The fix is related to transactions, as in this case it may require the LogicalClock to be part of the transaction.

Comment by Misha Tyulenev [ 09/Nov/17 ]

Returning a clusterTime that has been rolled back is a non-blocking bug, as we have a way to make progress in production.

Comment by Max Hirschhorn [ 09/Nov/17 ]

This means that this blocking only impacts tests, but it won't be hit in practice, right?

Correct. I still find it odd that the server would be allowed to report an operationTime for which calling waitForAllEarlierOplogWritesToBeVisible() and then reading directly from the oplog wouldn't be guaranteed to have an entry at least as new. I think one way to get around the liveness issue when the periodic no-op writer is disabled might be to have failed operations use the last operationTime that was returned to that Client rather than using the current clusterTime.
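A hypothetical client-side analogue of that idea, for illustration only (runTracked and lastGoodOpTime are made-up names): only advance the time used for causal reads from successful responses.

// Hypothetical sketch: gate causal reads on the last operationTime from a
// *successful* response, which is guaranteed to have a matching oplog entry.
let lastGoodOpTime = null;
function runTracked(db, cmd) {
    const res = db.runCommand(cmd);
    if (res.ok && !res.hasOwnProperty("writeErrors")) {
        lastGoodOpTime = res.operationTime;
    }
    return res;
}
// A later causal read would then wait only on lastGoodOpTime, e.g.:
// db.runCommand({find: "mycoll", readConcern: {afterClusterTime: lastGoodOpTime}});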

Comment by Kaloian Manassiev [ 09/Nov/17 ]

I am not very familiar with how this works, so I will leave it to Misha to comment. However, the way I imagine it is that the find against the primary never completes because it is waiting for the persisted logical time to advance up to the one specified in the read concern's afterClusterTime (which in turn is equal to the in-memory logical time).
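In other words, a sketch of what the causally consistent session attaches implicitly (reusing the res and db handles from the repro; the explicit readConcern form below is equivalent in effect):

// The hanging find is effectively waiting like this, where the afterClusterTime
// is the advanced time taken from the failed command's response.
db.runCommand({
    find: "mycoll",
    readConcern: {afterClusterTime: res.$clusterTime.clusterTime},
    maxTimeMS: 10 * 1000
});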

In such a case, ideally the logical time returned for a rolled-back request should be the persisted one instead of the in-memory one. However, I am not sure how easy that is to do, so always running the no-op writer is the closest thing available.

max.hirschhorn, you said that "the periodic no-op writer is already enabled by default". This means that this blocking only impacts tests, but it won't be hit in practice, right?

Comment by Max Hirschhorn [ 09/Nov/17 ]

Should we enable the no-op writer by default?

kaloian.manassiev, the periodic no-op writer is already enabled by default. Prior conversations with Misha about how a secondary sends the "appendOplogNote" command to the node it currently believes to be primary led me to believe that the periodic no-op writer wasn't intended to be necessary to ensure forward progress. We recently decided to disable it in more test suites (SERVER-31637) because it masks bugs such as SERVER-31609.
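For reference, a single no-op oplog entry can also be forced by hand with the appendOplogNote command mentioned above, which is what the periodic no-op writer effectively does on a timer:

// Writes a no-op entry to the oplog, advancing the persisted time so that
// waiters on afterClusterTime can make progress.
assert.commandWorked(primary.adminCommand({appendOplogNote: 1, data: {msg: "advance ts"}}));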

Comment by Kaloian Manassiev [ 09/Nov/17 ]

In the scenario where the WT transaction rolls back, wouldn't it only be the in-memory cluster time which is advanced, whereas the persisted one would remain at whatever the last oplog entry written was?

In this case I agree that the no-op writer should run at some point in order to bring the persisted time up to the in-memory one.

I don't think we can make the optime only be advanced at the WUOW commit, because if it rolls back, we don't know what to roll it back to (i.e., in the presence of multiple concurrent writes).

misha.tyulenev - while SERVER-31888 may fix the tests, the same situation may be present in production as well. Should we enable the no-op writer by default?

Comment by Max Hirschhorn [ 09/Nov/17 ]

It may also be appropriate to title this bug "clusterTime advanced when nothing written to oplog" as command errors such as duplicate key errors still cause the clusterTime to be advanced. We use the current clusterTime as the operationTime in the server's error responses for other error cases (as part of the changes from SERVER-31306); however, I've not been able to figure out why the operationTime would later move backwards.

I've renamed the title of this ticket from "operationTime can move backwards (even when no operations are rolled back via replication)". I suspect it is permissible for the server to report that the operationTime moved backwards when a command only "happened after" from the client's perspective, because the client didn't actually specify afterClusterTime (e.g. as was the case before with the "ping" command). I've changed the steps to reproduce to demonstrate a "find" command for which waitForReadConcern() wouldn't return, because the server has advanced its clusterTime without reflecting it in the oplog.
