[SERVER-85423] CollectionScan::work() returns NEED_TIME for clustered collections which causes early yielding Created: 18/Jan/24  Updated: 22/Jan/24  Resolved: 22/Jan/24

Status: Closed
Project: Core Server
Component/s: None
Affects Version/s: None
Fix Version/s: None

Type: Bug Priority: Major - P3
Reporter: James Bronsted Assignee: James Bronsted
Resolution: Won't Fix Votes: 0
Labels: None
Remaining Estimate: Not Specified
Time Spent: Not Specified
Original Estimate: Not Specified

Issue Links:
Related
is related to SERVER-80862 Make config.transactions a clustered ... Closed
Operating System: ALL
Steps To Reproduce:

Below is a modified version of jstests/noPassthrough/read_concern_snapshot_yielding.js, stripped down to the bare minimum needed to reliably reproduce the issue. In the failure case, the test hangs indefinitely.

// Test that the read concern level 'snapshot' exhibits the correct yielding behavior. That is,
// operations performed at read concern level snapshot check for interrupt but do not yield locks or
// storage engine resources.
// @tags: [
//   uses_transactions,
// ]
import {waitForCurOpByFailPoint} from "jstests/libs/curop_helpers.js";
 
const dbName = "test";
const collName = "coll";
 
const rst = new ReplSetTest({nodes: 1});
rst.startSet();
rst.initiate();
const db = rst.getPrimary().getDB(dbName);
const adminDB = db.getSiblingDB("admin");
const coll = db.coll;
TestData.numDocs = 4;
TestData.txnNumber = 0;
 
// Set 'internalQueryExecYieldIterations' to 2 to ensure that commands yield on the second try
// (i.e. after they have established a snapshot but before they have returned any documents).
assert.commandWorked(db.adminCommand({setParameter: 1, internalQueryExecYieldIterations: 2}));
 
// Set 'internalQueryExecYieldPeriodMS' to 24 hours to significantly reduce the probability of a
// situation occurring where the execution threads do not receive enough CPU time and commands
// yield on timeout (i.e. yield period expiration) instead of on the second try, as expected given
// that 'internalQueryExecYieldIterations' is set to 2.
assert.commandWorked(db.adminCommand({setParameter: 1, internalQueryExecYieldPeriodMS: 86400000}));
 
function assertKillPending(opId) {
    const res =
        adminDB.aggregate([{$currentOp: {}}, {$match: {ns: coll.getFullName(), opid: opId}}])
            .toArray();
    assert.eq(
        res.length,
        1,
        tojson(
            adminDB.aggregate([{$currentOp: {}}, {$match: {ns: coll.getFullName()}}]).toArray()));
    assert(res[0].hasOwnProperty("killPending"), tojson(res));
    assert.eq(true, res[0].killPending, tojson(res));
}
 
function populateCollection() {
    db.coll.drop({writeConcern: {w: "majority"}});
    for (let i = 0; i < TestData.numDocs; i++) {
        assert.commandWorked(
            db.coll.insert({_id: i, x: 1, location: [0, 0]}, {writeConcern: {w: "majority"}}));
    }
 
    assert.commandWorked(db.runCommand({
        createIndexes: "coll",
        indexes: [{key: {location: "2d"}, name: "geo_2d"}],
        writeConcern: {w: "majority"}
    }));
}
 
function testCommand(awaitCommandFn, curOpFilter, testWriteConflict, setYieldIterations) {
    //
    // Test that the command does not yield locks.
    //
 
    TestData.txnNumber++;
    populateCollection();
 
    // Start a command that hangs before checking for interrupt.
    assert.commandWorked(db.adminCommand(
        {configureFailPoint: "setInterruptOnlyPlansCheckForInterruptHang", mode: "alwaysOn"}));
    let awaitCommand = startParallelShell(awaitCommandFn, rst.ports[0]);
    waitForCurOpByFailPoint(
        db, coll.getFullName(), "setInterruptOnlyPlansCheckForInterruptHang", curOpFilter);
 
    // Start a drop. This should block behind the command, since the command does not yield
    // locks.
    let awaitDrop = startParallelShell(function() {
        db.getSiblingDB("test").coll.drop({writeConcern: {w: "majority"}});
    }, rst.ports[0]);
 
    // Remove the hang. The command should complete successfully.
    assert.commandWorked(db.adminCommand(
        {configureFailPoint: "setInterruptOnlyPlansCheckForInterruptHang", mode: "off"}));
    awaitCommand();
 
    // Now the drop can complete.
    awaitDrop();
 
    //
    // Test that the command does not read data that is inserted during its execution.
    // 'awaitCommandFn' should fail if it reads the following document:
    //     {_id: <numDocs>, x: 1, new: 1, location: [0, 0]}
    //
 
    TestData.txnNumber++;
    populateCollection();
 
    // Start a command that hangs before checking for interrupt.
    assert.commandWorked(db.adminCommand(
        {configureFailPoint: "setInterruptOnlyPlansCheckForInterruptHang", mode: "alwaysOn"}));
    awaitCommand = startParallelShell(awaitCommandFn, rst.ports[0]);
    waitForCurOpByFailPoint(
        db, coll.getFullName(), "setInterruptOnlyPlansCheckForInterruptHang", curOpFilter);
 
    // Insert data that should not be read by the command.
    assert.commandWorked(db.coll.insert({_id: TestData.numDocs, x: 1, new: 1, location: [0, 0]},
                                        {writeConcern: {w: "majority"}}));
 
    // Remove the hang. The command should complete successfully.
    assert.commandWorked(db.adminCommand(
        {configureFailPoint: "setInterruptOnlyPlansCheckForInterruptHang", mode: "off"}));
    awaitCommand();
}
 
// Test delete.
testCommand(function() {
    const session = db.getMongo().startSession({causalConsistency: false});
    const sessionDb = session.getDatabase("test");
    session.startTransaction({readConcern: {level: "snapshot"}});
    const res = assert.commandWorked(sessionDb.runCommand(
        {delete: "coll", deletes: [{q: {}, limit: 1}, {q: {new: 1}, limit: 1}]}));
    assert.commandWorked(session.commitTransaction_forTesting());
    // Only remove one existing doc committed before the transaction.
    assert.eq(res.n, 1, tojson(res));
}, {op: "remove"}, true);
 
rst.stopSet();

Participants:

 Description   

The CollectionScan::work() method returns NEED_TIME when it is scanning a clustered collection and seekNear() does not position the cursor close enough to the target record: the record it lands on falls before the start of the scan range, so it is discarded. As a result, PlanExecutorImpl::_getNextImpl() continues to the next iteration of its loop, which performs another check for yielding.

In the case of jstests/noPassthrough/read_concern_snapshot_yielding.js, the test sets internalQueryExecYieldIterations to 2 and internalQueryExecYieldPeriodMS to 24 hours. The effect is that the setInterruptOnlyPlansCheckForInterruptHang failpoint is entered whenever the loop in PlanExecutorImpl::_getNextImpl() reaches its second iteration. So, if the collection being scanned is clustered, in this case config.transactions, and the scan is unable to reach the interior of the scan boundary, the failpoint is entered before the first document is made available.
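
As a standalone illustration of the scenario (this snippet is not part of the test; the collection name and values are arbitrary), a predicate on the cluster key of an explicitly clustered collection is answered by a bounded collection scan, which positions itself with seekNear() and can take the NEED_TIME path described above:

// Minimal sketch, assuming an explicitly clustered collection named "clusteredColl"
// (config.transactions is clustered by the server itself, not by the user).
const sketchDb = db.getSiblingDB("test");
sketchDb.clusteredColl.drop();
assert.commandWorked(sketchDb.createCollection(
    "clusteredColl", {clusteredIndex: {key: {_id: 1}, unique: true}}));
for (let i = 0; i < 10; i++) {
    assert.commandWorked(sketchDb.clusteredColl.insert({_id: i * 10}));
}
// A predicate on the cluster key becomes a bounded collection scan (there is no separate
// _id index). The scan positions itself with seekNear(); if the record it lands on is
// before the start of the requested range (here, seeking for 55 can land on 50),
// CollectionScan frees the working set member and returns NEED_TIME, so the yield check
// in PlanExecutorImpl::_getNextImpl() runs again before any document is produced.
const explainRes = sketchDb.clusteredColl.find({_id: {$gte: 55}}).explain("executionStats");
jsTest.log(tojson(explainRes.executionStats.executionStages));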

My changes in SERVER-80862 to make config.transactions clustered cause this test to fail nondeterministically because the failpoint is reached too early in this way. The patch below is likely unfit for production, but it demonstrates the problem:

diff --git a/src/mongo/db/exec/collection_scan.cpp b/src/mongo/db/exec/collection_scan.cpp
index 4e88a1cc4fb..69362d2cc9f 100644
--- a/src/mongo/db/exec/collection_scan.cpp
+++ b/src/mongo/db/exec/collection_scan.cpp
@@ -156,6 +156,7 @@ CollectionScan::CollectionScan(ExpressionContext* expCtx,
 }
 
 PlanStage::StageState CollectionScan::doWork(WorkingSetID* out) {
+    _needTimeDueToClusteredCollection = false;
     if (_commonStats.isEOF) {
         _priority.reset();
         return PlanStage::IS_EOF;
@@ -474,6 +475,8 @@ PlanStage::StageState CollectionScan::returnIfMatches(WorkingSetMember* member,
     // mark their position for resuming the tailable scan later on.
     if (beforeStartOfRange(_params, *member)) {
         _workingSet->free(memberID);
+        LOGV2(8086240, "setting PlanStage::NEED_TIME bc clustered collection didn't seekNear close enough");
+        _needTimeDueToClusteredCollection = true;
         return PlanStage::NEED_TIME;
     }
 
diff --git a/src/mongo/db/exec/plan_stage.h b/src/mongo/db/exec/plan_stage.h
index bdf48637c72..4a2ae06c4a0 100644
--- a/src/mongo/db/exec/plan_stage.h
+++ b/src/mongo/db/exec/plan_stage.h
@@ -363,6 +363,11 @@ public:
         _commonStats.executionTime.emplace(0);
     }
 
+    bool needTimeDueToClusteredCollection() const {
+        return _needTimeDueToClusteredCollection;
+    }
+    bool _needTimeDueToClusteredCollection{false};
+
 protected:
     /**
      * Performs one unit of work.  See comment at work() above.
diff --git a/src/mongo/db/query/plan_executor_impl.cpp b/src/mongo/db/query/plan_executor_impl.cpp
index 0828f77067c..c68d29388c2 100644
--- a/src/mongo/db/query/plan_executor_impl.cpp
+++ b/src/mongo/db/query/plan_executor_impl.cpp
@@ -415,7 +415,7 @@ PlanExecutor::ExecState PlanExecutorImpl::_getNextImpl(Snapshotted<Document>* ob
             }
         };
 
-        if (_yieldPolicy->shouldYieldOrInterrupt(_opCtx)) {
+        if (!_root->needTimeDueToClusteredCollection() && _yieldPolicy->shouldYieldOrInterrupt(_opCtx)) {
             uassertStatusOK(_yieldPolicy->yieldOrInterrupt(_opCtx, whileYieldingFn));
         }

With this patch, the failure is no longer reproducible. I suspect that this is a bona fide bug in the query code, since it seems wrong to me that yielding would occur more often because a collection is clustered (note that the issue never occurs when the collection is nonclustered).
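
As a rough illustration of the clustered/nonclustered asymmetry (a sketch with arbitrary names, not taken from the ticket): a cluster-key predicate on a clustered collection is answered by a bounded collection scan, whereas a nonclustered collection has a separate _id index available for the same predicate, so CollectionScan is not forced into the picture.

// Sketch comparing plan shapes for the same predicate (collection names are arbitrary).
const cmpDb = db.getSiblingDB("test");
cmpDb.clusteredCmp.drop();
cmpDb.plainCmp.drop();
assert.commandWorked(cmpDb.createCollection(
    "clusteredCmp", {clusteredIndex: {key: {_id: 1}, unique: true}}));
assert.commandWorked(cmpDb.createCollection("plainCmp"));
assert.commandWorked(cmpDb.clusteredCmp.insert({_id: 1}));
assert.commandWorked(cmpDb.plainCmp.insert({_id: 1}));

// Clustered: the _id predicate is answered by a bounded collection scan, which goes
// through CollectionScan and can therefore produce the extra NEED_TIME iterations.
jsTest.log(tojson(cmpDb.clusteredCmp.find({_id: {$gte: 0}}).explain().queryPlanner.winningPlan));

// Nonclustered: a separate _id index exists, so the predicate does not have to be
// answered by CollectionScan; per the observation above, the issue never reproduces here.
jsTest.log(tojson(cmpDb.plainCmp.find({_id: {$gte: 0}}).explain().queryPlanner.winningPlan));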



 Comments   
Comment by James Bronsted [ 22/Jan/24 ]

Closing in favor of SERVER-85465. The implementation of that ticket will ensure that only one API call will be necessary to seek to the start of a collection scan, which eliminates the early returns from CollectionScan::work().
