Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-99002

$collStats collection scan counter does not account for repeated scans done by SBE NLJ plans

    • Type: Icon: Bug Bug
    • Resolution: Unresolved
    • Priority: Icon: Minor - P4 Minor - P4
    • None
    • Affects Version/s: None
    • Component/s: None
    • None
    • Query Execution
    • ALL
    • Hide

      Repro script which can run against a standalone mongod:

      import {getPlanStage} from "jstests/libs/query/analyze_plan.js";
      
      // Start from a clean slate: drop both collections used by this repro.
      db.outer.drop();
      db.inner.drop();
      
      // Four documents on the outer side and one on the inner side; only key 42 is present in both.
      db.outer.insert([{key: 41}, {key: 42}, {key: 43}, {key: 44}]);
      db.inner.insert({key: 42});
      
      // Number of documents on the outer side. Later in the script this is used as the expected
      // number of collection scans performed on the inner side.
      const outerSideCount = db.outer.count();
      
      // Disable hash join in order to force NLJ.
      assert.commandWorked(
          db.adminCommand({setParameter: 1, internalQueryDisableLookupExecutionUsingHashJoin: true}));
      
      /**
       * Reads the collection scan counter of the "inner" collection.
       *
       * Runs a $collStats aggregation with `queryExecStats` against `db.inner`, asserts that exactly
       * one stats document comes back, and returns that document's
       * `queryExecStats.collectionScans.total` value.
       */
      function getInnerSideCollScans() {
          const statsCursor = db.inner.aggregate([{$collStats: {queryExecStats: {}}}]);
          const statsDocs = statsCursor.toArray();
          // Exactly one stats document is expected; anything else fails the assertion.
          assert.eq(statsDocs.length, 1);
          const [{queryExecStats}] = statsDocs;
          return queryExecStats.collectionScans.total;
      }
      
      // Baseline: the inner collection's scan counter before the $lookup pipeline is executed.
      const innerCollScansBefore = getInnerSideCollScans();
      
      // Equality $lookup from "outer" into "inner", matching on the "key" field of both sides.
      const pipeline = [{$lookup: {from: "inner", as: "as", localField: "key", foreignField: "key"}}];
      
      // Validate that we're getting an SBE NLJ plan.
      let explain = db.outer.explain().aggregate(pipeline);
      assert.eq(explain.explainVersion, "2");  // Version "2" indicates SBE.
      let lookupStage = getPlanStage(explain.queryPlanner.winningPlan, "EQ_LOOKUP");
      assert.eq(lookupStage.strategy, "NestedLoopJoin");
      
      // Execute the pipeline. We expect the plan to have performed as many collection scans on the inner
      // side as there are documents in the outer collection.
      assert.eq(outerSideCount, db.outer.aggregate(pipeline).itcount());
      const innerCollScansAfter = getInnerSideCollScans();
      // This is the assertion that demonstrates the bug: per the ticket description, the counter is
      // incremented just once for the entire query rather than once per scan of the inner side.
      assert.eq(innerCollScansBefore + outerSideCount, innerCollScansAfter);
      
      Show
      Repro script which can run against a standalone mongod: import {getPlanStage} from "jstests/libs/query/analyze_plan.js" ; db. outer .drop(); db. inner .drop(); db. outer .insert([{key: 41}, {key: 42}, {key: 43}, {key: 44}]); db. inner .insert({key: 42}); const outerSideCount = db. outer .count(); // Disable hash join in order to force NLJ. assert .commandWorked( db.adminCommand({setParameter: 1, internalQueryDisableLookupExecutionUsingHashJoin: true })); function getInnerSideCollScans() { let collStatsArr = db. inner .aggregate([{$collStats: {queryExecStats: {}}}]).toArray(); assert .eq(collStatsArr.length, 1); return collStatsArr[0].queryExecStats.collectionScans.total; } const innerCollScansBefore = getInnerSideCollScans(); const pipeline = [{$lookup: {from: " inner " , as: "as" , localField: "key" , foreignField: "key" }}]; // Validate that we're getting an SBE NLJ plan. let explain = db. outer .explain().aggregate(pipeline); assert .eq(explain.explainVersion, "2" ); // Version "2" indicates SBE. let lookupStage = getPlanStage(explain.queryPlanner.winningPlan, "EQ_LOOKUP" ); assert .eq(lookupStage.strategy, "NestedLoopJoin" ); // Execute the pipeline. We expect the plan to have performed as many collection scans on the inner // side as there are documents in the outer collection. assert .eq(outerSideCount, db. outer .aggregate(pipeline).itcount()); const innerCollScansAfter = getInnerSideCollScans(); assert .eq(innerCollScansBefore + outerSideCount, innerCollScansAfter);

      SBE supports nested loop join (NLJ) plans for executing $lookup queries. Such plans will perform a collection scan of the inner collection for every document read from the $lookup's outer side. Following my example from the attached repro script, imagine a $lookup from an "outer" collection to an "inner" collection on the "key" field, where each collection contains the following documents:

      // Contents of "outer"
      {key: 41}
      {key: 42}
      {key: 43}
      {key: 44}
      
      // Contents of "inner"
      {key: 42}
      

      There are 4 documents on the outer side. Therefore, an NLJ plan would scan the inner collection 4 times.

      We report the number of collection scans executed over a collection as part of the queryExecStats field of the $collStats output. However, for an SBE NLJ plan such as the one I described above, the number of collection scans will get incremented just once for the entire query rather than once for each scan of the inner side that is actually performed at runtime.

      It's unclear whether this was intentional or not, but from a diagnostic perspective it seems more useful to know the total number of scans of the data actually performed, rather than the number of queries that may each perform one or more collection scans.

            Assignee:
            Unassigned Unassigned
            Reporter:
            david.storch@mongodb.com David Storch
            Votes:
            0 Vote for this issue
            Watchers:
            2 Start watching this issue

              Created:
              Updated: