[SERVER-65406] Time series query plan results with 2dsphere index do not match collection scan Created: 08/Apr/22  Updated: 29/Oct/23  Resolved: 20/Apr/22

Status: Closed
Project: Core Server
Component/s: None
Affects Version/s: None
Fix Version/s: 6.0.0-rc5, 6.1.0-rc0

Type: Bug Priority: Major - P3
Reporter: Matt Boros Assignee: Dan Larkin-York
Resolution: Fixed Votes: 0
Labels: None
Remaining Estimate: Not Specified
Time Spent: Not Specified
Original Estimate: Not Specified

Issue Links:
Backports
Problem/Incident
Backwards Compatibility: Fully Compatible
Operating System: ALL
Backport Requested:
v6.0
Sprint: QO 2022-04-18, QO 2022-05-02
Participants:
Linked BF Score: 135

 Description   

Having a 2dsphere index on a time series collection can leave out results. Note that it starts to fail at 1001 documents, and a bucket can hold 1000 documents at most.

 

(function() {
"use strict";
 
load("jstests/core/timeseries/libs/timeseries.js");
 
const coll = db.collection;
coll.drop();
assert.commandWorked(
    db.createCollection(coll.getName(), {timeseries: {timeField: 'timestamp', metaField: 'meta'}}));
 
Random.srand(1);
function random(min, max) {
    return Random.rand() * (max - min + 1) + min;
}
 
function compareResultsWithAndWithoutIndex(){
    assert.commandWorked(coll.createIndex({location: '2dsphere'}));
 
    const pipeline = [{
        "$match": {
            "location":
                {"$geoWithin": {"$centerSphere": [[-78.64332757901981, 42.04289085325348], 0.001]}}
        }
    }]
 
    const results1 = coll.aggregate(pipeline).toArray();
    jsTestLog("results1")
    jsTestLog(results1);
 
    assert.commandWorked(coll.dropIndexes());
 
    const results2 = coll.aggregate(pipeline).toArray();
    jsTestLog("results2")
    jsTestLog(results2);
 
    assert.eq(results1, results2);
}
 
for (let i = 0; i < 1001; ++i) {
    const doc = {
        timestamp: new Date(),
        meta: 1,
        location: [random(-80.0, -73.0), random(40.0, 45.0)]
    };
    assert.commandWorked(coll.insert(doc));
}
compareResultsWithAndWithoutIndex();
})();

 



 Comments   
Comment by Githook User [ 06/May/22 ]

Author:

{'name': 'Dan Larkin-York', 'email': 'dan.larkin-york@mongodb.com', 'username': 'dhly-etc'}

Message: SERVER-65406 Handle compressed buckets in timeseries dotted path support library
Branch: v6.0
https://github.com/mongodb/mongo/commit/c4577480b324634daf96ce685faf73f106f33e0d

Comment by Githook User [ 20/Apr/22 ]

Author:

{'name': 'Dan Larkin-York', 'email': 'dan.larkin-york@mongodb.com', 'username': 'dhly-etc'}

Message: SERVER-65406 Handle compressed buckets in timeseries dotted path support library
Branch: master
https://github.com/mongodb/mongo/commit/22228f499e9062cbffa748c7c818dffbb81a096e

Comment by David Percy [ 09/Apr/22 ]

Another observation: the insertion order affects whether the test passes:

diff --git a/jstests/core/timeseries/geo-repro.js b/jstests/core/timeseries/geo-repro.js
index 65b5af31646..f0105557b23 100644
--- a/jstests/core/timeseries/geo-repro.js
+++ b/jstests/core/timeseries/geo-repro.js
@@ -51,7 +51,7 @@ const docs = [
 ];
 
 // Insert docs.
-for (const doc of docs) {
+for (const doc of docs.reverse()) {
     assert.commandWorked(coll.insert(doc));
 }
 // Make sure we understand how the events were bucketed.

This is surprising because the insertion order does not affect how the events are grouped into buckets (in this example).

Comment by David Percy [ 08/Apr/22 ]

We were able to shrink the test data down to 4 documents:

(function() {
"use strict";
 
load("jstests/core/timeseries/libs/timeseries.js");
 
const coll = db.collection;
coll.drop();
assert.commandWorked(
    db.createCollection(coll.getName(), {timeseries: {timeField: 'timestamp', metaField: 'meta'}}));
const buckets = db['system.buckets.' + coll.getName()];
 
// Generate docs.
const docs = [
    {
        "timestamp" : ISODate("1970-01-01T00:00:00.802Z"),
        "meta" : 1,
        "_id" : ObjectId("6250c6be1cbfd664e946abef"),
        "location" : [
                -74.09678818608975,
                41.00417341402175
        ]
    },
    {
        "timestamp" : ISODate("1970-01-01T00:00:00.803Z"),
        "meta" : 1,
        "_id" : ObjectId("6250c6be1cbfd664e946abf1"),
        "location" : [
                -79.12624790775983,
                44.06398255741807
        ]
    },
    {
        "timestamp" : ISODate("1970-01-01T00:00:00.804Z"),
        "meta" : 1,
        "_id" : ObjectId("6250c6be1cbfd664e946abf3"),
        "location" : [
                -78.60183601155738,
                42.07190152236784
        ]
    },
    {
        // This last timestamp is 10 hours ahead, so it will end up in a separate bucket.
        "timestamp" : ISODate("1970-01-01T10:00:00Z"),
        "meta" : 1,
        "_id" : ObjectId("6250c6be1cbfd664e946abf5"),
        "location" : [
                -76.77221367322493,
                44.89876182362717
        ]
    }
];
 
// Insert docs.
for (const doc of docs) {
    assert.commandWorked(coll.insert(doc));
}
assert.eq(2, buckets.count());
 
assert.commandWorked(coll.createIndex({location: '2dsphere'}));
 
const pipeline = [{
    "$match": {
        "location":
            {"$geoWithin": {"$centerSphere": [[-78.64332757901981, 42.04289085325348], 0.001]}}
    }
}]
 
const results1 = coll.aggregate(pipeline).toArray();
jsTestLog("results1")
jsTestLog(results1);
 
const results2 = coll.aggregate(pipeline, {hint: {$natural: 1}}).toArray();
jsTestLog("results2")
jsTestLog(results2);
 
assert.eq(results1, results2);
})();

Another strange observation: there are 2 buckets, with the same metadata value. The 'meta' field seems irrelevant to the query. But if you give them different meta values:

diff --git a/jstests/core/timeseries/geo-repro.js b/jstests/core/timeseries/geo-repro.js
index f318293c44c..ed87ef48674 100644
--- a/jstests/core/timeseries/geo-repro.js
+++ b/jstests/core/timeseries/geo-repro.js
@@ -41,7 +41,7 @@ const docs = [
     {
         // This last timestamp is 10 hours ahead, so it will end up in a separate bucket.
         "timestamp" : ISODate("1970-01-01T10:00:00Z"),
-        "meta" : 1,
+        "meta" : 2,
         "_id" : ObjectId("6250c6be1cbfd664e946abf5"),
         "location" : [
                 -76.77221367322493,

then the test passes.

Generated at Thu Feb 08 06:02:39 UTC 2024 using Jira 9.7.1#970001-sha1:2222b88b221c4928ef0de3161136cc90c8356a66.