Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-10394

when using a generic index over a list of subobjects, $elemMatch query does not properly apply bounds across data types

    • Query Optimization
    • ALL

      If you create objects of this kind:

      > for (var i = 0; i < 5000000; ++i) { var arr = []; for (var j = 0; j < 10; ++j) { var doc = {}; doc["prop" + j] =  Math.floor(Math.random() * 1000); arr.push(doc) }; db.generic2.insert({props: arr}) }
      > db.generic2.findOne()
      {
        "_id": ObjectId("515e5e6a71b0722678929760"),
        "props": [
          {
            "prop0": 881
          },
          {
            "prop1": 47
          },
      ...
          {
            "prop9": 717
          }
        ]
      }
      > db.generic2.ensureIndex({props: 1})
      > db.generic2.stats()
      {
      	"ns" : "test.generic2",
      	"count" : 860171,
      	"size" : 233966528,
      	"avgObjSize" : 272.0000186009526,
      	"storageSize" : 288092160,
      	"numExtents" : 14,
      	"nindexes" : 3,
      	"lastExtentSize" : 78135296,
      	"paddingFactor" : 1,
      	"systemFlags" : 1,
      	"userFlags" : 0,
      	"totalIndexSize" : 424882192,
      	"indexSizes" : {
      		"_id_" : 27921040,
      		"props_1" : 382236176,
      		"a_1" : 14724976
      	},
      	"ok" : 1
      }
      

      Then you can very efficiently find docs with a "prop1" between 2 values:

      > db.generic2.find({props:{$elemMatch:{$gt:{ prop1: 800}, $lt: {prop1: 900}}}}).explain()
      {
      	"cursor" : "BtreeCursor props_1",
      	"isMultiKey" : true,
      	"n" : 85584,
      	"nscannedObjects" : 85584,
      	"nscanned" : 85584,
      	"nscannedObjectsAllPlans" : 85584,
      	"nscannedAllPlans" : 85584,
      	"scanAndOrder" : false,
      	"indexOnly" : false,
      	"nYields" : 0,
      	"nChunkSkips" : 0,
      	"millis" : 364,
      	"indexBounds" : {
      		"props" : [
      			[
      				{
      					"prop1" : 800
      				},
      				{
      					"prop1" : 900
      				}
      			]
      		]
      	},
      	"server" : "agmac.local:27017"
      }
      

      It scanned and matched 85k items which is correct (10% of all docs). If you want to find docs that have "prop1" above a certain value, using a high number as boundary works:

      > db.generic2.find({props:{$elemMatch:{$gt:{ prop1: 800}, $lt: {prop1: 999999}}}}).explain()
      {
      	"cursor" : "BtreeCursor props_1",
      	"isMultiKey" : true,
      	"n" : 171705,
      	"nscannedObjects" : 171705,
      	"nscanned" : 171705,
      	"nscannedObjectsAllPlans" : 171705,
      	"nscannedAllPlans" : 171705,
      	"scanAndOrder" : false,
      	"indexOnly" : false,
      	"nYields" : 0,
      	"nChunkSkips" : 0,
      	"millis" : 776,
      	"indexBounds" : {
      		"props" : [
      			[
      				{
      					"prop1" : 800
      				},
      				{
      					"prop1" : 999999
      				}
      			]
      		]
      	},
      	"server" : "agmac.local:27017"
      }
      

      This works, it scanned and matched 171k docs (about 20% of total). But if value boundaries are not known, one would normally use MinKey/Maxkey:

      > db.generic2.find({props:{$elemMatch:{$gt:{ prop1: 800}, $lt: {prop1: MaxKey}}}}).explain()
      {
      	"cursor" : "BtreeCursor props_1",
      	"isMultiKey" : true,
      	"n" : 860171,
      	"nscannedObjects" : 7053073,
      	"nscanned" : 7053073,
      	"nscannedObjectsAllPlans" : 7053073,
      	"nscannedAllPlans" : 7053073,
      	"scanAndOrder" : false,
      	"indexOnly" : false,
      	"nYields" : 36,
      	"nChunkSkips" : 0,
      	"millis" : 35096,
      	"indexBounds" : {
      		"props" : [
      			[
      				{
      					"prop1" : 800
      				},
      				{
      					"prop1" : {
      						"$MaxKey" : true
      					}
      				}
      			]
      		]
      	},
      	"server" : "agmac.local:27017"
      }
      

      This result is completely wrong: it scans the whole index and matches every document. It's like the matcher is thrown off by the MaxKey.. If instead using a boundary of the next key with minimum value, it works fine:

      > db.generic2.find({props:{$elemMatch:{$gt:{ prop1: 800}, $lt: {prop2: 0}}}}).explain()
      {
      	"cursor" : "BtreeCursor props_1",
      	"isMultiKey" : true,
      	"n" : 171705,
      	"nscannedObjects" : 171705,
      	"nscanned" : 171705,
      	"nscannedObjectsAllPlans" : 171705,
      	"nscannedAllPlans" : 171705,
      	"scanAndOrder" : false,
      	"indexOnly" : false,
      	"nYields" : 1,
      	"nChunkSkips" : 0,
      	"millis" : 753,
      	"indexBounds" : {
      		"props" : [
      			[
      				{
      					"prop1" : 800
      				},
      				{
      					"prop2" : 0
      				}
      			]
      		]
      	},
      	"server" : "agmac.local:27017"
      }
      

      Then tested with strings, and result shows that the problem is not just with MinKey/MaxKey, but whenever a different type is used for the value. This works well:

      > db.generic2.find({"props": { $elemMatch: { $gte: {"prop1": "aaaa"}, $lt: {"prop2": "zzzz"} }}})
      

      This scans and matches all documents:

      > db.generic2.find({"props": { $elemMatch: { $gte: {"prop1": ""}, $lt: {"prop2": {}} }}}).explain()
      

      It makes it difficult to use this kind of index since boundaries are often not known.
      This points to something weird in the index boundaries and BSON comparisons when the types are different.

            Assignee:
            backlog-query-optimization [DO NOT USE] Backlog - Query Optimization
            Reporter:
            antoine Antoine Girbal
            Votes:
            0 Vote for this issue
            Watchers:
            8 Start watching this issue

              Created:
              Updated: