[SERVER-1904] map-reduce produces different results for an indexed query Created: 06/Oct/10  Updated: 12/Jul/16  Resolved: 09/Nov/10

Status: Closed
Project: Core Server
Component/s: Index Maintenance
Affects Version/s: 1.6.2
Fix Version/s: 1.7.3

Type: Bug Priority: Major - P3
Reporter: Ian Ragsdale Assignee: Eliot Horowitz (Inactive)
Resolution: Done Votes: 1
Labels: None
Remaining Estimate: Not Specified
Time Spent: Not Specified
Original Estimate: Not Specified
Environment:

Ubuntu 10.04 on EC2


Backwards Compatibility: Minor Change
Operating System: ALL
Participants:

 Description   

I've been investigating some unusual numbers in my map-reduce results
and made an interesting discovery. If the map reduce query is on an
indexed array, and more than one value in the array matches the query,
the document is mapped more than once. A simple example is below:
// make sure the collection is empty
> db.example.drop()
true
> db.example.save({ arr : [1, 2] })

// just aggregate by _id
> map = function() { emit(this._id, 1) }

function () {
emit(this._id, 1);
}

// count the values
> reduce = function(k,vals) {
... var sum=0;
... for(var i in vals) sum += vals[i];
... return sum;
... }


function (k, vals) {
var sum = 0;
for (var i in vals) {
sum += vals[i];
}
return sum;
}

// the first M/R finds the document once & produces the correct count
> res = db.example.mapReduce(map,reduce, { query : {} })

{
"result" : "tmp.mr.mapreduce_1286336126_70",
"timeMillis" : 16,
"counts" : { "input" : 1, "emit" : 1, "output" : 1 },
"ok" : 1,
}
> db[res.result].find()

{ "_id" : ObjectId("4cabee4ac0f7095167a5ab62"), "value" : 1 }

// the second query matches the array without an index, and still
// produces expected results
> res = db.example.mapReduce(map,reduce, { query : { arr: {$gte:0} } })

{
"result" : "tmp.mr.mapreduce_1286336141_71",
"timeMillis" : 12,
"counts" : { "input" : 1, "emit" : 1, "output" : 1 },
"ok" : 1,
}
> db[res.result].find()

{ "_id" : ObjectId("4cabee4ac0f7095167a5ab62"), "value" : 1 }

// now index on the array and run the exact same M/R - note that it
// now has 2 inputs & 2 emits, and the count has doubled
> db.example.ensureIndex({arr:1})
> res = db.example.mapReduce(map,reduce, { query : { arr: {$gte:0} } })

{
"result" : "tmp.mr.mapreduce_1286336171_72",
"timeMillis" : 15,
"counts" : { "input" : 2, "emit" : 2, "output" : 1 },
"ok" : 1,
}
> db[res.result].find()

{ "_id" : ObjectId("4cabee4ac0f7095167a5ab62"), "value" : 2 }

This seems bad - is this expected behavior?



 Comments   
Comment by auto [ 09/Nov/10 ]

Author:

{'login': 'erh', 'name': 'Eliot Horowitz', 'email': 'eliot@10gen.com'}

Message: test for SERVER-1904
/mongodb/mongo/commit/d14cb30a7a1041e7e3f57f3837cf270f0bbd86d2

Comment by auto [ 09/Nov/10 ]

Author:

{'login': 'erh', 'name': 'Eliot Horowitz', 'email': 'eliot@10gen.com'}

Message: check for dups on multikey indexes SERVER-1904
/mongodb/mongo/commit/31a3c0bc0f41db8b9146ca5af019d7da91675dab

Generated at Thu Feb 08 02:58:24 UTC 2024 using Jira 9.7.1#970001-sha1:2222b88b221c4928ef0de3161136cc90c8356a66.