Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-1904

map-reduce produces different results for an indexed query

    XMLWordPrintable

    Details

    • Type: Bug
    • Status: Closed
    • Priority: Major - P3
    • Resolution: Fixed
    • Affects Version/s: 1.6.2
    • Fix Version/s: 1.7.3
    • Component/s: Index Maintenance
    • Labels:
      None
    • Environment:
      Ubuntu 10.0.4 on EC2
    • Backwards Compatibility:
      Minor Change
    • Operating System:
      ALL

      Description

      I've been investigating some unusual numbers in my map-reduce results
      and made an interesting discovery. If the map-reduce query is on an
      indexed array field, and more than one value in the array matches the
      query, the document is mapped more than once. A simple example is below:
      // make sure the collection is empty
      > db.example.drop()
      true
      > db.example.save({ arr : [1, 2] })

      // just aggregate by _id
      > map = function() { emit(this._id, 1) }

      function () {
      emit(this._id, 1);
      }

      // count the values
      > reduce = function(k,vals) {
      ... var sum=0;
      ... for(var i in vals) sum += vals[i];
      ... return sum;
      ... }


      function (k, vals) {
          var sum = 0;
          for (var i in vals) {
              sum += vals[i];
          }
          return sum;
      }

      // the first M/R finds the document once & produces the correct count
      > res = db.example.mapReduce(map,reduce, { query : {} })

      {
          "result" : "tmp.mr.mapreduce_1286336126_70",
          "timeMillis" : 16,
          "counts" : { "input" : 1, "emit" : 1, "output" : 1 },
          "ok" : 1,
      }
      > db[res.result].find()

      { "_id" : ObjectId("4cabee4ac0f7095167a5ab62"), "value" : 1 }

      // the second query matches the array without an index, and still
      // produces the expected results
      > res = db.example.mapReduce(map,reduce, { query : { arr: {$gte:0} } })

      {
          "result" : "tmp.mr.mapreduce_1286336141_71",
          "timeMillis" : 12,
          "counts" : { "input" : 1, "emit" : 1, "output" : 1 },
          "ok" : 1,
      }
      > db[res.result].find()

      { "_id" : ObjectId("4cabee4ac0f7095167a5ab62"), "value" : 1 }

      // now index on the array and run the exact same M/R - note that it
      // now has 2 inputs & 2 emits, and the count has doubled
      > db.example.ensureIndex({arr:1})
      > res = db.example.mapReduce(map,reduce, { query : { arr: {$gte:0} } })

      {
          "result" : "tmp.mr.mapreduce_1286336171_72",
          "timeMillis" : 15,
          "counts" : { "input" : 2, "emit" : 2, "output" : 1 },
          "ok" : 1,
      }
      > db[res.result].find()

      { "_id" : ObjectId("4cabee4ac0f7095167a5ab62"), "value" : 2 }

      This seems bad - is this expected behavior?

        Attachments

          Activity

            People

            Assignee:
            eliot Eliot Horowitz (Inactive)
            Reporter:
            iragsdale Ian Ragsdale
            Participants:
            Votes:
            1 Vote for this issue
            Watchers:
            2 Start watching this issue

              Dates

              Created:
              Updated:
              Resolved: