Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-22060

Sharded mapReduce output option {replace: "coll", sharded: true} can create an invalid sharded collection

    • Type: Bug
    • Resolution: Duplicate
    • Priority: Major - P3
    • None
    • Affects Version/s: None
    • Component/s: Sharding
    • Labels:
      None
    • Sharding
    • ALL
    • Hide
      // Repro script: runs a mapReduce with out: {replace: "output", sharded: true} against
      // an existing collection "test.output" that is already sharded on {c: 1}, then counts
      // the documents visible through mongos.
      (function() {
          'use strict';
      
          // Two-shard cluster; every command below goes through the mongos (st.s).
          var st = new ShardingTest({shards: 2});
          var sdb = st.s.getDB("test");
          var inputColl = sdb.input;
          var outputColl = sdb.output;
      
          assert.commandWorked(sdb.adminCommand({enableSharding: sdb.getName()}));
          // NOTE(review): unlike the surrounding commands, this result is not asserted --
          // presumably because movePrimary may report an error when "shard0000" is already
          // the primary shard; confirm before wrapping it in assert.commandWorked.
          sdb.adminCommand({movePrimary: sdb.getName(), to: "shard0000"});
      
          // Input documents: two distinct values of `a` (1 and 2), two documents each.
          assert.writeOK(inputColl.insert({a: 1, b: 5}));
          assert.writeOK(inputColl.insert({a: 1, b: 6}));
          assert.writeOK(inputColl.insert({a: 2, b: 7}));
          assert.writeOK(inputColl.insert({a: 2, b: 8}));
      
          // Create a sharded collection with a chunk on each shard.
          // The {c: 1} index is built first, then the collection is sharded on that same key.
          assert.commandWorked(outputColl.ensureIndex({c: 1}));
          assert.writeOK(outputColl.insert({c: -1}));
          assert.writeOK(outputColl.insert({c: 1}));
          assert.commandWorked(sdb.adminCommand({
              shardCollection: outputColl.getFullName(),
              key: {c: 1}
          }));
          // Split at c = 0 so that {c: -1} and {c: 1} fall into different chunks, then move
          // the chunk holding {c: 1} to the second shard.
          assert.commandWorked(sdb.adminCommand({split: outputColl.getFullName(), middle: {c: 0}}));
          assert.commandWorked(sdb.adminCommand({
              moveChunk: outputColl.getFullName(),
              find: {c: 1},
              to: "shard0001"
          }));
      
          // This creates a collection sharded by {c: 1}, whose documents do not contain the shard key.
          // The map step emits (a, b) and the reduce step sums the emitted values per key.
          // NOTE(review): the command result is not checked, so a failed mapReduce would only
          // surface through the count assertion at the end.
          sdb.runCommand({
              mapReduce: inputColl.getName(),
              map: function() {
                  emit(this.a, this.b);
              },
              reduce: function(key, values) {
                  var sum = 0;
                  for (var i = 0; i < values.length; i++) {
                      sum += values[i];
                  }
                  return sum;
              },
              out: {replace: "output", sharded: true}
          });
      
          // Two distinct keys were emitted (a = 1 and a = 2), so the replaced collection is
          // expected to expose exactly two documents when read through mongos.
          assert.eq(2, outputColl.find().itcount());
      })();
      
      Show
      // Repro script: runs a mapReduce with out: {replace: "output", sharded: true} against
      // an existing collection "test.output" that is already sharded on {c: 1}, then counts
      // the documents visible through mongos.
      //
      // The statements are laid out one per line on purpose: when the whole script sits on a
      // single line, the first `//` comment swallows all of the code that follows it.
      (function() {
          'use strict';

          // Two-shard cluster; every command below goes through the mongos (st.s).
          var st = new ShardingTest({shards: 2});
          var sdb = st.s.getDB("test");
          var inputColl = sdb.input;
          var outputColl = sdb.output;

          assert.commandWorked(sdb.adminCommand({enableSharding: sdb.getName()}));
          // NOTE(review): this result is intentionally left unasserted, as in the original
          // script -- presumably movePrimary may report an error when "shard0000" is already
          // the primary shard; confirm before asserting on it.
          sdb.adminCommand({movePrimary: sdb.getName(), to: "shard0000"});

          // Input documents: two distinct values of `a` (1 and 2), two documents each.
          assert.writeOK(inputColl.insert({a: 1, b: 5}));
          assert.writeOK(inputColl.insert({a: 1, b: 6}));
          assert.writeOK(inputColl.insert({a: 2, b: 7}));
          assert.writeOK(inputColl.insert({a: 2, b: 8}));

          // Create a sharded collection with a chunk on each shard.
          assert.commandWorked(outputColl.ensureIndex({c: 1}));
          assert.writeOK(outputColl.insert({c: -1}));
          assert.writeOK(outputColl.insert({c: 1}));
          assert.commandWorked(sdb.adminCommand({
              shardCollection: outputColl.getFullName(),
              key: {c: 1}
          }));
          // Split at c = 0 so that {c: -1} and {c: 1} fall into different chunks, then move
          // the chunk holding {c: 1} to the second shard.
          assert.commandWorked(sdb.adminCommand({split: outputColl.getFullName(), middle: {c: 0}}));
          assert.commandWorked(sdb.adminCommand({
              moveChunk: outputColl.getFullName(),
              find: {c: 1},
              to: "shard0001"
          }));

          // This creates a collection sharded by {c: 1}, whose documents do not contain the shard key.
          // The map step emits (a, b) and the reduce step sums the emitted values per key.
          sdb.runCommand({
              mapReduce: inputColl.getName(),
              map: function() {
                  emit(this.a, this.b);
              },
              reduce: function(key, values) {
                  var sum = 0;
                  for (var i = 0; i < values.length; i++) {
                      sum += values[i];
                  }
                  return sum;
              },
              out: {replace: "output", sharded: true}
          });

          // Two distinct keys were emitted (a = 1 and a = 2), so the replaced collection is
          // expected to expose exactly two documents when read through mongos.
          assert.eq(2, outputColl.find().itcount());
      })();

      Suppose you have a collection "coll" sharded by some non-_id shard key, {shardKey: 1}. Users are allowed to run a mapReduce operation with the output option {replace: "coll", sharded: true}. This means that "coll" should be replaced by the output of the mapReduce, and that the new collection should be sharded.

      New sharded collections created by mapReduce are sharded by {_id: 1}, an assumption which is made in several places in the code. During the replace, however, the collection is never re-sharded by {_id: 1}. Instead, the sharding metadata continues to show that the shard key is {shardKey: 1}.

      The documents in the mapReduce output collection are of the form

      {_id: <key>, value: <value>}
      

      These documents are missing the shard key! The collection at this point is broken and queries against it can return incorrect results. See the repro steps for details.

            Assignee:
            backlog-server-sharding [DO NOT USE] Backlog - Sharding Team
            Reporter:
            david.storch@mongodb.com David Storch
            Votes:
            0 Vote for this issue
            Watchers:
            5 Start watching this issue

              Created:
              Updated:
              Resolved: