Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-22771

multiUpdate and multiRemove ops from a stale mongos may cause repeated work

    XMLWordPrintable

    Details

    • Type: Bug
    • Status: Closed
    • Priority: Major - P3
    • Resolution: Duplicate
    • Affects Version/s: 3.3.1
    • Fix Version/s: None
    • Component/s: Sharding
    • Labels:
      None
    • Backwards Compatibility:
      Fully Compatible
    • Operating System:
      ALL
    • Steps To Reproduce:
      Hide

      The following tests demonstrate the behavior:

      multiUpdate:

      /*
      Tests that a multiUpdate updates all matching documents even if sent from a stale mongos.
      */
      // Start sharding test with 10 documents in sharded collection 'test.foo'
      var st = new ShardingTest({shards: 2, mongos: 2, other: { noAutoSplit : ""}});
      var dbName = 'test';
      var collNS = dbName + '.foo';
      assert.commandWorked(st.s.adminCommand(
      { enableSharding: dbName }
      ));
      assert.commandWorked(st.s.adminCommand({ shardCollection: collNS, key: { _id: 1 }}));
      var numDocs = 10;
      for (var i=0; i<numDocs; i++) {
      assert.writeOK(st.s.getCollection(collNS).insert(
      { _id : i, fieldToUpdate : 0}
      ));
      }
      // Have each of the two mongoses perform a moveChunk, so that one is fresh and one is stale.
      // We don't assert that the first moveChunk worked because the chunk is probably already on the
      // first shard.
      var freshMongos = st.s0;
      var staleMongos = st.s1;
      staleMongos.adminCommand({ moveChunk: collNS, find:
      { _id: 0 }
      , to: st._shardNames[0] });
      assert.commandWorked(freshMongos.adminCommand({ moveChunk: collNS,
      find:
      { _id: 0 }
      ,
      to: st._shardNames[1] }));
      // Use the stale mongos to perform a non-idempotent multiUpdate.
      assert.writeOK(staleMongos.getCollection(collNS).update({},
      { $inc : { fieldToUpdate : 1 }},
      { multi : true }
      ));
      // Check that all documents were correctly updated.
      assert.eq(numDocs, staleMongos.getCollection(collNS).find(
      { fieldToUpdate : 1 }
      ).itcount(),
      "multiUpdate from stale mongos failed to correctly update all documents across shards");
      st.stop();
      

      multiRemove:

      /*
      Tests that a multiRemove removes all matching documents even if sent from a stale mongos.
      */
      // Start sharding test with 10 documents in sharded collection 'test.foo'
      var st = new ShardingTest({shards: 2, mongos: 2, other: { noAutoSplit : ""}});
      var dbName = 'test';
      var collNS = dbName + '.foo';
      assert.commandWorked(st.s.adminCommand(
      { enableSharding: dbName }
      ));
      assert.commandWorked(st.s.adminCommand({ shardCollection: collNS, key: { _id: 1 }}));
      var numDocs = 10;
      for (var i=0; i<numDocs; i++) {
      assert.writeOK(st.s.getCollection(collNS).insert(
      { _id : i}
      ));
      }
      // Have each of the two mongoses perform a moveChunk, so that one is fresh and one is stale.
      // We don't assert that the first moveChunk worked because the chunk is probably already on the
      // first shard.
      var freshMongos = st.s0;
      var staleMongos = st.s1;
      staleMongos.adminCommand({ moveChunk: collNS, find:
      { _id: 0 }
      , to: st._shardNames[0] });
      assert.commandWorked(freshMongos.adminCommand({ moveChunk: collNS,
      find:
      { _id: 0 }
      ,
      to: st._shardNames[1] }));
      // Use the stale mongos to perform a multiRemove.
      assert.writeOK(staleMongos.getCollection(collNS).remove({}));
      // Check that all documents were correctly removed.
      assert.eq(0, staleMongos.getCollection(collNS).find().itcount(),
      "multiRemove from stale mongos failed to remove all documents across shards");
      st.stop();
      

      Show
      The following tests demonstrate the behavior: multiUpdate: /* Tests that a multiUpdate updates all matching documents even if sent from a stale mongos. */ // Start sharding test with 10 documents in sharded collection 'test.foo' var st = new ShardingTest({shards: 2, mongos: 2, other: { noAutoSplit : ""}}); var dbName = 'test'; var collNS = dbName + '.foo'; assert.commandWorked(st.s.adminCommand( { enableSharding: dbName } )); assert.commandWorked(st.s.adminCommand({ shardCollection: collNS, key: { _id: 1 }})); var numDocs = 10; for (var i=0; i<numDocs; i++) { assert.writeOK(st.s.getCollection(collNS).insert( { _id : i, fieldToUpdate : 0} )); } // Have each of the two mongoses perform a moveChunk, so that one is fresh and one is stale. // We don't assert that the first moveChunk worked because the chunk is probably already on the // first shard. var freshMongos = st.s0; var staleMongos = st.s1; staleMongos.adminCommand({ moveChunk: collNS, find: { _id: 0 } , to: st._shardNames[0] }); assert.commandWorked(freshMongos.adminCommand({ moveChunk: collNS, find: { _id: 0 } , to: st._shardNames[1] })); // Use the stale mongos to perform a non-idempotent multiUpdate. assert.writeOK(staleMongos.getCollection(collNS).update({}, { $inc : { fieldToUpdate : 1 }}, { multi : true } )); // Check that all documents were correctly updated. assert.eq(numDocs, staleMongos.getCollection(collNS).find( { fieldToUpdate : 1 } ).itcount(), "multiUpdate from stale mongos failed to correctly update all documents across shards"); st.stop(); multiRemove: /* Tests that a multiRemove removes all matching documents even if sent from a stale mongos. */ // Start sharding test with 10 documents in sharded collection 'test.foo' var st = new ShardingTest({shards: 2, mongos: 2, other: { noAutoSplit : ""}}); var dbName = 'test'; var collNS = dbName + '.foo'; assert.commandWorked(st.s.adminCommand( { enableSharding: dbName } )); assert.commandWorked(st.s.adminCommand({ shardCollection: collNS, key: { _id: 1 }})); var numDocs = 10; for (var i=0; i<numDocs; i++) { assert.writeOK(st.s.getCollection(collNS).insert( { _id : i} )); } // Have each of the two mongoses perform a moveChunk, so that one is fresh and one is stale. // We don't assert that the first moveChunk worked because the chunk is probably already on the // first shard. var freshMongos = st.s0; var staleMongos = st.s1; staleMongos.adminCommand({ moveChunk: collNS, find: { _id: 0 } , to: st._shardNames[0] }); assert.commandWorked(freshMongos.adminCommand({ moveChunk: collNS, find: { _id: 0 } , to: st._shardNames[1] })); // Use the stale mongos to perform a multiRemove. assert.writeOK(staleMongos.getCollection(collNS).remove({})); // Check that all documents were correctly removed. assert.eq(0, staleMongos.getCollection(collNS).find().itcount(), "multiRemove from stale mongos failed to remove all documents across shards"); st.stop();

      Description

      Because mongos targets shards for multiUpdate and multiRemove, and shards check the shard version info on these requests before beginning to execute the updates and removes, if one shard returns StaleShardVersion to the mongos, the mongos will refresh its metadata, re-target and re-send the request to all relevant shards.

      Therefore, a shard that did not return StaleShardVersion will re-apply the multiUpdate or multiRemove. This is harmless for multiRemoves (since removes are idempotent), but can cause unexpected behavior (the write can get applied more than once to the same document) for non-idempotent multi-updates. It's worth noting that the semantics of multiUpdate and multiRemove already allow for situations like this even on a single mongod; this issue just exacerbates the likelihood seeing behavior like this.

      Right now, the mongos targets shards for multiUpdate and multiRemove, and the shards check versioning. A full fix requires both mongos and mongod to do the opposite of what they currently do:

      The four options are:

      1) mongos targets shards, shards check version (what we have now)
      --> multiUpdates are re-applied if mongos was stale relative to the targeted shards

      2) mongos sends request to all shards, shards check version
      --> multiUpdates are re-applied if mongos was stale relative to ANY shard (even worse than option 1)

      3) mongos targets shards, shards ignore versioning
      --> a stale mongos will target the wrong shards, so both multiUpdates and multiRemoves can be lost

      4) mongos sends request to all shards, shards ignore versioning (what we should be doing)
      --> a stale mongos will apply the update/remove to all shards, and will not re-apply anything since the shards will not return StaleShardVersion. Though this works from a correctness perspective, the mongos will remain stale in this case.

        Attachments

          Issue Links

            Activity

              People

              • Votes:
                0 Vote for this issue
                Watchers:
                11 Start watching this issue

                Dates

                • Created:
                  Updated:
                  Resolved: