Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-21726

WiredTiger primaries replicate no-op update, leading to out-of-sync secondaries and data loss

    XMLWordPrintable

    Details

    • Type: Bug
    • Status: Closed
    • Priority: Critical - P2
    • Resolution: Fixed
    • Affects Version/s: 3.1.2
    • Fix Version/s: 3.2.0-rc6
    • Component/s: Replication
    • Labels:
      None
    • Backwards Compatibility:
      Minor Change
    • Operating System:
      ALL
    • Steps To Reproduce:
      Hide

      python buildscripts/resmoke.py --storageEngine=wiredTiger --executor=no_server repro.js

      'use strict';
       
      var rst = new ReplSetTest({nodes: 2});
      rst.startSet();
      rst.initiate();
       
      var primaryColl = rst.getPrimary().getDB('test').foo;
      var oplog = rst.getPrimary().getDB('local').oplog.rs;
      var secondaryColl = rst.getSecondary().getDB('test').foo;
       
      // Note: An array element is required to repro the bug.
      var res = primaryColl.insert({a: [1]});
      assert.writeOK(res);
       
      res = primaryColl.update({}, {$pushAll: {a: []}});
      assert.writeOK(res);
       
      // nModified is 1 here
      printjson(res);
       
      rst.awaitReplication();
       
      print('oplog ' + tojson(oplog.find().toArray()));
      print('Primary doc ' + tojson(primaryColl.find().toArray()));
      print('Secondary doc ' + tojson(secondaryColl.find().toArray()));
       
      assert.eq(primaryColl.find().next(), secondaryColl.find().next());
      

      Show
      python buildscripts/resmoke.py --storageEngine=wiredTiger --executor=no_server repro.js 'use strict';   var rst = new ReplSetTest({nodes: 2}); rst.startSet(); rst.initiate();   var primaryColl = rst.getPrimary().getDB('test').foo; var oplog = rst.getPrimary().getDB('local').oplog.rs; var secondaryColl = rst.getSecondary().getDB('test').foo;   // Note: An array element is required to repro the bug. var res = primaryColl.insert({a: [1]}); assert.writeOK(res);   res = primaryColl.update({}, {$pushAll: {a: []}}); assert.writeOK(res);   // nModified is 1 here printjson(res);   rst.awaitReplication();   print('oplog ' + tojson(oplog.find().toArray())); print('Primary doc ' + tojson(primaryColl.find().toArray())); print('Secondary doc ' + tojson(secondaryColl.find().toArray()));   assert.eq(primaryColl.find().next(), secondaryColl.find().next());
    • Sprint:
      Repl D (12/11/15)
    • Linked BF Score:
      0

      Description

      This bug affects versions 3.1.2+ with WiredTiger as the storage engine. It doesn't affect 3.0.7.

      Here's the test output from the repro script:

      // The primary and secondary are synchronized up until this point.
       
      // A no-op update is issued, but an op is still logged in the oplog.
      // This leads to the secondary getting out of sync with the primary
      // (and is a form of data loss).
      > res = primaryColl.update({}, {$pushAll: {a: []}});
      > assert.writeOK(res);
       
      res => { "nMatched" : 1, "nUpserted" : 0, "nModified" : 1 }
       
      oplog [
        {
            "ts" : Timestamp(1449024242, 1),
            "h" : NumberLong("-3274359779073625624"),
            "v" : 2,
            "op" : "n",
            "ns" : "",
            "o" : {
                "msg" : "initiating set"
            }
        },
        {
            "ts" : Timestamp(1449024254, 1),
            "t" : NumberLong(1),
            "h" : NumberLong("-7260876401968846914"),
            "v" : 2,
            "op" : "n",
            "ns" : "",
            "o" : {
                "msg" : "new primary"
            }
        },
        {
            "ts" : Timestamp(1449024254, 2),
            "t" : NumberLong(1),
            "h" : NumberLong("9051320871864503306"),
            "v" : 2,
            "op" : "c",
            "ns" : "test.$cmd",
            "o" : {
                "create" : "foo"
            }
        },
        {
            "ts" : Timestamp(1449024254, 3),
            "t" : NumberLong(1),
            "h" : NumberLong("-455571380945262558"),
            "v" : 2,
            "op" : "i",
            "ns" : "test.foo",
            "o" : {
                "_id" : ObjectId("565e5afe7750e3087f8ef04f"),
                "a" : [
                    1
                ]
            }
        },
        {
            "ts" : Timestamp(1449024254, 4),
            "t" : NumberLong(1),
            "h" : NumberLong("-4618385206475519820"),
            "v" : 2,
            "op" : "u",
            "ns" : "test.foo",
            "o2" : {
                "_id" : ObjectId("565e5afe7750e3087f8ef04f")
            },
            "o" : {
       
            }
        }
      ]
      Primary doc [ { "_id" : ObjectId("565e5afe7750e3087f8ef04f"), "a" : [ 1 ] } ]
      Secondary doc [ { "_id" : ObjectId("565e5afe7750e3087f8ef04f") } ]
      

        Attachments

          Issue Links

            Activity

              People

              Assignee:
              milkie Eric Milkie
              Reporter:
              kamran.khan Kamran K.
              Participants:
              Votes:
              0 Vote for this issue
              Watchers:
              15 Start watching this issue

                Dates

                Created:
                Updated:
                Resolved: