Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-4781

replica set initial sync failure when update cannot be applied to a future version of an object received via clone

    • Type: Icon: Bug Bug
    • Resolution: Done
    • Priority: Icon: Major - P3 Major - P3
    • 2.2.1, 2.3.0
    • Affects Version/s: None
    • Component/s: Replication
    • Labels:
      None
    • ALL

      Some modifier operations may uassert when they cannot be applied to a selected document. It's possible for an update to uassert when applied to a future version of a document during the initial sync phase, and this aborts the initial sync. It appears that the initial sync is then retried but if new, bad updates arrive I imagine the initial sync failure could keep repeating

      Here is a test I wrote for this. The first attempt at initial sync fails, but the test stops sending updates so when the initial sync is retried. As a result, the current test passes. So it's just good as a demonstration right now.

      // Test update modifier uassert during initial sync. SERVER-4781
      
      load("jstests/replsets/rslib.js");
      basename = "jstests_initsync4";
      
      print("1. Bring up set");
      replTest = new ReplSetTest( {name: basename, nodes: 1} );
      replTest.startSet();
      replTest.initiate();
      
      m = replTest.getMaster();
      md = m.getDB("d");
      mc = m.getDB("d")["c"];
      
      print("2. Insert some data");
      N = 500000;
      for( i = 0; i < N; ++i ) {
          mc.save( {_id:i,a:{}} );
      }
      md.getLastError();
      
      print("3. Make sure synced");
      replTest.awaitReplication();
      
      print("4. Bring up a new node");
      ports = allocatePorts( 3 );
      basePath = "/data/db/" + basename;
      hostname = getHostName();
      
      s = startMongodTest (ports[2], basename, false, {replSet : basename, oplogSize : 2} );
      
      var config = replTest.getReplSetConfig();
      config.version = 2;
      config.members.push({_id:2, host:hostname+":"+ports[2]});
      try {
          m.getDB("admin").runCommand({replSetReconfig:config});
      }
      catch(e) {
          print(e);
      }
      reconnect(s);
      
      print("5. Wait for new node to start cloning");
      
      s.setSlaveOk();
      sc = s.getDB("d")["c"];
      
      wait( function() { printjson( sc.stats() ); return sc.stats().count > 0; } );
      
      print("6. Start updating documents on primary");
      for( i = N-1; i >= N-10000; --i ) {
          // If the document is cloned as {a:1}, the {$set:{'a.b':1}} modifier will uassert.
          mc.update( {_id:i}, {$set:{'a.b':1}} );
          mc.update( {_id:i}, {$set:{a:1}} );    
      }
      
      print("7. Wait for new node to become SECONDARY");
      wait(function() {
           var status = s.getDB("admin").runCommand({replSetGetStatus:1});
           printjson(status);
           return status.members &&
           (status.members[1].state == 2);
           });
      

      Here is some relevant logging from the test

       m31002| Wed Jan 25 20:38:29 [rsSync] replSet initial oplog application from Aaron-Staples-MacBook-Pro.local:31000 starting at Jan 25 20:37:31:9f8 to Jan 25 20:38:07:214
       m31002| Wed Jan 25 20:38:29 [rsSync] replSet initial sync failing, error applying oplog : 10145 LEFT_SUBFIELD only supports Object: a not: 1
       m31002| Wed Jan 25 20:38:29 [rsSync] replSet applying oplog from Aaron-Staples-MacBook-Pro.local:31000 failed, trying again
       m31002| Wed Jan 25 20:38:29 [rsSync] replSet not trying to sync from Aaron-Staples-MacBook-Pro.local:31000, it is vetoed for 60 more seconds
      

      Also I wrote a test for master slave mode and that seems to work ok

      // Test update modifier uassert during initial sync. SERVER-4781
      
      rt = new ReplTest( "repl13tests" );
      
      m = rt.start( true );
      mc = m.getDB( 'd' )[ 'c' ];
      
      for( i = 0; i < 100000; ++i ) {
          mc.save( {_id:i,a:{}} );
      }
      m.getDB( 'd' ).getLastError();
      
      s = rt.start( false );
      sc = s.getDB( 'd' )[ 'c' ];
      
      assert.soon( function() { printjson( sc.count() ); return sc.count() > 0; } );
      
      for( i = 99999; i >= 90000; --i ) {
          mc.update( {_id:i}, {$set:{'a.b':1}} );
          mc.update( {_id:i}, {$set:{a:1}} );    
      }
      
      mc.save( {} )
      assert.soon( function() { return sc.count() == 100001; } );
      
      printjson( sc.findOne( {_id:99999} ) );
      printjson( sc.findOne( {_id:90000} ) );
      

      I can push the tests when bb code freeze is over.

            Assignee:
            alerner Alberto Lerner
            Reporter:
            aaron Aaron Staple
            Votes:
            3 Vote for this issue
            Watchers:
            13 Start watching this issue

              Created:
              Updated:
              Resolved: