Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-33456

Shard primary can crash if it steps down during the migration critical section

    • Type: Bug
    • Resolution: Fixed
    • Priority: Major - P3
    • Fix Version/s: 3.7.3
    • Affects Version/s: None
    • Component/s: Sharding
    • None
    • Fully Compatible
    • ALL
    • Hide
      // Repro script for SERVER-33456.
      //
      // Scenario: a migration on the donor shard's primary is parked inside the
      // migration critical section (via the 'hangBeforeLeavingCriticalSection'
      // fail point) while an insert is blocked waiting for the critical section
      // to be released. Stepping the primary down then interrupts the waiting
      // insert with InterruptedDueToReplStateChange which, per the attached
      // stack trace, escapes onShardVersionMismatch() unhandled and terminates
      // the server.
      (function() {
          "use strict";

          // The repro deliberately leaves a node crashed/stepped down, so skip
          // the cluster-wide UUID consistency check that runs at shutdown.
          TestData.skipCheckingUUIDsConsistentAcrossCluster = true;

          const dbName = "test";
          const collName = "foo";
          const ns = dbName + "." + collName;

          // Two shards, each a single-node replica set.
          const st = new ShardingTest({shards: 2, rs: {nodes: 1}, other: {rs: true}});

          // Shard the collection on _id with shard0 as the primary shard, and
          // split it so there is a chunk containing {_id: 0} to migrate.
          assert.commandWorked(st.s.adminCommand({enableSharding: dbName}));
          st.ensurePrimaryShard(dbName, st.shard0.shardName);
          assert.commandWorked(st.s.adminCommand({shardCollection: ns, key: {"_id": 1}}));
          assert.commandWorked(st.s.adminCommand({split: ns, middle: {_id: 0}}));

          jsTest.log("Make a migration on rs0's primary hang in the critical section.");
          assert.commandWorked(st.rs0.getPrimary().adminCommand(
              {configureFailPoint: 'hangBeforeLeavingCriticalSection', mode: 'alwaysOn'}));
          // The moveChunk's join handle is intentionally discarded: the donor
          // primary is expected to crash, so the migration never completes.
          startParallelShell("db.adminCommand({moveChunk: '" + ns + "', find: {_id: 0}, to: '" +
                                 st.shard1.shardName + "'})",
                             st.s.port);

          jsTest.log("Sleep for 5 seconds to give ample chance to the moveChunk to reach the critical section.");
          sleep(5000);

          jsTest.log("Target a write to rs0, which will hang in the critical section.");
          let joinInsert = startParallelShell("db.getSiblingDB('" + dbName + "').getCollection('" +
                                                  collName + "').insert({_id: 1});",
                                              st.s.port);

          jsTest.log("Sleep for 5 seconds to give ample chance to the insert to start waiting on the critical section.");
          sleep(5000);

          jsTest.log("Cause rs0's primary to step down.");
          try {
              st.rs0.getPrimary().adminCommand({replSetStepDown: 1000, force: true});
          } catch (e) {
              // Expected: stepdown closes the connection, so the command throws.
          }

          // >>>>>> Expect rs0's primary to crash here <<<<<

          jsTest.log("Wait for insert to complete");
          joinInsert();

          st.stop();
      })();
      
      Show
      // NOTE(review): whitespace-collapsed duplicate of the repro script above, as
      // rendered by the ticket's collapsed code macro. Not runnable as-is: the
      // collapse inserted stray spaces inside several string literals, and the
      // mid-line `// >>>>>> ... <<<<<` comment swallows the trailing
      // `joinInsert(); st.stop(); })();` since `//` comments run to end of line.
      // Use the formatted script above instead; this copy is kept verbatim.
      ( function () { "use strict" ; TestData.skipCheckingUUIDsConsistentAcrossCluster = true ; const dbName = "test" ; const collName = "foo" ; const ns = dbName + "." + collName; const st = new ShardingTest({shards: 2, rs: {nodes: 1}, other: {rs: true }}); assert.commandWorked(st.s.adminCommand({enableSharding: dbName})); st.ensurePrimaryShard(dbName, st.shard0.shardName); assert.commandWorked(st.s.adminCommand({shardCollection: ns, key: { "_id" : 1 }})); assert.commandWorked(st.s.adminCommand({split: ns, middle: {_id: 0}})); jsTest.log( "Make a migration on rs0's primary hang in the critical section." ); assert.commandWorked(st.rs0.getPrimary().adminCommand({configureFailPoint: 'hangBeforeLeavingCriticalSection' , mode: 'alwaysOn' })); startParallelShell( "db.adminCommand({moveChunk: '" + ns + "' , find: {_id: 0}, to: '" + st.shard1.shardName + "' })" , st.s.port); jsTest.log( "Sleep for 5 seconds to give ample chance to the moveChunk to reach the critical section." ); sleep(5000); jsTest.log( "Target a write to rs0, which will hanging in the critical section." ); let joinInsert = startParallelShell( "db.getSiblingDB( '" + dbName + "' ).getCollection( '" + collName + "' ).insert({_id: 1});" , st.s.port); jsTest.log( "Sleep for 5 seconds to give ample chance to the insert to start waiting on the critical section." ); sleep(5000); jsTest.log( "Cause rs0's primary to step down." ); try { st.rs0.getPrimary().adminCommand({replSetStepDown: 1000, force: true }); } catch (e) {} // >>>>>> Expect rs0's primary to crash here <<<<< jsTest.log( "Wait for insert to complete" ); joinInsert(); st.stop(); })();
    • Sharding 2018-02-26, Sharding 2018-03-12
    • 0

      This issue repros as of the commit for SERVER-29908.

      Say a shard primary is in the migration critical section, and some command is waiting for the critical section to finish because it saw there was an active migration when it did the shardVersion check.

      Now if the primary steps down, the primary will crash due to an unhandled InterruptedDueToReplStateChange exception.

      Stack trace (on commit 3157be3048cdeb676579ed0d860d8416cb8c4667 for SERVER-29908):

      src/mongo/util/stacktrace_posix.cpp:172:30: mongo::printStackTrace(std::ostream&)
      src/mongo/util/signal_handlers_synchronous.cpp:229:5: mongo::(anonymous namespace)::myTerminate()
       /data/mci/cbe0dcd7144cecba8abf51bbc964b13c/toolchain-builder/build-gcc-v2.sh-szx/x86_64-mongodb-linux/libstdc++-v3/libsupc++/../../../../gcc-5.4.0/libstdc++-v3/libsupc++/eh_terminate.cc:47:0: __cxxabiv1::__terminate(void (*)())
       /data/mci/cbe0dcd7144cecba8abf51bbc964b13c/toolchain-builder/build-gcc-v2.sh-szx/x86_64-mongodb-linux/libstdc++-v3/libsupc++/../../../../gcc-5.4.0/libstdc++-v3/libsupc++/eh_terminate.cc:57:0: std::terminate()
       ??:0:0: __clang_call_terminate
      src/mongo/db/s/shard_filtering_metadata_refresh.cpp:50:5: mongo::onShardVersionMismatch(mongo::OperationContext*, mongo::NamespaceString const&, mongo::ChunkVersion)
      src/mongo/db/ops/write_ops_exec.cpp:219:13: mongo::(anonymous namespace)::handleError(mongo::OperationContext*, mongo::DBException const&, mongo::NamespaceString const&, mongo::write_ops::WriteCommandBase const&, mongo::WriteResult*)
      src/mongo/db/ops/write_ops_exec.cpp:407:17: mongo::(anonymous namespace)::insertBatchAndHandleErrors(mongo::OperationContext*, mongo::write_ops::Insert const&, std::vector<mongo::InsertStatement, std::allocator<mongo::InsertStatement> >&, mongo::(anonymous namespace)::LastOpFixer*, mongo::WriteResult*)
      src/mongo/db/ops/write_ops_exec.cpp:506:28: mongo::performInserts(mongo::OperationContext*, mongo::write_ops::Insert const&)
      src/mongo/db/commands/write_commands/write_commands.cpp:262:22: mongo::(anonymous namespace)::CmdInsert::runImpl(mongo::OperationContext*, mongo::OpMsgRequest const&, mongo::BSONObjBuilder&)
      src/mongo/db/commands/write_commands/write_commands.cpp:228:13: mongo::(anonymous namespace)::WriteCommand::enhancedRun(mongo::OperationContext*, mongo::OpMsgRequest const&, mongo::BSONObjBuilder&)
      src/mongo/db/commands.cpp:434:16: mongo::Command::publicRun(mongo::OperationContext*, mongo::OpMsgRequest const&, mongo::BSONObjBuilder&)
      src/mongo/db/service_entry_point_common.cpp:422:27: mongo::(anonymous namespace)::runCommandImpl(mongo::OperationContext*, mongo::Command*, mongo::OpMsgRequest const&, mongo::rpc::ReplyBuilderInterface*, mongo::LogicalTime, mongo::ServiceEntryPointCommon::Hooks const&)
      src/mongo/db/service_entry_point_common.cpp:685:13: mongo::(anonymous namespace)::execCommandDatabase(mongo::OperationContext*, mongo::Command*, mongo::OpMsgRequest const&, mongo::rpc::ReplyBuilderInterface*, mongo::ServiceEntryPointCommon::Hooks const&)
      src/mongo/db/service_entry_point_common.cpp:806:13: mongo::(anonymous namespace)::runCommands(mongo::OperationContext*, mongo::Message const&, mongo::ServiceEntryPointCommon::Hooks const&)::$_4::operator()() const
      src/mongo/db/service_entry_point_common.cpp:759:5: mongo::(anonymous namespace)::runCommands(mongo::OperationContext*, mongo::Message const&, mongo::ServiceEntryPointCommon::Hooks const&)
      src/mongo/db/service_entry_point_common.cpp:1084:22: mongo::ServiceEntryPointCommon::handleRequest(mongo::OperationContext*, mongo::Message const&, mongo::ServiceEntryPointCommon::Hooks const&)
      src/mongo/db/service_entry_point_mongod.cpp:124:12: mongo::ServiceEntryPointMongod::handleRequest(mongo::OperationContext*, mongo::Message const&)
      src/mongo/transport/service_state_machine.cpp:375:35: mongo::ServiceStateMachine::_processMessage(mongo::ServiceStateMachine::ThreadGuard)
      src/mongo/transport/service_state_machine.cpp:436:17: mongo::ServiceStateMachine::_runNextInGuard(mongo::ServiceStateMachine::ThreadGuard)
      src/mongo/transport/service_state_machine.cpp:476:14: mongo::ServiceStateMachine::_scheduleNextWithGuard(mongo::ServiceStateMachine::ThreadGuard, mongo::transport::ServiceExecutor::ScheduleFlags, mongo::transport::ServiceExecutorTaskName, mongo::ServiceStateMachine::Ownership)::$_0::operator()() const
       /opt/mongodbtoolchain/v2/bin/../lib/gcc/x86_64-mongodb-linux/5.4.0/../../../../include/c++/5.4.0/functional:1871:2: std::_Function_handler<void (), mongo::ServiceStateMachine::_scheduleNextWithGuard(mongo::ServiceStateMachine::ThreadGuard, mongo::transport::ServiceExecutor::ScheduleFlags, mongo::transport::ServiceExecutorTaskName, mongo::ServiceStateMachine::Ownership)::$_0>::_M_invoke(std::_Any_data const&)
       /opt/mongodbtoolchain/v2/bin/../lib/gcc/x86_64-mongodb-linux/5.4.0/../../../../include/c++/5.4.0/functional:2267:14: std::function<void ()>::operator()() const
      src/mongo/transport/service_executor_synchronous.cpp:121:13: mongo::transport::ServiceExecutorSynchronous::schedule(std::function<void ()>, mongo::transport::ServiceExecutor::ScheduleFlags, mongo::transport::ServiceExecutorTaskName)
      src/mongo/transport/service_state_machine.cpp:480:48: mongo::ServiceStateMachine::_scheduleNextWithGuard(mongo::ServiceStateMachine::ThreadGuard, mongo::transport::ServiceExecutor::ScheduleFlags, mongo::transport::ServiceExecutorTaskName, mongo::ServiceStateMachine::Ownership)
      src/mongo/transport/service_state_machine.cpp:300:16: mongo::ServiceStateMachine::_sourceCallback(mongo::Status)
      src/mongo/transport/service_state_machine.cpp:255:9: mongo::ServiceStateMachine::_sourceMessage(mongo::ServiceStateMachine::ThreadGuard)
      src/mongo/transport/service_state_machine.cpp:433:17: mongo::ServiceStateMachine::_runNextInGuard(mongo::ServiceStateMachine::ThreadGuard)
      src/mongo/transport/service_state_machine.cpp:476:14: mongo::ServiceStateMachine::_scheduleNextWithGuard(mongo::ServiceStateMachine::ThreadGuard, mongo::transport::ServiceExecutor::ScheduleFlags, mongo::transport::ServiceExecutorTaskName, mongo::ServiceStateMachine::Ownership)::$_0::operator()() const
       /opt/mongodbtoolchain/v2/bin/../lib/gcc/x86_64-mongodb-linux/5.4.0/../../../../include/c++/5.4.0/functional:1871:2: std::_Function_handler<void (), mongo::ServiceStateMachine::_scheduleNextWithGuard(mongo::ServiceStateMachine::ThreadGuard, mongo::transport::ServiceExecutor::ScheduleFlags, mongo::transport::ServiceExecutorTaskName, mongo::ServiceStateMachine::Ownership)::$_0>::_M_invoke(std::_Any_data const&)
       /opt/mongodbtoolchain/v2/bin/../lib/gcc/x86_64-mongodb-linux/5.4.0/../../../../include/c++/5.4.0/functional:2267:14: std::function<void ()>::operator()() const
      src/mongo/transport/service_executor_synchronous.cpp:138:13: mongo::transport::ServiceExecutorSynchronous::schedule(std::function<void ()>, mongo::transport::ServiceExecutor::ScheduleFlags, mongo::transport::ServiceExecutorTaskName)::$_2::operator()() const
       /opt/mongodbtoolchain/v2/bin/../lib/gcc/x86_64-mongodb-linux/5.4.0/../../../../include/c++/5.4.0/functional:1871:2: std::_Function_handler<void (), mongo::transport::ServiceExecutorSynchronous::schedule(std::function<void ()>, mongo::transport::ServiceExecutor::ScheduleFlags, mongo::transport::ServiceExecutorTaskName)::$_2>::_M_invoke(std::_Any_data const&)
       /opt/mongodbtoolchain/v2/bin/../lib/gcc/x86_64-mongodb-linux/5.4.0/../../../../include/c++/5.4.0/functional:2267:14: std::function<void ()>::operator()() const
      src/mongo/transport/service_entry_point_utils.cpp:55:5: mongo::(anonymous namespace)::runFunc(void*)
       ??:0:0: start_thread
       /build/glibc-9tT8Do/glibc-2.23/misc/../sysdeps/unix/sysv/linux/x86_64/clone.S:109:0: clone
      

            Assignee:
            josef.ahmad@mongodb.com Josef Ahmad
            Reporter:
            esha.maharishi@mongodb.com Esha Maharishi (Inactive)
            Votes:
            0 Vote for this issue
            Watchers:
            3 Start watching this issue

              Created:
              Updated:
              Resolved: