Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-21896

Chunk metadata will not get refreshed after shard is removed

    • Fully Compatible
    • ALL
    • Hide
      // Validates the remove/drain shard functionality when there is data on the shard being removed.
      //
      // Reproduction for SERVER-21896: all removeShard commands below are issued through st.s0, so
      // st.s1 is the mongos that can be left holding chunk metadata which still references the
      // removed shard. The final queries through both routers must succeed regardless.
      (function() {
      'use strict';

      // Two shards so that one can be drained; two mongos so that one of them never runs removeShard.
      var st = new ShardingTest({ shards: 2, mongos: 2 });

      assert.commandWorked(st.s0.adminCommand({ enableSharding: 'TestDB' }));
      st.ensurePrimaryShard('TestDB', 'shard0000');
      assert.commandWorked(st.s0.adminCommand({ shardCollection: 'TestDB.Coll', key: { _id: 1 } }));
      assert.commandWorked(st.s0.adminCommand({ split: 'TestDB.Coll', middle: { _id: 0 } }));

      // Insert one document on each side of the split point. The write results are checked so that
      // a failed insert is reported here rather than as a confusing count mismatch further down.
      assert.writeOK(st.s0.getDB('TestDB').Coll.insert({ _id: -1, value: 'Negative value' }));
      assert.writeOK(st.s0.getDB('TestDB').Coll.insert({ _id: 1, value: 'Positive value' }));

      // Move the chunk containing { _id: 1 } so that there are docs on both shards
      assert.commandWorked(st.s0.adminCommand({ moveChunk: 'TestDB.Coll',
                                                find: { _id: 1 },
                                                to: 'shard0001',
                                                _waitForDelete: true }));

      // Make sure both mongos instances know of the latest metadata
      assert.eq(2, st.s0.getDB('TestDB').Coll.find({}).toArray().length);
      assert.eq(2, st.s1.getDB('TestDB').Coll.find({}).toArray().length);

      // Remove shard0001. The first call is expected to report 'started'; the second call, issued
      // while shard0001 still owns a chunk, is expected to report 'ongoing'.
      var removeRes;
      removeRes = assert.commandWorked(st.s0.adminCommand({ removeShard: 'shard0001' }));
      assert.eq('started', removeRes.state);
      removeRes = assert.commandWorked(st.s0.adminCommand({ removeShard: 'shard0001' }));
      assert.eq('ongoing', removeRes.state);

      // Move the one chunk off shard0001
      assert.commandWorked(st.s0.adminCommand({ moveChunk: 'TestDB.Coll',
                                                find: { _id: 1 },
                                                to: 'shard0000',
                                                _waitForDelete: true }));

      // Remove shard must succeed now
      removeRes = assert.commandWorked(st.s0.adminCommand({ removeShard: 'shard0001' }));
      assert.eq('completed', removeRes.state);

      // Make sure both mongos instances refresh their metadata and do not reference the missing
      // shard. The query through st.s1 is the one that exercises the stale-metadata path.
      assert.eq(2, st.s0.getDB('TestDB').Coll.find({}).toArray().length);
      assert.eq(2, st.s1.getDB('TestDB').Coll.find({}).toArray().length);

      st.stop();

      })();
      
      Show
      // Validates the remove/drain shard functionality when there is data on the shard being removed (function() { 'use strict' ; var st = new ShardingTest({ shards: 2, mongos: 2 }); assert .commandWorked(st.s0.adminCommand({ enableSharding: 'TestDB' })); st.ensurePrimaryShard( 'TestDB' , 'shard0000' ); assert .commandWorked(st.s0.adminCommand({ shardCollection: 'TestDB.Coll' , key: { _id: 1 } })); assert .commandWorked(st.s0.adminCommand({ split: 'TestDB.Coll' , middle: { _id: 0 } })); // Insert some documents and make sure there are docs on both shards st.s0.getDB( 'TestDB' ).Coll.insert({ _id: -1, value: 'Negative value' }); st.s0.getDB( 'TestDB' ).Coll.insert({ _id: 1, value: 'Positive value' }); assert .commandWorked(st.s0.adminCommand({ moveChunk: 'TestDB.Coll' , find: { _id: 1 }, to: 'shard0001' , _waitForDelete: true })); // Make sure both mongos instances know of the latest metadata assert .eq(2, st.s0.getDB( 'TestDB' ).Coll.find({}).toArray().length); assert .eq(2, st.s1.getDB( 'TestDB' ).Coll.find({}).toArray().length); // Remove shard0001 var removeRes; removeRes = assert .commandWorked(st.s0.adminCommand({ removeShard: 'shard0001' })); assert .eq( 'started' , removeRes.state); removeRes = assert .commandWorked(st.s0.adminCommand({ removeShard: 'shard0001' })); assert .eq( 'ongoing' , removeRes.state); // Move the one chunk off shard0001 assert .commandWorked(st.s0.adminCommand({ moveChunk: 'TestDB.Coll' , find: { _id: 1 }, to: 'shard0000' , _waitForDelete: true })); // Remove shard must succeed now removeRes = assert .commandWorked(st.s0.adminCommand({ removeShard: 'shard0001' })); assert .eq( 'completed' , removeRes.state); // Make sure both mongos instance refresh their metadata and do not reference the missing shard assert .eq(2, st.s0.getDB( 'TestDB' ).Coll.find({}).toArray().length); assert .eq(2, st.s1.getDB( 'TestDB' ).Coll.find({}).toArray().length); st.stop(); })();
    • Sharding E (01/08/16), Sharding F (01/29/16)

      The cache of the database/collection/chunk metadata and the cache of the registered shards are separate and independent from each other.

      After a shard is completely removed (that is, first put into draining mode and then its chunks are moved off of it), any mongos instances other than the one on which removeShard was run will quickly refresh their shard caches and will note that the shard no longer exists. However, any chunk metadata that references the just-removed shard will not be invalidated.

      The next time a query is run against one of these stale mongos instances, the mongos will use the stale chunk information, fail to find the shard, and crash with the following stack trace (in 3.2):

      s20006| 2015-12-14T15:39:48.165-0500 D SHARDING [conn1] found 1 shards listed on config server(s)
      s20006| 2015-12-14T15:39:48.168-0500 I CONTROL  [conn1] *** unhandled exception (access violation) at 0x00007FF65E08DEE3, terminating
      s20006| 2015-12-14T15:39:48.168-0500 I CONTROL  [conn1] *** access violation was a read from 0x0000000000000018
      s20006| 2015-12-14T15:39:48.168-0500 I CONTROL  [conn1] *** stack trace for unhandled exception:
      s20006| 2015-12-14T15:39:48.742-0500 I CONTROL  [conn1] mongos.exe     c:\program files (x86)\microsoft visual studio 12.0\vc\include\xstring(2159)     std::basic_string<char,std::char_traits<char>,std::allocator<char> >::compare+0x53
      s20006| 2015-12-14T15:39:48.742-0500 I CONTROL  [conn1] mongos.exe     c:\program files (x86)\microsoft visual studio 12.0\vc\include\xstring(2489)     std::operator==<char,std::char_traits<char>,std::allocator<char> >+0x32
      s20006| 2015-12-14T15:39:48.742-0500 I CONTROL  [conn1] mongos.exe     ...\src\mongo\s\client\shard.cpp(59)                                             mongo::Shard::isConfig+0x32
      s20006| 2015-12-14T15:39:48.743-0500 I CONTROL  [conn1] mongos.exe     ...\src\mongo\s\query\cluster_find.cpp(251)                                      mongo::`anonymous namespace'::runQueryWithoutRetrying+0x640
      s20006| 2015-12-14T15:39:48.743-0500 I CONTROL  [conn1] mongos.exe     ...\src\mongo\s\query\cluster_find.cpp(365)                                      mongo::ClusterFind::runQuery+0x486
      s20006| 2015-12-14T15:39:48.743-0500 I CONTROL  [conn1] mongos.exe     ...\src\mongo\s\commands\cluster_find_cmd.cpp(162)                               mongo::`anonymous namespace'::ClusterFindCmd::run+0x49b
      s20006| 2015-12-14T15:39:48.743-0500 I CONTROL  [conn1] mongos.exe     ...\src\mongo\s\s_only.cpp(128)                                                  mongo::Command::execCommandClientBasic+0x43b
      s20006| 2015-12-14T15:39:48.743-0500 I CONTROL  [conn1] mongos.exe     ...\src\mongo\s\s_only.cpp(171)                                                  mongo::Command::runAgainstRegistered+0x33d
      s20006| 2015-12-14T15:39:48.743-0500 I CONTROL  [conn1] mongos.exe     ...\src\mongo\s\strategy.cpp(237)                                                mongo::Strategy::clientCommandOp+0x881
      s20006| 2015-12-14T15:39:48.743-0500 I CONTROL  [conn1] mongos.exe     ...\src\mongo\s\request.cpp(110)                                                 mongo::Request::process+0x486
      s20006| 2015-12-14T15:39:48.743-0500 I CONTROL  [conn1] mongos.exe     ...\src\mongo\s\server.cpp(141)                                                  mongo::ShardedMessageHandler::process+0xfb
      s20006| 2015-12-14T15:39:48.744-0500 I CONTROL  [conn1] mongos.exe     ...\src\mongo\util\net\message_server_port.cpp(231)                              mongo::PortMessageServer::handleIncomingMsg+0x509
      s20006| 2015-12-14T15:39:48.744-0500 I CONTROL  [conn1] mongos.exe     c:\program files (x86)\microsoft visual studio 12.0\vc\include\functional(1150)  std::_Bind<1,void * __ptr64,void * __ptr64 (__cdecl*const)(void * __ptr64),mongo::`anonymous namespace'::MessagingPortWithHandler * __ptr64>::_Do_call<,0>+0x6e
      s20006| 2015-12-14T15:39:48.744-0500 I CONTROL  [conn1] mongos.exe     c:\program files (x86)\microsoft visual studio 12.0\vc\include\functional(1138)  std::_Bind<1,void * __ptr64,void * __ptr64 (__cdecl*const)(void * __ptr64),mongo::`anonymous namespace'::MessagingPortWithHandler * __ptr64>::operator()<>+0x56
      s20006| 2015-12-14T15:39:48.744-0500 I CONTROL  [conn1] mongos.exe     c:\program files (x86)\microsoft visual studio 12.0\vc\include\functional(1150)  std::_Bind<0,void,std::_Bind<1,void * __ptr64,void * __ptr64 (__cdecl*const)(void * __ptr64),mongo::`anonymous namespace'::MessagingPortWithHandler * __ptr64> >::_Do_call<>+0x35
      s20006| 2015-12-14T15:39:48.744-0500 I CONTROL  [conn1] mongos.exe     c:\program files (x86)\microsoft visual studio 12.0\vc\include\functional(1138)  std::_Bind<0,void,std::_Bind<1,void * __ptr64,void * __ptr64 (__cdecl*const)(void * __ptr64),mongo::`anonymous namespace'::MessagingPortWithHandler * __ptr64> >::operator()<>+0x56
      s20006| 2015-12-14T15:39:48.744-0500 I CONTROL  [conn1] mongos.exe     c:\program files (x86)\microsoft visual studio 12.0\vc\include\thr\xthread(196)  std::_LaunchPad<std::_Bind<0,void,std::_Bind<1,void * __ptr64,void * __ptr64 (__cdecl*const)(void * __ptr64),mongo::`anonymous namespace'::MessagingPortWithHandler * __ptr64> > >::_Run+0x51
      s20006| 2015-12-14T15:39:48.744-0500 I CONTROL  [conn1] mongos.exe     c:\program files (x86)\microsoft visual studio 12.0\vc\include\thr\xthread(188)  std::_LaunchPad<std::_Bind<0,void,std::_Bind<1,void * __ptr64,void * __ptr64 (__cdecl*const)(void * __ptr64),mongo::`anonymous namespace'::MessagingPortWithHandler * __ptr64> > >::_Go+0x28
      s20006| 2015-12-14T15:39:48.744-0500 I CONTROL  [conn1] MSVCP120D.dll                                                                                   std::_Pad::_Release+0xd9
      s20006| 2015-12-14T15:39:48.754-0500 I CONTROL  [conn1] MSVCR120D.dll                                                                                   beginthreadex+0x1f5
      s20006| 2015-12-14T15:39:48.754-0500 I CONTROL  [conn1] MSVCR120D.dll                                                                                   endthreadex+0x1d7
      s20006| 2015-12-14T15:39:48.754-0500 I CONTROL  [conn1] KERNEL32.DLL                                                                                    BaseThreadInitThunk+0x22
      

      In 3.0, there is no crash, but all finds will start failing with the following error:

      2015-12-12T04:57:28.872+0000 I -        [conn31] Assertion: 13129:can't find shard for: red_7
      ....
       mongos(_ZN5mongo15printStackTraceERNSt3__113basic_ostreamIcNS0_11char_traitsIcEEEE+0x39) [0x10e64c9b9]
       mongos(_ZN5mongo10logContextEPKc+0x100) [0x10e603280]
       mongos(_ZN5mongo11msgassertedEiPKc+0x13A) [0x10e5f07ba]
       mongos(_ZN5mongo11msgassertedEiRKNSt3__112basic_stringIcNS0_11char_traitsIcEENS0_9allocatorIcEEEE+0x1A) [0x10e5f067a]
       mongos(_ZN5mongo15StaticShardInfo13findWithRetryERKNSt3__112basic_stringIcNS1_11char_traitsIcEENS1_9allocatorIcEEEE+0x1A6) [0x10e599a56]
       mongos(_ZN5mongo15StaticShardInfo8findCopyERKNSt3__112basic_stringIcNS1_11char_traitsIcEENS1_9allocatorIcEEEE+0x21) [0x10e5963e1]
       mongos(_ZN5mongo5Shard5resetERKNSt3__112basic_stringIcNS1_11char_traitsIcEENS1_9allocatorIcEEEE+0x31) [0x10e5933f1]
       mongos(_ZN5mongo5Shard4makeERKNSt3__112basic_stringIcNS1_11char_traitsIcEENS1_9allocatorIcEEEE+0x9F) [0x10e4c320f]
       mongos(_ZN5mongo11dbgrid_cmds14RemoveShardCmd3runEPNS_16OperationContextERKNSt3__112basic_stringIcNS4_11char_traitsIcEENS4_9allocatorIcEEEERNS_7BSONObjEiRSA_RNS_14BSONObjBuilderEb+0x80) [0x10e4ea500]
       mongos(_ZN5mongo7Command22execCommandClientBasicEPNS_16OperationContextEPS0_RNS_11ClientBasicEiPKcRNS_7BSONObjERNS_14BSONObjBuilderEb+0x2FC) [0x10e590c7c]
       mongos(_ZN5mongo7Command20runAgainstRegisteredEPKcRNS_7BSONObjERNS_14BSONObjBuilderEi+0x123) [0x10e509523]
       mongos(_ZN5mongo8Strategy15clientCommandOpERNS_7RequestE+0x52C) [0x10e5a334c]
       mongos(_ZN5mongo7Request7processEi+0x4B2) [0x10e58fca2]
       mongos(_ZN5mongo21ShardedMessageHandler7processERNS_7MessageEPNS_21AbstractMessagingPortEPNS_9LastErrorE+0x65) [0x10e1c72d5]
       mongos(_ZN5mongo17PortMessageServer17handleIncomingMsgEPv+0x33C) [0x10e60dabc]
       mongos(_ZN5boost12_GLOBAL__N_112thread_proxyEPv+0xB1) [0x10e67fa71]
       libsystem_pthread.dylib(_pthread_body+0x83) [0x7fff8e4ff05a]
       libsystem_pthread.dylib(_pthread_body+0x0) [0x7fff8e4fefd7]
       libsystem_pthread.dylib(thread_start+0xD) [0x7fff8e4fc3ed]
      

            Assignee:
            misha.tyulenev@mongodb.com Misha Tyulenev (Inactive)
            Reporter:
            kaloian.manassiev@mongodb.com Kaloian Manassiev
            Votes:
            0 Vote for this issue
            Watchers:
            9 Start watching this issue

              Created:
              Updated:
              Resolved: