Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-65925

shardsvrCommitReshardCollection should check for transient errors when joining

    • Fully Compatible
    • ALL
    • v6.0, v5.0
    • Sharding NYC 2022-05-16, Sharding NYC 2022-05-30
    • 173

      orig title: Investigate fassert in ReshardingCoordinator caused by the recipient state document still existing after attempted commit

      orig desc

      As seen in BF-25036, the resharding recipient can, in some cases, fail to remove its state document during commit, which causes the coordinator to fassert. The sequence of events that leads to this should be determined, if possible.

      [js_test:resharding_histogram_metrics] c20274| 2022-04-23T17:38:17.982+00:00 F  RESHARD  5277000 [ReshardingCoordinatorService-0] "Unrecoverable error past the point resharding was guaranteed to succeed","attr":{"error":"Location5795303: Failed command { _shardsvrCommitReshardCollection: \"reshardingDb.coll\", reshardingUUID: UUID(\"b0129ec7-3d42-4687-8cc0-5ced88a64de0\"), writeConcern: { w: \"majority\" }, $audit: { $impersonatedUsers: [ { user: \"__system\", db: \"local\" } ], $impersonatedRoles: [] } } for database 'admin' on shard 'shard1' :: caused by :: Recipient state document still exists after attempted commit"}
      [js_test:resharding_histogram_metrics] c20274| 2022-04-23T17:38:17.982+00:00 F  ASSERT   23089   [ReshardingCoordinatorService-0] "Fatal assertion","attr":{"msgid":5277000,"file":"src/mongo/db/s/resharding/resharding_coordinator_service.cpp","line":1284}
      [js_test:resharding_histogram_metrics] c20274| 2022-04-23T17:38:17.982+00:00 F  ASSERT   23090   [ReshardingCoordinatorService-0] "\n\n***aborting after fassert() failure\n\n"
      [js_test:resharding_histogram_metrics] c20274| 2022-04-23T17:38:17.982+00:00 F  CONTROL  4757800 [ReshardingCoordinatorService-0] "Writing fatal message","attr":{"message":"Got signal: 6 (Aborted).\n"}
      [js_test:resharding_histogram_metrics] c20274| 2022-04-23T17:38:18.074+00:00 I  CONTROL  31380   [ReshardingCoordinatorService-0] "BACKTRACE","attr":{"bt":{"backtrace":[{"a":"55DE9569F39E","b":"55DE90B00000","o":"4B9F39E","s":"_ZN5mongo18stack_trace_detail12_GLOBAL__N_119printStackTraceImplERKNS1_7OptionsEPNS_14StackTraceSinkE.constprop.360","s+":"1FE"},{"a":"55DE956A1909","b":"55DE90B00000","o":"4BA1909","s":"_ZN5mongo15printStackTraceEv","s+":"29"},{"a":"55DE9569AE06","b":"55DE90B00000","o":"4B9AE06","s":"abruptQuit","s+":"66"},{"a":"7F78D0739D80","b":"7F78D0727000","o":"12D80","s":"funlockfile","s+":"50"},{"a":"7F78D039A93F","b":"7F78D0363000","o":"3793F","s":"gsignal","s+":"10F"},{"a":"7F78D0384C95","b":"7F78D0363000","o":"21C95","s":"abort","s+":"127"},{"a":"55DE928EDC32","b":"55DE90B00000","o":"1DEDC32","s":"_ZN5mongo25fassertFailedWithLocationEiPKcj","s+":"F6"},{"a":"55DE9245ADEF","b":"55DE90B00000","o":"195ADEF","s":"_ZZN5mongo15unique_functionIFNS_6StatusES1_EE8makeImplIZNS_28ReshardingCoordinatorService21ReshardingCoordinator32_commitAndFinishReshardOperationERKSt10shared_ptrINS_8executor18ScopedTaskExecutorEERKNS_29ReshardingCoordinatorDocumentEEUlS1_E3_EEDaOT_EN12SpecificImpl4callEOS1_.cold.4145","s+":"AF"},{"a":"55DE92D8D26A","b":"55DE90B00000","o":"228D26A","s":"_ZN5mongo14future_details10statusCallIRZZZNS_14ExecutorFutureIvE13_wrapCBHelperINS_15unique_functionIFNS_6StatusES6_EEEEEDaSt10shared_ptrINS_17OutOfLineExecutorEEOT_ENUlDpOT_E_clIJS6_EEEDaSG_ENUlS6_E_clES6_EUlvE_JNS0_8FakeVoidEEEEDaSD_DpOT0_","s+":"4A"},{"a":"55DE92D8D3C0","b":"55DE90B00000","o":"228D3C0","s":"_ZZN5mongo15unique_functionIFvNS_6StatusEEE8makeImplIZZNS_14ExecutorFutureIvE13_wrapCBHelperINS0_IFS1_S1_EEEEEDaSt10shared_ptrINS_17OutOfLineExecutorEEOT_ENUlDpOT_E_clIJS1_EEEDaSH_EUlS1_E_EEDaSE_EN12SpecificImpl4callEOS1_","s+":"A0"},{"a":"55DE95022A31","b":"55DE90B00000","o":"4522A31","s":"_ZZN5mongo15unique_functionIFvRKNS_8executor12TaskExecutor12CallbackArgsEEE8makeImplIZNS2_8sc
heduleENS0_IFvNS_6StatusEEEEEUlS5_E_EEDaOT_EN12SpecificImpl4callES5_","s+":"41"},{"a":"55DE939FE539","b":"55DE90B00000","o":"2EFE539","s":"_ZZN5mongo15unique_functionIFvRKNS_8executor12TaskExecutor12CallbackArgsEEE8makeImplIZNS1_18ScopedTaskExecutor4Impl13_wrapCallbackIZNSA_12scheduleWorkEOS7_EUlOT_E_S7_EENS_10StatusWithINS2_14CallbackHandleEEESE_OT0_EUlRKSD_E_EEDaSE_EN12SpecificImpl4callES5_","s+":"199"},{"a":"55DE94EC00A0","b":"55DE90B00000","o":"43C00A0","s":"_ZN5mongo8executor22ThreadPoolTaskExecutor11runCallbackESt10shared_ptrINS1_13CallbackStateEE","s+":"130"},{"a":"55DE94EC04A0","b":"55DE90B00000","o":"43C04A0","s":"_ZZN5mongo15unique_functionIFvNS_6StatusEEE8makeImplIZNS_8executor22ThreadPoolTaskExecutor23scheduleIntoPool_inlockEPNSt7__cxx114listISt10shared_ptrINS6_13CallbackStateEESaISB_EEERKSt14_List_iteratorISB_ESI_St11unique_lockINS_12latch_detail5LatchEEEUlT_E1_EEDaOSN_EN12SpecificImpl4callEOS1_","s+":"90"},{"a":"55DE9547E365","b":"55DE90B00000","o":"497E365","s":"_ZN5mongo10ThreadPool4Impl10_doOneTaskEPSt11unique_lockINS_12latch_detail5LatchEE","s+":"135"},{"a":"55DE9547F96B","b":"55DE90B00000","o":"497F96B","s":"_ZN5mongo10ThreadPool4Impl13_consumeTasksEv","s+":"8B"},{"a":"55DE95480D81","b":"55DE90B00000","o":"4980D81","s":"_ZN5mongo10ThreadPool4Impl17_workerThreadBodyERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE","s+":"161"},{"a":"55DE95481290","b":"55DE90B00000","o":"4981290","s":"_ZNSt6thread11_State_implINS_8_InvokerISt5tupleIJZN5mongo4stdx6threadC4IZNS3_10ThreadPool4Impl25_startWorkerThread_inlockEvEUlvE2_JELi0EEET_DpOT0_EUlvE_EEEEE6_M_runEv","s+":"60"},{"a":"55DE9583C86F","b":"55DE90B00000","o":"4D3C86F","s":"execute_native_thread_routine","s+":"F"},{"a":"7F78D072F2DE","b":"7F78D0727000","o":"82DE","s":"start_thread","s+":"FE"},{"a":"7F78D045FA63","b":"7F78D0363000","o":"FCA63","s":"clone","s+":"43"}],"processInfo":{"mongodbVersion":"5.3.1-44-g667c5b3","gitVersion":"667c5b359bb923f8e58c7a8af9cdc4500e410b7b","compiledModules":["enterpri
se"],"uname":{"sysname":"Linux","release":"4.18.0-80.1.2.el8_0.x86_64","version":"#1 SMP Sun Apr 28 09:21:22 UTC 2019","machine":"x86_64"},"somap":[{"b":"55DE90B00000","elfType":3,"buildId":"7107CA36721E6B3409C6F3B81EC368701071C850"},{"b":"7F78D0727000","path":"/lib64/libpthread.so.0","elfType":3,"buildId":"5326B8728FA01B7149DAC943100F1405533E76CE"},{"b":"7F78D0363000","path":"/lib64/libc.so.6","elfType":3,"buildId":"0598B7D6A05E64AE676133CF6331AF5578888AD0"}]}}}
      [js_test:resharding_histogram_metrics] c20274| 2022-04-23T17:38:18.074+00:00 I  CONTROL  31445   [ReshardingCoordinatorService-0] "Frame","attr":{"frame":{"a":"55DE9569F39E","b":"55DE90B00000","o":"4B9F39E","s":"_ZN5mongo18stack_trace_detail12_GLOBAL__N_119printStackTraceImplERKNS1_7OptionsEPNS_14StackTraceSinkE.constprop.360","s+":"1FE"}}
      [js_test:resharding_histogram_metrics] c20274| 2022-04-23T17:38:18.074+00:00 I  CONTROL  31445   [ReshardingCoordinatorService-0] "Frame","attr":{"frame":{"a":"55DE956A1909","b":"55DE90B00000","o":"4BA1909","s":"_ZN5mongo15printStackTraceEv","s+":"29"}}
      [js_test:resharding_histogram_metrics] c20274| 2022-04-23T17:38:18.074+00:00 I  CONTROL  31445   [ReshardingCoordinatorService-0] "Frame","attr":{"frame":{"a":"55DE9569AE06","b":"55DE90B00000","o":"4B9AE06","s":"abruptQuit","s+":"66"}}
      [js_test:resharding_histogram_metrics] c20274| 2022-04-23T17:38:18.074+00:00 I  CONTROL  31445   [ReshardingCoordinatorService-0] "Frame","attr":{"frame":{"a":"7F78D0739D80","b":"7F78D0727000","o":"12D80","s":"funlockfile","s+":"50"}}
      [js_test:resharding_histogram_metrics] c20274| 2022-04-23T17:38:18.074+00:00 I  CONTROL  31445   [ReshardingCoordinatorService-0] "Frame","attr":{"frame":{"a":"7F78D039A93F","b":"7F78D0363000","o":"3793F","s":"gsignal","s+":"10F"}}
      [js_test:resharding_histogram_metrics] c20274| 2022-04-23T17:38:18.074+00:00 I  CONTROL  31445   [ReshardingCoordinatorService-0] "Frame","attr":{"frame":{"a":"7F78D0384C95","b":"7F78D0363000","o":"21C95","s":"abort","s+":"127"}}
      [js_test:resharding_histogram_metrics] c20274| 2022-04-23T17:38:18.074+00:00 I  CONTROL  31445   [ReshardingCoordinatorService-0] "Frame","attr":{"frame":{"a":"55DE928EDC32","b":"55DE90B00000","o":"1DEDC32","s":"_ZN5mongo25fassertFailedWithLocationEiPKcj","s+":"F6"}}
      [js_test:resharding_histogram_metrics] c20274| 2022-04-23T17:38:18.074+00:00 I  CONTROL  31445   [ReshardingCoordinatorService-0] "Frame","attr":{"frame":{"a":"55DE9245ADEF","b":"55DE90B00000","o":"195ADEF","s":"_ZZN5mongo15unique_functionIFNS_6StatusES1_EE8makeImplIZNS_28ReshardingCoordinatorService21ReshardingCoordinator32_commitAndFinishReshardOperationERKSt10shared_ptrINS_8executor18ScopedTaskExecutorEERKNS_29ReshardingCoordinatorDocumentEEUlS1_E3_EEDaOT_EN12SpecificImpl4callEOS1_.cold.4145","s+":"AF"}}
      [js_test:resharding_histogram_metrics] c20274| 2022-04-23T17:38:18.074+00:00 I  CONTROL  31445   [ReshardingCoordinatorService-0] "Frame","attr":{"frame":{"a":"55DE92D8D26A","b":"55DE90B00000","o":"228D26A","s":"_ZN5mongo14future_details10statusCallIRZZZNS_14ExecutorFutureIvE13_wrapCBHelperINS_15unique_functionIFNS_6StatusES6_EEEEEDaSt10shared_ptrINS_17OutOfLineExecutorEEOT_ENUlDpOT_E_clIJS6_EEEDaSG_ENUlS6_E_clES6_EUlvE_JNS0_8FakeVoidEEEEDaSD_DpOT0_","s+":"4A"}}
      [js_test:resharding_histogram_metrics] c20274| 2022-04-23T17:38:18.074+00:00 I  CONTROL  31445   [ReshardingCoordinatorService-0] "Frame","attr":{"frame":{"a":"55DE92D8D3C0","b":"55DE90B00000","o":"228D3C0","s":"_ZZN5mongo15unique_functionIFvNS_6StatusEEE8makeImplIZZNS_14ExecutorFutureIvE13_wrapCBHelperINS0_IFS1_S1_EEEEEDaSt10shared_ptrINS_17OutOfLineExecutorEEOT_ENUlDpOT_E_clIJS1_EEEDaSH_EUlS1_E_EEDaSE_EN12SpecificImpl4callEOS1_","s+":"A0"}}
      [js_test:resharding_histogram_metrics] c20274| 2022-04-23T17:38:18.074+00:00 I  CONTROL  31445   [ReshardingCoordinatorService-0] "Frame","attr":{"frame":{"a":"55DE95022A31","b":"55DE90B00000","o":"4522A31","s":"_ZZN5mongo15unique_functionIFvRKNS_8executor12TaskExecutor12CallbackArgsEEE8makeImplIZNS2_8scheduleENS0_IFvNS_6StatusEEEEEUlS5_E_EEDaOT_EN12SpecificImpl4callES5_","s+":"41"}}
      [js_test:resharding_histogram_metrics] c20274| 2022-04-23T17:38:18.074+00:00 I  CONTROL  31445   [ReshardingCoordinatorService-0] "Frame","attr":{"frame":{"a":"55DE939FE539","b":"55DE90B00000","o":"2EFE539","s":"_ZZN5mongo15unique_functionIFvRKNS_8executor12TaskExecutor12CallbackArgsEEE8makeImplIZNS1_18ScopedTaskExecutor4Impl13_wrapCallbackIZNSA_12scheduleWorkEOS7_EUlOT_E_S7_EENS_10StatusWithINS2_14CallbackHandleEEESE_OT0_EUlRKSD_E_EEDaSE_EN12SpecificImpl4callES5_","s+":"199"}}
      [js_test:resharding_histogram_metrics] c20274| 2022-04-23T17:38:18.074+00:00 I  CONTROL  31445   [ReshardingCoordinatorService-0] "Frame","attr":{"frame":{"a":"55DE94EC00A0","b":"55DE90B00000","o":"43C00A0","s":"_ZN5mongo8executor22ThreadPoolTaskExecutor11runCallbackESt10shared_ptrINS1_13CallbackStateEE","s+":"130"}}
      [js_test:resharding_histogram_metrics] c20274| 2022-04-23T17:38:18.074+00:00 I  CONTROL  31445   [ReshardingCoordinatorService-0] "Frame","attr":{"frame":{"a":"55DE94EC04A0","b":"55DE90B00000","o":"43C04A0","s":"_ZZN5mongo15unique_functionIFvNS_6StatusEEE8makeImplIZNS_8executor22ThreadPoolTaskExecutor23scheduleIntoPool_inlockEPNSt7__cxx114listISt10shared_ptrINS6_13CallbackStateEESaISB_EEERKSt14_List_iteratorISB_ESI_St11unique_lockINS_12latch_detail5LatchEEEUlT_E1_EEDaOSN_EN12SpecificImpl4callEOS1_","s+":"90"}}
      [js_test:resharding_histogram_metrics] c20274| 2022-04-23T17:38:18.074+00:00 I  CONTROL  31445   [ReshardingCoordinatorService-0] "Frame","attr":{"frame":{"a":"55DE9547E365","b":"55DE90B00000","o":"497E365","s":"_ZN5mongo10ThreadPool4Impl10_doOneTaskEPSt11unique_lockINS_12latch_detail5LatchEE","s+":"135"}}
      [js_test:resharding_histogram_metrics] c20274| 2022-04-23T17:38:18.074+00:00 I  CONTROL  31445   [ReshardingCoordinatorService-0] "Frame","attr":{"frame":{"a":"55DE9547F96B","b":"55DE90B00000","o":"497F96B","s":"_ZN5mongo10ThreadPool4Impl13_consumeTasksEv","s+":"8B"}}
      [js_test:resharding_histogram_metrics] c20274| 2022-04-23T17:38:18.074+00:00 I  CONTROL  31445   [ReshardingCoordinatorService-0] "Frame","attr":{"frame":{"a":"55DE95480D81","b":"55DE90B00000","o":"4980D81","s":"_ZN5mongo10ThreadPool4Impl17_workerThreadBodyERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE","s+":"161"}}
      [js_test:resharding_histogram_metrics] c20274| 2022-04-23T17:38:18.074+00:00 I  CONTROL  31445   [ReshardingCoordinatorService-0] "Frame","attr":{"frame":{"a":"55DE95481290","b":"55DE90B00000","o":"4981290","s":"_ZNSt6thread11_State_implINS_8_InvokerISt5tupleIJZN5mongo4stdx6threadC4IZNS3_10ThreadPool4Impl25_startWorkerThread_inlockEvEUlvE2_JELi0EEET_DpOT0_EUlvE_EEEEE6_M_runEv","s+":"60"}}
      [js_test:resharding_histogram_metrics] c20274| 2022-04-23T17:38:18.074+00:00 I  CONTROL  31445   [ReshardingCoordinatorService-0] "Frame","attr":{"frame":{"a":"55DE9583C86F","b":"55DE90B00000","o":"4D3C86F","s":"execute_native_thread_routine","s+":"F"}}
      [js_test:resharding_histogram_metrics] c20274| 2022-04-23T17:38:18.074+00:00 I  CONTROL  31445   [ReshardingCoordinatorService-0] "Frame","attr":{"frame":{"a":"7F78D072F2DE","b":"7F78D0727000","o":"82DE","s":"start_thread","s+":"FE"}}
      [js_test:resharding_histogram_metrics] c20274| 2022-04-23T17:38:18.074+00:00 I  CONTROL  31445   [ReshardingCoordinatorService-0] "Frame","attr":{"frame":{"a":"7F78D045FA63","b":"7F78D0363000","o":"FCA63","s":"clone","s+":"43"}}
      [js_test:resharding_histogram_metrics] d20272| 2022-04-23T17:38:18.177+00:00 I  NETWORK  22944   [conn47] "Connection ended","attr":{"remote":"10.122.91.80:37460","uuid":"ac218d47-af8b-4b87-8934-abda5fb43e4c","connectionId":47,"connectionCount":19}
      [js_test:resharding_histogram_metrics] d20272| 2022-04-23T17:38:18.177+00:00 I  NETWORK  22944   [conn30] "Connection ended","attr":{"remote":"10.122.91.80:37380","uuid":"2a0584b8-1204-4772-a4ec-232ce02ac209","connectionId":30,"connectionCount":18}
      [js_test:resharding_histogram_metrics] d20271| 2022-04-23T17:38:18.178+00:00 I  NETWORK  22944   [conn17] "Connection ended","attr":{"remote":"10.122.91.80:45480","uuid":"95014d14-f90b-4a1c-887e-7b3ddf977dbb","connectionId":17,"connectionCount":13}
      [js_test:resharding_histogram_metrics] d20271| 2022-04-23T17:38:18.178+00:00 I  CONNPOOL 22566   [ReplicaSetMonitor-TaskExecutor] "Ending connection due to bad connection status","attr":{"hostAndPort":"ip-10-122-91-80.ec2.internal:20274","error":"HostUnreachable: Connection closed by peer","numOpenConns":1}
      [js_test:resharding_histogram_metrics] d20272| 2022-04-23T17:38:18.178+00:00 I  CONNPOOL 22566   [ReplicaSetMonitor-TaskExecutor] "Ending connection due to bad connection status","attr":{"hostAndPort":"ip-10-122-91-80.ec2.internal:20274","error":"HostUnreachable: Connection closed by peer","numOpenConns":1}
      [js_test:resharding_histogram_metrics] d20272| 2022-04-23T17:38:18.178+00:00 I  -        20883   [conn15] "Interrupted operation as its client disconnected","attr":{"opId":1301}
      [js_test:resharding_histogram_metrics] d20272| 2022-04-23T17:38:18.178+00:00 I  NETWORK  22944   [conn17] "Connection ended","attr":{"remote":"10.122.91.80:37330","uuid":"d024aab2-67bd-44f7-9f6f-5aceabfaf648","connectionId":17,"connectionCount":17}
      [js_test:resharding_histogram_metrics] d20272| 2022-04-23T17:38:18.178+00:00 I  NETWORK  22944   [conn16] "Connection ended","attr":{"remote":"10.122.91.80:37328","uuid":"8f76b8ee-15ad-41af-baf4-6489a95188cb","connectionId":16,"connectionCount":16}
      [js_test:resharding_histogram_metrics] d20272| 2022-04-23T17:38:18.178+00:00 I  -        4333222 [ReplicaSetMonitor-TaskExecutor] "RSM received error response","attr":{"host":"ip-10-122-91-80.ec2.internal:20274","error":"HostUnreachable: Connection closed by peer","replicaSet":"resharding_histogram_metrics-configRS","response":"{}"}
      [js_test:resharding_histogram_metrics] d20272| 2022-04-23T17:38:18.178+00:00 I  NETWORK  4712102 [ReplicaSetMonitor-TaskExecutor] "Host failed in replica set","attr":{"replicaSet":"resharding_histogram_metrics-configRS","host":"ip-10-122-91-80.ec2.internal:20274","error":{"code":6,"codeName":"HostUnreachable","errmsg":"Connection closed by peer"},"action":{"dropConnections":true,"requestImmediateCheck":true}}
      [js_test:resharding_histogram_metrics] d20272| 2022-04-23T17:38:18.178+00:00 I  CONNPOOL 22572   [ReplicaSetMonitor-TaskExecutor] "Dropping all pooled connections","attr":{"hostAndPort":"ip-10-122-91-80.ec2.internal:20274","error":"PooledConnectionsDropped: Pooled connections dropped"}
      [js_test:resharding_histogram_metrics] d20272| 2022-04-23T17:38:18.178+00:00 I  -        4333227 [ReplicaSetMonitor-TaskExecutor] "RSM monitoring host in expedited mode until we detect a primary","attr":{"host":"ip-10-122-91-80.ec2.i
      

            Assignee:
            randolph@mongodb.com Randolph Tan
            Reporter:
            brett.nawrocki@mongodb.com Brett Nawrocki
            Votes:
            0 Vote for this issue
            Watchers:
            3 Start watching this issue

              Created:
              Updated:
              Resolved: