Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-65712

In config_server_health_observer_crash.js, fix race in shutdown

    • Fully Compatible
    • ALL
    • 41

      There are a few config_server_health_observer_crash.js test failures in which the sharded test's .stop() method complains that the mongos terminated with an error:

      [js_test:config_server_health_observer_crash] c20842| 2022-04-13T16:12:45.673+00:00 I  CONTROL  23138   [SignalHandler] "Shutting down","attr":{"exitCode":0}
      [js_test:config_server_health_observer_crash] | 2022-04-13T16:12:45.707Z I  -        22819   [thread1] "Waiting for process to terminate.","attr":{"pid":"118251"}
      [js_test:config_server_health_observer_crash] b20831| 2022-04-13T16:12:45.707Z I  CONTROL  23377   [SignalHandler] "Received signal","attr":{"signal":15,"error":"Terminated"}
      [js_test:config_server_health_observer_crash] b20831| 2022-04-13T16:12:45.707Z I  CONTROL  23378   [SignalHandler] "Signal was sent by kill(2)","attr":{"pid":118180,"uid":1000}
      [js_test:config_server_health_observer_crash] b20831| 2022-04-13T16:12:45.707Z I  CONTROL  23381   [SignalHandler] "will terminate after current cmd ends"
      [js_test:config_server_health_observer_crash] b20831| 2022-04-13T16:12:45.707Z I  NETWORK  23017   [listener] "removing socket file","attr":{"path":"/tmp/mongodb-20831.sock"}
      [js_test:config_server_health_observer_crash] b20831| 2022-04-13T16:12:45.707Z I  NETWORK  22991   [SignalHandler] "Skip closing connection for connection","attr":{"connectionId":3}
      [js_test:config_server_health_observer_crash] b20831| 2022-04-13T16:12:45.707Z I  NETWORK  22991   [SignalHandler] "Skip closing connection for connection","attr":{"connectionId":1}
      [js_test:config_server_health_observer_crash] b20831| 2022-04-13T16:12:45.707Z I  CONTROL  23138   [SignalHandler] "Shutting down","attr":{"exitCode":0}
      [js_test:config_server_health_observer_crash] | 2022-04-13T16:12:45.711Z I  -        22819   [thread1] "Waiting for process to terminate.","attr":{"pid":"121735"}
      [js_test:config_server_health_observer_crash] | 2022-04-13T16:12:45.711Z I  -        22823   [thread1] "Process exited with error code","attr":{"pid":"121735","code":-6}
      [js_test:config_server_health_observer_crash] | 2022-04-13T16:12:45.712Z I  -        22819   [thread1] "Waiting for process to terminate.","attr":{"pid":"118261"}
      [js_test:config_server_health_observer_crash] b20832| 2022-04-13T16:12:45.712Z I  CONTROL  23377   [SignalHandler] "Received signal","attr":{"signal":15,"error":"Terminated"}
      [js_test:config_server_health_observer_crash] b20832| 2022-04-13T16:12:45.712Z I  CONTROL  23378   [SignalHandler] "Signal was sent by kill(2)","attr":{"pid":118180,"uid":1000}
      [js_test:config_server_health_observer_crash] b20832| 2022-04-13T16:12:45.712Z I  CONTROL  23381   [SignalHandler] "will terminate after current cmd ends"
      [js_test:config_server_health_observer_crash] b20832| 2022-04-13T16:12:45.712Z I  NETWORK  23017   [listener] "removing socket file","attr":{"path":"/tmp/mongodb-20832.sock"}
      [js_test:config_server_health_observer_crash] b20832| 2022-04-13T16:12:45.712Z I  NETWORK  22991   [SignalHandler] "Skip closing connection for connection","attr":{"connectionId":3}
      [js_test:config_server_health_observer_crash] b20832| 2022-04-13T16:12:45.712Z I  NETWORK  22991   [SignalHandler] "Skip closing connection for connection","attr":{"connectionId":1}
      [js_test:config_server_health_observer_crash] b20832| 2022-04-13T16:12:45.712Z I  CONTROL  23138   [SignalHandler] "Shutting down","attr":{"exitCode":0}
      [js_test:config_server_health_observer_crash] one more more child processes exited with an error during jstests/sharding/health_monitor/config_server_health_observer_crash.js
      [js_test:config_server_health_observer_crash] exiting with code -7
      [js_test:config_server_health_observer_crash] | 2022-04-13T16:12:45.719Z I  QUERY    22791   [js] "Failed to end logical session","attr":{"lsid":{"id":{"$uuid":"8f8344d2-9980-48d9-9a90-c1b45711624a"}},"error":{"code":6,"codeName":"HostUnreachable","errmsg":"network error while attempting to run command 'endSessions' on host 'ip-10-122-15-188:20830' "}}
      
      

      The problem might be that `shell_utils::KillMongoProgramInstances()` did not return successfully. This may happen if `registry.getRegisteredPids()` still returns the mongos that was crashed by the test, i.e. its PID is still present in the registry's `_registeredPids`.

      To fix this, refresh the registry by calling `MongoRunner.runningChildPids()`, which iterates over every registered process ID, calls `ProgramRegistry::isPidDead()` on each one, and removes those that are already gone with `unregisterProgram()`.

            Assignee:
            andrew.shuvalov@mongodb.com Andrew Shuvalov (Inactive)
            Reporter:
            andrew.shuvalov@mongodb.com Andrew Shuvalov (Inactive)
            Votes:
            0 Vote for this issue
            Watchers:
            2 Start watching this issue

              Created:
              Updated:
              Resolved: