<!-- 
RSS generated by JIRA (9.7.1#970001-sha1:2222b88b221c4928ef0de3161136cc90c8356a66) at Thu Feb 08 04:22:20 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>MongoDB Jira</title>
    <link>https://jira.mongodb.org</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.7.1</version>
        <build-number>970001</build-number>
        <build-date>13-04-2023</build-date>
    </build-info>


<item>
            <title>[SERVER-29980] Built-in hang detection diagnostics and recovery</title>
                <link>https://jira.mongodb.org/browse/SERVER-29980</link>
                <project id="10000" key="SERVER">Core Server</project>
                    <description>&lt;p&gt;It would be useful to automatically detect hangs (due for example to software bugs) and produce diagnostics such as complete stack traces for every thread and possibly (depending on degree of confidence that there is a hang) forcefully terminate the instance.&lt;/p&gt;</description>
                <environment></environment>
        <key id="400801">SERVER-29980</key>
            <summary>Built-in hang detection diagnostics and recovery</summary>
                <type id="2" iconUrl="https://jira.mongodb.org/secure/viewavatar?size=xsmall&amp;avatarId=14711&amp;avatarType=issuetype">New Feature</type>
                                            <priority id="3" iconUrl="https://jira.mongodb.org/images/icons/priorities/major.svg">Major - P3</priority>
                        <status id="6" iconUrl="https://jira.mongodb.org/images/icons/statuses/closed.png" description="The issue is considered finished, the resolution is correct. Issues which are closed can be reopened.">Closed</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="13201">Fixed</resolution>
                                        <assignee username="alyson.cabral@mongodb.com">Alyson Cabral</assignee>
                                    <reporter username="bruce.lucas@mongodb.com">Bruce Lucas</reporter>
                        <labels>
                            <label>SWDI</label>
                    </labels>
                <created>Wed, 5 Jul 2017 19:50:14 +0000</created>
                <updated>Mon, 8 Jan 2024 15:22:53 +0000</updated>
                            <resolved>Wed, 5 Feb 2020 17:07:23 +0000</resolved>
                                                    <fixVersion>4.2.0</fixVersion>
                                    <component>Diagnostics</component>
                                        <votes>35</votes>
                                    <watches>45</watches>
                                                                                                                <comments>
                            <comment id="2785115" author="alyson.cabral" created="Wed, 5 Feb 2020 17:06:24 +0000"  >&lt;p&gt;To reflect the improvements made by moving the storage node watchdog to community in 4.2, I&apos;m closing this ticket. Please open specific server tickets about expanding types of failure checks or any additional improvements going forward.&lt;/p&gt;</comment>
                            <comment id="2539922" author="bigbourin@gmail.com" created="Wed, 13 Nov 2019 23:07:07 +0000"  >&lt;p&gt;&lt;img class=&quot;emoticon&quot; src=&quot;https://jira.mongodb.org/images/icons/emoticons/thumbs_up.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="2539900" author="daniel.hatcher" created="Wed, 13 Nov 2019 22:54:35 +0000"  >&lt;p&gt;You&apos;re right; I&apos;ve opened &lt;a href=&quot;https://jira.mongodb.org/browse/DOCS-13223&quot; title=&quot;Add Storage Watchdog Exit Code&quot; class=&quot;issue-link&quot; data-issue-key=&quot;DOCS-13223&quot;&gt;&lt;del&gt;DOCS-13223&lt;/del&gt;&lt;/a&gt; to get it added.&lt;/p&gt;</comment>
                            <comment id="2539788" author="bigbourin@gmail.com" created="Wed, 13 Nov 2019 21:49:46 +0000"  >&lt;p&gt;Oh ok great, that should probably be added to &lt;a href=&quot;https://docs.mongodb.com/manual/reference/exit-codes/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://docs.mongodb.com/manual/reference/exit-codes/&lt;/a&gt; then &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.mongodb.org/images/icons/emoticons/wink.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;br/&gt;
Then it worked fine for my case &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.mongodb.org/images/icons/emoticons/wink.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="2539786" author="schwerin" created="Wed, 13 Nov 2019 21:47:27 +0000"  >&lt;p&gt;Code 61 is indeed the watchdog. The watchdog does no logging when it terminates the process, lest it get stuck trying to write to the dead disk. It&apos;s surprisingly tricky to maybe-log when the reason you might fail to log is a dead disk, so you&apos;ve really got to watch the code.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;For reference,&#160;&lt;a href=&quot;https://github.com/mongodb/mongo/blob/b375698b7fe1f4e69761559f1cad50c5e1f18014/src/mongo/util/exit_code.h#L56&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;here&apos;s a link to the definition of the exit code.&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="2539701" author="bigbourin@gmail.com" created="Wed, 13 Nov 2019 21:17:29 +0000"  >&lt;p&gt;Ok thanks! I managed to upgrade my primary to 4.0 and secondary to 4.2 in order to be able to test the storage watchdog on my server with the faulty SSD (not replaced yet). So I can still reproduce the dead IO performance situation by adding the faulty SSD back into the RAID array, I tried to do that with the watchdog on (60s), but unfortunately I wasn&apos;t able to confirm if the watchdog detected anything or stopped mongo because mongo died without logging anything with a status exit code of 61 (not documented). I suspect this is because the log file is also on the dead IO disk (/var) and this caused some internal exceptions / timeout. Is there any way to see metrics about storage test? any message to expect in the logs? what would be the exit status in this case?&lt;/p&gt;</comment>
                            <comment id="2528576" author="daniel.hatcher" created="Mon, 11 Nov 2019 15:50:53 +0000"  >&lt;p&gt;The Watchdog&apos;s logic (as of 4.2.1) is very simple. There are two threads, a &quot;check&quot; and a &quot;monitor&quot;. The &quot;check&quot; thread constantly writes to a new file and then reads from said file. If it succeeds, it increments a counter. The &quot;monitor&quot; thread runs every &lt;a href=&quot;https://docs.mongodb.com/manual/reference/parameters/#param.watchdogPeriodSeconds&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;watchdogPeriodSeconds&lt;/a&gt; and looks at the counter. If the counter is ever the same across two runs of the &quot;monitor&quot; thread, that means that we were unable to write to disk for at minimum the length of watchdogPeriodSeconds and we intentionally shut down the server. &lt;/p&gt;</comment>
                            <comment id="2526806" author="bigbourin@gmail.com" created="Sun, 10 Nov 2019 10:09:10 +0000"  >&lt;p&gt;That is great news! I would love to try this (I still have the faulty SSD in the machine currently) though I&apos;m currently on 3.6 so the massive upgrade in a short time span sounds quite dangerous. Is there any more precision about what the watchdog detects as &quot;unresponsive&quot;? If the IO are slow but working for example, what&apos;s the test operation and the time threshold if any?&lt;/p&gt;</comment>
                            <comment id="2525177" author="daniel.hatcher" created="Fri, 8 Nov 2019 15:37:40 +0000"  >&lt;p&gt;The &lt;a href=&quot;https://docs.mongodb.com/manual/administration/monitoring/index.html#storage-node-watchdog&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;Storage Node Watchdog&lt;/a&gt; is available for the Community version of the database in 4.2.0. It&apos;s off by default but if it was enabled by setting the &lt;a href=&quot;https://docs.mongodb.com/manual/reference/parameters/#param.watchdogPeriodSeconds&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;watchdogPeriodSeconds parameter&lt;/a&gt; it&apos;s possible that it would have caught the issue you&apos;ve most recently experienced. However, as Matt said in his last comment, we&apos;re still looking into better ways to catch these kinds of issues so your feedback is much appreciated!&lt;/p&gt;</comment>
                            <comment id="2524177" author="bigbourin@gmail.com" created="Fri, 8 Nov 2019 08:35:49 +0000"  >&lt;p&gt;For the record I just had the same issue again (SSD failure, mongo dead but doesn&apos;t know it) and thankfully &lt;a href=&quot;https://gist.github.com/jarthod/ffe9d88bbab13bebafba12265be7c1aa&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;the little script I wrote&lt;/a&gt; saved me from hours of downtime, it stopped mongo which forced failover and only caused about 10 minutes downtime, which is long only because I try to stop mongo gracefully and it takes ages.&lt;/p&gt;</comment>
                            <comment id="2213606" author="matt.lord" created="Mon, 15 Apr 2019 17:00:14 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.mongodb.org/secure/ViewProfile.jspa?name=bigbourin%40gmail.com&quot; class=&quot;user-hover&quot; rel=&quot;bigbourin@gmail.com&quot;&gt;bigbourin@gmail.com&lt;/a&gt;, I&apos;m very sorry to hear about the problems that this caused for you. Your post-mortem analysis is very helpful, so thank you for all of the details!&lt;/p&gt;

&lt;p&gt;Note that we&apos;re currently discussing short term methods that could offer some help in these specific types of cases where the I/O subsystem is stalled for long periods of time, while also discussing medium term plans to address the more general issue where a node is unable to perform meaningful work and cannot make progress for whatever reason. So your input here is very timely for those discussions. Thank you again!&lt;/p&gt;</comment>
                            <comment id="2213531" author="bigbourin@gmail.com" created="Mon, 15 Apr 2019 16:19:05 +0000"  >&lt;p&gt;Hi, to give a bit more details about the case I encountered:&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;In my case the SSD disks were unresponsive to WRITE only (they were fast with read). They were still responding but more than 100 times slower, which creates delays of sometimes several minutes to execute a simple command (like db.getReplicationInfo). This is amplified by the writing of logs from mongo I guess.&lt;/li&gt;
	&lt;li&gt;When this happened I tried to do a stepDown manually but had two issues: once I waited for 5 minutes and the command never responded, so I had to Ctrl+C it (probably waiting on disk), another time it was always returning after 10 seconds with the error saying the secondary isn&apos;t up-to-date, whereas it was as up-to-date as possible with the primary (but the primary was lagging so maybe this breaks your condition?). I had to force the stepDown to make it happen. Basically the primary and secondary were both at the same time, which was a couple hours in the past.&lt;/li&gt;
	&lt;li&gt;I made a little script to detect these scenarios and try to force shutdown on my side, I used db.getReplicationInfo() as a command executed periodically and if it takes more than 2 seconds to answer up to 5 times (10s between each attempt) I consider the mongo in a bad state and stop it. (I don&apos;t even try the stepDown as it&apos;s not reliable). I&apos;ve tried this process once with my current setup and it worked.&lt;/li&gt;
	&lt;li&gt;I still have one faulty SSD (not replaced yet) in my server so I can try some patches or commands if you want to see how it behaves in this situation. I can toggle the problem on and off by adding/removing the faulty disk to the RAID array.&lt;/li&gt;
&lt;/ul&gt;
</comment>
                            <comment id="2201380" author="bigbourin@gmail.com" created="Wed, 3 Apr 2019 23:28:21 +0000"  >&lt;p&gt;Hi, I just had another 6.5h outage due to an I/O issue (similar to&#160;&lt;a href=&quot;https://jira.mongodb.org/browse/SERVER-31457&quot; title=&quot;Mongod stop responding, takes 200 load and don&amp;#39;t even switch to secondary&quot; class=&quot;issue-link&quot; data-issue-key=&quot;SERVER-31457&quot;&gt;&lt;del&gt;SERVER-31457&lt;/del&gt;&lt;/a&gt; or &lt;a href=&quot;https://jira.mongodb.org/browse/SERVER-14139&quot; title=&quot;Disk failure on one node can (eventually) block a whole cluster&quot; class=&quot;issue-link&quot; data-issue-key=&quot;SERVER-14139&quot;&gt;&lt;del&gt;SERVER-14139&lt;/del&gt;&lt;/a&gt;) but this time I was able to SSH on the machine so I confirmed one disk was failing which made the RAID array 200 times slower than usual and mongo stopped responding without falling back to secondary as expected unfortunately. Funny thing is that I got the alerts from CloudManager saying the node was not responding, but for the RS heartbeat it was still ok and stayed Primary...&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Do you guys have any plan of improving this in community edition or will this stay a rich people privilege (&lt;a href=&quot;https://jira.mongodb.org/browse/SERVER-29947&quot; title=&quot;Implement Storage Node Watchdog&quot; class=&quot;issue-link&quot; data-issue-key=&quot;SERVER-29947&quot;&gt;&lt;del&gt;SERVER-29947&lt;/del&gt;&lt;/a&gt;)?&lt;/p&gt;

&lt;p&gt;I&apos;m also interested if you have a &quot;recommended&quot; way to monitor this from the outside to force mongo failover.&lt;/p&gt;</comment>
                            <comment id="1624966" author="max.hirschhorn@10gen.com" created="Tue, 18 Jul 2017 17:13:19 +0000"  >&lt;p&gt;After chatting with &lt;a href=&quot;https://jira.mongodb.org/secure/ViewProfile.jspa?name=pasette&quot; class=&quot;user-hover&quot; rel=&quot;pasette&quot;&gt;pasette&lt;/a&gt; about this ticket, I&apos;m moving it over to the Platforms team to triage.&lt;/p&gt;</comment>
                            <comment id="1616941" author="bruce.lucas@10gen.com" created="Sat, 8 Jul 2017 11:40:36 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.mongodb.org/secure/ViewProfile.jspa?name=daniel.hatcher&quot; class=&quot;user-hover&quot; rel=&quot;daniel.hatcher&quot;&gt;daniel.hatcher&lt;/a&gt;, they are related, but (as written) &lt;a href=&quot;https://jira.mongodb.org/browse/SERVER-14139&quot; title=&quot;Disk failure on one node can (eventually) block a whole cluster&quot; class=&quot;issue-link&quot; data-issue-key=&quot;SERVER-14139&quot;&gt;&lt;del&gt;SERVER-14139&lt;/del&gt;&lt;/a&gt; is narrower.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Depends</name>
                                            <outwardlinks description="depends on">
                                                        </outwardlinks>
                                                        </issuelinktype>
                            <issuelinktype id="10012">
                    <name>Related</name>
                                            <outwardlinks description="related to">
                                        <issuelink>
            <issuekey id="442150">SERVER-31457</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="139677">SERVER-14139</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="1001700">DOCS-13223</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                <customfield id="customfield_10050" key="com.atlassian.jira.toolkit:comments">
                        <customfieldname># Replies</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>15.0</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10011" key="com.atlassian.jira.plugin.system.customfieldtypes:radiobuttons">
                        <customfieldname>Backwards Compatibility</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10038"><![CDATA[Fully Compatible]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                    <customfield id="customfield_13552" key="com.go2group.jira.plugin.crm:crm_generic_field">
                        <customfieldname>Case</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue><![CDATA[[500A000000Zfu0qIAB, 500A000000YfSpcIAF, 5002K00000dH5fWQAS, 5002K00000iOnypQAC, 5002K00000iNAmMQAW, 5002K00000iyf49QAA, 5002K00000kCGC8QAO]]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                            <customfield id="customfield_10055" key="com.atlassian.jira.ext.charting:firstresponsedate">
                        <customfieldname>Date of 1st Reply</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Thu, 6 Jul 2017 10:10:06 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10052" key="com.atlassian.jira.toolkit:dayslastcommented">
                        <customfieldname>Days since reply</customfieldname>
                        <customfieldvalues>
                                        4 years, 1 week ago
    
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_18254" key="com.onresolve.jira.groovy.groovyrunner:scripted-field">
                        <customfieldname>Dependencies</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue><![CDATA[<s><a href='https://jira.mongodb.org/browse/PM-1390'>PM-1390</a></s>]]></customfieldvalue>


                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_15850" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                        <customfield id="customfield_17050" key="com.atlassian.jira.plugin.system.customfieldtypes:radiobuttons">
                        <customfieldname>Downstream Team Attention</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="16941"><![CDATA[Not Needed]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    <customfield id="customfield_10057" key="com.atlassian.jira.toolkit:lastusercommented">
                        <customfieldname>Last comment by Customer</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>true</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10056" key="com.atlassian.jira.toolkit:lastupdaterorcommenter">
                        <customfieldname>Last commenter</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>luke.bonanomi@mongodb.com</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_11151" key="com.atlassian.jira.toolkit:LastCommentDate">
                        <customfieldname>Last public comment date</customfieldname>
                        <customfieldvalues>
                            4 years, 1 week ago
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                    <customfield id="customfield_10051" key="com.atlassian.jira.toolkit:participants">
                        <customfieldname>Participants</customfieldname>
                        <customfieldvalues>
                                        <customfieldvalue>bigbourin@gmail.com</customfieldvalue>
            <customfieldvalue>alyson.cabral@mongodb.com</customfieldvalue>
            <customfieldvalue>schwerin@mongodb.com</customfieldvalue>
            <customfieldvalue>bruce.lucas@mongodb.com</customfieldvalue>
            <customfieldvalue>daniel.hatcher@mongodb.com</customfieldvalue>
            <customfieldvalue>matt.lord</customfieldvalue>
            <customfieldvalue>max.hirschhorn@mongodb.com</customfieldvalue>
    
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                        <customfield id="customfield_14254" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Product Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hrjrlb:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                <customfield id="customfield_12550" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>2|hr9hpr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10558" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10053" key="com.atlassian.jira.ext.charting:timeinstatus">
                        <customfieldname>Time In Status</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_22870" key="com.onresolve.jira.groovy.groovyrunner:scripted-field">
                        <customfieldname>Triagers</customfieldname>
                        <customfieldvalues>
                                

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                    <customfield id="customfield_14350" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>serverRank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|ht9z5r:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                    </customfields>
    </item>
</channel>
</rss>