<!-- 
RSS generated by JIRA (9.7.1#970001-sha1:2222b88b221c4928ef0de3161136cc90c8356a66) at Thu Feb 08 05:39:03 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>MongoDB Jira</title>
    <link>https://jira.mongodb.org</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.7.1</version>
        <build-number>970001</build-number>
        <build-date>13-04-2023</build-date>
    </build-info>


<item>
            <title>[SERVER-56361] Better FTDC diagnostics of RSM, including server-side isMaster handling</title>
                <link>https://jira.mongodb.org/browse/SERVER-56361</link>
                <project id="10000" key="SERVER">Core Server</project>
                    <description>&lt;p&gt;We need to corner possible problems from HELP ticket by detecting stuckness in processing topology changes both at the client and server side.&lt;/p&gt;

&lt;p&gt;The main target of such diagnostics is to point to the areas where the processing could be substantially delayed with &quot;in flight&quot; counters - counters that increment on code area entry and decrement (possibly with latency attached) when the control exits the block. This is something that cannot be properly addressed with logging without being unnecessarily verbose. &lt;/p&gt;

&lt;p&gt;The data that FTDC can display easily is: how many threads are blocked waiting to enter a code section related to RSM (both server and client side) and what is the average latency for the threads that already left that section. &lt;/p&gt;</description>
                <environment></environment>
        <key id="1691704">SERVER-56361</key>
            <summary>Better FTDC diagnostics of RSM, including server-side isMaster handling</summary>
                <type id="4" iconUrl="https://jira.mongodb.org/secure/viewavatar?size=xsmall&amp;avatarId=14710&amp;avatarType=issuetype">Improvement</type>
                                            <priority id="3" iconUrl="https://jira.mongodb.org/images/icons/priorities/major.svg">Major - P3</priority>
                        <status id="6" iconUrl="https://jira.mongodb.org/images/icons/statuses/closed.png" description="The issue is considered finished, the resolution is correct. Issues which are closed can be reopened.">Closed</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="13201">Fixed</resolution>
                                        <assignee username="andrew.shuvalov@mongodb.com">Andrew Shuvalov</assignee>
                                    <reporter username="andrew.shuvalov@mongodb.com">Andrew Shuvalov</reporter>
                        <labels>
                            <label>post-rc0</label>
                    </labels>
                <created>Mon, 26 Apr 2021 17:32:53 +0000</created>
                <updated>Sun, 29 Oct 2023 21:54:29 +0000</updated>
                            <resolved>Wed, 2 Jun 2021 14:24:14 +0000</resolved>
                                    <version>4.0.24</version>
                                    <fixVersion>4.0.25</fixVersion>
                                                        <votes>0</votes>
                                    <watches>8</watches>
                                                                                                                <comments>
                            <comment id="3853584" author="xgen-internal-githook" created="Wed, 2 Jun 2021 13:44:47 +0000"  >&lt;p&gt;Author:&lt;/p&gt;
{&apos;name&apos;: &apos;Andrew Shuvalov&apos;, &apos;email&apos;: &apos;andrew.shuvalov@mongodb.com&apos;, &apos;username&apos;: &apos;shuvalov-mdb&apos;}
&lt;p&gt;Message: &lt;a href=&quot;https://jira.mongodb.org/browse/SERVER-56361&quot; title=&quot;Better FTDC diagnostics of RSM, including server-side isMaster handling&quot; class=&quot;issue-link&quot; data-issue-key=&quot;SERVER-56361&quot;&gt;&lt;del&gt;SERVER-56361&lt;/del&gt;&lt;/a&gt;: Better FTDC diagnostics of RSM&lt;br/&gt;
Branch: v4.0&lt;br/&gt;
&lt;a href=&quot;https://github.com/mongodb/mongo/commit/70842af1543116b91c94072285a33f0384308b71&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/mongodb/mongo/commit/70842af1543116b91c94072285a33f0384308b71&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="3850844" author="xgen-internal-githook" created="Tue, 1 Jun 2021 13:09:33 +0000"  >&lt;p&gt;Author:&lt;/p&gt;
{&apos;name&apos;: &apos;Andrew Shuvalov&apos;, &apos;email&apos;: &apos;andrew.shuvalov@mongodb.com&apos;, &apos;username&apos;: &apos;shuvalov-mdb&apos;}
&lt;p&gt;Message: &lt;a href=&quot;https://jira.mongodb.org/browse/SERVER-56361&quot; title=&quot;Better FTDC diagnostics of RSM, including server-side isMaster handling&quot; class=&quot;issue-link&quot; data-issue-key=&quot;SERVER-56361&quot;&gt;&lt;del&gt;SERVER-56361&lt;/del&gt;&lt;/a&gt;: revert FTDC diagnostics&lt;br/&gt;
Branch: v4.0&lt;br/&gt;
&lt;a href=&quot;https://github.com/mongodb/mongo/commit/8b565a174c9edf5451b6325085cebe0f1afe1e3a&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/mongodb/mongo/commit/8b565a174c9edf5451b6325085cebe0f1afe1e3a&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="3848707" author="JIRAUSER1256988" created="Fri, 28 May 2021 23:55:26 +0000"  >&lt;p&gt;However this will require some optional changes in FTDC gui, after 5.0 port is made.&lt;/p&gt;</comment>
                            <comment id="3848704" author="xgen-internal-githook" created="Fri, 28 May 2021 23:41:10 +0000"  >&lt;p&gt;Author:&lt;/p&gt;
{&apos;name&apos;: &apos;Andrew Shuvalov&apos;, &apos;email&apos;: &apos;andrew.shuvalov@mongodb.com&apos;, &apos;username&apos;: &apos;shuvalov-mdb&apos;}
&lt;p&gt;Message: &lt;a href=&quot;https://jira.mongodb.org/browse/SERVER-56361&quot; title=&quot;Better FTDC diagnostics of RSM, including server-side isMaster handling&quot; class=&quot;issue-link&quot; data-issue-key=&quot;SERVER-56361&quot;&gt;&lt;del&gt;SERVER-56361&lt;/del&gt;&lt;/a&gt;: Better FTDC diagnostics of RSM&lt;br/&gt;
Branch: v4.0&lt;br/&gt;
&lt;a href=&quot;https://github.com/mongodb/mongo/commit/e4faa4d08720df067a62d8e48f66944a2f2b76cc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/mongodb/mongo/commit/e4faa4d08720df067a62d8e48f66944a2f2b76cc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="3773727" author="JIRAUSER1256988" created="Fri, 14 May 2021 14:18:13 +0000"  >&lt;p&gt;I decreased the period over which max latency is calculated to 1 sec, it was 10 sec before. However we should not be afraid that it will create &quot;flickering&quot; and miss the incident on the chart because the previous max latency will be erased by new data only. If RSM is not doing any new requests for say 10 seconds, the last max latency will &quot;stick&quot; for 10 seconds because there is no new data.&lt;/p&gt;

&lt;p&gt;However if RSM is very busy and doing a lot of requests max latency will be erased quickly. But I don&apos;t think it&apos;s a problem because I also added some extra backup logging.&lt;/p&gt;</comment>
                            <comment id="3743505" author="bruce.lucas@10gen.com" created="Wed, 28 Apr 2021 12:29:11 +0000"  >&lt;p&gt;Some general patterns we use:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;Number of threads doing something
	&lt;ul&gt;
		&lt;li&gt;Instantaneous counts, e.g. serverStatus.connections.active. This is easy and fast to collect, but suffers from possible sampling errors when charted&lt;/li&gt;
		&lt;li&gt;Cumulative time spent, e.g. the various timeAcquiringMicros for locks, flow control, etc. When differentiated and scaled this tells you the number of threads waiting. To be useful though this must be updated at least several times per second while waiting (given FTDC 1-second sample rate). Since this is a cumulative metric it is less subject to sampling errors when charted.&lt;/li&gt;
	&lt;/ul&gt;
	&lt;/li&gt;
	&lt;li&gt;Time spent doing something, i.e. latency - see for example serverStatus.opLatencies. A cumulative count of operations and cumulative time is recorded in serverStatus, and from this an average latency over time can be computed for charting. This works well for sub-second latencies. If the primary concern is with longer latencies, consider logging events instead using the &quot;durationMillis&quot; attribute.&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;Distribution percentiles is probably overkill, and also doesn&apos;t sample well when charting at various time scales, so we don&apos;t generally do it.&lt;/p&gt;

&lt;p&gt;There is a serverStatus.repl section that might be a good home for this, although I see there may be some relationship to sharding (e.g. &quot;shard01&quot; metric in your example?) and&lt;/p&gt;

&lt;p&gt;I noticed that some of your metrics relate to the &quot;hello&quot; command. We already record some metrics related to this, don&apos;t know if sufficient for what you want to do.&lt;/p&gt;

&lt;p&gt;Also some of your metrics seem to be per-shard? We can&apos;t record per-shard information in FTDC because some customers have a very large number of shards.&lt;/p&gt;
</comment>
                            <comment id="3742780" author="JIRAUSER1256988" created="Wed, 28 Apr 2021 00:37:23 +0000"  >&lt;p&gt;The generic task description: suppose we have a congested block of code that requires access to some limited resource. Some concurrent jobs are trying to execute this block - this could be multiple threads or tasks waiting in a queue. We want to collect and plot metrics of two different kinds:&lt;/p&gt;

&lt;ol&gt;
	&lt;li&gt;The current count of threads or tasks waiting before entering this block&lt;/li&gt;
	&lt;li&gt;The average and possibly 95% latency of executing this block, which includes both waiting and execution time&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;The concrete example above - I was instrumenting the code around ReplicaSetMonitor src/mongo/client/replica_set_monitor.cpp to register delays in obtaining the current primary for a given ReplicaSet.&lt;/p&gt;

&lt;p&gt;Unfortunately I&apos;m not using the proper patterns as I&apos;m not yet familiar enough with the FTDC code base. I was creating my own implementation by subclassing the &lt;em&gt;class FTDCCollectorInterface&lt;/em&gt; and registering it with:&lt;/p&gt;

&lt;p/&gt;
&lt;div id=&quot;syntaxplugin&quot; class=&quot;syntaxplugin&quot; style=&quot;border: 1px dashed #bbb; border-radius: 5px !important; overflow: auto; max-height: 30em;&quot;&gt;
&lt;table cellspacing=&quot;0&quot; cellpadding=&quot;0&quot; border=&quot;0&quot; width=&quot;100%&quot; style=&quot;font-size: 1em; line-height: 1.4em !important; font-weight: normal; font-style: normal; color: black;&quot;&gt;
		&lt;tbody &gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;  margin-top: 10px;   margin-bottom: 10px;  width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;    controller-&amp;gt;addPeriodicCollector(stdx::make_unique&amp;lt;ReplicaSetMonitorStatsFTDC&amp;gt;());&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
			&lt;/tbody&gt;
&lt;/table&gt;
&lt;/div&gt;
&lt;p/&gt;

&lt;p&gt;inside registerMongoSCollectors() in src/mongo/db/ftdc/ftdc_mongos.cpp. I think the proper way should be somehow in ServerStatus, but I can&apos;t figure out where exactly ServerStatus is bridged to FTDC and how its hierarchical structure is bridged to flat FTDC, and how exactly the periodic collectors are fetching data from server status.&lt;/p&gt;

&lt;p&gt;The second question is how and where we handle a time-series of latencies. I see that we have a few &quot;average&quot; kinds of metrics, so we do have some statistical time periods processing somewhere. It&apos;s probably not different from any Gaussian or Gaussian-like stream of discrete data, but latency distribution is the most common example. Ideally, I would also collect 95-percentile, assuming the accuracy is not important, the rough algorithm could be reasonably simple:&lt;/p&gt;

&lt;ol&gt;
	&lt;li&gt;We collect the histogram of discrete data over 10-seconds window, this will give average and 95%. The baskets are allocated dynamically because we don&apos;t know the distribution yet&lt;/li&gt;
	&lt;li&gt;After 10 seconds we move the histogram to &quot;previous&quot; position and create a new empty histogram. However, as we already know the distribution for the previous 10 seconds we can pre-allocate baskets the most optimal way and have fewer baskets. My hunch is that if we know the distribution in advance, 6-7 polynomial baskets will be sufficient for visual correctness. My previous experience with histograms with &amp;gt;20 baskets tells me this granularity doesn&apos;t give much additional information anyway.&lt;/li&gt;
&lt;/ol&gt;
</comment>
                            <comment id="3742756" author="JIRAUSER1256988" created="Tue, 27 Apr 2021 23:57:22 +0000"  >&lt;p&gt;Example of metrics generated with my experimental code (explanation in the next comment):&lt;/p&gt;

&lt;p&gt; &lt;span class=&quot;image-wrap&quot; style=&quot;&quot;&gt;&lt;img src=&quot;https://jira.mongodb.org/secure/attachment/312182/312182_Screen+Shot+2021-04-27+at+7.53.32+PM.png&quot; width=&quot;70%&quot; style=&quot;border: 0px solid black&quot; /&gt;&lt;/span&gt; &lt;/p&gt;</comment>
                            <comment id="3741526" author="JIRAUSER1256988" created="Tue, 27 Apr 2021 16:06:18 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.mongodb.org/secure/ViewProfile.jspa?name=bruce.lucas&quot; class=&quot;user-hover&quot; rel=&quot;bruce.lucas&quot;&gt;bruce.lucas&lt;/a&gt; I updated the description, still not keeping it very specific and open for discussion. We got feedback from &lt;a href=&quot;https://jira.mongodb.org/secure/ViewProfile.jspa?name=jonathan.streets&quot; class=&quot;user-hover&quot; rel=&quot;jonathan.streets&quot;&gt;jonathan.streets&lt;/a&gt;:&lt;br/&gt;
&quot;1. using the existing serverStatus infrastructure and &lt;br/&gt;
 2. we should use accumulative counters, not decaying averages. &quot;&lt;/p&gt;

&lt;p&gt;So I would like to redo the implementation above to be compliant (the initial version was used in help ticket for troubleshooting). I will work offline with &lt;a href=&quot;https://jira.mongodb.org/secure/ViewProfile.jspa?name=bruce.lucas&quot; class=&quot;user-hover&quot; rel=&quot;bruce.lucas&quot;&gt;bruce.lucas&lt;/a&gt; to get necessary guidance for proper solution.&lt;/p&gt;</comment>
                            <comment id="3740819" author="bruce.lucas@10gen.com" created="Tue, 27 Apr 2021 12:38:10 +0000"  >&lt;p&gt;Can you please post a description of the proposed FTDC changes?&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10420">
                    <name>Backports</name>
                                            <outwardlinks description="backported by">
                                                        </outwardlinks>
                                                        </issuelinktype>
                            <issuelinktype id="10012">
                    <name>Related</name>
                                            <outwardlinks description="related to">
                                        <issuelink>
            <issuekey id="1769184">SERVER-57449</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="312182" name="Screen Shot 2021-04-27 at 7.53.32 PM.png" size="725485" author="andrew.shuvalov@mongodb.com" created="Tue, 27 Apr 2021 23:53:54 +0000"/>
                            <attachment id="314331" name="Screen Shot 2021-05-11 at 12.23.16 PM.png" size="210239" author="andrew.shuvalov@mongodb.com" created="Tue, 11 May 2021 16:23:47 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                <customfield id="customfield_10050" key="com.atlassian.jira.toolkit:comments">
                        <customfieldname># Replies</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>10.0</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                            <customfield id="customfield_12450" key="com.atlassian.jira.plugin.system.customfieldtypes:multicheckboxes">
                        <customfieldname>Backport Requested</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="21777"><![CDATA[v5.0]]></customfieldvalue>
    
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10011" key="com.atlassian.jira.plugin.system.customfieldtypes:radiobuttons">
                        <customfieldname>Backwards Compatibility</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10038"><![CDATA[Fully Compatible]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                            <customfield id="customfield_10055" key="com.atlassian.jira.ext.charting:firstresponsedate">
                        <customfieldname>Date of 1st Reply</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Tue, 27 Apr 2021 12:38:10 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10052" key="com.atlassian.jira.toolkit:dayslastcommented">
                        <customfieldname>Days since reply</customfieldname>
                        <customfieldvalues>
                                        2 years, 36 weeks ago
    
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_18254" key="com.onresolve.jira.groovy.groovyrunner:scripted-field">
                        <customfieldname>Dependencies</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue><![CDATA[]]></customfieldvalue>


                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_15850" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                        <customfield id="customfield_17050" key="com.atlassian.jira.plugin.system.customfieldtypes:radiobuttons">
                        <customfieldname>Downstream Team Attention</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="16941"><![CDATA[Not Needed]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    <customfield id="customfield_10057" key="com.atlassian.jira.toolkit:lastusercommented">
                        <customfieldname>Last comment by Customer</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>true</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10056" key="com.atlassian.jira.toolkit:lastupdaterorcommenter">
                        <customfieldname>Last commenter</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>luke.bonanomi@mongodb.com</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_11151" key="com.atlassian.jira.toolkit:LastCommentDate">
                        <customfieldname>Last public comment date</customfieldname>
                        <customfieldvalues>
                            2 years, 36 weeks ago
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_16465" key="com.onresolve.jira.groovy.groovyrunner:scripted-field">
                        <customfieldname>Linked BF Score</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>180.0</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                        <customfield id="customfield_10051" key="com.atlassian.jira.toolkit:participants">
                        <customfieldname>Participants</customfieldname>
                        <customfieldvalues>
                                        <customfieldvalue>andrew.shuvalov@mongodb.com</customfieldvalue>
            <customfieldvalue>bruce.lucas@mongodb.com</customfieldvalue>
            <customfieldvalue>xgen-internal-githook</customfieldvalue>
    
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                        <customfield id="customfield_14254" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Product Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hz6o0v:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                <customfield id="customfield_12550" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>2|hyrl5r:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10558" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_23361" key="com.onresolve.jira.groovy.groovyrunner:scripted-field">
                        <customfieldname>Requested By</customfieldname>
                        <customfieldvalues>
                                

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                <customfield id="customfield_10053" key="com.atlassian.jira.ext.charting:timeinstatus">
                        <customfieldname>Time In Status</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_22870" key="com.onresolve.jira.groovy.groovyrunner:scripted-field">
                        <customfieldname>Triagers</customfieldname>
                        <customfieldvalues>
                                

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                    <customfield id="customfield_14350" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>serverRank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hz6a9z:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                    </customfields>
    </item>
</channel>
</rss>