<!-- 
RSS generated by JIRA (9.7.1#970001-sha1:2222b88b221c4928ef0de3161136cc90c8356a66) at Thu Feb 08 04:11:01 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>MongoDB Jira</title>
    <link>https://jira.mongodb.org</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.7.1</version>
        <build-number>970001</build-number>
        <build-date>13-04-2023</build-date>
    </build-info>


<item>
            <title>[SERVER-26055] Server fills cache rapidly, even under replication only and then operates very slowing causing oplog backlog. When serving clients queries take ages to return once cache is &apos;full&apos;</title>
                <link>https://jira.mongodb.org/browse/SERVER-26055</link>
                <project id="10000" key="SERVER">Core Server</project>
                    <description>&lt;p&gt;We upgraded a secondary of a 3 node cluster to 3.2.9.&lt;/p&gt;

&lt;p&gt;By default when we upgrade we use iptables to allow replication but block clients.&lt;/p&gt;

&lt;p&gt;Upon allowing clients cache went up to ~96% and failed to drop. Only 1 (of 16) cores appeared to be in use.&lt;/p&gt;

&lt;p&gt;Blocking clients, restarting and allowing replication caused the oplog to catch up but still over time the cache fills and the performance hits rock bottom.&lt;/p&gt;

&lt;p&gt;mongostat&lt;/p&gt;
&lt;p/&gt;
&lt;div id=&quot;syntaxplugin&quot; class=&quot;syntaxplugin&quot; style=&quot;border: 1px dashed #bbb; border-radius: 5px !important; overflow: auto; max-height: 30em;&quot;&gt;
&lt;table cellspacing=&quot;0&quot; cellpadding=&quot;0&quot; border=&quot;0&quot; width=&quot;100%&quot; style=&quot;font-size: 1em; line-height: 1.4em !important; font-weight: normal; font-style: normal; color: black;&quot;&gt;
		&lt;tbody &gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;  margin-top: 10px;   width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;insert query update delete getmore command % dirty % used flushes vsize   res qr|qw ar|aw netIn netOut conn   set repl                 time&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;   width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;    *6    *0    *14     *1       0    14|0     1.3   96.0       0 21.9G 21.1G   0|0  0|16 1.23k   127k   16 floow  SEC 2016-09-10T20:18:27Z&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;   width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;    *1    *0    *21     *2       0    13|0     1.3   96.0       0 21.9G 21.1G   0|0  0|16 1.07k   127k   16 floow  SEC 2016-09-10T20:18:28Z&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;   width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;    *0    *0     *0     *0       0     9|0     1.3   96.0       0 21.9G 21.1G   0|0  0|16  917b  93.2k   16 floow  SEC 2016-09-10T20:18:29Z&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;   width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;    *9    *0    *29     *1       0    12|0     1.3   96.0       0 21.9G 21.1G   0|0  0|16 1.01k   126k   16 floow  SEC 2016-09-10T20:18:30Z&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;   width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;    *2    *0     *4     *1       0    13|0     1.3   96.0       0 21.9G 21.1G   0|0  0|16 1.17k   126k   16 floow  SEC 2016-09-10T20:18:31Z&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;   margin-bottom: 10px;  width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;   *24    *0   *161    *10       0    14|0     1.3   96.0       0 21.9G 21.1G   0|0  0|15 1.13k   127k   16 floow  SEC 2016-09-10T20:18:32Z&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
			&lt;/tbody&gt;
&lt;/table&gt;
&lt;/div&gt;
&lt;p/&gt;

&lt;p&gt;iostat:&lt;/p&gt;
&lt;p/&gt;
&lt;div id=&quot;syntaxplugin&quot; class=&quot;syntaxplugin&quot; style=&quot;border: 1px dashed #bbb; border-radius: 5px !important; overflow: auto; max-height: 30em;&quot;&gt;
&lt;table cellspacing=&quot;0&quot; cellpadding=&quot;0&quot; border=&quot;0&quot; width=&quot;100%&quot; style=&quot;font-size: 1em; line-height: 1.4em !important; font-weight: normal; font-style: normal; color: black;&quot;&gt;
		&lt;tbody &gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;  margin-top: 10px;   width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;09/10/2016 08:18:51 PM&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;   width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;avg-cpu:  %user   %nice %system %iowait  %steal   %idle&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;   width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;           6.51    0.00    0.06    0.13    0.00   93.30&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;   width: auto; padding: 0;&quot;&gt;&amp;nbsp;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;   width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;Device:         rrqm/s   wrqm/s     r/s     w/s    rkB/s    wkB/s avgrq-sz avgqu-sz   await r_await w_await  svctm  %util&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;   width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;xvda              0.00     0.00   22.00    0.00    92.00     0.00     8.36     0.04    2.00    2.00    0.00   0.18   0.40&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;   width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;xvdh              0.00    99.00   21.00   17.00   260.00  1292.00    81.68     0.06    1.47    0.57    2.59   0.63   2.40&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;   margin-bottom: 10px;  width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;xvdz              0.00     0.00    0.00    0.00     0.00     0.00     0.00     0.00    0.00    0.00    0.00   0.00   0.00&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
			&lt;/tbody&gt;
&lt;/table&gt;
&lt;/div&gt;
&lt;p/&gt;

&lt;p&gt;replication status (command took 10-15min to return):&lt;/p&gt;
&lt;p/&gt;
&lt;div id=&quot;syntaxplugin&quot; class=&quot;syntaxplugin&quot; style=&quot;border: 1px dashed #bbb; border-radius: 5px !important; overflow: auto; max-height: 30em;&quot;&gt;
&lt;table cellspacing=&quot;0&quot; cellpadding=&quot;0&quot; border=&quot;0&quot; width=&quot;100%&quot; style=&quot;font-size: 1em; line-height: 1.4em !important; font-weight: normal; font-style: normal; color: black;&quot;&gt;
		&lt;tbody &gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;  margin-top: 10px;   width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;db-node2(mongod-3.2.9)[SECONDARY:floow] test&amp;gt;rs.printReplicationInfo()&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;   width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;configured oplog size:   614400MB&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;   width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;log length start to end: 958414secs (266.23hrs)&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;   width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;oplog first event time:  Tue Aug 30 2016 14:06:44 GMT+0000 (UTC)&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;   width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;oplog last event time:   Sat Sep 10 2016 16:20:18 GMT+0000 (UTC)&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;   margin-bottom: 10px;  width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;now:                     Sat Sep 10 2016 20:25:01 GMT+0000 (UTC)&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
			&lt;/tbody&gt;
&lt;/table&gt;
&lt;/div&gt;
&lt;p/&gt;

&lt;p&gt;Upon restart (which often takes ages) replication catches up but then the cache fills and the scenario repeats.&lt;/p&gt;

&lt;p&gt;Note: Other nodes are running 3.0 still. &lt;/p&gt;

&lt;p&gt;I also experimented with changing WT parameters with no joy.&lt;/p&gt;

&lt;p&gt;We will downgrade but leaving at 3.2.9 with low priority for now to allow for diagnostics and logs if required.&lt;/p&gt;

&lt;p&gt;With 3.0 we still have cache filling issues but they occur once or twice a month, with our workload mmap was pretty much maintenance-free (very stable, minimal issues &lt;span class=&quot;error&quot;&gt;&amp;#91;except the disk usage&amp;#93;&lt;/span&gt;), 3.0 WT causes some pain but it&apos;s manageable, 3.2 WT is unusable.&lt;/p&gt;</description>
                <environment></environment>
        <key id="315419">SERVER-26055</key>
            <summary>Server fills cache rapidly, even under replication only and then operates very slowing causing oplog backlog. When serving clients queries take ages to return once cache is &apos;full&apos;</summary>
                <type id="1" iconUrl="https://jira.mongodb.org/secure/viewavatar?size=xsmall&amp;avatarId=14703&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.mongodb.org/images/icons/priorities/major.svg">Major - P3</priority>
                        <status id="6" iconUrl="https://jira.mongodb.org/images/icons/statuses/closed.png" description="The issue is considered finished, the resolution is correct. Issues which are closed can be reopened.">Closed</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="kelsey.schubert@mongodb.com">Kelsey Schubert</assignee>
                                    <reporter username="paul.ridgway">Paul Ridgway</reporter>
                        <labels>
                    </labels>
                <created>Sat, 10 Sep 2016 20:26:12 +0000</created>
                <updated>Sat, 19 Nov 2016 15:22:29 +0000</updated>
                            <resolved>Sat, 19 Nov 2016 15:22:29 +0000</resolved>
                                                                                        <votes>2</votes>
                                    <watches>15</watches>
                                                                                                                <comments>
                            <comment id="1395293" author="alexander.gorrod" created="Wed, 28 Sep 2016 01:31:59 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.mongodb.org/secure/ViewProfile.jspa?name=bigbourin%40gmail.com&quot; class=&quot;user-hover&quot; rel=&quot;bigbourin@gmail.com&quot;&gt;bigbourin@gmail.com&lt;/a&gt; you said:&lt;/p&gt;

&lt;blockquote&gt;&lt;p&gt;Our secondary - which is just used for replication, dump every night and some slow queries (stats) - started responding slowly and got stuck at 100% cpu, after 5 days in production. So this time it took much longer, our dumps didn&apos;t kill it, but it still ended-up in the same situation.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;That sounds like a different issue to the one discussed in this ticket. I recommend opening a new JIRA ticket with a description of your setup and workload and uploading MongoDB logs and diagnostic data from the machine where the issue occurs.&lt;/p&gt;</comment>
                            <comment id="1395157" author="bigbourin@gmail.com" created="Tue, 27 Sep 2016 21:52:29 +0000"  >&lt;p&gt;Thanks for the detailed explanation Thomas,&lt;/p&gt;

&lt;p&gt;I do understand why all these could cause big slow down to the server but I have a hard time understanding how can this kind of situation be unrecoverable ? in our case our server stayed for hours at 100% cpu with no requests and no IO usage (replication stuck at the same point in time). We let it like this to see if it would recover at some point as the load decreases (stopping our queries) but it never did and we had to restart. Can any of the issue you mentioned caused this, an infinite circle of death for the server?&lt;/p&gt;

&lt;p&gt;And by after reading all this posts we downgraded to 3.2.5 to avoid the huge regression we had in 3.2.9, it was much better BUT, we still got the same problem today. Our secondary - which is just used for replication, dump every night and some slow queries (stats) - started responding slowly and got stuck at 100% cpu, after 5 days in production. So this time it took much longer, our dumps didn&apos;t kill it, but it still ended-up in the same situation.&lt;/p&gt;

&lt;p&gt;We&apos;ll try the 3.2.10 of course but we&apos;re not sure if we&apos;ll be able to build a script to reproduce the take down of 3.2.5 so we can compare with 3.2.10. I&apos;ll have a look at the fixed bugs to maybe better understand how to replicate the issue quicker and maybe how to make sure we won&apos;t have the issue again with 3.2.10.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
Adrien&lt;/p&gt;</comment>
                            <comment id="1394968" author="thomas.schubert" created="Tue, 27 Sep 2016 19:19:13 +0000"  >&lt;p&gt;Hi &lt;a href=&quot;https://jira.mongodb.org/secure/ViewProfile.jspa?name=bigbourin%40gmail.com&quot; class=&quot;user-hover&quot; rel=&quot;bigbourin@gmail.com&quot;&gt;bigbourin@gmail.com&lt;/a&gt;,&lt;/p&gt;

&lt;p&gt;As you know, by default, when the WiredTiger cache usage goes over 95%, application threads (those performing operations) are involved in the eviction process. A number, or all, of the application threads begin looking for candidate pages to be evicted so that the cache usage can be reduced to its 80% target. &lt;/p&gt;

&lt;p&gt;To go into a bit more detail about this process, there is a single eviction server thread that walks the cache identifying potential candidate pages for eviction, and places them on a queue. At the same time, there are multiple eviction worker threads that take candidate pages from the queue and attempt to evict them, although for various reasons this may or may not succeed. When the cache utilization exceeds 95%, application threads also function as eviction worker threads and attempt to evict pages. This can be a source of performance problems if the eviction process is not efficient as the application threads are busy and cannot handle requests from clients.&lt;/p&gt;

&lt;p&gt;There are three general improvements to eviction in WiredTiger that fix distinct issues around the eviction process. These improvements correct issues that are all observed when cache reaches 95% and application threads start evicting pages. As a result, these issues have very similar symptoms even though the underlying cause is different.&lt;/p&gt;

&lt;p&gt;The first improvement was to make sure the eviction server was working as efficiently as possible to identify eviction candidates. Through MongoDB 3.2.9, this had been our focus and as a result of this effort, we resolved issues such as &lt;a href=&quot;https://jira.mongodb.org/browse/SERVER-24580&quot; title=&quot;Improve performance when WiredTiger cache is full&quot; class=&quot;issue-link&quot; data-issue-key=&quot;SERVER-24580&quot;&gt;&lt;del&gt;SERVER-24580&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;The second improvement was to improve the likelihood that candidates found by the eviction server can be successfully evicted by eviction workers (including application threads doing eviction). MongoDB 3.2.10 includes fixes that improve some cases where hazard pointers block page eviction and we are confident that eviction is working more efficiently in general as a result of these code changes.&lt;/p&gt;

&lt;p&gt;The third improvement requires a bit of background knowledge. The WiredTiger eviction server makes space available in cache by walking the set of pages currently in cache and selecting good candidates for eviction. The server concentrates those walks on parts of the cache where it has found good candidates in the recent past. Therefore, it is possible that when a new read load is introduced, the portion of the cache that holds good candidates will change. Subsequently, for a period of time after the change in cache content, the eviction server will spend more time looking for candidates in unproductive portions of the cache. Specifically, we believe that it is spending a lot of time walking trees with a lot of internal pages and very few leaf pages, since internal pages would be excluded from eviction at this stage. MongoDB 3.2.10 includes &lt;a href=&quot;https://jira.mongodb.org/browse/WT-2902&quot; title=&quot;Enhance eviction to work better with update heavy YCSB load&quot; class=&quot;issue-link&quot; data-issue-key=&quot;WT-2902&quot;&gt;&lt;del&gt;WT-2902&lt;/del&gt;&lt;/a&gt; which causes the eviction server to spend less time looking for candidates in a section of the cache if it isn&apos;t finding enough.&lt;/p&gt;

&lt;p&gt;I hope that this explanation clarifies the work that we have been putting into resolving this issue and explains some of the diagnostic challenges we have faced, since many of the symptoms of these issues overlap and each workload stresses the system differently. To improve our diagnostic capabilities, we plan to include additional metrics, which will help to identify these types of issues more quickly, &lt;a href=&quot;https://jira.mongodb.org/browse/WT-2920&quot; title=&quot;Add statistic tracking application thread cache maintenance time&quot; class=&quot;issue-link&quot; data-issue-key=&quot;WT-2920&quot;&gt;&lt;del&gt;WT-2920&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;In summary, MongoDB 3.2.10 will include a number of fixes, which I think will benefit your workload. Please feel free to review the relevant WiredTiger tickets for additional details: &lt;a href=&quot;https://jira.mongodb.org/browse/WT-2902&quot; title=&quot;Enhance eviction to work better with update heavy YCSB load&quot; class=&quot;issue-link&quot; data-issue-key=&quot;WT-2902&quot;&gt;&lt;del&gt;WT-2902&lt;/del&gt;&lt;/a&gt;, &lt;a href=&quot;https://jira.mongodb.org/browse/WT-2816&quot; title=&quot;Improve WiredTiger eviction performance&quot; class=&quot;issue-link&quot; data-issue-key=&quot;WT-2816&quot;&gt;&lt;del&gt;WT-2816&lt;/del&gt;&lt;/a&gt;/&lt;a href=&quot;https://jira.mongodb.org/browse/WT-2866&quot; title=&quot;Eviction server algorithm tuning&quot; class=&quot;issue-link&quot; data-issue-key=&quot;WT-2866&quot;&gt;&lt;del&gt;WT-2866&lt;/del&gt;&lt;/a&gt;, &lt;a href=&quot;https://jira.mongodb.org/browse/WT-2928&quot; title=&quot;Eviction failing to switch queues can lead to starvation&quot; class=&quot;issue-link&quot; data-issue-key=&quot;WT-2928&quot;&gt;&lt;del&gt;WT-2928&lt;/del&gt;&lt;/a&gt;, and &lt;a href=&quot;https://jira.mongodb.org/browse/WT-2924&quot; title=&quot;Ensure we are doing eviction when threads are waiting for it&quot; class=&quot;issue-link&quot; data-issue-key=&quot;WT-2924&quot;&gt;&lt;del&gt;WT-2924&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;Kind regards,&lt;br/&gt;
Thomas&lt;/p&gt;</comment>
                            <comment id="1386398" author="bigbourin@gmail.com" created="Fri, 16 Sep 2016 16:30:25 +0000"  >&lt;p&gt;Hi all, I just wanted to add that we recently upgraded our 3.2.9 replica from MMAP to WT and discovered this bug which almost took our platform down. Fortunately we managed to catch the problem on our secondary during a nightly dump which slowed the server to a crawl up to a point it couldn&apos;t catch-up with primary or execute a single command (with 100% CPU usage and 0% IO) until we restart the process. We searched for an explanation for hours before suspecting a bug in mongoDB itself and we finally looked at the bug tracker to find this was actually a regression in WT introduced after 3.2.5.&lt;/p&gt;

&lt;p&gt;I&apos;m happy to see that this issue is being worked on and probably fixed in 3.2.10 but also sad that this regression could stay unfixed in a production release for months, we&apos;ll never be confident to upgrade MongoDB &#9785;.&lt;/p&gt;

&lt;p&gt;We temporarily downgraded to 3.2.5 and it fixed the issue as expected, we expect to test 3.2.10 when out because we can&apos;t really leave our standard package management process. Is there any clear explanation of what the regression is, what has been done to fix it (is it just like 3.2.5?) and what has been done to prevent this in the future ?&lt;/p&gt;</comment>
                            <comment id="1386374" author="bartosz.debski" created="Fri, 16 Sep 2016 16:00:36 +0000"  >&lt;p&gt;Hi, So we have ran 3.2.10rc0 in production for more than a day. As WiredTiger cache was our main concern, below is a behaviour graph of WT Cache since 5PM yesterday. As Paul mentioned earlier this release looks much better than 3.2.9 but there is still some fluctuations on 3.2.10rc0 on size of the cache. This manageable so it&apos;s not a worry in a current state.&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;image-wrap&quot; style=&quot;&quot;&gt;&lt;img src=&quot;https://jira.mongodb.org/secure/attachment/138800/138800_graphite.thefloow.net.png&quot; style=&quot;border: 0px solid black&quot; /&gt;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;I hope this helps to confirm or analyse further any fixes.&lt;/p&gt;</comment>
                            <comment id="1384769" author="thomas.schubert" created="Wed, 14 Sep 2016 21:56:19 +0000"  >&lt;p&gt;Hi &lt;a href=&quot;https://jira.mongodb.org/secure/ViewProfile.jspa?name=paul.ridgway&quot; class=&quot;user-hover&quot; rel=&quot;paul.ridgway&quot;&gt;paul.ridgway&lt;/a&gt;,&lt;/p&gt;

&lt;p&gt;I&apos;m glad to hear your testing is going well. The production release of MongoDB 3.2.10 should be out within a week.&lt;/p&gt;

&lt;p&gt;Kind regards,&lt;br/&gt;
Thomas&lt;/p&gt;</comment>
                            <comment id="1384673" author="paul.ridgway" created="Wed, 14 Sep 2016 20:22:29 +0000"  >&lt;p&gt;Hi all,&lt;/p&gt;

&lt;p&gt;So far this is looking good. We let replication alone run for a number of hours which worked well - during that time I pressured WT by dropping the eviction threshold all the way down to 30% in 10% increments and it all kept up fine.&lt;/p&gt;

&lt;p&gt;Now we&apos;re back to defaults (80%) and allowing it to serve some reads too, still looking good.&lt;/p&gt;

&lt;p&gt;We&apos;ll open it up to all traffic tomorrow and test further.&lt;/p&gt;

&lt;p&gt;Do you know the intended release date of 3.2.10 stable?&lt;/p&gt;

&lt;p&gt;Cheers&lt;/p&gt;</comment>
                            <comment id="1384119" author="bartosz.debski" created="Wed, 14 Sep 2016 10:15:18 +0000"  >&lt;p&gt;Hi Thomas, thanks for this, I will apply RC release today and will update later on.&lt;/p&gt;</comment>
                            <comment id="1383865" author="thomas.schubert" created="Tue, 13 Sep 2016 22:23:45 +0000"  >&lt;p&gt;Hi &lt;a href=&quot;https://jira.mongodb.org/secure/ViewProfile.jspa?name=paul%40thefloow.com&quot; class=&quot;user-hover&quot; rel=&quot;paul@thefloow.com&quot;&gt;paul@thefloow.com&lt;/a&gt; and &lt;a href=&quot;https://jira.mongodb.org/secure/ViewProfile.jspa?name=bartosz.debski&quot; class=&quot;user-hover&quot; rel=&quot;bartosz.debski&quot;&gt;bartosz.debski&lt;/a&gt;,&lt;/p&gt;

&lt;p&gt;I have completed my analysis of the diagnostic.data you have provided, and I believe that we are looking at the same issue as described in &lt;a href=&quot;https://jira.mongodb.org/browse/SERVER-26001&quot; title=&quot;Insert workload stalled at 96% cache utilization&quot; class=&quot;issue-link&quot; data-issue-key=&quot;SERVER-26001&quot;&gt;&lt;del&gt;SERVER-26001&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;image-wrap&quot; style=&quot;&quot;&gt;&lt;img src=&quot;https://jira.mongodb.org/secure/attachment/138475/138475_application-eviction-threads.png&quot; width=&quot;100%&quot; style=&quot;border: 0px solid black&quot; /&gt;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;When the cache hits 96%, we see that application threads begin evicting pages. At the same time there is a significant increase in the number of eviction calls finding an empty eviction queue. This indicates that the application threads are starved for work to do.&lt;/p&gt;

&lt;p&gt;We have seen &lt;a href=&quot;https://jira.mongodb.org/browse/SERVER-26001?focusedCommentId=1379229&amp;amp;page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-1379229&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;significant improvements&lt;/a&gt; to this behavior when using MongoDB 3.2.9 with a newer version of WiredTiger. MongoDB 3.2.10-rc0 includes these fixes to WiredTiger, and we expect that it will correct this behavior.&lt;/p&gt;

&lt;p&gt;You can download &lt;a href=&quot;http://downloads.mongodb.org/linux/mongodb-linux-x86_64-ubuntu1404-3.2.10-rc0.tgz&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;MongoDB 3.2.10-rc0&lt;/a&gt; to confirm that this issue is resolved. Please let me know if you have any questions.&lt;/p&gt;

&lt;p&gt;Thank you,&lt;br/&gt;
Thomas&lt;/p&gt;</comment>
                            <comment id="1383053" author="dan@10gen.com" created="Tue, 13 Sep 2016 12:06:49 +0000"  >&lt;p&gt;I understand and that makes perfect sense.  We are cutting the RC today (hopefully ready this afternoon) and we plan to analyze your data today as well.  I certainly wouldn&apos;t expect you to try out a release candidate in this case without good reason to expect it will address your issue.&lt;/p&gt;

&lt;p&gt;Thanks again for your help.&lt;/p&gt;</comment>
                            <comment id="1382965" author="bartosz.debski" created="Tue, 13 Sep 2016 09:55:20 +0000"  >&lt;p&gt;Hi Dan, how sure are you guys that the RC will address the issues we are seeing? I think there is a mutual benefit in us testing for you guys, but as we can&apos;t replicate this issue on any other database we have, we are forced to test it on our Live environment, as Paul mentioned. This makes us worry. &lt;/p&gt;

&lt;p&gt;I will test this RC for you guys with only oplog replication allowed and hidden set to true. Provided that all runs OK for a day or two, we can then enable some client traffic, but ideally we would like some info on the analysis of the diagnostic data before we test this RC. &lt;/p&gt;</comment>
                            <comment id="1382868" author="paul.ridgway" created="Tue, 13 Sep 2016 05:00:55 +0000"  >&lt;p&gt;Thanks Dan. This is a production environment so I&apos;m not sure (unfortunately our non-prod envs do not have the same traffic pattern), in any case I will discuss it with the ops team.&lt;/p&gt;</comment>
                            <comment id="1382864" author="dan@10gen.com" created="Tue, 13 Sep 2016 04:38:18 +0000"  >&lt;p&gt;Thanks Paul.   You can downgrade. We are preparing a release candidate tomorrow which has a handful of fixes for some of the symptoms you describe.  We will analyze the diagnostic data to check whether it fits the pattern.  If so, would you be willing to test the release candidate with your workload?&lt;br/&gt;
Thanks again.  &lt;/p&gt;</comment>
                            <comment id="1381982" author="paul.ridgway" created="Mon, 12 Sep 2016 14:25:15 +0000"  >&lt;p&gt;Done (logs-diag.7z)&lt;/p&gt;

&lt;p&gt;If this is enough, please let us know so we can downgrade and restore use.&lt;/p&gt;</comment>
                            <comment id="1381870" author="thomas.schubert" created="Mon, 12 Sep 2016 13:20:22 +0000"  >&lt;p&gt;Hi &lt;a href=&quot;https://jira.mongodb.org/secure/ViewProfile.jspa?name=paul.ridgway&quot; class=&quot;user-hover&quot; rel=&quot;paul.ridgway&quot;&gt;paul.ridgway&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Please use this &lt;a href=&quot;https://10gen-httpsupload.s3.amazonaws.com/upload_forms/8ee54f9a-c5c6-4e41-9a21-0eb68ad54af9.html&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;secure upload portal&lt;/a&gt; to upload the files.&lt;/p&gt;

&lt;p&gt;Thank you for your help,&lt;br/&gt;
Thomas&lt;/p&gt;</comment>
                            <comment id="1381508" author="paul.ridgway" created="Mon, 12 Sep 2016 07:19:00 +0000"  >&lt;p&gt;Sure, can you provide an upload link?&lt;/p&gt;</comment>
                            <comment id="1381442" author="dan@10gen.com" created="Mon, 12 Sep 2016 02:49:36 +0000"  >&lt;p&gt;Hi Paul,&lt;br/&gt;
Thanks for submitting this issue.&lt;br/&gt;
Could you provide us with the &lt;a href=&quot;https://docs.mongodb.com/manual/release-notes/3.2/#diagnostic-data-capture&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;diagnostic data&lt;/a&gt; files and run logs from the node running 3.2?&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                            <outwardlinks description="duplicates">
                                        <issuelink>
            <issuekey id="314402">SERVER-25974</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="138475" name="application-eviction-threads.png" size="219750" author="kelsey.schubert@mongodb.com" created="Tue, 13 Sep 2016 22:19:46 +0000"/>
                            <attachment id="138800" name="graphite.thefloow.net.png" size="150394" author="bartosz.debski" created="Fri, 16 Sep 2016 15:58:31 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                <customfield id="customfield_10050" key="com.atlassian.jira.toolkit:comments">
                        <customfieldname># Replies</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>17.0</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                <customfield id="customfield_10055" key="com.atlassian.jira.ext.charting:firstresponsedate">
                        <customfieldname>Date of 1st Reply</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Mon, 12 Sep 2016 02:49:36 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10052" key="com.atlassian.jira.toolkit:dayslastcommented">
                        <customfieldname>Days since reply</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>7 years, 20 weeks, 1 day ago</customfieldvalue>
    
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_18254" key="com.onresolve.jira.groovy.groovyrunner:scripted-field">
                        <customfieldname>Dependencies</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue><![CDATA[]]></customfieldvalue>


                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_15850" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    <customfield id="customfield_10057" key="com.atlassian.jira.toolkit:lastusercommented">
                        <customfieldname>Last comment by Customer</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>true</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10056" key="com.atlassian.jira.toolkit:lastupdaterorcommenter">
                        <customfieldname>Last commenter</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>kelsey.schubert@mongodb.com</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_11151" key="com.atlassian.jira.toolkit:LastCommentDate">
                        <customfieldname>Last public comment date</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>7 years, 20 weeks, 1 day ago</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                    <customfield id="customfield_10032" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Operating System</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10026"><![CDATA[ALL]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                <customfield id="customfield_10051" key="com.atlassian.jira.toolkit:participants">
                        <customfieldname>Participants</customfieldname>
                        <customfieldvalues>
                                        <customfieldvalue>bigbourin@gmail.com</customfieldvalue>
            <customfieldvalue>alexander.gorrod@mongodb.com</customfieldvalue>
            <customfieldvalue>bartosz.debski</customfieldvalue>
            <customfieldvalue>dan@mongodb.com</customfieldvalue>
            <customfieldvalue>kelsey.schubert@mongodb.com</customfieldvalue>
            <customfieldvalue>paul.ridgway</customfieldvalue>
    
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                        <customfield id="customfield_14254" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Product Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hrjwe7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                <customfield id="customfield_12550" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>2|hspmon:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10558" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_23361" key="com.onresolve.jira.groovy.groovyrunner:scripted-field">
                        <customfieldname>Requested By</customfieldname>
                        <customfieldvalues>
                                

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            <customfield id="customfield_10053" key="com.atlassian.jira.ext.charting:timeinstatus">
                        <customfieldname>Time In Status</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_22870" key="com.onresolve.jira.groovy.groovyrunner:scripted-field">
                        <customfieldname>Triagers</customfieldname>
                        <customfieldvalues>
                                    <customfieldvalue><![CDATA[kelsey.schubert@mongodb.com]]></customfieldvalue>
    

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                    <customfield id="customfield_14350" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>serverRank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hsek47:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                    </customfields>
    </item>
</channel>
</rss>