<!-- 
RSS generated by JIRA (9.7.1#970001-sha1:2222b88b221c4928ef0de3161136cc90c8356a66) at Thu Feb 08 06:19:42 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>MongoDB Jira</title>
    <link>https://jira.mongodb.org</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.7.1</version>
        <build-number>970001</build-number>
        <build-date>13-04-2023</build-date>
    </build-info>


<item>
            <title>[SERVER-71683] unbounded memory growth during tenant migrations</title>
                <link>https://jira.mongodb.org/browse/SERVER-71683</link>
                <project id="10000" key="SERVER">Core Server</project>
                    <description>&lt;p&gt;It appears that there is no backpressure between reading from a donor and writing on a recipient; there is an &lt;a href=&quot;https://github.com/10gen/mongo/blob/397f5973dce8b4ea065e829eae7d14054187f6bf/src/mongo/db/repl/tenant_collection_cloner.cpp#L512&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;in-memory buffer&lt;/a&gt; that lives on the recipient that tenant migration writer threads pull from to perform writes.  This buffer can grow without bound if writing on the recipient is significantly slower compared to reading on the donor.&lt;/p&gt;
</description>
                <environment></environment>
        <key id="2197992">SERVER-71683</key>
            <summary>unbounded memory growth during tenant migrations</summary>
                <type id="1" iconUrl="https://jira.mongodb.org/secure/viewavatar?size=xsmall&amp;avatarId=14703&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.mongodb.org/images/icons/priorities/critical.svg">Critical - P2</priority>
                        <status id="6" iconUrl="https://jira.mongodb.org/images/icons/statuses/closed.png" description="The issue is considered finished, the resolution is correct. Issues which are closed can be reopened.">Closed</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="13201">Fixed</resolution>
                                        <assignee username="suganthi.mani@mongodb.com">Suganthi Mani</assignee>
                                    <reporter username="milkie@mongodb.com">Eric Milkie</reporter>
                        <labels>
                    </labels>
                <created>Tue, 29 Nov 2022 21:37:56 +0000</created>
                <updated>Sun, 29 Oct 2023 21:29:57 +0000</updated>
                            <resolved>Wed, 7 Dec 2022 13:12:06 +0000</resolved>
                                                    <fixVersion>6.1.1</fixVersion>
                    <fixVersion>6.2.0-rc3</fixVersion>
                    <fixVersion>6.3.0-rc0</fixVersion>
                                                        <votes>0</votes>
                                    <watches>12</watches>
                                                                                                                <comments>
                            <comment id="5037813" author="xgen-internal-githook" created="Wed, 7 Dec 2022 17:09:52 +0000"  >&lt;p&gt;Author:&lt;/p&gt;
{&apos;name&apos;: &apos;Suganthi Mani&apos;, &apos;email&apos;: &apos;suganthi.mani@mongodb.com&apos;, &apos;username&apos;: &apos;smani87&apos;}
&lt;p&gt;Message: &lt;a href=&quot;https://jira.mongodb.org/browse/SERVER-71683&quot; title=&quot;unbounded memory growth during tenant migrations&quot; class=&quot;issue-link&quot; data-issue-key=&quot;SERVER-71683&quot;&gt;&lt;del&gt;SERVER-71683&lt;/del&gt;&lt;/a&gt; Tenant collection cloner reads the next batch from socket buffer only after writing all the documents in the current batch to storage&lt;/p&gt;

&lt;p&gt;(cherry picked from commit 5fee6fff13b8a0b9f96f6bbe228afcd9514ac952)&lt;br/&gt;
Branch: v6.2&lt;br/&gt;
&lt;a href=&quot;https://github.com/mongodb/mongo/commit/6b55bbebb1b199a9e0dcfdb4611a7a2cb58ba3a5&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/mongodb/mongo/commit/6b55bbebb1b199a9e0dcfdb4611a7a2cb58ba3a5&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="5037812" author="xgen-internal-githook" created="Wed, 7 Dec 2022 17:09:47 +0000"  >&lt;p&gt;Author:&lt;/p&gt;
{&apos;name&apos;: &apos;Suganthi Mani&apos;, &apos;email&apos;: &apos;suganthi.mani@mongodb.com&apos;, &apos;username&apos;: &apos;smani87&apos;}
&lt;p&gt;Message: &lt;a href=&quot;https://jira.mongodb.org/browse/SERVER-71683&quot; title=&quot;unbounded memory growth during tenant migrations&quot; class=&quot;issue-link&quot; data-issue-key=&quot;SERVER-71683&quot;&gt;&lt;del&gt;SERVER-71683&lt;/del&gt;&lt;/a&gt; Tenant collection cloner reads the next batch from socket buffer only after writing all the documents in the current batch to storage&lt;/p&gt;

&lt;p&gt;(cherry picked from commit 5fee6fff13b8a0b9f96f6bbe228afcd9514ac952)&lt;br/&gt;
Branch: v6.1&lt;br/&gt;
&lt;a href=&quot;https://github.com/mongodb/mongo/commit/3b45a1f1e5f0f7ff5c7180b05f4c3ae050566789&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/mongodb/mongo/commit/3b45a1f1e5f0f7ff5c7180b05f4c3ae050566789&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="5036830" author="suganthi.mani" created="Wed, 7 Dec 2022 13:06:26 +0000"  >&lt;p&gt;Just for those who are watching this ticket, we considered the following 3 options&lt;/p&gt;

&lt;p&gt;&lt;b&gt;(We chose option#1 for the following reasons: 1) Simplicity 2) This is a problem only for the MTM protocol, and this protocol will be retired soon and will be replaced with split and merge.)&lt;/b&gt;&lt;/p&gt;

&lt;p&gt;Option#1: Tenant collection cloner reads the next batch from socket buffer to in-memory buffer only after inserting all the documents in the in-memory buffer to the collection, by running &lt;a href=&quot;https://github.com/10gen/mongo/blob/397f5973dce8b4ea065e829eae7d14054187f6bf/src/mongo/db/repl/tenant_collection_cloner.cpp#L518&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;insert docs&lt;/a&gt; in-line with &lt;a href=&quot;https://github.com/10gen/mongo/blob/397f5973dce8b4ea065e829eae7d14054187f6bf/src/mongo/db/repl/tenant_collection_cloner.cpp#L507&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;handleNextBatch()&lt;/a&gt;&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;&lt;b&gt;Notes&lt;/b&gt;:
	&lt;ol&gt;
		&lt;li&gt;With our current code base, irrespective of &lt;a href=&quot;https://github.com/10gen/mongo/blob/e717ef6798aad3e6ec65bee84921fc3961db4426/src/mongo/db/repl/repl_server_parameters.idl#L303&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;writer pool size&lt;/a&gt;, only one writer thread is responsible for inserting documents into the given tenant collection, it is mainly because of the &lt;a href=&quot;https://github.com/10gen/mongo/blob/397f5973dce8b4ea065e829eae7d14054187f6bf/src/mongo/db/repl/task_runner.cpp#L117-L126&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;limitation of task runner&lt;/a&gt; which is used to schedule the insert docs task. This isn&apos;t a conscious choice for tenant collection cloner. It&apos;s a carry-over from initial sync collection cloner code where insert docs for a given collection can&apos;t run parallel due to WT bulk insertion limitation.&lt;/li&gt;
		&lt;li&gt;Given the fact that we have a single writer thread, option#1 isn&apos;t a major design change to the tenant collection cloner.&lt;/li&gt;
		&lt;li&gt;Since each batch size can only be &amp;lt;= 16MB, the in-memory buffer can&apos;t grow unbounded with option#1. Even in the case of doc insertion being really slow, we expect the socket buffer (default size should be a few KBs or MBs) to overflow and in turn throttle the exhaust cursor on the donor side in generating the batches.&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;Option#2: Put explicit max size limit to the in-memory buffer - TenantCollectionCloner::_documentsToInsert&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;&lt;b&gt;Notes&lt;/b&gt;:
	&lt;ol&gt;
		&lt;li&gt;This fix aligns with any future efforts of &quot;improving the tenant collection cloner performance&quot; - parallelize insert doc tasks for a given tenant collection.&lt;/li&gt;
		&lt;li&gt;To avoid busy looping and unnecessary &lt;a href=&quot;https://github.com/10gen/mongo/blob/b4a99f8e20ecd06893d838fdbc9984322b66e0dd/src/mongo/db/repl/tenant_collection_cloner.cpp#L517&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;scheduling of tasks on task runner&lt;/a&gt;, we might also need to consider having the receiver thread block/wait when the in-memory buffer is full and unblock when space is available (+ wait interruption, especially due to merge abort).&lt;/li&gt;
		&lt;li&gt;There may be some performance gain for option#2 due to running insertion and receiving step ( i.e, reading (+ decompressing) next batch from socket buffer to in-memory buffer) in parallel.&lt;/li&gt;
	&lt;/ol&gt;
	&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;Option#3: Like &lt;a href=&quot;https://jira.mongodb.org/browse/SERVER-45037&quot; title=&quot;CollectionBulkLoader::insertDocuments()  should be called with the collection cloner mutex held to guarantee memory and thread safety properties.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;SERVER-45037&quot;&gt;&lt;del&gt;SERVER-45037&lt;/del&gt;&lt;/a&gt;, do storage writes with tenant collection cloner mutex lock held to effectively force the receive thread to run in lock-step with insertion&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;&lt;b&gt;Rejected&lt;/b&gt;: Doing storage writes with the mutex lock held is an anti-pattern (a mutex lock should be used only for short critical sections) and is prone to deadlocks (PM-3075)&lt;/li&gt;
&lt;/ul&gt;
</comment>
                            <comment id="5036557" author="xgen-internal-githook" created="Wed, 7 Dec 2022 11:04:53 +0000"  >&lt;p&gt;Author:&lt;/p&gt;
{&apos;name&apos;: &apos;Suganthi Mani&apos;, &apos;email&apos;: &apos;suganthi.mani@mongodb.com&apos;, &apos;username&apos;: &apos;smani87&apos;}
&lt;p&gt;Message: &lt;a href=&quot;https://jira.mongodb.org/browse/SERVER-71683&quot; title=&quot;unbounded memory growth during tenant migrations&quot; class=&quot;issue-link&quot; data-issue-key=&quot;SERVER-71683&quot;&gt;&lt;del&gt;SERVER-71683&lt;/del&gt;&lt;/a&gt; Tenant collection cloner reads the next batch from socket buffer only after writing all the documents in the current batch to storage&lt;br/&gt;
Branch: master&lt;br/&gt;
&lt;a href=&quot;https://github.com/mongodb/mongo/commit/5fee6fff13b8a0b9f96f6bbe228afcd9514ac952&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/mongodb/mongo/commit/5fee6fff13b8a0b9f96f6bbe228afcd9514ac952&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="5023362" author="suganthi.mani" created="Thu, 1 Dec 2022 19:43:06 +0000"  >&lt;p&gt;Reposting &lt;a href=&quot;https://jira.mongodb.org/secure/ViewProfile.jspa?name=milkie%40mongodb.com&quot; class=&quot;user-hover&quot; rel=&quot;milkie@mongodb.com&quot;&gt;milkie@mongodb.com&lt;/a&gt;&apos;s slack response on why this ticket is marked as P2-Critical&lt;/p&gt;
&lt;blockquote&gt;&lt;p&gt;It&apos;s hit in production twice now (one rather recently)&lt;br/&gt;
the recent one triggered some very long manual cleanup that took us 4 days to finally fix completely&lt;br/&gt;
and during that period, those tenants were all impaired&lt;br/&gt;
I marked it critical in the hopes we can get a fix into 6.2.0 if not 6.1.1&lt;/p&gt;&lt;/blockquote&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10420">
                    <name>Backports</name>
                                            <outwardlinks description="backported by">
                                                        </outwardlinks>
                                                        </issuelinktype>
                            <issuelinktype id="10012">
                    <name>Related</name>
                                            <outwardlinks description="related to">
                                                        </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="1046476">SERVER-45037</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="418674" name="0001-memory-leak.patch" size="1272" author="milkie@mongodb.com" created="Tue, 29 Nov 2022 21:36:49 +0000"/>
                            <attachment id="418673" name="t.js" size="3142" author="milkie@mongodb.com" created="Tue, 29 Nov 2022 21:37:53 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                <customfield id="customfield_10050" key="com.atlassian.jira.toolkit:comments">
                        <customfieldname># Replies</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>5.0</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_18555" key="com.onresolve.jira.groovy.groovyrunner:scripted-field">
                        <customfieldname># of Sprints</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1.0</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                <customfield id="customfield_12450" key="com.atlassian.jira.plugin.system.customfieldtypes:multicheckboxes">
                        <customfieldname>Backport Requested</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="25041"><![CDATA[v6.2]]></customfieldvalue>
    
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10011" key="com.atlassian.jira.plugin.system.customfieldtypes:radiobuttons">
                        <customfieldname>Backwards Compatibility</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10038"><![CDATA[Fully Compatible]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                            <customfield id="customfield_10055" key="com.atlassian.jira.ext.charting:firstresponsedate">
                        <customfieldname>Date of 1st Reply</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Thu, 1 Dec 2022 19:43:06 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10052" key="com.atlassian.jira.toolkit:dayslastcommented">
                        <customfieldname>Days since reply</customfieldname>
                        <customfieldvalues>
                                        1 year, 9 weeks ago
    
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_18254" key="com.onresolve.jira.groovy.groovyrunner:scripted-field">
                        <customfieldname>Dependencies</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue><![CDATA[]]></customfieldvalue>


                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_15850" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                        <customfield id="customfield_17050" key="com.atlassian.jira.plugin.system.customfieldtypes:radiobuttons">
                        <customfieldname>Downstream Team Attention</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="16941"><![CDATA[Not Needed]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                <customfield id="customfield_10057" key="com.atlassian.jira.toolkit:lastusercommented">
                        <customfieldname>Last comment by Customer</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>true</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10056" key="com.atlassian.jira.toolkit:lastupdaterorcommenter">
                        <customfieldname>Last commenter</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>luke.bonanomi@mongodb.com</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_11151" key="com.atlassian.jira.toolkit:LastCommentDate">
                        <customfieldname>Last public comment date</customfieldname>
                        <customfieldvalues>
                            1 year, 9 weeks ago
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                    <customfield id="customfield_10032" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Operating System</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10026"><![CDATA[ALL]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                <customfield id="customfield_10051" key="com.atlassian.jira.toolkit:participants">
                        <customfieldname>Participants</customfieldname>
                        <customfieldvalues>
                                        <customfieldvalue>milkie@mongodb.com</customfieldvalue>
            <customfieldvalue>xgen-internal-githook</customfieldvalue>
            <customfieldvalue>suganthi.mani@mongodb.com</customfieldvalue>
    
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                        <customfield id="customfield_14254" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Product Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i1klpj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                <customfield id="customfield_12550" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>2|i1371s:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10558" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_23361" key="com.onresolve.jira.groovy.groovyrunner:scripted-field">
                        <customfieldname>Requested By</customfieldname>
                        <customfieldvalues>
                                

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                            <customfield id="customfield_22250" key="com.atlassian.jira.plugin.system.customfieldtypes:radiobuttons">
                        <customfieldname>Special Downgrade Instructions Required</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="23343"><![CDATA[Not Needed]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10557" key="com.pyxis.greenhopper.jira:gh-sprint">
                        <customfieldname>Sprint</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue id="6759">Server Serverless 2022-12-12</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                        <customfield id="customfield_10750" key="com.atlassian.jira.plugin.system.customfieldtypes:textarea">
                        <customfieldname>Steps To Reproduce</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>&lt;p&gt;&lt;tt&gt;git am&lt;/tt&gt; the attached patch.&lt;br/&gt;
Then run the attached script via:&lt;br/&gt;
&lt;tt&gt;nohup buildscripts/resmoke.py run t.js&lt;/tt&gt;&lt;br/&gt;
Observe the memory growth (or even process OOM-killed)&lt;/p&gt;</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                    <customfield id="customfield_10053" key="com.atlassian.jira.ext.charting:timeinstatus">
                        <customfieldname>Time In Status</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_22870" key="com.onresolve.jira.groovy.groovyrunner:scripted-field">
                        <customfieldname>Triagers</customfieldname>
                        <customfieldvalues>
                                

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                    <customfield id="customfield_14350" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>serverRank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i1k7uv:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                    </customfields>
    </item>
</channel>
</rss>