<!-- 
RSS generated by JIRA (9.7.1#970001-sha1:2222b88b221c4928ef0de3161136cc90c8356a66) at Thu Feb 08 05:52:33 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>MongoDB Jira</title>
    <link>https://jira.mongodb.org</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.7.1</version>
        <build-number>970001</build-number>
        <build-date>13-04-2023</build-date>
    </build-info>


<item>
            <title>[SERVER-61490] transport_layer_test: asio connect race</title>
                <link>https://jira.mongodb.org/browse/SERVER-61490</link>
                <project id="10000" key="SERVER">Core Server</project>
                    <description>&lt;p&gt;&lt;a href=&quot;https://jira.mongodb.org/browse/SERVER-61016&quot; title=&quot;Swallow connection reset-related errors received during ASIOSession creation on outbound connection.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;SERVER-61016&quot;&gt;&lt;del&gt;SERVER-61016&lt;/del&gt;&lt;/a&gt; introduced a new egress connection test. The connect call can hang at a failpoint somehow on some hosts.&lt;/p&gt;

&lt;p&gt;This is difficult to reproduce in a spawnhost but EVG master builds seem to find it regularly.&lt;/p&gt;

&lt;p&gt;e.g. &lt;a href=&quot;https://evergreen.mongodb.com/task/mongodb_mongo_master_linux_64_debug_required_run_unittests_0d48c5f4d89b622af4efebee16b375378296a4d8_21_11_12_19_58_49&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://evergreen.mongodb.com/task/mongodb_mongo_master_linux_64_debug_required_run_unittests_0d48c5f4d89b622af4efebee16b375378296a4d8_21_11_12_19_58_49&lt;/a&gt;&lt;/p&gt;</description>
                <environment></environment>
        <key id="1925426">SERVER-61490</key>
            <summary>transport_layer_test: asio connect race</summary>
                <type id="1" iconUrl="https://jira.mongodb.org/secure/viewavatar?size=xsmall&amp;avatarId=14703&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.mongodb.org/images/icons/priorities/major.svg">Major - P3</priority>
                        <status id="6" iconUrl="https://jira.mongodb.org/images/icons/statuses/closed.png" description="The issue is considered finished, the resolution is correct. Issues which are closed can be reopened.">Closed</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="13201">Fixed</resolution>
                                        <assignee username="billy.donahue@mongodb.com">Billy Donahue</assignee>
                                    <reporter username="billy.donahue@mongodb.com">Billy Donahue</reporter>
                        <labels>
                    </labels>
                <created>Mon, 15 Nov 2021 16:25:04 +0000</created>
                <updated>Sun, 29 Oct 2023 21:46:01 +0000</updated>
                            <resolved>Fri, 19 Nov 2021 23:53:31 +0000</resolved>
                                                    <fixVersion>5.2.0</fixVersion>
                    <fixVersion>5.0.7</fixVersion>
                                                        <votes>0</votes>
                                    <watches>2</watches>
                                                                                                                <comments>
                            <comment id="4421832" author="xgen-internal-githook" created="Fri, 18 Mar 2022 21:12:53 +0000"  >&lt;p&gt;Author:&lt;/p&gt;
{&apos;name&apos;: &apos;Billy Donahue&apos;, &apos;email&apos;: &apos;billy.donahue@mongodb.com&apos;, &apos;username&apos;: &apos;BillyDonahue&apos;}
&lt;p&gt;Message: &lt;a href=&quot;https://jira.mongodb.org/browse/SERVER-61490&quot; title=&quot;transport_layer_test: asio connect race&quot; class=&quot;issue-link&quot; data-issue-key=&quot;SERVER-61490&quot;&gt;&lt;del&gt;SERVER-61490&lt;/del&gt;&lt;/a&gt; disable TCPFastOpen during egress test&lt;/p&gt;

&lt;p&gt;(cherry picked from commit 0cae914207ee666c9c3e1d87dd8492f2749b3bf7)&lt;br/&gt;
Branch: v5.0&lt;br/&gt;
&lt;a href=&quot;https://github.com/mongodb/mongo/commit/21a55485b4ce6bde1437a3638cc1385881a56abb&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/mongodb/mongo/commit/21a55485b4ce6bde1437a3638cc1385881a56abb&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="4201604" author="xgen-internal-githook" created="Fri, 19 Nov 2021 19:59:28 +0000"  >&lt;p&gt;Author:&lt;/p&gt;
{&apos;name&apos;: &apos;Billy Donahue&apos;, &apos;email&apos;: &apos;billy.donahue@mongodb.com&apos;, &apos;username&apos;: &apos;BillyDonahue&apos;}
&lt;p&gt;Message: &lt;a href=&quot;https://jira.mongodb.org/browse/SERVER-61490&quot; title=&quot;transport_layer_test: asio connect race&quot; class=&quot;issue-link&quot; data-issue-key=&quot;SERVER-61490&quot;&gt;&lt;del&gt;SERVER-61490&lt;/del&gt;&lt;/a&gt; disable TCPFastOpen during egress test&lt;br/&gt;
Branch: master&lt;br/&gt;
&lt;a href=&quot;https://github.com/mongodb/mongo/commit/0cae914207ee666c9c3e1d87dd8492f2749b3bf7&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/mongodb/mongo/commit/0cae914207ee666c9c3e1d87dd8492f2749b3bf7&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="4199406" author="billy.donahue" created="Thu, 18 Nov 2021 23:14:30 +0000"  >&lt;p&gt;Under TFO, the client doesn&apos;t send any packets at all to the server until it has data to write.&lt;br/&gt;
This is conditionally controlled by whether the client has a cookie for that server yet or not, which is something the TCP implementation determines on its own, which is why it appears to be so flaky and unreproducible. So in that case there&apos;s nothing to wake up the server, and the test hangs. Disable TFO for this test.&lt;/p&gt;</comment>
                            <comment id="4199404" author="billy.donahue" created="Thu, 18 Nov 2021 23:13:34 +0000"  >&lt;p&gt;Code Review: &lt;a href=&quot;https://github.com/10gen/mongo/pull/1984&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/10gen/mongo/pull/1984&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="4193532" author="billy.donahue" created="Tue, 16 Nov 2021 22:47:50 +0000"  >&lt;p&gt;A-friggin-HA!  I&apos;ll bet it&apos;s related to the TCP_FASTOPEN_CONNECT support and an option that transport_layer_asio sets on the egress sockets. Under TFO, a connect call might be considered complete once the SYN is sent (it doesn&apos;t wait for a response from the peer). This would explain why it never happens on Mac, Windows, or some Linux versions. It also perhaps partially explains why other TransportLayerASIO tests don&apos;t run into this problem: they are all creating vanilla client sockets that don&apos;t have the TCP_FASTOPEN_CONNECT option set.&lt;/p&gt;

&lt;p&gt;The behavior of whether connect waits or not is dynamically determined, which explains perhaps why Sam&apos;s reboot caused his machine to stop reproducing the hang. The dynamics involve whether the client has an unexpired TFO &quot;cookie&quot; for the peer&apos;s IP address. If it has a cookie, it can send that along and optimize the connect handshake. If not, it has to wait for an ACK.&lt;/p&gt;

&lt;p&gt;Still working out the details... But this must be related to what&apos;s going on somehow.&lt;/p&gt;

&lt;p&gt;If the connect returns early, ASIOSession&apos;s ctor will hit the failpoint and block. The accept callback never happens, because accept only returns when the TCP state machine is ESTABLISHED, which means the client responded to SYN,ACK with an ACK. This never happens, because the client side doesn&apos;t react to the SYN,ACK for some reason. I don&apos;t know why that would be, but maybe it&apos;s related to the connecting thread being stuck at a failpoint, but this doesn&apos;t quite make sense to me as I thought this negotiation is not handled in user space. Gotta dig into that.&lt;/p&gt;</comment>
                            <comment id="4190749" author="billy.donahue" created="Tue, 16 Nov 2021 00:17:11 +0000"  >&lt;p&gt;Bad news! Using an &lt;tt&gt;io_context::post&lt;/tt&gt; to wait for the context to be up and running before trying to connect seems to have not worked. This test still hangs.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://spruce.mongodb.com/task/mongodb_mongo_master_linux_64_debug_required_run_unittests_patch_d9fcd9f124ece9ab0b3a3c46cb6d7052b7282dd2_6192ec9a3e8e8661000b5dbf_21_11_15_23_27_06/tests?execution=0&amp;amp;sortBy=STATUS&amp;amp;sortDir=ASC&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://spruce.mongodb.com/task/mongodb_mongo_master_linux_64_debug_required_run_unittests_patch_d9fcd9f124ece9ab0b3a3c46cb6d7052b7282dd2_6192ec9a3e8e8661000b5dbf_21_11_15_23_27_06/tests?execution=0&amp;amp;sortBy=STATUS&amp;amp;sortDir=ASC&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Need a new hypothesis here!&lt;/p&gt;</comment>
                            <comment id="4190572" author="billy.donahue" created="Mon, 15 Nov 2021 22:59:34 +0000"  >&lt;p&gt;Update on this.&lt;/p&gt;

&lt;p&gt;I paired up with &lt;a href=&quot;https://jira.mongodb.org/secure/ViewProfile.jspa?name=amirsaman.memaripour&quot; class=&quot;user-hover&quot; rel=&quot;amirsaman.memaripour&quot;&gt;amirsaman.memaripour&lt;/a&gt; for a couple of hours. We&apos;re both using the same EXACT libc, kernel, and transport_layer static executable on our respective virtual workstations. His machine consistently shows the hang, and mine does not. We checked all ldd -dependencies with shasum to verify the matches. He shipped me his transport_layer static executable over GDrive.&lt;/p&gt;

&lt;p&gt;During the hang, Sam could run a netstat and see that the connection was in the SYN_SENT TCP state, which means the connect started but was unacknowledged by the server (which is local to the test and running on its own completely separate io_context and thread). The server&apos;s async_accept never happens.&lt;/p&gt;

&lt;p&gt;After rebooting his workstation, he was unable to repro!&lt;/p&gt;

&lt;p&gt;So the first thing I saw (above in the description) was that a spawnhost equivalent to a failing buildhost could not repro even when using the same executable as the failing EVG task. That&apos;s strange. But it&apos;s a different machine. But now it looks like even the same machine after a reboot (Sam&apos;s virtual workstation) can&apos;t repro. We really go from consistent failure to consistent success. It&apos;s one of the weirdest things ever.&lt;/p&gt;

&lt;p&gt;Clearly there&apos;s some race in the timing of this test&apos;s tla.connect call and the acceptor&apos;s asio socket&apos;s async_accept. Maybe the connect is happening &quot;too early&quot; and ASIO misses it somehow.&lt;br/&gt;
We came up with 2 experiments to try:&lt;/p&gt;

&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;In the test, post a task to the io_context and wait for it to finish. Do this before trying to connect, so we know the async_accept is queued up and processing events.&lt;/li&gt;
	&lt;li&gt;Try rewriting the acceptor in POSIX socket API instead of ASIO. This is more work (supporting Windows, etc) but it might answer the question of whether the problem is ASIO or libc/Linux.&lt;/li&gt;
&lt;/ul&gt;
</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10420">
                    <name>Backports</name>
                                            <outwardlinks description="backported by">
                                                        </outwardlinks>
                                                        </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Depends</name>
                                                                <inwardlinks description="is depended on by">
                                                        </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10520">
                    <name>Problem/Incident</name>
                                                                <inwardlinks description="is caused by">
                                        <issuelink>
            <issuekey id="1910052">SERVER-61016</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                <customfield id="customfield_10050" key="com.atlassian.jira.toolkit:comments">
                        <customfieldname># Replies</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>7.0</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_18555" key="com.onresolve.jira.groovy.groovyrunner:scripted-field">
                        <customfieldname># of Sprints</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1.0</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                <customfield id="customfield_12450" key="com.atlassian.jira.plugin.system.customfieldtypes:multicheckboxes">
                        <customfieldname>Backport Requested</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="21777"><![CDATA[v5.0]]></customfieldvalue>
    
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10011" key="com.atlassian.jira.plugin.system.customfieldtypes:radiobuttons">
                        <customfieldname>Backwards Compatibility</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10038"><![CDATA[Fully Compatible]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                            <customfield id="customfield_10055" key="com.atlassian.jira.ext.charting:firstresponsedate">
                        <customfieldname>Date of 1st Reply</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Fri, 19 Nov 2021 19:59:28 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10052" key="com.atlassian.jira.toolkit:dayslastcommented">
                        <customfieldname>Days since reply</customfieldname>
                        <customfieldvalues>
                                        1 year, 46 weeks, 5 days ago
    
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_18254" key="com.onresolve.jira.groovy.groovyrunner:scripted-field">
                        <customfieldname>Dependencies</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue><![CDATA[]]></customfieldvalue>


                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_15850" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                        <customfield id="customfield_17050" key="com.atlassian.jira.plugin.system.customfieldtypes:radiobuttons">
                        <customfieldname>Downstream Team Attention</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="16941"><![CDATA[Not Needed]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                <customfield id="customfield_10057" key="com.atlassian.jira.toolkit:lastusercommented">
                        <customfieldname>Last comment by Customer</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>true</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10056" key="com.atlassian.jira.toolkit:lastupdaterorcommenter">
                        <customfieldname>Last commenter</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>luke.bonanomi@mongodb.com</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_11151" key="com.atlassian.jira.toolkit:LastCommentDate">
                        <customfieldname>Last public comment date</customfieldname>
                        <customfieldvalues>
                            1 year, 46 weeks, 5 days ago
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_16465" key="com.onresolve.jira.groovy.groovyrunner:scripted-field">
                        <customfieldname>Linked BF Score</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>162.0</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                        <customfield id="customfield_10032" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Operating System</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10026"><![CDATA[ALL]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                <customfield id="customfield_10051" key="com.atlassian.jira.toolkit:participants">
                        <customfieldname>Participants</customfieldname>
                        <customfieldvalues>
                                        <customfieldvalue>billy.donahue@mongodb.com</customfieldvalue>
            <customfieldvalue>xgen-internal-githook</customfieldvalue>
    
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                        <customfield id="customfield_14254" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Product Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i0acvj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                <customfield id="customfield_12550" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>2|hztqxz:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10558" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_23361" key="com.onresolve.jira.groovy.groovyrunner:scripted-field">
                        <customfieldname>Requested By</customfieldname>
                        <customfieldvalues>
                                

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                        <customfield id="customfield_10557" key="com.pyxis.greenhopper.jira:gh-sprint">
                        <customfieldname>Sprint</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue id="5299">Service Arch 2021-11-22</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                        <customfield id="customfield_10750" key="com.atlassian.jira.plugin.system.customfieldtypes:textarea">
                        <customfieldname>Steps To Reproduce</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>&lt;p&gt;seems to happen on Evergreen RHEL8.0. Not reliably repro&apos;d though.&lt;/p&gt;</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                    <customfield id="customfield_10053" key="com.atlassian.jira.ext.charting:timeinstatus">
                        <customfieldname>Time In Status</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_22870" key="com.onresolve.jira.groovy.groovyrunner:scripted-field">
                        <customfieldname>Triagers</customfieldname>
                        <customfieldvalues>
                                

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                    <customfield id="customfield_14350" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>serverRank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i09z0v:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                    </customfields>
    </item>
</channel>
</rss>