<!-- 
RSS generated by JIRA (9.7.1#970001-sha1:2222b88b221c4928ef0de3161136cc90c8356a66) at Thu Feb 08 03:07:28 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>MongoDB Jira</title>
    <link>https://jira.mongodb.org</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.7.1</version>
        <build-number>970001</build-number>
        <build-date>13-04-2023</build-date>
    </build-info>


<item>
            <title>[SERVER-4960] Optimize shard selection on $in queries</title>
                <link>https://jira.mongodb.org/browse/SERVER-4960</link>
                <project id="10000" key="SERVER">Core Server</project>
                    <description>&lt;p&gt;Selecting the appropriate shard to send a query to takes a long time on $in queries.  &lt;a href=&quot;https://jira.mongodb.org/browse/SERVER-4745&quot; title=&quot;Figuring out which shard to send a query to takes a long time when doing large $in queries on the shard key&quot; class=&quot;issue-link&quot; data-issue-key=&quot;SERVER-4745&quot;&gt;&lt;del&gt;SERVER-4745&lt;/del&gt;&lt;/a&gt; mitigates the effect of this, but this code could use rewriting and further optimization.&lt;/p&gt;</description>
                <environment></environment>
        <key id="30530">SERVER-4960</key>
            <summary>Optimize shard selection on $in queries</summary>
                <type id="4" iconUrl="https://jira.mongodb.org/secure/viewavatar?size=xsmall&amp;avatarId=14710&amp;avatarType=issuetype">Improvement</type>
                                            <priority id="3" iconUrl="https://jira.mongodb.org/images/icons/priorities/major.svg">Major - P3</priority>
                        <status id="10038" iconUrl="https://jira.mongodb.org/images/icons/subtask.gif" description="">Backlog</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="backlog-query-execution">Backlog - Query Execution</assignee>
                                    <reporter username="spencer@mongodb.com">Spencer Brody</reporter>
                        <labels>
                    </labels>
                <created>Mon, 13 Feb 2012 19:56:30 +0000</created>
                <updated>Tue, 6 Dec 2022 05:36:24 +0000</updated>
                                                                            <component>Querying</component>
                    <component>Sharding</component>
                                        <votes>5</votes>
                                    <watches>24</watches>
                                                                                                                <comments>
                            <comment id="311387" author="spencer" created="Thu, 11 Apr 2013 19:26:34 +0000"  >&lt;p&gt;I&apos;m moving the part of this ticket about re-writing the query sent to the shards to &lt;a href=&quot;https://jira.mongodb.org/browse/SERVER-9332&quot; title=&quot;Mongos should re-write $in queries on the shard key before sending them to the shards so that the shards only receive the relevant portion of the $in&quot; class=&quot;issue-link&quot; data-issue-key=&quot;SERVER-9332&quot;&gt;&lt;del&gt;SERVER-9332&lt;/del&gt;&lt;/a&gt;, and leaving this one to focus on the performance optimizations that can be done here.&lt;/p&gt;</comment>
                            <comment id="303644" author="antoine" created="Mon, 1 Apr 2013 22:30:07 +0000"  >&lt;p&gt;In a use case, the $in has 50 items so most likely the targeted shards will almost always be all shards.&lt;br/&gt;
A good optimization would be if mongos would rewrite the $in list to only include values that may be on a shard.&lt;br/&gt;
Mongos needs to lookup that value in the chunk list anyway in order to decide which shards to use, so it could create $in lists at the same time.&lt;/p&gt;

&lt;p&gt;This may create race condition issues in the context of migration though.&lt;br/&gt;
Instead, mongos could send the full $in to all shards, but then mongod checks the values against the chunk ranges first (much smaller than btree).&lt;/p&gt;</comment>
                            <comment id="120273" author="blee" created="Wed, 16 May 2012 22:16:19 +0000"  >&lt;p&gt;Is it possible to hint a query as an all-shards query?  For large $in queries it seems like most of the effort in getShardsForQuery is wasted anyways&lt;/p&gt;</comment>
                            <comment id="114883" author="spencer" created="Mon, 30 Apr 2012 16:30:55 +0000"  >&lt;p&gt;@Ben, my timing didn&apos;t get that fine-grained.  I just verified that a large $in query (with no actual data) took longer on mongos than the sum of the mongods by enough of a margin to seem more than the expected overhead - most likely due to the shard selection code.&lt;/p&gt;</comment>
                            <comment id="114610" author="blee" created="Sat, 28 Apr 2012 21:13:33 +0000"  >&lt;p&gt;Spencer, did you say you were able to reproduce a slow FieldRange constructor with a large $in query?  From my timing blocks, it seemed like inserting into:&lt;/p&gt;

&lt;p&gt;set&amp;lt;BSONElement,element_lt&amp;gt; vals;&lt;/p&gt;

&lt;p&gt;was going slowly (Not sure why).&lt;/p&gt;
</comment>
                            <comment id="87997" author="aaron" created="Mon, 13 Feb 2012 20:19:51 +0000"  >&lt;p&gt;Thanks Spencer,&lt;/p&gt;

&lt;p&gt;Just in case this ticket gets picked up in the future here&apos;s some info from one more email:&lt;/p&gt;

&lt;p&gt;It&apos;s worth keeping an eye on how performance of the overall operation changes in the new implementation when there is high C and low R.  There are various optimizations that can be done (with additional work); it would also be possible to use a hybrid of the current and proposed fancy implementations, or to choose just one of them selectively based on values of C and R.&lt;/p&gt;



&lt;p&gt;Also...&lt;/p&gt;

&lt;p&gt;One other note, that stuff about # chunks above is really # chunk ranges.  I&apos;ve been assuming that in the worst case we might have really fragmented chunks on all shards but one, and as a result # chunk ranges ~ # chunks but there is still a good chance of a query not hitting all shards.  But I don&apos;t know if this is actually right.&lt;/p&gt;</comment>
                            <comment id="87989" author="spencer" created="Mon, 13 Feb 2012 20:00:07 +0000"  >&lt;p&gt;Some notes about how to do this taken from an email with Aaron:&lt;/p&gt;

&lt;p&gt;The btree scanning code uses FieldRangeVectorIterator to traverse&lt;br/&gt;
btrees.  However the interface of FieldRangeVectorIterator doesn&apos;t&lt;br/&gt;
make it explicit whether an object matches a FieldRangeVector (ordered&lt;br/&gt;
list of valid field ranges, in your case &amp;lt; a &#8712; {1,3,...}, b &#8712; {1,3,...} &amp;gt;).  Also the FieldRangeVectorIterator interface is&lt;br/&gt;
optimized to prevent building BSONObjs - it is a special purpose class&lt;br/&gt;
right now and is probably not particularly easy to use for other&lt;br/&gt;
things (and I think the original version is about 1.5 years old).  I&apos;d&lt;br/&gt;
recommend that we add another class that can be used for&lt;br/&gt;
iterating through a list of ranges (in this case shard ranges) that&lt;br/&gt;
delegates most of its functionality to FieldRangeVectorIterator (and&lt;br/&gt;
modify FieldRangeVectorIterator a bit if necessary).&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;...&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;I think a better option might be to ignore FieldRangeVectorIterator and write a function similar to FieldRangeVector::matches(), something like FieldRangeVector::matchesBetween( start, end ).  High level you will basically want to check FieldRangeVector::matchingLowElement for all the field ranges of the start and end keys and then see if any matches are possible between these low element indexes.  And also be sure to keep in mind inclusivity of the interval bounds.&lt;/p&gt;

&lt;p&gt;Basically, the way this works is that if you have a query like {a:{$in:&lt;span class=&quot;error&quot;&gt;&amp;#91;1,2&amp;#93;&lt;/span&gt;},b:{$in:&lt;span class=&quot;error&quot;&gt;&amp;#91;3,4&amp;#93;&lt;/span&gt;}} and index &lt;/p&gt;
{a:1,b:1}
&lt;p&gt; your FieldRangeVector will keep a structure like &lt;/p&gt;
{ a: [ [1,1], [2,2] ], b: [ [3,3], [4,4] ] }
&lt;p&gt;.  And matchingLowElement for a will do a binary search on bounds of field a, so a binary search on [ 1, 1, 2, 2 ] will return the index of the leftmost value below a match I think.  So if your query element is 0 you will get -1, if 1 you will get 0, if 1.5 you will get 1, if 2 you will get 2, etc.  The parity of the index will tell you if the element matches (see FieldRangeVector::matchesElement()).  (This is my recollection of how it works at least.)&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;...&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;So if C = # chunks and R = combinatorial # of valid key ranges that need to be scanned, the current implementation is O(R*log(C)+C).  I&apos;m thinking that we might flip things around and do an implementation that is O(C*log(R)) since I&apos;m guessing that in the problematic cases R &amp;gt;&amp;gt; C.  (Even if R is not that much bigger, the implementation will save us from precomputing the different ranges which is the bulk of the time in the case you are looking at now.  But in cases with high R it will help a lot.  Btw, I don&apos;t really know what real world values of C might be so feel free to correct me if this sounds wrong.  I can definitely see R growing faster than C more easily - we have a hard cap on R right now looks like 1 or 4 million (someone seems to have added inconsistent caps in different places, we should fix that), but we could probably allow a lot higher.)&lt;/p&gt;

&lt;p&gt;One way we could do this would be to iterate through the chunks, and for each chunk we do a binary search through the combinatorial space of valid field ranges looking for the start and end of the chunk.  And we see if there are any valid field ranges between the start and end.  We can do this binary search directly with a FieldRangeVector (after adding on some functionality), often without exploring large portions of the set product space.&lt;/p&gt;

&lt;p&gt;So simple example if I&apos;m matching values in &lt;/p&gt;
{ 1, 3, 5, 7 } and my chunk is [2,2.5) I do a binary search for 2 and a binary search for 2.5 and see they&apos;re in the same spot and match nothing.  If my chunk is [4,6) I do the same thing and can see that my binary search sends me to different positions meaning that there is a match between them.&lt;br/&gt;
&lt;br/&gt;
Similarly for a compound key if I&apos;m matching values in { 1, 3, 5, 7 }
&lt;p&gt; x &lt;/p&gt;
{ 1, 3, 5, 7 }
&lt;p&gt; == &lt;/p&gt;
{ &amp;lt;1,1&amp;gt;, &amp;lt;1,3&amp;gt;, ... }
&lt;p&gt; and my chunk is [&amp;lt;1,8&amp;gt;,&amp;lt;2,2&amp;gt;) the start and end key are in the same region of the set product range space and not within a valid range, but if my chunk is [&amp;lt;1,8&amp;gt;,&amp;lt;3,2&amp;gt;) the start and end bounds aren&apos;t in the same region so there must be a region between them with matches.  And we can determine this without generating or exploring the whole set product.&lt;/p&gt;

&lt;p&gt;Another important optimization that would make sense to do here would be that we should only do this binary search and check the ranges for a chunk if that chunk&apos;s shard is not already in the set of added shards.&lt;/p&gt;

&lt;p&gt;There is an alternative implementation where we would iterate through the combinatorial space of matching ranges in parallel with the iteration through chunks.  And we would basically be improving on binary search (in an amortized sense) to find where in the space of field bounds the start and end keys of the chunks lie because we are iterating through the chunks in order and can exclude regions of the binary search space below the previous searched value.  This is somewhat related to what the FieldRangeVectorIterator does.  But I think it would be more work to implement.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Depends</name>
                                                                <inwardlinks description="is depended on by">
                                                        </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="70415">SERVER-9205</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10012">
                    <name>Related</name>
                                            <outwardlinks description="related to">
                                        <issuelink>
            <issuekey id="28879">SERVER-4745</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="27240">SERVER-4555</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="71539">SERVER-9332</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="1385949">SERVER-50299</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                <customfield id="customfield_10050" key="com.atlassian.jira.toolkit:comments">
                        <customfieldname># Replies</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>7.0</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                <customfield id="customfield_12751" key="com.atlassian.jira.plugin.system.customfieldtypes:multiselect">
                        <customfieldname>Assigned Teams</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="25125"><![CDATA[Query Execution]]></customfieldvalue>
    
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                            <customfield id="customfield_13552" key="com.go2group.jira.plugin.crm:crm_generic_field">
                        <customfieldname>Case</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue><![CDATA[[5002K00000r2vBQQAY]]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                            <customfield id="customfield_10055" key="com.atlassian.jira.ext.charting:firstresponsedate">
                        <customfieldname>Date of 1st Reply</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Mon, 13 Feb 2012 20:19:51 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10052" key="com.atlassian.jira.toolkit:dayslastcommented">
                        <customfieldname>Days since reply</customfieldname>
                        <customfieldvalues>
                                        10 years, 44 weeks, 6 days ago
    
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_18254" key="com.onresolve.jira.groovy.groovyrunner:scripted-field">
                        <customfieldname>Dependencies</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue><![CDATA[]]></customfieldvalue>


                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_15850" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10057" key="com.atlassian.jira.toolkit:lastusercommented">
                        <customfieldname>Last comment by Customer</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>true</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10056" key="com.atlassian.jira.toolkit:lastupdaterorcommenter">
                        <customfieldname>Last commenter</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>alexander.golin@mongodb.com</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_11151" key="com.atlassian.jira.toolkit:LastCommentDate">
                        <customfieldname>Last public comment date</customfieldname>
                        <customfieldvalues>
                            10 years, 44 weeks, 6 days ago
                        </customfieldvalues>
                    </customfield>
                                                                                                                        <customfield id="customfield_10000" key="com.atlassian.jira.plugin.system.customfieldtypes:radiobuttons">
                        <customfieldname>Old_Backport</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10000"><![CDATA[No]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                <customfield id="customfield_10051" key="com.atlassian.jira.toolkit:participants">
                        <customfieldname>Participants</customfieldname>
                        <customfieldvalues>
                                        <customfieldvalue>aaron</customfieldvalue>
            <customfieldvalue>antoine</customfieldvalue>
            <customfieldvalue>backlog-query-execution</customfieldvalue>
            <customfieldvalue>blee</customfieldvalue>
            <customfieldvalue>spencer@mongodb.com</customfieldvalue>
    
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                        <customfield id="customfield_14254" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Product Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hrodgv:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                <customfield id="customfield_12550" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>2|hr28wf:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10558" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>3902</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            <customfield id="customfield_22870" key="com.onresolve.jira.groovy.groovyrunner:scripted-field">
                        <customfieldname>Triagers</customfieldname>
                        <customfieldvalues>
                                

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                    <customfield id="customfield_14350" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>serverRank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|ht0pdb:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                    </customfields>
    </item>
</channel>
</rss>