<!-- 
RSS generated by JIRA (9.7.1#970001-sha1:2222b88b221c4928ef0de3161136cc90c8356a66) at Thu Feb 08 03:19:49 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>MongoDB Jira</title>
    <link>https://jira.mongodb.org</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.7.1</version>
        <build-number>970001</build-number>
        <build-date>13-04-2023</build-date>
    </build-info>


<item>
            <title>[SERVER-9250] using bucket to group small docs</title>
                <link>https://jira.mongodb.org/browse/SERVER-9250</link>
                <project id="10000" key="SERVER">Core Server</project>
                    <description>&lt;p&gt;We have a collection to store user activity, in terms of who did what at what time. The access to this collection will be user based, and fetch from latest to oldest.&lt;/p&gt;

&lt;p&gt;The doc looks like &lt;br/&gt;
{_id:ObjectId,&lt;br/&gt;
 userId:Number, &lt;br/&gt;
type:String, &lt;br/&gt;
objectid:String//foreign key to other collection on the actual action data, &lt;br/&gt;
timestamp:Date } &lt;/p&gt;

&lt;p&gt;We are considering grouping such tiny docs into an array within one doc, which turns it into:&lt;/p&gt;

&lt;p&gt;{&lt;br/&gt;
_id:ObjectId,&lt;br/&gt;
userId:Number,&lt;br/&gt;
pageNumber:Number,&lt;br/&gt;
activities:&lt;span class=&quot;error&quot;&gt;&amp;#91;Activity&amp;#93;&lt;/span&gt;//like above with only type, refObjectId,timestamp&lt;br/&gt;
}&lt;/p&gt;

&lt;p&gt;We thought this would give better storage: since the number of documents is reduced, the indexes should be smaller. When reading, the number of documents to seek is also reduced, so reads should be faster.&lt;/p&gt;

&lt;p&gt;However, when I ran a testing script through the shell to test this schema for reading and writing, I saw that with a larger bucket size the storage is actually higher and the insertion time is higher. The raw docs without bucketing give the highest insertion rate with a relative increase in storage size.&lt;/p&gt;

&lt;p&gt;So my question is: is it a good design to group documents together into buckets, especially large buckets of 100 &#8211; 500? And does the number of documents matter? Say we have 10 million users all using this product, each having 100 activities; the number of docs will go up to 1 billion.&lt;/p&gt;

&lt;p&gt;My testing script and results are below:&lt;/p&gt;

&lt;p&gt;Big = 500 per array&lt;br/&gt;
small = 5 per array&lt;br/&gt;
no bucket = each as a document.&lt;br/&gt;
total 10000 &quot;atomic doc&quot; inserted.&lt;/p&gt;

&lt;p&gt;&amp;gt; load(&apos;bucket.js&apos;)&lt;br/&gt;
big bucket size runtime: 20.368&lt;br/&gt;
storageSize with big bucket size  : 11182080&lt;br/&gt;
paddingFactor with big bucket  size: 1.0060000000002791&lt;br/&gt;
small bucket size runtime: 0.375&lt;br/&gt;
storageSize with small bucket size  : 2793472&lt;br/&gt;
paddingFactor with small bucket  size: 1.9980000000003342&lt;br/&gt;
no bucket size runtime: 0.303&lt;br/&gt;
storageSize with no bucket size  : 3055616&lt;br/&gt;
paddingFactor with no bucket  size: 1&lt;br/&gt;
1 big doc size 38.925&lt;br/&gt;
100 small doc size 41.5&lt;br/&gt;
500 raw doc size 50.5&lt;/p&gt;

&lt;p&gt;script,&lt;/p&gt;

&lt;p&gt;var d = db.getSisterDB(&quot;bucket_test&quot;);&lt;/p&gt;

&lt;p&gt;var total = 10000;&lt;br/&gt;
var f = function(usecase , col, bucketSize) {&lt;br/&gt;
    col.drop();&lt;br/&gt;
    col.ensureIndex(&lt;/p&gt;
{&apos;activities.type&apos;:1}
&lt;p&gt;);&lt;br/&gt;
    var count = total/bucketSize;&lt;br/&gt;
    var start = Date.now();&lt;br/&gt;
    for (var i=0; i &amp;lt; count; i++) {&lt;br/&gt;
        // Document created with only the _id field&lt;br/&gt;
        col.insert({_id:i});&lt;br/&gt;
        for(var j=0;j&amp;lt;bucketSize;j++){&lt;br/&gt;
            col.update({_id:i}, {$push : {&quot;activities&quot;: &lt;/p&gt;
{type:&apos;post&apos;,objectid:&apos;515bc88a8ce3b4718e6a1099&apos;,time:new Date}
&lt;p&gt;}}, true);&lt;br/&gt;
        }&lt;/p&gt;


&lt;p&gt;    }&lt;br/&gt;
    var t = (Date.now() - start)/1000;&lt;br/&gt;
    print(usecase +&quot; size runtime: &quot; + t);&lt;br/&gt;
    print(&quot;storageSize with &quot; + usecase +&quot; size  : &quot; + col.stats().storageSize);&lt;br/&gt;
    print(&quot;paddingFactor with &quot; + usecase+&quot;  size: &quot; + col.stats().paddingFactor);&lt;/p&gt;

&lt;p&gt;    return t;&lt;br/&gt;
}&lt;/p&gt;

&lt;p&gt;var insrt = function(usecase , col, count){&lt;br/&gt;
    col.drop();&lt;br/&gt;
    col.ensureIndex(&lt;/p&gt;
{&apos;activity.type&apos;:1}
&lt;p&gt;);&lt;br/&gt;
    var start = Date.now();&lt;br/&gt;
    for (var i=0; i &amp;lt; count; i++) {&lt;br/&gt;
        // Document created with only the _id field&lt;br/&gt;
        col.insert({_id:i,activity:{type:&apos;post&apos;,objectid:&apos;515bc88a8ce3b4718e6a1099&apos;,time:new Date}});&lt;/p&gt;


&lt;p&gt;    }&lt;br/&gt;
    var t = (Date.now() - start)/1000;&lt;br/&gt;
    print(usecase +&quot; size runtime: &quot; + t);&lt;br/&gt;
    print(&quot;storageSize with &quot; + usecase +&quot; size  : &quot; + col.stats().storageSize);&lt;br/&gt;
    print(&quot;paddingFactor with &quot; + usecase+&quot;  size: &quot; + col.stats().paddingFactor);&lt;/p&gt;

&lt;p&gt;    return t;&lt;br/&gt;
}&lt;br/&gt;
f(&apos;big bucket&apos;,d.big,500);&lt;br/&gt;
f(&apos;small bucket&apos;,d.small,5);&lt;br/&gt;
insrt(&apos;no bucket&apos;, d.no_bucket, total);&lt;/p&gt;

&lt;p&gt;var bigDoc = d.big.findOne();&lt;br/&gt;
print(&apos;1 big doc size &apos;+(Object.bsonsize(bigDoc)/1000));&lt;/p&gt;

&lt;p&gt;var smallDoc = d.small.findOne();&lt;br/&gt;
print(&apos;100 small doc size &apos;+(Object.bsonsize(smallDoc)*100/1000));&lt;/p&gt;


&lt;p&gt;var rawDoc = d.no_bucket.findOne();&lt;br/&gt;
print(&apos;500 raw doc size &apos;+(Object.bsonsize(rawDoc) * 500 / 1000));&lt;/p&gt;
</description>
                <environment>osx 10.8.3</environment>
        <key id="70809">SERVER-9250</key>
            <summary>using bucket to group small docs</summary>
                <type id="6" iconUrl="https://jira.mongodb.org/secure/viewavatar?size=xsmall&amp;avatarId=14720&amp;avatarType=issuetype">Question</type>
                                            <priority id="3" iconUrl="https://jira.mongodb.org/images/icons/priorities/major.svg">Major - P3</priority>
                        <status id="6" iconUrl="https://jira.mongodb.org/images/icons/statuses/closed.png" description="The issue is considered finished, the resolution is correct. Issues which are closed can be reopened.">Closed</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="4">Incomplete</resolution>
                                        <assignee username="stephen.steneker@mongodb.com">Stennie Steneker</assignee>
                                    <reporter username="stevenluan">hui luan</reporter>
                        <labels>
                    </labels>
                <created>Thu, 4 Apr 2013 22:35:38 +0000</created>
                <updated>Wed, 10 Dec 2014 23:18:21 +0000</updated>
                            <resolved>Wed, 10 Apr 2013 05:20:44 +0000</resolved>
                                    <version>2.2.3</version>
                                                    <component>Performance</component>
                                        <votes>0</votes>
                                    <watches>3</watches>
                                                                                                                <comments>
                            <comment id="309967" author="stennie" created="Wed, 10 Apr 2013 05:20:44 +0000"  >&lt;p&gt;Hi,&lt;/p&gt;

&lt;p&gt;The SERVER project is for reporting bugs or feature suggestions for the MongoDB server.&lt;/p&gt;

&lt;p&gt;For MongoDB-related support discussion you should post on the mongodb-users group (&lt;a href=&quot;http://groups.google.com/group/mongodb-user&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://groups.google.com/group/mongodb-user&lt;/a&gt;) or Stack Overflow / ServerFault.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
Stephen&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                <customfield id="customfield_10050" key="com.atlassian.jira.toolkit:comments">
                        <customfieldname># Replies</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1.0</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                <customfield id="customfield_10055" key="com.atlassian.jira.ext.charting:firstresponsedate">
                        <customfieldname>Date of 1st Reply</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Wed, 10 Apr 2013 05:20:44 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10052" key="com.atlassian.jira.toolkit:dayslastcommented">
                        <customfieldname>Days since reply</customfieldname>
                        <customfieldvalues>
                                        10 years, 45 weeks ago
    
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_18254" key="com.onresolve.jira.groovy.groovyrunner:scripted-field">
                        <customfieldname>Dependencies</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue><![CDATA[]]></customfieldvalue>


                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_15850" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            <customfield id="customfield_10057" key="com.atlassian.jira.toolkit:lastusercommented">
                        <customfieldname>Last comment by Customer</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>true</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10056" key="com.atlassian.jira.toolkit:lastupdaterorcommenter">
                        <customfieldname>Last commenter</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>ramon.fernandez@mongodb.com</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_11151" key="com.atlassian.jira.toolkit:LastCommentDate">
                        <customfieldname>Last public comment date</customfieldname>
                        <customfieldvalues>
                            10 years, 45 weeks ago
                        </customfieldvalues>
                    </customfield>
                                                                                                                        <customfield id="customfield_10000" key="com.atlassian.jira.plugin.system.customfieldtypes:radiobuttons">
                        <customfieldname>Old_Backport</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10000"><![CDATA[No]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                <customfield id="customfield_10051" key="com.atlassian.jira.toolkit:participants">
                        <customfieldname>Participants</customfieldname>
                        <customfieldvalues>
                                        <customfieldvalue>stevenluan</customfieldvalue>
            <customfieldvalue>stephen.steneker@mongodb.com</customfieldvalue>
    
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                        <customfield id="customfield_14254" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Product Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hrmybj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                <customfield id="customfield_12550" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>2|hrn3tb:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10558" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>48555</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_23361" key="com.onresolve.jira.groovy.groovyrunner:scripted-field">
                        <customfieldname>Requested By</customfieldname>
                        <customfieldvalues>
                                

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                <customfield id="customfield_10053" key="com.atlassian.jira.ext.charting:timeinstatus">
                        <customfieldname>Time In Status</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_22870" key="com.onresolve.jira.groovy.groovyrunner:scripted-field">
                        <customfieldname>Triagers</customfieldname>
                        <customfieldvalues>
                                

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                    <customfield id="customfield_14350" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>serverRank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|ht0367:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                    </customfields>
    </item>
</channel>
</rss>