<!-- 
RSS generated by JIRA (9.7.1#970001-sha1:2222b88b221c4928ef0de3161136cc90c8356a66) at Thu Feb 08 09:05:34 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>MongoDB Jira</title>
    <link>https://jira.mongodb.org</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.7.1</version>
        <build-number>970001</build-number>
        <build-date>13-04-2023</build-date>
    </build-info>


<item>
            <title>[KAFKA-92] Kafka connector - Number of sink tasks</title>
                <link>https://jira.mongodb.org/browse/KAFKA-92</link>
                <project id="16285" key="KAFKA">Kafka Connector</project>
                    <description>&lt;div class=&quot;panel&quot; style=&quot;background-color: #c2d2c2;border-color: #cccccc;border-style: dashed;border-width: 1px;&quot;&gt;&lt;div class=&quot;panelHeader&quot; style=&quot;border-bottom-width: 1px;border-bottom-style: dashed;border-bottom-color: #cccccc;background-color: #239eb0;&quot;&gt;&lt;b&gt;Epic Summary&lt;/b&gt;&lt;/div&gt;&lt;div class=&quot;panelContent&quot; style=&quot;background-color: #c2d2c2;&quot;&gt;
&lt;h4&gt;&lt;a name=&quot;Summary&quot;&gt;&lt;/a&gt;Summary&lt;/h4&gt;
&lt;p&gt;The Kafka sink connector only ever supports a single task. Users should be able to use the &lt;tt&gt;tasks.max&lt;/tt&gt; setting to increase parallelism with the connector:&lt;/p&gt;

&lt;blockquote&gt;&lt;p&gt;&lt;tt&gt;tasks.max&lt;/tt&gt; - The maximum number of tasks that should be created for this connector. The connector may create fewer tasks if it cannot achieve this level of parallelism.&#160;&lt;/p&gt;&lt;/blockquote&gt;

&lt;hr /&gt;
&lt;p&gt;Was:&lt;/p&gt;

&lt;p&gt;I am testing this MongoDB sink connector to migrate large datasets (multi-TB) from one mongodb cluster to mongodb another. A challenge I am facing is throughput on the sink side. Irrespective of task.max parameter, only one sink task is created. Then I found below 2 tickets related to limiting the number of sink tasks to only 1. In this case, how do I improve throughput on the sink side? Just curious to know why number of tasks should limit to one? Were there any plans to improve this?&#160;&lt;br/&gt;
&lt;a href=&quot;https://jira.mongodb.org/browse/KAFKA-62&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;https://jira.mongodb.org/browse/KAFKA-62&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://jira.mongodb.org/browse/KAFKA-46&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;https://jira.mongodb.org/browse/KAFKA-46&lt;/a&gt;&lt;/p&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>AWS EC2, Sharded cluster</environment>
        <key id="1285529">KAFKA-92</key>
            <summary>Kafka connector - Number of sink tasks</summary>
                <type id="11" iconUrl="https://jira.mongodb.org/secure/viewavatar?size=xsmall&amp;avatarId=14707&amp;avatarType=issuetype">Epic</type>
                                            <priority id="3" iconUrl="https://jira.mongodb.org/images/icons/priorities/major.svg">Major - P3</priority>
                        <status id="6" iconUrl="https://jira.mongodb.org/images/icons/statuses/closed.png" description="The issue is considered finished, the resolution is correct. Issues which are closed can be reopened.">Closed</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="13201">Fixed</resolution>
                                        <assignee username="ross@mongodb.com">Ross Lawley</assignee>
                                    <reporter username="yaramati@adobe.com">Rajaramesh Yaramati</reporter>
                        <labels>
                    </labels>
                <created>Sun, 22 Mar 2020 15:04:42 +0000</created>
                <updated>Sat, 28 Oct 2023 10:46:25 +0000</updated>
                            <resolved>Tue, 19 May 2020 07:57:43 +0000</resolved>
                                    <version>1.0</version>
                                    <fixVersion>1.2.0</fixVersion>
                                    <component>Sink</component>
                                        <votes>1</votes>
                                    <watches>7</watches>
                                                                                                                <comments>
                            <comment id="3094454" author="xgen-internal-githook" created="Tue, 19 May 2020 07:57:18 +0000"  >&lt;p&gt;Author:&lt;/p&gt;
{&apos;name&apos;: &apos;Ross Lawley&apos;, &apos;email&apos;: &apos;ross.lawley@gmail.com&apos;, &apos;username&apos;: &apos;rozza&apos;}
&lt;p&gt;Message: Allow the Sink connector to use multiple tasks&lt;/p&gt;

&lt;p&gt;Kafka Connect provides built-in support for parallelism and scalable data copying&lt;br/&gt;
by assigning topic partitions to tasks. This allows for parallelism at the cost of&lt;br/&gt;
sequentially processing the data.&lt;/p&gt;

&lt;p&gt;When using multiple tasks, data will be processed out of order to the order of&lt;br/&gt;
the topic. Each task will be assigned partitions from a topic and these will be&lt;br/&gt;
processed independently of the other partitions.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://jira.mongodb.org/browse/KAFKA-92&quot; title=&quot;Kafka connector - Number of sink tasks&quot; class=&quot;issue-link&quot; data-issue-key=&quot;KAFKA-92&quot;&gt;&lt;del&gt;KAFKA-92&lt;/del&gt;&lt;/a&gt;&lt;br/&gt;
Branch: master&lt;br/&gt;
&lt;a href=&quot;https://github.com/mongodb/mongo-kafka/commit/32f5458946d976d63d26be7a0f515be176c2cb14&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/mongodb/mongo-kafka/commit/32f5458946d976d63d26be7a0f515be176c2cb14&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="3081324" author="martin.andersson@kambi.com" created="Tue, 12 May 2020 17:28:44 +0000"  >&lt;blockquote&gt;&lt;p&gt;The only complication is that tasks will process the messages as they see them and this can lead to out of order processing of data from a topic when using multiple tasks. That could be an issue depending on the users dataflow.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;As long as there is only one task processing for each kafka partition (and tasks flush their data on a rebalance) then it seems to me that the connector would hold the same ordering guarantees as kafka (e.g. ordering is only guaranteed by topic-partition). As a user of this connector I wouldn&apos;t expect anything more.&lt;/p&gt;</comment>
                            <comment id="3080304" author="ross@10gen.com" created="Tue, 12 May 2020 10:23:44 +0000"  >&lt;p&gt;PR: &lt;a href=&quot;https://github.com/rozza/mongo-kafka/pull/17&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/rozza/mongo-kafka/pull/17&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;For the sink we can give the control of processing the messages from a topic over to Kafka.  So multiple tasks will be assigned messages based on their topic - partition automatically.&lt;/p&gt;

&lt;p&gt;The only complication is that tasks will process the messages as they see them and this can lead to out of order processing of data from a topic when using multiple tasks.  That could be an issue depending on the users dataflow.&lt;/p&gt;</comment>
                            <comment id="3062729" author="alexey.menshikov" created="Thu, 30 Apr 2020 15:47:39 +0000"  >&lt;p&gt;The customer is trying to reach 10k insert rate for 2kb documents, but seems like that&apos;s not possible using just one sink connector. The database better handles multiple inserting threads than just one. The ability to increase the level of parallelism should significantly increase the insertion rate.&lt;/p&gt;</comment>
                            <comment id="3020789" author="ross@10gen.com" created="Wed, 1 Apr 2020 13:06:29 +0000"  >&lt;p&gt;Hi &lt;a href=&quot;https://jira.mongodb.org/secure/ViewProfile.jspa?name=yaramati%40adobe.com&quot; class=&quot;user-hover&quot; rel=&quot;yaramati@adobe.com&quot;&gt;yaramati@adobe.com&lt;/a&gt;,&lt;/p&gt;

&lt;p&gt;I&apos;m glad you were able to work around the issue and improve the parallelism,&lt;/p&gt;

&lt;blockquote&gt;&lt;p&gt;We could also potentially use a thread pool of workers mapped per topic to improve the concurrency across topics&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;This could be a future improvement to the connector that would stop the need for registering multiple connectors and allow the same as parallelism but with a single registered connector, by using a thread pool of workers, with one worker thread per topic.&lt;/p&gt;

&lt;p&gt;Ross&lt;/p&gt;</comment>
                            <comment id="3019535" author="yaramati@adobe.com" created="Tue, 31 Mar 2020 17:10:22 +0000"  >&lt;p&gt;Thank you, Ross Lawley, for addressing this ticket.&lt;/p&gt;

&lt;p&gt;For the use case (Data migration from one cluster to another) I am using, it required sequential write on target. I realized this only after getting to know the reason why only one sink task.&#160;&lt;/p&gt;

&lt;p&gt;Now I created topic per shard and at least I am getting sink task parallelisms equal to the number of shards.&#160;&lt;/p&gt;

&lt;p&gt;Can you please elaborate on what do you mean by &quot;We could also potentially use a thread pool of workers mapped per topic to improve the concurrency across topics.&quot;?&#160;&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Rajaramesh Yaramati&lt;/p&gt;</comment>
                            <comment id="3018248" author="ross@10gen.com" created="Tue, 31 Mar 2020 12:39:01 +0000"  >&lt;p&gt;Hi &lt;a href=&quot;https://jira.mongodb.org/secure/ViewProfile.jspa?name=yaramati%40adobe.com&quot; class=&quot;user-hover&quot; rel=&quot;yaramati@adobe.com&quot;&gt;yaramati@adobe.com&lt;/a&gt;,&lt;/p&gt;

&lt;p&gt;Thanks for the ticket, at the moment only a single task is used when writing data to MongoDB.  This is by design to ensure that all writes are sequential and ordered as seen on the topic. If you are watching multiple topics then it may be more efficient to set up a connector per topic, as that would improve the concurrency of the operations.&lt;/p&gt;

&lt;p&gt;Do you require all writes to happen sequentially and in order? If that is not the case then potentially a new feature could be added to allow this.  &lt;br/&gt;
Are you watching multiple topics? We could also potentially use a thread pool of workers mapped per topic to improve the concurrency across topics.&lt;/p&gt;

&lt;p&gt;Kind Regards,&lt;/p&gt;

&lt;p&gt;Ross Lawley&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10012">
                    <name>Related</name>
                                            <outwardlinks description="related to">
                                                        </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                                                                                    <customfield id="customfield_13552" key="com.go2group.jira.plugin.crm:crm_generic_field">
                        <customfieldname>Case</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue><![CDATA[[5002K00000mqvAXQAY]]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                            <customfield id="customfield_13653" key="com.atlassian.jira.plugin.system.customfieldtypes:textarea">
                        <customfieldname>Detailed Project Statuses</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>&lt;p&gt;Engineer(s): Ross&lt;/p&gt;

&lt;p&gt;This epic is more like a one off ticket. We converted this work to an epic so it showed up on our board for quarterly planning. &lt;br/&gt;
The actual work required for this ticket turned out to be easier than expected and is now in code review.&lt;/p&gt;

&lt;hr /&gt;</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_15850" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                        <customfield id="customfield_10858" key="com.pyxis.greenhopper.jira:gh-epic-label">
                        <customfieldname>Epic Name</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Kafka connector - Number of sink tasks</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10859" key="com.pyxis.greenhopper.jira:gh-epic-status">
                        <customfieldname>Epic Status</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10642"><![CDATA[Done]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                <customfield id="customfield_12550" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>2|ht6x3r:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10558" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>