<!-- 
RSS generated by JIRA (9.7.1#970001-sha1:2222b88b221c4928ef0de3161136cc90c8356a66) at Thu Feb 08 09:05:37 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>MongoDB Jira</title>
    <link>https://jira.mongodb.org</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.7.1</version>
        <build-number>970001</build-number>
        <build-date>13-04-2023</build-date>
    </build-info>


<item>
            <title>[KAFKA-117] Is it possible to implement copying new collections if pipeline has been changed?</title>
                <link>https://jira.mongodb.org/browse/KAFKA-117</link>
                <project id="16285" key="KAFKA">Kafka Connector</project>
                    <description>&lt;p&gt;Use case workflow:&lt;/p&gt;

&lt;p&gt;Create connector with config:&lt;/p&gt;
&lt;p/&gt;
&lt;div id=&quot;syntaxplugin&quot; class=&quot;syntaxplugin&quot; style=&quot;border: 1px dashed #bbb; border-radius: 5px !important; overflow: auto; max-height: 30em;&quot;&gt;
&lt;table cellspacing=&quot;0&quot; cellpadding=&quot;0&quot; border=&quot;0&quot; width=&quot;100%&quot; style=&quot;font-size: 1em; line-height: 1.4em !important; font-weight: normal; font-style: normal; color: black;&quot;&gt;
		&lt;tbody &gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;  margin-top: 10px;   width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: blue; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;&quot;pipeline&quot;&lt;/span&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;: &lt;/span&gt;&lt;span style=&quot;color: blue; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;&quot;[{\&quot;$match\&quot;: {\&quot;ns.coll\&quot;: {\&quot;$regex\&quot;: /^(col1|col2)$/}}}]&quot;&lt;/span&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;,&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;   width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: blue; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;&quot;copy.existing&quot;&lt;/span&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;: &lt;/span&gt;&lt;span style=&quot;color: blue; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;&quot;true&quot;&lt;/span&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;,&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;   margin-bottom: 10px;  width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;...&#160;&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
			&lt;/tbody&gt;
&lt;/table&gt;
&lt;/div&gt;
&lt;p/&gt;
&lt;p&gt;After some time update pipeline in connector&apos;s config to:&lt;/p&gt;
&lt;p/&gt;
&lt;div id=&quot;syntaxplugin&quot; class=&quot;syntaxplugin&quot; style=&quot;border: 1px dashed #bbb; border-radius: 5px !important; overflow: auto; max-height: 30em;&quot;&gt;
&lt;table cellspacing=&quot;0&quot; cellpadding=&quot;0&quot; border=&quot;0&quot; width=&quot;100%&quot; style=&quot;font-size: 1em; line-height: 1.4em !important; font-weight: normal; font-style: normal; color: black;&quot;&gt;
		&lt;tbody &gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;  margin-top: 10px;   margin-bottom: 10px;  width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: blue; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;&quot;[{\&quot;$match\&quot;: {\&quot;ns.coll\&quot;: {\&quot;$regex\&quot;: /^(col1|col2|col3)$/}}}]&quot;&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
			&lt;/tbody&gt;
&lt;/table&gt;
&lt;/div&gt;
&lt;p/&gt;

&lt;p&gt;Desired result after restart:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;save resume token&lt;/li&gt;
	&lt;li&gt;somehow determine that only documents from the &apos;col3&apos; collection need to be copied&lt;/li&gt;
	&lt;li&gt;copy documents from &apos;col3&apos;&lt;/li&gt;
	&lt;li&gt;start streaming from saved resume token&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;What do you think about it?&lt;/p&gt;</description>
                <environment></environment>
        <key id="1383907">KAFKA-117</key>
            <summary>Is it possible to implement copying new collections if pipeline has been changed?</summary>
                <type id="2" iconUrl="https://jira.mongodb.org/secure/viewavatar?size=xsmall&amp;avatarId=14711&amp;avatarType=issuetype">New Feature</type>
                                            <priority id="3" iconUrl="https://jira.mongodb.org/images/icons/priorities/major.svg">Major - P3</priority>
                        <status id="6" iconUrl="https://jira.mongodb.org/images/icons/statuses/closed.png" description="The issue is considered finished, the resolution is correct. Issues which are closed can be reopened.">Closed</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="2">Won&apos;t Fix</resolution>
                                        <assignee username="ross@mongodb.com">Ross Lawley</assignee>
                                    <reporter username="andreworty@gmail.com">Andrey B</reporter>
                        <labels>
                    </labels>
                <created>Thu, 18 Jun 2020 15:28:17 +0000</created>
                <updated>Thu, 2 Jun 2022 15:21:01 +0000</updated>
                            <resolved>Mon, 10 Aug 2020 12:14:21 +0000</resolved>
                                                                    <component>Source</component>
                                        <votes>0</votes>
                                    <watches>2</watches>
                                                                                                                <comments>
                            <comment id="3354343" author="JIRAUSER1269378" created="Mon, 24 Aug 2020 10:06:05 +0000"  >&lt;p&gt;I created a separate ticket for the last question. &lt;a href=&quot;https://jira.mongodb.org/browse/KAFKA-147&quot; title=&quot;Copying existing namespaces by regex&quot; class=&quot;issue-link&quot; data-issue-key=&quot;KAFKA-147&quot;&gt;&lt;del&gt;KAFKA-147&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="3331068" author="JIRAUSER1269378" created="Mon, 10 Aug 2020 17:02:47 +0000"  >&lt;blockquote&gt;&lt;p&gt;I think at the moment reconfiguring the connector requires too much state to be stored.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;I agree.&lt;/p&gt;
&lt;blockquote&gt;&lt;p&gt;If you wish to add a new collection and copy the existing data over the process should be something like:&lt;/p&gt;

&lt;p&gt;1) Add a new connector to copy and monitor the new collection&lt;br/&gt;
2) Once the data copying process has finished and normal change stream events are being published, stop the new connector&lt;br/&gt;
3) Reconfigure the existing connector to include the newly added collection.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;This approach could lead to data gaps.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;What do you think about explicitly configuring which collections should be copied? I&apos;m not talking about saving state and checking whether there are new collections that should be copied, just a config property that defines which collections should be copied at the beginning of work.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Andrey&lt;/p&gt;</comment>
                            <comment id="3329927" author="ross@10gen.com" created="Mon, 10 Aug 2020 12:14:11 +0000"  >&lt;p&gt;Hi &lt;a href=&quot;https://jira.mongodb.org/secure/ViewProfile.jspa?name=andreworty%40gmail.com&quot; class=&quot;user-hover&quot; rel=&quot;andreworty@gmail.com&quot;&gt;andreworty@gmail.com&lt;/a&gt;,&lt;/p&gt;

&lt;p&gt;I think at the moment reconfiguring the connector requires too much state to be stored.  &lt;/p&gt;

&lt;p&gt;If you wish to add a new collection and copy the existing data over the process should be something like:&lt;/p&gt;

&lt;p&gt;1) Add a new connector to copy and monitor the new collection&lt;br/&gt;
2) Once the data copying process has finished and normal change stream events are being published, stop the new connector&lt;br/&gt;
3) Reconfigure the existing connector to include the newly added collection.&lt;/p&gt;

&lt;p&gt;That is probably more efficient than running lots of change stream cursors and connectors and allows for the growth of watching and copying new collections.&lt;/p&gt;

&lt;p&gt;I&apos;m going to close this ticket for now as &quot;Won&apos;t Fix&quot; however, should more people require this functionality and comment on this ticket we can always reopen it in the future.&lt;/p&gt;

&lt;p&gt;Ross&lt;/p&gt;</comment>
                            <comment id="3328997" author="JIRAUSER1269378" created="Sat, 8 Aug 2020 15:06:29 +0000"  >&lt;p&gt;Hi again, Ross, &lt;br/&gt;
 What do you think about it?&lt;/p&gt;

&lt;p&gt;A little bit more about my case:&lt;br/&gt;
 I have a couple of thousand collections and streaming will be turned on gradually, not all at once. I also need to copy the existing data for new collections. Usually, these collections are quite small, a few hundred or thousands of documents. So, I guess, creating a new connector every time when I want to start streaming new collections should work for me.&lt;/p&gt;

&lt;p&gt;What do you think about &lt;b&gt;copy.existing.collections&lt;/b&gt; or &lt;b&gt;copy.existing.collection.regex&lt;/b&gt; parameters? &lt;br/&gt;
 There could be a pipeline that ignores certain collections, but the code in &lt;a href=&quot;https://github.com/mongodb/mongo-kafka/blob/master/src/main/java/com/mongodb/kafka/connect/source/MongoCopyDataManager.java#L125-L129&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;MongoCopyDataManager.copyDataFrom()&lt;/a&gt; could take some time for really big collections, and that time would be wasted. In my case, up to a few minutes could be wasted for a single collection, but I guess it could be more.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;It would be great to define explicitly which collections should be copied.&lt;/p&gt;</comment>
                            <comment id="3229849" author="JIRAUSER1269378" created="Tue, 30 Jun 2020 12:57:01 +0000"  >&lt;p&gt;Hi Ross,&lt;/p&gt;

&lt;p&gt;Thanks for the reply.&lt;/p&gt;
&lt;blockquote&gt;&lt;p&gt;Due to the pipeline possibly containing any valid pipeline operation, it would be hard to determine if any new collections existed.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;Maybe it&apos;s better to add new parameters to connector config, like &lt;b&gt;copy.existing.collections&lt;/b&gt; or &lt;b&gt;copy.existing.collection.regex&lt;/b&gt;&lt;/p&gt;
&lt;blockquote&gt;&lt;p&gt;Also where to keep the metadata about what had already been seen / processed.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt; I thought about some special Kafka topic.&lt;/p&gt;
&lt;blockquote&gt;&lt;p&gt;I think for the level of complexity it would add, registering a new connector instance would potentially be the simplest solution.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;I agree it&apos;s a much simpler solution.&lt;br/&gt;
 Could you advise how many connector instances we can create? For example, would there be a performance degradation on MongoDB itself if we created 2000 instances? What about 10000?&lt;/p&gt;

&lt;p&gt;Andrey&lt;/p&gt;</comment>
                            <comment id="3229713" author="ross@10gen.com" created="Tue, 30 Jun 2020 11:03:17 +0000"  >&lt;p&gt;Hi &lt;a href=&quot;https://jira.mongodb.org/secure/ViewProfile.jspa?name=andreworty%40gmail.com&quot; class=&quot;user-hover&quot; rel=&quot;andreworty@gmail.com&quot;&gt;andreworty@gmail.com&lt;/a&gt;,&lt;/p&gt;

&lt;p&gt;Thanks for the ticket.  Due to the pipeline possibly containing any valid pipeline operation, it would be hard to determine if any new collections existed.  Also where to keep the metadata about what had already been seen / processed.&lt;/p&gt;

&lt;p&gt;I think for the level of complexity it would add, registering a new connector instance would potentially be the simplest solution. &lt;/p&gt;

&lt;p&gt;Ross&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_15850" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    <customfield id="customfield_12550" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>2|hxdirz:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10558" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                </customfields>
    </item>
</channel>
</rss>