<!-- 
RSS generated by JIRA (9.7.1#970001-sha1:2222b88b221c4928ef0de3161136cc90c8356a66) at Thu Feb 08 04:04:43 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>MongoDB Jira</title>
    <link>https://jira.mongodb.org</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.7.1</version>
        <build-number>970001</build-number>
        <build-date>13-04-2023</build-date>
    </build-info>


<item>
            <title>[SERVER-23881] allow regex word character (\w) and word boundary (\b) escapes to be unicode-aware</title>
                <link>https://jira.mongodb.org/browse/SERVER-23881</link>
                <project id="10000" key="SERVER">Core Server</project>
                    <description>&lt;p&gt;Provide a way to use regular expressions in MongoDB where the word character (\w) and word boundary (\b) escapes work for code points greater than or equal to 256.&lt;/p&gt;

&lt;h5&gt;&lt;a name=&quot;Originaldescription&quot;&gt;&lt;/a&gt;Original description&lt;/h5&gt;

&lt;p&gt;$regex word boundary fails by treating Danish &#248; character as a non-character&lt;/p&gt;

&lt;p/&gt;
&lt;div id=&quot;syntaxplugin&quot; class=&quot;syntaxplugin&quot; style=&quot;border: 1px dashed #bbb; border-radius: 5px !important; overflow: auto; max-height: 30em;&quot;&gt;
&lt;table cellspacing=&quot;0&quot; cellpadding=&quot;0&quot; border=&quot;0&quot; width=&quot;100%&quot; style=&quot;font-size: 1em; line-height: 1.4em !important; font-weight: normal; font-style: normal; color: black;&quot;&gt;
		&lt;tbody &gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;  margin-top: 10px;   margin-bottom: 10px;  width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;db.collection.find({ &quot;name&quot; : { &quot;$regex&quot; : &quot;.*\\bden\\b.*&quot; , &quot;$options&quot; : &quot;i&quot;} })&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
			&lt;/tbody&gt;
&lt;/table&gt;
&lt;/div&gt;
&lt;p/&gt;

&lt;p&gt;returns a document:&lt;/p&gt;
&lt;p/&gt;
&lt;div id=&quot;syntaxplugin&quot; class=&quot;syntaxplugin&quot; style=&quot;border: 1px dashed #bbb; border-radius: 5px !important; overflow: auto; max-height: 30em;&quot;&gt;
&lt;table cellspacing=&quot;0&quot; cellpadding=&quot;0&quot; border=&quot;0&quot; width=&quot;100%&quot; style=&quot;font-size: 1em; line-height: 1.4em !important; font-weight: normal; font-style: normal; color: black;&quot;&gt;
		&lt;tbody &gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;  margin-top: 10px;   margin-bottom: 10px;  width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;{  &quot;name&quot;: &quot;Death Is A Caress(D&#248;den Er Et Kj&#230;rtegn).sub&quot; }&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
			&lt;/tbody&gt;
&lt;/table&gt;
&lt;/div&gt;
&lt;p/&gt;</description>
                <environment></environment>
        <key id="281755">SERVER-23881</key>
            <summary>allow regex word character (\w) and word boundary (\b) escapes to be unicode-aware</summary>
                <type id="2" iconUrl="https://jira.mongodb.org/secure/viewavatar?size=xsmall&amp;avatarId=14711&amp;avatarType=issuetype">New Feature</type>
                                            <priority id="3" iconUrl="https://jira.mongodb.org/images/icons/priorities/major.svg">Major - P3</priority>
                        <status id="10038" iconUrl="https://jira.mongodb.org/images/icons/subtask.gif" description="">Backlog</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="backlog-query-optimization">Backlog - Query Optimization</assignee>
                                    <reporter username="niccottrell">Nic Cottrell (Personal)</reporter>
                        <labels>
                    </labels>
                <created>Fri, 22 Apr 2016 17:06:28 +0000</created>
                <updated>Tue, 6 Dec 2022 04:27:12 +0000</updated>
                                            <version>3.0.11</version>
                                                    <component>Querying</component>
                                        <votes>2</votes>
                                    <watches>15</watches>
                                                                                                                <comments>
                            <comment id="3947702" author="asya" created="Fri, 16 Jul 2021 19:16:40 +0000"  >&lt;p&gt;I think this user ran into the same issue:&#160; &lt;a href=&quot;https://www.mongodb.com/community/forums/t/regex-whole-word-match-not-working-for-vietnamese-language/114780/4&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://www.mongodb.com/community/forums/t/regex-whole-word-match-not-working-for-vietnamese-language/114780/4&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="1248290" author="niccottrell" created="Wed, 27 Apr 2016 19:32:30 +0000"  >&lt;p&gt;It seems that this will work for my case:&lt;/p&gt;

&lt;p/&gt;
&lt;div id=&quot;syntaxplugin&quot; class=&quot;syntaxplugin&quot; style=&quot;border: 1px dashed #bbb; border-radius: 5px !important; overflow: auto; max-height: 30em;&quot;&gt;
&lt;table cellspacing=&quot;0&quot; cellpadding=&quot;0&quot; border=&quot;0&quot; width=&quot;100%&quot; style=&quot;font-size: 1em; line-height: 1.4em !important; font-weight: normal; font-style: normal; color: black;&quot;&gt;
		&lt;tbody &gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;  margin-top: 10px;   margin-bottom: 10px;  width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;{ &quot;name&quot; : { &quot;$regex&quot; : &quot;(^|.*\\s+)den(\\s+.*|$)&quot; , &quot;$options&quot; : &quot;i&quot;} }&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
			&lt;/tbody&gt;
&lt;/table&gt;
&lt;/div&gt;
&lt;p/&gt;</comment>
                            <comment id="1248275" author="niccottrell" created="Wed, 27 Apr 2016 19:21:18 +0000"  >&lt;p&gt;Thanks - I just read up and understand. It&apos;s a bit of a shock since I&apos;m used to Java&apos;s engine that seems to be the exception..&lt;/p&gt;</comment>
                            <comment id="1247750" author="david.storch" created="Wed, 27 Apr 2016 13:15:52 +0000"  >&lt;p&gt;Hi &lt;a href=&quot;https://jira.mongodb.org/secure/ViewProfile.jspa?name=niccottrell&quot; class=&quot;user-hover&quot; rel=&quot;niccottrell&quot;&gt;niccottrell&lt;/a&gt;, just to clarify: I believe that we do build PCRE with UTF-8 support enabled. It appears that the default behavior of the escapes \b and \w, among others, is simply not changed when unicode support is enabled.&lt;/p&gt;</comment>
                            <comment id="1247587" author="niccottrell" created="Wed, 27 Apr 2016 08:26:52 +0000"  >&lt;p&gt;Thanks for the details. Strange that there&apos;s not a way to flip PCRE into Unicode mode. Most regex platforms/libraries I know allow a &quot;u&quot; Unicode flag to force behaviour like this. It feels a bit wrong for Mongo to not include regex support for at least other European languages out of the box.&lt;/p&gt;</comment>
                            <comment id="1247291" author="david.storch" created="Tue, 26 Apr 2016 22:12:07 +0000"  >&lt;p&gt;Hi &lt;a href=&quot;https://jira.mongodb.org/secure/ViewProfile.jspa?name=niccottrell&quot; class=&quot;user-hover&quot; rel=&quot;niccottrell&quot;&gt;niccottrell&lt;/a&gt;,&lt;/p&gt;

&lt;p&gt;Thanks for reporting this issue! You are absolutely correct that PCRE considers the Danish character &#248; to be a non-word character. This can be seen more clearly in the following example:&lt;/p&gt;
&lt;p/&gt;
&lt;div id=&quot;syntaxplugin&quot; class=&quot;syntaxplugin&quot; style=&quot;border: 1px dashed #bbb; border-radius: 5px !important; overflow: auto; max-height: 30em;&quot;&gt;
&lt;table cellspacing=&quot;0&quot; cellpadding=&quot;0&quot; border=&quot;0&quot; width=&quot;100%&quot; style=&quot;font-size: 1em; line-height: 1.4em !important; font-weight: normal; font-style: normal; color: black;&quot;&gt;
		&lt;tbody &gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;  margin-top: 10px;   width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;&amp;gt; db.c.drop();&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;   width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;&amp;gt; db.c.insert({test: &quot;&#248;&quot;});&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;   width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;&amp;gt; db.c.insert({test: &quot;a&quot;});&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;   width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;&amp;gt; db.c.find({test: /\w/});&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;   width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;{ &quot;_id&quot; : ObjectId(&quot;571fe12e5bf4a8f145edf56c&quot;), &quot;test&quot; : &quot;a&quot; }&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;   width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;&amp;gt; db.c.find({test: /\W/});&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
				&lt;tr id=&quot;syntaxplugin_code_and_gutter&quot;&gt;
						&lt;td  style=&quot; line-height: 1.4em !important; padding: 0em; vertical-align: top;&quot;&gt;
					&lt;pre style=&quot;font-size: 1em; margin: 0 10px;   margin-bottom: 10px;  width: auto; padding: 0;&quot;&gt;&lt;span style=&quot;color: black; font-family: &apos;Consolas&apos;, &apos;Bitstream Vera Sans Mono&apos;, &apos;Courier New&apos;, Courier, monospace !important;&quot;&gt;{ &quot;_id&quot; : ObjectId(&quot;571fe12b5bf4a8f145edf56b&quot;), &quot;test&quot; : &quot;&#248;&quot; }&lt;/span&gt;&lt;/pre&gt;
			&lt;/td&gt;
		&lt;/tr&gt;
			&lt;/tbody&gt;
&lt;/table&gt;
&lt;/div&gt;
&lt;p/&gt;
&lt;p&gt;This is why &#248; is forming a word boundary. The PCRE manual has the following to say on the subject:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;       PCRE handles caseless matching, and determines whether  characters  are&lt;br/&gt;
       letters,  digits, or whatever, by reference to a set of tables, indexed&lt;br/&gt;
       by character code point. When running in UTF-8 mode, or in the  16-  or&lt;br/&gt;
       32-bit libraries, this applies only to characters with code points less&lt;br/&gt;
       than 256. By default, higher-valued code  points  never  match  escapes&lt;br/&gt;
       such  as \w or \d. However, if PCRE is built with Unicode property sup-&lt;br/&gt;
       port, all characters can be tested with \p and \P,  or,  alternatively,&lt;br/&gt;
       the  PCRE_UCP option can be set when a pattern is compiled; this causes&lt;br/&gt;
       \w and friends to use Unicode property support instead of the  built-in&lt;br/&gt;
       tables.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;It elaborates:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test characters of any code value, but, by default, the characters that PCRE recognizes as digits, spaces, or word characters remain the same set as in non-UTF mode, all with values less than 256. This remains true even when PCRE is built to include Unicode property support, because to do otherwise would slow down PCRE in many common cases. Note in particular that this applies to \b and \B, because they are defined in terms of \w and \W. If you really want to test for a wider sense of, say, &quot;digit&quot;, you can use explicit Unicode property tests such as \p{Nd}. Alternatively, if you set the PCRE_UCP option, the way that the character escapes work is changed so that Unicode properties are used to determine which characters match. There are more details in the section on generic character types in the pcrepattern documentation.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;So, to summarize, this is the expected behavior of PCRE. Unfortunately, as a user of MongoDB, there is no way to make use of the PCRE_UCP option or to otherwise coerce PCRE to use the unicode-aware regular expression semantics you desire. Therefore, we will keep this ticket open as a feature request. I am going to move it into Needs Triage state, and our development team will evaluate it at our next triage meeting. Please continue to watch for updates.&lt;/p&gt;

&lt;p&gt;Best,&lt;br/&gt;
Dave&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10012">
                    <name>Related</name>
                                            <outwardlinks description="related to">
                                        <issuelink>
            <issuekey id="51954">SERVER-7218</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                <customfield id="customfield_10050" key="com.atlassian.jira.toolkit:comments">
                        <customfieldname># Replies</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>6.0</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                <customfield id="customfield_12751" key="com.atlassian.jira.plugin.system.customfieldtypes:multiselect">
                        <customfieldname>Assigned Teams</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="25126"><![CDATA[Query Optimization]]></customfieldvalue>
    
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                            <customfield id="customfield_13552" key="com.go2group.jira.plugin.crm:crm_generic_field">
                        <customfieldname>Case</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue><![CDATA[[5002K00000wXkR6QAK]]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                            <customfield id="customfield_10055" key="com.atlassian.jira.ext.charting:firstresponsedate">
                        <customfieldname>Date of 1st Reply</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Tue, 26 Apr 2016 20:59:07 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10052" key="com.atlassian.jira.toolkit:dayslastcommented">
                        <customfieldname>Days since reply</customfieldname>
                        <customfieldvalues>
                                        2 years, 29 weeks, 5 days ago
    
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_18254" key="com.onresolve.jira.groovy.groovyrunner:scripted-field">
                        <customfieldname>Dependencies</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue><![CDATA[]]></customfieldvalue>


                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_15850" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10057" key="com.atlassian.jira.toolkit:lastusercommented">
                        <customfieldname>Last comment by Customer</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>true</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10056" key="com.atlassian.jira.toolkit:lastupdaterorcommenter">
                        <customfieldname>Last commenter</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>alexander.golin@mongodb.com</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_11151" key="com.atlassian.jira.toolkit:LastCommentDate">
                        <customfieldname>Last public comment date</customfieldname>
                        <customfieldvalues>
                            2 years, 29 weeks, 5 days ago
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                    <customfield id="customfield_10051" key="com.atlassian.jira.toolkit:participants">
                        <customfieldname>Participants</customfieldname>
                        <customfieldvalues>
                                        <customfieldvalue>asya.kamsky@mongodb.com</customfieldvalue>
            <customfieldvalue>backlog-query-optimization</customfieldvalue>
            <customfieldvalue>david.storch@mongodb.com</customfieldvalue>
            <customfieldvalue>niccottrell</customfieldvalue>
    
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                        <customfield id="customfield_14254" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Product Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hrk9if:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                <customfield id="customfield_12550" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>2|hr2ej3:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10558" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_23361" key="com.onresolve.jira.groovy.groovyrunner:scripted-field">
                        <customfieldname>Requested By</customfieldname>
                        <customfieldvalues>
                                

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    <customfield id="customfield_22870" key="com.onresolve.jira.groovy.groovyrunner:scripted-field">
                        <customfieldname>Triagers</customfieldname>
                        <customfieldvalues>
                                    <customfieldvalue><![CDATA[kelsey.schubert@mongodb.com]]></customfieldvalue>
    

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                    <customfield id="customfield_14350" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>serverRank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hrkwxz:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                    </customfields>
    </item>
</channel>
</rss>