|
There are a couple of issues in play here.
1. PCRE patterns cannot contain embedded NUL bytes
The pattern is converted to a C-string within the PCRE library, so it is necessary to escape NUL bytes within the pattern. The $regex pattern specified in the query
db.test.find({_id: {"$regex": "^a\x00b"}})
|
is interpreted by the PCRE library as "^a" because the string is truncated upon the first NUL byte. This is why it matches the documents {_id: "a"} and {_id: "a\u0000"}. To have the PCRE library match a leading 'a' character followed by a NUL byte, the NUL byte within the string must be escaped. For example,
db.test.find({_id: {"$regex": "^a\\x00b"}})
|
See the "Non-printing characters" section of http://www.pcre.org/pcre.txt for all the different ways the NUL byte can be escaped (e.g. \0, \x00, \x{0}, etc.)
2. There is a bug in MongoDB's integration with the PCRE library that causes the string data stored in a document to be truncated at the first NUL byte when attempting to do pattern matching on it. This line is the cause of the issue because we will end up using the StringPiece(const char* str) constructor, which calls strlen(), and thus causes the pattern matching on the string data to stop at the first NUL byte.
We should instead use the StringPiece(const char* offset, int len) constructor and pass e.valuestrsize() - 1 as the length of the string data.
|
|
v2.4.7 and v3.0.3
> db.version()
|
3.0.3
|
> use testdb
|
switched to db testdb
|
> db
|
testdb
|
> db.test.find()
|
> db.test.save({_id:"a","tag":"a"})
|
WriteResult({ "nMatched" : 0, "nUpserted" : 1, "nModified" : 0, "_id" : "a" })
|
> db.test.find()
|
{ "_id" : "a", "tag" : "a" }
|
> db.test.save({_id:"a\x00","tag":"a0"})
|
WriteResult({ "nMatched" : 0, "nUpserted" : 1, "nModified" : 0, "_id" : "a\u0000" })
|
> db.test.find()
|
{ "_id" : "a", "tag" : "a" }
|
{ "_id" : "a\u0000", "tag" : "a0" }
|
> db.test.find({_id: "a"})
|
{ "_id" : "a", "tag" : "a" }
|
> db.test.find({_id: "a\x00"})
|
{ "_id" : "a\u0000", "tag" : "a0" }
|
> db.test.find({_id: "a\u0000"})
|
{ "_id" : "a\u0000", "tag" : "a0" }
|
> db.test.find({_id: "a\0"})
|
{ "_id" : "a\u0000", "tag" : "a0" }
|
> db.test.find({_id:{"$regex":"^a"}}) // correct
|
{ "_id" : "a", "tag" : "a" }
|
{ "_id" : "a\u0000", "tag" : "a0" }
|
> db.test.find({_id:{"$regex":"^a\x00"}}) // here { "_id" : "a", "tag" : "a" } is unexpected
|
{ "_id" : "a", "tag" : "a" }
|
{ "_id" : "a\u0000", "tag" : "a0" }
|
> db.test.find({_id:{"$regex":"^a\u0000"}}) // here { "_id" : "a", "tag" : "a" } is unexpected
|
{ "_id" : "a", "tag" : "a" }
|
{ "_id" : "a\u0000", "tag" : "a0" }
|
>
|
another test in v3.0.3
> db.test.find()
|
{ "_id" : "a", "tag" : "a" }
|
{ "_id" : "a\u0000", "tag" : "a0" }
|
{ "_id" : "ab", "tag" : "ab" }
|
{ "_id" : "a\u0000b", "tag" : "a0b" }
|
{ "_id" : "aab" }
|
> db.test.find({"_id":{"$regex":"^a\x00b"}}) // should only match one item
|
{ "_id" : "a", "tag" : "a" }
|
{ "_id" : "a\u0000", "tag" : "a0" }
|
{ "_id" : "a\u0000b", "tag" : "a0b" }
|
{ "_id" : "aab" }
|
{ "_id" : "ab", "tag" : "ab" }
|
> db.test.find({"_id":{"$regex":"a\x00b"}}) // should only match one item
|
{ "_id" : "a", "tag" : "a" }
|
{ "_id" : "a\u0000", "tag" : "a0" }
|
{ "_id" : "a\u0000b", "tag" : "a0b" }
|
{ "_id" : "aab" }
|
{ "_id" : "ab", "tag" : "ab" }
|
>
|
|
> db.test.find({"_id":{"$regex":"b\x00sdfsdjkfsdk"}}) // no item should match
|
{ "_id" : "aab" }
|
{ "_id" : "ab", "tag" : "ab" }
|
> db.test.find({"_id":{"$regex":"\x00sdfsdjkfsdk"}}) // no item should match, but here we get all of them; this can cause a security problem!
|
{ "_id" : "a", "tag" : "a" }
|
{ "_id" : "a\u0000", "tag" : "a0" }
|
{ "_id" : "a\u0000b", "tag" : "a0b" }
|
{ "_id" : "aab" }
|
{ "_id" : "ab", "tag" : "ab" }
|
>
|
v1.8.3
In v1.8.3, it looks like "a" and "a\u0000" are treated as the same value.
> db.version()
|
1.8.3
|
> use testdb
|
switched to db testdb
|
> db
|
testdb
|
> db.test.find()
|
> db.test.save({_id:"a","tag":"a"})
|
> db.test.find()
|
{ "_id" : "a", "tag" : "a" }
|
> db.test.save({_id:"a\x00","tag":"a0"})
|
> db.test.find() // expect two items
|
{ "_id" : "a", "tag" : "a0" }
|
> db.test.find({_id: "a"})
|
{ "_id" : "a", "tag" : "a0" }
|
> db.test.find({_id: "a\x00"})
|
{ "_id" : "a", "tag" : "a0" }
|
> db.test.find({_id: "a\u0000"})
|
{ "_id" : "a", "tag" : "a0" }
|
> db.test.find({_id: "a\0"})
|
{ "_id" : "a", "tag" : "a0" }
|
> db.test.find({_id:{"$regex":"^a"}})
|
{ "_id" : "a", "tag" : "a0" }
|
> db.test.find({_id:{"$regex":"^a\x00"}})
|
{ "_id" : "a", "tag" : "a0" }
|
> db.test.find({_id:{"$regex":"^a\u0000"}})
|
{ "_id" : "a", "tag" : "a0" }
|
>
|
go test code:
package main
|
|
import (
|
"flag"
|
"fmt"
|
"log"
|
"regexp"
|
|
"labix.org/v2/mgo"
|
"labix.org/v2/mgo/bson"
|
)
|
|
// Command-line flags selecting the MongoDB server and the namespace that the
// reproduction program operates on. Each flag.String call returns a *string
// that is only populated after flag.Parse has run.
var (
|
// host is the server address; it is joined with port as "host:port" when dialing.
host = flag.String("h", "127.0.0.1", "host")
|
// port is the server port; it is joined with host as "host:port" when dialing.
port = flag.String("p", "27017", "port")
|
// db is the name of the database passed to session.DB.
db = flag.String("d", "test", "db")
|
// coll is the name of the collection that is dropped, repopulated and queried.
coll = flag.String("c", "test", "coll")
|
)
|
|
func main() {
|
log.SetFlags(log.Lshortfile | log.LstdFlags)
|
flag.Parse()
|
session, err := mgo.Dial(*host + ":" + *port)
|
if err != nil {
|
log.Fatal(err)
|
}
|
coll := session.DB(*db).C(*coll)
|
// drop collection
|
err = coll.DropCollection()
|
if err != nil {
|
log.Println(err)
|
}
|
// insert some item
|
err = coll.Insert(bson.M{"_id": "a"}, bson.M{"_id": "a\x00b"}, bson.M{"_id": "ab"}, bson.M{"_id": "bc"})
|
if err != nil {
|
log.Fatal(err)
|
}
|
// try to find
|
all := []bson.M{}
|
err = coll.Find(nil).All(&all)
|
if err != nil {
|
log.Fatal(err)
|
}
|
fmt.Printf("all items:\t%#v\n", all)
|
// try to find "\x00"
|
err = coll.Find(bson.M{"_id": bson.M{"$regex": regexp.QuoteMeta("\x00")}}).All(&all)
|
if err != nil {
|
log.Fatal(err)
|
}
|
fmt.Printf("search \\x00:\t%#v\n", all)
|
|
// try to find "a\x00"
|
err = coll.Find(bson.M{"_id": bson.M{"$regex": regexp.QuoteMeta("a\x00")}}).All(&all)
|
if err != nil {
|
log.Fatal(err)
|
}
|
fmt.Printf("search a\\x00:\t%#v\n", all)
|
|
// try to find "a\x00b"
|
err = coll.Find(bson.M{"_id": bson.M{"$regex": regexp.QuoteMeta("a\x00b")}}).All(&all)
|
if err != nil {
|
log.Fatal(err)
|
}
|
fmt.Printf("search a\\x00b:\t%#v\n", all)
|
|
// try to find "^\x00"
|
err = coll.Find(bson.M{"_id": bson.M{"$regex": "^" + regexp.QuoteMeta("\x00")}}).All(&all)
|
if err != nil {
|
log.Fatal(err)
|
}
|
fmt.Printf("search ^\\x00:\t%#v\n", all)
|
|
// try to find "^a\x00"
|
err = coll.Find(bson.M{"_id": bson.M{"$regex": "^" + regexp.QuoteMeta("a\x00")}}).All(&all)
|
if err != nil {
|
log.Fatal(err)
|
}
|
fmt.Printf("search ^a\\x00:\t%#v\n", all)
|
|
// try to find "^\x00test"
|
err = coll.Find(bson.M{"_id": bson.M{"$regex": "^" + regexp.QuoteMeta("\x00test")}}).All(&all)
|
if err != nil {
|
log.Fatal(err)
|
}
|
fmt.Printf("search ^\\x00test:\t%#v\n", all)
|
return
|
}
|
go test code result:
all items: []bson.M{bson.M{"_id":"a"}, bson.M{"_id":"a\x00b"}, bson.M{"_id":"ab"}, bson.M{"_id":"bc"}}
|
search \x00: []bson.M{bson.M{"_id":"a"}, bson.M{"_id":"a\x00b"}, bson.M{"_id":"ab"}, bson.M{"_id":"bc"}}
|
search a\x00: []bson.M{bson.M{"_id":"a"}, bson.M{"_id":"a\x00b"}, bson.M{"_id":"ab"}}
|
search a\x00b: []bson.M{bson.M{"_id":"a"}, bson.M{"_id":"a\x00b"}, bson.M{"_id":"ab"}}
|
search ^\x00: []bson.M{bson.M{"_id":"a"}, bson.M{"_id":"a\x00b"}, bson.M{"_id":"ab"}, bson.M{"_id":"bc"}}
|
search ^a\x00: []bson.M{bson.M{"_id":"a"}, bson.M{"_id":"a\x00b"}, bson.M{"_id":"ab"}}
|
search ^\x00test: []bson.M{bson.M{"_id":"a"}, bson.M{"_id":"a\x00b"}, bson.M{"_id":"ab"}, bson.M{"_id":"bc"}}
|
|