|
My database contains the following documents:
use test;
|
db.test.insert(
|
{
|
'name' : 'first',
|
'color' : 'blue',
|
'type' : 'break'
|
}
|
)
|
|
db.test.insert(
|
{
|
'name' : 'second',
|
'color' : 'red',
|
//'type' : 'break'
|
}
|
)
|
Note that the second document does not have a type field, it's missing. The aggregation pipeline produces inconsistent results sometimes showing 'type' :null in the id field, and sometimes not.
For example, the following aggregation does NOT return the missing type in the key:
db.test.aggregate([
|
{
|
$group : {
|
'_id' : {
|
'name' : '$name',
|
'type' : '$type',
|
'color' : '$color'
|
},
|
'count' : {$sum : 1}
|
}
|
|
},
|
{
|
$group : {
|
'_id' : {
|
'type' : '$_id.type',
|
'color' : '$_id.color'
|
},
|
'count' : {$sum : 1}
|
}
|
|
}
|
])
|
Results in:
{ "_id" : { "type" : "break", "color" : "blue" }, "count" : 1 }
|
{ "_id" : { "color" : "red" }, "count" : 1 }
|
However, this aggregation, returns back a 'type' : null,
db.test.aggregate([
|
{
|
$group : {
|
'_id' : {
|
'name' : '$name',
|
'type' : '$type',
|
'color' : '$color'
|
},
|
'count' : {$sum : 1}
|
}
|
|
},
|
{
|
$group : {
|
'_id' : {
|
'type' : '$_id.type',
|
//'color' : '$_id.color'
|
},
|
'count' : {$sum : 1}
|
}
|
|
}
|
])
|
Results in:
{ "_id" : { "type" : "break" }, "count" : 1 }
|
{ "_id" : { "type" : null }, "count" : 1 }
|
Note that the only difference was the comment out of the 'color' field in the second group.
The results should be consistent between these two queries. Either the first results should look like:
|
|
When we get to fixing this, I agree that the latest patch shows behavior I would consider more correct.
|
|
Ah okay, good point. Then I would propose the following patch instead:
diff --git a/src/mongo/db/pipeline/document_source_group.cpp b/src/mongo/db/pipeline/document_source_group.cpp
|
index e605d14..6bbb58c 100644
|
--- a/src/mongo/db/pipeline/document_source_group.cpp
|
+++ b/src/mongo/db/pipeline/document_source_group.cpp
|
@@ -329,9 +329,14 @@ void DocumentSourceGroup::populate() {
|
/* get the _id value */
|
Value id = computeId(_variables.get());
|
|
- /* treat missing values the same as NULL SERVER-4674 */
|
- if (id.missing())
|
+ // If the _id was a single value, then treat a missing value the same as null. If the _id
|
+ // was an object, then leave it as a missing value. For example, a {$group: {_id: "$x"}}
|
+ // should treat a document without the "x" field as if the _id was null, but a
|
+ // {$group: {_id: {x: "$x"}}} should treat the same document as if the _id was an empty
|
+ // object. See SERVER-21992 for more details.
|
+ if (id.missing() && _idFieldNames.empty()) {
|
id = Value(BSONNULL);
|
+ }
|
|
/*
|
Look for the _id value in the map; if it's not there, add a
|
Behavior before the patch (using version 3.2.1)
> db.test.drop()
|
true
|
> db.test.insert({n: "f"})
|
WriteResult({ "nInserted" : 1 })
|
> db.test.aggregate({"$group":{_id: "$none" } })
|
{ "_id" : null }
|
> db.test.aggregate({"$group":{_id:{ "none":"$none"} } })
|
{ "_id" : { "none" : null } }
|
> db.test.aggregate({"$group":{_id:{ "none":"$none" , "n": "$n"} } })
|
{ "_id" : { "n" : "f" } }
|
> db.test.aggregate({"$group":{_id:{ "none":"$none", "none2":"$none2"} } })
|
{ "_id" : { } }
|
Behavior after the patch
> db.test.drop()
|
false
|
> db.test.insert({n: "f"})
|
WriteResult({ "nInserted" : 1 })
|
> db.test.aggregate({"$group":{_id: "$none" } })
|
{ "_id" : null }
|
> db.test.aggregate({"$group":{_id:{ "none":"$none"} } })
|
{ "_id" : { } }
|
> db.test.aggregate({"$group":{_id:{ "none":"$none" , "n": "$n"} } })
|
{ "_id" : { "n" : "f" } }
|
> db.test.aggregate({"$group":{_id:{ "none":"$none", "none2":"$none2"} } })
|
{ "_id" : { } }
|
|
|
I think the before is more correct. We convert missing _ids to null in $group, but this is not a case of a missing _id. The _id is (logically) an ExpressionObject that should have the same behavior as in $project. The fact that we optimize handling of it shouldn't change behavior. Have look at the output of this stage:
{$project: {n : '$n', obj: {n: '$n', none: '$none'}}}
|
|
|
apara, you have a point. I think I was able to fix it with this patch:
diff --git a/src/mongo/db/pipeline/document_source_group.cpp b/src/mongo/db/pipeline/document_source_group.cpp
|
index e605d14..c84c1f8 100644
|
--- a/src/mongo/db/pipeline/document_source_group.cpp
|
+++ b/src/mongo/db/pipeline/document_source_group.cpp
|
@@ -498,7 +498,8 @@ Value DocumentSourceGroup::computeId(Variables* vars) {
|
vector<Value> vals;
|
vals.reserve(_idExpressions.size());
|
for (size_t i = 0; i < _idExpressions.size(); i++) {
|
- vals.push_back(_idExpressions[i]->evaluate(vars));
|
+ Value val = _idExpressions[i]->evaluate(vars);
|
+ vals.push_back(val.missing() ? Value(BSONNULL) : val);
|
}
|
return Value(std::move(vals));
|
}
|
After the patch, I got this behavior:
> db.test.drop()
|
false
|
> db.test.insert({n: "f"})
|
WriteResult({ "nInserted" : 1 })
|
> db.test.aggregate({"$group":{_id:{ "none":"$none"} } })
|
{ "_id" : { "none" : null } }
|
> db.test.aggregate({"$group":{_id:{ "none":"$none" , "n": "$n"} } })
|
{ "_id" : { "none" : null, "n" : "f" } }
|
> db.test.aggregate({"$group":{_id:{ "none":"$none", "none2":"$none2"} } })
|
{ "_id" : { "none" : null, "none2" : null } }
|
This is a backwards breaking change, but I think it's worth doing.
Alternatively, we could remove these lines, but that looks more intentional, to resolve SERVER-4674. It looks to me like the above behavior is desired, and the fix to SERVER-4674 just didn't cover all the cases. i.e. This is also a weirdness that would be resolved by the above patch:
> db.test.drop()
|
true
|
> db.test.insert({n: null})
|
WriteResult({ "nInserted" : 1 })
|
> db.test.insert({})
|
WriteResult({ "nInserted" : 1 })
|
> db.test.find().table()
|
╔══════════════════════════════════════╤══════╗
|
║ _id │ n ║
|
╠══════════════════════════════════════╪══════╣
|
║ ObjectId("56ccb70b9138fcb143fdfb6f") │ null ║
|
╟──────────────────────────────────────┼──────╢
|
║ ObjectId("56ccb7199138fcb143fdfb70") │ ║
|
╚══════════════════════════════════════╧══════╝
|
Before patch
> db.test.aggregate([{$group: {_id: "$n", count: {$sum: 1}}}])
|
{ "_id" : null, "count" : 2 }
|
> db.test.aggregate([{$group: {_id: {n: "$n", none: "$none"}, count: {$sum: 1}}}])
|
{ "_id" : { }, "count" : 1 }
|
{ "_id" : { "n" : null }, "count" : 1 }
|
After patch:
> db.test.aggregate([{$group: {_id: "$n", count: {$sum: 1}}}])
|
{ "_id" : null, "count" : 2 }
|
> db.test.aggregate([{$group: {_id: {n: "$n", none: "$none"}, count: {$sum: 1}}}])
|
{ "_id" : { "n" : null, "none" : null }, "count" : 2 }
|
What are your thoughts asya, redbeard0531?
|
|
We worked around this issue in our code, however, for consistency sake either all missing fields be returned with null, or none of them should.
|
|
Here's a much simpler reproducer of weirdness - the issue is grouping on subdocument and one of the named fields being null - includes it with null value when that's the only field, but grouping on more than one field when any are null, omits those null fields entirely.
db.test.drop()
|
db.test.insert({"n":"f"})
|
db.test.aggregate({"$group":{_id:{ "none":"$none"} } })
|
db.test.aggregate({"$group":{_id:{ "none":"$none" , "n": "$n"} } })
|
db.test.aggregate({"$group":{_id:{ "none":"$none", "none2":"$none2"} } })
|
|
// results of three aggs above:
|
{ "_id" : { "none" : null } }
|
{ "_id" : { "n" : "f" } }
|
{ "_id" : { } }
|
When there is exactly one field in subdocument and it is missing, you get explicit null. I couldn't get null to show up in any other cases (this is with 3.2).
|
|
Hi apara,
Thank you for the detailed example. I am sending this to the query team to be scheduled. Updates will be posted on this ticket as they happen.
Regards,
Thomas
|
|
This has been tested with 3.0.8 using WiredTiger engine.
|
|
Either the first result should look like:
{ "_id" : { "type" : "break", "color" : "blue" }, "count" : 1 }
|
{ "_id" : { "type" : null, "color" : "red" }, "count" : 1 }
|
or the second result should look like:
{ "_id" : { "type" : "break" }, "count" : 1 }
|
{ "_id" : { }, "count" : 1 }
|
|
Generated at Thu Feb 08 03:59:04 UTC 2024 using Jira 9.7.1#970001-sha1:2222b88b221c4928ef0de3161136cc90c8356a66.