Description
A conditional count within a GroupBy aggregation is not currently optimised by the AstOptimizer, resulting in a $group that causes all documents to be pushed and then a following $project stage.
This has a significant impact on grouping, particularly when the group is actually just leveraging an index. With the unoptimised post-$group $project stage, the entire document set has to be collated, whereas with the $project optimised into the group (and with an appropriate index on the collection) the $group becomes non-blocking, performing at basically the same speed regardless of collection size.
The following demonstrates the difference in queries
No filter expression:
|
|
|
db.Foo.Aggregate([]).GroupBy(x => x.FooName, (x, y) => new Summary() {FooName = x, Count = y.Count()})
|
|
|
Optimised query:
|
|
|
db.Foo.Aggregate([{ "$group" : { "_id" : "$FooName", "__agg0" : { "$sum" : 1 } } }, { "$project" : { "FooName" : "$_id", "Count" : "$__agg0", "_id" : 0 } }])
|
|
|
Filtered expression:
|
|
|
db.Foo.Aggregate([]).GroupBy(x => x.FooName, (x, y) => new Summary() {FooName = x, Count = y.Count(x => (Convert(x.State, Int32) == 1))})
|
|
|
Unoptimised query:
|
|
|
db.Foo.Aggregate([{ "$group" : { "_id" : "$FooName", "_elements" : { "$push" : "$$ROOT" } } }, { "$project" : { "FooName" : "$_id", "Count" : { "$size" : { "$filter" : { "input" : "$_elements", "as" : "x", "cond" : { "$eq" : ["$$x.State", 1] } } } }, "_id" : 0 } }])
|
As you can see, once we make the Count filtered it fails to optimise and falls back to a full $push: "$$ROOT".
I think this can be optimised by converting the expression from a $size to a:
$sum: {
|
$cond: [
|
{ $eq: ["$State", 1]},
|
1,
|
0
|
]
|
}
|
I had a scan of the source and I think this would be done in AstGroupingPipelineOptimizer.AccumulatorMover.VisitUnaryExpression.TryOptimizeSizeOfElements. It looks like this currently only optimizes counts for the entire _element set.
Demo source code:
using MongoDB.Driver;
|
using MongoDB.Driver.Linq;
|
using static System.Linq.Queryable; |
|
|
public class Program |
{
|
public enum State |
{
|
Started,
|
Running,
|
Complete
|
}
|
|
|
public class Foo |
{
|
public string FooName; |
public State State; |
}
|
|
|
public class Summary |
{
|
public string FooName; |
public int Count; |
}
|
|
|
public static void Main(string[] args) |
{
|
// Setup |
string uri = "mongodb://localhost:27017"; |
MongoClientSettings settings = MongoClientSettings.FromConnectionString(uri);
|
settings.LinqProvider = LinqProvider.V3;
|
MongoClient client = new(settings: settings); |
IMongoDatabase database = client.GetDatabase("db"); |
IMongoCollection<Foo> foos = database.GetCollection<Foo>("Foo"); |
foos.DeleteMany(x => true); |
|
|
// Test data |
foos.InsertOne(new() { FooName = "foo1", State = State.Started }); |
foos.InsertOne(new() { FooName = "foo1", State = State.Running }); |
foos.InsertOne(new() { FooName = "foo2", State = State.Running }); |
foos.InsertOne(new() { FooName = "foo2", State = State.Running }); |
foos.InsertOne(new() { FooName = "foo2", State = State.Complete }); |
foos.InsertOne(new() { FooName = "foo3", State = State.Complete }); |
// Queryable |
IQueryable<Foo> foosQ = foos.AsQueryable();
|
|
|
// No filter, just count |
var countByName = foosQ.GroupBy(x => x.FooName, (x, y) => new Summary() |
{
|
FooName = x,
|
Count = y.Count()
|
});
|
|
|
// Filter by running |
var runningByName = foosQ.GroupBy(x => x.FooName, (x, y) => new Summary() |
{
|
FooName = x,
|
Count = y.Count(x => x.State == State.Running)
|
});
|
|
|
Console.WriteLine($"No filter expression: {countByName.Expression}"); |
Console.WriteLine($"Optimised query = {countByName}"); |
Console.WriteLine();
|
Console.WriteLine($"Filtered expression: {runningByName.Expression}"); |
Console.WriteLine($"Unoptimised query = {runningByName}"); } |
}
|