[SERVER-28942] sort by shard key or prefix of shard key may not require merge before group Created: 24/Apr/17  Updated: 06/Dec/22

Status: Backlog
Project: Core Server
Component/s: Aggregation Framework
Affects Version/s: None
Fix Version/s: None

Type: Improvement Priority: Major - P3
Reporter: Asya Kamsky Assignee: Backlog - Query Optimization
Resolution: Unresolved Votes: 0
Labels: asya, merge, optimization
Remaining Estimate: Not Specified
Time Spent: Not Specified
Original Estimate: Not Specified

Issue Links:
Related
related to SERVER-28327 ClientCursorMonitor can timeout curso... Closed
related to SERVER-5477 when sharded, no need to merge groups... Backlog
is related to SERVER-26321 Long-running aggregations can artific... Closed
Assigned Teams:
Query Optimization
Participants:
Case:

 Description   

Consider the following scenario:

Collection is sharded on field "symbol":

db.coll.aggregate({$sort:{symbol:1, date:1}},{$group:{_id:"$symbol", first:{$first:"$start"},last:{$last:"$start"}, sum:{$sum:"$volume"}}})

This pipeline will merge because of sort even though it does not need to.

Related but not quite the same is SERVER-5477 which would allow the group to take place on the single shard in this example, but it wouldn't happen if $sort by shard key also wasn't correctly handled.



 Comments   
Comment by Asya Kamsky [ 24/Apr/17 ]

The proposed optimization here would make the merge happen in $group rather than $sort and optimization proposed in SERVER-5477 would avoid the merge in this pipeline till after $group (and in fact could be merged by mongos once both tickets were done).

Comment by Asya Kamsky [ 24/Apr/17 ]

Here is explain plan for such an aggregation:

db.ticks.explain().aggregate({$sort:{Symbol:1, Timestamp:1}},{$group:{_id:"$Symbol", first:{$first:"$start"},last:{$last:"$start"}, sum:{$sum:"$Volume"}}})
{
	"needsPrimaryShardMerger" : false,
	"splitPipeline" : {
		"shardsPart" : [
			{
				"$sort" : {
					"sortKey" : {
						"Symbol" : 1,
						"Timestamp" : 1
					}
				}
			},
			{
				"$project" : {
					"_id" : false,
					"Volume" : true,
					"Timestamp" : true,
					"start" : true,
					"Symbol" : true
				}
			}
		],
		"mergerPart" : [
			{
				"$sort" : {
					"sortKey" : {
						"Symbol" : 1,
						"Timestamp" : 1
					},
					"mergePresorted" : true
				}
			},
			{
				"$group" : {
					"_id" : "$Symbol",
					"first" : {
						"$first" : "$start"
					},
					"last" : {
						"$last" : "$start"
					},
					"sum" : {
						"$sum" : "$Volume"
					}
				}
			}
		]
	},
	"shards" : {
		"shard0000" : {
			"host" : "Asyas-MacBook-Pro.local:20000",
			"stages" : [
				{
					"$cursor" : {
						"query" : {
 
						},
						"sort" : {
							"Symbol" : 1,
							"Timestamp" : 1
						},
						"fields" : {
							"Symbol" : 1,
							"Timestamp" : 1,
							"Volume" : 1,
							"start" : 1,
							"_id" : 0
						},
						"queryPlanner" : {
							"plannerVersion" : 1,
							"namespace" : "spark.ticks",
							"indexFilterSet" : false,
							"parsedQuery" : {
 
							},
							"winningPlan" : {
								"stage" : "FETCH",
								"inputStage" : {
									"stage" : "SHARDING_FILTER",
									"inputStage" : {
										"stage" : "IXSCAN",
										"keyPattern" : {
											"Symbol" : 1,
											"Timestamp" : 1
										},
										"indexName" : "Symbol_1_Timestamp_1",
										"isMultiKey" : false,
										"multiKeyPaths" : {
											"Symbol" : [ ],
											"Timestamp" : [ ]
										},
										"isUnique" : false,
										"isSparse" : false,
										"isPartial" : false,
										"indexVersion" : 2,
										"direction" : "forward",
										"indexBounds" : {
											"Symbol" : [
												"[MinKey, MaxKey]"
											],
											"Timestamp" : [
												"[MinKey, MaxKey]"
											]
										}
									}
								}
							},
							"rejectedPlans" : [ ]
						}
					}
				},
				{
					"$project" : {
						"_id" : false,
						"Symbol" : true,
						"start" : true,
						"Timestamp" : true,
						"Volume" : true
					}
				}
			]
		},
		"shard0001" : {
			"host" : "Asyas-MacBook-Pro.local:20001",
			"stages" : [
				{
					"$cursor" : {
						"query" : {
 
						},
						"sort" : {
							"Symbol" : 1,
							"Timestamp" : 1
						},
						"fields" : {
							"Symbol" : 1,
							"Timestamp" : 1,
							"Volume" : 1,
							"start" : 1,
							"_id" : 0
						},
						"queryPlanner" : {
							"plannerVersion" : 1,
							"namespace" : "spark.ticks",
							"indexFilterSet" : false,
							"parsedQuery" : {
 
							},
							"winningPlan" : {
								"stage" : "FETCH",
								"inputStage" : {
									"stage" : "SHARDING_FILTER",
									"inputStage" : {
										"stage" : "IXSCAN",
										"keyPattern" : {
											"Symbol" : 1,
											"Timestamp" : 1
										},
										"indexName" : "Symbol_1_Timestamp_1",
										"isMultiKey" : false,
										"multiKeyPaths" : {
											"Symbol" : [ ],
											"Timestamp" : [ ]
										},
										"isUnique" : false,
										"isSparse" : false,
										"isPartial" : false,
										"indexVersion" : 2,
										"direction" : "forward",
										"indexBounds" : {
											"Symbol" : [
												"[MinKey, MaxKey]"
											],
											"Timestamp" : [
												"[MinKey, MaxKey]"
											]
										}
									}
								}
							},
							"rejectedPlans" : [ ]
						}
					}
				},
				{
					"$project" : {
						"_id" : false,
						"Symbol" : true,
						"start" : true,
						"Timestamp" : true,
						"Volume" : true
					}
				}
			]
		},
		"shard0002" : {
			"host" : "Asyas-MacBook-Pro.local:20002",
			"stages" : [
				{
					"$cursor" : {
						"query" : {
 
						},
						"sort" : {
							"Symbol" : 1,
							"Timestamp" : 1
						},
						"fields" : {
							"Symbol" : 1,
							"Timestamp" : 1,
							"Volume" : 1,
							"start" : 1,
							"_id" : 0
						},
						"queryPlanner" : {
							"plannerVersion" : 1,
							"namespace" : "spark.ticks",
							"indexFilterSet" : false,
							"parsedQuery" : {
 
							},
							"winningPlan" : {
								"stage" : "FETCH",
								"inputStage" : {
									"stage" : "SHARDING_FILTER",
									"inputStage" : {
										"stage" : "IXSCAN",
										"keyPattern" : {
											"Symbol" : 1,
											"Timestamp" : 1
										},
										"indexName" : "Symbol_1_Timestamp_1",
										"isMultiKey" : false,
										"multiKeyPaths" : {
											"Symbol" : [ ],
											"Timestamp" : [ ]
										},
										"isUnique" : false,
										"isSparse" : false,
										"isPartial" : false,
										"indexVersion" : 2,
										"direction" : "forward",
										"indexBounds" : {
											"Symbol" : [
												"[MinKey, MaxKey]"
											],
											"Timestamp" : [
												"[MinKey, MaxKey]"
											]
										}
									}
								}
							},
							"rejectedPlans" : [ ]
						}
					}
				},
				{
					"$project" : {
						"_id" : false,
						"Symbol" : true,
						"start" : true,
						"Timestamp" : true,
						"Volume" : true
					}
				}
			]
		}
	},
	"ok" : 1,
	"logicalTime" : {
		"clusterTime" : Timestamp(1493051213, 1),
		"signature" : {
			"hash" : BinData(0,"A0w/++3Gcd+g/TXcMAth+fF5UxY="),
			"keyId" : NumberLong(0)
		}
	}
}

Generated at Thu Feb 08 04:19:29 UTC 2024 using Jira 9.7.1#970001-sha1:2222b88b221c4928ef0de3161136cc90c8356a66.