Details
-
Improvement
-
Status: Backlog
-
Major - P3
-
Resolution: Unresolved
-
None
-
None
-
Query Optimization
Description
Hi,
I have a time-series collection where I'm keeping the prices of cryptocurrencies for 30 days with the second granularity. Around 2000 cryptocurrency ticker data have been inserted into the time-series collection every one or two seconds. For some reason, I require to retrieve the distinct symbols. I have more than 2 Billion records in the time-series collection.
When I run the following query, it didn't complete in a few minutes and I had to stop it.
db.cryptoTickerBinance.distinct("symbol") |
Available indexes on the collection are listed below:
[
|
{
|
"v": 2, |
"key": { |
"symbol": 1, |
"time": 1 |
},
|
"name": "symbol_1_time_1" |
},
|
{
|
"v": 2, |
"key": { |
"time": 1 |
},
|
"name": "time_1" |
},
|
{
|
"v": 2, |
"key": { |
"symbol": 1, |
"time": -1 |
},
|
"name": "sym2" |
},
|
{
|
"v": 2, |
"key": { |
"time": -1 |
},
|
"name": "time_2" |
}
|
]
|
However, when I use $group staging it works fast as expected since it's using DISTINCT_SCAN step.
db.cryptoTickerBinance.aggregate([
|
{
|
'$group': { |
'_id': '$symbol' |
}
|
}
|
]).explain('executionStats') |
{ explainVersion: '1', |
stages:
|
[ { '$cursor': |
{ queryPlanner:
|
{ namespace: 'exchange.system.buckets.cryptoTickerBinance', |
indexFilterSet: false, |
parsedQuery: {},
|
queryHash: 'A855245D', |
planCacheKey: 'A855245D', |
maxIndexedOrSolutionsReached: false, |
maxIndexedAndSolutionsReached: false, |
maxScansToExplodeReached: false, |
winningPlan:
|
{ stage: 'PROJECTION_COVERED', |
transformBy: { meta: 1, _id: 0 }, |
inputStage:
|
{ stage: 'DISTINCT_SCAN', |
keyPattern: { meta: 1, 'control.min.time': 1, 'control.max.time': 1 }, |
indexName: 'symbol_1_time_1', |
isMultiKey: false, |
multiKeyPaths: { meta: [], 'control.min.time': [], 'control.max.time': [] }, |
isUnique: false, |
isSparse: false, |
isPartial: false, |
indexVersion: 2, |
direction: 'forward', |
indexBounds:
|
{ meta: [ '[MinKey, MaxKey]' ], |
'control.min.time': [ '[MinKey, MaxKey]' ], |
'control.max.time': [ '[MinKey, MaxKey]' ] } } }, |
rejectedPlans: [] },
|
executionStats:
|
{ executionSuccess: true, |
nReturned: 2080, |
executionTimeMillis: 52, |
totalKeysExamined: 2080, |
totalDocsExamined: 0, |
executionStages:
|
{ stage: 'PROJECTION_COVERED', |
nReturned: 2080, |
executionTimeMillisEstimate: 49, |
works: 2081, |
advanced: 2080, |
needTime: 0, |
needYield: 0, |
saveState: 4, |
restoreState: 4, |
isEOF: 1, |
transformBy: { meta: 1, _id: 0 }, |
inputStage:
|
{ stage: 'DISTINCT_SCAN', |
nReturned: 2080, |
executionTimeMillisEstimate: 49, |
works: 2081, |
advanced: 2080, |
needTime: 0, |
needYield: 0, |
saveState: 4, |
restoreState: 4, |
isEOF: 1, |
keyPattern: { meta: 1, 'control.min.time': 1, 'control.max.time': 1 }, |
indexName: 'symbol_1_time_1', |
isMultiKey: false, |
multiKeyPaths: { meta: [], 'control.min.time': [], 'control.max.time': [] }, |
isUnique: false, |
isSparse: false, |
isPartial: false, |
indexVersion: 2, |
direction: 'forward', |
indexBounds:
|
{ meta: [ '[MinKey, MaxKey]' ], |
'control.min.time': [ '[MinKey, MaxKey]' ], |
'control.max.time': [ '[MinKey, MaxKey]' ] }, |
keysExamined: 2080 } } } }, |
nReturned: 2080, |
executionTimeMillisEstimate: 49 }, |
{ '$groupByDistinctScan': { newRoot: { _id: '$meta' } }, |
nReturned: 2080, |
executionTimeMillisEstimate: 49 } ], |
serverInfo:
|
{ host: 'atlas-3uvq7m-shard-00-02.iubjy.mongodb.net', |
port: 27017, |
version: '6.0.0', |
gitVersion: 'e61bf27c2f6a83fed36e5a13c008a32d563babe2' }, |
serverParameters:
|
{ internalQueryFacetBufferSizeBytes: 104857600, |
internalQueryFacetMaxOutputDocSizeBytes: 104857600, |
internalLookupStageIntermediateDocumentMaxSizeBytes: 104857600, |
internalDocumentSourceGroupMaxMemoryBytes: 104857600, |
internalQueryMaxBlockingSortMemoryUsageBytes: 104857600, |
internalQueryProhibitBlockingMergeOnMongoS: 0, |
internalQueryMaxAddToSetBytes: 104857600, |
internalDocumentSourceSetWindowFieldsMaxMemoryBytes: 104857600 }, |
command:
|
{ aggregate: 'system.buckets.cryptoTickerBinance', |
pipeline:
|
[ { '$_internalUnpackBucket': |
{ timeField: 'time', |
metaField: 'symbol', |
bucketMaxSpanSeconds: 3600, |
assumeNoMixedSchemaData: true } }, |
{ '$group': { _id: '$symbol' } } ], |
cursor: {},
|
collation: { locale: 'simple' } }, |
ok: 1, |
'$clusterTime': |
{ clusterTime: Timestamp({ t: 1659631070, i: 2089 }), |
signature:
|
{ hash: Binary(Buffer.from("d16b946bcb75e6999bd12f47bf93c85c1bab16ea", "hex"), 0), |
keyId: 7079845501138373000 } }, |
operationTime: Timestamp({ t: 1659631070, i: 2089 }) } |
How the distinct() query can be optimized on time-series collections? It may be linked to another ticket (https://jira.mongodb.org/browse/SERVER-14227).
Attachments
Issue Links
- depends on
-
SERVER-37715 Use DISTINCT_SCAN for $unwind-$group pipelines
-
- Backlog
-