|
Hi,
I have a time-series collection where I'm keeping the prices of cryptocurrencies for 30 days with the second granularity. Around 2000 cryptocurrency ticker data have been inserted into the time-series collection every one or two seconds. For some reason, I require to retrieve the distinct symbols. I have more than 2 Billion records in the time-series collection.
When I run the following query, it didn't complete in a few minutes and I had to stop it.
db.cryptoTickerBinance.distinct("symbol")
|
Available indexes on the collection are listed below:
[
|
{
|
"v": 2,
|
"key": {
|
"symbol": 1,
|
"time": 1
|
},
|
"name": "symbol_1_time_1"
|
},
|
{
|
"v": 2,
|
"key": {
|
"time": 1
|
},
|
"name": "time_1"
|
},
|
{
|
"v": 2,
|
"key": {
|
"symbol": 1,
|
"time": -1
|
},
|
"name": "sym2"
|
},
|
{
|
"v": 2,
|
"key": {
|
"time": -1
|
},
|
"name": "time_2"
|
}
|
]
|
However, when I use $group staging it works fast as expected since it's using DISTINCT_SCAN step.
db.cryptoTickerBinance.aggregate([
|
{
|
'$group': {
|
'_id': '$symbol'
|
}
|
}
|
]).explain('executionStats')
|
{ explainVersion: '1',
|
stages:
|
[ { '$cursor':
|
{ queryPlanner:
|
{ namespace: 'exchange.system.buckets.cryptoTickerBinance',
|
indexFilterSet: false,
|
parsedQuery: {},
|
queryHash: 'A855245D',
|
planCacheKey: 'A855245D',
|
maxIndexedOrSolutionsReached: false,
|
maxIndexedAndSolutionsReached: false,
|
maxScansToExplodeReached: false,
|
winningPlan:
|
{ stage: 'PROJECTION_COVERED',
|
transformBy: { meta: 1, _id: 0 },
|
inputStage:
|
{ stage: 'DISTINCT_SCAN',
|
keyPattern: { meta: 1, 'control.min.time': 1, 'control.max.time': 1 },
|
indexName: 'symbol_1_time_1',
|
isMultiKey: false,
|
multiKeyPaths: { meta: [], 'control.min.time': [], 'control.max.time': [] },
|
isUnique: false,
|
isSparse: false,
|
isPartial: false,
|
indexVersion: 2,
|
direction: 'forward',
|
indexBounds:
|
{ meta: [ '[MinKey, MaxKey]' ],
|
'control.min.time': [ '[MinKey, MaxKey]' ],
|
'control.max.time': [ '[MinKey, MaxKey]' ] } } },
|
rejectedPlans: [] },
|
executionStats:
|
{ executionSuccess: true,
|
nReturned: 2080,
|
executionTimeMillis: 52,
|
totalKeysExamined: 2080,
|
totalDocsExamined: 0,
|
executionStages:
|
{ stage: 'PROJECTION_COVERED',
|
nReturned: 2080,
|
executionTimeMillisEstimate: 49,
|
works: 2081,
|
advanced: 2080,
|
needTime: 0,
|
needYield: 0,
|
saveState: 4,
|
restoreState: 4,
|
isEOF: 1,
|
transformBy: { meta: 1, _id: 0 },
|
inputStage:
|
{ stage: 'DISTINCT_SCAN',
|
nReturned: 2080,
|
executionTimeMillisEstimate: 49,
|
works: 2081,
|
advanced: 2080,
|
needTime: 0,
|
needYield: 0,
|
saveState: 4,
|
restoreState: 4,
|
isEOF: 1,
|
keyPattern: { meta: 1, 'control.min.time': 1, 'control.max.time': 1 },
|
indexName: 'symbol_1_time_1',
|
isMultiKey: false,
|
multiKeyPaths: { meta: [], 'control.min.time': [], 'control.max.time': [] },
|
isUnique: false,
|
isSparse: false,
|
isPartial: false,
|
indexVersion: 2,
|
direction: 'forward',
|
indexBounds:
|
{ meta: [ '[MinKey, MaxKey]' ],
|
'control.min.time': [ '[MinKey, MaxKey]' ],
|
'control.max.time': [ '[MinKey, MaxKey]' ] },
|
keysExamined: 2080 } } } },
|
nReturned: 2080,
|
executionTimeMillisEstimate: 49 },
|
{ '$groupByDistinctScan': { newRoot: { _id: '$meta' } },
|
nReturned: 2080,
|
executionTimeMillisEstimate: 49 } ],
|
serverInfo:
|
{ host: 'atlas-3uvq7m-shard-00-02.iubjy.mongodb.net',
|
port: 27017,
|
version: '6.0.0',
|
gitVersion: 'e61bf27c2f6a83fed36e5a13c008a32d563babe2' },
|
serverParameters:
|
{ internalQueryFacetBufferSizeBytes: 104857600,
|
internalQueryFacetMaxOutputDocSizeBytes: 104857600,
|
internalLookupStageIntermediateDocumentMaxSizeBytes: 104857600,
|
internalDocumentSourceGroupMaxMemoryBytes: 104857600,
|
internalQueryMaxBlockingSortMemoryUsageBytes: 104857600,
|
internalQueryProhibitBlockingMergeOnMongoS: 0,
|
internalQueryMaxAddToSetBytes: 104857600,
|
internalDocumentSourceSetWindowFieldsMaxMemoryBytes: 104857600 },
|
command:
|
{ aggregate: 'system.buckets.cryptoTickerBinance',
|
pipeline:
|
[ { '$_internalUnpackBucket':
|
{ timeField: 'time',
|
metaField: 'symbol',
|
bucketMaxSpanSeconds: 3600,
|
assumeNoMixedSchemaData: true } },
|
{ '$group': { _id: '$symbol' } } ],
|
cursor: {},
|
collation: { locale: 'simple' } },
|
ok: 1,
|
'$clusterTime':
|
{ clusterTime: Timestamp({ t: 1659631070, i: 2089 }),
|
signature:
|
{ hash: Binary(Buffer.from("d16b946bcb75e6999bd12f47bf93c85c1bab16ea", "hex"), 0),
|
keyId: 7079845501138373000 } },
|
operationTime: Timestamp({ t: 1659631070, i: 2089 }) }
|
How the distinct() query can be optimized on time-series collections? It may be linked to another ticket (https://jira.mongodb.org/browse/SERVER-14227).
|