- Type: New Feature
- Resolution: Won't Do
- Priority: Minor - P4
- Affects Version/s: None
- Component/s: Shell API
The "Sharding Pitfalls Part III: Chunk Balancing and Collection Limits" blog post alludes to the sharding limits on Sharding Existing Collection Data Size, however this calculation could be rolled into the shell helper to increase visibility of the theoretical limit.
For example, the sh.shardCollection utility method could be expanded as follows:
sh.shardCollection = function(fullName, key, unique, options) {
    sh._checkFullName(fullName);
    assert(key, "need a key");
    assert(typeof (key) == "object", "key needs to be an object");

    var calculateInitialShardingStats = function(fullName, key) {
        const CHUNK_SIZE_DEFAULT = 64;   // MB
        const BSON_MAX_SIZE = 16777216;  // 16 MB, in bytes

        var chunkSizeDoc = db.getSiblingDB("config").settings.findOne({ _id: "chunksize" });
        var chunkSize = (chunkSizeDoc !== null) ? chunkSizeDoc.value : CHUNK_SIZE_DEFAULT;

        // collection size prior to sharding
        var namespace = fullName.split('.');
        var collection = db.getSiblingDB(namespace[0]).getCollection(namespace.slice(1).join('.'));
        var stats = collection.stats();
        var collectionSize = stats.shards[stats.primary].size;

        // retrieve a sample document and project only the values defined by the shard key
        var projection = Object.assign({}, key);
        if (!key.hasOwnProperty("_id"))
            projection["_id"] = 0;
        var sample = collection.aggregate([
            { $sample: { size: 1 } },
            { $project: projection }
        ]).toArray();
        var avgKeyValueSize = Object.bsonsize(sample);

        var maxSplits = BSON_MAX_SIZE / avgKeyValueSize;
        var maxCollectionSizeMB = maxSplits * (chunkSize / 2);

        return {
            shardKey: key,
            sampleDocumentValues: sample,
            averageShardKeyValueSize: avgKeyValueSize,
            chunkSizeMB: chunkSize,
            maxSplitNum: parseInt(maxSplits),
            collectionSize: collectionSize,
            maxCollectionSize: maxCollectionSizeMB * 1048576  // convert from MB to bytes
        };
    };

    var bypassCollectionSizeValidation = false;
    var cmd = { shardCollection: fullName, key: key };
    if (unique) {
        cmd.unique = true;
    }
    if (options) {
        if (typeof (options) !== "object") {
            throw new Error("options must be an object");
        }
        if (options.hasOwnProperty("bypassCollectionSizeValidation") &&
            typeof (options.bypassCollectionSizeValidation) == "boolean") {
            bypassCollectionSizeValidation = options.bypassCollectionSizeValidation;
            delete options.bypassCollectionSizeValidation;
        }
        Object.extend(cmd, options);
    }

    var collectionShardingStats = calculateInitialShardingStats(fullName, key);
    printjson(collectionShardingStats);
    print("The theoretical maximum collection size to shard is: " + collectionShardingStats.maxCollectionSize);
    print("The current collection size is: " + collectionShardingStats.collectionSize);
    if (!bypassCollectionSizeValidation &&
        (collectionShardingStats.collectionSize > collectionShardingStats.maxCollectionSize)) {
        print("Sharding collection aborted due to theoretical maximum collection size being exceeded.");
        print("Set 'options.bypassCollectionSizeValidation' = true to shard anyway");
        return;  // abort instead of falling through to the admin command
    }
    return sh._adminCommand(cmd);
};
Note that the above adds a lot of extra output purely for verification/testing, which could be removed.
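If the size check trips, the helper could be re-invoked with the option it consumes. A brief usage sketch (option and collection names are those proposed above, not an existing shell API):

// First attempt: prints the stats and aborts if the collection already
// exceeds the theoretical maximum for the chosen shard key.
sh.shardCollection("test.foo", { data: 1, d: 1 });

// Acknowledge the warning and shard anyway:
sh.shardCollection("test.foo", { data: 1, d: 1 }, false,
                   { bypassCollectionSizeValidation: true });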
Sample output when run with a simple collection:
function setup() {
    db = db.getSiblingDB("test");
    sh.enableSharding("test");
    db.foo.drop();

    function generate_random_data(size) {
        var chars = 'abcdefghijklmnopqrstuvwxyz'.split('');
        var len = chars.length;
        var random_data = [];
        while (size--) {
            random_data.push(chars[Math.random() * len | 0]);
        }
        return random_data.join('');
    }

    for (var i = 0; i < 100; i++) {
        db.foo.insert({ d: new Date(), data: generate_random_data(1024) });
    }
}
setup();
db.foo.createIndex({ data: 1, d: 1 })
sh.shardCollection("test.foo", { data: 1, d: 1 })

/*
{
    "shardKey" : { "data" : 1, "d" : 1 },
    "sampleDocumentValues" : [
        {
            "d" : ISODate("2020-07-02T11:04:00.145Z"),
            "data" : "ycxzwsiybchmdfvllyxwbzasqcoonbyupgudqlwunpspvdmlqmlhguzpgwkhpypukfydvmrlpefasyepnfdioikcbdisynxifdzbargahylffevbsshibbwoaskqzvmfgxyvsvwhbdbrbosshrvtoqbjqsahphcekjhxiseaqdqjzvgiqiszdsbvamjliavgqybljwefwdntcmddoiuhvjbfhsjpnttatgxjsumlkjifjwssudzcydngpsesqudkmxxdruhhbwfsojtrdvgkrwajsfktgyezgnxzjboejioxjhernhaefdxgubiotvikerqilqlselzrttfjmcuiwlkfqhradxmtazzaghxlmaiyiwzyxdvlcetpzrbzhhapfhfctxsshuxcnqmyyuisrhwvfxxanufuxwcskoutxeaiertjpntqcsnwvaeqioiybtoxfmdswecjlmnmhaayvhadjrzanmvjxopnwakkdafcvtllmjxpisijafcdwbzokxxdnyvpovuldoqceiyaxjudewbjidispedpmlfpglokahjycjpyhrqpbzicuecuiivdylcrzwtbkqtthkifjevywtctsrrzuvmdoazffxniertktbwsyzqxkknwxciaircmogydbgvqoisygeifjcvoezpuehjmxjmoivjpoecatyggdfstgiwlmvnxsnovrzvdoiwaftneeklvyjxbtmqfdtfdpqclapzuvryrnynaobpokvxcyiiqsxnjsiqjlliyekrrcvwnhwobcoyighfbmaggzbxivcoqmtiystheevbhcjuapayuybzeofilsbvzrpbklezsvomrarwdexrxyzzmjqnttqthneubffoqktqcxkuvjtyjkpdstatdlwbrjdpyavouteomobfqpmvjzspkoqvznfseqshkhdnpxowgxczzfvjjreaekuyihyulyojoyrvxshqkpuzhztxrdtckngwuvpnuvibpnamslyrpojpjwkgvssbwoxvf"
        }
    ],
    "averageShardKeyValueSize" : 1059,
    "chunkSizeMB" : 64,
    "maxSplitNum" : 15842,
    "collectionSize" : 106800,
    "maxCollectionSize" : 531586358282.6365
}
The theoretical maximum collection size to shard is: 531586358282.6365
The current collection size is: 106800
{
    "collectionsharded" : "test.foo",
    "collectionUUID" : UUID("85f43966-69a5-4f37-ae5f-68c0292b512b"),
    "ok" : 1,
    "operationTime" : Timestamp(1593687856, 9),
    "$clusterTime" : {
        "clusterTime" : Timestamp(1593687856, 9),
        "signature" : {
            "hash" : BinData(0,"AAAAAAAAAAAAAAAAAAAAAAAAAAA="),
            "keyId" : NumberLong(0)
        }
    }
}
*/