MONGOSH-279: Incorporate theoretical maxCollectionSize calculation into shardCollection shell helper

    • Type: New Feature
    • Resolution: Won't Do
    • Priority: Minor - P4
    • Fix Version/s: None
    • Affects Version/s: None
    • Component/s: Shell API
    • Labels: None

      The "Sharding Pitfalls Part III: Chunk Balancing and Collection Limits" blog post alludes to the sharding limits on Sharding Existing Collection Data Size, however this calculation could be rolled into the shell helper to increase visibility of the theoretical limit.

      For example, the sh.shardCollection utility method could be expanded as follows:

      sh.shardCollection = function(fullName, key, unique, options) {
        sh._checkFullName(fullName);
        assert(key, "need a key");
        assert(typeof (key) == "object", "key needs to be an object");
      
        var calculateInitialShardingStats = function (fullName, key) {
          const CHUNK_SIZE_DEFAULT = 64;       // default chunk size, in MB
          const BSON_MAX_SIZE      = 16777216; // 16 MB BSON document size limit, in bytes
      
          var chunkSizeDoc = db.getSiblingDB("config").settings.findOne({ _id: "chunksize" });
          var chunkSize = (chunkSizeDoc !== null) ? chunkSizeDoc.value : CHUNK_SIZE_DEFAULT;
      
          // collection size prior to sharding
          var namespace = fullName.split('.');
          var collection = db.getSiblingDB(namespace[0]).getCollection(namespace.slice(1).join('.'));
          var stats = collection.stats();
          var collectionSize = stats.shards[stats.primary].size;
      
          // retrieve a sample document and project only the values defined by the shard key
          var projection = Object.assign({}, key);
          // exclude _id from the sample unless it is part of the shard key
          if (!key.hasOwnProperty("_id"))
              projection["_id"] = 0;
          var sample = collection.aggregate([ { $sample: { size: 1 } }, { $project: projection } ]).toArray();
      
          // size of the sampled shard key values; maxCollectionSize (MB) = maxSplits * (chunkSize / 2)
          var avgKeyValueSize = Object.bsonsize(sample[0]);
          var maxSplits = BSON_MAX_SIZE / avgKeyValueSize;
          var maxCollectionSizeMB = maxSplits * (chunkSize / 2);
       
          return {
            shardKey: key,
            sampleDocumentValues: sample,
            averageShardKeyValueSize: avgKeyValueSize,
            chunkSizeMB: chunkSize,
            maxSplitNum: Math.floor(maxSplits),
            collectionSize: collectionSize,
            maxCollectionSize: maxCollectionSizeMB * 1048576 // convert from MB to bytes
          }
        }
        
        var bypassCollectionSizeValidation = false;
        var cmd = { shardCollection: fullName, key: key };
      
        if (unique) {
          cmd.unique = true;
        }
        if (options) {
          if (typeof (options) !== "object") {
            throw new Error("options must be an object");
          }
          if (options.hasOwnProperty("bypassCollectionSizeValidation") && typeof (options.bypassCollectionSizeValidation) == "boolean") {
            bypassCollectionSizeValidation = options.bypassCollectionSizeValidation;
            delete options.bypassCollectionSizeValidation;
          }
          Object.extend(cmd, options);
        }
      
        var collectionShardingStats = calculateInitialShardingStats(fullName, key);
        printjson(collectionShardingStats);
        print("The theoretical maximum collection size to shard is: " + collectionShardingStats.maxCollectionSize);
        print("The current collection size is: " + collectionShardingStats.collectionSize);
        if (!bypassCollectionSizeValidation && (collectionShardingStats.collectionSize > collectionShardingStats.maxCollectionSize)) {
          print("Sharding collection aborted due to theoretical maximum collection size being exceeded.");
          print("Set 'options.bypassCollectionSizeValidation' = true to shard anyway");
          return;
        }
      
        return sh._adminCommand(cmd);
      }
      

      Note the above adds a lot of extra output just for verification / testing which could be removed.
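
      For completeness, an oversized collection could still be sharded by opting out of the new check; a hypothetical invocation using the bypassCollectionSizeValidation option proposed above:

      sh.shardCollection("test.foo", { data: 1, d: 1 }, false, { bypassCollectionSizeValidation: true })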

      Sample output when run with a simple collection:

      function setup() {
          db = db.getSiblingDB("test"); // "use test" is a shell directive and is not valid inside a function
          sh.enableSharding("test");
          db.foo.drop();
          function generate_random_data(size){
              var chars = 'abcdefghijklmnopqrstuvwxyz'.split('');
              var len = chars.length;
              var random_data = [];
          
              while (size--) {
                  random_data.push(chars[Math.random()*len | 0]);
              }
          
              return random_data.join('');
          }
          for (var i = 0; i < 100; i++) { db.foo.insert({ d: new Date(), data: generate_random_data(1024) }); }
      }
      
      setup();
      db.foo.createIndex({ data: 1, d: 1 })
      sh.shardCollection("test.foo", { data: 1, d: 1 })
      
      /*
      {
      	"shardKey" : {
      		"data" : 1,
      		"d" : 1
      	},
      	"sampleDocumentValues" : [
      		{
      			"d" : ISODate("2020-07-02T11:04:00.145Z"),
      			"data" : "ycxzwsiybchmdfvllyxwbzasqcoonbyupgudqlwunpspvdmlqmlhguzpgwkhpypukfydvmrlpefasyepnfdioikcbdisynxifdzbargahylffevbsshibbwoaskqzvmfgxyvsvwhbdbrbosshrvtoqbjqsahphcekjhxiseaqdqjzvgiqiszdsbvamjliavgqybljwefwdntcmddoiuhvjbfhsjpnttatgxjsumlkjifjwssudzcydngpsesqudkmxxdruhhbwfsojtrdvgkrwajsfktgyezgnxzjboejioxjhernhaefdxgubiotvikerqilqlselzrttfjmcuiwlkfqhradxmtazzaghxlmaiyiwzyxdvlcetpzrbzhhapfhfctxsshuxcnqmyyuisrhwvfxxanufuxwcskoutxeaiertjpntqcsnwvaeqioiybtoxfmdswecjlmnmhaayvhadjrzanmvjxopnwakkdafcvtllmjxpisijafcdwbzokxxdnyvpovuldoqceiyaxjudewbjidispedpmlfpglokahjycjpyhrqpbzicuecuiivdylcrzwtbkqtthkifjevywtctsrrzuvmdoazffxniertktbwsyzqxkknwxciaircmogydbgvqoisygeifjcvoezpuehjmxjmoivjpoecatyggdfstgiwlmvnxsnovrzvdoiwaftneeklvyjxbtmqfdtfdpqclapzuvryrnynaobpokvxcyiiqsxnjsiqjlliyekrrcvwnhwobcoyighfbmaggzbxivcoqmtiystheevbhcjuapayuybzeofilsbvzrpbklezsvomrarwdexrxyzzmjqnttqthneubffoqktqcxkuvjtyjkpdstatdlwbrjdpyavouteomobfqpmvjzspkoqvznfseqshkhdnpxowgxczzfvjjreaekuyihyulyojoyrvxshqkpuzhztxrdtckngwuvpnuvibpnamslyrpojpjwkgvssbwoxvf"
      		}
      	],
      	"averageShardKeyValueSize" : 1059,
      	"chunkSizeMB" : 64,
      	"maxSplitNum" : 15842,
      	"collectionSize" : 106800,
      	"maxCollectionSize" : 531586358282.6365
      }
      The theoretical maximum collection size to shard is: 531586358282.6365
      The current collection size is: 106800
      {
      	"collectionsharded" : "test.foo",
      	"collectionUUID" : UUID("85f43966-69a5-4f37-ae5f-68c0292b512b"),
      	"ok" : 1,
      	"operationTime" : Timestamp(1593687856, 9),
      	"$clusterTime" : {
      		"clusterTime" : Timestamp(1593687856, 9),
      		"signature" : {
      			"hash" : BinData(0,"AAAAAAAAAAAAAAAAAAAAAAAAAAA="),
      			"keyId" : NumberLong(0)
      		}
      	}
      }
      */
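
      As a sanity check, the reported figures line up with the formula:

      var maxSplits = 16777216 / 1059;      // ~15842.5, truncated to maxSplitNum 15842
      var maxMB = maxSplits * (64 / 2);     // ~506960 MB
      maxMB * 1048576                       // ~531586358282 bytes, matching maxCollectionSize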
      

            Assignee: Unassigned
            Reporter: Alex Bevilacqua (alex.bevilacqua@mongodb.com)
            Votes: 0
            Watchers: 7