using bucket to group small docs


    • Type: Question
    • Resolution: Incomplete
    • Priority: Major - P3
    • Affects Version/s: 2.2.3
    • Component/s: Performance
    • Environment:
      osx 10.8.3

      We have a collection that stores user activity: who did what, and when. Access to this collection is per user, fetching activities from latest to oldest.

      The doc looks like:
      {
        _id: ObjectId,
        userId: Number,
        type: String,
        objectid: String, // foreign key to another collection holding the actual action data
        timestamp: Date
      }
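
      (For context, a typical read under this schema would be something like the sketch below; the collection name, user id, page size, and the {userId: 1, timestamp: -1} index are placeholders for illustration, not part of our actual app.)

      // Latest-to-oldest activities for one user.
      // Assumes an index such as {userId: 1, timestamp: -1}.
      db.activities.find({userId: 42})
                   .sort({timestamp: -1})
                   .limit(20);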

      We are considering grouping these tiny docs into an array inside one document, which turns it into:

      {
        _id: ObjectId,
        userId: Number,
        pageNumber: Number,
        activities: [Activity] // like above, with only type, refObjectId, timestamp
      }

      We thought this would give better storage: since the number of documents is reduced, the indexes should be smaller. When reading, the number of documents to seek is also reduced, so reads should be faster.
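
      (Likewise, a read under the bucketed schema might look like the sketch below; treating a higher pageNumber as newer, and the bucket collection name, are assumptions for illustration only.)

      // Fetch the newest bucket for a user and flatten it, newest activity first.
      // Assumes an index such as {userId: 1, pageNumber: -1}.
      var bucket = db.activity_buckets.find({userId: 42})
                                      .sort({pageNumber: -1})
                                      .limit(1)
                                      .next();
      var latest = bucket.activities.slice().reverse();
      printjson(latest);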

      However, when I ran a test script through the shell to exercise this schema for reading and writing, I saw that with a larger bucket size the storage is actually higher and insertion takes longer. The raw docs without bucketing give the highest insertion rate, with a relative increase in storage size.

      So my question is: is it a good design to group documents together into buckets, especially large buckets of 100 – 500? And does the number of documents matter? Say we have 10 million users all using this product, each with 100 activities; the document count would then go up to 1 billion.

      My test script and results are below.

      big = 500 activities per array
      small = 5 activities per array
      no bucket = each activity as its own document
      10000 "atomic docs" inserted in total.

      > load('bucket.js')
      big bucket size runtime: 20.368
      storageSize with big bucket size : 11182080
      paddingFactor with big bucket size: 1.0060000000002791
      small bucket size runtime: 0.375
      storageSize with small bucket size : 2793472
      paddingFactor with small bucket size: 1.9980000000003342
      no bucket size runtime: 0.303
      storageSize with no bucket size : 3055616
      paddingFactor with no bucket size: 1
      1 big doc size 38.925
      100 small doc size 41.5
      500 raw doc size 50.5

      Script:

      var d = db.getSisterDB("bucket_test");

      var total = 10000;

      // Insert `total` activities, bucketed into arrays of `bucketSize` per document.
      var f = function(usecase, col, bucketSize) {
          col.drop();
          col.ensureIndex({'activities.type': 1});
          var count = total / bucketSize;
          var start = Date.now();
          for (var i = 0; i < count; i++) {
              // Document created with only the _id field
              col.insert({_id: i});
              // Each activity appended with a separate $push update (upsert flag set).
              for (var j = 0; j < bucketSize; j++) {
                  col.update({_id: i},
                             {$push: {activities: {type: 'post', objectid: '515bc88a8ce3b4718e6a1099', time: new Date}}},
                             true);
              }
          }
          var t = (Date.now() - start) / 1000;
          print(usecase + " size runtime: " + t);
          print("storageSize with " + usecase + " size : " + col.stats().storageSize);
          print("paddingFactor with " + usecase + " size: " + col.stats().paddingFactor);
          return t;
      };

      // Insert `count` activities as individual documents (no bucketing).
      var insrt = function(usecase, col, count) {
          col.drop();
          col.ensureIndex({'activity.type': 1});
          var start = Date.now();
          for (var i = 0; i < count; i++) {
              col.insert({_id: i, activity: {type: 'post', objectid: '515bc88a8ce3b4718e6a1099', time: new Date}});
          }
          var t = (Date.now() - start) / 1000;
          print(usecase + " size runtime: " + t);
          print("storageSize with " + usecase + " size : " + col.stats().storageSize);
          print("paddingFactor with " + usecase + " size: " + col.stats().paddingFactor);
          return t;
      };

      f('big bucket', d.big, 500);
      f('small bucket', d.small, 5);
      insrt('no bucket', d.no_bucket, total);

      var bigDoc = d.big.findOne();
      print('1 big doc size ' + (Object.bsonsize(bigDoc) / 1000));

      var smallDoc = d.small.findOne();
      print('100 small doc size ' + (Object.bsonsize(smallDoc) * 100 / 1000));

      var rawDoc = d.no_bucket.findOne();
      print('500 raw doc size ' + (Object.bsonsize(rawDoc) * 500 / 1000));
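
      (For comparison only, and not something I have run: a hypothetical variant of f that builds each bucket in memory and writes it with a single insert, instead of one insert plus bucketSize $push updates per bucket, would look roughly like the sketch below.)

      // Hypothetical variant: build the whole bucket client-side, one insert per bucket.
      var fPrebuilt = function(usecase, col, bucketSize) {
          col.drop();
          col.ensureIndex({'activities.type': 1});
          var count = total / bucketSize;
          var start = Date.now();
          for (var i = 0; i < count; i++) {
              var acts = [];
              for (var j = 0; j < bucketSize; j++) {
                  acts.push({type: 'post', objectid: '515bc88a8ce3b4718e6a1099', time: new Date});
              }
              col.insert({_id: i, activities: acts});
          }
          var t = (Date.now() - start) / 1000;
          print(usecase + " size runtime: " + t);
          return t;
      };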

            Assignee:
            Stennie Steneker (Inactive)
            Reporter:
            hui luan
            Votes:
            0
            Watchers:
            3
