If many values are emitted for the same key and the object keeps growing, MR does not flush it to disk until it reaches a threshold size.
But a large object becomes very slow for the JS engine to handle, and it may use much more memory than we think it is using.
It triggers many reduce steps and potentially garbage collection (GC).
An example is:
// Map step: emit one (full_name, _id) pair for the current document (`this`).
map = function () {
    emit(this.full_name, this._id);
};

// Reduce step: fold every value for a key into a single object used as a set.
// A string value becomes a key of the result; any other value is treated as
// an earlier reduce output and has its enumerable keys copied in.
// The key argument `k` is not used.
reduce = function (k, vals) {
    var seen = {};

    vals.forEach(function (entry) {
        if (typeof entry !== 'string') {
            // Not a raw emitted string: merge its keys into the result.
            for (var name in entry) {
                seen[name] = true;
            }
            return;
        }
        seen[entry] = true;
    });

    return seen;
};
Run against a collection with 1M docs like:
{ "_id" : {__rand: "str", len: 20}, "soc_id" : {__rand: "str", len: 10}, "exp" : {__rand: "int", min: 0, max: 100000000}, "full_name" : "Natalya", "last_entrance" : 1321935873, "score" : 5000 }