[SERVER-15004] MR program silently drops records bug or some configuration missing Created: 22/Aug/14  Updated: 11/Jan/15  Resolved: 10/Jan/15

Status: Closed
Project: Core Server
Component/s: MapReduce
Affects Version/s: 2.6.1
Fix Version/s: None

Type: Question Priority: Critical - P2
Reporter: Vipul Assignee: Ramon Fernandez Marina
Resolution: Done Votes: 0
Labels: None
Remaining Estimate: Not Specified
Time Spent: Not Specified
Original Estimate: Not Specified

Attachments: Text File code.txt     Text File code.txt     Text File input_filter_12_elements_of_553.txt     Text File order_10000.txt     Text File output_gives_only_11_elements_of_553.txt    
Issue Links:
Duplicate
Participants:

 Description   

I am running a map-reduce (MR) job on MongoDB. It silently drops records when I try to denormalize 10,000 records; the drop happens somewhere in the middle of the collection (around the 5,000th record). I tried removing the nearby records (when sorted), so there is nothing wrong with the data itself.

Here is my code:

mapOrd = function() {
    var values = {
        customerId: this.customerId,
        orderNr: this.orderNr,
        productId: this.productId
    };
    emit(this.productId, values);
};
 
mapPrd = function() {
    var values = {
        code: this.code
    };
    emit(this.id, values);
};
 
reduceOrdPrd = function(k, values) {
    var result = {};
    values.forEach(function(value) {
        var field;
        if ("orderNr" in value) {
            if (!("cust_ids" in result)) {
                result.cust_ids = [];
            }
            result.cust_ids.push(value);
        } else {
            for (field in value) {
                if (value.hasOwnProperty(field)) {
                    result[field] = value[field];
                }
            }
        }
    });
    return result;
};

db.prd_ord.drop();
 
db.order_10000.mapReduce(mapOrd, reduceOrdPrd, {"out": {"reduce": "prd_ord"}, "sort": {"productId": 1}});

I am running this on a machine with a very low-spec configuration: 512 MB of RAM and 1 GB of swap.

But whatever the case, it should not silently (and seemingly randomly) drop elements of groups.

The objects do not exceed the BSON document size limit (there are just 10 to 15 objects in the array per key).
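
A quick way to verify this from the shell (an illustrative check, not part of the attached code) is:

// Size in bytes of one reduced output document; the BSON limit is 16 MB (16777216 bytes).
Object.bsonsize(db.prd_ord.findOne())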

Any suggestions on what could be causing this issue?

My code is working as designed. The only issue is that records for the same emit key get dropped at a particular point, after about 5,500 records. For example, productId = 553 (the emit key) has 12 elements, ending at the 5,502nd record in the input collection order_10000; the 5,501st and 5,502nd records get dropped from the MR output.

Input collection order_10000:

db.order_10000.find({productId:553}).pretty();
{
       "_id" : ObjectId("53f75e2ab4e41522bccf3410"),
       "customerId" : NumberLong(5699),
       "productId" : NumberLong(553),
       "orderNr" : NumberLong(6000058)
}
{
       "_id" : ObjectId("53f75e2ab4e41522bccf3411"),
       "customerId" : NumberLong(5699),
       "productId" : NumberLong(553),
       "orderNr" : NumberLong(6000059)
}
{
       "_id" : ObjectId("53f75e2ab4e41522bccf3412"),
       "customerId" : NumberLong(5699),
       "productId" : NumberLong(553),
       "orderNr" : NumberLong(6000060)
}
{
       "_id" : ObjectId("53f75e2ab4e41522bccf3413"),
       "customerId" : NumberLong(5699),
       "productId" : NumberLong(553),
       "orderNr" : NumberLong(6000061)
}
{
       "_id" : ObjectId("53f75e2ab4e41522bccf3414"),
       "customerId" : NumberLong(5699),
       "productId" : NumberLong(553),
       "orderNr" : NumberLong(6000062)
}
{
       "_id" : ObjectId("53f75e2eb4e41522bccf3415"),
       "customerId" : NumberLong(5699),
       "productId" : NumberLong(553),
       "orderLineNr" : NumberLong(1),
       "orderNr" : NumberLong(6000063)
}
{
       "_id" : ObjectId("53f75e2eb4e41522bccf3416"),
       "customerId" : NumberLong(5699),
       "productId" : NumberLong(553),
       "orderNr" : NumberLong(6000064)
}
{
       "_id" : ObjectId("53f75e2eb4e41522bccf3417"),
       "customerId" : NumberLong(5699),
       "productId" : NumberLong(553),
       "orderNr" : NumberLong(6000065)
}
{
       "_id" : ObjectId("53f75e2eb4e41522bccf3418"),
       "customerId" : NumberLong(5699),
       "productId" : NumberLong(553),
       "orderNr" : NumberLong(6000066)
}
{
       "_id" : ObjectId("53f75e97b4e41522bccf3419"),
       "customerId" : NumberLong(5699),
       "productId" : NumberLong(553),
       "orderNr" : NumberLong(6000067)
}
{
       "_id" : ObjectId("53f75e97b4e41522bccf341a"),
       "customerId" : NumberLong(5699),
       "productId" : NumberLong(553),
       "orderNr" : NumberLong(6000068)
}
{
       "_id" : ObjectId("53f76127b4e41522bccf341e"),
       "customerId" : NumberLong(5699),
       "productId" : NumberLong(553),
       "orderNr" : NumberLong(6000071)
}
 

Output of MR:

db.prd_ord.find({'value.cust_ids.productId': 553}).pretty();
{
       "_id" : NumberLong(553),
       "value" : {
               "cust_ids" : [
                       {
                               "customerId" : NumberLong(5699),
                               "orderNr" : NumberLong(6000068),
                               "productId" : NumberLong(553)
                       },
                       {
                               "customerId" : NumberLong(5699),
                               "orderNr" : NumberLong(6000067),
                               "productId" : NumberLong(553)
                       },
                       {
                               "customerId" : NumberLong(5699),
                               "orderNr" : NumberLong(6000066),
                               "productId" : NumberLong(553)
                       },
                       {
                               "customerId" : NumberLong(5699),
                               "orderNr" : NumberLong(6000065),
                               "productId" : NumberLong(553)
                       },
                       {
                               "customerId" : NumberLong(5699),
                               "orderNr" : NumberLong(6000064),
                               "productId" : NumberLong(553)
                       },
                       {
                               "customerId" : NumberLong(5699),
                               "orderNr" : NumberLong(6000063),
                               "productId" : NumberLong(553)
                       },
                       {
                               "customerId" : NumberLong(5699),
                               "orderNr" : NumberLong(6000062),
                               "productId" : NumberLong(553)
                       },
                       {
                               "customerId" : NumberLong(5699),
                               "orderNr" : NumberLong(6000061),
                               "productId" : NumberLong(553)
                       },
                       {
                               "customerId" : NumberLong(5699),
                               "orderNr" : NumberLong(6000060),
                               "productId" : NumberLong(553)
                       },
                       {
                               "customerId" : NumberLong(5699),
                               "orderNr" : NumberLong(6000059),
                               "productId" : NumberLong(553)
                       }
               ],
               "cust_ids_length" : 10
       }    
}
 

The next group starts fine from here. The same thing happens if a particular group crosses the next threshold.
Am I missing some configuration, or is this a known bug?



 Comments   
Comment by Ramon Fernandez Marina [ 11/Jan/15 ]

vipulmehta13, as per my message above, from a practical standpoint not all elements/values for a key are processed together in a single reduce call.

My understanding of the internals is that the first call to reduce may contain all mapped values for a given key, but the reduce function may need to yield. If that's the case, it will be called a second time, and this time the result of the first reduce call will be passed in as an additional value. This yielding can happen multiple times, so the reduce function may be called several times within a single mapReduce() operation, and each additional call receives the result of the previous call as one of its input values.

You may want to read SERVER-16045 for a similar case. The solution to handle these multiple calls to reduce is to make the reduce function idempotent, associative, and commutative as per the documentation.
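
For illustration only (this is a sketch along the lines of the workaround posted in the comment below, not the reporter's original code), a reduce whose output has the same shape as its inputs is safe under re-reduce:

// Assumes map emits { cust_ids: [ { customerId: ..., orderNr: ..., productId: ... } ] }.
// Because the return value has the same shape as each input value, it does not matter
// whether a value came straight from map or from a previous reduce call.
reduceOrdSafe = function(key, values) {
    var merged = [];
    values.forEach(function(value) {
        (value.cust_ids || []).forEach(function(e) {
            merged.push(e);
        });
    });
    return { cust_ids: merged };
};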

If you have further questions, please post on the mongodb-user group or Stack Overflow with the mongodb tag, where your question will reach a larger audience. A question like this involving more discussion would be best posted on the mongodb-user group, as the SERVER project is for reporting bugs and improvement suggestions against the MongoDB kernel.

Regards,
Ramón.

Comment by Vipul [ 11/Jan/15 ]

Maybe I am not understanding this clearly. Even if the reduce function is invoked more than once, all the elements for the key should come out in the same record.

Thanks,
Vipul

Comment by Ramon Fernandez Marina [ 10/Jan/15 ]

vipulmehta13, as pointed out in the Stack Overflow thread where you first posted this, you need to make sure you meet the requirements for the reduce function as described in the documentation:

MongoDB can invoke the reduce function more than once for the same key. In this case, the previous output from the reduce function for that key will become one of the input values to the next reduce function invocation for that key.
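
In other words (an illustrative restatement, not a quote from the documentation), for any split of the mapped values the reduce function must produce the same result:

// For hypothetical values a, b, c mapped to the same key, a correct reduce must satisfy:
//   reduce(key, [a, b, c])  ===  reduce(key, [reduce(key, [a, b]), c])
// The reduceOrdPrd above does not: its own previous output has no top-level orderNr field,
// so it falls into the else branch, which copies (rather than merges) cust_ids; depending on
// the order of the values, records pushed earlier in the same call can be lost.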

Regards,
Ramón.

Comment by Vipul [ 23/Aug/14 ]

Update: the alternate solution (code below) worked fine for me. But my previous code is still a mystery to me, as I don't know what is wrong with it.

mapOrd = function() {
    var values = {cust_ids: [{
        customerId: this.customerId,
        orderNr: this.orderNr,
        productId: this.productId
    }]};
    emit(this.productId, values);
};
 
reduceOrd = function(k, values) {
    var result = [];
    values.forEach(function(value) {
        value.cust_ids.forEach(function(e) {
            result.push(e);
        });
    });
    return {cust_ids: result};
};
 
 
mapPrd = function() {
    var values = {
        code: this.code
    };
    emit(this.id, values);
};
 
reducePrd = function(k, values) {
    var result = {};
    values.forEach(function(value) {
        var field;
        for (field in value) {
            if (value.hasOwnProperty(field)) {
                result[field] = value[field];
            }
        }
    });
    result["length"] = result.cust_ids.length;
    return result;
};
 
 
db.prd_ord.drop();
 
db.order.mapReduce(mapOrd, reduceOrd, {"out": {"reduce": "prd_ord"}, "sort": {"productId": 1}});
db.product.mapReduce(mapPrd, reducePrd, {"out": {"reduce": "prd_ord"}, "sort": {"id": 1} });
 
mapCZ = function() {
    var that = this;
    if ("cust_ids" in this.value) {
        this.value.cust_ids.forEach(function(value) {
            emit(value.orderNr, {
                customerId: value.customerId,
                productId: value.productId,
                code: that.value.code
            });
        });
    }
};
 
reduceCZ = function(k, values) {
    var result = {};
    values.forEach(function(value) {
        var field;
        for (field in value) {
            if (value.hasOwnProperty(field)) {
                result[field] = value[field];
            }
        }
    });
    return result;
};
 
 
db.prd_ord_flat.drop();
db.prd_ord.mapReduce(mapCZ, reduceCZ, {"out": "prd_ord_flat", "sort": {"_id": 1}}); 
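
A quick way to confirm that a whole group survives with this version (illustrative; uses the example key from the original report and assumes the same order_10000 data) would be:

// Should report all 12 members for productId 553 if nothing was dropped.
db.prd_ord.findOne({ _id: NumberLong(553) }).value.cust_ids.length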

Comment by Vipul [ 23/Aug/14 ]

Attaching detailed code and data.
order_10000 has 5,503 records, with the 5,501st record being the 12th occurrence of productId 553.

input_filter_12_elements_of_553.txt: filter on the input data.
code.txt: the actual code being run on the 512 MB RAM machine. MongoDB shell version: 2.6.1.

output_gives_only_11_elements_of_553.txt: filter on the output data.

Hope this helps in replicating the issue.
