|
Author:
{u'username': u'erh', u'name': u'Eliot Horowitz', u'email': u'eliot@10gen.com'}
Message: SERVER-16351: make sure when we swap out recovery units, we don't commit active transactions
Branch: master
https://github.com/mongodb/mongo/commit/2c3dea56d483ed0dca1413a69ef429f1429a7cad
|
|
Restarted mms-dev primary again. No issues!
|
|
I just restarted both the mms-dev and mms-qa primaries, and no collection/index inconsistencies appeared.
NB: I did one polite restart and looked for any issues. Then I did a bunch of ruder restarts (rebooting the actual servers) as I also took this opportunity to clear all the TPH warnings. No issues appeared during the rude restarts either.
I will now let everything run without restarts for another ~24 hours, and then will do another round of polite restarts.
|
|
This just happened again.
Same sequence of events:
1. Performance became very poor. Documented that in SERVER-16366 .
2. Killed the WiredTiger primary (using kill PID)
3. After a few minutes, I let the Automation Agent restart the WiredTiger node.
4. It went through a brief period of catching up on replication lag, then became primary again.
5. When it came back, there was a document whose data was inconsistent with its index.
Note that its a document in the same collection as before.
mmsdev:PRIMARY> db.data.alerts.v2.find({status: 'OPEN', cid:ObjectId('5401dc1df64b2377fcd0b08a'), et: {$in: ['BACKUP_AGENT_DOWN']}}).pretty()
|
{
|
"_id" : ObjectId("547e6b17e4b054aa200904ec"),
|
"et" : "BACKUP_AGENT_DOWN",
|
"tag" : "unassigned",
|
"_t" : "AGENT",
|
"cid" : ObjectId("5401dc1df64b2377fcd0b08a"),
|
"acids" : [
|
ObjectId("540bb750e4b039a8d31a04c4")
|
],
|
"status" : "CLOSED",
|
"cre" : ISODate("2014-12-03T01:44:55.168Z"),
|
"upd" : ISODate("2014-12-03T03:26:19.586Z"),
|
"ln" : [
|
{
|
"to" : "5401dc1df64b2377fcd0b08a",
|
"sent" : ISODate("2014-12-03T03:26:19.805Z"),
|
"c" : 40
|
}
|
],
|
"lnd" : ISODate("2014-12-03T03:26:19.805Z"),
|
"res" : ISODate("2014-12-03T03:26:19.586Z")
|
}
|
mmsdev:PRIMARY> db.data.alerts.v2.find({status: 'OPEN', cid:ObjectId('5401dc1df64b2377fcd0b08a'), et: {$in: ['BACKUP_AGENT_DOWN']}}).explain("allPlansExecution")
|
{
|
"queryPlanner" : {
|
"plannerVersion" : 1,
|
"parsedQuery" : {
|
"$and" : [
|
{
|
"cid" : {
|
"$eq" : ObjectId("5401dc1df64b2377fcd0b08a")
|
}
|
},
|
{
|
"status" : {
|
"$eq" : "OPEN"
|
}
|
},
|
{
|
"et" : {
|
"$in" : [
|
"BACKUP_AGENT_DOWN"
|
]
|
}
|
}
|
]
|
},
|
"winningPlan" : {
|
"stage" : "KEEP_MUTATIONS",
|
"inputStage" : {
|
"stage" : "FETCH",
|
"filter" : {
|
"et" : {
|
"$in" : [
|
"BACKUP_AGENT_DOWN"
|
]
|
}
|
},
|
"inputStage" : {
|
"stage" : "IXSCAN",
|
"keyPattern" : {
|
"cid" : 1,
|
"status" : 1,
|
"cre" : -1
|
},
|
"isMultiKey" : false,
|
"direction" : "forward",
|
"indexBounds" : {
|
"cid" : [
|
"[ObjectId('5401dc1df64b2377fcd0b08a'), ObjectId('5401dc1df64b2377fcd0b08a')]"
|
],
|
"status" : [
|
"[\"OPEN\", \"OPEN\"]"
|
],
|
"cre" : [
|
"[MaxKey, MinKey]"
|
]
|
}
|
}
|
}
|
},
|
"rejectedPlans" : [
|
{
|
"stage" : "KEEP_MUTATIONS",
|
"inputStage" : {
|
"stage" : "FETCH",
|
"filter" : {
|
"$and" : [
|
{
|
"cid" : {
|
"$eq" : ObjectId("5401dc1df64b2377fcd0b08a")
|
}
|
},
|
{
|
"et" : {
|
"$in" : [
|
"BACKUP_AGENT_DOWN"
|
]
|
}
|
}
|
]
|
},
|
"inputStage" : {
|
"stage" : "IXSCAN",
|
"keyPattern" : {
|
"status" : 1,
|
"acids" : 1
|
},
|
"isMultiKey" : false,
|
"direction" : "forward",
|
"indexBounds" : {
|
"status" : [
|
"[\"OPEN\", \"OPEN\"]"
|
],
|
"acids" : [
|
"[MinKey, MaxKey]"
|
]
|
}
|
}
|
}
|
}
|
]
|
},
|
"executionStats" : {
|
"executionSuccess" : true,
|
"nReturned" : 1,
|
"executionTimeMillis" : 0,
|
"totalKeysExamined" : 1,
|
"totalDocsExamined" : 1,
|
"executionStages" : {
|
"stage" : "KEEP_MUTATIONS",
|
"nReturned" : 1,
|
"executionTimeMillisEstimate" : 0,
|
"works" : 3,
|
"advanced" : 1,
|
"needTime" : 0,
|
"needFetch" : 0,
|
"saveState" : 0,
|
"restoreState" : 0,
|
"isEOF" : 1,
|
"invalidates" : 0,
|
"inputStage" : {
|
"stage" : "FETCH",
|
"filter" : {
|
"et" : {
|
"$in" : [
|
"BACKUP_AGENT_DOWN"
|
]
|
}
|
},
|
"nReturned" : 1,
|
"executionTimeMillisEstimate" : 0,
|
"works" : 2,
|
"advanced" : 1,
|
"needTime" : 0,
|
"needFetch" : 0,
|
"saveState" : 0,
|
"restoreState" : 0,
|
"isEOF" : 1,
|
"invalidates" : 0,
|
"docsExamined" : 1,
|
"alreadyHasObj" : 0,
|
"inputStage" : {
|
"stage" : "IXSCAN",
|
"nReturned" : 1,
|
"executionTimeMillisEstimate" : 0,
|
"works" : 2,
|
"advanced" : 1,
|
"needTime" : 0,
|
"needFetch" : 0,
|
"saveState" : 0,
|
"restoreState" : 0,
|
"isEOF" : 1,
|
"invalidates" : 0,
|
"keyPattern" : {
|
"cid" : 1,
|
"status" : 1,
|
"cre" : -1
|
},
|
"isMultiKey" : false,
|
"direction" : "forward",
|
"indexBounds" : {
|
"cid" : [
|
"[ObjectId('5401dc1df64b2377fcd0b08a'), ObjectId('5401dc1df64b2377fcd0b08a')]"
|
],
|
"status" : [
|
"[\"OPEN\", \"OPEN\"]"
|
],
|
"cre" : [
|
"[MaxKey, MinKey]"
|
]
|
},
|
"keysExamined" : 1,
|
"dupsTested" : 0,
|
"dupsDropped" : 0,
|
"seenInvalidated" : 0,
|
"matchTested" : 0
|
}
|
}
|
},
|
"allPlansExecution" : [
|
{
|
"nReturned" : 0,
|
"executionTimeMillisEstimate" : 0,
|
"totalKeysExamined" : 2,
|
"totalDocsExamined" : 2,
|
"executionStages" : {
|
"stage" : "KEEP_MUTATIONS",
|
"nReturned" : 0,
|
"executionTimeMillisEstimate" : 0,
|
"works" : 2,
|
"advanced" : 0,
|
"needTime" : 2,
|
"needFetch" : 0,
|
"saveState" : 0,
|
"restoreState" : 0,
|
"isEOF" : 0,
|
"invalidates" : 0,
|
"inputStage" : {
|
"stage" : "FETCH",
|
"filter" : {
|
"$and" : [
|
{
|
"cid" : {
|
"$eq" : ObjectId("5401dc1df64b2377fcd0b08a")
|
}
|
},
|
{
|
"et" : {
|
"$in" : [
|
"BACKUP_AGENT_DOWN"
|
]
|
}
|
}
|
]
|
},
|
"nReturned" : 0,
|
"executionTimeMillisEstimate" : 0,
|
"works" : 2,
|
"advanced" : 0,
|
"needTime" : 2,
|
"needFetch" : 0,
|
"saveState" : 0,
|
"restoreState" : 0,
|
"isEOF" : 0,
|
"invalidates" : 0,
|
"docsExamined" : 2,
|
"alreadyHasObj" : 0,
|
"inputStage" : {
|
"stage" : "IXSCAN",
|
"nReturned" : 2,
|
"executionTimeMillisEstimate" : 0,
|
"works" : 2,
|
"advanced" : 2,
|
"needTime" : 0,
|
"needFetch" : 0,
|
"saveState" : 0,
|
"restoreState" : 0,
|
"isEOF" : 0,
|
"invalidates" : 0,
|
"keyPattern" : {
|
"status" : 1,
|
"acids" : 1
|
},
|
"isMultiKey" : false,
|
"direction" : "forward",
|
"indexBounds" : {
|
"status" : [
|
"[\"OPEN\", \"OPEN\"]"
|
],
|
"acids" : [
|
"[MinKey, MaxKey]"
|
]
|
},
|
"keysExamined" : 2,
|
"dupsTested" : 0,
|
"dupsDropped" : 0,
|
"seenInvalidated" : 0,
|
"matchTested" : 0
|
}
|
}
|
}
|
},
|
{
|
"nReturned" : 1,
|
"executionTimeMillisEstimate" : 0,
|
"totalKeysExamined" : 1,
|
"totalDocsExamined" : 1,
|
"executionStages" : {
|
"stage" : "KEEP_MUTATIONS",
|
"nReturned" : 1,
|
"executionTimeMillisEstimate" : 0,
|
"works" : 2,
|
"advanced" : 1,
|
"needTime" : 0,
|
"needFetch" : 0,
|
"saveState" : 0,
|
"restoreState" : 0,
|
"isEOF" : 1,
|
"invalidates" : 0,
|
"inputStage" : {
|
"stage" : "FETCH",
|
"filter" : {
|
"et" : {
|
"$in" : [
|
"BACKUP_AGENT_DOWN"
|
]
|
}
|
},
|
"nReturned" : 1,
|
"executionTimeMillisEstimate" : 0,
|
"works" : 2,
|
"advanced" : 1,
|
"needTime" : 0,
|
"needFetch" : 0,
|
"saveState" : 0,
|
"restoreState" : 0,
|
"isEOF" : 1,
|
"invalidates" : 0,
|
"docsExamined" : 1,
|
"alreadyHasObj" : 0,
|
"inputStage" : {
|
"stage" : "IXSCAN",
|
"nReturned" : 1,
|
"executionTimeMillisEstimate" : 0,
|
"works" : 2,
|
"advanced" : 1,
|
"needTime" : 0,
|
"needFetch" : 0,
|
"saveState" : 0,
|
"restoreState" : 0,
|
"isEOF" : 1,
|
"invalidates" : 0,
|
"keyPattern" : {
|
"cid" : 1,
|
"status" : 1,
|
"cre" : -1
|
},
|
"isMultiKey" : false,
|
"direction" : "forward",
|
"indexBounds" : {
|
"cid" : [
|
"[ObjectId('5401dc1df64b2377fcd0b08a'), ObjectId('5401dc1df64b2377fcd0b08a')]"
|
],
|
"status" : [
|
"[\"OPEN\", \"OPEN\"]"
|
],
|
"cre" : [
|
"[MaxKey, MinKey]"
|
]
|
},
|
"keysExamined" : 1,
|
"dupsTested" : 0,
|
"dupsDropped" : 0,
|
"seenInvalidated" : 0,
|
"matchTested" : 0
|
}
|
}
|
}
|
}
|
]
|
},
|
"serverInfo" : {
|
"host" : "ip-10-169-132-210",
|
"port" : 27018,
|
"version" : "2.8.0-rc1",
|
"gitVersion" : "c8171e7f969519af8b87a43425ae291ee69a0191"
|
},
|
"ok" : 1
|
}
|
|
|
The document was created at 11/29/14 - 03:47:43, and based on other application tracers, I expect that the value of "status" should have been set to CLOSED at 11/29/14 - 15:34:13. The log for this time period is here: https://dropbox.10gen.com/cailin/2014-11-29-23-07/mongodb.log.2014-11-29T20-09-43
[java] 2014-11-29T15:34:13.641+0000 [Thread-120] gid:5401c3c2f64b2377fcd0aff2 DEBUG com.xgen.svc.brs.svc.BackupAlertSvc [closeAlertInternal:64] - Closing alert: id=547941dfe4b03b02cc78d079, type=RESYNC_REQUIRED, groupId=5401c3c2f64b2377fcd0aff2, configIds=[5401e247e4b0eb78e88d3da1]
|
However, I do not believe that the document was corrupted at 11/29/14 - 15:34:13. I believe that the document was correctly responding to queries as CLOSED for at least the period of 11/29/14 - 15:34:13 through 11/29/14 - 15:48. If this was not the case, the application would have been sending a certain kind of email, and it was not.
However, during this same period, the replica set was experiencing extreme performance issues. Eventually the application became unresponsive. At 11/29/14 - 15:48 I shut down the wiredtiger primary and failed over to the mmapv1 secondary. I left it like this until ~20:15, at which point I restarted the wiredtiger node. The wiredtiger node caught up with the 4 hours of replication lag, and then became the primary again. When it became the primary again at ~20:26 at this point this query was returning the erroneous document.
Most unfortunately, in an extremely tragic accident of data shuffling, I deleted the log file for the wiredtiger node after 15:48.
|
|
Based on the fact that the bounds for the index scan include the range ["OPEN", "OPEN"] for "status", I'd guess that some operation corrupted the {cid: 1, status: 1, cre: -1} index entry for this document (at the very least). I'd also guess that this document has been targeted by an update that changed the value of "status" from "OPEN" to "CLOSED" (or vice-versa); if so, can you excerpt the update from your application code that would have changed this value? The raw data files and the mongod log for this instance (including a time interval that would have covered this update, if you know when the update would have happened) would be useful for diagnosing further.
|
|
It's reproducible every time.
> db.data.alerts.v2.find({status: 'OPEN', cid:ObjectId('5401c3c2f64b2377fcd0aff2'), et: {$in: ['RESYNC_REQUIRED']}}).explain("allPlansExecution")
|
{
|
"queryPlanner" : {
|
"plannerVersion" : 1,
|
"parsedQuery" : {
|
"$and" : [
|
{
|
"cid" : {
|
"$eq" : ObjectId("5401c3c2f64b2377fcd0aff2")
|
}
|
},
|
{
|
"status" : {
|
"$eq" : "OPEN"
|
}
|
},
|
{
|
"et" : {
|
"$in" : [
|
"RESYNC_REQUIRED"
|
]
|
}
|
}
|
]
|
},
|
"winningPlan" : {
|
"stage" : "KEEP_MUTATIONS",
|
"inputStage" : {
|
"stage" : "FETCH",
|
"filter" : {
|
"et" : {
|
"$in" : [
|
"RESYNC_REQUIRED"
|
]
|
}
|
},
|
"inputStage" : {
|
"stage" : "IXSCAN",
|
"keyPattern" : {
|
"cid" : 1,
|
"status" : 1,
|
"cre" : -1
|
},
|
"isMultiKey" : false,
|
"direction" : "forward",
|
"indexBounds" : {
|
"cid" : [
|
"[ObjectId('5401c3c2f64b2377fcd0aff2'), ObjectId('5401c3c2f64b2377fcd0aff2')]"
|
],
|
"status" : [
|
"[\"OPEN\", \"OPEN\"]"
|
],
|
"cre" : [
|
"[MaxKey, MinKey]"
|
]
|
}
|
}
|
}
|
},
|
"rejectedPlans" : [
|
{
|
"stage" : "KEEP_MUTATIONS",
|
"inputStage" : {
|
"stage" : "FETCH",
|
"filter" : {
|
"$and" : [
|
{
|
"cid" : {
|
"$eq" : ObjectId("5401c3c2f64b2377fcd0aff2")
|
}
|
},
|
{
|
"et" : {
|
"$in" : [
|
"RESYNC_REQUIRED"
|
]
|
}
|
}
|
]
|
},
|
"inputStage" : {
|
"stage" : "IXSCAN",
|
"keyPattern" : {
|
"status" : 1,
|
"acids" : 1
|
},
|
"isMultiKey" : false,
|
"direction" : "forward",
|
"indexBounds" : {
|
"status" : [
|
"[\"OPEN\", \"OPEN\"]"
|
],
|
"acids" : [
|
"[MinKey, MaxKey]"
|
]
|
}
|
}
|
}
|
}
|
]
|
},
|
"executionStats" : {
|
"executionSuccess" : true,
|
"nReturned" : 1,
|
"executionTimeMillis" : 0,
|
"totalKeysExamined" : 1,
|
"totalDocsExamined" : 1,
|
"executionStages" : {
|
"stage" : "KEEP_MUTATIONS",
|
"nReturned" : 1,
|
"executionTimeMillisEstimate" : 0,
|
"works" : 3,
|
"advanced" : 1,
|
"needTime" : 0,
|
"needFetch" : 0,
|
"saveState" : 0,
|
"restoreState" : 0,
|
"isEOF" : 1,
|
"invalidates" : 0,
|
"inputStage" : {
|
"stage" : "FETCH",
|
"filter" : {
|
"et" : {
|
"$in" : [
|
"RESYNC_REQUIRED"
|
]
|
}
|
},
|
"nReturned" : 1,
|
"executionTimeMillisEstimate" : 0,
|
"works" : 2,
|
"advanced" : 1,
|
"needTime" : 0,
|
"needFetch" : 0,
|
"saveState" : 0,
|
"restoreState" : 0,
|
"isEOF" : 1,
|
"invalidates" : 0,
|
"docsExamined" : 1,
|
"alreadyHasObj" : 0,
|
"inputStage" : {
|
"stage" : "IXSCAN",
|
"nReturned" : 1,
|
"executionTimeMillisEstimate" : 0,
|
"works" : 2,
|
"advanced" : 1,
|
"needTime" : 0,
|
"needFetch" : 0,
|
"saveState" : 0,
|
"restoreState" : 0,
|
"isEOF" : 1,
|
"invalidates" : 0,
|
"keyPattern" : {
|
"cid" : 1,
|
"status" : 1,
|
"cre" : -1
|
},
|
"isMultiKey" : false,
|
"direction" : "forward",
|
"indexBounds" : {
|
"cid" : [
|
"[ObjectId('5401c3c2f64b2377fcd0aff2'), ObjectId('5401c3c2f64b2377fcd0aff2')]"
|
],
|
"status" : [
|
"[\"OPEN\", \"OPEN\"]"
|
],
|
"cre" : [
|
"[MaxKey, MinKey]"
|
]
|
},
|
"keysExamined" : 1,
|
"dupsTested" : 0,
|
"dupsDropped" : 0,
|
"seenInvalidated" : 0,
|
"matchTested" : 0
|
}
|
}
|
},
|
"allPlansExecution" : [
|
{
|
"nReturned" : 0,
|
"executionTimeMillisEstimate" : 0,
|
"totalKeysExamined" : 2,
|
"totalDocsExamined" : 2,
|
"executionStages" : {
|
"stage" : "KEEP_MUTATIONS",
|
"nReturned" : 0,
|
"executionTimeMillisEstimate" : 0,
|
"works" : 2,
|
"advanced" : 0,
|
"needTime" : 2,
|
"needFetch" : 0,
|
"saveState" : 0,
|
"restoreState" : 0,
|
"isEOF" : 0,
|
"invalidates" : 0,
|
"inputStage" : {
|
"stage" : "FETCH",
|
"filter" : {
|
"$and" : [
|
{
|
"cid" : {
|
"$eq" : ObjectId("5401c3c2f64b2377fcd0aff2")
|
}
|
},
|
{
|
"et" : {
|
"$in" : [
|
"RESYNC_REQUIRED"
|
]
|
}
|
}
|
]
|
},
|
"nReturned" : 0,
|
"executionTimeMillisEstimate" : 0,
|
"works" : 2,
|
"advanced" : 0,
|
"needTime" : 2,
|
"needFetch" : 0,
|
"saveState" : 0,
|
"restoreState" : 0,
|
"isEOF" : 0,
|
"invalidates" : 0,
|
"docsExamined" : 2,
|
"alreadyHasObj" : 0,
|
"inputStage" : {
|
"stage" : "IXSCAN",
|
"nReturned" : 2,
|
"executionTimeMillisEstimate" : 0,
|
"works" : 2,
|
"advanced" : 2,
|
"needTime" : 0,
|
"needFetch" : 0,
|
"saveState" : 0,
|
"restoreState" : 0,
|
"isEOF" : 0,
|
"invalidates" : 0,
|
"keyPattern" : {
|
"status" : 1,
|
"acids" : 1
|
},
|
"isMultiKey" : false,
|
"direction" : "forward",
|
"indexBounds" : {
|
"status" : [
|
"[\"OPEN\", \"OPEN\"]"
|
],
|
"acids" : [
|
"[MinKey, MaxKey]"
|
]
|
},
|
"keysExamined" : 2,
|
"dupsTested" : 0,
|
"dupsDropped" : 0,
|
"seenInvalidated" : 0,
|
"matchTested" : 0
|
}
|
}
|
}
|
},
|
{
|
"nReturned" : 1,
|
"executionTimeMillisEstimate" : 0,
|
"totalKeysExamined" : 1,
|
"totalDocsExamined" : 1,
|
"executionStages" : {
|
"stage" : "KEEP_MUTATIONS",
|
"nReturned" : 1,
|
"executionTimeMillisEstimate" : 0,
|
"works" : 2,
|
"advanced" : 1,
|
"needTime" : 0,
|
"needFetch" : 0,
|
"saveState" : 0,
|
"restoreState" : 0,
|
"isEOF" : 1,
|
"invalidates" : 0,
|
"inputStage" : {
|
"stage" : "FETCH",
|
"filter" : {
|
"et" : {
|
"$in" : [
|
"RESYNC_REQUIRED"
|
]
|
}
|
},
|
"nReturned" : 1,
|
"executionTimeMillisEstimate" : 0,
|
"works" : 2,
|
"advanced" : 1,
|
"needTime" : 0,
|
"needFetch" : 0,
|
"saveState" : 0,
|
"restoreState" : 0,
|
"isEOF" : 1,
|
"invalidates" : 0,
|
"docsExamined" : 1,
|
"alreadyHasObj" : 0,
|
"inputStage" : {
|
"stage" : "IXSCAN",
|
"nReturned" : 1,
|
"executionTimeMillisEstimate" : 0,
|
"works" : 2,
|
"advanced" : 1,
|
"needTime" : 0,
|
"needFetch" : 0,
|
"saveState" : 0,
|
"restoreState" : 0,
|
"isEOF" : 1,
|
"invalidates" : 0,
|
"keyPattern" : {
|
"cid" : 1,
|
"status" : 1,
|
"cre" : -1
|
},
|
"isMultiKey" : false,
|
"direction" : "forward",
|
"indexBounds" : {
|
"cid" : [
|
"[ObjectId('5401c3c2f64b2377fcd0aff2'), ObjectId('5401c3c2f64b2377fcd0aff2')]"
|
],
|
"status" : [
|
"[\"OPEN\", \"OPEN\"]"
|
],
|
"cre" : [
|
"[MaxKey, MinKey]"
|
]
|
},
|
"keysExamined" : 1,
|
"dupsTested" : 0,
|
"dupsDropped" : 0,
|
"seenInvalidated" : 0,
|
"matchTested" : 0
|
}
|
}
|
}
|
}
|
]
|
},
|
"serverInfo" : {
|
"host" : "ip-10-169-132-210",
|
"port" : 9999,
|
"version" : "2.8.0-rc1",
|
"gitVersion" : "c8171e7f969519af8b87a43425ae291ee69a0191"
|
},
|
"ok" : 1
|
}
|
NB: I've done a full resync of this node now, however, I saved the data directory before I did this. The above query is from a new mongo standalone process, running on top of the saved data directory. The query returns the erroneous document, just as it did before.
|
|
Can you reproduce this every time, or only under certain conditions? And, can you post the output of db.data.alerts.v2.find(...).explain("allStatsExecution") for this query (with a 2.8.0-rc1 shell)?
|
Generated at Thu Feb 08 03:40:46 UTC 2024 using Jira 9.7.1#970001-sha1:2222b88b221c4928ef0de3161136cc90c8356a66.