Loading...

XML

Word

Printable

JSON

Type: Bug
Resolution: Fixed
Priority: Major - P3
Fix Version/s: 4.3.4
Affects Version/s: None
Component/s: Index Maintenance
Labels:
None

Backwards Compatibility:
Fully Compatible
Operating System:
ALL
Steps To Reproduce:
Hide

After applying the patch with 'git am < bf.patch', run 'python buildscripts/resmoke.py --suites=no_passthrough jsTests/noPassthrough/repro.js'

(This patch was taken from base c92e20479618b22355b9fb7efa935ff6db5883a9).

From 23f0b72f7560359573b92db569676ebff5e01dce Mon Sep 17 00:00:00 2001 From: Gregory Wlodarek <gregory.wlodarek@mongodb.com> Date: Thu, 2 Jan 2020 15:35:25 -0500 Subject: [PATCH] BF-15731 --- jstests/noPassthrough/repro.js | 45 +++++++++++++++++++++++ src/mongo/db/index_builds_coordinator.cpp | 6 +++ 2 files changed, 51 insertions(+) create mode 100644 jstests/noPassthrough/repro.js diff --git a/jstests/noPassthrough/repro.js b/jstests/noPassthrough/repro.js new file mode 100644 index 0000000000..2414b30edd --- /dev/null +++ b/jstests/noPassthrough/repro.js @@ -0,0 +1,45 @@ +/** + * @tags: [requires_replication] + */ +(function() { +"use strict"; + +const dbName = "test"; +const collName = "geo"; + +const rst = new ReplSetTest({nodes: 2}); +rst.startSet(); +rst.initiate(); + +const db = rst.getPrimary().getDB(dbName); +assert.commandWorked(db.createCollection(collName)); +assert.commandWorked(db.getCollection(collName).insert({pos: "invalid"})); +rst.awaitReplication(); + +const failpoint = 'hangBeforeRunningIndexBuildWithoutLocks'; +assert.commandWorked(db.adminCommand({configureFailPoint: failpoint, mode: "alwaysOn"})); + +const awaitCreateIndex = startParallelShell(() => { + assert.commandFailedWithCode( + db.getCollection("geo").createIndex({pos: "geoHaystack", type: 1}, {bucketSize: 1}), + ErrorCodes.InterruptedDueToReplStateChange); +}, rst.getPrimary().port); + +checkLog.contains(rst.getPrimary(), "Hang before running index build without locks"); + +jsTestLog("Making Primary step down"); +assert.commandWorked(rst.getPrimary().adminCommand( + {"replSetStepDown": 100, secondaryCatchUpPeriodSecs: 0, "force": true})); + +// New primary fails validate because we managed to built a corrupted index. +const validateRes = + assert.commandWorked(rst.getPrimary().getDB(dbName).getCollection(collName).validate()); +jsTestLog("Validate Response: " + tojson(validateRes)); +assert.eq(false, validateRes.valid); + +// Old primary fasserts() when trying to commit the index build. +assert.commandWorked(rst.getSecondary().adminCommand({configureFailPoint: failpoint, mode: "off"})); +awaitCreateIndex(); + +rst.stopSet(); +}()); diff --git a/src/mongo/db/index_builds_coordinator.cpp b/src/mongo/db/index_builds_coordinator.cpp index 5e52d4403c..51b319d3b9 100644 --- a/src/mongo/db/index_builds_coordinator.cpp +++ b/src/mongo/db/index_builds_coordinator.cpp @@ -67,6 +67,7 @@ using namespace indexbuildentryhelpers; MONGO_FAIL_POINT_DEFINE(hangAfterIndexBuildFirstDrain); MONGO_FAIL_POINT_DEFINE(hangAfterIndexBuildSecondDrain); MONGO_FAIL_POINT_DEFINE(hangAfterIndexBuildDumpsInsertsFromBulk); +MONGO_FAIL_POINT_DEFINE(hangBeforeRunningIndexBuildWithoutLocks); namespace { @@ -1372,6 +1373,11 @@ void IndexBuildsCoordinator::_runIndexBuildInner(OperationContext* opCtx, // This Status stays unchanged unless we catch an exception in the following try-catch block. auto status = Status::OK(); try { + if (hangBeforeRunningIndexBuildWithoutLocks.shouldFail()) { + log() << "Hang before running index build without locks"; + hangBeforeRunningIndexBuildWithoutLocks.pauseWhileSet(); + } + // Lock acquisition might throw, and we would still need to clean up the index build state, // so do it in the try-catch block AutoGetDb autoDb(opCtx, replState->dbName, MODE_IX); -- 2.23.0
Show
After applying the patch with 'git am < bf.patch', run 'python buildscripts/resmoke.py --suites=no_passthrough jsTests/noPassthrough/repro.js' (This patch was taken from base c92e20479618b22355b9fb7efa935ff6db5883a9). From 23f0b72f7560359573b92db569676ebff5e01dce Mon Sep 17 00:00:00 2001 From: Gregory Wlodarek <gregory.wlodarek@mongodb.com> Date: Thu, 2 Jan 2020 15:35:25 -0500 Subject: [PATCH] BF-15731 --- jstests/noPassthrough/repro.js | 45 +++++++++++++++++++++++ src/mongo/db/index_builds_coordinator.cpp | 6 +++ 2 files changed, 51 insertions(+) create mode 100644 jstests/noPassthrough/repro.js diff --git a/jstests/noPassthrough/repro.js b/jstests/noPassthrough/repro.js new file mode 100644 index 0000000000..2414b30edd --- /dev/ null +++ b/jstests/noPassthrough/repro.js @@ -0,0 +1,45 @@ +/** + * @tags: [requires_replication] + */ +(function() { + "use strict" ; + + const dbName = "test" ; + const collName = "geo" ; + + const rst = new ReplSetTest({nodes: 2}); +rst.startSet(); +rst.initiate(); + + const db = rst.getPrimary().getDB(dbName); + assert .commandWorked(db.createCollection(collName)); + assert .commandWorked(db.getCollection(collName).insert({pos: "invalid" })); +rst.awaitReplication(); + + const failpoint = 'hangBeforeRunningIndexBuildWithoutLocks' ; + assert .commandWorked(db.adminCommand({configureFailPoint: failpoint, mode: "alwaysOn" })); + + const awaitCreateIndex = startParallelShell(() => { + assert .commandFailedWithCode( + db.getCollection( "geo" ).createIndex({pos: "geoHaystack" , type: 1}, {bucketSize: 1}), + ErrorCodes.InterruptedDueToReplStateChange); +}, rst.getPrimary().port); + +checkLog.contains(rst.getPrimary(), "Hang before running index build without locks" ); + +jsTestLog( "Making Primary step down" ); + assert .commandWorked(rst.getPrimary().adminCommand( + { "replSetStepDown" : 100, secondaryCatchUpPeriodSecs: 0, "force" : true })); + + // New primary fails validate because we managed to built a corrupted index. + const validateRes = + assert .commandWorked(rst.getPrimary().getDB(dbName).getCollection(collName).validate()); +jsTestLog( "Validate Response: " + tojson(validateRes)); + assert .eq( false , validateRes.valid); + + // Old primary fasserts() when trying to commit the index build. + assert .commandWorked(rst.getSecondary().adminCommand({configureFailPoint: failpoint, mode: "off" })); +awaitCreateIndex(); + +rst.stopSet(); +}()); diff --git a/src/mongo/db/index_builds_coordinator.cpp b/src/mongo/db/index_builds_coordinator.cpp index 5e52d4403c..51b319d3b9 100644 --- a/src/mongo/db/index_builds_coordinator.cpp +++ b/src/mongo/db/index_builds_coordinator.cpp @@ -67,6 +67,7 @@ using namespace indexbuildentryhelpers; MONGO_FAIL_POINT_DEFINE(hangAfterIndexBuildFirstDrain); MONGO_FAIL_POINT_DEFINE(hangAfterIndexBuildSecondDrain); MONGO_FAIL_POINT_DEFINE(hangAfterIndexBuildDumpsInsertsFromBulk); +MONGO_FAIL_POINT_DEFINE(hangBeforeRunningIndexBuildWithoutLocks); namespace { @@ -1372,6 +1373,11 @@ void IndexBuildsCoordinator::_runIndexBuildInner(OperationContext* opCtx, // This Status stays unchanged unless we catch an exception in the following try - catch block. auto status = Status::OK(); try { + if (hangBeforeRunningIndexBuildWithoutLocks.shouldFail()) { + log() << "Hang before running index build without locks" ; + hangBeforeRunningIndexBuildWithoutLocks.pauseWhileSet(); + } + // Lock acquisition might throw , and we would still need to clean up the index build state, // so do it in the try - catch block AutoGetDb autoDb(opCtx, replState->dbName, MODE_IX); -- 2.23.0
Sprint:
Execution Team 2020-01-27, Execution Team 2020-02-10
Linked BF Score:
26
Confidence Status:
None
Work Order:
3
CAR Domain/s:
None

Aha! Reference:
None
Tracking Level:
None
Risk Status:
None
Exec Notes:
None
Goal Name(s):
None
Goal Link:
None

Looking at the code, it looks like the issue stems from IndexBuildsCoordinator::onStepUp() as it commits index builds regardless of whether they should be aborted.

I was able to reproduce this, but here is a timeline of events leading up to this problem:

Node 1 (Primary): Start a geo index build that should fail at the end due to an invalid document for this type of index in the collection.
Node 2 (Secondary): Start the same index build after seeing the start index build oplog entry, and finish the index build up until the final phase before the primary node. The secondary node now waits for the commit/abort oplog entry from the primary.
Node 1 (Primary): Step-down after finishing the index build but before aborting the index build and sending the abort oplog entry to the secondary.
Node 2 becomes primary, Node 1 becomes secondary.
Node 2 (Primary): Step-up and commit the index build. This was rather unexpected, I would expect an abort index build oplog entry here. Perhaps the onStepUp() function forgets to check the state of the index build.
Node 2 (Primary): Running validate on this node will fail because we committed an invalid index and validate sees that as a case of corruption:

{
 	"valid" : false,
 	"warnings" : [ ],
 	"errors" : [
 		"exception during collection validation: Location16775: cannot extract [lng, lat] array or object from { _id: ObjectId('5e0f730ab7d4d029001cd7b7'), pos: \"invalid\" }"
 	],
 	"extraIndexEntries" : [ ],
 	"missingIndexEntries" : [ ],
 	"advice" : "A corrupt namespace has been detected. See http://dochub.mongodb.org/core/data-recovery for recovery steps.",
 	"ok" : 1,
 	"$clusterTime" : {
 		"clusterTime" : Timestamp(1578070806, 2),
 		"signature" : {
 			"hash" : BinData(0,"AAAAAAAAAAAAAAAAAAAAAAAAAAA="),
 			"keyId" : NumberLong(0)
 		}
 	},
 	"operationTime" : Timestamp(1578070806, 2)
}

* Node 1 (Secondary): fasserts() when trying to commit the index build after receiving the commit index build oplog entry from the new primary.

[IndexBuildsCoordinatorMongod-0] Fatal assertion 51101 Location16775: Index build: 21ddcd36-e6c3-4b41-a687-12304a79ac9a; Database: test :: caused by :: index build failed on this node but we received a commitIndexBuild oplog entry from the primary with timestamp: Timestamp(1578070806, 2) :: caused by :: cannot extract [lng, lat] array or object from { _id: ObjectId('5e0f730ab7d4d029001cd7b7'), pos: "invalid" } at src/mongo/db/index_builds_coordinator.cpp 1355

is duplicated by

SERVER-39428 Record all indexing errors during simultaneous index builds for later constraint checking

Closed

is related to

SERVER-45852 Two-phase index build constraints should be checked at the completion of the index build

Closed

related to

SERVER-46246 index_failover_resolved_key_errors.js hangs when document level locking is not supported

Closed

SERVER-47605 Single-phase index builds should only check constraint violations upon completion

Closed

SERVER-44654 allow unique index builds to continue running on stepdown

Closed

links to

MongoDB Documentation: Compound Multikey Indexes

(1 links to)

Assignee:: Louis Williams
Reporter:: Gregory Wlodarek
Participants:: Benety Goh, Githook User, Gregory Wlodarek, Louis Williams
Votes:: 0 Vote for this issue
Watchers:: 5 Start watching this issue

Created:: Jan 03 2020 05:20:36 PM UTC
Updated:: Oct 29 2023 10:13:42 PM UTC
Resolved:: Jan 30 2020 04:21:36 PM UTC
Confidence Status Last Update:: 27/Jan/20 5:53 PM

Details

Description

Attachments

Issue Links

Forms

Activity

People

Dates