-
Type:
Bug
-
Resolution: Unresolved
-
Priority:
Major - P3
-
None
-
Affects Version/s: None
-
Component/s: None
-
None
-
Storage Execution
-
ALL
-
None
-
None
-
None
-
None
-
None
-
None
-
None
In an attempt to clean up some tech debt in SERVER-103659 and force all operations to use commitQuorum, I think I found a bug.
[js_test:rollback_index_build_start_abort] d20050| {"t":{"$date":"2025-04-16T21:48:15.082+00:00"},"s":"I", "c":"INDEX", "id":20438, "ctx":"conn108","msg":"Index build: registering","attr":{"buildUUID":{"uuid":{"$uuid":"90e2da38-d6b4-4137-8abc-928b6500cdd9"}},"namespace":"test.coll_2","collectionUUID":{"uuid":{"$uuid":"ab0fdb86-ca02-45af-87ee-efdf4aba07ba"}},"indexes":1,"firstIndex":{"name":"a_1"},"command":{"createIndexes":"coll_2","v":2,"indexes":[{"key":{"a":1},"name":"a_1"}],"ignoreUnknownIndexOptions":false,"commitQuorum":1,"lsid":{"id":{"$uuid":"eaaa6c32-f731-4cff-8d85-44393a3c3b9b"}},"$clusterTime":{"clusterTime":{"$timestamp":{"t":1744840094,"i":2}},"signature":{"hash":{"$binary":{"base64":"AAAAAAAAAAAAAAAAAAAAAAAAAAA=","subType":"0"}},"keyId":0}}}}} [js_test:rollback_index_build_start_abort] d20050| {"t":{"$date":"2025-04-16T21:48:15.102+00:00"},"s":"I", "c":"INDEX", "id":20384, "ctx":"IndexBuildsCoordinatorMongod-0","msg":"Index build: starting","attr":{"buildUUID":{"uuid":{"$uuid":"90e2da38-d6b4-4137-8abc-928b6500cdd9"}},"collectionUUID":{"uuid":{"$uuid":"ab0fdb86-ca02-45af-87ee-efdf4aba07ba"}},"namespace":"test.coll_2","properties":{"v":2,"key":{"a":1},"name":"a_1"},"specIndex":0,"numSpecs":1,"method":"Hybrid","ident":"index-4f0ecfa0-d0f7-4c24-8528-fd6fda18db8d","collectionIdent":"collection-a79518fc-80a9-4385-a8a7-523c15412abc","maxTemporaryMemoryUsageMB":200}} [js_test:rollback_index_build_start_abort] d20050| {"t":{"$date":"2025-04-16T21:48:15.103+00:00"},"s":"I", "c":"INDEX", "id":20346, "ctx":"IndexBuildsCoordinatorMongod-0","msg":"Index build: initialized","attr":{"buildUUID":{"uuid":{"$uuid":"90e2da38-d6b4-4137-8abc-928b6500cdd9"}},"collectionUUID":{"uuid":{"$uuid":"ab0fdb86-ca02-45af-87ee-efdf4aba07ba"}},"namespace":"test.coll_2","initializationTimestamp":{"$timestamp":{"t":1744840095,"i":2}}}} [js_test:rollback_index_build_start_abort] d20050| {"t":{"$date":"2025-04-16T21:48:15.103+00:00"},"s":"I", "c":"INDEX", "id":20440, "ctx":"conn108","msg":"Index build: waiting for index build to complete","attr":{"buildUUID":{"uuid":{"$uuid":"90e2da38-d6b4-4137-8abc-928b6500cdd9"}},"deadline":{"$date":{"$numberLong":"9223372036854775807"}}}} [js_test:rollback_index_build_start_abort] d20050| {"t":{"$date":"2025-04-16T21:48:15.142+00:00"},"s":"I", "c":"STORAGE", "id":3856201, "ctx":"conn105","msg":"Index build: commit quorum satisfied","attr":{"indexBuildEntry":{"_id":{"$uuid":"90e2da38-d6b4-4137-8abc-928b6500cdd9"},"collectionUUID":{"$uuid":"ab0fdb86-ca02-45af-87ee-efdf4aba07ba"},"commitQuorum":1,"indexNames":["a_1"],"commitReadyMembers":["ip-10-122-7-78:20041"]}}} [js_test:rollback_index_build_start_abort] d20050| {"t":{"$date":"2025-04-16T21:48:15.271+00:00"},"s":"I", "c":"INDEX", "id":4984704, "ctx":"IndexBuildsCoordinatorMongod-0","msg":"Index build: collection scan stopped","attr":{"buildUUID":{"uuid":{"$uuid":"90e2da38-d6b4-4137-8abc-928b6500cdd9"}},"collectionUUID":{"uuid":{"$uuid":"ab0fdb86-ca02-45af-87ee-efdf4aba07ba"}},"totalRecords":1,"durationMillis":167,"phase":"collection scan","collectionScanPosition":"1","readSource":"kNoTimestamp","error":{"code":11601,"codeName":"Interrupted","errmsg":"operation was interrupted"}}} [js_test:rollback_index_build_start_abort] d20050| {"t":{"$date":"2025-04-16T21:48:15.271+00:00"},"s":"I", "c":"STORAGE", "id":20649, "ctx":"IndexBuildsCoordinatorMongod-0","msg":"Index build: failed","attr":{"buildUUID":{"uuid":{"$uuid":"90e2da38-d6b4-4137-8abc-928b6500cdd9"}},"collectionUUID":{"uuid":{"$uuid":"ab0fdb86-ca02-45af-87ee-efdf4aba07ba"}},"namespace":"test.coll_2","error":{"code":11601,"codeName":"Interrupted","errmsg":"collection scan stopped. totalRecords: 1; durationMillis: 167ms; phase: collection scan; collectionScanPosition: RecordId(1); readSource: kNoTimestamp :: caused by :: operation was interrupted"}}} [js_test:rollback_index_build_start_abort] d20050| {"t":{"$date":"2025-04-16T21:48:15.272+00:00"},"s":"I", "c":"STORAGE", "id":7419402, "ctx":"IndexBuildsCoordinatorMongod-0","msg":"Index build: signaling primary to abort index build","attr":{"buildUUID":{"uuid":{"$uuid":"90e2da38-d6b4-4137-8abc-928b6500cdd9"}},"db":"test","collectionUUID":{"uuid":{"$uuid":"ab0fdb86-ca02-45af-87ee-efdf4aba07ba"}},"reason":{"code":11601,"codeName":"Interrupted","errmsg":"collection scan stopped. totalRecords: 1; durationMillis: 167ms; phase: collection scan; collectionScanPosition: RecordId(1); readSource: kNoTimestamp :: caused by :: operation was interrupted"}}} [js_test:rollback_index_build_start_abort] d20050| {"t":{"$date":"2025-04-16T21:48:15.272+00:00"},"s":"I", "c":"INDEX", "id":20449, "ctx":"conn108","msg":"Index build: failed","attr":{"buildUUID":{"uuid":{"$uuid":"90e2da38-d6b4-4137-8abc-928b6500cdd9"}},"error":{"code":11601,"codeName":"Interrupted","errmsg":"collection scan stopped. totalRecords: 1; durationMillis: 167ms; phase: collection scan; collectionScanPosition: RecordId(1); readSource: kNoTimestamp :: caused by :: operation was interrupted"}}} [js_test:rollback_index_build_start_abort] d20050| {"t":{"$date":"2025-04-16T21:48:15.273+00:00"},"s":"I", "c":"COMMAND", "id":21962, "ctx":"conn108","msg":"Assertion while executing command","attr":{"command":"createIndexes","db":"test","commandArgs":{"createIndexes":"coll_2","indexes":[{"key":{"a":1},"name":"a_1"}],"commitQuorum":1,"lsid":{"id":{"$uuid":"eaaa6c32-f731-4cff-8d85-44393a3c3b9b"}},"$clusterTime":{"clusterTime":{"$timestamp":{"t":1744840094,"i":2}},"signature":{"hash":{"$binary":{"base64":"AAAAAAAAAAAAAAAAAAAAAAAAAAA=","subType":"0"}},"keyId":0}},"$db":"test"},"error":"Interrupted: Index build failed: 90e2da38-d6b4-4137-8abc-928b6500cdd9: Collection test.coll_2 ( ab0fdb86-ca02-45af-87ee-efdf4aba07ba ) :: caused by :: collection scan stopped. totalRecords: 1; durationMillis: 167ms; phase: collection scan; collectionScanPosition: RecordId(1); readSource: kNoTimestamp :: caused by :: operation was interrupted"}} [js_test:rollback_index_build_start_abort] d20050| {"t":{"$date":"2025-04-16T21:48:15.273+00:00"},"s":"I", "c":"COMMAND", "id":51803, "ctx":"conn108","msg":"Slow query","attr":{"type":"command","isFromUserConnection":true,"ns":"test.coll_2","collectionType":"normal","appName":"MongoDB Shell","command":{"createIndexes":"coll_2","indexes":[{"key":{"a":1},"name":"a_1"}],"commitQuorum":1,"lsid":{"id":{"$uuid":"eaaa6c32-f731-4cff-8d85-44393a3c3b9b"}},"$clusterTime":{"clusterTime":{"$timestamp":{"t":1744840094,"i":2}},"signature":{"hash":{"$binary":{"base64":"AAAAAAAAAAAAAAAAAAAAAAAAAAA=","subType":"0"}},"keyId":0}},"$db":"test"},"numYields":0,"ok":0,"errMsg":"Index build failed: 90e2da38-d6b4-4137-8abc-928b6500cdd9: Collection test.coll_2 ( ab0fdb86-ca02-45af-87ee-efdf4aba07ba ) :: caused by :: collection scan stopped. totalRecords: 1; durationMillis: 167ms; phase: collection scan; collectionScanPosition: RecordId(1); readSource: kNoTimestamp :: caused by :: operation was interrupted","errName":"Interrupted","errCode":11601,"reslen":542,"locks":{"ReplicationStateTransition":{"acquireCount":{"w":5}},"Global":{"acquireCount":{"r":2,"w":3}},"Database":{"acquireCount":{"w":2}},"Collection":{"acquireCount":{"w":1,"W":1}}},"flowControl":{"acquireCount":3},"readConcern":{"level":"local","provenance":"implicitDefault"},"writeConcern":{"w":1,"wtimeout":0,"provenance":"customDefault"},"storage":{"timeWaitingMicros":{"storageExecutionMicros":633}},"cpuNanos":2823719,"remote":"10.122.7.78:53950","protocol":"op_msg","queues":{"ingress":{"admissions":1},"execution":{"admissions":6}},"workingMillis":192,"durationMillis":192}} [js_test:rollback_index_build_start_abort] d20050| {"t":{"$date":"2025-04-16T21:48:15.537+00:00"},"s":"I", "c":"COMMAND", "id":51803, "ctx":"conn105","msg":"Slow query","attr":{"type":"command","isFromUserConnection":true,"ns":"config.system.indexBuilds","collectionType":"system","appName":"/home/ubuntu/mongo/bazel-bin/install/bin/mongod","command":{"voteCommitIndexBuild":{"$uuid":"90e2da38-d6b4-4137-8abc-928b6500cdd9"},"hostAndPort":"ip-10-122-7-78:20041","writeConcern":{"w":"majority"},"$clusterTime":{"clusterTime":{"$timestamp":{"t":1744840095,"i":2}}},"$db":"admin"},"totalOplogSlotDurationMicros":251,"keysInserted":0,"keysDeleted":0,"numYields":0,"queryFramework":"classic","reslen":375,"locks":{"ReplicationStateTransition":{"acquireCount":{"w":2}},"Global":{"acquireCount":{"r":1,"w":1}},"Database":{"acquireCount":{"r":1,"w":1}},"Collection":{"acquireCount":{"r":1,"w":1}},"Mutex":{"acquireCount":{"r":1}}},"flowControl":{"acquireCount":1},"writeConcern":{"w":"majority","wtimeout":0,"provenance":"clientSupplied"},"waitForWriteConcernDurationMillis":394,"cpuNanos":3058542,"remote":"10.122.7.78:53920","protocol":"op_msg","queues":{"ingress":{"admissions":1},"execution":{"admissions":3}},"workingMillis":396,"interruptLatencyNanos":1362237,"durationMillis":396}} [js_test:rollback_index_build_start_abort] d20050| {"t":{"$date":"2025-04-16T21:48:15.772+00:00"},"s":"I", "c":"STORAGE", "id":20650, "ctx":"BackgroundSync","msg":"Active index builds","attr":{"context":"IndexBuildsCoordinator::stopIndexBuildsForRollback","builds":[{"buildUUID":{"uuid":{"$uuid":"90e2da38-d6b4-4137-8abc-928b6500cdd9"}},"collectionUUID":{"uuid":{"$uuid":"ab0fdb86-ca02-45af-87ee-efdf4aba07ba"}},"indexNames":["a_1"],"protocol":"two phase"}]}} [js_test:rollback_index_build_start_abort] d20050| {"t":{"$date":"2025-04-16T21:48:15.772+00:00"},"s":"I", "c":"STORAGE", "id":4656010, "ctx":"BackgroundSync","msg":"Attempting to abort index build","attr":{"buildUUID":{"uuid":{"$uuid":"90e2da38-d6b4-4137-8abc-928b6500cdd9"}}}} [js_test:rollback_index_build_start_abort] d20050| {"t":{"$date":"2025-04-16T21:48:16.773+00:00"},"s":"I", "c":"STORAGE", "id":4656010, "ctx":"BackgroundSync","msg":"Attempting to abort index build","attr":{"buildUUID":{"uuid":{"$uuid":"90e2da38-d6b4-4137-8abc-928b6500cdd9"}}}} [js_test:rollback_index_build_start_abort] d20050| {"t":{"$date":"2025-04-16T21:48:17.773+00:00"},"s":"I", "c":"STORAGE", "id":4656010, "ctx":"BackgroundSync","msg":"Attempting to abort index build","attr":{"buildUUID":{"uuid":{"$uuid":"90e2da38-d6b4-4137-8abc-928b6500cdd9"}}}} ... loops forever
It looks like this is happening:
- The index build is created with commitQuorum of 1
- Node d20050 is primary, but hasn't finished building the index because it is blocked by a failpoint
- Node d20051 finished building the index and votes on d20050 to commit
- The singular vote immediately satisfies the commit quorum, but doesn’t commit the index build because d20050 has not finished yet, because it is blocked
- The test aborts the index build with killOp, but this doesn't fully clean up the index build (why?)
- Rollback tries to kill the index build, but it can’t because it’s in the “commit quorum satisfied” state, and that doesn’t let it abort the index build.
- is depended on by
-
SERVER-103659 Prevent commitQuorum from being disabled
-
- Blocked
-