-
Type:
Bug
-
Resolution: Fixed
-
Priority:
Major - P3
-
Affects Version/s: None
-
Component/s: None
-
Fully Compatible
-
v6.0
-
Sharding 2022-05-02, Sharding NYC 2022-05-16
-
None
-
3
-
None
-
None
-
None
-
None
-
None
-
None
The transaction API currently handles non-retryable commit errors by aborting the transaction. This is not safe if the transaction is going through two-phase commit. Consider a transaction started internally by a router (mongos0) and with two participant shards (shard0 and shard1). After shard0 (coordinator) has written down a commit decision for the transaction, mongos0 receives a killSession command.
- The opCtx running the transaction on mongos0 is interrupted but the TransactionRouter correctly identifies that coordinating commit has been handed off to shard0 so it does not implicitly abort the transaction.
- The opCtx for the API on mongos0 is interrupted so it issues an abortTransaction command against the mongos. The explicit abort causes the TransactionRouter to send the abortTransaction command to shard0 and shard1.
- There is a race between committing and aborting on the participant shards, leading to an inconsistent state where one participant shard committed the txn and the other aborted.
Logs:
[j0:s0] | 2022-04-28T18:40:29.283+00:00 I TXN 51805 [Sharding-Fixed-2] "transaction","attr":{"parameters":{"lsid":{"id":{"$uuid":"a99bdda1-9007-4540-9f06-eec84e7f2807"},"uid":{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}},"txnNumber":4,"txnUUID":{"$uuid":"074df197-e580-4f0b-869f-2dda2e3a092b"}},"txnNumber":0,"txnRetryCounter":0,"autocommit":false,"readConcern":{"level":"local"}},"numParticipants":2,"coordinator":"shard-rs0","terminationCause":"aborted","abortCause":"Interrupted","commitType":"twoPhaseCommit","commitDurationMicros":47288,"timeActiveMicros":57584,"timeInactiveMicros":528,"durationMillis":58} [j0:s0] | 2022-04-28T18:40:29.283+00:00 D3 TXN 22896 [Sharding-Fixed-2] "Not sending implicit abortTransaction to participant shards after error because coordinating the commit decision may have been handed off to the coordinator shard","attr":{"sessionId":{"id":{"$uuid":"a99bdda1-9007-4540-9f06-eec84e7f2807"},"uid":{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}},"txnNumber":4,"txnUUID":{"$uuid":"074df197-e580-4f0b-869f-2dda2e3a092b"}},"txnNumber":0,"txnRetryCounter":0,"error":"Interrupted: Transaction was aborted :: caused by :: operation was interrupted"} [j0:s0] | 2022-04-28T18:40:29.284+00:00 D3 TXN 5875905 [Sharding-Fixed-2] "Internal transaction handling error","attr":{"error":{"code":11601,"codeName":"Interrupted","errmsg":"Command error committing internal transaction :: caused by :: Transaction was aborted :: caused by :: operation was interrupted"},"hasTransientTransactionErrorLabel":false,"txnInfo":{"execContext":"client retryable write","sessionInfo":{"lsid":{"id":{"$uuid":"a99bdda1-9007-4540-9f06-eec84e7f2807"},"uid":{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}},"txnNumber":4,"txnUUID":{"$uuid":"074df197-e580-4f0b-869f-2dda2e3a092b"}},"txnNumber":0,"autocommit":false},"state":"started commit"},"retriesLeft":10} [j0:s0] | 2022-04-28T18:40:29.284+00:00 I TXN 5918600 [Sharding-Fixed-2] "Chose internal transaction error handling step","attr":{"nextStep":"abort and do not retry","txnInfo":{"execContext":"client retryable write","sessionInfo":{"lsid":{"id":{"$uuid":"a99bdda1-9007-4540-9f06-eec84e7f2807"},"uid":{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}},"txnNumber":4,"txnUUID":{"$uuid":"074df197-e580-4f0b-869f-2dda2e3a092b"}},"txnNumber":0,"autocommit":false},"state":"started commit"}} [j0:s0] | 2022-04-28T18:40:29.284+00:00 D3 TXN 22895 [Sharding-Fixed-2] "Aborting transaction on all participant shards","attr":{"sessionId":{"id":{"$uuid":"a99bdda1-9007-4540-9f06-eec84e7f2807"},"uid":{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}},"txnNumber":4,"txnUUID":{"$uuid":"074df197-e580-4f0b-869f-2dda2e3a092b"}},"txnNumber":0,"txnRetryCounter":0,"numParticipantShards":2} [j0:s0:n0] | 2022-04-28T18:40:29.283+00:00 D3 TXN 22481 [Sharding-Fixed-1] "Coordinator going to send command to shard","attr":{"sessionId":{"id":{"$uuid":"a99bdda1-9007-4540-9f06-eec84e7f2807"},"uid":{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}},"txnNumber":4,"txnUUID":{"$uuid":"074df197-e580-4f0b-869f-2dda2e3a092b"}},"txnNumberAndRetryCounter":{"txnNumber":0,"txnRetryCounter":0},"command":{"commitTransaction":1,"commitTimestamp":{"$timestamp":{"t":1651171229,"i":64}},"lsid":{"id":{"$uuid":"a99bdda1-9007-4540-9f06-eec84e7f2807"},"uid":{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}},"txnNumber":4,"txnUUID":{"$uuid":"074df197-e580-4f0b-869f-2dda2e3a092b"}},"txnNumber":0,"autocommit":false,"writeConcern":{"w":"majority"}},"localOrRemote":"local","shardId":"shard-rs0"} [j0:s0:n0] | 2022-04-28T18:40:29.283+00:00 D3 TXN 22481 [Sharding-Fixed-1] "Coordinator going to send command to shard","attr":{"sessionId":{"id":{"$uuid":"a99bdda1-9007-4540-9f06-eec84e7f2807"},"uid":{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}},"txnNumber":4,"txnUUID":{"$uuid":"074df197-e580-4f0b-869f-2dda2e3a092b"}},"txnNumberAndRetryCounter":{"txnNumber":0,"txnRetryCounter":0},"command":{"commitTransaction":1,"commitTimestamp":{"$timestamp":{"t":1651171229,"i":64}},"lsid":{"id":{"$uuid":"a99bdda1-9007-4540-9f06-eec84e7f2807"},"uid":{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}},"txnNumber":4,"txnUUID":{"$uuid":"074df197-e580-4f0b-869f-2dda2e3a092b"}},"txnNumber":0,"autocommit":false,"writeConcern":{"w":"majority"}},"localOrRemote":"remote","shardId":"shard-rs1"} [j0:s0:n0] | 2022-04-28T18:40:29.284+00:00 D3 TXN 20507 [TransactionCoordinator] "Received commitTransaction","attr":{"txnNumberAndRetryCounter":{"txnNumber":0,"txnRetryCounter":0},"sessionId":{"id":{"$uuid":"a99bdda1-9007-4540-9f06-eec84e7f2807"},"uid":{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}},"txnNumber":4,"txnUUID":{"$uuid":"074df197-e580-4f0b-869f-2dda2e3a092b"}}} [j0:s0:n0] | 2022-04-28T18:40:29.284+00:00 I TXN 51802 [TransactionCoordinator] "transaction","attr":{"parameters":{"lsid":{"id":{"$uuid":"a99bdda1-9007-4540-9f06-eec84e7f2807"},"uid":{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}},"txnNumber":4,"txnUUID":{"$uuid":"074df197-e580-4f0b-869f-2dda2e3a092b"}},"txnNumber":0,"txnRetryCounter":0,"autocommit":false,"readConcern":{"level":"local","provenance":"clientSupplied"}},"readTimestamp":"Timestamp(0, 0)","ninserted":1,"keysInserted":2,"terminationCause":"committed","timeActiveMicros":976,"timeInactiveMicros":52453,"numYields":0,"locks":{"ReplicationStateTransition":{"acquireCount":{"w":10}},"Global":{"acquireCount":{"r":1,"w":4}},"Database":{"acquireCount":{"w":4}},"Collection":{"acquireCount":{"w":4}},"Mutex":{"acquireCount":{"r":8}}},"storage":{},"wasPrepared":true,"totalPreparedDurationMicros":32891,"prepareOpTime":{"ts":{"$timestamp":{"t":1651171229,"i":63}},"t":3},"durationMillis":53} [j0:s1:n1] | 2022-04-28T18:40:29.284+00:00 D3 TXN 20508 [conn114] "Received abortTransaction","attr":{"txnNumberAndRetryCounter":{"txnNumber":0,"txnRetryCounter":0},"sessionId":{"id":{"$uuid":"a99bdda1-9007-4540-9f06-eec84e7f2807"},"uid":{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}},"txnNumber":4,"txnUUID":{"$uuid":"074df197-e580-4f0b-869f-2dda2e3a092b"}}}[j0:s1:n1] | 2022-04-28T18:40:29.285+00:00 I TXN 51802 [conn114] "transaction","attr":{"parameters":{"lsid":{"id":{"$uuid":"a99bdda1-9007-4540-9f06-eec84e7f2807"},"uid":{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}},"txnNumber":4,"txnUUID":{"$uuid":"074df197-e580-4f0b-869f-2dda2e3a092b"}},"txnNumber":0,"txnRetryCounter":0,"autocommit":false,"readConcern":{"level":"local","provenance":"clientSupplied"}},"readTimestamp":"Timestamp(0, 0)","ninserted":1,"keysInserted":2,"terminationCause":"aborted","timeActiveMicros":860,"timeInactiveMicros":48897,"numYields":0,"locks":{"ReplicationStateTransition":{"acquireCount":{"w":9}},"Global":{"acquireCount":{"r":1,"w":3}},"Database":{"acquireCount":{"w":3}},"Collection":{"acquireCount":{"w":3}},"Mutex":{"acquireCount":{"r":6}}},"storage":{},"wasPrepared":true,"totalPreparedDurationMicros":31900,"prepareOpTime":{"ts":{"$timestamp":{"t":1651171229,"i":64}},"t":3},"durationMillis":49} [j0:s0:n0] | 2022-04-28T18:40:29.286+00:00 D3 TXN 20508 [conn28] "Received abortTransaction","attr":{"txnNumberAndRetryCounter":{"txnNumber":0,"txnRetryCounter":0},"sessionId":{"id":{"$uuid":"a99bdda1-9007-4540-9f06-eec84e7f2807"},"uid":{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}},"txnNumber":4,"txnUUID":{"$uuid":"074df197-e580-4f0b-869f-2dda2e3a092b"}}}