As part of the changes in SERVER-73848, the _commit phase of the resharding operation performs a write to the config.tags documents inside the transaction in writePresistedState(). When the resharding operation has a large number of zones in config.tags, the transaction becomes "too large" (TransactionTooLargeForCache) and the process crashes with a fatal assertion. Change the behavior of _commit to abort the resharding operation when this error occurs.
[js_test:resharding_large_number_of_initial_chunks] c20042| 2023-04-19T14:24:26.308+00:00 I TXN 51802 [ReshardingCoordinatorService-3] "transaction","attr":{"parameters":{"lsid":{"id":{"$uuid":"810b3027-5cff-4b39-810e-a01f575394c9"},"uid":{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}}},"txnNumber":44,"txnRetryCounter":0,"autocommit":false,"readConcern":{"provenance":"implicitDefault"}},"readTimestamp":"Timestamp(0, 0)","keysExamined":0,"docsExamined":0,"nreturned":0,"nBatches":6,"ninserted":6,"keysInserted":268528,"keysDeleted":268008,"durationMillis":5197,"terminationCause":"aborted","timeActiveMicros":5196624,"timeInactiveMicros":1190,"numYields":0,"locks":{"FeatureCompatibilityVersion":{"acquireCount":{"r":134004}},"ReplicationStateTransition":{"acquireCount":{"w":134004}},"Global":{"acquireCount":{"r":134004}},"Database":{"acquireCount":{"r":134004}},"Mutex":{"acquireCount":{"r":402017}}},"storage":{"data":{"bytesRead":13535288,"timeReadingMicros":3231},"timeWaitingMicros":{"cache":535550}},"wasPrepared":false}
[js_test:resharding_large_number_of_initial_chunks] c20042| 2023-04-19T14:24:26.376+00:00 I WRITE 51803 [ReshardingCoordinatorService-3] "Slow query","attr":{"type":"update","ns":"config.tags","command":{"q":{"ns":"db.system.resharding.447662a2-f100-4adb-be3b-7cc4c834c1a8"},"u":{"$set":{"ns":"db.foo"}},"hint":{"ns":1,"min":1},"multi":true,"upsert":false},"planSummary":"IXSCAN { ns: 1, min: 1 }","keysInserted":268008,"keysDeleted":268008,"numYields":0,"ok":0,"errMsg":"-31800: transaction is too large and will not fit in the storage engine cache","errName":"TransactionTooLargeForCache","errCode":388,"locks":{"FeatureCompatibilityVersion":{"acquireCount":{"r":134004}},"ReplicationStateTransition":{"acquireCount":{"w":134004}},"Global":{"acquireCount":{"r":134004}},"Database":{"acquireCount":{"r":134004}},"Mutex":{"acquireCount":{"r":402017}}},"flowControl":{"acquireCount":45},"storage":{"data":{"bytesRead":13535103,"timeReadingMicros":3224},"timeWaitingMicros":{"cache":535550}},"cpuNanos":5106992034,"durationMillis":5253}
[js_test:resharding_large_number_of_initial_chunks] c20042| 2023-04-19T14:24:26.376+00:00 I COMMAND 51803 [ReshardingCoordinatorService-3] "Slow query","attr":{"type":"command","ns":"config.$cmd","command":{"update":"tags","bypassDocumentValidation":false,"ordered":true,"updates":[{"q":{"ns":"db.system.resharding.447662a2-f100-4adb-be3b-7cc4c834c1a8"},"u":{"$set":{"ns":"db.foo"}},"hint":{"ns":1,"min":1},"multi":true,"upsert":false}],"autocommit":false,"txnNumber":44,"lsid":{"id":{"$uuid":"810b3027-5cff-4b39-810e-a01f575394c9"},"uid":{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}}},"$db":"config"},"planSummary":"COLLSCAN","planningTimeMicros":48,"keysExamined":0,"docsExamined":0,"fromPlanCache":true,"nBatches":1,"ninserted":1,"cursorExhausted":true,"keysInserted":88,"numYields":0,"nreturned":0,"queryHash":"E475932B","planCacheKey":"4D32A35E","queryFramework":"sbe","reslen":373,"locks":{"FeatureCompatibilityVersion":{"acquireCount":{"r":5100468,"w":47}},"ReplicationStateTransition":{"acquireCount":{"w":5100603}},"Global":{"acquireCount":{"r":5100468,"w":47}},"Database":{"acquireCount":{"r":5100468,"w":46}},"Collection":{"acquireCount":{"r":1,"w":178}},"Mutex":{"acquireCount":{"r":15302410}}},"flowControl":{"acquireCount":45},"storage":{},"cpuNanos":217602966481,"protocol":"op_msg","durationMillis":220978}
[js_test:resharding_large_number_of_initial_chunks] c20042| 2023-04-19T14:24:26.377+00:00 F RESHARD 5277000 [ReshardingCoordinatorService-3] "Unrecoverable error past the point resharding was guaranteed to succeed","attr":{"error":"TransactionTooLargeForCache: -31800: transaction is too large and will not fit in the storage engine cache"}
[js_test:resharding_large_number_of_initial_chunks] c20042| 2023-04-19T14:24:26.377+00:00 F ASSERT 23089 [ReshardingCoordinatorService-3] "Fatal assertion","attr":{"msgid":5277000,"file":"src/mongo/db/s/resharding/resharding_coordinator_service.cpp","line":1535}
[js_test:resharding_large_number_of_initial_chunks] c20042| 2023-04-19T14:24:26.377+00:00 F ASSERT 23090 [ReshardingCoordinatorService-3] "\n\n***aborting after fassert() failure\n\n"
[js_test:resharding_large_number_of_initial_chunks] c20042| 2023-04-19T14:24:26.377+00:00 F CONTROL 6384300 [ReshardingCoordinatorService-3] "Writing fatal message","attr":{"message":"\n"}
[js_test:resharding_large_number_of_initial_chunks] c20042| 2023-04-19T14:24:26.377+00:00 F CONTROL 6384300 [ReshardingCoordinatorService-3] "Writing fatal message","attr":{"message":"Got signal: 6 (Aborted).\n"}
Link to full logs. The original version of the jstest resharding_large_number_of_initial_chunks.js, which created 175,000 zones, can be used to reproduce this error.
- is related to
-
SERVER-77159 Do not start resharding operation if zone information is too large
- Backlog
-
SERVER-77140 config metadata format should use collection uuid instead of namespace string
- Closed
- related to
-
SERVER-77429 Renaming a sharded collection can hit transaction limit
- Backlog
-
SERVER-73848 Hashed shard keys with zones can cause issues with resharding
- Closed