Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-76988

Abort the reshardCollection operation when the zone information is too large

    • Type: Icon: Bug Bug
    • Resolution: Fixed
    • Priority: Icon: Major - P3 Major - P3
    • 7.1.0-rc0, 6.0.7, 5.0.19, 7.0.0-rc3
    • Affects Version/s: None
    • Component/s: None
    • None
    • Sharding NYC
    • Fully Compatible
    • ALL
    • v7.0, v6.0, v5.0
    • Sharding NYC 2023-05-15, Sharding NYC 2023-05-29

      As part of the changes in SERVER-73848, the _commit phase of the resharding operation writes to the config.tags documents in the transaction in writePresistedState(). When the resharding operation has a large number of zones in config.tags, the transaction becomes "too large" and the server crashes. Change the behavior of _commit to abort the resharding operation when this error occurs.

       

      Logs
      [js_test:resharding_large_number_of_initial_chunks] c20042| 2023-04-19T14:24:26.308+00:00 I  TXN      51802   [ReshardingCoordinatorService-3] "transaction","attr":{"parameters":{"lsid":{"id":{"$uuid":"810b3027-5cff-4b39-810e-a01f575394c9"},"uid":{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}}},"txnNumber":44,"txnRetryCounter":0,"autocommit":false,"readConcern":{"provenance":"implicitDefault"}},"readTimestamp":"Timestamp(0, 0)","keysExamined":0,"docsExamined":0,"nreturned":0,"nBatches":6,"ninserted":6,"keysInserted":268528,"keysDeleted":268008,"durationMillis":5197,"terminationCause":"aborted","timeActiveMicros":5196624,"timeInactiveMicros":1190,"numYields":0,"locks":{"FeatureCompatibilityVersion":{"acquireCount":{"r":134004}},"ReplicationStateTransition":{"acquireCount":{"w":134004}},"Global":{"acquireCount":{"r":134004}},"Database":{"acquireCount":{"r":134004}},"Mutex":{"acquireCount":{"r":402017}}},"storage":{"data":{"bytesRead":13535288,"timeReadingMicros":3231},"timeWaitingMicros":{"cache":535550}},"wasPrepared":false}[js_test:resharding_large_number_of_initial_chunks] c20042| 2023-04-19T14:24:26.376+00:00 I  WRITE    51803   [ReshardingCoordinatorService-3] "Slow query","attr":{"type":"update","ns":"config.tags","command":{"q":{"ns":"db.system.resharding.447662a2-f100-4adb-be3b-7cc4c834c1a8"},"u":{"$set":{"ns":"db.foo"}},"hint":{"ns":1,"min":1},"multi":true,"upsert":false},"planSummary":"IXSCAN { ns: 1, min: 1 }","keysInserted":268008,"keysDeleted":268008,"numYields":0,"ok":0,"errMsg":"-31800: transaction is too large and will not fit in the storage engine 
cache","errName":"TransactionTooLargeForCache","errCode":388,"locks":{"FeatureCompatibilityVersion":{"acquireCount":{"r":134004}},"ReplicationStateTransition":{"acquireCount":{"w":134004}},"Global":{"acquireCount":{"r":134004}},"Database":{"acquireCount":{"r":134004}},"Mutex":{"acquireCount":{"r":402017}}},"flowControl":{"acquireCount":45},"storage":{"data":{"bytesRead":13535103,"timeReadingMicros":3224},"timeWaitingMicros":{"cache":535550}},"cpuNanos":5106992034,"durationMillis":5253}[js_test:resharding_large_number_of_initial_chunks] c20042| 2023-04-19T14:24:26.376+00:00 I  COMMAND  51803   [ReshardingCoordinatorService-3] "Slow query","attr":{"type":"command","ns":"config.$cmd","command":{"update":"tags","bypassDocumentValidation":false,"ordered":true,"updates":[{"q":{"ns":"db.system.resharding.447662a2-f100-4adb-be3b-7cc4c834c1a8"},"u":{"$set":{"ns":"db.foo"}},"hint":{"ns":1,"min":1},"multi":true,"upsert":false}],"autocommit":false,"txnNumber":44,"lsid":{"id":{"$uuid":"810b3027-5cff-4b39-810e-a01f575394c9"},"uid":{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}}},"$db":"config"},"planSummary":"COLLSCAN","planningTimeMicros":48,"keysExamined":0,"docsExamined":0,"fromPlanCache":true,"nBatches":1,"ninserted":1,"cursorExhausted":true,"keysInserted":88,"numYields":0,"nreturned":0,"queryHash":"E475932B","planCacheKey":"4D32A35E","queryFramework":"sbe","reslen":373,"locks":{"FeatureCompatibilityVersion":{"acquireCount":{"r":5100468,"w":47}},"ReplicationStateTransition":{"acquireCount":{"w":5100603}},"Global":{"acquireCount":{"r":5100468,"w":47}},"Database":{"acquireCount":{"r":5100468,"w":46}},"Collection":{"acquireCount":{"r":1,"w":178}},"Mutex":{"acquireCount":{"r":15302410}}},"flowControl":{"acquireCount":45},"storage":{},"cpuNanos":217602966481,"protocol":"op_msg","durationMillis":220978}[js_test:resharding_large_number_of_initial_chunks] c20042| 2023-04-19T14:24:26.377+00:00 F  RESHARD  5277000 [ReshardingCoordinatorService-3] 
"Unrecoverable error past the point resharding was guaranteed to succeed","attr":{"error":"TransactionTooLargeForCache: -31800: transaction is too large and will not fit in the storage engine cache"}[js_test:resharding_large_number_of_initial_chunks] c20042| 2023-04-19T14:24:26.377+00:00 F  ASSERT   23089   [ReshardingCoordinatorService-3] "Fatal assertion","attr":{"msgid":5277000,"file":"src/mongo/db/s/resharding/resharding_coordinator_service.cpp","line":1535}[js_test:resharding_large_number_of_initial_chunks] c20042| 2023-04-19T14:24:26.377+00:00 F  ASSERT   23090   [ReshardingCoordinatorService-3] "\n\n***aborting after fassert() failure\n\n"[js_test:resharding_large_number_of_initial_chunks] c20042| 2023-04-19T14:24:26.377+00:00 F  CONTROL  6384300 [ReshardingCoordinatorService-3] "Writing fatal message","attr":{"message":"\n"}[js_test:resharding_large_number_of_initial_chunks] c20042| 2023-04-19T14:24:26.377+00:00 F  CONTROL  6384300 [ReshardingCoordinatorService-3] "Writing fatal message","attr":{"message":"Got signal: 6 (Aborted).\n"}

      Link to full logs. The original version of the jstest resharding_large_number_of_initial_chunks.js which created 175,000 zones can be used to reproduce this error. 

            Assignee:
            cheahuychou.mao@mongodb.com Cheahuychou Mao
            Reporter:
            kruti.shah@mongodb.com Kruti Shah
            Votes:
            0 Vote for this issue
            Watchers:
            3 Start watching this issue

              Created:
              Updated:
              Resolved: