Type: Bug
Resolution: Unresolved
Priority: Major - P3
Affects Version/s: 5.0.23
Component/s: None
Environment: Ubuntu; MongoDB is deployed in Kubernetes
Catalog and Routing
ALL
We are using MongoDB version 5.0.23. During our HA testing, we found that if a Shard Server loses its Secondary, so that only the Primary and the Arbiter are left running, reads against that Shard Server fail with the error StaleConfig (13388).
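For reference, below is a minimal sketch of the kind of read that hits this error, assuming the same stack that appears in the logs (mongo-java-driver 3.12.x sync API, database testDB, collection collection1, filter on documentType). The connection string and class name are illustrative placeholders, not our actual application code.

import java.util.concurrent.TimeUnit;

import org.bson.Document;

import com.mongodb.MongoException;
import com.mongodb.ReadConcern;
import com.mongodb.client.MongoClient;
import com.mongodb.client.MongoClients;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.model.Filters;

public class StaleConfigReadSketch {
    public static void main(String[] args) {
        // Placeholder mongos address; in our deployment the routers are mongo-routerdb-0/1 in Kubernetes.
        try (MongoClient client = MongoClients.create("mongodb://mongo-routerdb-0:27017")) {
            MongoCollection<Document> coll = client
                    .getDatabase("testDB")
                    .getCollection("collection1")
                    .withReadConcern(ReadConcern.LOCAL); // readConcern {level: "local"}, as in the logs

            try {
                // Same shape as the slow query in the logs: a find with a simple filter and maxTimeMS.
                Document doc = coll.find(Filters.eq("documentType", "menu"))
                        .maxTime(240_000, TimeUnit.MILLISECONDS)
                        .first();
                System.out.println("Read succeeded: " + doc);
            } catch (MongoException e) {
                // With only Primary + Arbiter left on the shard, this surfaces as StaleConfig, code 13388.
                System.err.println("Read failed with code " + e.getCode() + ": " + e.getMessage());
            }
        }
    }
}

Nothing about the query changes between the healthy and degraded runs; the only variable in our tests is the loss of the data center that hosts the Secondary.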
Error Logs:
{"t": {"$date":"2024-02-16T11:46:06.874+00:00"} ,"s":"I", "c":"COMMAND", "id":51803, "ctx":"conn285","msg":"Slow query","attr":{"type":"command","ns":"testDB.$cmd","appName":"app1-bd6f5b6df-d59cv","command":{"find":"collection1","filter": {"documentType":"menu"} ,"maxTimeMS":240000,"readConcern": {"level":"local","provenance":"customDefault"} ,"shardVersion":[{"$timestamp":{"t":0,"i":0}}, {"$oid":"000000000000000000000000"} ],"databaseVersion":{"uuid": {"$uuid":"8fbc8088-cae8-4dcf-95e4-1224a78eb18b"} ,"timestamp":{"$timestamp":{"t":1707289503,"i":4}},"lastMod":1},"clientOperationKey": {"$uuid":"6d6adabb-b826-4011-8fdf-d3759295a039"} ,"lsid":{"id": {"$uuid":"5b6c98f8-336b-4ac8-878b-b98537a71c53"} ,"uid":{"$binary": {"base64":"O0CMtIVItQN4IsEOsJdrPL8s7jv5xwh5a/A5Qfvs2A8=","subType":"0"} }},"maxTimeMSOpOnly":55544,"$clusterTime":{"clusterTime":{"$timestamp":{"t":1708083911,"i":20}},"signature":{"hash":{"$binary":{"base64":"M2DyFYhpqBXIjo9nUct5rJ962tk=","subType":"0"}},"keyId":7332751961713803278}},"$configTime":{"$timestamp":{"t":1708083402,"i":1}},"$topologyTime":{"$timestamp":{"t":1707289434,"i":1}},"$audit":{"$impersonatedUsers":[ {"user":"admin","db":"admin"} ],"$impersonatedRoles":[ {"role":"root","db":"admin"} ]},"$client":{"driver": {"name":"mongo-java-driver|sync","version":"3.12.11"} ,"os": {"type":"Linux","name":"Linux","architecture":"amd64","version":"5.14.0-362.18.1.el9_3.x86_64"} ,"platform":"Java/IcedTea/1.8.0_382-b05","application": {"name":"app1-bd6f5b6df-d59cv"} ,"mongos":{"host":"mongo-routerdb-1:27017","client":"127.0.0.6:39473","version":"5.0.23"}},"$configServerState":{"opTime":{"ts":{"$timestamp":{"t":1708083402,"i":1}},"t":-1}},"$db":"testDB"},"numYields":0,"ok":0,"errMsg":"sharding status of collection testDB.collection1 is not currently available for description and needs to be recovered from the config server","errName":"StaleConfig","errCode":13388,"reslen":654,"locks":{"ParallelBatchWriterMode":{"acquireCount":{"r":1}},"FeatureCompatibilityVersion":{"acquireCount":{"r":2}},"ReplicationStateTransition":{"acquireCount":{"w":1}},"Global":{"acquireCount":{"r":2}},"Database":{"acquireCount":{"r":1}},"Collection":{"acquireCount":{"r":1}},"Mutex":{"acquireCount": {"r":4} ,"acquireWaitCount": {"r":1} ,"timeAcquiringMicros": {"r":68} }},"readConcern": {"level":"local","provenance":"customDefault"} ,"remote":"127.0.0.6:37135","protocol":"op_msg","durationMillis":55599}} {"t": {"$date":"2024-02-17T09:08:47.235+00:00"} ,"s":"I", "c":"COMMAND", "id":51803, "ctx":"conn656","msg":"Slow query","attr":{"type":"command","ns":"testDB.collection2.details","appName":"app1-6596859899-5267r","command":{"find":"collection2.details","maxTimeMS":240000,"readConcern": {"level":"local","provenance":"customDefault"} ,"shardVersion":[{"$timestamp":{"t":0,"i":0}}, {"$oid":"000000000000000000000000"} ],"databaseVersion":{"uuid": {"$uuid":"8fbc8088-cae8-4dcf-95e4-1224a78eb18b"} ,"timestamp":{"$timestamp":{"t":1707289503,"i":4}},"lastMod":1},"clientOperationKey": {"$uuid":"08a84926-1491-46a0-a9b4-362ec36e9882"} ,"lsid":{"id": {"$uuid":"c1ab7ac6-7b61-4e08-91c9-08368b6a4770"} ,"uid":{"$binary": {"base64":"O0CMtIVItQN4IsEOsJdrPL8s7jv5xwh5a/A5Qfvs2A8=","subType":"0"} 
}},"maxTimeMSOpOnly":51505,"$clusterTime":{"clusterTime":{"$timestamp":{"t":1708160870,"i":1}},"signature":{"hash":{"$binary":{"base64":"1DTy2vcqBXrtZ6kFxwcY6IuzUCA=","subType":"0"}},"keyId":7332751961713803278}},"$configTime":{"$timestamp":{"t":1708160530,"i":1}},"$topologyTime":{"$timestamp":{"t":1707289434,"i":1}},"$audit":{"$impersonatedUsers":[ {"user":"admin","db":"admin"} ],"$impersonatedRoles":[ {"role":"root","db":"admin"} ]},"$client":{"driver": {"name":"mongo-java-driver|sync","version":"3.12.11"} ,"os": {"type":"Linux","name":"Linux","architecture":"amd64","version":"5.14.0-362.18.1.el9_3.x86_64"} ,"platform":"Java/IcedTea/1.8.0_382-b05","application": {"name":"app1-6596859899-5267r"} ,"mongos":{"host":"mongo-routerdb-0:27017","client":"127.0.0.6:34109","version":"5.0.23"}},"$configServerState":{"opTime":{"ts":{"$timestamp":{"t":1708160530,"i":1}},"t":-1}},"$db":"testDB"},"numYields":0,"ok":0,"errMsg":"sharding status of collection testDB.collection2.details is not currently known and needs to be recovered","errName":"StaleConfig","errCode":13388,"reslen":675,"locks":{"ParallelBatchWriterMode":{"acquireCount":{"r":1}},"FeatureCompatibilityVersion":{"acquireCount":{"r":2}},"ReplicationStateTransition":{"acquireCount":{"w":1}},"Global":{"acquireCount":{"r":2}},"Database":{"acquireCount":{"r":1}},"Collection":{"acquireCount":{"r":1}},"Mutex":{"acquireCount": {"r":8,"W":1} }},"readConcern": {"level":"local","provenance":"customDefault"} ,"remote":"127.0.0.6:36305","protocol":"op_msg","durationMillis":51554}} {"t": {"$date":"2024-02-17T09:11:17.785+00:00"} ,"s":"I", "c":"SH_REFR", "id":4619903, "ctx":"CatalogCache-84","msg":"Error refreshing cached collection","attr":{"namespace":"testDB.collection2","durationMillis":60083,"error":"WriteConcernFailed: waiting for replication timed out"}} {"t": {"$date":"2024-02-17T09:11:17.786+00:00"} ,"s":"I", "c":"SHARDING", "id":22062, "ctx":"conn733","msg":"Failed to refresh metadata for collection","attr":{"namespace":"testDB.collection2","error":"WriteConcernFailed: waiting for replication timed out"}} We have set up Shard Servers in two data centers as a PSA deployment with one data center having a data bearing member and the other having two (Primary and Arbiter). Whenever we bring down the data center with the single secondary member, we encounter this issue. This issue is occurring only in versions >= 5.0.23. A possible reason for this issue might be https://jira.mongodb.org/browse/SERVER-78115. When we downgraded the version to 5.0.22 the issue was not reproducible. This led us to the release notes and assumption that the ticket SERVER-78115 might be the reason for this issue. When all of the data centers were operational, refreshing the internal cache (Catalog Cache) took only milliseconds. {"t": {"$date":"2024-02-17T13:20:08.138+00:00"} ,"s":"I", "c":"SH_REFR", "id":4619902, "ctx":"CatalogCache-2","msg":"Collection has found to be unsharded after refresh","attr":{"namespace":"test.collection1","durationMillis":26}} {"t": {"$date":"2024-02-17T13:32:00.428+00:00"} ,"s":"I", "c":"SH_REFR", "id":4619902, "ctx":"CatalogCache-12","msg":"Collection has found to be unsharded after refresh","attr":{"namespace":"test.collection2","durationMillis":73}} When the data center with the secondary go down, the same operation fails with timeout errors. 
{"t": {"$date":"2024-02-17T13:37:31.248+00:00"} ,"s":"I", "c":"SH_REFR", "id":4619903, "ctx":"CatalogCache-15","msg":"Error refreshing cached collection","attr":{"namespace":"test.collection3","durationMillis":30001,"error":"NetworkInterfaceExceededTimeLimit: Request 9460 timed out, deadline was 2024-02-17T13:37:31.247+00:00, op was RemoteCommand 9460 – target:[mongo-shareddb-3.mongo-shareddb-service.namespace.svc.cluster.local:27017] db:admin expDate:2024-02-17T13:37:31.247+00:00 cmd: { _flushRoutingTableCacheUpdates: \"test.collection3\", maxTimeMS: 30000 } "}} {"t": {"$date":"2024-02-17T13:38:01.332+00:00"} ,"s":"I", "c":"SH_REFR", "id":4619903, "ctx":"CatalogCache-16","msg":"Error refreshing cached collection","attr":{"namespace":"test.collection3","durationMillis":1304,"error":"WriteConcernFailed: waiting for replication timed out"}} {"t": {"$date":"2024-02-17T13:45:01.894+00:00"} ,"s":"I", "c":"SHARDING", "id":22062, "ctx":"conn126","msg":"Failed to refresh metadata for collection","attr":{"namespace":"test.collection4","error":"MaxTimeMSExpired: operation exceeded time limit"}} {"t": {"$date":"2024-02-17T13:45:01.894+00:00"} ,"s":"W", "c":"COMMAND", "id":20525, "ctx":"conn126","msg":"Failed to gather storage statistics for slow operation","attr":{"opId":86624,"error":"lock acquire timeout"}} {"t": {"$date":"2024-02-17T13:45:01.894+00:00"} ,"s":"I", "c":"COMMAND", "id":51803, "ctx":"conn126","msg":"Slow query","attr":{"type":"command","ns":"test.$cmd","command":{"find":"collection4","filter": {"status":"Certificate fetch is in progress"} ,"shardVersion":{"t":{"$timestamp":{"t":0,"i":0}},"e": {"$oid":"000000000000000000000000"} ,"v":{"$timestamp": {"t":0,"i":0} }},"databaseVersion":{"uuid": {"$uuid":"8fbc8088-cae8-4dcf-95e4-1224a78eb18b"} ,"timestamp":{"$timestamp":{"t":1707289503,"i":4}},"lastMod":1},"batchSize":1,"singleBatch":true,"maxTimeMS":1000,"$readPreference": {"mode":"secondaryPreferred"} ,"readConcern": {"level":"local"} ,"$db":"test"},"numYields":0,"ok":0,"errMsg":"sharding status of collection test.collection4 is not currently available for description and needs to be recovered from the config server","errName":"StaleConfig","errCode":13388,"locks":{"ParallelBatchWriterMode":{"acquireCount": {"r":1} ,"acquireWaitCount": {"r":1} ,"timeAcquiringMicros":{"r":2946}},"FeatureCompatibilityVersion":{"acquireCount":{"r":2}},"ReplicationStateTransition":{"acquireCount":{"w":1}},"Global":{"acquireCount":{"r":2}},"Database":{"acquireCount":{"r":1}},"Collection":{"acquireCount":{"r":1}},"Mutex":{"acquireCount": {"r":7,"W":1} }},"readConcern": {"level":"local","provenance":"clientSupplied"} ,"remote":"127.0.0.6:58049","protocol":"op_msg","durationMillis":1001}}