  Core Server / SERVER-86932

Reads to Shard Server fail with StaleConfig Error in PSA Deployments

    • Type: Bug
    • Resolution: Unresolved
    • Priority: Major - P3
    • Fix Version/s: None
    • Affects Version/s: 5.0.23
    • Component/s: None
    • Labels: None
    • Environment:
      Ubuntu
      MongoDB is deployed in Kubernetes
    • Assigned Teams: Catalog and Routing
    • Operating System: ALL
    • Steps To Reproduce:

      1) Set up a Shard Server as PSA (Primary-Secondary-Arbiter).

      2) Bring down the Secondary.

      3) Attempt read operations.

      If the issue is not reproducible, repeat step 3 after restarting the primary so that the cache is lost (a reproduction sketch follows below).
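      A minimal reproduction sketch in the mongo shell, following the steps above; the replica set name, host names, and ports are placeholders, and only the namespace and filter are taken from the logs in this ticket:

      // 1) Initiate the shard replica set as PSA (hypothetical hosts/ports).
      rs.initiate({
        _id: "shard1",
        members: [
          { _id: 0, host: "shard1-0:27018" },                    // data-bearing, expected Primary
          { _id: 1, host: "shard1-1:27018" },                    // data-bearing Secondary
          { _id: 2, host: "shard1-2:27018", arbiterOnly: true }  // Arbiter
        ]
      });
      // 2) Bring down the Secondary (e.g. stop the mongod on shard1-1).
      // 3) From mongos, attempt a read; in the failing case it returns StaleConfig (13388).
      db.getSiblingDB("testDB").collection1.find({ documentType: "menu" }).maxTimeMS(240000);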


      We are using MongoDB Version 5.0.23. 

      During our HA testing, we found that if the Shard Server loses its Secondary, leaving only a Primary and an Arbiter running, then reads to the Shard Server fail with a StaleConfig error (13388).

      Error Logs:

      {"t":
      {"$date":"2024-02-16T11:46:06.874+00:00"}
      ,"s":"I",  "c":"COMMAND",  "id":51803,   "ctx":"conn285","msg":"Slow query","attr":{"type":"command","ns":"testDB.$cmd","appName":"app1-bd6f5b6df-d59cv","command":{"find":"collection1","filter":
      {"documentType":"menu"}
      ,"maxTimeMS":240000,"readConcern":
      {"level":"local","provenance":"customDefault"}
      ,"shardVersion":[{"$timestamp":{"t":0,"i":0}},
      {"$oid":"000000000000000000000000"}
      ],"databaseVersion":{"uuid":
      {"$uuid":"8fbc8088-cae8-4dcf-95e4-1224a78eb18b"}
      ,"timestamp":{"$timestamp":{"t":1707289503,"i":4}},"lastMod":1},"clientOperationKey":
      {"$uuid":"6d6adabb-b826-4011-8fdf-d3759295a039"}
      ,"lsid":{"id":
      {"$uuid":"5b6c98f8-336b-4ac8-878b-b98537a71c53"}
      ,"uid":{"$binary":
      {"base64":"O0CMtIVItQN4IsEOsJdrPL8s7jv5xwh5a/A5Qfvs2A8=","subType":"0"}
      }},"maxTimeMSOpOnly":55544,"$clusterTime":{"clusterTime":{"$timestamp":{"t":1708083911,"i":20}},"signature":{"hash":{"$binary":{"base64":"M2DyFYhpqBXIjo9nUct5rJ962tk=","subType":"0"}},"keyId":7332751961713803278}},"$configTime":{"$timestamp":{"t":1708083402,"i":1}},"$topologyTime":{"$timestamp":{"t":1707289434,"i":1}},"$audit":{"$impersonatedUsers":[
      {"user":"admin","db":"admin"}
      ],"$impersonatedRoles":[
      {"role":"root","db":"admin"}
      ]},"$client":{"driver":
      {"name":"mongo-java-driver|sync","version":"3.12.11"}
      ,"os":
      {"type":"Linux","name":"Linux","architecture":"amd64","version":"5.14.0-362.18.1.el9_3.x86_64"}
      ,"platform":"Java/IcedTea/1.8.0_382-b05","application":
      {"name":"app1-bd6f5b6df-d59cv"}
      ,"mongos":{"host":"mongo-routerdb-1:27017","client":"127.0.0.6:39473","version":"5.0.23"}},"$configServerState":{"opTime":{"ts":{"$timestamp":{"t":1708083402,"i":1}},"t":-1}},"$db":"testDB"},"numYields":0,"ok":0,"errMsg":"sharding status of collection testDB.collection1 is not currently available for description and needs to be recovered from the config server","errName":"StaleConfig","errCode":13388,"reslen":654,"locks":{"ParallelBatchWriterMode":{"acquireCount":{"r":1}},"FeatureCompatibilityVersion":{"acquireCount":{"r":2}},"ReplicationStateTransition":{"acquireCount":{"w":1}},"Global":{"acquireCount":{"r":2}},"Database":{"acquireCount":{"r":1}},"Collection":{"acquireCount":{"r":1}},"Mutex":{"acquireCount":
      {"r":4}
      ,"acquireWaitCount":
      {"r":1}
      ,"timeAcquiringMicros":
      {"r":68}
      }},"readConcern":
      {"level":"local","provenance":"customDefault"}
      ,"remote":"127.0.0.6:37135","protocol":"op_msg","durationMillis":55599}}
      {"t":
      {"$date":"2024-02-17T09:08:47.235+00:00"}
      ,"s":"I",  "c":"COMMAND",  "id":51803,   "ctx":"conn656","msg":"Slow query","attr":{"type":"command","ns":"testDB.collection2.details","appName":"app1-6596859899-5267r","command":{"find":"collection2.details","maxTimeMS":240000,"readConcern":
      {"level":"local","provenance":"customDefault"}
      ,"shardVersion":[{"$timestamp":{"t":0,"i":0}},
      {"$oid":"000000000000000000000000"}
      ],"databaseVersion":{"uuid":
      {"$uuid":"8fbc8088-cae8-4dcf-95e4-1224a78eb18b"}
      ,"timestamp":{"$timestamp":{"t":1707289503,"i":4}},"lastMod":1},"clientOperationKey":
      {"$uuid":"08a84926-1491-46a0-a9b4-362ec36e9882"}
      ,"lsid":{"id":
      {"$uuid":"c1ab7ac6-7b61-4e08-91c9-08368b6a4770"}
      ,"uid":{"$binary":
      {"base64":"O0CMtIVItQN4IsEOsJdrPL8s7jv5xwh5a/A5Qfvs2A8=","subType":"0"}
      }},"maxTimeMSOpOnly":51505,"$clusterTime":{"clusterTime":{"$timestamp":{"t":1708160870,"i":1}},"signature":{"hash":{"$binary":{"base64":"1DTy2vcqBXrtZ6kFxwcY6IuzUCA=","subType":"0"}},"keyId":7332751961713803278}},"$configTime":{"$timestamp":{"t":1708160530,"i":1}},"$topologyTime":{"$timestamp":{"t":1707289434,"i":1}},"$audit":{"$impersonatedUsers":[
      {"user":"admin","db":"admin"}
      ],"$impersonatedRoles":[
      {"role":"root","db":"admin"}
      ]},"$client":{"driver":
      {"name":"mongo-java-driver|sync","version":"3.12.11"}
      ,"os":
      {"type":"Linux","name":"Linux","architecture":"amd64","version":"5.14.0-362.18.1.el9_3.x86_64"}
      ,"platform":"Java/IcedTea/1.8.0_382-b05","application":
      {"name":"app1-6596859899-5267r"}
      ,"mongos":{"host":"mongo-routerdb-0:27017","client":"127.0.0.6:34109","version":"5.0.23"}},"$configServerState":{"opTime":{"ts":{"$timestamp":{"t":1708160530,"i":1}},"t":-1}},"$db":"testDB"},"numYields":0,"ok":0,"errMsg":"sharding status of collection testDB.collection2.details is not currently known and needs to be recovered","errName":"StaleConfig","errCode":13388,"reslen":675,"locks":{"ParallelBatchWriterMode":{"acquireCount":{"r":1}},"FeatureCompatibilityVersion":{"acquireCount":{"r":2}},"ReplicationStateTransition":{"acquireCount":{"w":1}},"Global":{"acquireCount":{"r":2}},"Database":{"acquireCount":{"r":1}},"Collection":{"acquireCount":{"r":1}},"Mutex":{"acquireCount":
      {"r":8,"W":1}
      }},"readConcern":
      {"level":"local","provenance":"customDefault"}
      ,"remote":"127.0.0.6:36305","protocol":"op_msg","durationMillis":51554}}
      {"t":
      {"$date":"2024-02-17T09:11:17.785+00:00"}
      ,"s":"I",  "c":"SH_REFR",  "id":4619903, "ctx":"CatalogCache-84","msg":"Error refreshing cached collection","attr":{"namespace":"testDB.collection2","durationMillis":60083,"error":"WriteConcernFailed: waiting for replication timed out"}}
      {"t":
      {"$date":"2024-02-17T09:11:17.786+00:00"}
      ,"s":"I",  "c":"SHARDING", "id":22062,   "ctx":"conn733","msg":"Failed to refresh metadata for collection","attr":{"namespace":"testDB.collection2","error":"WriteConcernFailed: waiting for replication timed out"}}
      We have set up the Shard Servers across two data centers as a PSA deployment: one data center hosts the single data-bearing Secondary, and the other hosts the Primary and the Arbiter. Whenever we bring down the data center with the single Secondary, we encounter this issue.
      This issue occurs only in versions >= 5.0.23.
      A possible cause is https://jira.mongodb.org/browse/SERVER-78115.
      When we downgraded to 5.0.22, the issue was not reproducible. This led us to the release notes and to the assumption that SERVER-78115 might be the cause of this issue.
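      As a diagnostic sketch (not a documented workflow), the shard's routing-table refresh can be triggered by hand on the shard primary using the same internal command that appears in the timeout log further below; the namespace is just an example taken from those logs:

      // Run against the shard primary, not mongos. With the Secondary down this is
      // expected to fail the same way the background refresh does (WriteConcernFailed /
      // NetworkInterfaceExceededTimeLimit); with all data-bearing members up, or on
      // 5.0.22, it returns in milliseconds.
      db.adminCommand({ _flushRoutingTableCacheUpdates: "test.collection3", maxTimeMS: 30000 });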
      When all of the data centers were operational, refreshing the internal cache (Catalog Cache) took only milliseconds.
      {"t":
      {"$date":"2024-02-17T13:20:08.138+00:00"}
      ,"s":"I",  "c":"SH_REFR",  "id":4619902, "ctx":"CatalogCache-2","msg":"Collection has found to be unsharded after refresh","attr":{"namespace":"test.collection1","durationMillis":26}}
      {"t":
      {"$date":"2024-02-17T13:32:00.428+00:00"}
      ,"s":"I",  "c":"SH_REFR",  "id":4619902, "ctx":"CatalogCache-12","msg":"Collection has found to be unsharded after refresh","attr":{"namespace":"test.collection2","durationMillis":73}}
      When the data center with the Secondary goes down, the same operation fails with timeout errors.
      {"t":
      {"$date":"2024-02-17T13:37:31.248+00:00"}
      ,"s":"I",  "c":"SH_REFR",  "id":4619903, "ctx":"CatalogCache-15","msg":"Error refreshing cached collection","attr":{"namespace":"test.collection3","durationMillis":30001,"error":"NetworkInterfaceExceededTimeLimit: Request 9460 timed out, deadline was 2024-02-17T13:37:31.247+00:00, op was RemoteCommand 9460 – target:[mongo-shareddb-3.mongo-shareddb-service.namespace.svc.cluster.local:27017] db:admin expDate:2024-02-17T13:37:31.247+00:00 cmd:
      { _flushRoutingTableCacheUpdates: \"test.collection3\", maxTimeMS: 30000 }
      "}}
      {"t":
      {"$date":"2024-02-17T13:38:01.332+00:00"}
      ,"s":"I",  "c":"SH_REFR",  "id":4619903, "ctx":"CatalogCache-16","msg":"Error refreshing cached collection","attr":{"namespace":"test.collection3","durationMillis":1304,"error":"WriteConcernFailed: waiting for replication timed out"}}
      {"t":
      {"$date":"2024-02-17T13:45:01.894+00:00"}
      ,"s":"I",  "c":"SHARDING", "id":22062,   "ctx":"conn126","msg":"Failed to refresh metadata for collection","attr":{"namespace":"test.collection4","error":"MaxTimeMSExpired: operation exceeded time limit"}}
      {"t":
      {"$date":"2024-02-17T13:45:01.894+00:00"}
      ,"s":"W",  "c":"COMMAND",  "id":20525,   "ctx":"conn126","msg":"Failed to gather storage statistics for slow operation","attr":{"opId":86624,"error":"lock acquire timeout"}}
      {"t":
      {"$date":"2024-02-17T13:45:01.894+00:00"}
      ,"s":"I",  "c":"COMMAND",  "id":51803,   "ctx":"conn126","msg":"Slow query","attr":{"type":"command","ns":"test.$cmd","command":{"find":"collection4","filter":
      {"status":"Certificate fetch is in progress"}
      ,"shardVersion":{"t":{"$timestamp":{"t":0,"i":0}},"e":
      {"$oid":"000000000000000000000000"}
      ,"v":{"$timestamp":
      {"t":0,"i":0}
      }},"databaseVersion":{"uuid":
      {"$uuid":"8fbc8088-cae8-4dcf-95e4-1224a78eb18b"}
      ,"timestamp":{"$timestamp":{"t":1707289503,"i":4}},"lastMod":1},"batchSize":1,"singleBatch":true,"maxTimeMS":1000,"$readPreference":
      {"mode":"secondaryPreferred"}
      ,"readConcern":
      {"level":"local"}
      ,"$db":"test"},"numYields":0,"ok":0,"errMsg":"sharding status of collection test.collection4 is not currently available for description and needs to be recovered from the config server","errName":"StaleConfig","errCode":13388,"locks":{"ParallelBatchWriterMode":{"acquireCount":
      {"r":1}
      ,"acquireWaitCount":
      {"r":1}
      ,"timeAcquiringMicros":{"r":2946}},"FeatureCompatibilityVersion":{"acquireCount":{"r":2}},"ReplicationStateTransition":{"acquireCount":{"w":1}},"Global":{"acquireCount":{"r":2}},"Database":{"acquireCount":{"r":1}},"Collection":{"acquireCount":{"r":1}},"Mutex":{"acquireCount":
      {"r":7,"W":1}
      }},"readConcern":
      {"level":"local","provenance":"clientSupplied"}
      ,"remote":"127.0.0.6:58049","protocol":"op_msg","durationMillis":1001}}   

       

            Assignee:
            matt.panton@mongodb.com Matt Panton
            Reporter:
            raghu.chandrasekaran@appviewx.com Raghu Chandrasekaran
            Votes:
            0
            Watchers:
            6

              Created:
              Updated: