Details
- Type: Bug
- Resolution: Gone away
- Priority: Major - P3
- Fix Version/s: None
- Affects Version/s: 4.2.5
- Labels: None
- Environment: None
- Component/s: Service Arch
- Operating System: ALL
- (copied to CRM)
Description
We recently upgraded our 3.4 replica set to 4.2.5 and hit an issue tonight.
One of the replica set members (db3) suddenly stopped replicating. All data on it was stale, and its optime showed it lagging 9+ hours behind the primary.
The last meaningful log messages on db3 were:

2020-04-14T22:32:57.641+0000 I  CONNPOOL [ReplicaSetMonitor-TaskExecutor] Dropping all pooled connections to db3:27017 due to ShutdownInProgress: Pool for db3:27017 has expired.
2020-04-14T22:32:57.641+0000 I  CONNPOOL [ReplicaSetMonitor-TaskExecutor] Dropping all pooled connections to db1:27017 due to ShutdownInProgress: Pool for db1:27017 has expired.
2020-04-14T22:32:57.641+0000 I  CONNPOOL [ReplicaSetMonitor-TaskExecutor] Dropping all pooled connections to db2:27017 due to ShutdownInProgress: Pool for db2:27017 has expired.
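When a secondary stalls like this, a common first-line workaround (an assumption for illustration; the report does not say what, if anything, was attempted) is to force a sync-source change on the affected member from the mongo shell:

// On the stalled member (db3): rs.syncFrom() issues replSetSyncFrom and
// temporarily overrides automatic sync-source selection. The override lasts
// until the mongod restarts or the target becomes unavailable. db2:27017 is
// an arbitrary healthy member, chosen here purely for illustration.
rs.syncFrom("db2:27017")

// If the optime still does not advance, restarting the mongod process on the
// member is the usual next step; it restarts sync-source selection from scratch.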
|
Replica set status as seen from the primary (db1) at the time of the incident:
{
    "set" : "***",
    "date" : ISODate("2020-04-15T08:26:26.804Z"),
    "myState" : 1,
    "term" : NumberLong(22),
    "syncingTo" : "",
    "syncSourceHost" : "",
    "syncSourceId" : -1,
    "heartbeatIntervalMillis" : NumberLong(2000),
    "majorityVoteCount" : 3,
    "writeMajorityCount" : 3,
    "optimes" : {
        "lastCommittedOpTime" : {
            "ts" : Timestamp(1586939186, 813),
            "t" : NumberLong(22)
        },
        "lastCommittedWallTime" : ISODate("2020-04-15T08:26:26.798Z"),
        "readConcernMajorityOpTime" : {
            "ts" : Timestamp(1586939186, 813),
            "t" : NumberLong(22)
        },
        "readConcernMajorityWallTime" : ISODate("2020-04-15T08:26:26.798Z"),
        "appliedOpTime" : {
            "ts" : Timestamp(1586939186, 813),
            "t" : NumberLong(22)
        },
        "durableOpTime" : {
            "ts" : Timestamp(1586939186, 813),
            "t" : NumberLong(22)
        },
        "lastAppliedWallTime" : ISODate("2020-04-15T08:26:26.798Z"),
        "lastDurableWallTime" : ISODate("2020-04-15T08:26:26.798Z")
    },
    "lastStableRecoveryTimestamp" : Timestamp(1586939153, 301),
    "lastStableCheckpointTimestamp" : Timestamp(1586939153, 301),
    "electionCandidateMetrics" : {
        "lastElectionReason" : "priorityTakeover",
        "lastElectionDate" : ISODate("2020-04-07T06:22:02.485Z"),
        "electionTerm" : NumberLong(22),
        "lastCommittedOpTimeAtElection" : {
            "ts" : Timestamp(1586240522, 526),
            "t" : NumberLong(21)
        },
        "lastSeenOpTimeAtElection" : {
            "ts" : Timestamp(1586240522, 526),
            "t" : NumberLong(21)
        },
        "numVotesNeeded" : 3,
        "priorityAtElection" : 2,
        "electionTimeoutMillis" : NumberLong(10000),
        "priorPrimaryMemberId" : 9,
        "numCatchUpOps" : NumberLong(4),
        "newTermStartDate" : ISODate("2020-04-07T06:22:02.539Z"),
        "wMajorityWriteAvailabilityDate" : ISODate("2020-04-07T06:22:03.616Z")
    },
    "members" : [
        {
            "_id" : 7,
            "name" : "db4:27017",
            "health" : 1,
            "state" : 2,
            "stateStr" : "SECONDARY",
            "uptime" : 748567,
            "optime" : {
                "ts" : Timestamp(1586939185, 78),
                "t" : NumberLong(22)
            },
            "optimeDurable" : {
                "ts" : Timestamp(1586939185, 78),
                "t" : NumberLong(22)
            },
            "optimeDate" : ISODate("2020-04-15T08:26:25Z"),
            "optimeDurableDate" : ISODate("2020-04-15T08:26:25Z"),
            "lastHeartbeat" : ISODate("2020-04-15T08:26:25.058Z"),
            "lastHeartbeatRecv" : ISODate("2020-04-15T08:26:25.034Z"),
            "pingMs" : NumberLong(0),
            "lastHeartbeatMessage" : "",
            "syncingTo" : "db1:27017",
            "syncSourceHost" : "db1:27017",
            "syncSourceId" : 8,
            "infoMessage" : "",
            "configVersion" : 128
        },
        {
            "_id" : 8,
            "name" : "db1:27017",
            "health" : 1,
            "state" : 1,
            "stateStr" : "PRIMARY",
            "uptime" : 751010,
            "optime" : {
                "ts" : Timestamp(1586939186, 813),
                "t" : NumberLong(22)
            },
            "optimeDate" : ISODate("2020-04-15T08:26:26Z"),
            "syncingTo" : "",
            "syncSourceHost" : "",
            "syncSourceId" : -1,
            "infoMessage" : "",
            "electionTime" : Timestamp(1586240522, 538),
            "electionDate" : ISODate("2020-04-07T06:22:02Z"),
            "configVersion" : 128,
            "self" : true,
            "lastHeartbeatMessage" : ""
        },
        {
            "_id" : 9,
            "name" : "db2:27017",
            "health" : 1,
            "state" : 2,
            "stateStr" : "SECONDARY",
            "uptime" : 751002,
            "optime" : {
                "ts" : Timestamp(1586939185, 995),
                "t" : NumberLong(22)
            },
            "optimeDurable" : {
                "ts" : Timestamp(1586939185, 991),
                "t" : NumberLong(22)
            },
            "optimeDate" : ISODate("2020-04-15T08:26:25Z"),
            "optimeDurableDate" : ISODate("2020-04-15T08:26:25Z"),
            "lastHeartbeat" : ISODate("2020-04-15T08:26:25.969Z"),
            "lastHeartbeatRecv" : ISODate("2020-04-15T08:26:24.893Z"),
            "pingMs" : NumberLong(0),
            "lastHeartbeatMessage" : "",
            "syncingTo" : "db1:27017",
            "syncSourceHost" : "db1:27017",
            "syncSourceId" : 8,
            "infoMessage" : "",
            "configVersion" : 128
        },
        {
            "_id" : 10,
            "name" : "db3:27017",
            "health" : 1,
            "state" : 2,
            "stateStr" : "SECONDARY",
            "uptime" : 751002,
            "optime" : {
                "ts" : Timestamp(1586903526, 1069),
                "t" : NumberLong(22)
            },
            "optimeDurable" : {
                "ts" : Timestamp(1586903526, 1069),
                "t" : NumberLong(22)
            },
            "optimeDate" : ISODate("2020-04-14T22:32:06Z"),
            "optimeDurableDate" : ISODate("2020-04-14T22:32:06Z"),
            "lastHeartbeat" : ISODate("2020-04-15T08:26:25.034Z"),
            "lastHeartbeatRecv" : ISODate("2020-04-15T08:26:25.594Z"),
            "pingMs" : NumberLong(0),
            "lastHeartbeatMessage" : "",
            "syncingTo" : "db1:27017",
            "syncSourceHost" : "db1:27017",
            "syncSourceId" : 8,
            "infoMessage" : "",
            "configVersion" : 128
        },
        {
            "_id" : 11,
            "name" : "db1-dc2:27017",
            "health" : 1,
            "state" : 2,
            "stateStr" : "SECONDARY",
            "uptime" : 751002,
            "optime" : {
                "ts" : Timestamp(1586939185, 627),
                "t" : NumberLong(22)
            },
            "optimeDurable" : {
                "ts" : Timestamp(1586939185, 622),
                "t" : NumberLong(22)
            },
            "optimeDate" : ISODate("2020-04-15T08:26:25Z"),
            "optimeDurableDate" : ISODate("2020-04-15T08:26:25Z"),
            "lastHeartbeat" : ISODate("2020-04-15T08:26:25.660Z"),
            "lastHeartbeatRecv" : ISODate("2020-04-15T08:26:24.804Z"),
            "pingMs" : NumberLong(1),
            "lastHeartbeatMessage" : "",
            "syncingTo" : "db1:27017",
            "syncSourceHost" : "db1:27017",
            "syncSourceId" : 8,
            "infoMessage" : "",
            "configVersion" : 128
        }
    ],
    "ok" : 1,
    "$clusterTime" : {
        "clusterTime" : Timestamp(1586939186, 814),
        "signature" : {
            "hash" : BinData(0,"zGvmG5qYkIif17pfXKXh2dgvwIg="),
            "keyId" : NumberLong("6812519156018054020")
        }
    },
    "operationTime" : Timestamp(1586939186, 813)
}
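For reference, db3's lag is visible directly in the output above: its optimeDate (2020-04-14T22:32:06Z) trails the primary's (2020-04-15T08:26:26Z) by roughly 9.9 hours. A minimal mongo-shell sketch (not part of the original report) that derives the same figure for every member:

// Compute each member's replication lag relative to the primary's optimeDate.
// The primary is the member with state 1; optimeDate is a JS Date in the
// shell, so subtracting two of them yields milliseconds.
var status = rs.status();
var primary = status.members.filter(function (m) { return m.state === 1; })[0];
status.members.forEach(function (m) {
    var lagSecs = (primary.optimeDate - m.optimeDate) / 1000;
    print(m.name + " (" + m.stateStr + "): " + lagSecs + " secs behind the primary");
});

rs.printSlaveReplicationInfo() (renamed rs.printSecondaryReplicationInfo() in later server versions) reports the same numbers, as shown below.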
|
***:PRIMARY> rs.printSlaveReplicationInfo()
source: db4:27017
        syncedTo: Wed Apr 15 2020 08:22:15 GMT+0000 (UTC)
        1 secs (0 hrs) behind the primary
source: db2:27017
        syncedTo: Wed Apr 15 2020 08:22:15 GMT+0000 (UTC)
        1 secs (0 hrs) behind the primary
source: db3:27017
        syncedTo: Tue Apr 14 2020 22:32:06 GMT+0000 (UTC)
        35410 secs (9.84 hrs) behind the primary
source: db1-dc2:27017
        syncedTo: Wed Apr 15 2020 08:22:15 GMT+0000 (UTC)
        1 secs (0 hrs) behind the primary
|
All other metrics (CPU usage, memory, network, etc.) were fine.
Moreover, db3 kept serving read queries perfectly well with the outdated data, which caused major problems due to write conflicts on the primary (db1).
diagnostic.data from all nodes is attached.
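Since the stale reads were the damaging part here, one client-side guard worth noting (a sketch under assumed names, not something the report describes) is the maxStalenessSeconds read-preference option, which makes drivers stop selecting any secondary whose estimated lag exceeds a bound, so a member stuck like db3 would drop out of the read rotation instead of serving hours-old data:

// Driver-side guard (illustrative values; "***" stands for the masked
// replica set name used throughout the report). Secondaries estimated to
// be more than 120 seconds stale become ineligible as read targets.
var uri = "mongodb://db1:27017,db2:27017,db3:27017,db4:27017,db1-dc2:27017/app" +
          "?replicaSet=***&readPreference=secondaryPreferred&maxStalenessSeconds=120";
var db = connect(uri);                           // the mongo shell's connect() accepts full URIs
db.getCollection("example").find().limit(1);     // "app"/"example" are placeholder names

The same connection-string options work in any official driver; the minimum allowed maxStalenessSeconds value is 90.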
Issue Links
- related to SERVER-60164: db.serverStatus() hang on SECONDARY server #secondtime (Closed)
- related to SERVER-50228: Convert ThreadPool to use predicated waits (Closed)