|
We've recently updated our 3.4 replica set to 4.2.5 and faced an issue tonight.
One of the replica set members (db3) suddenly stopped replicating. All data on it was stale, and its optime showed that it lagged 9+ hours behind the primary.
The last meaningful log message on db3 was:
2020-04-14T22:32:57.641+0000 I CONNPOOL [ReplicaSetMonitor-TaskExecutor] Dropping all pooled connections to db3:27017 due to ShutdownInProgress: Pool for db3:27017 has expired.
|
2020-04-14T22:32:57.641+0000 I CONNPOOL [ReplicaSetMonitor-TaskExecutor] Dropping all pooled connections to db1:27017 due to ShutdownInProgress: Pool for db1:27017 has expired.
|
2020-04-14T22:32:57.641+0000 I CONNPOOL [ReplicaSetMonitor-TaskExecutor] Dropping all pooled connections to db2:27017 due to ShutdownInProgress: Pool for db2:27017 has expired.
|
Replica Set status as seen from Primary at the time of incident (db1):
{
|
"set" : "***",
|
"date" : ISODate("2020-04-15T08:26:26.804Z"),
|
"myState" : 1,
|
"term" : NumberLong(22),
|
"syncingTo" : "",
|
"syncSourceHost" : "",
|
"syncSourceId" : -1,
|
"heartbeatIntervalMillis" : NumberLong(2000),
|
"majorityVoteCount" : 3,
|
"writeMajorityCount" : 3,
|
"optimes" : {
|
"lastCommittedOpTime" : {
|
"ts" : Timestamp(1586939186, 813),
|
"t" : NumberLong(22)
|
},
|
"lastCommittedWallTime" : ISODate("2020-04-15T08:26:26.798Z"),
|
"readConcernMajorityOpTime" : {
|
"ts" : Timestamp(1586939186, 813),
|
"t" : NumberLong(22)
|
},
|
"readConcernMajorityWallTime" : ISODate("2020-04-15T08:26:26.798Z"),
|
"appliedOpTime" : {
|
"ts" : Timestamp(1586939186, 813),
|
"t" : NumberLong(22)
|
},
|
"durableOpTime" : {
|
"ts" : Timestamp(1586939186, 813),
|
"t" : NumberLong(22)
|
},
|
"lastAppliedWallTime" : ISODate("2020-04-15T08:26:26.798Z"),
|
"lastDurableWallTime" : ISODate("2020-04-15T08:26:26.798Z")
|
},
|
"lastStableRecoveryTimestamp" : Timestamp(1586939153, 301),
|
"lastStableCheckpointTimestamp" : Timestamp(1586939153, 301),
|
"electionCandidateMetrics" : {
|
"lastElectionReason" : "priorityTakeover",
|
"lastElectionDate" : ISODate("2020-04-07T06:22:02.485Z"),
|
"electionTerm" : NumberLong(22),
|
"lastCommittedOpTimeAtElection" : {
|
"ts" : Timestamp(1586240522, 526),
|
"t" : NumberLong(21)
|
},
|
"lastSeenOpTimeAtElection" : {
|
"ts" : Timestamp(1586240522, 526),
|
"t" : NumberLong(21)
|
},
|
"numVotesNeeded" : 3,
|
"priorityAtElection" : 2,
|
"electionTimeoutMillis" : NumberLong(10000),
|
"priorPrimaryMemberId" : 9,
|
"numCatchUpOps" : NumberLong(4),
|
"newTermStartDate" : ISODate("2020-04-07T06:22:02.539Z"),
|
"wMajorityWriteAvailabilityDate" : ISODate("2020-04-07T06:22:03.616Z")
|
},
|
"members" : [
|
{
|
"_id" : 7,
|
"name" : "db4:27017",
|
"health" : 1,
|
"state" : 2,
|
"stateStr" : "SECONDARY",
|
"uptime" : 748567,
|
"optime" : {
|
"ts" : Timestamp(1586939185, 78),
|
"t" : NumberLong(22)
|
},
|
"optimeDurable" : {
|
"ts" : Timestamp(1586939185, 78),
|
"t" : NumberLong(22)
|
},
|
"optimeDate" : ISODate("2020-04-15T08:26:25Z"),
|
"optimeDurableDate" : ISODate("2020-04-15T08:26:25Z"),
|
"lastHeartbeat" : ISODate("2020-04-15T08:26:25.058Z"),
|
"lastHeartbeatRecv" : ISODate("2020-04-15T08:26:25.034Z"),
|
"pingMs" : NumberLong(0),
|
"lastHeartbeatMessage" : "",
|
"syncingTo" : "db1:27017",
|
"syncSourceHost" : "db1:27017",
|
"syncSourceId" : 8,
|
"infoMessage" : "",
|
"configVersion" : 128
|
},
|
{
|
"_id" : 8,
|
"name" : "db1:27017",
|
"health" : 1,
|
"state" : 1,
|
"stateStr" : "PRIMARY",
|
"uptime" : 751010,
|
"optime" : {
|
"ts" : Timestamp(1586939186, 813),
|
"t" : NumberLong(22)
|
},
|
"optimeDate" : ISODate("2020-04-15T08:26:26Z"),
|
"syncingTo" : "",
|
"syncSourceHost" : "",
|
"syncSourceId" : -1,
|
"infoMessage" : "",
|
"electionTime" : Timestamp(1586240522, 538),
|
"electionDate" : ISODate("2020-04-07T06:22:02Z"),
|
"configVersion" : 128,
|
"self" : true,
|
"lastHeartbeatMessage" : ""
|
},
|
{
|
"_id" : 9,
|
"name" : "db2:27017",
|
"health" : 1,
|
"state" : 2,
|
"stateStr" : "SECONDARY",
|
"uptime" : 751002,
|
"optime" : {
|
"ts" : Timestamp(1586939185, 995),
|
"t" : NumberLong(22)
|
},
|
"optimeDurable" : {
|
"ts" : Timestamp(1586939185, 991),
|
"t" : NumberLong(22)
|
},
|
"optimeDate" : ISODate("2020-04-15T08:26:25Z"),
|
"optimeDurableDate" : ISODate("2020-04-15T08:26:25Z"),
|
"lastHeartbeat" : ISODate("2020-04-15T08:26:25.969Z"),
|
"lastHeartbeatRecv" : ISODate("2020-04-15T08:26:24.893Z"),
|
"pingMs" : NumberLong(0),
|
"lastHeartbeatMessage" : "",
|
"syncingTo" : "db1:27017",
|
"syncSourceHost" : "db1:27017",
|
"syncSourceId" : 8,
|
"infoMessage" : "",
|
"configVersion" : 128
|
},
|
{
|
"_id" : 10,
|
"name" : "db3:27017",
|
"health" : 1,
|
"state" : 2,
|
"stateStr" : "SECONDARY",
|
"uptime" : 751002,
|
"optime" : {
|
"ts" : Timestamp(1586903526, 1069),
|
"t" : NumberLong(22)
|
},
|
"optimeDurable" : {
|
"ts" : Timestamp(1586903526, 1069),
|
"t" : NumberLong(22)
|
},
|
"optimeDate" : ISODate("2020-04-14T22:32:06Z"),
|
"optimeDurableDate" : ISODate("2020-04-14T22:32:06Z"),
|
"lastHeartbeat" : ISODate("2020-04-15T08:26:25.034Z"),
|
"lastHeartbeatRecv" : ISODate("2020-04-15T08:26:25.594Z"),
|
"pingMs" : NumberLong(0),
|
"lastHeartbeatMessage" : "",
|
"syncingTo" : "db1:27017",
|
"syncSourceHost" : "db1:27017",
|
"syncSourceId" : 8,
|
"infoMessage" : "",
|
"configVersion" : 128
|
},
|
{
|
"_id" : 11,
|
"name" : "db1-dc2:27017",
|
"health" : 1,
|
"state" : 2,
|
"stateStr" : "SECONDARY",
|
"uptime" : 751002,
|
"optime" : {
|
"ts" : Timestamp(1586939185, 627),
|
"t" : NumberLong(22)
|
},
|
"optimeDurable" : {
|
"ts" : Timestamp(1586939185, 622),
|
"t" : NumberLong(22)
|
},
|
"optimeDate" : ISODate("2020-04-15T08:26:25Z"),
|
"optimeDurableDate" : ISODate("2020-04-15T08:26:25Z"),
|
"lastHeartbeat" : ISODate("2020-04-15T08:26:25.660Z"),
|
"lastHeartbeatRecv" : ISODate("2020-04-15T08:26:24.804Z"),
|
"pingMs" : NumberLong(1),
|
"lastHeartbeatMessage" : "",
|
"syncingTo" : "db1:27017",
|
"syncSourceHost" : "db1:27017",
|
"syncSourceId" : 8,
|
"infoMessage" : "",
|
"configVersion" : 128
|
}
|
],
|
"ok" : 1,
|
"$clusterTime" : {
|
"clusterTime" : Timestamp(1586939186, 814),
|
"signature" : {
|
"hash" : BinData(0,"zGvmG5qYkIif17pfXKXh2dgvwIg="),
|
"keyId" : NumberLong("6812519156018054020")
|
}
|
},
|
"operationTime" : Timestamp(1586939186, 813)
|
}
|
***:PRIMARY> rs.printSlaveReplicationInfo()
|
source: db4:27017
|
syncedTo: Wed Apr 15 2020 08:22:15 GMT+0000 (UTC)
|
1 secs (0 hrs) behind the primary
|
source: db2:27017
|
syncedTo: Wed Apr 15 2020 08:22:15 GMT+0000 (UTC)
|
1 secs (0 hrs) behind the primary
|
source: db3:27017
|
syncedTo: Tue Apr 14 2020 22:32:06 GMT+0000 (UTC)
|
35410 secs (9.84 hrs) behind the primary
|
source: db1-dc2:27017
|
syncedTo: Wed Apr 15 2020 08:22:15 GMT+0000 (UTC)
|
1 secs (0 hrs) behind the primary
|
All other metrics (CPU usage, memory, network, etc.) were fine.
Moreover, db3 kept serving read queries with the outdated data, which caused major problems due to write conflicts on the primary (db1).
We are additionally attaching diagnostic.data from all dbs.
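For reference, here is a minimal sketch (using the mongocxx C++ driver; the URI is a placeholder, and this is not our actual monitoring code) of how the lag shown above can be measured programmatically: run replSetGetStatus and compare each secondary's optimeDate with the primary's.
#include <cstdint>
#include <iostream>
#include <string>
#include <bsoncxx/builder/basic/document.hpp>
#include <bsoncxx/builder/basic/kvp.hpp>
#include <mongocxx/client.hpp>
#include <mongocxx/instance.hpp>
#include <mongocxx/uri.hpp>

int main() {
    using bsoncxx::builder::basic::kvp;
    using bsoncxx::builder::basic::make_document;

    mongocxx::instance inst{};
    mongocxx::client client{mongocxx::uri{"mongodb://db1:27017"}};

    auto status = client["admin"].run_command(make_document(kvp("replSetGetStatus", 1)));
    auto members = status.view()["members"].get_array().value;

    // Find the primary's last applied optime first...
    std::int64_t primary_ms = 0;
    for (auto m : members) {
        auto doc = m.get_document().value;
        if (std::string{doc["stateStr"].get_string().value} == "PRIMARY")
            primary_ms = doc["optimeDate"].get_date().to_int64();
    }
    // ...then report how far each secondary's optimeDate trails it.
    for (auto m : members) {
        auto doc = m.get_document().value;
        if (std::string{doc["stateStr"].get_string().value} != "SECONDARY")
            continue;
        auto lag_ms = primary_ms - doc["optimeDate"].get_date().to_int64();
        std::cout << std::string{doc["name"].get_string().value} << " is "
                  << lag_ms / 1000 << "s behind the primary\n";
    }
}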
|
|
It seems the cause of this issue is the following glibc bug: https://sourceware.org/bugzilla/show_bug.cgi?id=25847
We did not verify that ourselves, but our Debian 9 systems with glibc 2.24 do not experience replication hangs, while Debian 10 systems with glibc 2.28 do hang.
Sourceware Bugzilla has a reproduction program for this case and states that the issue appeared in glibc 2.27. We do not have enough capacity to reproduce the issue ourselves; in the meantime we'll just downgrade to Debian 9 to see if that helps.
It might be worth adding a documentation note about buggy glibc >= 2.27 until this issue is resolved in an upstream release.
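For context, here is the ordinary predicate-protected condition-variable pattern that code like MongoDB's ThreadPool relies on; with a correct libc this small C++ program cannot hang, and the Bugzilla report above describes pthread_cond_wait losing wakeups even for code written this way. This is only an illustration of the pattern, not a reproducer for the glibc bug.
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

std::mutex m;
std::condition_variable cv;
bool work_done = false;

void worker() {
    std::lock_guard<std::mutex> lk(m);
    work_done = true;
    cv.notify_all();  // wakes the waiter in main()...
}

int main() {
    std::thread t(worker);
    {
        std::unique_lock<std::mutex> lk(m);
        // ...unless, per the bug report, the wakeup is lost inside libc despite the predicate loop.
        cv.wait(lk, [] { return work_done; });
    }
    t.join();
    std::cout << "woke up\n";
}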
Here are additional links that may be of interest:
|
|
Error info from version 4.4.1:
{"t":{"$date":"2020-12-24T08:48:54.474+00:00"},"s":"I", "c":"CONNPOOL", "id":22572, "ctx":"TaskExecutorPool-0","msg":"Dropping all pooled connections","attr":{"hostAndPort":"mongodb-sharded-shard0-data-1.mongodb-sharded-headless.paas.svc.cluster.local:27017","error":"ShutdownInProgress: Pool for mongodb-sharded-shard0-data-1.mongodb-sharded-headless.paas.svc.cluster.local:27017 has expired."}}
|
|
Hi neanton@gmail.com,
After thorough inspection of all the available data by several teams, multiple attempts to reproduce this issue, and code inspection, we suspect the reported issue lies outside of the mongod process. We also suspect that there's something specific to the OS or the libc binary that is triggering this behavior. Though we cannot address issues outside of MongoDB in this case, we've opened SERVER-50228, which will allow us to definitively say that we did not "miss a notification" (based on the information from the core dump, worker threads which are polling here might be missing the notification issued here); thus if we see this issue again we can actively question the environment for the server (OS, libs, etc.).
Please let us know if you have any questions.
Dima
|
|
Here it is:
(gdb) p this._state
|
$10 = mongo::ThreadPool::running
|
|
|
Hi neanton@gmail.com, could you please provide _state from the core-dump?
|
|
Hello, Dmitry,
Please see the log:
(gdb) f 6
|
#6 mongo::ThreadPool::waitForIdle (this=0x5557ac172900) at src/mongo/util/concurrency/thread_pool.cpp:227
|
227 _poolIsIdle.wait(lk);
|
(gdb) info locals
|
lk = {_M_device = 0x5557ac172980, _M_owns = false}
|
(gdb) list
|
222
|
223 void ThreadPool::waitForIdle() {
|
224 stdx::unique_lock<Latch> lk(_mutex);
|
225 // If there are any pending tasks, or non-idle threads, the pool is not idle.
|
226 while (!_pendingTasks.empty() || _numIdleThreads < _threads.size()) {
|
227 _poolIsIdle.wait(lk);
|
228 }
|
229 }
|
230
|
231 ThreadPool::Stats ThreadPool::getStats() const {
|
(gdb) p _numIdleThreads
|
$1 = 16
|
(gdb) p _threads
|
$2 = std::vector of length 16, capacity 16 = {{<std::thread> = {_M_id = {_M_thread = 140493928863488}}, <No data fields>}, {<std::thread> = {_M_id = {_M_thread = 140493920470784}}, <No data fields>}, {<std::thread> = {_M_id = {
|
_M_thread = 140493912078080}}, <No data fields>}, {<std::thread> = {_M_id = {_M_thread = 140493903685376}}, <No data fields>}, {<std::thread> = {_M_id = {_M_thread = 140493895292672}}, <No data fields>}, {<std::thread> = {
|
_M_id = {_M_thread = 140493886899968}}, <No data fields>}, {<std::thread> = {_M_id = {_M_thread = 140493878507264}}, <No data fields>}, {<std::thread> = {_M_id = {_M_thread = 140493870114560}}, <No data fields>},
|
{<std::thread> = {_M_id = {_M_thread = 140493861721856}}, <No data fields>}, {<std::thread> = {_M_id = {_M_thread = 140493853329152}}, <No data fields>}, {<std::thread> = {_M_id = {
|
_M_thread = 140493844936448}}, <No data fields>}, {<std::thread> = {_M_id = {_M_thread = 140493836543744}}, <No data fields>}, {<std::thread> = {_M_id = {_M_thread = 140493828151040}}, <No data fields>}, {<std::thread> = {
|
_M_id = {_M_thread = 140493819758336}}, <No data fields>}, {<std::thread> = {_M_id = {_M_thread = 140493811365632}}, <No data fields>}, {<std::thread> = {_M_id = {_M_thread = 140493802972928}}, <No data fields>}}
|
(gdb) p _threads.size()
|
$3 = 16
|
(gdb) p _pendingTasks
|
$4 = std::deque with 1 element = {{impl = std::unique_ptr<mongo::unique_function<void(mongo::Status)>::Impl> = {get() = 0x5560ba5f9620}}}
|
(gdb) p _pendingTasks.empty()
|
$5 = false
|
(gdb) p _pendingTasks.size()
|
$6 = 1
|
|
|
Hi neanton@gmail.com, it would be helpful to have the following values from the core dump: _numIdleThreads, _threads.size(), and _pendingTasks.empty(). These should be captured for the thread blocked in ThreadPool::waitForIdle().
|
|
All 16 replication worker threads were waiting in _workAvailable.wait(), and _numIdleThreads is 16. We can provide any info from the core dump if that helps.
|
|
Hello, Dmitry,
So right now we have a core dump. It's 65 GB in size and is not transferable.
The data we have corresponds with A. Jesse Jiryu Davis's comment.
ThreadPool::waitForIdle is waiting on _poolIsIdle while _pendingTasks contains one element.
From the worker threads' perspective, ThreadPool::_consumeTasks does not resume because there are no further ThreadPool::schedule calls (SyncTail::multiApply is hanging in waitForIdle).
It seems that due to some race condition ThreadPool::_doOneTask did not call _poolIsIdle.notify_all(), probably because _pendingTasks had not yet become empty at the time of execution.
That is just a guess; we are out of ideas as to how that may happen.
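To make the handshake under discussion easier to follow, here is a simplified, self-contained sketch of a waitForIdle-style pool (it is not MongoDB's ThreadPool; the names are only analogous). waitForIdle() sleeps on poolIsIdle until the queue is empty and every worker is idle, so if the wakeup from the last finishing worker (or from schedule()) is ever lost, the caller hangs exactly like the stacks below.
#include <condition_variable>
#include <cstddef>
#include <deque>
#include <functional>
#include <mutex>
#include <thread>
#include <vector>

class TinyPool {
public:
    explicit TinyPool(size_t n) : numWorkers_(n) {
        for (size_t i = 0; i < n; ++i)
            workers_.emplace_back([this] { consumeTasks(); });
    }
    ~TinyPool() {
        {
            std::lock_guard<std::mutex> lk(mutex_);
            shutdown_ = true;
        }
        workAvailable_.notify_all();
        for (auto& w : workers_) w.join();
    }
    void schedule(std::function<void()> task) {
        {
            std::lock_guard<std::mutex> lk(mutex_);
            pending_.push_back(std::move(task));
        }
        // If this wakeup were lost (e.g. inside libc), the task would sit in pending_ forever.
        workAvailable_.notify_one();
    }
    void waitForIdle() {
        std::unique_lock<std::mutex> lk(mutex_);
        poolIsIdle_.wait(lk, [this] { return pending_.empty() && idle_ == numWorkers_; });
    }

private:
    void consumeTasks() {
        std::unique_lock<std::mutex> lk(mutex_);
        ++idle_;
        // Announce idleness once the last worker has started, so waitForIdle() cannot
        // sleep forever when no work was ever scheduled.
        if (pending_.empty() && idle_ == numWorkers_)
            poolIsIdle_.notify_all();
        for (;;) {
            workAvailable_.wait(lk, [this] { return shutdown_ || !pending_.empty(); });
            if (shutdown_ && pending_.empty())
                return;
            auto task = std::move(pending_.front());
            pending_.pop_front();
            --idle_;
            lk.unlock();
            task();
            lk.lock();
            ++idle_;
            // Without this wakeup, waitForIdle() never re-checks its predicate and hangs.
            if (pending_.empty() && idle_ == numWorkers_)
                poolIsIdle_.notify_all();
        }
    }

    std::mutex mutex_;
    std::condition_variable workAvailable_, poolIsIdle_;
    std::deque<std::function<void()>> pending_;
    size_t numWorkers_;
    std::vector<std::thread> workers_;
    size_t idle_ = 0;
    bool shutdown_ = false;
};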
(gdb) t 39
|
[Switching to thread 39 (Thread 0x7fc74c394700 (LWP 12822))]
|
#0 futex_wait_cancelable (private=0, expected=0, futex_word=0x5557ac172a84) at ../sysdeps/unix/sysv/linux/futex-internal.h:88
|
88 in ../sysdeps/unix/sysv/linux/futex-internal.h
|
(gdb) bt
|
#0 futex_wait_cancelable (private=0, expected=0, futex_word=0x5557ac172a84) at ../sysdeps/unix/sysv/linux/futex-internal.h:88
|
#1 __pthread_cond_wait_common (abstime=0x0, mutex=0x5557ab02c390, cond=0x5557ac172a58) at pthread_cond_wait.c:502
|
#2 __pthread_cond_wait (cond=0x5557ac172a58, mutex=0x5557ab02c390) at pthread_cond_wait.c:655
|
#3 0x00005557a0ba40bc in __gthread_cond_wait (__mutex=<optimized out>, __cond=__cond@entry=0x5557ac172a58)
|
at /data/mci/10ef014392f1678bc62c2080782313b4/toolchain-builder/tmp/build-gcc-v3.sh-Xg1/build/x86_64-mongodb-linux/libstdc++-v3/include/x86_64-mongodb-linux/bits/gthr-default.h:864
|
#4 std::condition_variable::wait (this=this@entry=0x5557ac172a58, __lock=...) at ../../../../../src/combined/libstdc++-v3/src/c++11/condition_variable.cc:53
|
#5 0x000055579f8ab263 in std::_V2::condition_variable_any::wait<std::unique_lock<mongo::latch_detail::Latch> > (__lock=..., this=0x5557ac172a58)
|
at /opt/mongodbtoolchain/revisions/e5348beb43e147b74a40f4ca5fb05a330ea646cf/stow/gcc-v3.OR2/include/c++/8.2.0/bits/std_mutex.h:238
|
#6 mongo::ThreadPool::waitForIdle (this=0x5557ac172900) at src/mongo/util/concurrency/thread_pool.cpp:227
|
#7 0x000055579f2b00a2 in mongo::repl::SyncTail::multiApply (this=0x5557abf452d0, opCtx=0x55640f9f2280, ops=std::vector of length 9, capacity 5000 = {...}) at src/mongo/db/repl/sync_tail.cpp:1363
|
#8 0x000055579f2b189b in mongo::repl::SyncTail::_oplogApplication (this=0x5557abf452d0, replCoord=0x5557a3941800, batcher=0x7fc74c3931d0)
|
at /opt/mongodbtoolchain/revisions/e5348beb43e147b74a40f4ca5fb05a330ea646cf/stow/gcc-v3.OR2/include/c++/8.2.0/bits/move.h:182
|
#9 0x000055579f2b2a52 in mongo::repl::SyncTail::oplogApplication(mongo::repl::OplogBuffer*, std::function<mongo::StatusWith<std::vector<mongo::repl::OplogEntry, std::allocator<mongo::repl::OplogEntry> > > (mongo::OperationContext*, mongo::repl::OplogApplier::BatchLimits const&)>, mongo::repl::ReplicationCoordinator*) (this=0x5557abf452d0, oplogBuffer=<optimized out>, getNextApplierBatchFn=..., replCoord=0x5557a3941800)
|
at src/mongo/db/repl/sync_tail.cpp:753
|
#10 0x000055579f2a273e in mongo::repl::OplogApplierImpl::_run (this=<optimized out>, oplogBuffer=<optimized out>)
|
at /opt/mongodbtoolchain/revisions/e5348beb43e147b74a40f4ca5fb05a330ea646cf/stow/gcc-v3.OR2/include/c++/8.2.0/new:169
|
#11 0x000055579f2f9b77 in mongo::repl::OplogApplier::<lambda(const CallbackArgs&)>::operator()(const mongo::repl::CallbackArgs &) (__closure=0x5557ac1e78e8, args=...) at src/mongo/db/repl/oplog_applier.cpp:108
|
#12 0x00005557a017a373 in mongo::unique_function<void (mongo::executor::TaskExecutor::CallbackArgs const&)>::operator()(mongo::executor::TaskExecutor::CallbackArgs const&) const (args#0=..., this=<synthetic pointer>)
|
at src/mongo/util/invariant.h:64
|
#13 mongo::executor::ThreadPoolTaskExecutor::runCallback (this=0x5557ac01b980, cbStateArg=std::shared_ptr<class mongo::executor::ThreadPoolTaskExecutor::CallbackState> (use count 4, weak count 0) = {...})
|
at src/mongo/executor/thread_pool_task_executor.cpp:659
|
#14 0x00005557a017a892 in mongo::executor::ThreadPoolTaskExecutor::<lambda(auto:10)>::operator()<mongo::Status> (status=..., __closure=0x5557ac1e78a8)
|
at /opt/mongodbtoolchain/revisions/e5348beb43e147b74a40f4ca5fb05a330ea646cf/stow/gcc-v3.OR2/include/c++/8.2.0/ext/atomicity.h:96
|
#15 mongo::unique_function<void(mongo::Status)>::callRegularVoid<mongo::executor::ThreadPoolTaskExecutor::scheduleIntoPool_inlock(mongo::executor::ThreadPoolTaskExecutor::WorkQueue*, const iterator&, const iterator&, std::unique_lock<mongo::latch_detail::Latch>)::<lambda(auto:10)> > (args#0=..., f=..., isVoid=...) at src/mongo/util/functional.h:145
|
#16 mongo::unique_function<void(mongo::Status)>::SpecificImpl::call(mongo::Status &&) (this=0x5557ac1e78a0, args#0=...) at src/mongo/util/functional.h:159
|
#17 0x000055579f8aac30 in mongo::unique_function<void (mongo::Status)>::operator()(mongo::Status) const (args#0=..., this=<synthetic pointer>) at src/mongo/util/invariant.h:64
|
#18 mongo::ThreadPool::_doOneTask (this=0x5557ac172300, lk=0x7fc74c393660) at src/mongo/util/concurrency/thread_pool.cpp:341
|
#19 0x000055579f8abe81 in mongo::ThreadPool::_consumeTasks (this=0x5557ac172300) at src/mongo/util/concurrency/thread_pool.cpp:294
|
#20 0x000055579f8ad065 in mongo::ThreadPool::_workerThreadBody (pool=0x5557ac172300, threadName=...) at src/mongo/util/concurrency/thread_pool.cpp:247
|
#21 0x000055579f8ad2a2 in mongo::ThreadPool::<lambda()>::operator() (__closure=0x5557ab02d390) at src/mongo/util/concurrency/thread_pool.cpp:374
|
#22 std::__invoke_impl<void, mongo::ThreadPool::_startWorkerThread_inlock()::<lambda()> > (__f=...) at /opt/mongodbtoolchain/revisions/e5348beb43e147b74a40f4ca5fb05a330ea646cf/stow/gcc-v3.OR2/include/c++/8.2.0/bits/invoke.h:60
|
#23 std::__invoke<mongo::ThreadPool::_startWorkerThread_inlock()::<lambda()> > (__fn=...) at /opt/mongodbtoolchain/revisions/e5348beb43e147b74a40f4ca5fb05a330ea646cf/stow/gcc-v3.OR2/include/c++/8.2.0/bits/invoke.h:95
|
#24 std::__apply_impl<mongo::ThreadPool::_startWorkerThread_inlock()::<lambda()>, std::tuple<> > (__t=..., __f=...)
|
at /opt/mongodbtoolchain/revisions/e5348beb43e147b74a40f4ca5fb05a330ea646cf/stow/gcc-v3.OR2/include/c++/8.2.0/tuple:1678
|
#25 std::apply<mongo::ThreadPool::_startWorkerThread_inlock()::<lambda()>, std::tuple<> > (__t=..., __f=...) at /opt/mongodbtoolchain/revisions/e5348beb43e147b74a40f4ca5fb05a330ea646cf/stow/gcc-v3.OR2/include/c++/8.2.0/tuple:1687
|
#26 mongo::stdx::thread::<lambda()>::operator() (this=0x5557ab02d388) at src/mongo/stdx/thread.h:172
|
#27 std::__invoke_impl<void, mongo::stdx::thread::thread(Function&&, Args&& ...) [with Function = mongo::ThreadPool::_startWorkerThread_inlock()::<lambda()>; Args = {}; typename std::enable_if<(! std::is_same<mongo::stdx::thread, typename std::decay<_Tp>::type>::value), int>::type <anonymous> = 0]::<lambda()> > (__f=...) at /opt/mongodbtoolchain/revisions/e5348beb43e147b74a40f4ca5fb05a330ea646cf/stow/gcc-v3.OR2/include/c++/8.2.0/bits/invoke.h:60
|
#28 std::__invoke<mongo::stdx::thread::thread(Function&&, Args&& ...) [with Function = mongo::ThreadPool::_startWorkerThread_inlock()::<lambda()>; Args = {}; typename std::enable_if<(! std::is_same<mongo::stdx::thread, typename std::decay<_Tp>::type>::value), int>::type <anonymous> = 0]::<lambda()> > (__fn=...) at /opt/mongodbtoolchain/revisions/e5348beb43e147b74a40f4ca5fb05a330ea646cf/stow/gcc-v3.OR2/include/c++/8.2.0/bits/invoke.h:95
|
#29 std::thread::_Invoker<std::tuple<mongo::stdx::thread::thread(Function&&, Args&& ...) [with Function = mongo::ThreadPool::_startWorkerThread_inlock()::<lambda()>; Args = {}; typename std::enable_if<(! std::is_same<mongo::stdx::thread, typename std::decay<_Tp>::type>::value), int>::type <anonymous> = 0]::<lambda()> > >::_M_invoke<0> (this=0x5557ab02d388)
|
at /opt/mongodbtoolchain/revisions/e5348beb43e147b74a40f4ca5fb05a330ea646cf/stow/gcc-v3.OR2/include/c++/8.2.0/thread:234
|
#30 std::thread::_Invoker<std::tuple<mongo::stdx::thread::thread(Function&&, Args&& ...) [with Function = mongo::ThreadPool::_startWorkerThread_inlock()::<lambda()>; Args = {}; typename std::enable_if<(! std::is_same<mongo::stdx::thread, typename std::decay<_Tp>::type>::value), int>::type <anonymous> = 0]::<lambda()> > >::operator() (this=0x5557ab02d388)
|
at /opt/mongodbtoolchain/revisions/e5348beb43e147b74a40f4ca5fb05a330ea646cf/stow/gcc-v3.OR2/include/c++/8.2.0/thread:243
|
#31 std::thread::_State_impl<std::thread::_Invoker<std::tuple<mongo::stdx::thread::thread(Function&&, Args&& ...) [with Function = mongo::ThreadPool::_startWorkerThread_inlock()::<lambda()>; Args = {}; typename std::enable_if<(! std::is_same<mongo::stdx::thread, typename std::decay<_Tp>::type>::value), int>::type <anonymous> = 0]::<lambda()> > > >::_M_run(void) (this=0x5557ab02d380)
|
at /opt/mongodbtoolchain/revisions/e5348beb43e147b74a40f4ca5fb05a330ea646cf/stow/gcc-v3.OR2/include/c++/8.2.0/thread:186
|
#32 0x00005557a0ba725f in std::execute_native_thread_routine (__p=0x5557ab02d380) at ../../../../../src/combined/libstdc++-v3/src/c++11/thread.cc:80
|
#33 0x00007fc76253efa3 in start_thread (arg=<optimized out>) at pthread_create.c:486
|
#34 0x00007fc76246f4cf in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
|
(gdb) f 6
|
#6 mongo::ThreadPool::waitForIdle (this=0x5557ac172900) at src/mongo/util/concurrency/thread_pool.cpp:227
|
227 _poolIsIdle.wait(lk);
|
(gdb) p _pendingTasks
|
$4 = std::deque with 1 element = {{impl = std::unique_ptr<mongo::unique_function<void(mongo::Status)>::Impl> = {get() = 0x5560ba5f9620}}}
|
(gdb) t 42
|
[Switching to thread 42 (Thread 0x7fc74a390700 (LWP 12826))]
|
#0 futex_wait_cancelable (private=0, expected=0, futex_word=0x5557ac1729f8) at ../sysdeps/unix/sysv/linux/futex-internal.h:88
|
88 ../sysdeps/unix/sysv/linux/futex-internal.h: No such file or directory.
|
(gdb) bt
|
#0 futex_wait_cancelable (private=0, expected=0, futex_word=0x5557ac1729f8) at ../sysdeps/unix/sysv/linux/futex-internal.h:88
|
#1 __pthread_cond_wait_common (abstime=0x0, mutex=0x5557ab02c3d0, cond=0x5557ac1729d0) at pthread_cond_wait.c:502
|
#2 __pthread_cond_wait (cond=0x5557ac1729d0, mutex=0x5557ab02c3d0) at pthread_cond_wait.c:655
|
#3 0x00005557a0ba40bc in __gthread_cond_wait (__mutex=<optimized out>, __cond=__cond@entry=0x5557ac1729d0) at /data/mci/10ef014392f1678bc62c2080782313b4/toolchain-builder/tmp/build-gcc-v3.sh-Xg1/build/x86_64-mongodb-linux/libstdc++-v3/include/x86_64-mongodb-linux/bits/gthr-default.h:864
|
#4 std::condition_variable::wait (this=this@entry=0x5557ac1729d0, __lock=...) at ../../../../../src/combined/libstdc++-v3/src/c++11/condition_variable.cc:53
|
#5 0x000055579f8ac345 in std::_V2::condition_variable_any::wait<std::unique_lock<mongo::latch_detail::Latch> > (__lock=..., this=0x5557ac1729d0) at /opt/mongodbtoolchain/revisions/e5348beb43e147b74a40f4ca5fb05a330ea646cf/stow/gcc-v3.OR2/include/c++/8.2.0/bits/std_mutex.h:238
|
#6 mongo::ThreadPool::_consumeTasks (this=0x5557ac172900) at src/mongo/util/concurrency/thread_pool.cpp:289
|
#7 0x000055579f8ad065 in mongo::ThreadPool::_workerThreadBody (pool=0x5557ac172900, threadName=...) at src/mongo/util/concurrency/thread_pool.cpp:247
|
#8 0x000055579f8ad2a2 in mongo::ThreadPool::<lambda()>::operator() (__closure=0x5557ab02c290) at src/mongo/util/concurrency/thread_pool.cpp:374
|
#9 std::__invoke_impl<void, mongo::ThreadPool::_startWorkerThread_inlock()::<lambda()> > (__f=...) at /opt/mongodbtoolchain/revisions/e5348beb43e147b74a40f4ca5fb05a330ea646cf/stow/gcc-v3.OR2/include/c++/8.2.0/bits/invoke.h:60
|
#10 std::__invoke<mongo::ThreadPool::_startWorkerThread_inlock()::<lambda()> > (__fn=...) at /opt/mongodbtoolchain/revisions/e5348beb43e147b74a40f4ca5fb05a330ea646cf/stow/gcc-v3.OR2/include/c++/8.2.0/bits/invoke.h:95
|
#11 std::__apply_impl<mongo::ThreadPool::_startWorkerThread_inlock()::<lambda()>, std::tuple<> > (__t=..., __f=...) at /opt/mongodbtoolchain/revisions/e5348beb43e147b74a40f4ca5fb05a330ea646cf/stow/gcc-v3.OR2/include/c++/8.2.0/tuple:1678
|
#12 std::apply<mongo::ThreadPool::_startWorkerThread_inlock()::<lambda()>, std::tuple<> > (__t=..., __f=...) at /opt/mongodbtoolchain/revisions/e5348beb43e147b74a40f4ca5fb05a330ea646cf/stow/gcc-v3.OR2/include/c++/8.2.0/tuple:1687
|
#13 mongo::stdx::thread::<lambda()>::operator() (this=0x5557ab02c288) at src/mongo/stdx/thread.h:172
|
#14 std::__invoke_impl<void, mongo::stdx::thread::thread(Function&&, Args&& ...) [with Function = mongo::ThreadPool::_startWorkerThread_inlock()::<lambda()>; Args = {}; typename std::enable_if<(! std::is_same<mongo::stdx::thread, typename std::decay<_Tp>::type>::value), int>::type <anonymous> = 0]::<lambda()> > (__f=...) at /opt/mongodbtoolchain/revisions/e5348beb43e147b74a40f4ca5fb05a330ea646cf/stow/gcc-v3.OR2/include/c++/8.2.0/bits/invoke.h:60
|
#15 std::__invoke<mongo::stdx::thread::thread(Function&&, Args&& ...) [with Function = mongo::ThreadPool::_startWorkerThread_inlock()::<lambda()>; Args = {}; typename std::enable_if<(! std::is_same<mongo::stdx::thread, typename std::decay<_Tp>::type>::value), int>::type <anonymous> = 0]::<lambda()> > (__fn=...) at /opt/mongodbtoolchain/revisions/e5348beb43e147b74a40f4ca5fb05a330ea646cf/stow/gcc-v3.OR2/include/c++/8.2.0/bits/invoke.h:95
|
#16 std::thread::_Invoker<std::tuple<mongo::stdx::thread::thread(Function&&, Args&& ...) [with Function = mongo::ThreadPool::_startWorkerThread_inlock()::<lambda()>; Args = {}; typename std::enable_if<(! std::is_same<mongo::stdx::thread, typename std::decay<_Tp>::type>::value), int>::type <anonymous> = 0]::<lambda()> > >::_M_invoke<0> (this=0x5557ab02c288) at /opt/mongodbtoolchain/revisions/e5348beb43e147b74a40f4ca5fb05a330ea646cf/stow/gcc-v3.OR2/include/c++/8.2.0/thread:234
|
#17 std::thread::_Invoker<std::tuple<mongo::stdx::thread::thread(Function&&, Args&& ...) [with Function = mongo::ThreadPool::_startWorkerThread_inlock()::<lambda()>; Args = {}; typename std::enable_if<(! std::is_same<mongo::stdx::thread, typename std::decay<_Tp>::type>::value), int>::type <anonymous> = 0]::<lambda()> > >::operator() (this=0x5557ab02c288) at /opt/mongodbtoolchain/revisions/e5348beb43e147b74a40f4ca5fb05a330ea646cf/stow/gcc-v3.OR2/include/c++/8.2.0/thread:243
|
#18 std::thread::_State_impl<std::thread::_Invoker<std::tuple<mongo::stdx::thread::thread(Function&&, Args&& ...) [with Function = mongo::ThreadPool::_startWorkerThread_inlock()::<lambda()>; Args = {}; typename std::enable_if<(! std::is_same<mongo::stdx::thread, typename std::decay<_Tp>::type>::value), int>::type <anonymous> = 0]::<lambda()> > > >::_M_run(void) (this=0x5557ab02c280) at /opt/mongodbtoolchain/revisions/e5348beb43e147b74a40f4ca5fb05a330ea646cf/stow/gcc-v3.OR2/include/c++/8.2.0/thread:186
|
#19 0x00005557a0ba725f in std::execute_native_thread_routine (__p=0x5557ab02c280) at ../../../../../src/combined/libstdc++-v3/src/c++11/thread.cc:80
|
#20 0x00007fc76253efa3 in start_thread (arg=<optimized out>) at pthread_create.c:486
|
#21 0x00007fc76246f4cf in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
|
(gdb) f 6
|
#6 mongo::ThreadPool::_consumeTasks (this=0x5557ac172900) at src/mongo/util/concurrency/thread_pool.cpp:289
|
289 _workAvailable.wait(lk);
|
(gdb) list
|
284 // because any new threads that put the number of total threads above minThreads
|
285 // would be eligible for retirement once they had no work left to do.
|
286 LOG(3) << "waiting for work; I am one of " << _threads.size() << " thread(s);"
|
287 << " the minimum number of threads is " << _options.minThreads;
|
288 MONGO_IDLE_THREAD_BLOCK;
|
289 _workAvailable.wait(lk);
|
290 }
|
291 continue;
|
292 }
|
293
|
(gdb) p _pendingTasks
|
$2 = std::deque with 1 element = {{impl = std::unique_ptr<mongo::unique_function<void(mongo::Status)>::Impl> = {get() = 0x5560ba5f9620}}}
|
|
|
Hi neanton@gmail.com,
I think the next step would be to collect more verbose system logs (the current ones do not print any useful information) and provide them un-redacted and unfiltered. There is no need to add more debugging commands to gdb, as we can clearly see what is going on during this event. As for a core dump, unless it's a full memory dump I do not see much value in looking at redacted data.
Regarding the Production Notes, I did not see many sections that were in fact configured as per our Production Notes, but going through every section and setting it to the MongoDB-recommended values is out of scope for this project. Perhaps you can ask our community by posting on the MongoDB Community Forums or on Stack Overflow with the mongodb tag.
Thanks,
Dima
|
|
Hello, Dmitry,
Thank you for your effort in investigating this incident.
Yesterday, Jul 21 at ~07:40 UTC, our replica set was updated to the latest patch version, 4.2.8.
At ~22:53 UTC on Jul 21, db3 from that replica set again stopped replicating, so it seems that none of the changes made between 4.2.5 and 4.2.8 fixed anything.
We did not collect gdb traces from that incident. I've uploaded diagnostic.data and mongod.log from db3. There were also NO relevant log messages anywhere in the system logs: nothing suspicious, the system works just fine, just regular cron jobs and some firewall noise. No kernel logs, no segfaults anywhere, all clear.
- Maybe someone from the tech team can provide additional debugging commands for gdb for the next incident, such as tracing some internal db state, printing some internal variables, dumping the internal state of some locks, etc.?
- Maybe you can provide a command to generate a debuggable core file without a full memory dump of our database data?
Regarding the Production Notes: we've walked through the document once again and do not understand which sections of it are not implemented in our setup.
Please tell us which exact values/sections you've spotted in our setup that are not in line with the Production Notes, so we can fix that and make sure it does not contribute to this incident.
|
|
Hi neanton@gmail.com,
I suggest we revisit this investigation after the replica set is upgraded to the latest version. However, we need to figure out how to get meaningful information about the system events. Currently, it is either redacted or contains only a few lines, which does not help with this investigation. We need to find a way to get more verbose, un-redacted data for system events covering the time of the incidents. If this is still not feasible, I can try to think of other approaches.
As requested, here is the link to our Production Notes.
Thanks,
Dima
|
|
Hello, Dmitry,
We are sorry, but some of the logs were cleaned up before the upload. This is due to our security policy.
- diagnostic.data was corrupted unintentionally. I can upload the raw files on request, if that is required.
- syslog was not redacted; those few lines are the full output we have.
- The dmesg section contained exclusively iptables block logs, none of which involved the MongoDB port or came from another MongoDB instance.
- The messages log does not exist on our systems.
We've also recorded one more incident, on Jul 17, with all of the logs collected (gdb/mdiag/diagnostic.data), but I'm sure those logs won't show anything new. Just let us know if you need them uploaded for diagnosis.
All of those incidents happened on one db cluster. Another cluster with the same underlying hardware/OS/configuration is not affected, though that second cluster has a different workload.
Meanwhile, we've scheduled an update to the latest patch release.
As for the Production Notes: could you please point out which sections do not comply?
|
|
Hi neanton@gmail.com, I have reviewed the latest data and did not find anything new or suspicious from the MongoDB side. However, I just wanted to point out a few oddities:
- The uploaded diagnostic.data was corrupted (this is not critical for the latest bulk of data, as we saw the needed information in the gdb stacks)
- The mdiag output did not include the most interesting section I was hoping to see - dmesg
- The uploaded syslog contains just a few lines covering an almost 24-hour period
In addition, I've noticed that many sections from our Production Notes are not implemented. Though I cannot correlate any of the missing sections to the reported issue, this is a basic requirement prior to diagnostic efforts. Although we are investigating a potential issue that is external to MongoDB, getting your cluster to the latest 4.2 minor version would be our preferred choice.
For now, could you please upload just these files so we can understand whether we need to change something in your deployment's configuration to get more useful information:
- /var/log/dmesg (should include timestamps in seconds since reboot)
- /var/log/messages
- /var/log/syslog
Please let me know if anything was removed from the uploaded files.
Thanks,
Dima
|
|
BTW: One more incident happened with db2 today. We did not collect logs; please let us know if we need to collect logs for further recurrences of the issue.
|
|
Any news on that, Dmitry? Please reopen the ticket.
|
|
Hi Dmitry!
We had an issue a few days ago. I uploaded the output from mdiag.sh, gdb, and also some of the logs that you mentioned.
It's the same cluster as mentioned earlier by Anton Neznaenko.
|
|
Hi neanton@gmail.com, since there has been no recurrence of this issue, I will go ahead and close this ticket. Please feel free to reopen it if this happens again.
Regards,
Dima
|
|
Thanks for the update neanton@gmail.com, so this is still a mystery. Please let us know if anything changes.
|
|
Hello, Dmitry,
No issues were recorded over the last month. The last issue we've seen was on the 3rd of May; none since then. No changes were made to hardware or software configurations. We are still monitoring MongoDB thoroughly.
We also have one more cluster still on 3.4 and will try to update it to the latest version soon. We will let you know if the replication lag reoccurs.
|
|
Hi neanton@gmail.com, just wanted to check if you had another occurrence of the reported issue and if you had a chance to collect the requested information.
|
|
Hi neanton@gmail.com, after investigating the latest set of data, we saw in the collected gdb stacks that the main oplog applier is waiting for workers to finish, yet there are no workers in progress. We were not able to identify why this condition would occur, and the history of thread_pool.cpp doesn't seem to show any relevant changes or bug fixes since 3.2.
Given the above, I'd like to collect some more data (if this issue reoccurs) which will hopefully help us progress this case.
Please download the mdiag.sh script from our GitHub repository and run it on the node after the reported event. I will check the output of this script for any inconsistencies in HW/OS configuration as well as any settings that are not in line with our Production Notes.
Note: This script should be run from the command line/OS shell with root/sudo privileges as follows:
sudo bash mdiag.sh SERVER-47554
|
It will take about 5 minutes to run this script. Once complete, please upload the resulting /tmp/mdiag-$HOSTNAME.txt from the host to the secure uploader.
Regarding gdb execution, I recommend instead the following format to make it easier to parse:
for i in $(seq 10); do gdb -p $(pidof mongod) -batch -ex 'thread apply all bt' >gdb.$(date -u +%FT%TZ).txt; sleep 10; done
|
You can run this gdb command a few times, say 5-10 times, a few seconds apart.
To summarize, the new set of data we are looking to collect is: diagnostic.data, mongod.log, currentOp, mdiag.sh output, full and un-redacted syslog (covering at least a few hours before and after the event), and the gdb.txt stacks. I realize this is more data than we usually request; hopefully it will help us get to the bottom of this issue.
Thanks,
Dima
|
|
If it's of any help, we can issue some additional debug commands when an incident reappears. Just let us know and provide further instructions.
|
|
I agree with Bruce's diagnosis. SyncTail::multiApply is waiting in ThreadPool::waitForIdle, meaning it has scheduled a batch of oplog entries to be applied by worker threads. Since the applier thread is blocked, it isn't clearing entries from the oplog buffer queue, so the queue fills up. The oplog fetcher eventually blocks in OplogBufferBlockingQueue::waitForSpace.
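To illustrate the backpressure chain in the previous paragraph, here is a minimal sketch of a bounded blocking queue (this is not MongoDB's OplogBufferBlockingQueue; names and types are simplified): once the consumer stops draining, the producer blocks in waitForSpace(), which is why the fetcher ends up stuck as soon as the applier hangs.
#include <condition_variable>
#include <cstddef>
#include <deque>
#include <mutex>
#include <string>

class BoundedBuffer {
public:
    explicit BoundedBuffer(size_t maxSize) : maxSize_(maxSize) {}

    // Producer side (the "oplog fetcher"): blocks here once the queue is full.
    void waitForSpace(size_t n) {
        std::unique_lock<std::mutex> lk(mutex_);
        notFull_.wait(lk, [&] { return entries_.size() + n <= maxSize_; });
    }
    void push(std::string entry) {
        {
            std::lock_guard<std::mutex> lk(mutex_);
            entries_.push_back(std::move(entry));
        }
        notEmpty_.notify_one();
    }

    // Consumer side (the "oplog applier"): if this is never called again,
    // waitForSpace() above never returns and the producer stalls too.
    std::string pop() {
        std::unique_lock<std::mutex> lk(mutex_);
        notEmpty_.wait(lk, [&] { return !entries_.empty(); });
        auto e = std::move(entries_.front());
        entries_.pop_front();
        notFull_.notify_all();
        return e;
    }

private:
    std::mutex mutex_;
    std::condition_variable notFull_, notEmpty_;
    std::deque<std::string> entries_;
    size_t maxSize_;
};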
I see threads in ThreadPool::_consumeTasks waiting on four different condition variables, indicating they belong to four different thread pools.
cond=0x564ed05a85d0 2 threads
|
cond=0x564ede7226d0 1 thread
|
cond=0x564ede7229d0 16 threads
|
cond=0x564edd70a3d0 1 thread
|
The oplog applier thread pool always has 16 threads, so I assume that's the pool with condition variable 0x564ede7229d0. All 16 of its threads are waiting for tasks. So it indeed appears that the pool is idle, but waitForIdle() somehow isn't detecting that. The history of thread_pool.cpp doesn't seem to show relevant changes or bugfixes since 3.2, and I couldn't find any related bugs in Jira.
|
|
Thanks neanton@gmail.com, we are looking at the last incident and will provide updates as we progress.
|
|
Logs and diagnostic.data uploaded. Unfortunately I don't have an exact time frame for when gdb was executed. I can only say that it was started after ~20:10 UTC and took approximately 1 minute to dump all threads. I think you will see that in diagnostic.data as a hole without any data, because gdb pauses the process during the dump.
|
|
Thanks neanton@gmail.com, please upload the mongod log and diagnostic.data from the same server where you collected currentOp and gdb. Also, please post the exact time and timestamp when gdb was collected (start and end).
|
|
Hello, Dmitry,
One more incident happened with db2 on Apr 25 at ~15:17 UTC.
One more incident happened with db3 on Apr 26 at ~19:08 UTC.
First of all, regarding the observations:
> Both incidents occurred around the same time of the day, 8 days apart
That is a peak load period (from ~3 PM UTC till ~3 AM UTC).
> These incidents involved different secondary members
Yep.
> Internal communication was still ongoing
Yep.
> External communication (monitoring tool) was timing out
Yep, but not because of any network issues. DB commands like db.listDatabases() and db.serverStatus() were hanging in the MongoDB shell as well.
> Client/s communication stopped
> All write operations stopped abruptly (read operations were still going through for some time)
Thanks to the maxStalenessSeconds driver option; that is the expected behavior.
> Connecting with mongo shell (second incident) showed some signs of unresponsiveness
Nope. We had some time to debug the mongodb instance today when it got stuck and found no signs of unresponsiveness.
> From HW perspective, nothing was under stress during this time (but disk write activity was flat at zero)
Yep.
> From MongoDB perspective, nothing suspicious about threads/connections/locks/cache (same increase in decommit activity)
Not sure. This time we have even more debug output.
> During the second incidents, both DB2 and DB4 were affected at the same time
As we see, db4 was not affected by this problem; it switched to syncing from db3 after the db2 lag happened.
Also, none of the 5 replica set members share any physical resources: 4 members (db1-db4) are in one campus (db1 and db4 in one rack), all 4 are VMs on different hardware, and the 5th is an AWS EC2 VM (db1-dc2 in the logs).
When the incident happened we were able to collect db.currentOp() output. Killing each op one by one did not resolve the issue with the stopped replication. We are uploading the db.currentOp() output to the secure portal.
We were also able to collect a gdb backtrace of all running threads with "thread apply all bt full"; that one is uploaded too. Hopefully the gdb backtrace will finally reveal the issue.
After that, having tried killing all ops with no luck restoring replication, we triggered a process restart with systemd. The restart did not complete successfully; the log file shows:
2020-04-26T20:21:50.145+0000 I CONTROL [signalProcessingThread] got signal 15 (Terminated), will terminate after current cmd ends
|
And then systemd killed the process on timeout.
A short portion of syslog and kern.log from the previous incident is also uploaded.
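For reference, here is a hedged sketch of the same triage using the C++ driver (the actual steps were run from the mongo shell; the hostname and opid below are placeholders): run the currentOp command, then kill a specific operation by its opid.
#include <cstdint>
#include <iostream>
#include <bsoncxx/builder/basic/document.hpp>
#include <bsoncxx/builder/basic/kvp.hpp>
#include <bsoncxx/json.hpp>
#include <mongocxx/client.hpp>
#include <mongocxx/instance.hpp>
#include <mongocxx/uri.hpp>

int main() {
    using bsoncxx::builder::basic::kvp;
    using bsoncxx::builder::basic::make_document;

    mongocxx::instance inst{};
    mongocxx::client client{mongocxx::uri{"mongodb://db2:27017"}};
    auto admin = client["admin"];

    // Equivalent of db.currentOp(): dump what is currently running on the node.
    auto ops = admin.run_command(make_document(kvp("currentOp", 1)));
    std::cout << bsoncxx::to_json(ops.view()) << "\n";

    // Equivalent of db.killOp(<opid>); the opid here is a placeholder, not a real one.
    std::int32_t opid = 12345;
    admin.run_command(make_document(kvp("killOp", 1), kvp("op", opid)));
}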
|
|
Hi neanton@gmail.com, after reviewing the latest information, I was still not able to surface any issue related to MongoDB. This is what we know so far:
- Both incidents occurred around the same time of the day, 8 days apart
- These incidents involved different secondary members
- Internal communication was still ongoing
- External communication (monitoring tool) was timing out
- Client/s communication stopped
- All write operations stopped abruptly (read operations were still going through for some time)
- Connecting with mongo shell (second incident) showed some signs of unresponsiveness
- From HW perspective, nothing was under stress during this time (but disk write activity was flat at zero)
- From MongoDB perspective, nothing suspicious about threads/connections/locks/cache (same increase in decommit activity)
During the second incident, both DB2 and DB4 were affected at the same time, though DB4 recovered about 30 seconds later after switching its sync source from DB2 to DB3.
At this point I'd like to understand whether there are any shared HW resources between the members of this replica set or with the other cluster's members. In particular, I am interested to see whether anything is shared around memory and storage. Could you provide the HW mapping (if anything is shared) between all members in this cluster?
Also, please upload /var/log/messages and the output from /var/log/dmesg from DB2 and DB4 covering the time of the latest event.
|
|
Thanks for the update neanton@gmail.com, after reviewing the latest data, I will update you tomorrow on my findings.
|
|
Hello, Dmitry,
I'm afraid I must disappoint you.
Despite all of the fixes we've applied to our MongoDB instances, the issue reappeared tonight.
This time the db2 replication process stopped. Here is a brief walkthrough of all the actions we've taken (a connection-string sketch for steps 1 and 3 follows this list):
1. We enabled the maxStalenessSeconds = 90 driver option for read queries from secondaries on April 17 at ~14:30 UTC.
2. cpu_mode = host-passthrough was set on all db instances, and each instance was rebooted:
db3 (secondary) was rebooted on April 16 at ~10:25 UTC
db2 (secondary) was rebooted on April 21 at ~8:39 UTC
db4 (secondary + hidden) was rebooted on April 21 at ~8:50 UTC
db1 (primary) was rebooted on April 21 at ~9:00 UTC
3. We enabled the snappy wire protocol compressor in our application driver.
That happened gradually, on April 21 at ~10:42 UTC, 12:20 UTC, 12:45 UTC and 13:10 UTC.
Network utilization dropped roughly 5x.
No CPU / memory / latency issues were observed.
4. The db2 replication process got stuck again on April 22 at ~22:48 UTC.
Our engineer logged into the db2 instance and restarted the mongodb process.
From his perspective, the mongo shell was functional but showed some signs of unresponsiveness.
As before, there were no signs of other issues and no suspicious entries in the system logs.
As before, the listDatabases command hung for 300 seconds while the replication process on db2 was stalled.
As our monitoring shows, the application correctly switched to reading from db3 thanks to the maxStalenessSeconds driver option.
This time we've collected even more logs, covering the period from when db2 was rebooted until the mongodb service was restarted and the db2 replication lag was gone.
I've uploaded two archives to the support file uploader: one with logs from all dbs and one with the diagnostic data.
Hope that helps with identifying the issue.
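As a sketch of what steps 1 and 3 amount to on the driver side (using the C++ driver purely for illustration; the application itself uses PHP ext-mongodb, and the hostnames and replica set name below are placeholders), both maxStalenessSeconds and wire compression can be requested via the connection string:
#include <mongocxx/client.hpp>
#include <mongocxx/instance.hpp>
#include <mongocxx/uri.hpp>

int main() {
    mongocxx::instance inst{};
    // secondaryPreferred reads skip any secondary whose estimated lag exceeds 90s,
    // which is how the application kept reading fresh data during the incidents;
    // compressors=snappy requires a driver built with snappy support.
    mongocxx::client client{mongocxx::uri{
        "mongodb://db1:27017,db2:27017,db3:27017,db4:27017/"
        "?replicaSet=rs0&readPreference=secondaryPreferred"
        "&maxStalenessSeconds=90&compressors=snappy"}};
    return 0;
}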
|
|
No, actually that is our monitoring tool, written in pure C; it runs some queries about once a minute or so. I do not think it could have affected anything.
|
|
Thanks neanton@gmail.com, a quick question, is this also PHP ext-mongodb 1.7.4 plus some compiler flags?
{ driver: { name: "mongoc", version: "1.14.0" }, os: { type: "Linux", name: "Debian GNU/Linux", version: "10", architecture: "x86_64" }, platform: "cfg=0xa15e80e9 posix=200809 stdc=201710 CC=GCC 8.2.0 CFLAGS="-g -O2 -fdebug-prefix-map=/build/mongo-c-driver-U8U2A9/mongo-c-driver-1.14.0=. -fstack-pr..." }
|
|
|
Some additional digging in the log file led me to this simple monitoring query that our monitoring tool performs:
2020-04-14T22:39:34.647+0000 I COMMAND [conn2774404] command admin.$cmd command: listDatabases { listDatabases: 1, $db: "admin", $readPreference: { mode: "primaryPreferred" }, lsid: { id: UUID("deaf14fd-3ef0-4905-965b-8c08ec4fb068") }, $clusterTime: { clusterTime: Timestamp(1586903674, 883), signature: { hash: BinData(0, 7DB05D0B9C9F809C6F0AB5E89564FDAC4A3EFB97), keyId: 6812519156018054020 } } } numYields:0 ok:0 errMsg:"operation was interrupted" errName:ClientDisconnect errCode:279 reslen:242 locks:{ ParallelBatchWriterMode: { acquireCount: { r: 1 }, acquireWaitCount: { r: 1 }, timeAcquiringMicros: { r: 299798488 } } } protocol:op_msg 300064ms
|
Right after replication stopped, our monitoring tool started timing out. Note the timestamp: it started right after db3 got stuck. All further listDatabases queries were also stuck for 300 seconds, until MongoDB was restarted.
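As an aside, one way to keep such a probe from piling up for 300 seconds per call (a sketch only, using the C++ driver rather than the actual pure-C monitoring tool; the URI and timeout values are examples) is to bound the command with maxTimeMS and the connection with socketTimeoutMS, so a stalled node turns into a quick failure that can be alerted on.
#include <iostream>
#include <bsoncxx/builder/basic/document.hpp>
#include <bsoncxx/builder/basic/kvp.hpp>
#include <mongocxx/client.hpp>
#include <mongocxx/exception/exception.hpp>
#include <mongocxx/instance.hpp>
#include <mongocxx/uri.hpp>

int main() {
    using bsoncxx::builder::basic::kvp;
    using bsoncxx::builder::basic::make_document;

    mongocxx::instance inst{};
    // socketTimeoutMS bounds how long the client waits; maxTimeMS asks the server
    // to interrupt the command itself if it cannot finish in time.
    mongocxx::client client{mongocxx::uri{"mongodb://db3:27017/?socketTimeoutMS=5000"}};
    try {
        client["admin"].run_command(
            make_document(kvp("listDatabases", 1), kvp("maxTimeMS", 2000)));
        std::cout << "probe ok\n";
    } catch (const mongocxx::exception& e) {
        std::cout << "probe failed or timed out: " << e.what() << "\n";
    }
}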
|
|
Hey, Dmitry,
libc was the latest version available in Debian 10 – https://packages.debian.org/buster/libc6; the OS was actually set up from scratch and all software was at the latest versions.
Debian 10 performance is not an issue for us anymore after switching the KVM CPU type as mentioned earlier, and we are not willing to invest more time in it. It might be a good idea on your side to run benchmarks and perhaps add some notes to the production checklist regarding the KVM CPU type.
Regarding drivers, we are using PHP ext-mongodb 1.7.4 – https://pecl.php.net/package/mongodb/1.7.4
We upgraded the PHP drivers before the actual database upgrade procedure, and we don't see any signs that this relates to the driver (no errors on the application side). All of our application servers read a portion of data from secondaries, read a portion from the primary, and write to the primary. Most of the reads from secondaries are tolerant of stale data (~5-10 secs of lag is OK, but not 9h+).
Yes, you are right that the cluster worked perfectly fine for 8 days, but then replication on one node got stuck. We were not able to find any reason for that. Nothing changed on our side: no updates, no changes to application code, etc. Also, it happened quite late at night, so no human factor is suspected.
The ShutdownInProgress you see in the logs was not caused by us. We did not issue any kind of shutdown commands. Only usernames / db names / hostnames were changed in the logs; no lines were removed.
To me it seems like the RS connection pool was marked as expired for some reason https://github.com/mongodb/mongo/blob/r4.2.5/src/mongo/executor/connection_pool.cpp#L1110 and then was never recreated/restarted, maybe due to some internal locking issue or something like that.
From all other perspectives db3 looked healthy to all other RS members, responded to heartbeats, and served read operations successfully.
|
|
Thanks neanton@gmail.com for all the answers and uploaded logs.
I agree that Debian 9 vs Debian 10 performance is a different issue, and if you'd like, we can discuss it in a separate ticket. As for the potential libc issue you've mentioned, did you apply the latest update available?
After reviewing all the data again (both diagnostic.data and the mongod logs), I am still not able to spot anything suspicious on the MongoDB side. The only differences I see on db3 as opposed to the other secondaries are:
- an increased pageheap commit and decommit rate, but the values are not high enough to make this our main suspect
- there was an oplog truncate operation on the primary db1 where we deleted ~12k pages, but if this were an issue, I would have expected it to manifest on all secondaries and not just on db3
There is one thing I've noticed in the mongod logs that I wanted to clarify at this point: could you provide all the driver types and driver versions you currently use, and map these drivers between the primary and secondaries? Also, during the server upgrade process, did you upgrade all of these drivers?
In addition, I understand that the cluster worked fine without any issue for 8 days after the upgrade. Did something change on the application side / driver configuration on the day of the event (Apr 14th)?
Finally, regarding the ShutdownInProgress we've seen in the logs, could you clarify whether the provided logs are redacted and that is the reason we do not see when the shutdown command was issued?
|
|
Logs were uploaded to the secure portal for the relevant time period. A few strings were replaced (db users, db names, hostnames).
Regarding the questions:
- The replica set was updated to 4.2.5 on April 6, 2020, and the update procedure was as per the documentation:
- all nodes were running a 3.4.x version with feature compatibility 3.2, on Debian 9 under KVM
- feature compatibility was switched to 3.4
- all nodes were upgraded one by one to 3.6, then feature compatibility was switched to 3.6
- all nodes were upgraded one by one to 4.0, then feature compatibility was switched to 4.0
- all nodes were upgraded one by one to 4.2, then feature compatibility was switched to 4.2
- all VMs were recreated one by one with Debian 10 and the data disks were moved over from Debian 9
- everything was looking fine, except for higher CPU usage on Debian 10 compared to the Debian 9 machines
- db3 is no different from the other machines
- it was a one-time event so far; we've added replication lag monitoring, and we are in the process of enabling the maxStalenessSeconds PHP driver option for reads from secondaries
Not sure if this matters, but after we moved to Debian 10 we found that CPU usage was 30+ percent higher than on Debian 9.
We ran a series of benchmarks to identify the issue. The same 4.2.5 version on Debian 9 showed 30% less CPU utilization.
Here is the perf output from a Debian 10 machine running the YCSB benchmark workload/workloada with a 15k RPS cap:
47.16% mongod [.] __wt_txn_get_snapshot ◆
|
2.35% mongod [.] snappy::internal::CompressFragment ▒
|
2.07% mongod [.] operator new[] ▒
|
1.51% mongod [.] __wt_hazard_check ▒
|
1.47% [kernel] [k] finish_task_switch ▒
|
1.38% mongod [.] __wt_row_search ▒
|
1.13% [kernel] [k] _raw_spin_unlock_irqrestore ▒
|
1.05% mongod [.] __wt_checksum_sw ▒
|
1.04% mongod [.] tc_deletearray_aligned_nothrow ▒
|
0.86% [kernel] [k] iowrite16 ▒
|
0.65% mongod [.] __wt_page_in_func ▒
|
0.58% [kernel] [k] __softirqentry_text_start ▒
|
0.56% [kernel] [k] copy_user_generic_string ▒
|
0.51% mongod [.] __config_next ▒
|
0.49% mongod [.] tc_malloc ▒
|
0.46% mongod [.] __wt_rec_upd_select ▒
|
0.45% libpthread-2.28.so [.] __pthread_mutex_trylock ▒
|
0.43% mongod [.] __wt_rec_row_leaf ▒
|
0.41% [kernel] [k] do_syscall_64 ▒
|
0.36% [kernel] [k] nft_do_chain ▒
|
0.29% mongod [.] __wt_hazard_set ▒
|
0.28% mongod [.] __wt_cursor_cache_get ▒
|
0.25% libpthread-2.28.so [.] __pthread_mutex_unlock_usercnt ▒
|
0.24% libpthread-2.28.so [.] __pthread_mutex_lock ▒
|
0.23% mongod [.] mongo::LockerImpl::lockBegin ▒
|
0.23% mongod [.] tc_realloc ▒
|
0.22% mongod [.] __txn_oldest_scan ▒
|
0.21% mongod [.] mongo::(anonymous namespace)::receivedCommands(mong▒
|
0.19% [kernel] [k] __nf_conntrack_find_get ▒
|
0.19% mongod [.] mongo::BSONElement::BSONElement ▒
|
0.19% [kernel] [k] run_timer_softirq ▒
|
0.18% [kernel] [k] native_queued_spin_lock_slowpath
|
Here is the perf top output from the Debian 9 machine:
28.89% mongod [.] __wt_txn_get_snapshot ◆
|
4.24% mongod [.] snappy::internal::CompressFragment ▒
|
2.52% mongod [.] __wt_hazard_check ▒
|
2.20% mongod [.] __wt_row_search ▒
|
1.79% mongod [.] operator new[] ▒
|
1.78% mongod [.] __wt_checksum_sw ▒
|
1.69% [kernel] [k] finish_task_switch ▒
|
1.67% [kernel] [k] __lock_text_start ▒
|
1.45% [kernel] [k] iowrite16 ▒
|
1.13% mongod [.] tc_deletearray_aligned_nothrow ▒
|
1.00% mongod [.] __wt_page_in_func ▒
|
0.99% [kernel] [k] copy_user_generic_string ▒
|
0.79% mongod [.] __config_next ▒
|
0.65% [kernel] [k] __softirqentry_text_start ▒
|
0.55% libpthread-2.24.so [.] pthread_mutex_trylock ▒
|
0.54% libc-2.24.so [.] strlen ▒
|
0.50% [kernel] [k] do_syscall_64 ▒
|
0.47% [kernel] [k] ipt_do_table ▒
|
0.46% mongod [.] __wt_hazard_set ▒
|
0.44% mongod [.] tc_malloc ▒
|
0.36% mongod [.] __wt_cursor_cache_get ▒
|
0.32% mongod [.] __block_merge ▒
|
0.31% libpthread-2.24.so [.] pthread_mutex_lock ▒
|
0.29% mongod [.] __wt_block_off_remove_overlap ▒
|
0.29% [kernel] [k] iomap_set_range_uptodate ▒
|
0.29% mongod [.] mongo::(anonymous namespace)::receivedCommands(mong▒
|
0.28% mongod [.] mongo::LockerImpl::lockBegin ▒
|
0.28% mongod [.] __wt_rec_row_leaf ▒
|
0.27% [kernel] [k] run_timer_softirq ▒
|
0.25% libpthread-2.24.so [.] __pthread_mutex_unlock_usercnt ▒
|
0.25% libc-2.24.so [.] 0x000000000008409e ▒
|
0.24% mongod [.] __session_begin_transaction
|
After a series of tests we identified that changing the CPU type from qemu64 to host-passthrough dropped CPU utilization on Debian 10 down to Debian 9 levels. So the slowdown was related to some CPU flags passed to the VM internally (our guess is that it may also be related to Spectre/Meltdown mitigations). Changing the kernel to a higher version on Debian 9 did not make any difference to the results, so our guess is that it's a libc issue. We did not identify which CPU flags affect performance so much, though.
The issue with the stopped replication occurred on the qemu64 CPU type, if that makes any difference. Right now we are enabling host-passthrough on all nodes.
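As a small, hypothetical helper for the flag comparison mentioned above (not something from this ticket), printing the CPU flag line the guest actually sees makes it easy to diff a qemu64 guest against a host-passthrough one:
#include <fstream>
#include <iostream>
#include <string>

int main() {
    // Print the first "flags" line from /proc/cpuinfo; a quick diff of this output
    // between guests shows which CPU features qemu64 hides from the VM.
    std::ifstream cpuinfo("/proc/cpuinfo");
    std::string line;
    while (std::getline(cpuinfo, line)) {
        if (line.rfind("flags", 0) == 0) {
            std::cout << line << "\n";
            break;
        }
    }
}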
|
|
Thank you for the report, neanton@gmail.com. In order to help with the investigation, could you please upload the archived mongod logs from all members of the replica set covering the time of this event? I've created a secure uploader for you. Files uploaded to this portal are visible only to MongoDB employees and are routinely deleted after some time.
Also could you please clarify:
- When exactly was this replica set updated to 4.2.5 and what upgrade procedure was followed?
- Is there anything different with db3 as compared to the rest of the servers?
- Was it a one-time event or do you still experience replication lag?
Thanks,
Dima
|