[SERVER-16623] Cannot allocate memory (wiredtiger) (mms-dev) Created: 21/Dec/14  Updated: 15/Jan/15  Resolved: 06/Jan/15

Status: Closed
Project: Core Server
Component/s: Stability, Storage
Affects Version/s: 2.8.0-rc3, 2.8.0-rc4
Fix Version/s: 2.8.0-rc5

Type: Bug Priority: Major - P3
Reporter: Cailin Nelson Assignee: Michael Cahill (Inactive)
Resolution: Done Votes: 1
Labels: wiredtiger
Remaining Estimate: Not Specified
Time Spent: Not Specified
Original Estimate: Not Specified

Attachments: PNG File 12-21-mem.png     File 54972d6de4b0c8d9d7bcaca1.json     PNG File incident1-oplog.png     PNG File incident1.png     PNG File linux.png     PNG File mmsdev-2015-01-01.png     PNG File mmsdev-2015-01-02.png     PNG File osx.png    
Issue Links:
Related
Backwards Compatibility: Fully Compatible
Operating System: ALL
Participants:

 Description   

I recently upgraded the mms-dev primary from a 2.8.0rc3-pre (e067fff4e3f3079d070ec168f32c24db9a51a944) to 2.8.0-rc3. After about a half an hour, the primary had a segfault with the following message.

This node has not had a fresh sync on 2.8.0-rc3. I believe the last time it had a fresh sync was 2.8.0-rc2.

2014-12-21T20:29:44.685+0000 E STORAGE  [conn1337] WiredTiger (12) [1419193784:682102][8148:0x7f0d3ebdd700], file:collection-8--5314993906140581602.wt, cursor.insert: memory allocation: Cannot allocate memory
2014-12-21T20:29:44.718+0000 F -        [conn1337] Invalid access at address: 0x30b18
2014-12-21T20:29:44.783+0000 F -        [conn1337] Got signal: 11 (Segmentation fault).
 
 0xf0bd99 0xf0b442 0xf0b76e 0x7f0d7e286340 0x13141b8 0x1318967 0x1319f7a 0x12e99a7 0x12ea2bd 0x12e7e6d 0x129fa9e 0x12d5d06 0xd33918 0xd30c0d 0x8ebb5c 0xc12e8f 0xc109c3 0xa08df1 0xa099cc 0xba969a 0xbaa42d 0x989c70 0x98aec2 0x98b405 0x98d4bd 0x9a9784 0x9aa5d3 0x9ab08b 0xb77c2a 0xa8af55 0x7e1770 0xec9d61 0x7f0d7e27e182 0x7f0d7d37efbd
----- BEGIN BACKTRACE -----
{"backtrace":[{"b":"400000","o":"B0BD99"},{"b":"400000","o":"B0B442"},{"b":"400000","o":"B0B76E"},{"b":"7F0D7E276000","o":"10340"},{"b":"400000","o":"F141B8"},{"b":"400000","o":"F18967"},{"b":"400000","o":"F19F7A"},{"b":"400000","o":"EE99A7"},{"b":"400000","o":"EEA2BD"},{"b":"400000","o":"EE7E6D"},{"b":"400000","o":"E9FA9E"},{"b":"400000","o":"ED5D06"},{"b":"400000","o":"933918"},{"b":"400000","o":"930C0D"},{"b":"400000","o":"4EBB5C"},{"b":"400000","o":"812E8F"},{"b":"400000","o":"8109C3"},{"b":"400000","o":"608DF1"},{"b":"400000","o":"6099CC"},{"b":"400000","o":"7A969A"},{"b":"400000","o":"7AA42D"},{"b":"400000","o":"589C70"},{"b":"400000","o":"58AEC2"},{"b":"400000","o":"58B405"},{"b":"400000","o":"58D4BD"},{"b":"400000","o":"5A9784"},{"b":"400000","o":"5AA5D3"},{"b":"400000","o":"5AB08B"},{"b":"400000","o":"777C2A"},{"b":"400000","o":"68AF55"},{"b":"400000","o":"3E1770"},{"b":"400000","o":"AC9D61"},{"b":"7F0D7E276000","o":"8182"},{"b":"7F0D7D284000","o":"FAFBD"}],"processInfo":{ "mongodbVersion" : "2.8.0-rc3", "gitVersion" : "2d679247f17dab05a492c8b6d2c250dab18e54f2", "uname" : { "sysname" : "Linux", "release" : "3.13.0-36-generic", "version" : "#63-Ubuntu SMP Wed Sep 3 21:30:07 UTC 2014", "machine" : "x86_64" }, "somap" : [ { "elfType" : 2, "b" : "400000" }, { "b" : "7FFF82EFE000", "elfType" : 3 }, { "b" : "7F0D7E276000", "path" : "/lib/x86_64-linux-gnu/libpthread.so.0", "elfType" : 3 }, { "b" : "7F0D7E06E000", "path" : "/lib/x86_64-linux-gnu/librt.so.1", "elfType" : 3 }, { "b" : "7F0D7DE6A000", "path" : "/lib/x86_64-linux-gnu/libdl.so.2", "elfType" : 3 }, { "b" : "7F0D7DB66000", "path" : "/usr/lib/x86_64-linux-gnu/libstdc++.so.6", "elfType" : 3 }, { "b" : "7F0D7D860000", "path" : "/lib/x86_64-linux-gnu/libm.so.6", "elfType" : 3 }, { "b" : "7F0D7D64A000", "path" : "/lib/x86_64-linux-gnu/libgcc_s.so.1", "elfType" : 3 }, { "b" : "7F0D7D284000", "path" : "/lib/x86_64-linux-gnu/libc.so.6", "elfType" : 3 }, { "b" : "7F0D7E494000", "path" : "/lib64/ld-linux-x86-64.so.2", "elfType" : 3 } ] }}
 mongod(_ZN5mongo15printStackTraceERSo+0x29) [0xf0bd99]
 mongod(+0xB0B442) [0xf0b442]
 mongod(+0xB0B76E) [0xf0b76e]
 libpthread.so.0(+0x10340) [0x7f0d7e286340]
 mongod(+0xF141B8) [0x13141b8]
 mongod(+0xF18967) [0x1318967]
 mongod(__wt_reconcile+0x1AA) [0x1319f7a]
 mongod(+0xEE99A7) [0x12e99a7]
 mongod(__wt_evict+0x7D) [0x12ea2bd]
 mongod(__wt_evict_page+0x2D) [0x12e7e6d]
 mongod(__wt_btcur_insert+0x12CE) [0x129fa9e]
 mongod(+0xED5D06) [0x12d5d06]
 mongod(_ZN5mongo21WiredTigerRecordStore12insertRecordEPNS_16OperationContextEPKcib+0xC8) [0xd33918]
 mongod(_ZN5mongo21WiredTigerRecordStore12insertRecordEPNS_16OperationContextEPKNS_9DocWriterEb+0x8D) [0xd30c0d]
 mongod(_ZN5mongo10Collection14insertDocumentEPNS_16OperationContextEPKNS_9DocWriterEb+0x5C) [0x8ebb5c]
 mongod(+0x812E8F) [0xc12e8f]
 mongod(_ZN5mongo4repl5logOpEPNS_16OperationContextEPKcS4_RKNS_7BSONObjEPS5_Pbb+0xA3) [0xc109c3]
 mongod(_ZN5mongo11UpdateStage8doInsertEv+0x511) [0xa08df1]
 mongod(_ZN5mongo11UpdateStage4workEPm+0xCC) [0xa099cc]
 mongod(_ZN5mongo12PlanExecutor7getNextEPNS_7BSONObjEPNS_8RecordIdE+0x9A) [0xba969a]
 mongod(_ZN5mongo12PlanExecutor11executePlanEv+0x3D) [0xbaa42d]
 mongod(_ZN5mongo18WriteBatchExecutor10execUpdateERKNS_12BatchItemRefEPNS_7BSONObjEPPNS_16WriteErrorDetailE+0x610) [0x989c70]
 mongod(_ZN5mongo18WriteBatchExecutor11bulkExecuteERKNS_21BatchedCommandRequestEPSt6vectorIPNS_19BatchedUpsertDetailESaIS6_EEPS4_IPNS_16WriteErrorDetailESaISB_EE+0x1F2) [0x98aec2]
 mongod(_ZN5mongo18WriteBatchExecutor12executeBatchERKNS_21BatchedCommandRequestEPNS_22BatchedCommandResponseE+0x395) [0x98b405]
 mongod(_ZN5mongo8WriteCmd3runEPNS_16OperationContextERKSsRNS_7BSONObjEiRSsRNS_14BSONObjBuilderEb+0x15D) [0x98d4bd]
 mongod(_ZN5mongo12_execCommandEPNS_16OperationContextEPNS_7CommandERKSsRNS_7BSONObjEiRSsRNS_14BSONObjBuilderEb+0x34) [0x9a9784]
 mongod(_ZN5mongo7Command11execCommandEPNS_16OperationContextEPS0_iPKcRNS_7BSONObjERNS_14BSONObjBuilderEb+0xC13) [0x9aa5d3]
 mongod(_ZN5mongo12_runCommandsEPNS_16OperationContextEPKcRNS_7BSONObjERNS_11_BufBuilderINS_16TrivialAllocatorEEERNS_14BSONObjBuilderEbi+0x28B) [0x9ab08b]
 mongod(_ZN5mongo8runQueryEPNS_16OperationContextERNS_7MessageERNS_12QueryMessageERNS_5CurOpES3_b+0x76A) [0xb77c2a]
 mongod(_ZN5mongo16assembleResponseEPNS_16OperationContextERNS_7MessageERNS_10DbResponseERKNS_11HostAndPortEb+0xB25) [0xa8af55]
 mongod(_ZN5mongo16MyMessageHandler7processERNS_7MessageEPNS_21AbstractMessagingPortEPNS_9LastErrorE+0xE0) [0x7e1770]
 mongod(_ZN5mongo17PortMessageServer17handleIncomingMsgEPv+0x411) [0xec9d61]
 libpthread.so.0(+0x8182) [0x7f0d7e27e182]
 libc.so.6(clone+0x6D) [0x7f0d7d37efbd]
-----  END BACKTRACE  -----



 Comments   
Comment by Michael Cahill (Inactive) [ 06/Jan/15 ]

The fix to limit the maximum amount of cached memory per session will be in the next merge of WiredTiger.

Comment by Cailin Nelson [ 21/Dec/14 ]

Please see attached for full MMS ping, 1 minute before the error. The host in question is ip-10-169-132-210.ec2.internal:27018, the only one with a WT sub-document inside of serverStatus.

Here is the WT sub-document:

"wiredTiger": {
"LSM": {
"tree maintenance operations discarded": 0,
"rows merged in an LSM tree": 0,
"merge work units currently queued": 0,
"switch work units currently queued": 0,
"sleep for LSM checkpoint throttle": 0,
"application work units currently queued": 0,
"sleep for LSM merge throttle": 0,
"tree maintenance operations executed": 0,
"tree queue hit maximum": 0,
"tree maintenance operations scheduled": 0
},
"data-handle": {
"connection candidate referenced": 0,
"session dhandles swept": 66,
"connection sweeps": 6,
"session sweep attempts": 410,
"connection dhandles swept": 1,
"connection time-of-death sets": 969
},
"connection": {
"memory allocations": 573386,
"memory frees": 268617,
"files currently open": 1161,
"pthread mutex shared lock write-lock calls": 3405,
"pthread mutex condition wait calls": 898,
"pthread mutex shared lock read-lock calls": 158281,
"total write I/Os": 119803,
"memory re-allocations": 1345,
"total read I/Os": 143616
},
"cursor": {
"cursor reset calls": 705626,
"cursor search calls": 667668,
"cursor update calls": 9179,
"cursor search near calls": 21986,
"cursor create calls": 37667,
"cursor insert calls": 23586,
"cursor remove calls": 123572,
"cursor prev calls": 389013,
"cursor next calls": 144209
},
"cache": {
"maximum bytes configured": 3221225472,
"eviction server candidate queue empty when topping up": 0,
"pages selected for eviction unable to be evicted": 352,
"eviction server evicting pages": 0,
"internal pages evicted": 0,
"tracked dirty bytes in the cache": 9544765,
"pages evicted because they exceeded the in-memory maximum": 1410,
"page split during eviction deepened the tree": 0,
"failed eviction of pages that exceeded the in-memory maximum": 352,
"eviction server populating queue, but not evicting pages": 0,
"checkpoint blocked page eviction": 0,
"pages currently held in the cache": 27028,
"pages evicted by application threads": 0,
"bytes written from cache": 97504316,
"eviction server unable to reach eviction goal": 0,
"bytes currently in the cache": 696782087,
"pages split during eviction": 4,
"pages walked for eviction": 0,
"pages read into cache": 141202,
"in-memory page splits": 0,
"tracked dirty pages in the cache": 435,
"bytes read into cache": 4480708607,
"pages written from cache": 4078,
"hazard pointer blocked page eviction": 352,
"modified pages evicted": 1410,
"unmodified pages evicted": 0,
"eviction server candidate queue not empty when topping up": 0
},
"transaction": {
"transaction checkpoint currently running": 0,
"transaction range of IDs currently pinned": 0,
"transaction checkpoint min time (msecs)": 6,
"transaction failures due to cache overflow": 0,
"transaction checkpoints": 3,
"transaction begins": 131610,
"transactions committed": 131296,
"transaction checkpoint total time (msecs)": 361,
"transactions rolled back": 72,
"transaction checkpoint most recent time (msecs)": 312,
"transaction checkpoint max time (msecs)": 312
},
"session": {
"open session count": 94,
"open cursor count": 5153
},
"uri": "statistics:",
"log": {
"total in-memory size of compressed records": 25433066,
"maximum log file size": 104857600,
"yields waiting for previous log file close": 0,
"pre-allocated log files used": 1,
"slots selected for switching that were unavailable": 8786,
"log records compressed": 12538,
"log buffer size increases": 1,
"log scan records requiring two reads": 164,
"log scan operations": 2,
"record size exceeded maximum": 0,
"number of pre-allocated log files to create": 1,
"log read operations": 0,
"log sync operations": 1,
"log records too small to compress": 110905,
"records processed by log scan": 240,
"total log buffer size": 466599936,
"consolidated slot joins": 123896,
"log records not compressed": 917,
"total size of compressed records": 9716680,
"consolidated slot join transitions": 1,
"logging bytes consolidated": 110259840,
"log bytes written": 117803904,
"failed to find a slot large enough for record": 1,
"consolidated slot join races": 162,
"log write operations": 124316,
"consolidated slot closures": 114854,
"pre-allocated log files prepared": 2,
"log bytes of payload data": 106381219
},
"async": {
"current work queue length": 0,
"number of allocation state races": 0,
"number of times operation allocation failed": 0,
"maximum work queue length": 0,
"number of operation slots viewed for allocation": 0,
"total search calls": 0,
"total allocations": 0,
"number of times worker found no work": 0,
"total compact calls": 0,
"total remove calls": 0,
"number of flush calls": 0,
"total update calls": 0,
"total insert calls": 0
},
"reconciliation": {
"page reconciliation calls": 5224,
"split objects currently awaiting free": 0,
"page reconciliation calls for eviction": 1393,
"split bytes currently awaiting free": 0
},
"block-manager": {
"blocks pre-loaded": 26414,
"mapped bytes read": 0,
"blocks read": 142044,
"mapped blocks read": 0,
"bytes read": 4090253312,
"bytes written": 111697920,
"blocks written": 4432
}
},

Comment by Daniel Pasette (Inactive) [ 21/Dec/14 ]

may be SERVER-16546. if we have a serverStatus sample with wiredTiger section from before crash might be able to make positive id.

Comment by Cailin Nelson [ 21/Dec/14 ]

Yup. Updated.

Comment by Andy Schwerin [ 21/Dec/14 ]

Could you post the git hash of the rc3-pre mongod?

Comment by Cailin Nelson [ 21/Dec/14 ]

The node was immediately restarted by an MMS Automation Agent. It then died again with:

2014-12-21T20:30:55.085+0000 I REPL     [ReplicationExecutor] syncing from: ip-10-218-181-240.ec2.internal:27017
2014-12-21T20:30:55.086+0000 I REPL     [SyncSourceFeedback] replset setting syncSourceFeedback to ip-10-218-181-240.ec2.internal:27017
2014-12-21T20:30:55.116+0000 F -        [repl writer worker 14] Invalid access at address: 0x1
2014-12-21T20:30:55.133+0000 F -        [repl writer worker 14] Got signal: 11 (Segmentation fault).
 
 0xf0bd99 0xf0b442 0xf0b76e 0x7f941e571340 0xf5eab3 0xf5ed96 0xf6b6e0 0xf6b77b 0x13abbb0 0x7e09bb 0xd44c9d 0x8f217d 0xafd4f9 0xb7d5cf 0xaf8299 0xc14d22 0xc6c320 0xc6f36d 0xead5cb 0xf580a4 0x7f941e569182 0x7f941d669fbd
----- BEGIN BACKTRACE -----
{"backtrace":[{"b":"400000","o":"B0BD99"},{"b":"400000","o":"B0B442"},{"b":"400000","o":"B0B76E"},{"b":"7F941E561000","o":"10340"},{"b":"400000","o":"B5EAB3"},{"b":"400000","o":"B5ED96"},{"b":"400000","o":"B6B6E0"},{"b":"400000","o":"B6B77B"},{"b":"400000","o":"FABBB0"},{"b":"400000","o":"3E09BB"},{"b":"400000","o":"944C9D"},{"b":"400000","o":"4F217D"},{"b":"400000","o":"6FD4F9"},{"b":"400000","o":"77D5CF"},{"b":"400000","o":"6F8299"},{"b":"400000","o":"814D22"},{"b":"400000","o":"86C320"},{"b":"400000","o":"86F36D"},{"b":"400000","o":"AAD5CB"},{"b":"400000","o":"B580A4"},{"b":"7F941E561000","o":"8182"},{"b":"7F941D56F000","o":"FAFBD"}],"processInfo":{ "mongodbVersion" : "2.8.0-rc3", "gitVersion" : "2d679247f17dab05a492c8b6d2c250dab18e54f2", "uname" : { "sysname" : "Linux", "release" : "3.13.0-36-generic", "version" : "#63-Ubuntu SMP Wed Sep 3 21:30:07 UTC 2014", "machine" : "x86_64" }, "somap" : [ { "elfType" : 2, "b" : "400000" }, { "b" : "7FFF85EFE000", "elfType" : 3 }, { "b" : "7F941E561000", "path" : "/lib/x86_64-linux-gnu/libpthread.so.0", "elfType" : 3 }, { "b" : "7F941E359000", "path" : "/lib/x86_64-linux-gnu/librt.so.1", "elfType" : 3 }, { "b" : "7F941E155000", "path" : "/lib/x86_64-linux-gnu/libdl.so.2", "elfType" : 3 }, { "b" : "7F941DE51000", "path" : "/usr/lib/x86_64-linux-gnu/libstdc++.so.6", "elfType" : 3 }, { "b" : "7F941DB4B000", "path" : "/lib/x86_64-linux-gnu/libm.so.6", "elfType" : 3 }, { "b" : "7F941D935000", "path" : "/lib/x86_64-linux-gnu/libgcc_s.so.1", "elfType" : 3 }, { "b" : "7F941D56F000", "path" : "/lib/x86_64-linux-gnu/libc.so.6", "elfType" : 3 }, { "b" : "7F941E77F000", "path" : "/lib64/ld-linux-x86-64.so.2", "elfType" : 3 } ] }}
 mongod(_ZN5mongo15printStackTraceERSo+0x29) [0xf0bd99]
 mongod(+0xB0B442) [0xf0b442]
 mongod(+0xB0B76E) [0xf0b76e]
 libpthread.so.0(+0x10340) [0x7f941e571340]
 mongod(_ZN8tcmalloc15CentralFreeList18ReleaseListToSpansEPv+0x13) [0xf5eab3]
 mongod(_ZN8tcmalloc15CentralFreeList11InsertRangeEPvS1_i+0x56) [0xf5ed96]
 mongod(_ZN8tcmalloc11ThreadCache21ReleaseToCentralCacheEPNS0_8FreeListEmi+0x100) [0xf6b6e0]
 mongod(_ZN8tcmalloc11ThreadCache11ListTooLongEPNS0_8FreeListEm+0x1B) [0xf6b77b]
 mongod(tc_delete+0x1F0) [0x13abbb0]
 mongod(_ZNSt8_Rb_treeISsSsSt9_IdentityISsESt4lessISsESaISsEE8_M_eraseEPSt13_Rb_tree_nodeISsE+0x4B) [0x7e09bb]
 mongod(_ZN5mongo15UpdateIndexData5clearEv+0xD) [0xd44c9d]
 mongod(_ZN5mongo19CollectionInfoCache16computeIndexKeysEPNS_16OperationContextE+0x2D) [0x8f217d]
 mongod(_ZNK5mongo19UpdateLifecycleImpl12getIndexKeysEPNS_16OperationContextE+0x29) [0xafd4f9]
 mongod(_ZN5mongo17getExecutorUpdateEPNS_16OperationContextEPNS_10CollectionEPNS_12ParsedUpdateEPNS_7OpDebugEPPNS_12PlanExecutorE+0x11F) [0xb7d5cf]
 mongod(_ZN5mongo6updateEPNS_16OperationContextEPNS_8DatabaseERKNS_13UpdateRequestEPNS_7OpDebugE+0xD9) [0xaf8299]
 mongod(_ZN5mongo4repl21applyOperation_inlockEPNS_16OperationContextEPNS_8DatabaseERKNS_7BSONObjEbb+0x11B2) [0xc14d22]
 mongod(_ZN5mongo4repl8SyncTail9syncApplyEPNS_16OperationContextERKNS_7BSONObjEb+0x310) [0xc6c320]
 mongod(_ZN5mongo4repl14multiSyncApplyERKSt6vectorINS_7BSONObjESaIS2_EEPNS0_8SyncTailE+0x5D) [0xc6f36d]
 mongod(_ZN5mongo10threadpool6Worker4loopERKSs+0x2FB) [0xead5cb]
 mongod(+0xB580A4) [0xf580a4]
 libpthread.so.0(+0x8182) [0x7f941e569182]
 libc.so.6(clone+0x6D) [0x7f941d669fbd]
-----  END BACKTRACE  -----

It was then restarted again, and so far is running ok.

Generated at Thu Feb 08 03:41:43 UTC 2024 using Jira 9.7.1#970001-sha1:2222b88b221c4928ef0de3161136cc90c8356a66.