Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-19882

Excessive memory utilisation - open cursor count very high

    • Type: Icon: Bug Bug
    • Resolution: Duplicate
    • Priority: Icon: Critical - P2 Critical - P2
    • None
    • Affects Version/s: None
    • Component/s: WiredTiger
    • Labels:
      None
    • Linux

      We are using a MongoDB WiredTiger (3.0.4) replica set on our production environment hosted on 32GB , 4 CPU instances.

      [root@prmddata0015apse01 ankit]# uname -a
      Linux prmddata0015apse01.in.bsbportal.com 3.14.44-32.39.amzn1.x86_64 #1 SMP Thu Jun 11 20:33:38 UTC 2015 x86_64 x86_64 x86_64 GNU/Linux
      

      We are seeing very high memory utilisation and have configured Swap as our MongoDB process has earlier been killed by the OOM killer.
      Also, the wired tiger cursor counts are very high so not sure if its related to https://jira.mongodb.org/browse/SERVER-17386-

      "session" : {
      			"open cursor count" : 235088,
      			"open session count" : 5147
      		},
      

      Here is the output from the top command -

       PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND                                                                                                                                                                      
       3558 mongod    20   0 65.4g  26g    0 S 182.7 88.7  67395:24 mongod     
      

      Here is the output from free -m -

      [root@prmddata0015apse01 ankit]# free -m
                   total       used       free     shared    buffers     cached
      Mem:         30683      30469        213          0          2        622
      -/+ buffers/cache:      29844        838
      Swap:        30719       4695      26024
      

      As you can see, we are already utilising swap space which is leading to increased latency.

      Here is the output from:

      > db.serverStatus({tcmalloc:true}) - 
      {
      	"host" : "prmddata0015apse01.in.bsbportal.com",
      	"version" : "3.0.4",
      	"process" : "mongod",
      	"pid" : NumberLong(3558),
      	"uptime" : 2821369,
      	"uptimeMillis" : NumberLong("2821368555"),
      	"uptimeEstimate" : 2768311,
      	"localTime" : ISODate("2015-08-12T06:06:00.379Z"),
      	"asserts" : {
      		"regular" : 0,
      		"warning" : 0,
      		"msg" : 0,
      		"user" : 7818,
      		"rollovers" : 0
      	},
      	"connections" : {
      		"current" : 23554,
      		"available" : 56446,
      		"totalCreated" : NumberLong(1104060)
      	},
      	"cursors" : {
      		"note" : "deprecated, use server status metrics",
      		"clientCursors_size" : 1,
      		"totalOpen" : 1,
      		"pinned" : 0,
      		"totalNoTimeout" : 0,
      		"timedOut" : 45
      	},
      	"extra_info" : {
      		"note" : "fields vary by platform",
      		"heap_usage_bytes" : -585593184,
      		"page_faults" : 1499960
      	},
      	"globalLock" : {
      		"totalTime" : NumberLong("2821368564000"),
      		"currentQueue" : {
      			"total" : 0,
      			"readers" : 0,
      			"writers" : 0
      		},
      		"activeClients" : {
      			"total" : 23580,
      			"readers" : 0,
      			"writers" : 2
      		}
      	},
      	"locks" : {
      		"Global" : {
      			"acquireCount" : {
      				"r" : NumberLong("5272572429"),
      				"w" : NumberLong(1301332620),
      				"R" : NumberLong(4),
      				"W" : NumberLong(1632)
      			},
      			"acquireWaitCount" : {
      				"r" : NumberLong(49155740),
      				"w" : NumberLong(504028),
      				"R" : NumberLong(1),
      				"W" : NumberLong(11)
      			},
      			"timeAcquiringMicros" : {
      				"r" : NumberLong("3928535306845"),
      				"w" : NumberLong("135662589249"),
      				"R" : NumberLong(1743235313),
      				"W" : NumberLong("1880903500420")
      			}
      		},
      		"Database" : {
      			"acquireCount" : {
      				"r" : NumberLong(1985622220),
      				"w" : NumberLong(1301328878),
      				"R" : NumberLong(351),
      				"W" : NumberLong(3742)
      			},
      			"acquireWaitCount" : {
      				"r" : NumberLong(4825),
      				"w" : NumberLong(3827),
      				"R" : NumberLong(41),
      				"W" : NumberLong(29)
      			},
      			"timeAcquiringMicros" : {
      				"r" : NumberLong("573867055983"),
      				"w" : NumberLong("5562738569"),
      				"R" : NumberLong(15087365),
      				"W" : NumberLong(987128840)
      			}
      		},
      		"Collection" : {
      			"acquireCount" : {
      				"r" : NumberLong(1374098870),
      				"w" : NumberLong(721932046)
      			}
      		},
      		"oplog" : {
      			"acquireCount" : {
      				"r" : NumberLong(612956693),
      				"w" : NumberLong(579396834)
      			}
      		}
      	},
      	"network" : {
      		"bytesIn" : 888171586121,
      		"bytesOut" : NumberLong("5932457060637"),
      		"numRequests" : 2742864996
      	},
      	"opcounters" : {
      		"insert" : 9580210,
      		"query" : 811601952,
      		"update" : 584348397,
      		"delete" : 5611241,
      		"getmore" : 356223677,
      		"command" : 975447275
      	},
      	"opcountersRepl" : {
      		"insert" : 72,
      		"query" : 0,
      		"update" : 5266,
      		"delete" : 7,
      		"getmore" : 0,
      		"command" : 0
      	},
      	"repl" : {
      		"setName" : "music_production",
      		"setVersion" : 22,
      		"ismaster" : true,
      		"secondary" : false,
      		"hosts" : [
      			"prmddata0014apse01.in.bsbportal.com:27017",
      			"prmddata0015apse01.in.bsbportal.com:27017"
      		],
      		"arbiters" : [
      			"prmdarbi0006apse01.in.bsbportal.com:27017"
      		],
      		"primary" : "prmddata0015apse01.in.bsbportal.com:27017",
      		"me" : "prmddata0015apse01.in.bsbportal.com:27017",
      		"electionId" : ObjectId("559fe24e594aa31bd054be49"),
      		"rbid" : 1914364434
      	},
      	"storageEngine" : {
      		"name" : "wiredTiger"
      	},
      	"tcmalloc" : {
      		"generic" : {
      			"current_allocated_bytes" : NumberLong("29481841624"),
      			"heap_size" : NumberLong("44665241600")
      		},
      		"tcmalloc" : {
      			"pageheap_free_bytes" : 893059072,
      			"pageheap_unmapped_bytes" : NumberLong("11006992384"),
      			"max_total_thread_cache_bytes" : NumberLong(1073741824),
      			"current_total_thread_cache_bytes" : NumberLong("2332452056"),
      			"central_cache_free_bytes" : 936993512,
      			"transfer_cache_free_bytes" : 14364672,
      			"thread_cache_free_bytes" : NumberLong("2332376072"),
      			"aggressive_memory_decommit" : 0
      		},
      		"formattedString" : "------------------------------------------------\nMALLOC:    29481389824 (28115.6 MiB) Bytes in use by application\nMALLOC: +    893124608 (  851.8 MiB) Bytes in page heap freelist\nMALLOC: +    936993880 (  893.6 MiB) Bytes in central cache freelist\nMALLOC: +     14364672 (   13.7 MiB) Bytes in transfer cache freelist\nMALLOC: +   2332376232 ( 2224.3 MiB) Bytes in thread cache freelists\nMALLOC: +    145158304 (  138.4 MiB) Bytes in malloc metadata\nMALLOC:   ------------\nMALLOC: =  33803407520 (32237.4 MiB) Actual memory used (physical + swap)\nMALLOC: +  11006992384 (10497.1 MiB) Bytes released to OS (aka unmapped)\nMALLOC:   ------------\nMALLOC: =  44810399904 (42734.5 MiB) Virtual address space used\nMALLOC:\nMALLOC:         689802              Spans in use\nMALLOC:          23607              Thread heaps in use\nMALLOC:           8192              Tcmalloc page size\n------------------------------------------------\nCall ReleaseFreeMemory() to release freelist memory to the OS (via madvise()).\nBytes released to the OS take up virtual address space but no physical memory.\n"
      	},
      	"wiredTiger" : {
      		"uri" : "statistics:",
      		"LSM" : {
      			"sleep for LSM checkpoint throttle" : 0,
      			"sleep for LSM merge throttle" : 0,
      			"rows merged in an LSM tree" : 0,
      			"application work units currently queued" : 0,
      			"merge work units currently queued" : 0,
      			"tree queue hit maximum" : 0,
      			"switch work units currently queued" : 0,
      			"tree maintenance operations scheduled" : 0,
      			"tree maintenance operations discarded" : 0,
      			"tree maintenance operations executed" : 0
      		},
      		"async" : {
      			"number of allocation state races" : 0,
      			"number of operation slots viewed for allocation" : 0,
      			"current work queue length" : 0,
      			"number of flush calls" : 0,
      			"number of times operation allocation failed" : 0,
      			"maximum work queue length" : 0,
      			"number of times worker found no work" : 0,
      			"total allocations" : 0,
      			"total compact calls" : 0,
      			"total insert calls" : 0,
      			"total remove calls" : 0,
      			"total search calls" : 0,
      			"total update calls" : 0
      		},
      		"block-manager" : {
      			"mapped bytes read" : 0,
      			"bytes read" : NumberLong("9597345456128"),
      			"bytes written" : NumberLong("3042089844736"),
      			"mapped blocks read" : 0,
      			"blocks pre-loaded" : 3871,
      			"blocks read" : 727933230,
      			"blocks written" : 379101322
      		},
      		"cache" : {
      			"tracked dirty bytes in the cache" : 215059760,
      			"tracked bytes belonging to internal pages in the cache" : 918492649765,
      			"bytes currently in the cache" : 12033032810,
      			"tracked bytes belonging to leaf pages in the cache" : NumberLong("9223372036854775807"),
      			"maximum bytes configured" : 15032385536,
      			"tracked bytes belonging to overflow pages in the cache" : 0,
      			"bytes read into cache" : NumberLong("11902116813693"),
      			"bytes written from cache" : NumberLong("6852502157315"),
      			"pages evicted by application threads" : 6746077,
      			"checkpoint blocked page eviction" : 250118,
      			"unmodified pages evicted" : 490787810,
      			"page split during eviction deepened the tree" : 77,
      			"modified pages evicted" : 236200310,
      			"pages selected for eviction unable to be evicted" : 32382187,
      			"pages evicted because they exceeded the in-memory maximum" : 329116,
      			"pages evicted because they had chains of deleted items" : 2951170,
      			"failed eviction of pages that exceeded the in-memory maximum" : 1105451,
      			"hazard pointer blocked page eviction" : 24134870,
      			"internal pages evicted" : 55090162,
      			"maximum page size at eviction" : 255390207,
      			"eviction server candidate queue empty when topping up" : 1340167,
      			"eviction server candidate queue not empty when topping up" : 14616238,
      			"eviction server evicting pages" : 14966732,
      			"eviction server populating queue, but not evicting pages" : 14800262,
      			"eviction server unable to reach eviction goal" : 0,
      			"pages split during eviction" : 1818399,
      			"pages walked for eviction" : NumberLong("1800640322422"),
      			"eviction worker thread evicting pages" : 720198162,
      			"in-memory page splits" : 2671,
      			"percentage overhead" : 8,
      			"tracked dirty pages in the cache" : 2714,
      			"pages currently held in the cache" : 196396,
      			"pages read into cache" : 726180195,
      			"pages written from cache" : 375322582
      		},
      		"connection" : {
      			"pthread mutex condition wait calls" : 1014004963,
      			"files currently open" : 126,
      			"memory allocations" : 24430212601,
      			"memory frees" : 23894944135,
      			"memory re-allocations" : 177443074,
      			"total read I/Os" : 728214804,
      			"pthread mutex shared lock read-lock calls" : 26445784,
      			"pthread mutex shared lock write-lock calls" : 6480656,
      			"total write I/Os" : 905695052
      		},
      		"cursor" : {
      			"cursor create calls" : 13213008,
      			"cursor insert calls" : 1445490561,
      			"cursor next calls" : 67756068722,
      			"cursor prev calls" : 1309064,
      			"cursor remove calls" : 280364618,
      			"cursor reset calls" : 11108972495,
      			"cursor search calls" : 2375311150,
      			"cursor search near calls" : 5152362290,
      			"cursor update calls" : 0
      		},
      		"data-handle" : {
      			"connection dhandles swept" : 2,
      			"connection candidate referenced" : 1765481,
      			"connection sweeps" : 547471,
      			"connection time-of-death sets" : 1828812,
      			"session dhandles swept" : 2460857,
      			"session sweep attempts" : 212068
      		},
      		"log" : {
      			"log buffer size increases" : 74,
      			"total log buffer size" : 44255232,
      			"log bytes of payload data" : NumberLong("2146119367965"),
      			"log bytes written" : NumberLong("2182457885056"),
      			"yields waiting for previous log file close" : 0,
      			"total size of compressed records" : NumberLong("2145236240592"),
      			"total in-memory size of compressed records" : NumberLong("11190656412523"),
      			"log records too small to compress" : 5389279,
      			"log records not compressed" : 3113697,
      			"log records compressed" : 568492909,
      			"maximum log file size" : 104857600,
      			"pre-allocated log files prepared" : 20821,
      			"number of pre-allocated log files to create" : 1,
      			"pre-allocated log files used" : 20820,
      			"log read operations" : 0,
      			"log release advances write LSN" : 22179050,
      			"records processed by log scan" : 10,
      			"log scan records requiring two reads" : 0,
      			"log scan operations" : 5,
      			"consolidated slot closures" : 526520855,
      			"logging bytes consolidated" : NumberLong("2182872173696"),
      			"consolidated slot joins" : 577017335,
      			"consolidated slot join races" : 91139,
      			"slots selected for switching that were unavailable" : 55632080203,
      			"record size exceeded maximum" : 0,
      			"failed to find a slot large enough for record" : 68,
      			"consolidated slot join transitions" : 65969364,
      			"log sync operations" : 20693734,
      			"log sync_dir operations" : 20799,
      			"log server thread advances write LSN" : 504344650,
      			"log write operations" : 576981534
      		},
      		"reconciliation" : {
      			"page reconciliation calls" : 353454547,
      			"page reconciliation calls for eviction" : 174417760,
      			"split bytes currently awaiting free" : 367983,
      			"split objects currently awaiting free" : 14
      		},
      		"session" : {
      			"open cursor count" : 235151,
      			"open session count" : 5147
      		},
      		"thread-yield" : {
      			"page acquire busy blocked" : 954,
      			"page acquire eviction blocked" : 96034,
      			"page acquire locked blocked" : 2877324617,
      			"page acquire read blocked" : 502415742,
      			"page acquire time sleeping (usecs)" : 116917461502
      		},
      		"transaction" : {
      			"transaction begins" : 3177756307,
      			"transaction checkpoints" : 44237,
      			"transaction checkpoint generation" : 44237,
      			"transaction checkpoint currently running" : 0,
      			"transaction checkpoint max time (msecs)" : 123577,
      			"transaction checkpoint min time (msecs)" : 6,
      			"transaction checkpoint most recent time (msecs)" : 2736,
      			"transaction checkpoint total time (msecs)" : 95515810,
      			"transactions committed" : 576710308,
      			"transaction failures due to cache overflow" : 0,
      			"transaction range of IDs currently pinned by a checkpoint" : 0,
      			"transaction range of IDs currently pinned" : 15,
      			"transactions rolled back" : 2620348972
      		},
      		"concurrentTransactions" : {
      			"write" : {
      				"out" : 1,
      				"available" : 127,
      				"totalTickets" : 128
      			},
      			"read" : {
      				"out" : 1,
      				"available" : 127,
      				"totalTickets" : 128
      			}
      		}
      	},
      	"writeBacksQueued" : false,
      	"mem" : {
      		"bits" : 64,
      		"resident" : 27240,
      		"virtual" : 67000,
      		"supported" : true,
      		"mapped" : 0,
      		"mappedWithJournal" : 0
      	},
      	"metrics" : {
      		"commands" : {
      			"<UNKNOWN>" : NumberLong(2),
      			"_isSelf" : {
      				"failed" : NumberLong(0),
      				"total" : NumberLong(2)
      			},
      			"buildInfo" : {
      				"failed" : NumberLong(0),
      				"total" : NumberLong(19609)
      			},
      			"collStats" : {
      				"failed" : NumberLong(3),
      				"total" : NumberLong(14)
      			},
      			"count" : {
      				"failed" : NumberLong(0),
      				"total" : NumberLong(3899113)
      			},
      			"createIndexes" : {
      				"failed" : NumberLong(0),
      				"total" : NumberLong(436)
      			},
      			"dbStats" : {
      				"failed" : NumberLong(0),
      				"total" : NumberLong(11)
      			},
      			"delete" : {
      				"failed" : NumberLong(0),
      				"total" : NumberLong(88)
      			},
      			"drop" : {
      				"failed" : NumberLong(0),
      				"total" : NumberLong(1)
      			},
      			"explain" : {
      				"failed" : NumberLong(11),
      				"total" : NumberLong(70)
      			},
      			"getLastError" : {
      				"failed" : NumberLong(0),
      				"total" : NumberLong(597903131)
      			},
      			"getLog" : {
      				"failed" : NumberLong(0),
      				"total" : NumberLong(278)
      			},
      			"getnonce" : {
      				"failed" : NumberLong(0),
      				"total" : NumberLong(8)
      			},
      			"insert" : {
      				"failed" : NumberLong(0),
      				"total" : NumberLong(5)
      			},
      			"isMaster" : {
      				"failed" : NumberLong(0),
      				"total" : NumberLong(140111015)
      			},
      			"listCollections" : {
      				"failed" : NumberLong(0),
      				"total" : NumberLong(326)
      			},
      			"listDatabases" : {
      				"failed" : NumberLong(0),
      				"total" : NumberLong(19463)
      			},
      			"listIndexes" : {
      				"failed" : NumberLong(7),
      				"total" : NumberLong(47)
      			},
      			"ping" : {
      				"failed" : NumberLong(0),
      				"total" : NumberLong(32)
      			},
      			"profile" : {
      				"failed" : NumberLong(0),
      				"total" : NumberLong(4)
      			},
      			"replSetElect" : {
      				"failed" : NumberLong(0),
      				"total" : NumberLong(1)
      			},
      			"replSetFresh" : {
      				"failed" : NumberLong(0),
      				"total" : NumberLong(1)
      			},
      			"replSetGetRBID" : {
      				"failed" : NumberLong(0),
      				"total" : NumberLong(2)
      			},
      			"replSetGetStatus" : {
      				"failed" : NumberLong(0),
      				"total" : NumberLong(43374)
      			},
      			"replSetHeartbeat" : {
      				"failed" : NumberLong(0),
      				"total" : NumberLong(2818096)
      			},
      			"replSetStepDown" : {
      				"failed" : NumberLong(1371),
      				"total" : NumberLong(1371)
      			},
      			"replSetUpdatePosition" : {
      				"failed" : NumberLong(0),
      				"total" : NumberLong(230473730)
      			},
      			"serverStatus" : {
      				"failed" : NumberLong(0),
      				"total" : NumberLong(156857)
      			},
      			"top" : {
      				"failed" : NumberLong(0),
      				"total" : NumberLong(20)
      			},
      			"update" : {
      				"failed" : NumberLong(0),
      				"total" : NumberLong(19638)
      			},
      			"whatsmyuri" : {
      				"failed" : NumberLong(0),
      				"total" : NumberLong(278)
      			}
      		},
      		"cursor" : {
      			"timedOut" : NumberLong(45),
      			"open" : {
      				"noTimeout" : NumberLong(0),
      				"pinned" : NumberLong(0),
      				"total" : NumberLong(1)
      			}
      		},
      		"document" : {
      			"deleted" : NumberLong(2911853),
      			"inserted" : NumberLong(9580210),
      			"returned" : NumberLong(1062270874),
      			"updated" : NumberLong(588922673)
      		},
      		"getLastError" : {
      			"wtime" : {
      				"num" : 0,
      				"totalMillis" : 0
      			},
      			"wtimeouts" : NumberLong(0)
      		},
      		"operation" : {
      			"fastmod" : NumberLong(0),
      			"idhack" : NumberLong(1417635),
      			"scanAndOrder" : NumberLong(0),
      			"writeConflicts" : NumberLong(3679090)
      		},
      		"queryExecutor" : {
      			"scanned" : NumberLong(1255929382),
      			"scannedObjects" : NumberLong("70740482289")
      		},
      		"record" : {
      			"moves" : NumberLong(0)
      		},
      		"repl" : {
      			"apply" : {
      				"batches" : {
      					"num" : 1624,
      					"totalMillis" : 4760
      				},
      				"ops" : NumberLong(5345)
      			},
      			"buffer" : {
      				"count" : NumberLong(0),
      				"maxSizeBytes" : 268435456,
      				"sizeBytes" : NumberLong(0)
      			},
      			"network" : {
      				"bytes" : NumberLong(9747165),
      				"getmores" : {
      					"num" : 2428,
      					"totalMillis" : 13219
      				},
      				"ops" : NumberLong(5345),
      				"readersCreated" : NumberLong(1)
      			},
      			"preload" : {
      				"docs" : {
      					"num" : 0,
      					"totalMillis" : 0
      				},
      				"indexes" : {
      					"num" : 0,
      					"totalMillis" : 0
      				}
      			}
      		},
      		"storage" : {
      			"freelist" : {
      				"search" : {
      					"bucketExhausted" : NumberLong(0),
      					"requests" : NumberLong(0),
      					"scanned" : NumberLong(0)
      				}
      			}
      		},
      		"ttl" : {
      			"deletedDocuments" : NumberLong(0),
      			"passes" : NumberLong(46965)
      		}
      	},
      	"ok" : 1
      }
      

      Wired Tiger is supposed to use half of the available RAM but that is not what we are seeing here. Can you please help me in figuring out what is consuming the RAM. We are using this setup in our production environment and this is mission critical so please advice if I need to update to a more stable version of MongoDB with Wired Tiger or should I try reverting to MMAP

            Assignee:
            Unassigned Unassigned
            Reporter:
            ankit srivastava ankit srivastava
            Votes:
            0 Vote for this issue
            Watchers:
            9 Start watching this issue

              Created:
              Updated:
              Resolved: