Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-42196

Experiencing lots of long Global.timeAcquiringMicros

    XMLWordPrintable

    Details

    • Type: Bug
    • Status: Closed
    • Priority: Major - P3
    • Resolution: Incomplete
    • Affects Version/s: 3.2.11
    • Fix Version/s: None
    • Component/s: Performance, WiredTiger
    • Labels:
      None
    • Operating System:
      ALL

      Description

      We are experiencing lots of on both primary and secondary server. Server is reporting lots of slow queries both read and write takes from 1 sec to 5 sec many times.

       

      Infra:

      We have a 3 node replica set, reads are secondary preferred and goes to secondary node.

      CPU: 8 Core

      Ram: 64 GB

      Hard disk: 1 TB

      database size: 26 GB

       

      Stats:

      "version" : "3.2.11",
       "process" : "mongod",
       "pid" : NumberLong(14048),
       "uptime" : 721859,
       "uptimeMillis" : NumberLong(721859388),
       "uptimeEstimate" : 711873,
       "localTime" : ISODate("2019-07-12T08:46:12.565Z"),
       "asserts" : {
       "regular" : 0,
       "warning" : 0,
       "msg" : 0,
       "user" : 1765,
       "rollovers" : 0
       },
       "connections" : {
       "current" : 329,
       "available" : 51671,
       "totalCreated" : NumberLong(532759)
       },
       "extra_info" : {
       "note" : "fields vary by platform",
       "heap_usage_bytes" : -722884464,
       "page_faults" : 0
       },
       "globalLock" : {
       "totalTime" : NumberLong("721859386000"),
       "currentQueue" : {
       "total" : 0,
       "readers" : 0,
       "writers" : 0
       },
       "activeClients" : {
       "total" : 359,
       "readers" : 0,
       "writers" : 0
       }
       },
       "locks" : {
       "Global" : {
       "acquireCount" : {
       "r" : NumberLong("3615532147"),
       "w" : NumberLong(1117697411),
       "R" : NumberLong(133474904),
       "W" : NumberLong(133474912)
       },
       "acquireWaitCount" : {
       "r" : NumberLong(364301766),
       "w" : NumberLong(170745),
       "R" : NumberLong(170159),
       "W" : NumberLong(64636930)
       },
       "timeAcquiringMicros" : {
       "r" : NumberLong("1378950998368"),
       "w" : NumberLong(15866347),
       "R" : NumberLong(294570579),
       "W" : NumberLong("24766601500")
       }
       },
       "Database" : {
       "acquireCount" : {
       "r" : NumberLong(1540626945),
       "w" : NumberLong(717272259),
       "R" : NumberLong(7),
       "W" : NumberLong(400425191)
       },
       "acquireWaitCount" : {
       "r" : NumberLong(20),
       "w" : NumberLong(28),
       "W" : NumberLong(63)
       },
       "timeAcquiringMicros" : {
       "r" : NumberLong(13860),
       "w" : NumberLong(172757),
       "W" : NumberLong(34112)
       }
       },
       "Collection" : {
       "acquireCount" : {
       "r" : NumberLong(1544469564),
       "w" : NumberLong(716911434)
       }
       },
       "Metadata" : {
       "acquireCount" : {
       "w" : NumberLong(181305883),
       "W" : NumberLong(211017228)
       },
       "acquireWaitCount" : {
       "W" : NumberLong(740)
       },
       "timeAcquiringMicros" : {
       "W" : NumberLong(3339408)
       }
       },
       "oplog" : {
       "acquireCount" : {
       "r" : NumberLong(945928),
       "w" : NumberLong(360827)
       }
       }
       },
       "network" : {
       "bytesIn" : NumberLong("109939674088"),
       "bytesOut" : NumberLong("1258717688195"),
       "numRequests" : NumberLong(791610131)
       },
       "opcounters" : {
       "insert" : 0,
       "query" : 750622665,
       "update" : 0,
       "delete" : 0,
       "getmore" : 39725,
       "command" : 40947665
       },
       "opcountersRepl" : {
       "insert" : 287955553,
       "query" : 0,
       "update" : 312318966,
       "delete" : 116431872,
       "getmore" : 0,
       "command" : 0
       },
       
       "storageEngine" : {
       "name" : "wiredTiger",
       "supportsCommittedReads" : true,
       "persistent" : true
       },
       "tcmalloc" : {
       "generic" : {
       "current_allocated_bytes" : NumberLong("25046843408"),
       "heap_size" : NumberLong("37098684416")
       },
       "tcmalloc" : {
       "pageheap_free_bytes" : NumberLong("6705545216"),
       "pageheap_unmapped_bytes" : NumberLong("3345612800"),
       "max_total_thread_cache_bytes" : NumberLong(1073741824),
       "current_total_thread_cache_bytes" : 897080584,
       "total_free_bytes" : NumberLong(2000682992),
       "central_cache_free_bytes" : NumberLong(1074572792),
       "transfer_cache_free_bytes" : 29029616,
       "thread_cache_free_bytes" : 897080584,
       "aggressive_memory_decommit" : 0,
       "formattedString" : "------------------------------------------------\nMALLOC: 25046843408 (23886.5 MiB) Bytes in use by application\nMALLOC: + 6705545216 ( 6394.9 MiB) Bytes in page heap freelist\nMALLOC: + 1074572792 ( 1024.8 MiB) Bytes in central cache freelist\nMALLOC: + 29029616 ( 27.7 MiB) Bytes in transfer cache freelist\nMALLOC: + 897080584 ( 855.5 MiB) Bytes in thread cache freelists\nMALLOC: + 72036512 ( 68.7 MiB) Bytes in malloc metadata\nMALLOC: ------------\nMALLOC: = 33825108128 (32258.1 MiB) Actual memory used (physical + swap)\nMALLOC: + 3345612800 ( 3190.6 MiB) Bytes released to OS (aka unmapped)\nMALLOC: ------------\nMALLOC: = 37170720928 (35448.8 MiB) Virtual address space used\nMALLOC:\nMALLOC: 465577 Spans in use\nMALLOC: 392 Thread heaps in use\nMALLOC: 8192 Tcmalloc page size\n------------------------------------------------\nCall ReleaseFreeMemory() to release freelist memory to the OS (via madvise()).\nBytes released to the OS take up virtual address space but no physical memory.\n"
       }
       },
       "wiredTiger" : {
       "uri" : "statistics:",
       "LSM" : {
       "application work units currently queued" : 0,
       "merge work units currently queued" : 0,
       "rows merged in an LSM tree" : 0,
       "sleep for LSM checkpoint throttle" : 0,
       "sleep for LSM merge throttle" : 0,
       "switch work units currently queued" : 0,
       "tree maintenance operations discarded" : 0,
       "tree maintenance operations executed" : 0,
       "tree maintenance operations scheduled" : 0,
       "tree queue hit maximum" : 0
       },
       "async" : {
       "current work queue length" : 0,
       "maximum work queue length" : 0,
       "number of allocation state races" : 0,
       "number of flush calls" : 0,
       "number of operation slots viewed for allocation" : 0,
       "number of times operation allocation failed" : 0,
       "number of times worker found no work" : 0,
       "total allocations" : 0,
       "total compact calls" : 0,
       "total insert calls" : 0,
       "total remove calls" : 0,
       "total search calls" : 0,
       "total update calls" : 0
       },
       "block-manager" : {
       "blocks pre-loaded" : 7346,
       "blocks read" : 35012853,
       "blocks written" : 148897645,
       "bytes read" : NumberLong("3022997778432"),
       "bytes written" : NumberLong("11316445532160"),
       "bytes written for checkpoint" : NumberLong("2225311752192"),
       "mapped blocks read" : 0,
       "mapped bytes read" : 0
       },
       "cache" : {
       "application threads page read from disk to cache count" : 34691777,
       "application threads page read from disk to cache time (usecs)" : 9183239214,
       "application threads page write from cache to disk count" : 99917,
       "application threads page write from cache to disk time (usecs)" : 22517702,
       "bytes belonging to page images in the cache" : 20362013458,
       "bytes currently in the cache" : 25407877408,
       "bytes not belonging to page images in the cache" : 5045863950,
       "bytes read into cache" : NumberLong("8376738574241"),
       "bytes written from cache" : NumberLong("34970531548366"),
       "checkpoint blocked page eviction" : 7702,
       "eviction calls to get a page" : 142431764,
       "eviction calls to get a page found queue empty" : 1121248,
       "eviction calls to get a page found queue empty after locking" : 4089642,
       "eviction currently operating in aggressive mode" : 0,
       "eviction empty score" : 0,
       "eviction server candidate queue empty when topping up" : 486463,
       "eviction server candidate queue not empty when topping up" : 4003797,
       "eviction server evicting pages" : 30723019,
       "eviction server slept, because we did not make progress with eviction" : 39224635,
       "eviction server unable to reach eviction goal" : 0,
       "eviction state" : 16,
       "eviction walks abandoned" : 7301923,
       "eviction worker thread evicting pages" : 66614470,
       "failed eviction of pages that exceeded the in-memory maximum" : 498670,
       "files with active eviction walks" : 0,
       "files with new eviction walks started" : 72677384,
       "hazard pointer blocked page eviction" : 200994,
       "hazard pointer check calls" : 148022103,
       "hazard pointer check entries walked" : 1321770357,
       "hazard pointer maximum array length" : 2,
       "in-memory page passed criteria to be split" : 4906773,
       "in-memory page splits" : 2449073,
       "internal pages evicted" : 921244,
       "internal pages split during eviction" : 7134,
       "leaf pages split during eviction" : 2890244,
       "lookaside table insert calls" : 0,
       "lookaside table remove calls" : 0,
       "maximum bytes configured" : 32212254720,
       "maximum page size at eviction" : 8391330,
       "modified pages evicted" : 47311362,
       "modified pages evicted by application threads" : 31584,
       "overflow pages read into cache" : 0,
       "overflow values cached in memory" : 0,
       "page split during eviction deepened the tree" : 1,
       "page written requiring lookaside records" : 0,
       "pages currently held in the cache" : 195586,
       "pages evicted because they exceeded the in-memory maximum" : 2445104,
       "pages evicted because they had chains of deleted items" : 58723,
       "pages evicted by application threads" : 52067,
       "pages queued for eviction" : 431137876,
       "pages queued for urgent eviction" : 691632,
       "pages queued for urgent eviction during walk" : 691632,
       "pages read into cache" : 34687640,
       "pages read into cache requiring lookaside entries" : 0,
       "pages requested from the cache" : 43184978458,
       "pages seen by eviction walk" : 14566839956,
       "pages selected for eviction unable to be evicted" : 3691349,
       "pages walked for eviction" : 23220496296,
       "pages written from cache" : 148241530,
       "pages written requiring in-memory restoration" : 7696,
       "percentage overhead" : 8,
       "tracked bytes belonging to internal pages in the cache" : 25218407,
       "tracked bytes belonging to leaf pages in the cache" : 25382659001,
       "tracked dirty bytes in the cache" : 1451931192,
       "tracked dirty pages in the cache" : 2524,
       "unmodified pages evicted" : 94543730
       },
       "connection" : {
       "auto adjusting condition resets" : 1908818,
       "auto adjusting condition wait calls" : 7564662,
       "files currently open" : 639,
       "memory allocations" : 14340521518,
       "memory frees" : 14295292627,
       "memory re-allocations" : 403873247,
       "pthread mutex condition wait calls" : 55298247,
       "pthread mutex shared lock read-lock calls" : 3505177583,
       "pthread mutex shared lock write-lock calls" : 833276234,
       "total fsync I/Os" : 115144650,
       "total read I/Os" : 35048322,
       "total write I/Os" : 299666640
       },
       "cursor" : {
       "cursor create calls" : 5254683,
       "cursor insert calls" : 3500471317,
       "cursor next calls" : 79475107535,
       "cursor prev calls" : 183577736,
       "cursor remove calls" : 2232044491,
       "cursor reset calls" : 9935878576,
       "cursor restarted searches" : 45900698,
       "cursor search calls" : 77578365099,
       "cursor search near calls" : 1913983323,
       "cursor update calls" : 0,
       "truncate calls" : 181458163
       },
       "data-handle" : {
       "connection data handles currently active" : 672,
       "connection sweep candidate became referenced" : 0,
       "connection sweep dhandles closed" : 294,
       "connection sweep dhandles removed from hash list" : 6593682,
       "connection sweep time-of-death sets" : 6853952,
       "connection sweeps" : 72181,
       "session dhandles swept" : 3482,
       "session sweep attempts" : 35866
       },
       "lock" : {
       "checkpoint lock acquisitions" : 11230,
       "checkpoint lock application thread wait time (usecs)" : 0,
       "checkpoint lock internal thread wait time (usecs)" : 1834,
       "handle-list lock acquisitions" : 6690708,
       "handle-list lock application thread wait time (usecs)" : 115673,
       "handle-list lock internal thread wait time (usecs)" : 189543,
       "metadata lock acquisitions" : 11227,
       "metadata lock application thread wait time (usecs)" : 0,
       "metadata lock internal thread wait time (usecs)" : 1752,
       "schema lock acquisitions" : 12170,
       "schema lock application thread wait time (usecs)" : 11,
       "schema lock internal thread wait time (usecs)" : 405,
       "table lock acquisitions" : 161550,
       "table lock application thread time waiting for the table lock (usecs)" : 4329709,
       "table lock internal thread time waiting for the table lock (usecs)" : 219
       },
      

       

      disk util :

      avg-cpu: %user %nice %system %iowait %steal %idle
       10.77 0.00 1.37 1.91 0.19 85.77
       
      Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util
      xvda 0.00 287.21 0.02 603.80 0.45 48016.93 79.52 0.41 0.68 0.46 27.67
       
      avg-cpu: %user %nice %system %iowait %steal %idle
       8.33 0.00 1.31 2.23 0.26 87.87
       
      Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util
      xvda 0.00 338.50 0.00 646.00 0.00 36184.00 56.01 0.48 0.74 0.56 36.20
       
      avg-cpu: %user %nice %system %iowait %steal %idle
       12.78 0.00 2.54 1.69 0.26 82.72
       
      Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util
      xvda 0.00 379.50 0.00 681.50 0.00 39676.00 58.22 0.42 0.62 0.43 29.60
       
      avg-cpu: %user %nice %system %iowait %steal %idle
       9.36 0.00 1.37 1.70 0.20 87.37
       
      Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util
      xvda 0.00 367.50 0.00 666.00 0.00 36448.00 54.73 0.46 0.68 0.48 32.20
       
       

      mongostat from one of secondary:

       

      insert query update delete getmore command % dirty % used flushes vsize res qr|qw ar|aw netIn netOut conn set repl time
       *474 516 *170 *0 0 4|0 4.9 79.2 0 35.7G 30.7G 0|0 0|0 68.3k 1.08m 358 rs0 SEC 2019-07-11T12:59:18Z
       *450 515 *172 *0 0 7|0 5.3 79.1 0 35.7G 30.7G 0|0 5|0 66.2k 874k 358 rs0 SEC 2019-07-11T12:59:19Z
       *423 483 *159 *0 0 9|0 4.6 78.9 0 35.7G 30.7G 0|0 5|0 62.5k 1.26m 358 rs0 SEC 2019-07-11T12:59:20Z
       *464 567 *169 *0 0 8|0 4.9 79.1 1 35.7G 30.7G 1|0 0|0 73.7k 1.11m 358 rs0 SEC 2019-07-11T12:59:21Z
       *458 481 *174 *0 0 5|0 4.5 79.0 0 35.7G 30.7G 0|0 1|0 63.2k 1.45m 358 rs0 SEC 2019-07-11T12:59:22Z
      [root@mongodb03-load ec2-user]# mongostat -h localhost:27017 --authenticationDatabase admin -u superuser -p Netgear123 --rowcount 5 1
      insert query update delete getmore command % dirty % used flushes vsize res qr|qw ar|aw netIn netOut conn set repl time
       *494 516 *168 *0 0 7|0 2.6 79.9 0 35.7G 30.7G 0|0 0|0 67.7k 1.27m 358 rs0 SEC 2019-07-11T12:59:27Z
       *494 503 *170 *0 0 3|0 3.0 79.8 0 35.7G 30.7G 0|0 0|1 63.9k 1.48m 358 rs0 SEC 2019-07-11T12:59:28Z
       *494 553 *178 *0 0 14|0 3.4 80.0 0 35.7G 30.7G 0|0 0|0 71.7k 2.14m 358 rs0 SEC 2019-07-11T12:59:29Z
       *446 524 *170 *0 0 4|0 3.8 80.1 0 35.7G 30.7G 0|0 0|1 68.6k 1.27m 358 rs0 SEC 2019-07-11T12:59:30Z
       *480 548 *166 *0 0 10|0 4.1 80.0 0 35.7G 30.7G 0|0 0|1 69.9k 1.51m 358 rs0 SEC 2019-07-11T12:59:31Z
      [root@mongodb03-load ec2-user]# mongostat -h localhost:27017 --authenticationDatabase admin -u superuser -p Netgear123 --rowcount 5 1
      insert query update delete getmore command % dirty % used flushes vsize res qr|qw ar|aw netIn netOut conn set repl time
       *524 576 *184 *0 0 7|0 5.1 79.6 0 35.7G 30.7G 0|0 0|0 75.4k 1.56m 358 rs0 SEC 2019-07-11T12:59:35Z
       *466 574 *175 *0 0 10|0 4.5 79.2 0 35.7G 30.7G 0|0 0|1 75.7k 1.41m 358 rs0 SEC 2019-07-11T12:59:36Z
       *392 519 *170 *0 0 9|0 4.9 79.4 0 35.7G 30.7G 0|0 1|0 67.5k 1.48m 358 rs0 SEC 2019-07-11T12:59:37Z
       *430 557 *171 *0 0 3|0 4.7 79.5 0 35.7G 30.7G 0|0 0|1 71.8k 1.49m 358 rs0 SEC 2019-07-11T12:59:38Z
       *482 434 *158 *0 0 13|0 5.1 79.5 0 35.7G 30.7G 0|0 0|0 57.3k 1.19m 358 rs0 SEC 2019-07-11T12:59:39Z
      

       

      mongostat from primary:

      nsert query update delete getmore command % dirty % used flushes vsize res qr|qw ar|aw netIn netOut conn set repl time
       461 *0 697 *0 501 477|0 4.2 70.4 0 43.9G 29.5G 0|0 0|0 1.36m 85.0m 426 rs0 PRI 2019-07-11T13:00:22Z
       411 *0 813 *0 569 588|0 4.6 70.4 0 43.9G 29.5G 0|0 0|2 1.55m 95.8m 426 rs0 PRI 2019-07-11T13:00:23Z
       462 *0 732 *0 574 573|0 5.0 70.7 0 43.9G 29.5G 0|0 0|2 1.52m 93.2m 426 rs0 PRI 2019-07-11T13:00:24Z
       494 *0 767 *0 554 563|0 4.8 70.6 0 43.9G 29.5G 0|0 0|0 1.49m 91.8m 426 rs0 PRI 2019-07-11T13:00:25Z
       494 *0 735 *0 549 548|0 5.0 70.5 0 43.9G 29.5G 0|0 0|1 1.49m 91.0m 426 rs0 PRI 2019-07-11T13:00:26Z
      [root@mongodb01-load ec2-user]# mongostat -h localhost:27017 --authenticationDatabase admin -u superuser -p Netgear123 --rowcount 5 1
      insert query update delete getmore command % dirty % used flushes vsize res qr|qw ar|aw netIn netOut conn set repl time
       492 *0 537 *0 508 532|0 4.4 70.1 0 43.9G 29.5G 0|0 0|1 1.26m 85.5m 426 rs0 PRI 2019-07-11T13:00:33Z
       494 *0 494 *0 490 476|0 4.8 70.0 0 43.9G 29.5G 0|1 0|2 1.17m 76.6m 426 rs0 PRI 2019-07-11T13:00:34Z
       494 *0 557 *0 557 587|0 5.1 70.2 0 43.9G 29.5G 0|0 0|0 1.50m 92.1m 426 rs0 PRI 2019-07-11T13:00:35Z
       494 *0 633 *0 546 535|0 4.9 70.3 0 43.9G 29.5G 0|0 0|1 1.71m 82.7m 426 rs0 PRI 2019-07-11T13:00:36Z
       494 *0 669 *0 590 569|0 5.0 70.1 0 43.9G 29.4G 0|0 0|1 1.78m 92.6m 426 rs0 PRI 2019-07-11T13:00:37Z
      [root@mongodb01-load ec2-user]# mongostat -h localhost:27017 --authenticationDatabase admin -u superuser -p Netgear123 --rowcount 5 1
      insert query update delete getmore command % dirty % used flushes vsize res qr|qw ar|aw netIn netOut conn set repl time
       452 *0 908 *0 634 618|0 4.8 69.8 0 43.9G 29.4G 0|0 0|3 2.50m 94.3m 426 rs0 PRI 2019-07-11T13:00:42Z
       490 *0 962 *0 663 573|0 4.7 69.9 0 43.9G 29.4G 0|0 0|1 2.57m 106m 426 rs0 PRI 2019-07-11T13:00:43Z
       415 *0 893 *0 636 580|0 5.2 69.9 0 43.9G 29.4G 0|0 2|7 2.52m 104m 426 rs0 PRI 2019-07-11T13:00:44Z
       458 *0 969 *0 646 620|0 4.7 69.8 0 43.9G 29.4G 0|2 0|1 2.61m 98.5m 426 rs0 PRI 2019-07-11T13:00:45Z
       470 *0 931 *0 630 578|0 5.2 69.8 0 43.9G 29.4G 0|0 0|1 2.36m 97.1m 426 rs0 PRI 2019-07-11T13:00:46Z
       
       

      Issue is seen for queries which are executing frequently and which are operating on collection having records more than 600000 records, sometime issue is seen with small no of records as well like 50,000.  Issue is observed more with aggregate and count queries. - may be because they are executing more and they are mostly under this profile capped collection.

       

      Please suggest how to reduce the lock acquirewait count and why there is lock at global and database level when all operations are insesrt/update/delete/find/aggregate which are happening at document level.

       

       

       

        Attachments

          Activity

            People

            • Votes:
              2 Vote for this issue
              Watchers:
              5 Start watching this issue

              Dates

              • Created:
                Updated:
                Resolved: