Uploaded image for project: 'WiredTiger'
  1. WiredTiger
  2. WT-2118

WiredTiger dump hung with cache full

    • Type: Bug
    • Resolution: Cannot Reproduce
    • Priority: Major - P3
    • None
    • Affects Version/s: None
    • Component/s: None
    • None

      A test/format configuration caused the `wt dump` command to hang because the cache is full.

      This can be reproduced using a copy of the database locally. I'll attach the database.

      The symptoms:

      $ pmp `pidof t`
            2
            1 waitpid,do_system,system,wts_dump,wts_salvage,main
      
      $ ps ax | grep dump
      20156 ?        SN     0:00 sh s_dumpcmp -h RUNDIR -n file:wt
      20157 ?        SNl    6:16 /home/jenkins/jenkins/workspace/wiredtiger-test-format-stress/build_posix/.libs/lt-wt -h RUNDIR dump file:wt
      

      Then some GDB output from the hung process:

      (gdb) where
      #0  0x0000003467a0b98e in pthread_cond_timedwait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
      #1  0x00007ffc6a4a57ca in __wt_cond_wait_signal (session=0x173dfb0, cond=0x173bd40, usecs=100000,
          signalled=0x7fff4b1128fc) at ../src/os_posix/os_mtx_cond.c:82
      #2  0x00007ffc6a47ae62 in __wt_cond_wait (session=0x173dfb0, cond=0x173bd40, usecs=100000)
          at ../src/include/misc.i:18
      #3  0x00007ffc6a47f171 in __wt_cache_eviction_worker (session=0x173dfb0, busy=1, pct_full=133)
          at ../src/evict/evict_lru.c:1533
      #4  0x00007ffc6a4173b5 in __wt_cache_eviction_check (session=0x173dfb0, busy=1, didworkp=0x0)
          at ../src/include/cache.i:237
      #5  0x00007ffc6a419a90 in __wt_page_in_func (session=0x173dfb0, ref=0x173bdb0, flags=64,
          file=0x7ffc6a4f5c83 "../src/btree/bt_walk.c", line=301) at ../src/btree/bt_read.c:459
      #6  0x00007ffc6a4316f8 in __wt_page_swap_func (session=0x173dfb0, held=0x17e2c90, want=0x173bdb0,
          flags=64, file=0x7ffc6a4f5c83 "../src/btree/bt_walk.c", line=301) at ../src/include/btree.i:1234
      #7  0x00007ffc6a431dd8 in __wt_tree_walk (session=0x173dfb0, refp=0x17e3198, walkcntp=0x0, flags=64)
          at ../src/btree/bt_walk.c:301
      #8  0x00007ffc6a3fd34b in __wt_btcur_next (cbt=0x17e3050, truncating=0)
          at ../src/btree/bt_curnext.c:532
      #9  0x00007ffc6a45d6ee in __curfile_next (cursor=0x17e3050) at ../src/cursor/cur_file.c:113
      #10 0x00007ffc6a45c769 in __curdump_next (cursor=0x17e3310) at ../src/cursor/cur_dump.c:294
      #11 0x00000000004039db in dump_record (cursor=0x17e3310, reverse=0, json=0)
          at ../src/utilities/util_dump.c:588
      #12 0x00000000004026d6 in util_dump (session=0x173dfb0, argc=1, argv=0x7fff4b112f28)
          at ../src/utilities/util_dump.c:121
      #13 0x0000000000407c44 in main (argc=2, argv=0x7fff4b112f20) at ../src/utilities/util_main.c:226
      (gdb) f 3
      #3  0x00007ffc6a47f171 in __wt_cache_eviction_worker (session=0x173dfb0, busy=1, pct_full=133)
          at ../src/evict/evict_lru.c:1533
      1533			WT_RET(
      (gdb) p conn
      $1 = (WT_CONNECTION_IMPL *) 0x172b070
      (gdb) p *conn->cache
      $2 = {bytes_inmem = 3896897, pages_inmem = 5, bytes_internal = 3894395, bytes_overflow = 0,
        bytes_evict = 0, pages_evict = 0, bytes_dirty = 2502, pages_dirty = 1, bytes_read = 379728,
        app_evicts = 0, app_waits = 934806, evict_max_page_size = 0, read_gen = 916653,
        read_gen_oldest = 916752, evict_cond = 0x173bcd0, evict_lock = {lock = {__data = {__lock = 0,
              __count = 0, __owner = 0, __nusers = 0, __kind = 0, __spins = 0, __list = {__prev = 0x0,
                __next = 0x0}}, __size = '\000' <repeats 39 times>, __align = 0},
          name = 0x7ffc6a4fa6b7 "cache eviction", initialized = 1 '\001'}, evict_walk_lock = {lock = {
            __data = {__lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = 0, __spins = 0,
              __list = {__prev = 0x0, __next = 0x0}}, __size = '\000' <repeats 39 times>, __align = 0},
          name = 0x7ffc6a4fa6c6 "cache walk", initialized = 1 '\001'}, evict_waiter_cond = 0x173bd40,
        eviction_trigger = 95, eviction_target = 80, eviction_dirty_target = 80,
        eviction_dirty_trigger = 95, overhead_pct = 8, evict_queue = 0x1759b70, evict_current = 0x0,
        evict_candidates = 1, evict_entries = 1, evict_max = 2, evict_slots = 400, evict_file_next = 0x0,
        cp_pass_pressure = 0, cp_quota = 0, cp_reserved = 0, cp_session = 0x0, cp_skip_count = 0,
        cp_tid = 0, cp_saved_app_evicts = 0, cp_saved_app_waits = 0, cp_saved_read = 0, state = 3,
        flags = 24}
      (gdb) p conn->cache_size
      $3 = 3145728
      (gdb) p /x conn->cache->flags
      $4 = 0x18
      

      The test/format configuration file:

      ############################################
      #  RUN PARAMETERS
      ############################################
      abort=0
      auto_throttle=1
      firstfit=0
      bitcnt=8
      bloom=1
      bloom_bit_count=59
      bloom_hash_count=17
      bloom_oldest=0
      cache=3
      checkpoints=1
      checksum=uncompressed
      chunk_size=6
      compaction=0
      compression=bzip-raw
      data_extend=0
      data_source=file
      delete_pct=0
      dictionary=0
      encryption=rotn-7
      evict_max=1
      file_type=row-store
      backups=0
      huffman_key=0
      huffman_value=0
      insert_pct=21
      internal_key_truncation=1
      internal_page_max=16
      isolation=read-uncommitted
      key_gap=9
      key_max=65
      key_min=20
      leak_memory=0
      leaf_page_max=13
      logging=0
      logging_compression=snappy
      logging_archive=1
      logging_prealloc=0
      long_running_txn=0
      lsm_worker_threads=3
      merge_max=6
      mmap=1
      ops=100000
      prefix_compression=1
      prefix_compression_min=3
      repeat_data_pct=6
      reverse=0
      rows=100000
      runs=100
      salvage=1
      split_pct=46
      statistics=0
      statistics_server=0
      threads=23
      timer=20
      value_max=2053
      value_min=10
      verify=1
      wiredtiger_config=
      write_pct=21
      ############################################
      

        1. dump_stuck.tgz
          6.27 MB
          Alexander Gorrod

            Assignee:
            backlog-server-execution [DO NOT USE] Backlog - Storage Execution Team
            Reporter:
            alexander.gorrod@mongodb.com Alexander Gorrod
            Votes:
            0 Vote for this issue
            Watchers:
            4 Start watching this issue

              Created:
              Updated:
              Resolved: