Uploaded image for project: 'WiredTiger'
  1. WiredTiger
  2. WT-3105

Fix a deadlock caused by allocating eviction thread sessions dynamically

    • Type: Task
    • Resolution: Done
    • Priority: Major - P3
    • Fix Version/s: WT2.9.2, 3.2.13, 3.4.3, 3.5.2
    • Affects Version/s: None
    • Component/s: None
    • Labels: None

      We are currently seeing hangs in test/format runs within the dynamic eviction code.

      Trace:

      Thread 6 (Thread 0x7ff1a89a8700 (LWP 29351)):
      #0  0x00007ff1ac11bf4d in __lll_lock_wait () from /lib64/libpthread.so.0
      #1  0x00007ff1ac117d02 in _L_lock_791 () from /lib64/libpthread.so.0
      #2  0x00007ff1ac117c08 in pthread_mutex_lock () from /lib64/libpthread.so.0
      #3  0x00000000009793aa in __wt_spin_lock (session=0x632000002790, t=0x62c000000638) at ../src/include/mutex.i:161
      #4  0x0000000000977a1c in __wt_spin_lock_track (session=0x632000002790, t=0x62c000000638) at ../src/include/mutex.i:290
      #5  0x0000000000976f4e in __session_get_dhandle (session=0x632000002790, uri=0x1198220 <.str7> "file:WiredTigerLAS.wt", checkpoint=0x0) at ../src/session/session_dhandle.c:453
      #6  0x0000000000973d83 in __wt_session_get_btree (session=0x632000002790, uri=0x1198220 <.str7> "file:WiredTigerLAS.wt", checkpoint=0x0, cfg=0x7ff1a89a5aa0, flags=0) at ../src/session/session_dhandle.c:482
      #7  0x0000000000973389 in __wt_session_get_btree_ckpt (session=0x632000002790, uri=0x1198220 <.str7> "file:WiredTigerLAS.wt", cfg=0x7ff1a89a5aa0, flags=0) at ../src/session/session_dhandle.c:336
      #8  0x0000000000db341d in __wt_curfile_open (session=<optimized out>, uri=<optimized out>, owner=<optimized out>, cfg=<optimized out>, cursorp=<optimized out>) at ../src/cursor/cur_file.c:566
      #9  0x000000000090aeae in __session_open_cursor_int (session=0x632000002790, uri=0x1198220 <.str7> "file:WiredTigerLAS.wt", owner=0x0, other=0x0, cfg=0x7ff1a89a5aa0, cursorp=0x6320000028f0) at ../src/session/session_api.c:391
      #10 0x0000000000908b4a in __wt_open_cursor (session=0x632000002790, uri=0x1198220 <.str7> "file:WiredTigerLAS.wt", owner=0x0, cfg=0x7ff1a89a5aa0, cursorp=0x6320000028f0) at ../src/session/session_api.c:444
      #11 0x0000000000d2de33 in __wt_las_cursor_open (session=0x632000002790, cursorp=0x6320000028f0) at ../src/cache/cache_las.c:172
      #12 0x0000000000917c30 in __wt_open_internal_session (conn=0x62c000000200, name=0x1146ee0 <.str22> "eviction-server", open_metadata=false, session_flags=2049, sessionp=0x6040002bf3d0) at ../src/session/session_api.c:1970
      #13 0x00000000009f0b69 in __thread_group_resize (session=0x6320000014a0, group=0x62c000007018, new_min=1, new_max=8, flags=3) at ../src/support/thread_group.c:185
      #14 0x00000000009eeaa3 in __wt_thread_group_resize (session=0x6320000014a0, group=0x62c000007018, new_min=1, new_max=8, flags=3) at ../src/support/thread_group.c:233
      #15 0x000000000064997d in __evict_tune_workers (session=0x6320000014a0) at ../src/evict/evict_lru.c:1001
      #16 0x0000000000644218 in __evict_pass (session=0x6320000014a0) at ../src/evict/evict_lru.c:562
      #17 0x00000000006211c4 in __evict_server (session=0x6320000014a0, did_work=0x7ff1a89a7c70) at ../src/evict/evict_lru.c:319
      #18 0x000000000061fab5 in __wt_evict_thread_run (session=0x6320000014a0, thread=0x60400000a410) at ../src/evict/evict_lru.c:255
      #19 0x00000000009edde7 in __wt_thread_run (arg=0x60400000a410) at ../src/support/thread_group.c:25
      #20 0x00007ff1ac115dc5 in start_thread () from /lib64/libpthread.so.0
      #21 0x00007ff1ab2ffc9d in clone () from /lib64/libc.so.6
      
      Thread 5 (Thread 0x7ff1a6fff700 (LWP 29352)):
      #0  0x00007ff1ac11bf4d in __lll_lock_wait () from /lib64/libpthread.so.0
      #1  0x00007ff1ac117d02 in _L_lock_791 () from /lib64/libpthread.so.0
      #2  0x00007ff1ac117c08 in pthread_mutex_lock () from /lib64/libpthread.so.0
      #3  0x00000000005ccc6a in __wt_spin_lock (session=0x632000002140, t=0x62c000000638) at ../src/include/mutex.i:161
      #4  0x00000000005cbbfc in __wt_spin_lock_track (session=0x632000002140, t=0x62c000000638) at ../src/include/mutex.i:290
      #5  0x00000000005ca0cd in __sweep_remove_handles (session=0x632000002140) at ../src/conn/conn_sweep.c:236
      #6  0x00000000005c407b in __sweep_server (arg=0x632000002140) at ../src/conn/conn_sweep.c:317
      #7  0x00007ff1ac115dc5 in start_thread () from /lib64/libpthread.so.0
      #8  0x00007ff1ab2ffc9d in clone () from /lib64/libc.so.6
      
      Thread 4 (Thread 0x7ff1967e2700 (LWP 29382)):
      #0  0x00007ff1ac119a82 in pthread_cond_timedwait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
      #1  0x00000000007944da in __wt_cond_wait_signal (session=0x6320000017c8, cond=0x60c00000b200, usecs=10000, signalled=0x7ff1967e17a0) at ../src/os_posix/os_mtx_cond.c:71
      #2  0x000000000062f879 in __wt_cond_wait (session=0x6320000017c8, cond=0x60c00000b200, usecs=10000) at ../src/include/misc.i:18
      #3  0x0000000000623400 in __evict_lru_pages (session=0x6320000017c8, is_server=false) at ../src/evict/evict_lru.c:1050
      #4  0x000000000061ff35 in __wt_evict_thread_run (session=0x6320000017c8, thread=0x60400000a390) at ../src/evict/evict_lru.c:266
      #5  0x00000000009edde7 in __wt_thread_run (arg=0x60400000a390) at ../src/support/thread_group.c:25
      #6  0x00007ff1ac115dc5 in start_thread () from /lib64/libpthread.so.0
      #7  0x00007ff1ab2ffc9d in clone () from /lib64/libc.so.6
      
      Thread 3 (Thread 0x7ff1977e4700 (LWP 29383)):
      #0  0x00007ff1ac119a82 in pthread_cond_timedwait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
      #1  0x00000000007944da in __wt_cond_wait_signal (session=0x632000001af0, cond=0x60c00000b200, usecs=10000, signalled=0x7ff1977e37a0) at ../src/os_posix/os_mtx_cond.c:71
      #2  0x000000000062f879 in __wt_cond_wait (session=0x632000001af0, cond=0x60c00000b200, usecs=10000) at ../src/include/misc.i:18
      #3  0x0000000000623400 in __evict_lru_pages (session=0x632000001af0, is_server=false) at ../src/evict/evict_lru.c:1050
      #4  0x000000000061ff35 in __wt_evict_thread_run (session=0x632000001af0, thread=0x60400000a310) at ../src/evict/evict_lru.c:266
      #5  0x00000000009edde7 in __wt_thread_run (arg=0x60400000a310) at ../src/support/thread_group.c:25
      #6  0x00007ff1ac115dc5 in start_thread () from /lib64/libpthread.so.0
      #7  0x00007ff1ab2ffc9d in clone () from /lib64/libc.so.6
      
      Thread 2 (Thread 0x7ff1957e0700 (LWP 29413)):
      #0  0x00007ff1ac119a82 in pthread_cond_timedwait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
      #1  0x00000000007944da in __wt_cond_wait_signal (session=0x632000001e18, cond=0x60c00000b200, usecs=10000, signalled=0x7ff1957df7a0) at ../src/os_posix/os_mtx_cond.c:71
      #2  0x000000000062f879 in __wt_cond_wait (session=0x632000001e18, cond=0x60c00000b200, usecs=10000) at ../src/include/misc.i:18
      #3  0x0000000000623400 in __evict_lru_pages (session=0x632000001e18, is_server=false) at ../src/evict/evict_lru.c:1050
      #4  0x000000000061ff35 in __wt_evict_thread_run (session=0x632000001e18, thread=0x60400000a290) at ../src/evict/evict_lru.c:266
      #5  0x00000000009edde7 in __wt_thread_run (arg=0x60400000a290) at ../src/support/thread_group.c:25
      #6  0x00007ff1ac115dc5 in start_thread () from /lib64/libpthread.so.0
      #7  0x00007ff1ab2ffc9d in clone () from /lib64/libc.so.6
      
      Thread 1 (Thread 0x7ff1accc37c0 (LWP 29344)):
      #0  0x00007ff1ac11bf4d in __lll_lock_wait () from /lib64/libpthread.so.0
      #1  0x00007ff1ac117d02 in _L_lock_791 () from /lib64/libpthread.so.0
      #2  0x00007ff1ac117c08 in pthread_mutex_lock () from /lib64/libpthread.so.0
      #3  0x000000000061e1ba in __wt_spin_lock (session=0x632000002468, t=0x61800000f9a8) at ../src/include/mutex.i:161
      #4  0x0000000000624dd7 in __wt_spin_lock_track (session=0x632000002468, t=0x61800000f9a8) at ../src/include/mutex.i:301
      #5  0x0000000000626c47 in __wt_evict_file_exclusive_on (session=0x632000002468) at ../src/evict/evict_lru.c:781
      #6  0x0000000000ea8aca in __wt_evict_file (session=0x632000002468, syncop=WT_SYNC_DISCARD) at ../src/evict/evict_file.c:26
      #7  0x0000000000c4f86e in __wt_cache_op (session=0x632000002468, op=WT_SYNC_DISCARD) at ../src/btree/bt_sync.c:319
      #8  0x0000000000978e15 in __wt_session_lock_checkpoint (session=0x632000002468, checkpoint=0x603000a90de0 "WiredTigerCheckpoint.3") at ../src/session/session_dhandle.c:576
      #9  0x0000000000a10a7c in __checkpoint_lock_tree (session=<optimized out>, is_checkpoint=<optimized out>, need_tracking=<optimized out>, cfg=<optimized out>) at ../src/txn/txn_ckpt.c:1199
      #10 0x0000000000a1a69d in __wt_checkpoint_close (session=<optimized out>, final=<optimized out>) at ../src/txn/txn_ckpt.c:1650
      #11 0x0000000000d5ea18 in __wt_conn_btree_sync_and_close (session=0x632000002468, final=false, force=false) at ../src/conn/conn_dhandle.c:196
      #12 0x0000000000d65da9 in __wt_conn_dhandle_close_all (session=<optimized out>, uri=<optimized out>, force=<optimized out>) at ../src/conn/conn_dhandle.c:500
      #13 0x0000000000900cc2 in __wt_schema_worker (session=<optimized out>, uri=<optimized out>, file_func=<optimized out>, name_func=<optimized out>, cfg=<optimized out>, open_flags=<optimized out>) at ../src/schema/schema_worker.c:52
      #14 0x0000000000946df5 in __session_verify (wt_session=0x632000002468, uri=0x611000009f00 "file:wt", config=0x1131b40 <.str183> "strict") at ../src/session/session_api.c:1374
      #15 0x000000000050450b in wts_verify (tag=<optimized out>) at ../../../test/format/wts.c:529
      #16 0x00000000004f0104 in main (argc=<optimized out>, argv=<optimized out>) at ../../../test/format/t.c:230
      (gdb)
      

      Config

      ############################################
      #  RUN PARAMETERS
      ############################################
      abort=0
      alter=0
      auto_throttle=1
      backups=0
      bitcnt=2
      bloom=0
      bloom_bit_count=11
      bloom_hash_count=18
      bloom_oldest=0
      cache=93
      checkpoints=1
      checksum=uncompressed
      chunk_size=3
      compaction=0
      compression=snappy
      data_extend=0
      data_source=file
      delete_pct=30
      dictionary=0
      direct_io=0
      encryption=rotn-7
      evict_max=0
      file_type=variable-length column-store
      firstfit=1
      huffman_key=0
      huffman_value=0
      in_memory=0
      insert_pct=32
      internal_key_truncation=1
      internal_page_max=12
      isolation=random
      key_gap=3
      key_max=24
      key_min=24
      leaf_page_max=9
      leak_memory=0
      logging=0
      logging_archive=1
      logging_compression=none
      logging_prealloc=1
      long_running_txn=0
      lsm_worker_threads=4
      merge_max=14
      mmap=0
      ops=100000
      prefix_compression=1
      prefix_compression_min=8
      quiet=1
      repeat_data_pct=75
      reverse=0
      rows=100000
      runs=1
      rebalance=1
      salvage=1
      split_pct=71
      statistics=1
      statistics_server=0
      threads=29
      timer=20
      transaction-frequency=97
      value_max=3305
      value_min=9
      verify=1
      wiredtiger_config=
      write_pct=63
      ############################################
      

      We are running this config locally to confirm how reproducible the hang is.

            Assignee:
            sue.loverso@mongodb.com Susan LoVerso
            Reporter:
            david.hows David Hows
            Votes:
            0 Vote for this issue
            Watchers:
            5 Start watching this issue

              Created:
              Updated:
              Resolved: