Uploaded image for project: 'WiredTiger'
  1. WiredTiger
  2. WT-1844

test/format hung on column store.

    • Type: Icon: Task Task
    • Resolution: Done
    • Priority: Icon: Major - P3 Major - P3
    • WT2.7.0
    • Affects Version/s: None
    • Component/s: None
    • Labels:

      A couple of Jenkins jobs are hung running column store formats.

      The configuration file:

      ############################################
      #  RUN PARAMETERS
      ############################################
      abort=0
      auto_throttle=1
      firstfit=0
      bitcnt=4
      bloom=1
      bloom_bit_count=48
      bloom_hash_count=12
      bloom_oldest=0
      cache=40
      checkpoints=1
      checksum=uncompressed
      chunk_size=1
      compaction=0
      compression=none
      data_extend=0
      data_source=file
      delete_pct=43
      dictionary=0
      evict_max=4
      file_type=variable-length column-store
      backups=0
      huffman_key=0
      huffman_value=0
      insert_pct=16
      internal_key_truncation=1
      internal_page_max=14
      isolation=read-committed
      key_gap=18
      key_max=27
      key_min=27
      leak_memory=0
      leaf_page_max=9
      logging=0
      logging_archive=0
      logging_prealloc=0
      lsm_worker_threads=4
      merge_max=9
      mmap=0
      ops=100000
      prefix_compression=1
      prefix_compression_min=7
      repeat_data_pct=68
      reverse=0
      rows=100000
      runs=100
      split_pct=75
      statistics=0
      statistics_server=0
      threads=20
      timer=20
      value_max=910
      value_min=8
      wiredtiger_config=
      write_pct=73
      ############################################
      

      The call stacks from the process. It's not that obvious how it's stuck - i.e, I don't think this is a deadlock:

      (gdb) thread apply all where
      
      Thread 10 (Thread 0x7f333d2c7700 (LWP 17262)):
      #0  0x0000003467a0b98e in pthread_cond_timedwait@@GLIBC_2.3.2 ()
         from /lib64/libpthread.so.0
      WT-1  0x00000000004421d8 in __wt_cond_wait (session=0x2144040, cond=0x215dbc0, 
          usecs=100000) at ../src/os_posix/os_mtx_cond.c:79
      WT-2  0x000000000042b09b in __evict_server (arg=0x2144040)
          at ../src/evict/evict_lru.c:210
      WT-3  0x0000003467a079d1 in start_thread () from /lib64/libpthread.so.0
      WT-4  0x00000034676e8b6d in clone () from /lib64/libc.so.6
      
      Thread 9 (Thread 0x7f33361fc700 (LWP 17263)):
      #0  0x0000003467a0b98e in pthread_cond_timedwait@@GLIBC_2.3.2 ()
         from /lib64/libpthread.so.0
      WT-1  0x00000000004421d8 in __wt_cond_wait (session=0x2144b40, cond=0x218ee50, 
          usecs=10000000) at ../src/os_posix/os_mtx_cond.c:79
      WT-2  0x000000000042151a in __sweep_server (arg=0x2144b40)
          at ../src/conn/conn_sweep.c:171
      WT-3  0x0000003467a079d1 in start_thread () from /lib64/libpthread.so.0
      WT-4  0x00000034676e8b6d in clone () from /lib64/libc.so.6
      
      Thread 8 (Thread 0x7f33357fb700 (LWP 17284)):
      #0  0x00000000004eb9b3 in __col_insert_search_match (inshead=0x7f32e0080a70, 
          recno=64587) at ../src/include/column.i:111
      WT-1  0x00000000004eca07 in __cursor_var_prev (cbt=0x7f32c403aaf0, newpage=0)
          at ../src/btree/bt_curprev.c:341
      WT-2  0x00000000004ed388 in __wt_btcur_prev (cbt=0x7f32c403aaf0, truncating=0)
          at ../src/btree/bt_curprev.c:589
      WT-3  0x00000000004b8e8e in __curfile_prev (cursor=0x7f32c403aaf0)
          at ../src/cursor/cur_file.c:156
      WT-4  0x0000000000410ab9 in nextprev (cursor=0x7f32c403aaf0, next=0, 
          notfoundp=0x7f33357fada4) at ../../../test/format/ops.c:674
      WT-5  0x0000000000410337 in ops (arg=0x24f3400) at ../../../test/format/ops.c:475
      WT-6  0x0000003467a079d1 in start_thread () from /lib64/libpthread.so.0
      WT-7  0x00000034676e8b6d in clone () from /lib64/libc.so.6
      
      Thread 7 (Thread 0x7f333dcc8700 (LWP 17287)):
      #0  __col_insert_search_lt (inshead=0x7f32f40851b0, recno=39196)
          at ../src/include/column.i:82
      WT-1  0x00000000004ecb5f in __cursor_var_prev (cbt=0x7f32b403aaf0, newpage=0)
          at ../src/btree/bt_curprev.c:379
      WT-2  0x00000000004ed388 in __wt_btcur_prev (cbt=0x7f32b403aaf0, truncating=0)
          at ../src/btree/bt_curprev.c:589
      WT-3  0x00000000004b8e8e in __curfile_prev (cursor=0x7f32b403aaf0)
          at ../src/cursor/cur_file.c:156
      WT-4  0x0000000000410ab9 in nextprev (cursor=0x7f32b403aaf0, next=0, 
          notfoundp=0x7f333dcc7da4) at ../../../test/format/ops.c:674
      WT-5  0x0000000000410337 in ops (arg=0x24f3508) at ../../../test/format/ops.c:475
      WT-6  0x0000003467a079d1 in start_thread () from /lib64/libpthread.so.0
      ---Type <return> to continue, or q <return> to quit---
      WT-7  0x00000034676e8b6d in clone () from /lib64/libc.so.6
      
      Thread 6 (Thread 0x7f3337fff700 (LWP 17288)):
      #0  0x00000000004eca73 in __cursor_var_prev (cbt=0x7f32ac03aaf0, newpage=0)
          at ../src/btree/bt_curprev.c:346
      WT-1  0x00000000004ed388 in __wt_btcur_prev (cbt=0x7f32ac03aaf0, truncating=0)
          at ../src/btree/bt_curprev.c:589
      WT-2  0x00000000004b8e8e in __curfile_prev (cursor=0x7f32ac03aaf0)
          at ../src/cursor/cur_file.c:156
      WT-3  0x0000000000410ab9 in nextprev (cursor=0x7f32ac03aaf0, next=0, 
          notfoundp=0x7f3337ffeda4) at ../../../test/format/ops.c:674
      WT-4  0x0000000000410337 in ops (arg=0x24f3560) at ../../../test/format/ops.c:475
      WT-5  0x0000003467a079d1 in start_thread () from /lib64/libpthread.so.0
      WT-6  0x00000034676e8b6d in clone () from /lib64/libc.so.6
      
      Thread 5 (Thread 0x7f3336bfd700 (LWP 17290)):
      #0  __col_insert_search_match (inshead=0x7f3300000e60, recno=38371)
          at ../src/include/column.i:118
      WT-1  0x00000000004eca07 in __cursor_var_prev (cbt=0x7f33241896f0, newpage=0)
          at ../src/btree/bt_curprev.c:341
      WT-2  0x00000000004ed388 in __wt_btcur_prev (cbt=0x7f33241896f0, truncating=0)
          at ../src/btree/bt_curprev.c:589
      WT-3  0x00000000004b8e8e in __curfile_prev (cursor=0x7f33241896f0)
          at ../src/cursor/cur_file.c:156
      WT-4  0x0000000000410ab9 in nextprev (cursor=0x7f33241896f0, next=0, 
          notfoundp=0x7f3336bfcda4) at ../../../test/format/ops.c:674
      WT-5  0x0000000000410337 in ops (arg=0x24f3610) at ../../../test/format/ops.c:475
      WT-6  0x0000003467a079d1 in start_thread () from /lib64/libpthread.so.0
      WT-7  0x00000034676e8b6d in clone () from /lib64/libc.so.6
      
      Thread 4 (Thread 0x7f3331bf5700 (LWP 17296)):
      #0  0x00000000004e9f3e in __wt_cell_unpack_safe (cell=0x238794d, 
          unpack=0x7f3331bf4ad0, end=0x0) at ../src/include/cell.i:622
      WT-1  0x00000000004ea1c3 in __wt_cell_unpack (cell=0x238794d, 
          unpack=0x7f3331bf4ad0) at ../src/include/cell.i:710
      WT-2  0x00000000004ecb15 in __cursor_var_prev (cbt=0x7f332c086a10, newpage=0)
          at ../src/btree/bt_curprev.c:365
      WT-3  0x00000000004ed388 in __wt_btcur_prev (cbt=0x7f332c086a10, truncating=0)
          at ../src/btree/bt_curprev.c:589
      WT-4  0x00000000004b8e8e in __curfile_prev (cursor=0x7f332c086a10)
          at ../src/cursor/cur_file.c:156
      WT-5  0x0000000000410ab9 in nextprev (cursor=0x7f332c086a10, next=0, 
          notfoundp=0x7f3331bf4da4) at ../../../test/format/ops.c:674
      WT-6  0x0000000000410337 in ops (arg=0x24f3820) at ../../../test/format/ops.c:475
      WT-7  0x0000003467a079d1 in start_thread () from /lib64/libpthread.so.0
      WT-8  0x00000034676e8b6d in clone () from /lib64/libc.so.6
      
      Thread 3 (Thread 0x7f33311f4700 (LWP 17297)):
      #0  __cursor_var_prev (cbt=0x7f3320086610, newpage=1)
      ---Type <return> to continue, or q <return> to quit---
          at ../src/btree/bt_curprev.c:342
      WT-1  0x00000000004ed388 in __wt_btcur_prev (cbt=0x7f3320086610, truncating=0)
          at ../src/btree/bt_curprev.c:589
      WT-2  0x00000000004b8e8e in __curfile_prev (cursor=0x7f3320086610)
          at ../src/cursor/cur_file.c:156
      WT-3  0x0000000000410ab9 in nextprev (cursor=0x7f3320086610, next=0, 
          notfoundp=0x7f33311f3da4) at ../../../test/format/ops.c:674
      WT-4  0x0000000000410337 in ops (arg=0x24f3878) at ../../../test/format/ops.c:475
      WT-5  0x0000003467a079d1 in start_thread () from /lib64/libpthread.so.0
      WT-6  0x00000034676e8b6d in clone () from /lib64/libc.so.6
      
      Thread 2 (Thread 0x7f332a1fc700 (LWP 17301)):
      #0  0x00000000004ebddf in __col_var_search (page=0x25332e0, recno=84228, 
          start_recnop=0x7f332a1fbac8) at ../src/include/column.i:259
      WT-1  0x00000000004ec94a in __cursor_var_prev (cbt=0x7f32ec03aaf0, newpage=0)
          at ../src/btree/bt_curprev.c:335
      WT-2  0x00000000004ed388 in __wt_btcur_prev (cbt=0x7f32ec03aaf0, truncating=0)
          at ../src/btree/bt_curprev.c:589
      WT-3  0x00000000004b8e8e in __curfile_prev (cursor=0x7f32ec03aaf0)
          at ../src/cursor/cur_file.c:156
      WT-4  0x0000000000410ab9 in nextprev (cursor=0x7f32ec03aaf0, next=0, 
          notfoundp=0x7f332a1fbda4) at ../../../test/format/ops.c:674
      WT-5  0x0000000000410337 in ops (arg=0x24f39d8) at ../../../test/format/ops.c:475
      WT-6  0x0000003467a079d1 in start_thread () from /lib64/libpthread.so.0
      WT-7  0x00000034676e8b6d in clone () from /lib64/libc.so.6
      
      Thread 1 (Thread 0x7f333eed2720 (LWP 13068)):
      #0  0x00000034676accdd in nanosleep () from /lib64/libc.so.6
      WT-1  0x00000034676e1e54 in usleep () from /lib64/libc.so.6
      WT-2  0x000000000040f8b7 in wts_ops (lastrun=0) at ../../../test/format/ops.c:168
      WT-3  0x0000000000412937 in main (argc=0, argv=0x7ffff9709f40)
          at ../../../test/format/t.c:202
      (gdb) thread 2
      [Switching to thread 2 (Thread 0x7f332a1fc700 (LWP 17301))]#0  0x00000000004ebddf in __col_var_search (page=0x25332e0, recno=84228, 
          start_recnop=0x7f332a1fbac8) at ../src/include/column.i:259
      259			repeat = page->pg_var_repeats + indx;
      (gdb) p page->pg_var_repeats
      There is no member named pg_var_repeats.
      (gdb) p page
      $1 = (WT_PAGE *) 0x25332e0
      (gdb) p page->pg_var_repeats
      There is no member named pg_var_repeats.
      (gdb) p page->u.colv_var.repeats
      There is no member named colv_var.
      (gdb) p page->u.col_var.repeats
      $2 = (WT_COL_RLE *) 0x2533610
      (gdb) p *$2
      $3 = {recno = 84216, rle = 4, indx = 4}
      (gdb) p limit
      $4 = 1
      (gdb) n
      260			if (recno >= repeat->recno &&
      (gdb) 
      261			    recno < repeat->recno + repeat->rle) {
      (gdb) 
      260			if (recno >= repeat->recno &&
      (gdb) 
      262				if (start_recnop != NULL)
      (gdb) 
      263					*start_recnop = repeat->recno;
      (gdb) 
      264				return (page->pg_var_d + repeat->indx);
      (gdb) 
      289	}
      (gdb) 
      __cursor_var_prev (cbt=0x7f32ec03aaf0, newpage=0)
          at ../src/btree/bt_curprev.c:334
      334			if ((cip =
      (gdb) n
      337			cbt->slot = WT_COL_SLOT(page, cip);
      (gdb) p cip
      $5 = (WT_COL *) 0x2533350
      (gdb) n
      340			cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
      (gdb) 
      341			cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
      

      The failed builds:
      http://build.wiredtiger.com:8080/job/wiredtiger-test-format-stress/6524/console

      http://build.wiredtiger.com:8080/job/wiredtiger-test-format-recovery/306/

      It seems likely due to the recent column store changes from WT-1833

            Assignee:
            Unassigned Unassigned
            Reporter:
            alexander.gorrod@mongodb.com Alexander Gorrod
            Votes:
            0 Vote for this issue
            Watchers:
            1 Start watching this issue

              Created:
              Updated:
              Resolved: