Uploaded image for project: 'WiredTiger'
  1. WiredTiger
  2. WT-687

test/format failure: lsm_merge.c, 45: j == nchunks

    • Type: Task
    • Resolution: Done
    • Fix Version/s: WT1.6.5
    • Affects Version/s: None
    • Component/s: None
    • Labels:

       124: lsm, row-store                                                          
      file:wt-000084.lsm: ../src/lsm/lsm_merge.c, 45: j == nchunks              
      file:wt-000084.lsm: aborting WiredTiger library
      file:wt-000084.lsm: process ID 1146: waiting for debugger...
      

      with a pretty simple, single-threaded CONFIG:

      ############################################
      #  RUN PARAMETERS
      ############################################
      # bitcnt not applicable to this run
      cache=80
      compression=raw
      data_extend=0
      data_source=lsm
      delete_pct=19
      dictionary=0
      file_type=row-store
      hot_backups=1
      huffman_key=0
      huffman_value=1
      insert_pct=41
      internal_key_truncation=0
      internal_page_max=14
      key_gap=14
      key_max=83
      key_min=25
      leaf_page_max=15
      ops=50000
      prefix=1
      repeat_data_pct=29
      reverse=0
      rows=1000
      runs=0
      split_pct=83
      statistics=0
      threads=1
      value_max=3648
      value_min=13
      # wiredtiger_config not applicable to this run
      write_pct=5
      ############################################
      

      This isn't a test/format core, it's a "wt dump" core, where wt is re-opening the database to dump the table and compare it with the Berkeley DB copy.

      Stacks:

      (gdb) thread apply all where 12
      
      Thread 4 (Thread 0x7f9eeb162700 (LWP 1162)):
      #0  0x000000378920b7bb in pthread_cond_timedwait@@GLIBC_2.3.2 ()
         from /lib64/libpthread.so.0
      #1  0x00007f9eebc136af in __wt_cond_wait (session=0x19dff70, cond=0x19e69c0, 
          usecs=100000) at ../src/os_posix/os_mtx.c:75
      #2  0x00007f9eebbb0f3e in __wt_cache_evict_server (arg=0x19dff70)
          at ../src/btree/bt_evict.c:167
      #3  0x0000003789207851 in start_thread () from /lib64/libpthread.so.0
      #4  0x0000003788ae767d in clone () from /lib64/libc.so.6
      
      Thread 3 (Thread 0x7f9ee9d60700 (LWP 1164)):
      #0  0x0000003788ae0263 in select () from /lib64/libc.so.6
      #1  0x00007f9eebc15099 in __wt_sleep (seconds=100, micro_seconds=0)
          at ../src/os_posix/os_sleep.c:22
      #2  0x00007f9eebc2f51c in __wt_attach (session=0x19e07b0)
          at ../src/support/global.c:97
      #3  0x00007f9eebc11aec in __wt_abort (session=0x19e07b0)
          at ../src/os_posix/os_abort.c:21
      #4  0x00007f9eebc2efd6 in __wt_assert (session=0x19e07b0, error=0, 
          file_name=0x7f9eebc4033e "../src/lsm/lsm_merge.c", line_number=45, 
          fmt=0x7f9eebc4033b "%s") at ../src/support/err.c:408
      #5  0x00007f9eebc0725c in __wt_lsm_merge_update_tree (session=0x19e07b0, 
          lsm_tree=0x1a105a0, start_chunk=5, nchunks=15, chunk=0x7f9edc0008c0)
          at ../src/lsm/lsm_merge.c:45
      #6  0x00007f9eebc081ce in __wt_lsm_merge (session=0x19e07b0, 
          lsm_tree=0x1a105a0, id=0, aggressive=0) at ../src/lsm/lsm_merge.c:318
      #7  0x00007f9eebc0cd8a in __wt_lsm_merge_worker (vargs=0x1a12fd0)
          at ../src/lsm/lsm_worker.c:90
      #8  0x0000003789207851 in start_thread () from /lib64/libpthread.so.0
      #9  0x0000003788ae767d in clone () from /lib64/libc.so.6
      
      Thread 2 (Thread 0x7f9ee935f700 (LWP 1165)):
      #0  0x000000378920a836 in pthread_rwlock_rdlock () from /lib64/libpthread.so.0
      #1  0x00007f9eebc13aef in __wt_readlock (session=0x19e05a0, rwlock=0x1a10260)
          at ../src/os_posix/os_mtx.c:202
      #2  0x00007f9eebc0cb82 in __lsm_copy_chunks (session=0x19e05a0, 
          lsm_tree=0x1a105a0, cookie=0x7f9ee935ee30, old_chunks=0)
          at ../src/lsm/lsm_worker.c:30
      #3  0x00007f9eebc0d29e in __wt_lsm_checkpoint_worker (arg=0x1a105a0)
          at ../src/lsm/lsm_worker.c:241
      #4  0x0000003789207851 in start_thread () from /lib64/libpthread.so.0
      #5  0x0000003788ae767d in clone () from /lib64/libc.so.6
      
      Thread 1 (Thread 0x7f9eeb96a720 (LWP 1146)):
      #0  0x00000037892080ad in pthread_join () from /lib64/libpthread.so.0
      #1  0x00007f9eebc15164 in __wt_thread_join (session=0x19dfd60, 
          tid=140320504678144) at ../src/os_posix/os_thread.c:35
      #2  0x00007f9eebc0a368 in __lsm_tree_close (session=0x19dfd60, 
          lsm_tree=0x1a105a0) at ../src/lsm/lsm_tree.c:87
      #3  0x00007f9eebc0a7a0 in __wt_lsm_tree_close_all (session=0x19dfd60)
          at ../src/lsm/lsm_tree.c:166
      #4  0x00007f9eebbe9847 in __wt_connection_close (conn=0x19df010)
          at ../src/conn/conn_open.c:83
      #5  0x00007f9eebbe2dd0 in __conn_close (wt_conn=0x19df010, config=0x0)
          at ../src/conn/conn_api.c:518
      #6  0x00000000004051ee in main (argc=2, argv=0x7fff5050d9c0)
          at ../src/utilities/util_main.c:184
      

      It looks like this code:

              /* Copy entries one at a time, so we can reuse gaps in the list. */
              for (i = j = 0; j < nchunks && i < lsm_tree->nold_chunks; i++) {
                      if (lsm_tree->old_chunks[i] == NULL) {
                              lsm_tree->old_chunks[i] =
                                  lsm_tree->chunk[start_chunk + j];
                              ++j;
                              --lsm_tree->old_avail;
                      }
              }
      
              WT_ASSERT(session, j == nchunks);
      

      I don't think this assert makes sense in the context of the loop immediately above it: if we know j == nchunks is what causes the loop to terminate, why are we checking i < lsm_tree->nold_chunks at all?

      Here's what we know:

      (gdb) p j
      $7 = 0
      (gdb) p nchunks
      $8 = 15
      (gdb) p i
      $9 = 13
      (gdb) p lsm_tree->nold_chunks
      $10 = 13
      (gdb) p lsm_tree->old_avail
      $11 = 4294967285                <<< ?? (= 2^32 - 11, consistent with an unsigned 32-bit counter decremented below zero)
      

      This code has been in and running for quite a long time, it's part of 3bb9917.

            Assignee:
            Unassigned Unassigned
            Reporter:
            keith.bostic@mongodb.com Keith Bostic (Inactive)
            Votes:
            0 Vote for this issue
            Watchers:
            1 Start watching this issue

              Created:
              Updated:
              Resolved: