Details
-
Task
-
Status: Closed
-
Resolution: Done
-
None
-
None
-
None
-
None
Description
A Jenkins job is hanging doing a compact in an LSM tree.
http://mjc.homeunix.org:8180/job/wiredtiger-perf-med-lsm-compact/977/
Potentially relevant stack traces:
(gdb) thread apply all where
|
|
Thread 9 (Thread 0x7fdf97fff700 (LWP 26291)):
|
#0 pthread_cond_timedwait@@GLIBC_2.3.2 ()
|
at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S:239
|
WT-1 0x000000000042386f in __wt_cond_wait (session=0x7fdf98041280, cond=0x7fdf98016160,
|
usecs=<value optimized out>) at ../src/os_posix/os_mtx.c:77
|
WT-2 0x0000000000444b88 in __evict_server (arg=0x7fdf98041280) at ../src/btree/bt_evict.c:194
|
WT-3 0x00007fdf98ca1ddb in start_thread (arg=0x7fdf97fff700) at pthread_create.c:301
|
WT-4 0x00007fdf989efa1d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115
|
|
Thread 6 (Thread 0x7fdf967fc700 (LWP 26294)):
|
#0 0x00007fdf989e8aa3 in select () at ../sysdeps/unix/syscall-template.S:82
|
WT-1 0x0000000000425958 in __wt_sleep (seconds=<value optimized out>,
|
micro_seconds=<value optimized out>) at ../src/os_posix/os_sleep.c:22
|
WT-2 0x000000000041eb77 in __wt_lsm_compact (session=0x7fdf98041a00, name=<value optimized out>,
|
skip=<value optimized out>) at ../src/lsm/lsm_tree.c:1134
|
WT-3 0x00000000004318c7 in __wt_schema_worker (session=0x7fdf98041a00, uri=0x7fdf75879060 "lsm:test",
|
file_func=0, name_func=0x41e910 <__wt_lsm_compact>, cfg=0x7fdf967fbd20, open_flags=0)
|
at ../src/schema/schema_worker.c:37
|
WT-4 0x0000000000431deb in __wt_schema_worker (session=0x7fdf98041a00, uri=<value optimized out>,
|
file_func=0, name_func=0x41e910 <__wt_lsm_compact>, cfg=0x7fdf967fbd20, open_flags=0)
|
at ../src/schema/schema_worker.c:86
|
WT-5 0x000000000043538a in __wt_session_compact (wt_session=0x7fdf98041a00,
|
uri=0x7fdf98020270 "table:test", config=<value optimized out>)
|
at ../src/session/session_compact.c:229
|
WT-6 0x000000000044195f in __async_worker_execop (arg=0x7fdf98041a00)
|
at ../src/async/async_worker.c:198
|
WT-7 __async_worker_op (arg=0x7fdf98041a00) at ../src/async/async_worker.c:253
|
WT-8 __wt_async_worker (arg=0x7fdf98041a00) at ../src/async/async_worker.c:310
|
WT-9 0x00007fdf98ca1ddb in start_thread (arg=0x7fdf967fc700) at pthread_create.c:301
|
WT-10 0x00007fdf989efa1d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115
|
|
Thread 5 (Thread 0x7fdf95ffb700 (LWP 26295)):
|
#0 0x00007fdf989e8aa3 in select () at ../sysdeps/unix/syscall-template.S:82
|
WT-1 0x0000000000425958 in __wt_sleep (seconds=<value optimized out>,
|
---Type <return> to continue, or q <return> to quit---
|
micro_seconds=<value optimized out>) at ../src/os_posix/os_sleep.c:22
|
WT-2 0x000000000041bece in __lsm_manager_run_server (arg=<value optimized out>)
|
at ../src/lsm/lsm_manager.c:312
|
WT-3 __lsm_worker_manager (arg=<value optimized out>) at ../src/lsm/lsm_manager.c:377
|
WT-4 0x00007fdf98ca1ddb in start_thread (arg=0x7fdf95ffb700) at pthread_create.c:301
|
WT-5 0x00007fdf989efa1d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115
|
|
Thread 4 (Thread 0x7fdf957fa700 (LWP 26296)):
|
#0 pthread_cond_timedwait@@GLIBC_2.3.2 ()
|
at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S:239
|
WT-1 0x000000000042386f in __wt_cond_wait (session=0x7fdf98042180, cond=0x7fdf98016080,
|
usecs=<value optimized out>) at ../src/os_posix/os_mtx.c:77
|
WT-2 0x000000000041f177 in __lsm_worker (arg=0x7fdf9800f0a0) at ../src/lsm/lsm_worker.c:149
|
WT-3 0x00007fdf98ca1ddb in start_thread (arg=0x7fdf957fa700) at pthread_create.c:301
|
WT-4 0x00007fdf989efa1d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115
|
|
Thread 3 (Thread 0x7fdf947ff700 (LWP 26297)):
|
#0 pthread_cond_timedwait@@GLIBC_2.3.2 ()
|
at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S:239
|
WT-1 0x000000000042386f in __wt_cond_wait (session=0x7fdf98042400, cond=0x7fdf98016080,
|
usecs=<value optimized out>) at ../src/os_posix/os_mtx.c:77
|
WT-2 0x000000000041f177 in __lsm_worker (arg=0x7fdf9800f0c0) at ../src/lsm/lsm_worker.c:149
|
WT-3 0x00007fdf98ca1ddb in start_thread (arg=0x7fdf947ff700) at pthread_create.c:301
|
WT-4 0x00007fdf989efa1d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115
|
|
Thread 2 (Thread 0x7fdf93ffe700 (LWP 26298)):
|
#0 pthread_cond_timedwait@@GLIBC_2.3.2 ()
|
at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S:239
|
WT-1 0x000000000042386f in __wt_cond_wait (session=0x7fdf98042680, cond=0x7fdf98016080,
|
usecs=<value optimized out>) at ../src/os_posix/os_mtx.c:77
|
WT-2 0x000000000041f177 in __lsm_worker (arg=0x7fdf9800f0e0) at ../src/lsm/lsm_worker.c:149
|
WT-3 0x00007fdf98ca1ddb in start_thread (arg=0x7fdf93ffe700) at pthread_create.c:301
|
WT-4 0x00007fdf989efa1d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115
|
|
Information:
(gdb) p /x lsm_tree->flags
|
$2 = 0x33
|
(gdb) list lsm.h:197
|
192 int freeing_old_chunks; /* Whether chunks are being freed */
|
193 uint32_t merge_aggressiveness; /* Increase amount of work per merge */
|
194
|
195 #define WT_LSM_TREE_ACTIVE 0x01 /* Workers are active */
|
196 #define WT_LSM_TREE_COMPACT_FLUSH 0x02 /* Flushed for compact */
|
197 #define WT_LSM_TREE_COMPACTING 0x04 /* Tree being compacted */
|
198 #define WT_LSM_TREE_NEED_SWITCH 0x08 /* New chunk needs creating */
|
199 #define WT_LSM_TREE_OPEN 0x10 /* The tree is open */
|
200 #define WT_LSM_TREE_THROTTLE 0x20 /* Throttle updates */
|
201 uint32_t flags;
|
(gdb) p lsm_tree->chunk[0]
|
$4 = (WT_LSM_CHUNK *) 0x7fdf86045480
|
(gdb) p lsm_tree->chunk[1]
|
$5 = (WT_LSM_CHUNK *) 0x7fdf8640e580
|
(gdb) p *lsm_tree->chunk[0]
|
$6 = {uri = 0x7fdf3d5c3520 "file:test-000110.lsm", bloom_uri = 0x0, create_ts = {
|
tv_sec = 1410541054, tv_nsec = 666855286}, count = 49624376, size = 6401531904, switch_txn = 0,
|
id = 110, generation = 3, refcnt = 0, bloom_busy = 0, empty = 0 '\000', evicted = 0 '\000',
|
flags = 4}
|
(gdb) p *lsm_tree->chunk[1]
|
$7 = {uri = 0x7fdf3df38ca0 "file:test-000106.lsm", bloom_uri = 0x0, create_ts = {
|
tv_sec = 1410540986, tv_nsec = 133071004}, count = 375624, size = 0, switch_txn = 50001018,
|
id = 106, generation = 0, refcnt = 1, bloom_busy = 0, empty = 0 '\000', evicted = 0 '\000',
|
flags = 0}
|
There are only three entries in the app queue:
(gdb) p manager->appqh
|
$17 = {tqh_first = 0x7fdf948a93a0, tqh_last = 0x7fdf948a9460}
|
(gdb) p $17->tqh_first
|
$19 = (struct __wt_lsm_work_unit *) 0x7fdf948a93a0
|
(gdb) p *$19
|
$20 = {q = {tqe_next = 0x7fdf948a9400, tqe_prev = 0x7fdf9802be50}, flags = 2,
|
lsm_tree = 0x7fdf9808d000}
|
(gdb) p *$19->q.tqe_next
|
$21 = {q = {tqe_next = 0x7fdf948a9460, tqe_prev = 0x7fdf948a93a0}, flags = 4,
|
lsm_tree = 0x7fdf9808d000}
|
(gdb) p *$21->q.tqe_next
|
$22 = {q = {tqe_next = 0x0, tqe_prev = 0x7fdf948a9400}, flags = 1, lsm_tree = 0x7fdf9808d000}
|
(gdb) p *$22->q.tqe_next
|
Cannot access memory at address 0x0
|
One entry in the manager queue:
(gdb) p manager->managerqh
|
$18 = {tqh_first = 0x7fdf948a9280, tqh_last = 0x7fdf948a94a0}
|
(gdb) p *$18->tqh_first->q.tqe_next
|
$23 = {q = {tqe_next = 0x0, tqe_prev = 0x7fdf948a9280}, flags = 8, lsm_tree = 0x7fdf9808d000}
|
(gdb) p *$23->q.tqe_next
|
None in the switch queue.
The worker code is never retrieving a chunk to flush (chunk is always NULL at line 56 of lsm_worker.c).
The compact code is never pushing more forced flushes. I think it's a problem with:
/*
|
* The flush flag is cleared when the the chunk has been
|
* flushed. Continue to push forced flushes until the
|
* chunk is on disk. Once it is on disk move to the compacting
|
* phase.
|
*/
|
if (flushing && !F_ISSET(lsm_tree, WT_LSM_TREE_COMPACT_FLUSH)) {
|
if (chunk != NULL &&
|
!F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
|
WT_ERR(__wt_verbose(session, WT_VERB_LSM,
|
"Compact flush retry %s chunk %u",
|
name, chunk->id));
|
F_SET(lsm_tree, WT_LSM_TREE_COMPACT_FLUSH);
|
WT_ERR(__wt_lsm_manager_push_entry(session,
|
WT_LSM_WORK_FLUSH | WT_LSM_WORK_FORCE,
|
lsm_tree));
|
} else {
|
if (ref) {
|
WT_ASSERT(session, chunk != NULL);
|
WT_ERR(__wt_verbose(session,
|
WT_VERB_LSM,
|
"Compact flush done %s chunk %u",
|
name, chunk->id));
|
(void)WT_ATOMIC_SUB(chunk->refcnt, 1);
|
}
|
flushing = ref = 0;
|
compacting = 1;
|
F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
|
}
|
}
|
It should be
(flushing && F_ISSET(lsm_tree, WT_LSM_TREE_COMPACT_FLUSH))
|
Attachments
Issue Links
- is related to
-
WT-1201 medium-lsm-multi failure in WT_CONNECTION::close
- Closed
-
WT-1247 compaction still hanging
- Closed
- related to
-
WT-2 What does metadata look like?
- Closed
-
WT-3 What file formats are required?
- Closed
-
WT-4 Flexible cursor traversals
- Closed
-
WT-5 How does pget work: is it necessary?
- Closed
-
WT-6 Complex schema example
- Closed
-
WT-7 Do we need the handle->err/errx methods?
- Closed
-
WT-8 Do we need table load, bulk-load and/or dump methods?
- Closed
-
WT-9 Does adding schema need to be transactional?
- Closed
-
WT-10 Basic "getting started" tutorial
- Closed
-
WT-1226 Fix a hang in LSM compact. Refs #1225
- Closed