Details
-
Bug
-
Status: Closed
-
Major - P3
-
Resolution: Fixed
-
None
-
None
Description
Found this while working on WT-5676, but was able to show the problem in develop with a relatively short workload. To reproduce, put the attached Python script into bench/workgen/runner and run it via: gdb --args python rwstress.py. It takes about 20 seconds overall. Sometimes I see malloc complaints like "corrupted size vs. prev_size" or "double free or corruption (!prev)"; other times I see assertion failures. But invariably, __split_multi_inmem and __wt_split_rewrite are in the stack.
(gdb) bt
|
#0 0x00007ffff7a22e97 in raise () from /lib/x86_64-linux-gnu/libc.so.6
|
#1 0x00007ffff7a24801 in abort () from /lib/x86_64-linux-gnu/libc.so.6
|
#2 0x00007ffff67559ac in __wt_abort (session=session@entry=0x7ffff48c4868)
|
at ../src/os_common/os_abort.c:30
|
#3 0x00007ffff667f290 in __split_multi_inmem (
|
session=session@entry=0x7ffff48c4868, orig=orig@entry=0x7ffe3009cb20,
|
multi=multi@entry=0x7fff58883f90, ref=0x7fff5821d220)
|
at ../src/btree/bt_split.c:1469
|
#4 0x00007ffff668b0d9 in __wt_split_rewrite (
|
session=session@entry=0x7ffff48c4868, ref=ref@entry=0x7fff38920150,
|
multi=0x7fff58883f90) at ../src/btree/bt_split.c:2273
|
#5 0x00007ffff6720a9f in __evict_page_dirty_update (evict_flags=0,
|
ref=0x7fff38920150, session=0x7ffff48c4868)
|
at ../src/evict/evict_page.c:394
|
#6 __wt_evict (session=session@entry=0x7ffff48c4868,
|
ref=ref@entry=0x7fff38920150,
|
previous_state=previous_state@entry=3 '\003', flags=flags@entry=0)
|
at ../src/evict/evict_page.c:219
|
#7 0x00007ffff671848d in __evict_page (session=session@entry=0x7ffff48c4868,
|
is_server=is_server@entry=false) at ../src/evict/evict_lru.c:2263
|
#8 0x00007ffff6719730 in __wt_cache_eviction_worker (
|
session=session@entry=0x7ffff48c4868, busy=<optimized out>,
|
readonly=readonly@entry=false, pct_full=<optimized out>)
|
at ../src/evict/evict_lru.c:2350
|
#9 0x00007ffff66441ca in __wt_cache_eviction_check (didworkp=0x0,
|
readonly=false, busy=<optimized out>, session=0x7ffff48c4868)
|
at ../src/include/cache.i:427
|
#10 __cursor_enter (session=0x7ffff48c4868) at ../src/include/cursor.i:165
|
#11 __cursor_func_init (cbt=0x555556593f00, reenter=<optimized out>)
|
at ../src/include/cursor.i:395
|
#12 0x00007ffff6647d1e in __wt_btcur_search (cbt=cbt@entry=0x555556593f00)
|
at ../src/btree/bt_cursor.c:568
|
#13 0x00007ffff66e6a7f in __curfile_search (cursor=0x555556593f00)
|
at ../src/cursor/cur_file.c:200
|
#14 0x00007ffff59103cd in workgen::ThreadRunner::op_run (this=0x555555f31cc0,
|
op=0x555555e859f8) at ../../../bench/workgen/workgen.cxx:896
|
#15 0x00007ffff5910d5d in workgen::ThreadRunner::run (this=0x555555f31cc0)
|
at ../../../bench/workgen/workgen.cxx:625
|
#16 0x00007ffff5910f09 in workgen::thread_runner_main (arg=0x555555f31cc0)
|
at ../../../bench/workgen/workgen.cxx:118
|
#17 0x00007ffff77cc6db in start_thread ()
|
from /lib/x86_64-linux-gnu/libpthread.so.0
|
#18 0x00007ffff7b0588f in clone () from /lib/x86_64-linux-gnu/libc.so.6
|
(gdb) up
|
#1 0x00007ffff7a24801 in abort () from /lib/x86_64-linux-gnu/libc.so.6
|
(gdb) up
|
#2 0x00007ffff67559ac in __wt_abort (session=session@entry=0x7ffff48c4868)
|
at ../src/os_common/os_abort.c:30
|
30 abort();
|
(gdb) up
|
#3 0x00007ffff667f290 in __split_multi_inmem (
|
session=session@entry=0x7ffff48c4868, orig=orig@entry=0x7ffe3009cb20,
|
multi=multi@entry=0x7fff58883f90, ref=0x7fff5821d220)
|
at ../src/btree/bt_split.c:1469
|
1469 WT_ASSERT(session, upd != supd->onpage_upd);
|
(gdb) p upd
|
$1 = (WT_UPDATE *) 0x7ffe98b1f300
|
(gdb) p supd->onpage_upd
|
$2 = (WT_UPDATE *) 0x7ffe98b1f300
|
This workload populates a table and then runs a couple hundred worker threads to insert/update/read, then stops after 10 seconds. After the 10 seconds, gdb reports one or two worker threads as exited, and at that point one of the remaining threads aborts on a malloc error, aborts on an assertion failure, or hits a SEGV.
Here's another stack where malloc detects the error, very similar:
double free or corruption (!prev)
|
Thread 61 "python" received signal SIGABRT, Aborted.
|
[Switching to Thread 0x7fff3f7fe700 (LWP 13417)]
|
0x00007ffff7a22e97 in raise () from /lib/x86_64-linux-gnu/libc.so.6
|
(gdb) bt
|
#0 0x00007ffff7a22e97 in raise () from /lib/x86_64-linux-gnu/libc.so.6
|
#1 0x00007ffff7a24801 in abort () from /lib/x86_64-linux-gnu/libc.so.6
|
#2 0x00007ffff7a6d897 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
|
#3 0x00007ffff7a7490a in ?? () from /lib/x86_64-linux-gnu/libc.so.6
|
#4 0x00007ffff7a7be84 in free () from /lib/x86_64-linux-gnu/libc.so.6
|
#5 0x00007ffff676f1c3 in __wt_free_int (session=session@entry=0x7ffff4a303f0, p_arg=p_arg@entry=0x7fff3f7fd888)
|
at ../src/os_common/os_alloc.c:301
|
#6 0x00007ffff6687b59 in __wt_free_update_list (session=session@entry=0x7ffff4a303f0,
|
updp=updp@entry=0x7fff4443d060) at ../src/btree/bt_discard.c:443
|
#7 0x00007ffff66a5f74 in __split_multi_inmem_final (session=session@entry=0x7ffff4a303f0,
|
orig=orig@entry=0x7ffeb09b0800, multi=multi@entry=0x7fff441ebdc0) at ../src/btree/bt_split.c:1578
|
#8 0x00007ffff66aee8e in __wt_split_rewrite (session=session@entry=0x7ffff4a303f0, ref=ref@entry=0x7ffe1879a060,
|
multi=0x7fff441ebdc0) at ../src/btree/bt_split.c:2280
|
#9 0x00007ffff673f57d in __evict_page_dirty_update (evict_flags=0, ref=0x7ffe1879a060, session=0x7ffff4a303f0)
|
at ../src/evict/evict_page.c:394
|
#10 __wt_evict (session=session@entry=0x7ffff4a303f0, ref=ref@entry=0x7ffe1879a060,
|
previous_state=previous_state@entry=3 '\003', flags=flags@entry=0) at ../src/evict/evict_page.c:219
|
#11 0x00007ffff673790c in __evict_page (session=session@entry=0x7ffff4a303f0, is_server=is_server@entry=false)
|
at ../src/evict/evict_lru.c:2263
|
#12 0x00007ffff6738a52 in __wt_cache_eviction_worker (session=session@entry=0x7ffff4a303f0, busy=<optimized out>,
|
readonly=readonly@entry=true, pct_full=<optimized out>) at ../src/evict/evict_lru.c:2350
|
#13 0x00007ffff67bf94f in __wt_cache_eviction_check (didworkp=0x0, readonly=true, busy=<optimized out>,
|
session=0x7ffff4a303f0) at ../src/include/cache.i:427
|
#14 __wt_txn_begin (cfg=0x7fff3f7fdb90, session=0x7ffff4a303f0) at ../src/include/txn.i:931
|
#15 __session_begin_transaction (wt_session=0x7ffff4a303f0, config=<optimized out>)
|
at ../src/session/session_api.c:1627
|
#16 0x00007ffff5b8b386 in workgen::ThreadRunner::op_run (this=0x555555f36218, op=0x555555f0f718)
|
at ../../../bench/workgen/workgen.cxx:881
|
#17 0x00007ffff5b8bd5d in workgen::ThreadRunner::run (this=0x555555f36218)
|
at ../../../bench/workgen/workgen.cxx:625
|
#18 0x00007ffff5b8bf09 in workgen::thread_runner_main (arg=0x555555f36218)
|
at ../../../bench/workgen/workgen.cxx:118
|
#19 0x00007ffff77cc6db in start_thread () from /lib/x86_64-linux-gnu/libpthread.so.0
|
#20 0x00007ffff7b0588f in clone () from /lib/x86_64-linux-gnu/libc.so.6
|
This seems easily reproducible in the develop branch.