-
Type:
Bug
-
Resolution: Cannot Reproduce
-
Priority:
Major - P3
-
None
-
Affects Version/s: None
-
Component/s: None
-
None
-
Storage Engines 2020-01-13, Storage Engines 2020-01-27
-
5
Investigate a race condition with las usage if __wt_txn_read() uses a las cursor. This was uncovered while doing some experimentation where I made the following code change to open, sleep and close lookaside cursor every 1 in 5 of __wt_txn_read() calls.
diff --git a/src/include/txn.i b/src/include/txn.i
index 757627dff..ad61a9076 100644
--- a/src/include/txn.i
+++ b/src/include/txn.i
@@ -707,6 +707,16 @@ __wt_txn_read(WT_SESSION_IMPL *session, WT_UPDATE *upd, WT_UPDATE **updp)
static WT_UPDATE tombstone = {.txnid = WT_TXN_NONE, .type = WT_UPDATE_TOMBSTONE};
WT_VISIBLE_TYPE upd_visible;
bool skipped_birthmark;
+ WT_CURSOR *cursor;
+ uint32_t session_flags;
+ session_flags = 0;
+
+ // 20% of reads open a las cursor and position
+ if (__wt_random(&session->rnd) % 5 == 0) {
+ __wt_las_cursor(session, &cursor, &session_flags);
+ __wt_sleep(0, 30000);
+ WT_RET(__wt_las_cursor_close(session, &cursor, session_flags));
+ }
*updp = NULL;
for (skipped_birthmark = false; upd != NULL; upd = upd->next) {
Then I ran the following wtperf config:
# Drive a constant high workload through, even if WiredTiger isn't keeping # up by dividing the workload across a lot of threads. This needs to be # tuned to the particular machine so the workload is close to capacity in the # steady state, but not overwhelming. conn_config="cache_size=1GB,session_max=1000,eviction=(threads_min=4,threads_max=8),log=(enabled=false),transaction_sync=(enabled=false),checkpoint_sync=true,checkpoint=(wait=60)" table_config="allocation_size=4k,memory_page_max=10MB,prefix_compression=false,split_pct=90,leaf_page_max=32k,internal_page_max=16k,type=file" # Divide original icount by database_count. table_count=8 compression=snappy icount=10000000 populate_threads=1 reopen_connection=false log_like_table=true #pareto=5 report_interval=1 run_time=15 threads=((count=10,throttle=250,inserts=1),(count=10,throttle=250,updates=1),(count=128,reads=1,ops_per_txn=3)) value_sz=500 sample_interval=5 sample_rate=1
This results in a crash:
#0 0x00000000005d740a in __cursor_row_slot_return (cbt=0x9aab20, rip=0x7f7c0c08d900, upd=0x0) at ../src/include/cursor.i:373
373 page = cbt->ref->page;
[Current thread is 1 (Thread 0x7f7c23fff700 (LWP 3600))]
(gdb) bt
#0 0x00000000005d740a in __cursor_row_slot_return (cbt=0x9aab20, rip=0x7f7c0c08d900, upd=0x0) at ../src/include/cursor.i:373
#1 0x00000000005d866f in __cursor_row_next (cbt=0x9aab20, newpage=false, restart=false) at ../src/btree/bt_curnext.c:384
#2 0x00000000005d90ba in __wt_btcur_next (cbt=0x9aab20, truncating=false) at ../src/btree/bt_curnext.c:653
#3 0x000000000057223e in __curfile_next (cursor=0x9aab20) at ../src/cursor/cur_file.c:92
#4 0x0000000000555636 in __wt_las_cursor_position (cursor=0x9aab20, pageid=15) at ../src/cache/cache_las.c:817
#5 0x000000000051353f in __las_page_instantiate (session=0x7f7c475e0860, ref=0x7f7a5c62d5b0) at ../src/btree/bt_read.c:165
#6 0x0000000000513fee in __page_read_lookaside (session=0x7f7c475e0860, ref=0x7f7a5c62d5b0, previous_state=4, final_statep=0x7f7c23ffe3ac) at ../src/btree/bt_read.c:404
#7 0x00000000005144d1 in __page_read (session=0x7f7c475e0860, ref=0x7f7a5c62d5b0, flags=1024) at ../src/btree/bt_read.c:526
#8 0x0000000000514c14 in __wt_page_in_func (session=0x7f7c475e0860, ref=0x7f7a5c62d5b0, flags=1024, func=0x626b70 <__func__.35297> "__wt_row_search", line=425) at ../src/btree/bt_read.c:640
#9 0x000000000054fe15 in __wt_page_swap_func (session=0x7f7c475e0860, held=0x7f7c2c006ad0, want=0x7f7a5c62d5b0, flags=1024, func=0x626b70 <__func__.35297> "__wt_row_search", line=425)
at ../src/include/btree.i:1631
#10 0x000000000055149a in __wt_row_search (cbt=0x7f7c0c009610, srch_key=0x7f7c0c009710, insert=true, leaf=0x0, leaf_safe=false, leaf_foundp=0x0) at ../src/btree/row_srch.c:425
#11 0x00000000004f1e9a in __cursor_row_search (cbt=0x7f7c0c009610, insert=true, leaf=0x0, leaf_foundp=0x0) at ../src/btree/bt_cursor.c:392
#12 0x00000000004f3459 in __wt_btcur_insert (cbt=0x7f7c0c009610) at ../src/btree/bt_cursor.c:845
#13 0x0000000000574221 in __curfile_insert (cursor=0x7f7c0c009610) at ../src/cursor/cur_file.c:258
#14 0x0000000000408dff in worker (arg=0x7f7c3a6c8400) at ../../../bench/wtperf/wtperf.c:695
#15 0x00007f7c46b456db in start_thread (arg=0x7f7c23fff700) at pthread_create.c:463
#16 0x00007f7c4622d88f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
cbt->ref is NULL.
I did not spend a lot of time debugging. I feel like this points to a bug in our code, but I am not sure. I thought it is best to create a ticket and if someone has time, it can be looked at.
- related to
-
WT-5323 Design history store concurrent access mechanism
-
- Closed
-