Details
-
Bug
-
Status: Closed
-
Major - P3
-
Resolution: Fixed
-
None
-
None
-
None
Description
While developing a new Python test, I hit a bug where WiredTiger (WT) aborts when separate threads are calling flush_tier and checkpoint.
I've attached a current version of my test as test_tiered08.py.
The failing thread is performing a checkpoint:
Thread 1 (Thread 0x7f0f5288e700 (LWP 2160)):
#0 0x00007f0f5b2f1fb7 in raise () from /lib/x86_64-linux-gnu/libc.so.6
#1 0x00007f0f5b2f3921 in abort () from /lib/x86_64-linux-gnu/libc.so.6
#2 0x00007f0f58b10e23 in __wt_abort (session=session@entry=0x1f11550) at ../src/os_common/os_abort.c:30
#3 0x00007f0f58b08911 in __meta_blk_mods_load (config=<optimized out>, rename=<optimized out>, ckpt=<optimized out>, base_ckpt=<optimized out>, session=<optimized out>) at ../src/meta/meta_ckpt.c:541
#4 __meta_ckptlist_allocate_new_ckpt (config=<optimized out>, allocated=<optimized out>, ckptbasep=<optimized out>, session=<optimized out>) at ../src/meta/meta_ckpt.c:708
#5 __wt_meta_saved_ckptlist_get (session=session@entry=0x1f11550, fname=0x1db66c0 "tiered:test_tiered08", ckptbasep=ckptbasep@entry=0x7f0f5288d098) at ../src/meta/meta_ckpt.c:736
#6 0x00007f0f58bb0683 in __checkpoint_lock_dirty_tree (session=0x1f11550, is_checkpoint=<optimized out>, force=<optimized out>, need_tracking=<optimized out>, cfg=0x7f0f5288d6d0) at ../src/txn/txn_ckpt.c:1472
#7 0x00007f0f58bb1db8 in __wt_checkpoint_get_handles (session=0x1f11550, cfg=0x7f0f5288d6d0) at ../src/txn/txn_ckpt.c:301
#8 0x00007f0f58a5ac00 in __conn_btree_apply_internal (session=session@entry=0x1f11550, dhandle=dhandle@entry=0x1fa7420, file_func=file_func@entry=0x7f0f58bb1bf0 <__wt_checkpoint_get_handles>, name_func=name_func@entry=0x0, cfg=cfg@entry=0x7f0f5288d6d0) at ../src/conn/conn_dhandle.c:662
#9 0x00007f0f58a5c878 in __wt_conn_btree_apply (session=session@entry=0x1f11550, uri=uri@entry=0x0, file_func=file_func@entry=0x7f0f58bb1bf0 <__wt_checkpoint_get_handles>, name_func=name_func@entry=0x0, cfg=cfg@entry=0x7f0f5288d6d0) at ../src/conn/conn_dhandle.c:730
#10 0x00007f0f58bafacd in __checkpoint_apply_operation (session=0x1f11550, cfg=0x7f0f5288d6d0, op=0x7f0f58bb1bf0 <__wt_checkpoint_get_handles>) at ../src/txn/txn_ckpt.c:174
#11 0x00007f0f58bb14b8 in __checkpoint_prepare (session=session@entry=0x1f11550, trackingp=trackingp@entry=0x7f0f5288d56f, cfg=cfg@entry=0x7f0f5288d6d0) at ../src/txn/txn_ckpt.c:655
#12 0x00007f0f58bb3972 in __txn_checkpoint (session=session@entry=0x1f11550, cfg=cfg@entry=0x7f0f5288d6d0) at ../src/txn/txn_ckpt.c:855
#13 0x00007f0f58bb4fec in __txn_checkpoint_wrapper (cfg=0x7f0f5288d6d0, session=0x1f11550) at ../src/txn/txn_ckpt.c:1116
#14 __wt_txn_checkpoint (session=session@entry=0x1f11550, cfg=cfg@entry=0x7f0f5288d6d0, waiting=waiting@entry=true) at ../src/txn/txn_ckpt.c:1170
#15 0x00007f0f58b866e5 in __session_checkpoint (wt_session=0x1f11550, config=0x0) at ../src/session/session_api.c:1902
#16 0x00007f0f58e4b9db in _wrap_Session_checkpoint (self=<optimized out>, args=<optimized out>) at wiredtiger_wrap.c:7467
The failed assert is in __meta_blk_mods_load:
__meta_blk_mods_load(
  WT_SESSION_IMPL *session, const char *config, WT_CKPT *base_ckpt, WT_CKPT *ckpt, bool rename)
{
    /*
     * Load most recent checkpoint backup blocks to this checkpoint, either from metadata or from a
     * previous checkpoint.
     */
    if (config != NULL) {
        /* Load from metadata. */
        WT_RET(__ckpt_load_blk_mods(session, config, ckpt));
        WT_RET(__wt_meta_block_metadata(session, config, ckpt));
    } else {
        /* Load from an existing base checkpoint. */
=====>> WT_ASSERT(session, base_ckpt != NULL);
        WT_RET(__ckpt_copy_blk_mods(session, base_ckpt, ckpt));
        WT_RET(__wt_strndup(session, base_ckpt->block_metadata, strlen(base_ckpt->block_metadata),
          &ckpt->block_metadata));
    }
We have a concurrent flush_tier call from another thread:
#0 0x00007f0f5b3b6ef7 in sched_yield () from /lib/x86_64-linux-gnu/libc.so.6
#1 0x00007f0f58b1a0a8 in __wt_yield () at ../src/os_posix/os_yield.c:25
#2 0x00007f0f58a67b82 in __flush_tier_wait (session=0x1f11158) at ../src/conn/conn_tiered.c:52
#3 __wt_flush_tier (session=session@entry=0x1f11158, config=config@entry=0x0)
    at ../src/conn/conn_tiered.c:393
#4 0x00007f0f58b7d30f in __session_flush_tier (wt_session=0x1f11158, config=0x0)
    at ../src/session/session_api.c:1960
#5 0x00007f0f58e48fb8 in _wrap_Session_flush_tier (self=<optimized out>, args=<optimized out>)
    at wiredtiger_wrap.c:5691
That thread is at the end of __wt_flush_tier, and is waiting for the background flush work to complete. The tiered server is in the process of copying a file to shared storage:
#0 0x00007f0f5b3c1747 in pread64 () from /lib/x86_64-linux-gnu/libc.so.6
#1 0x00007f0f58b17813 in pread (__offset=10813440, __nbytes=65536, __buf=0x7f0f57887c90,
    __fd=<optimized out>) at /usr/include/x86_64-linux-gnu/bits/unistd.h:83
#2 __posix_file_read (file_handle=0x7f0f48003bc0, wt_session=0x1f0e5b0, offset=10813440,
    len=65536, buf=<optimized out>) at ../src/os_posix/os_fs.c:425
#3 0x00007f0f5789ac84 in local_file_copy (local=local@entry=0x1c583a0,
    session=session@entry=0x1f0e5b0,
    src_path=src_path@entry=0x7f0f4800a8d5 "test_tiered08-0000000004.wtobj",
    dest_path=dest_path@entry=0x7f0f4800eb30 "mybucket/test_tiered08-0000000004.wtobj",
    type=WT_FS_OPEN_FILE_TYPE_DATA)
    at ../../../../ext/storage_sources/local_store/local_store.c:557
#4 0x00007f0f5789b8a3 in local_flush (storage_source=0x1c583a0, session=0x1f0e5b0,
    file_system=0x1e951e0, source=0x7f0f4800a8d5 "test_tiered08-0000000004.wtobj",
    object=<optimized out>, config=<optimized out>)
    at ../../../../ext/storage_sources/local_store/local_store.c:603
#5 0x00007f0f58a666e8 in __wt_tier_do_flush (session=session@entry=0x1f0e5b0,
    tiered=tiered@entry=0x1fa7420, local_uri=<optimized out>, obj_uri=<optimized out>)
    at ../src/conn/conn_tiered.c:240
#6 0x00007f0f58a67383 in __wt_tier_flush (session=session@entry=0x1f0e5b0, tiered=0x1fa7420, id=4)
    at ../src/conn/conn_tiered.c:269
#7 0x00007f0f58a674c7 in __tier_storage_copy (session=0x1f0e5b0) at ../src/conn/conn_tiered.c:299
#8 __tiered_server (arg=0x1f0e5b0) at ../src/conn/conn_tiered.c:483
If I remove the calls to flush_tier from the test, it passes. So it appears that the combination of flush_tier and checkpoint is what triggers the problem.