In developing a new python test I hit a bug where WT aborts when separate threads are calling flush_tier and checkpoint.
I've attached a current version of my test as test_tiered08.py.
The failing thread is performing a checkpoint:
Thread 1 (Thread 0x7f0f5288e700 (LWP 2160)): #0 0x00007f0f5b2f1fb7 in raise () from /lib/x86_64-linux-gnu/libc.so.6 #1 0x00007f0f5b2f3921 in abort () from /lib/x86_64-linux-gnu/libc.so.6 #2 0x00007f0f58b10e23 in __wt_abort (session=session@entry=0x1f11550) at ../src/os_common/os_abort.c:30 #3 0x00007f0f58b08911 in __meta_blk_mods_load (config=<optimized out>, rename=<optimized out>, ckpt=<optimized out>, base_ckpt=<optimized out>, session=<optimized out>) at ../src/meta/meta_ckpt.c:541 #4 __meta_ckptlist_allocate_new_ckpt (config=<optimized out>, allocated=<optimized out>, ckptbasep=<optimized out>, session=<optimized out>) at ../src/meta/meta_ckpt.c:708 #5 __wt_meta_saved_ckptlist_get (session=session@entry=0x1f11550, fname=0x1db66c0 "tiered:test_tiered08", ckptbasep=ckptbasep@entry=0x7f0f5288d098) at ../src/meta/meta_ckpt.c:736 #6 0x00007f0f58bb0683 in __checkpoint_lock_dirty_tree (session=0x1f11550, is_checkpoint=<optimized out>, force=<optimized out>, need_tracking=<optimized out>, cfg=0x7f0f5288d6d0) at ../src/txn/txn_ckpt.c:1472 #7 0x00007f0f58bb1db8 in __wt_checkpoint_get_handles (session=0x1f11550, cfg=0x7f0f5288d6d0) at ../src/txn/txn_ckpt.c:301 #8 0x00007f0f58a5ac00 in __conn_btree_apply_internal (session=session@entry=0x1f11550, dhandle=dhandle@entry=0x1fa7420, file_func=file_func@entry=0x7f0f58bb1bf0 <__wt_checkpoint_get_handles>, name_func=name_func@entry=0x0, cfg=cfg@entry=0x7f0f5288d6d0) at ../src/conn/conn_dhandle.c:662 #9 0x00007f0f58a5c878 in __wt_conn_btree_apply (session=session@entry=0x1f11550, uri=uri@entry=0x0, file_func=file_func@entry=0x7f0f58bb1bf0 <__wt_checkpoint_get_handles>, name_func=name_func@entry=0x0, cfg=cfg@entry=0x7f0f5288d6d0) at ../src/conn/conn_dhandle.c:730 #10 0x00007f0f58bafacd in __checkpoint_apply_operation (session=0x1f11550, cfg=0x7f0f5288d6d0, op=0x7f0f58bb1bf0 <__wt_checkpoint_get_handles>) at ../src/txn/txn_ckpt.c:174 #11 0x00007f0f58bb14b8 in __checkpoint_prepare (session=session@entry=0x1f11550, trackingp=trackingp@entry=0x7f0f5288d56f, cfg=cfg@entry=0x7f0f5288d6d0) at ../src/txn/txn_ckpt.c:655 #12 0x00007f0f58bb3972 in __txn_checkpoint (session=session@entry=0x1f11550, cfg=cfg@entry=0x7f0f5288d6d0) at ../src/txn/txn_ckpt.c:855 #13 0x00007f0f58bb4fec in __txn_checkpoint_wrapper (cfg=0x7f0f5288d6d0, session=0x1f11550) at ../src/txn/txn_ckpt.c:1116 #14 __wt_txn_checkpoint (session=session@entry=0x1f11550, cfg=cfg@entry=0x7f0f5288d6d0, waiting=waiting@entry=true) at ../src/txn/txn_ckpt.c:1170 #15 0x00007f0f58b866e5 in __session_checkpoint (wt_session=0x1f11550, config=0x0) at ../src/session/session_api.c:1902 #16 0x00007f0f58e4b9db in _wrap_Session_checkpoint (self=<optimized out>, args=<optimized out>) at wiredtiger_wrap.c:7467
The failed assert is in __meta_blk_mods_load:
__meta_blk_mods_load( WT_SESSION_IMPL *session, const char *config, WT_CKPT *base_ckpt, WT_CKPT *ckpt, bool rename) { /* * Load most recent checkpoint backup blocks to this checkpoint, either from metadata or from a * previous checkpoint. */ if (config != NULL) { /* Load from metadata. */ WT_RET(__ckpt_load_blk_mods(session, config, ckpt)); WT_RET(__wt_meta_block_metadata(session, config, ckpt)); } else { /* Load from an existing base checkpoint. */ =====>> WT_ASSERT(session, base_ckpt != NULL); WT_RET(__ckpt_copy_blk_mods(session, base_ckpt, ckpt)); WT_RET(__wt_strndup(session, base_ckpt->block_metadata, strlen(base_ckpt->block_metadata), &ckpt->block_metadata)); }
We have a concurrent flush_tier call from another thread
#0 0x00007f0f5b3b6ef7 in sched_yield () from /lib/x86_64-linux-gnu/libc.so.6 #1 0x00007f0f58b1a0a8 in __wt_yield () at ../src/os_posix/os_yield.c:25 #2 0x00007f0f58a67b82 in __flush_tier_wait (session=0x1f11158) at ../src/conn/conn_tiered.c:52 #3 __wt_flush_tier (session=session@entry=0x1f11158, config=config@entry=0x0) at ../src/conn/conn_tiered.c:393 #4 0x00007f0f58b7d30f in __session_flush_tier (wt_session=0x1f11158, config=0x0) at ../src/session/session_api.c:1960 #5 0x00007f0f58e48fb8 in _wrap_Session_flush_tier (self=<optimized out>, args=<optimized out>) at wiredtiger_wrap.c:5691
That thread is at the end of __wt_flush_tier, and is waiting for the background flush work to complete. The tiered server is in the process of copying a file to shared storage:
#0 0x00007f0f5b3c1747 in pread64 () from /lib/x86_64-linux-gnu/libc.so.6 #1 0x00007f0f58b17813 in pread (__offset=10813440, __nbytes=65536, __buf=0x7f0f57887c90, __fd=<optimized out>) at /usr/include/x86_64-linux-gnu/bits/unistd.h:83 #2 __posix_file_read (file_handle=0x7f0f48003bc0, wt_session=0x1f0e5b0, offset=10813440, len=65536, buf=<optimized out>) at ../src/os_posix/os_fs.c:425 #3 0x00007f0f5789ac84 in local_file_copy (local=local@entry=0x1c583a0, session=session@entry=0x1f0e5b0, src_path=src_path@entry=0x7f0f4800a8d5 "test_tiered08-0000000004.wtobj", dest_path=dest_path@entry=0x7f0f4800eb30 "mybucket/test_tiered08-0000000004.wtobj", type=WT_FS_OPEN_FILE_TYPE_DATA) at ../../../../ext/storage_sources/local_store/local_store.c:557 #4 0x00007f0f5789b8a3 in local_flush (storage_source=0x1c583a0, session=0x1f0e5b0, file_system=0x1e951e0, source=0x7f0f4800a8d5 "test_tiered08-0000000004.wtobj", object=<optimized out>, config=<optimized out>) at ../../../../ext/storage_sources/local_store/local_store.c:603 #5 0x00007f0f58a666e8 in __wt_tier_do_flush (session=session@entry=0x1f0e5b0, tiered=tiered@entry=0x1fa7420, local_uri=<optimized out>, obj_uri=<optimized out>) at ../src/conn/conn_tiered.c:240 #6 0x00007f0f58a67383 in __wt_tier_flush (session=session@entry=0x1f0e5b0, tiered=0x1fa7420, id=4) at ../src/conn/conn_tiered.c:269 #7 0x00007f0f58a674c7 in __tier_storage_copy (session=0x1f0e5b0) at ../src/conn/conn_tiered.c:299 #8 __tiered_server (arg=0x1f0e5b0) at ../src/conn/conn_tiered.c:483
If I remove the calls to flush_tier from the test, it passes. So it appears that the combination of flush_tier and checkpoint is what triggers the problem.