Description
@keithbostic, the test/format config below shows some pathological behavior with zlib raw compression. There are a few interconnected issues:
- this config has 512-byte pages, with overflow keys and values;
- the page header takes 64 bytes and zlib's header roughly 12, so only 436 bytes are available on the page for a compressed image (see the arithmetic sketch after this list);
- the first slot the btree layer tells raw compression about is at 449 bytes, which can never fit in that budget;
- if raw compression doesn't take any bytes, it is called again with more data, even if it didn't return EAGAIN (this behavior changed in e737009);
- eviction here is triggered by a page growing past 5MB, and after many failed calls the whole 5MB page is eventually written. The next time it is read in and modified, forced eviction kicks in and the whole cycle starts again.
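To make the numbers concrete, here is a tiny standalone sketch of that space budget. The 64-byte skip matches the WT_BLOCK_COMPRESS_SKIP usage in the patch below; the ~12-byte zlib header is an estimate on my part, not an exact constant:

#include <stdio.h>

/*
 * Sketch of the space budget described above: a 512-byte page, minus the
 * 64 bytes written without compression, minus an estimated ~12 bytes of
 * zlib stream overhead, leaves ~436 bytes for the compressed image, while
 * the first split slot offered to raw compression is at 449 bytes.
 */
int
main(void)
{
	int page_size = 512;		/* leaf page size in this config */
	int compress_skip = 64;		/* uncompressed first 64B of the block */
	int zlib_overhead = 12;		/* rough zlib header estimate */
	int first_slot = 449;		/* first slot offered to raw compression */
	int budget = page_size - compress_skip - zlib_overhead;

	printf("budget = %d bytes, first slot = %d bytes, fits = %s\n",
	    budget, first_slot, first_slot <= budget ? "yes" : "no");
	return (0);
}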
I'd suggest two changes. One is to include some slots before the first allocation size (see the patch below). The other is to only repeat the call to raw compression if EAGAIN is returned, but that apparently causes problems figuring out the next key for row-store splits.
Here is the CONFIG:
############################################
# RUN PARAMETERS
############################################
auto_throttle=1
firstfit=0
# bitcnt not applicable to this run
bloom=1
bloom_bit_count=47
bloom_hash_count=26
bloom_oldest=0
cache=56
checkpoints=1
checksum=uncompressed
chunk_size=4
compaction=0
compression=zlib
data_extend=0
data_source=file
delete_pct=13
dictionary=0
file_type=row-store
hot_backups=0
huffman_key=0
huffman_value=0
insert_pct=32
internal_key_truncation=1
internal_page_max=9
key_gap=16
key_max=256
key_min=256
leaf_page_max=9
logging=0
merge_max=7
merge_threads=3
mmap=1
ops=100000
prefix_compression=1
prefix_compression_min=7
repeat_data_pct=90
reverse=0
rows=100000
runs=1
split_pct=72
statistics=1
threads=1
value_max=1699
value_min=256
# wiredtiger_config not applicable to this run
write_pct=43
############################################
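For what it's worth, the page-size lines in that config are exponents. Assuming I'm reading test/format's convention correctly (an assumption on my part, consistent with the 512-byte pages mentioned above), the translation looks like this:

#include <stdio.h>

/*
 * Assumption: test/format treats internal_page_max/leaf_page_max as the
 * log2 of the page size in bytes, so the "9" above means 512-byte pages.
 */
int
main(void)
{
	int leaf_page_max = 9, internal_page_max = 9;

	printf("leaf_page_max=%d -> %d-byte leaf pages\n",
	    leaf_page_max, 1 << leaf_page_max);
	printf("internal_page_max=%d -> %d-byte internal pages\n",
	    internal_page_max, 1 << internal_page_max);
	return (0);
}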
|
And here is half of the proposed change:
diff --git a/src/btree/rec_write.c b/src/btree/rec_write.c
--- a/src/btree/rec_write.c
+++ b/src/btree/rec_write.c
@@ -1945,10 +1945,10 @@ static int
 		 * We can't compress the first 64B of the block (it must be
 		 * written without compression), and a possible split point
 		 * may appear in that 64B; keep it simple, ignore the first
-		 * allocation size of data, anybody splitting smaller than
+		 * half allocation size of data, anybody splitting smaller than
 		 * that (as calculated before compression), is doing it wrong.
 		 */
-		if ((len = WT_PTRDIFF(cell, dsk)) > btree->allocsize)
+		if ((len = WT_PTRDIFF(cell, dsk)) > btree->allocsize / 2)
 			r->raw_offsets[++slots] =
 			    WT_STORE_SIZE(len - WT_BLOCK_COMPRESS_SKIP);

@@ -1959,10 +1959,11 @@ static int
 	}

 	/*
-	 * If we haven't managed to find at least one split point, we're done,
-	 * don't bother calling the underlying compression function.
+	 * If we haven't managed to find at least one split point, or all of
+	 * the rows fit into a single block, we're done, don't bother calling
+	 * the underlying compression function.
 	 */
-	if (slots == 0) {
+	if (slots == 0 || len <= btree->allocsize) {
 		result_slots = 0;
 		goto no_slots;
 	}
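And for completeness, a rough standalone sketch of the other half of the suggestion, i.e. only repeating the call when raw compression returns EAGAIN. None of these names are the real rec_write.c code or the WT_COMPRESSOR API; raw_compress_stub just stands in for the compressor callback so the control flow compiles on its own:

#include <errno.h>
#include <stdio.h>

/*
 * Stand-in for the compressor's raw-compression callback: it either takes
 * some of the offered slots (returning 0), or returns EAGAIN to ask the
 * caller for more input.  Purely illustrative.
 */
static int
raw_compress_stub(int input_slots, int *result_slotsp)
{
	if (input_slots < 4)		/* pretend 4 slots is the minimum */
		return (EAGAIN);
	*result_slotsp = input_slots - 1;
	return (0);
}

int
main(void)
{
	int result_slots, ret, slots;

	for (slots = 1;; ++slots) {
		result_slots = 0;
		ret = raw_compress_stub(slots, &result_slots);
		if (ret == EAGAIN)	/* only EAGAIN triggers a retry */
			continue;
		if (ret != 0)		/* any other failure is an error */
			return (ret);
		break;			/* success is not retried, even if
					 * zero slots were taken */
	}
	printf("offered %d slots, compressor took %d\n", slots, result_slots);
	return (0);
}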