#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright (C) 2017-present ScyllaDB
#
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
#
'''
Generates compression segmentation parameters.
These parameters are used to reduce the memory footprint of the
in-memory compression database. See sstables/compress.hh for more
details.
'''
import argparse
import math
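
# A rough sketch of the idea, as reflected in the math below (see
# sstables/compress.hh for the authoritative description): instead of keeping
# one 64-bit entry per compressed chunk, chunks are grouped n at a time into
# "segments". Each segment stores a single wide absolute offset plus n - 1
# narrower offsets relative to it, and segments are packed into fixed-size
# buckets, which shrinks the per-chunk metadata considerably.
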
def data_size_range_log2():
    # log2 of the data sizes considered: 16 bytes up to 1 PiB
    return range(4, 51)

def chunk_size_range_log2():
    # log2 of the chunk sizes considered: 16 bytes up to 1 GiB
    return range(4, 31)

def base_offset_size(data_size, chunk_size, n):
    # bits needed for an absolute offset into data of the given size
    return int(math.ceil(math.log2(data_size)))

def relative_offset_size(data_size, chunk_size, n):
    # bits needed for an offset relative to the segment's base offset; each of
    # the other n - 1 chunks is assumed to occupy at most chunk_size + 64 bytes
    if n == 1:
        return int(0)
    else:
        return int(math.ceil(math.log2((n - 1) * (chunk_size + 64))))

def segment_size(data_size, chunk_size, n):
    # bits per segment: one absolute base offset plus n - 1 relative offsets
    return base_offset_size(data_size, chunk_size, n) + (n - 1) * relative_offset_size(data_size, chunk_size, n)
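
# Worked example (illustrative numbers only): for data_size = 2**40 (1 TiB),
# chunk_size = 2**16 (64 KiB) and n = 4 chunks per segment:
#   base_offset_size     = ceil(log2(2**40))            = 40 bits
#   relative_offset_size = ceil(log2(3 * (2**16 + 64))) = 18 bits
#   segment_size         = 40 + 3 * 18                  = 94 bits
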
def no_of_segments(data_size, chunk_size, n):
    # number of n-chunk segments needed to cover the whole data
    return int(math.ceil((data_size / chunk_size) / n))

def n_for(data_size, chunk_size, n_values):
    # look up the precomputed optimal chunks-per-segment count for these sizes
    nominal_data_size = int(math.ceil(math.log2(data_size)))
    chunk_size_log2 = int(math.ceil(math.log2(chunk_size)))
    return next(filter(lambda x: x[0] == nominal_data_size and x[1] == chunk_size_log2, n_values))[2]

def size_deque(data_size, chunk_size):
    # bits used by the naive representation: one 64-bit entry per chunk
    return int(math.ceil(data_size / chunk_size)) * 64

def size_grouped_segments(data_size, chunk_size, n):
    # total bits used by the segmented representation
    return no_of_segments(data_size, chunk_size, n) * segment_size(data_size, chunk_size, n)
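
# Continuing the illustrative example above (data_size = 2**40,
# chunk_size = 2**16, n = 4):
#   no_of_segments        = ceil(2**24 / 4)      = 4,194,304 segments
#   size_grouped_segments = 4,194,304 * 94 bits ~= 47 MiB
# versus the naive one-64-bit-entry-per-chunk layout:
#   size_deque            = 2**24 * 64 bits      = 128 MiB
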
def best_nominal_data_size_for_bucket_size(chunk_size, bucket_size, n_values):
    # Pick the nominal data size whose segment layout lets a single bucket
    # address the largest amount of data.
    def addressable_space(data_size_log2):
        data_size = 2**data_size_log2
        n = n_for(data_size, chunk_size, n_values)
        bucket_size_bits = bucket_size * 8
        total_size_bits = size_grouped_segments(data_size, chunk_size, n)
        if bucket_size_bits >= total_size_bits:
            # all segments fit in the bucket, so the whole data is addressable
            return data_size, data_size_log2
        else:
            # only the chunks covered by the segments that fit are addressable
            segments_pb = segments_per_bucket(data_size, chunk_size, n, bucket_size)
            return n * segments_pb * chunk_size, data_size_log2

    space = map(addressable_space, data_size_range_log2())
    return max(space, key=lambda x: x[0])[1]

def segments_per_bucket(data_size, chunk_size, n, bucket_size):
    # A safety padding of 7 bytes has to be reserved at the end
    bucket_size_bits = bucket_size * 8 - 56
    segment_size_bits = segment_size(data_size, chunk_size, n)
    fits = int(math.floor(bucket_size_bits / segment_size_bits))
    # We can't have more segments than the sizes support
    return min(no_of_segments(data_size, chunk_size, n), fits)
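
# Illustrative example: with 4 KiB buckets there are 4096 * 8 - 56 = 32712
# usable bits per bucket, so the 94-bit segments from the example above give
# floor(32712 / 94) = 348 segments per bucket, capped by no_of_segments().
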
def all_n_values():
    # For every (data_size_log2, chunk_size_log2) pair, brute-force the n
    # (chunks per segment) in [1, 200] that minimizes the total size of the
    # segmented representation.
    optimal_sizes = {}
    for f in data_size_range_log2():
        for c in chunk_size_range_log2():
            optimal_size = None
            for n in range(1, 201):
                s = size_grouped_segments(2**f, 2**c, n)
                if optimal_size is None or optimal_size[3] > s:
                    optimal_size = (f, c, n, s)
            optimal_sizes[(f, c)] = optimal_size

    n_values = []
    for k in sorted(optimal_sizes.keys()):
        f, c, n, s = optimal_sizes[k]
        n_values.append((f, c, n))

    return n_values
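
# Each returned entry is a (data_size_log2, chunk_size_log2, n) tuple; the
# entries whose data size is actually selected for some chunk size are emitted
# as segment_info{data_size_log2, chunk_size_log2, grouped_offsets} rows in
# the generated header below.
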
file_str = """
/*
 * Copyright (C) 2017-present ScyllaDB
 */

// SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0

/*
 * This file was autogenerated by gen_segmented_compress_params.py.
 */

#include <array>

namespace sstables {{

const uint64_t bucket_size{{{bucket_size}}};

struct bucket_info {{
    uint64_t chunk_size_log2;
    uint64_t best_data_size_log2;
    uint64_t segments_per_bucket;
}};

// The largest data chunk from the file a bucketful of offsets can
// cover, precalculated for different chunk sizes, plus the number
// of segments that are needed to address the whole area.
const std::array<bucket_info, {bucket_infos_size}> bucket_infos{{{{
{bucket_infos}}}}};

struct segment_info {{
    uint8_t data_size_log2;
    uint8_t chunk_size_log2;
    uint8_t grouped_offsets;
}};

// Precomputed optimal segment information for different data and chunk sizes.
const std::array<segment_info, {segment_infos_size}> segment_infos{{{{
{segment_infos}}}}};

}} // namespace sstables
"""
if __name__ == '__main__':
    cmdline_parser = argparse.ArgumentParser()
    cmdline_parser.add_argument('--bucket-size-log2', action='store', help='specify log2 of the bucket size (defaults to 12, i.e. 4K buckets)')
    args = cmdline_parser.parse_args()

    if args.bucket_size_log2 is not None:
        bucket_size_log2 = int(args.bucket_size_log2)
        if bucket_size_log2 < 10 or bucket_size_log2 > 30:
            print("Bucket size is either too large or too small")
            exit(1)
    else:
        bucket_size_log2 = 12  # 4K

    bucket_size = 2**bucket_size_log2
    n_values = all_n_values()

    with open("sstables/segmented_compress_params.hh", "w") as infos_file:
        bucket_infos = []
        data_sizes = []
        for chunk_size_log2 in chunk_size_range_log2():
            chunk_size = 2**chunk_size_log2
            data_size_log2 = best_nominal_data_size_for_bucket_size(chunk_size, bucket_size, n_values)
            data_size = 2**data_size_log2
            n = n_for(data_size, chunk_size, n_values)
            bucket_infos.append(" {{{}, {}, {} /*out of the max of {}*/}}".format(
                chunk_size_log2,
                data_size_log2,
                segments_per_bucket(data_size, chunk_size, n, bucket_size),  # no of segments that fit into the bucket
                no_of_segments(data_size, chunk_size, n)))  # normal no of segments for these sizes
            data_sizes.append(data_size_log2)

        segment_infos = []
        for n_value in n_values:
            if n_value[0] in data_sizes:
                segment_infos.append(" {{{}, {}, {}}}".format(*n_value))

        infos_file.write(file_str.format(
            bucket_size=bucket_size,
            bucket_infos=",\n".join(bucket_infos),
            bucket_infos_size=len(bucket_infos),
            segment_infos=",\n".join(segment_infos),
            segment_infos_size=len(segment_infos)))
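
# Typical invocation, assuming the script is run from the source tree root
# (so the relative "sstables/" path resolves):
#
#   ./gen_segmented_compress_params.py --bucket-size-log2 12
#
# which regenerates sstables/segmented_compress_params.hh.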