Files
scylladb/tools/scylla-sstable-scripts/writetime-histogram.lua
Botond Dénes 92f614dc5a tools/scylla-sstable-scripts: introduce writetime-histogram.lua
Produces a histogram with the writetime (timestamp) of the data in the
sstable(s). The histogram is printed to the output, along with general
stats about the processed data.
2025-09-19 11:54:01 +03:00

166 lines
4.4 KiB
Lua

--
-- Copyright (C) 2025-present ScyllaDB
--
-- SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
--
-- Produce a histogram of write times (timestamps) in the sstable(s).
--
-- Crawl over all timestamps in the data component and add them to a histogram.
-- The bucket size is a month by default; it is controlled by the BUCKET variable defined below.
-- The timestamp of all objects that have one are added to the histogram:
-- * cells (atomic and collection cells)
-- * tombstones (partition-tombstone, range-tombstone, row-tombstone,
-- shadowable-tombstone, cell-tombstone, collection-tombstone)
-- * row-marker
--
-- This allows determining when the data was written, provided the writer of the
-- data didn't tamper with the timestamps.
-- This produces two lines of output:
-- 1) Stats about the histogram and the collected data.
-- 2) A JSON object containing the histogram, with two arrays: "buckets" and "counts".
--
-- The JSON output (saved as histogram.json) can be plotted with the following example python script:
--
-- import datetime
-- import json
-- import matplotlib.pyplot as plt # requires the matplotlib python package
--
-- with open('histogram.json', 'r') as f:
--     data = json.load(f)
--
-- x = data['buckets']
-- y = data['counts']
--
-- max_y = max(y)
--
-- x = [datetime.date.fromtimestamp(i / 1000000).strftime('%Y.%m') for i in x]
-- y = [i / max_y for i in y]
--
-- fig, ax = plt.subplots()
--
-- ax.set_xlabel('Timestamp')
-- ax.set_ylabel('Normalized cell count')
-- ax.set_title('Histogram of data write-time')
-- ax.bar(x, y)
--
-- plt.show()
-- Script-wide state. These are deliberately globals: the functions below read
-- and update them by name.

-- Unit of time used as the histogram bucket.
-- One of: "years", "months", "weeks", "days", "hours".
BUCKET = "months"

-- Running counters, reported in the stats line at the end of the stream.
partitions, rows, cells, timestamps = 0, 0, 0, 0

-- Maps bucket start (in microseconds) to the number of timestamps in that bucket.
histogram = {}
-- Map a timestamp to the bucket it belongs to.
--
-- ts: timestamp in microseconds since the epoch.
-- Returns the bucket identifier: the bucket's first instant as produced by
-- os.time(), converted back to microseconds.
-- Raises an error if BUCKET is not one of the supported units.
--
-- Notes:
-- * The conversion goes through os.date("*t") / os.time(), so it uses the
--   local time zone.
-- * os.time() defaults a missing `hour` field to 12, so for every unit except
--   "hours" the returned instant is noon on the first day of the bucket.
-- * "weeks" are aligned to the month, not the calendar week: they start on
--   days 1, 8, 15, 22 and 29, so the last week of a month is short.
function timestamp_bucket(ts)
    local date = os.date("*t", ts // 1000000)
    local bucket_start_date
    if BUCKET == "years" then
        bucket_start_date = {year = date.year, month = 1, day = 1}
    elseif BUCKET == "months" then
        bucket_start_date = {year = date.year, month = date.month, day = 1}
    elseif BUCKET == "weeks" then
        -- Round the day of month down to the first day of its 7-day block.
        local week_start_day = date.day - (date.day - 1) % 7
        bucket_start_date = {year = date.year, month = date.month, day = week_start_day}
    elseif BUCKET == "days" then
        bucket_start_date = {year = date.year, month = date.month, day = date.day}
    elseif BUCKET == "hours" then
        bucket_start_date = {year = date.year, month = date.month, day = date.day, hour = date.hour}
    else
        -- tostring() keeps the message usable even when BUCKET is nil.
        error("Invalid BUCKET value: " .. tostring(BUCKET))
    end
    return os.time(bucket_start_date) * 1000000
end
-- Record a single timestamp in the histogram.
--
-- ts: the raw timestamp; it is converted to its bucket with timestamp_bucket().
-- Bumps the global `timestamps` counter and the count of the matching bucket,
-- creating the bucket on first use.
function collect_timestamp(ts)
    local bucket = timestamp_bucket(ts)
    timestamps = timestamps + 1
    histogram[bucket] = (histogram[bucket] or 0) + 1
end
-- Collect the timestamps carried by a single column value.
--
-- cell: either an atomic cell (has a `timestamp`) or a collection
--       (`type == "collection"`, optional `tombstone`, list of `values`).
-- Bumps the global `cells` counter once per atomic cell or collection element.
function collect_column(cell)
    if cell.type ~= "collection" then
        cells = cells + 1
        collect_timestamp(cell.timestamp)
        return
    end

    if cell.tombstone then
        collect_timestamp(cell.tombstone.timestamp)
    end
    for _, element in ipairs(cell.values) do
        cells = cells + 1
        -- Each element carries its own timestamp; the collection itself has none.
        -- NOTE(review): assumes elements are {key, value} tables whose `value`
        -- is the atomic cell, as in the scylla-sstable script API; confirm.
        collect_timestamp(element.value.timestamp)
    end
end
-- Collect the timestamps of all columns of one row.
--
-- row: table mapping column name to cell, as passed by the consume hooks
--      (`cells` of a static or clustering row).
-- Bumps the global `rows` counter exactly once per call, i.e. once per row
-- (static rows included), regardless of how many columns the row has.
function collect_row(row)
    rows = rows + 1
    for _, cell in pairs(row) do
        collect_column(cell)
    end
end
-- Consume API hooks
-- Consume API hook: a new partition begins.
--
-- ps: the partition start; its optional `tombstone` is the partition tombstone.
-- Counts the partition and records the partition tombstone's timestamp, if any.
function consume_partition_start(ps)
    partitions = partitions + 1

    local partition_tombstone = ps.tombstone
    if not partition_tombstone then
        return
    end
    collect_timestamp(partition_tombstone.timestamp)
end
-- Consume API hook: the static row of the current partition.
--
-- sr: the static row; only its `cells` table is inspected.
function consume_static_row(sr)
    local static_cells = sr.cells
    collect_row(static_cells)
end
-- Consume API hook: a clustering row.
--
-- cr: the clustering row. The following optional parts each contribute one
--     timestamp: `marker` (row marker), `tombstone` (row tombstone) and
--     `shadowable_tombstone`. The row's `cells` are then collected as a row.
function consume_clustering_row(cr)
    if cr.marker then
        collect_timestamp(cr.marker.timestamp)
    end

    local row_tombstone = cr.tombstone
    if row_tombstone then
        collect_timestamp(row_tombstone.timestamp)
    end

    -- The header of this script lists shadowable tombstones among the collected
    -- objects. Skip the shadowable tombstone when it carries the same timestamp
    -- as the row tombstone, so that one deletion is not counted twice.
    -- NOTE(review): attribute name taken from the scylla-sstable script API; confirm.
    local shadowable = cr.shadowable_tombstone
    if shadowable and not (row_tombstone and row_tombstone.timestamp == shadowable.timestamp) then
        collect_timestamp(shadowable.timestamp)
    end

    collect_row(cr.cells)
end
-- Consume API hook: a range tombstone change.
--
-- crt: the range tombstone change; its `tombstone` is optional and, when
--      present, its timestamp is recorded.
function consume_range_tombstone_change(crt)
    local range_tombstone = crt.tombstone
    if not range_tombstone then
        return
    end
    collect_timestamp(range_tombstone.timestamp)
end
-- Consume API hook: the end of the stream; emits the results.
--
-- Prints one line of statistics, then writes a JSON object with two parallel
-- arrays: "buckets" (bucket identifiers, ascending) and "counts" (the number
-- of timestamps in the bucket at the same index).
function consume_stream_end()
    -- `histogram` is keyed by bucket timestamps, not by 1..n, so the length
    -- operator cannot count its entries. Gather the keys explicitly, and sort
    -- them so the output is chronological and the two arrays stay aligned.
    local buckets = {}
    for bucket in pairs(histogram) do
        buckets[#buckets + 1] = bucket
    end
    table.sort(buckets)

    print(string.format("Histogram has %d entries, collected from %d partitions, %d rows, %d cells: %d timestamps total", #buckets, partitions, rows, cells, timestamps))

    local writer = Scylla.new_json_writer()
    writer:start_object()

    writer:key("buckets")
    writer:start_array()
    for _, bucket in ipairs(buckets) do
        writer:int(bucket)
    end
    writer:end_array()

    writer:key("counts")
    writer:start_array()
    for _, bucket in ipairs(buckets) do
        writer:int(histogram[bucket])
    end
    writer:end_array()

    writer:end_object()
end