Produces a histogram with the writetime (timestamp) of the data in the sstable(s). The histogram is printed to the output, along with general stats about the processed data.
166 lines
4.4 KiB
Lua
166 lines
4.4 KiB
Lua
--
|
|
-- Copyright (C) 2025-present ScyllaDB
|
|
--
|
|
-- SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
|
--
|
|
|
|
-- Produce a histogram of write times (timestamps) in the sstable(s).
|
|
--
|
|
-- Crawl over all timestamps in the data component and add them to a histogram.
|
|
-- The bucket size is a month by default; it can be changed by editing the BUCKET variable below.
|
|
-- The timestamp of all objects that have one are added to the histogram:
|
|
-- * cells (atomic and collection cells)
|
|
-- * tombstones (partition-tombstone, range-tombstone, row-tombstone,
|
|
-- shadowable-tombstone, cell-tombstone, collection-tombstone)
|
|
-- * row-marker
|
|
--
|
|
-- This allows determining when the data was written, provided the writer of the
|
|
-- data didn't tamper with the timestamps.
|
|
-- This produces two lines of output:
|
|
-- 1) Stats about the histogram and the collected data.
|
|
-- 2) A JSON object containing the histogram, with two arrays: "buckets" and "counts".
|
|
--
|
|
-- The histogram can be plotted with the following example python script:
|
|
--
|
|
-- import datetime
|
|
-- import json
|
|
-- import matplotlib.pyplot as plt # requires the matplotlib python package
|
|
--
|
|
-- with open('histogram.json', 'r') as f:
|
|
-- data = json.load(f)
|
|
--
|
|
-- x = data['buckets']
|
|
-- y = data['counts']
|
|
--
|
|
-- max_y = max(y)
|
|
--
|
|
-- x = [datetime.date.fromtimestamp(i / 1000000).strftime('%Y.%m') for i in x]
|
|
-- y = [i / max_y for i in y]
|
|
--
|
|
-- fig, ax = plt.subplots()
|
|
--
|
|
-- ax.set_xlabel('Timestamp')
|
|
-- ax.set_ylabel('Normalized cell count')
|
|
-- ax.set_title('Histogram of data write-time')
|
|
-- ax.bar(x, y)
|
|
--
|
|
-- plt.show()
|
|
|
|
-- Granularity of the histogram buckets.
-- Accepted values: "years", "months", "weeks", "days", "hours".
BUCKET = "months"

-- Running totals, updated by the consume hooks while the stream is processed.
partitions, rows, cells, timestamps = 0, 0, 0, 0

-- Maps a bucket's start timestamp to the number of timestamps that fell into it.
histogram = {}
|
|
|
|
-- Map a timestamp to the start of the bucket it falls into.
--
-- ts: timestamp in microseconds (it is divided by 1000000 to obtain the
--     seconds value handed to os.date()).
-- Returns the start of the bucket, in microseconds. The timestamp is broken
-- down and re-assembled in the local time zone.
-- Raises an error if BUCKET is not one of the supported units.
function timestamp_bucket(ts)
    -- Temporaries are local so they do not leak into the global namespace.
    local date = os.date("*t", ts // 1000000)
    local bucket_start_date

    if BUCKET == "years" then
        bucket_start_date = {year = date.year, month = 1, day = 1}
    elseif BUCKET == "months" then
        bucket_start_date = {year = date.year, month = date.month, day = 1}
    elseif BUCKET == "weeks" then
        -- Weeks are counted from the first day of the month: days 1-7 map to
        -- day 1, days 8-14 to day 8, and so on, so the returned value is the
        -- actual first day of the 7-day bucket the timestamp belongs to.
        local week_start = date.day - (date.day - 1) % 7
        bucket_start_date = {year = date.year, month = date.month, day = week_start}
    elseif BUCKET == "days" then
        bucket_start_date = {year = date.year, month = date.month, day = date.day}
    elseif BUCKET == "hours" then
        bucket_start_date = {year = date.year, month = date.month, day = date.day, hour = date.hour}
    else
        error("Invalid BUCKET value: " .. BUCKET)
    end

    -- NOTE: os.time() fills in a missing hour field with 12 (noon) and
    -- missing min/sec fields with 0.
    return os.time(bucket_start_date) * 1000000
end
|
|
|
|
-- Record one timestamp: bump the global timestamp counter and the count of
-- the histogram bucket that the timestamp belongs to.
--
-- ts: the raw timestamp; it is converted with timestamp_bucket() first.
function collect_timestamp(ts)
    local bucket = timestamp_bucket(ts)

    timestamps = timestamps + 1

    -- A bucket seen for the first time starts counting from zero.
    histogram[bucket] = (histogram[bucket] or 0) + 1
end
|
|
|
|
-- Collect the timestamps found in the cell of a single column.
--
-- A collection contributes its tombstone (when it has one) and one timestamp
-- per element; every element is counted as one cell. Any other cell is
-- counted as one cell and contributes its own timestamp.
--
-- cell: a cell object with a `type` field; collections additionally expose
--       `tombstone` and `values`, other cells expose `timestamp`.
function collect_column(cell)
    if cell.type == "collection" then
        if cell.tombstone then
            collect_timestamp(cell.tombstone.timestamp)
        end
        for _, element in ipairs(cell.values) do
            cells = cells + 1
            -- The write time belongs to the individual element, not to the
            -- collection. Each entry of `values` is expected to be a
            -- {key, value} pair whose `value` is an atomic cell carrying the
            -- timestamp, per the scylla-sstable Lua API description of
            -- Scylla.collection (TODO confirm against the API version in use).
            collect_timestamp(element.value.timestamp)
        end
    else
        cells = cells + 1
        collect_timestamp(cell.timestamp)
    end
end
|
|
|
|
-- Account for one row and collect the timestamps of all of its cells.
--
-- row: table of the row's cells; each value is handed to collect_column().
function collect_row(row)
    -- Count the row exactly once, regardless of how many columns it has.
    -- Incrementing inside the loop would count columns instead of rows.
    rows = rows + 1

    for _, cell in pairs(row) do
        collect_column(cell)
    end
end
|
|
|
|
-- Consume API hooks
|
|
|
|
-- Consume hook: called at the start of each partition.
-- Counts the partition and records the timestamp of its tombstone, if any.
function consume_partition_start(ps)
    partitions = partitions + 1

    local partition_tombstone = ps.tombstone
    if not partition_tombstone then
        return
    end
    collect_timestamp(partition_tombstone.timestamp)
end
|
|
|
|
-- Consume hook: called for a static row.
-- Hands the cells of the static row over to collect_row().
function consume_static_row(sr)
    local static_cells = sr.cells
    collect_row(static_cells)
end
|
|
|
|
-- Consume hook: called for each clustering row.
-- Records the timestamp of the row marker and of the row tombstone (each only
-- when present), then processes the cells of the row.
function consume_clustering_row(cr)
    local row_marker = cr.marker
    if row_marker then
        collect_timestamp(row_marker.timestamp)
    end

    local row_tombstone = cr.tombstone
    if row_tombstone then
        collect_timestamp(row_tombstone.timestamp)
    end

    collect_row(cr.cells)
end
|
|
|
|
-- Consume hook: called for each range tombstone change.
-- Records the timestamp of the tombstone when the change carries one.
function consume_range_tombstone_change(crt)
    local range_tombstone = crt.tombstone
    if not range_tombstone then
        return
    end
    collect_timestamp(range_tombstone.timestamp)
end
|
|
|
|
-- Consume hook: called once the whole stream has been consumed.
-- Prints one line of stats, then emits the histogram as a JSON object with
-- two parallel arrays: "buckets" (bucket start timestamps, ascending) and
-- "counts" (number of timestamps in the bucket at the same index).
function consume_stream_end()
    -- `histogram` is keyed by bucket timestamps rather than by 1..n, so the
    -- length operator cannot be used to count its entries; gather the keys
    -- explicitly instead.
    local buckets = {}
    for bucket in pairs(histogram) do
        buckets[#buckets + 1] = bucket
    end
    -- pairs() visits entries in an unspecified order; sort so that the output
    -- is chronological and reproducible.
    table.sort(buckets)

    print(string.format("Histogram has %d entries, collected from %d partitions, %d rows, %d cells: %d timestamps total", #buckets, partitions, rows, cells, timestamps))

    local writer = Scylla.new_json_writer()

    writer:start_object()

    writer:key("buckets")
    writer:start_array()
    for _, bucket in ipairs(buckets) do
        writer:int(bucket)
    end
    writer:end_array()

    -- Walk the same sorted key list so counts[i] always belongs to buckets[i].
    writer:key("counts")
    writer:start_array()
    for _, bucket in ipairs(buckets) do
        writer:int(histogram[bucket])
    end
    writer:end_array()

    writer:end_object()
end
|