mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-20 08:30:35 +00:00
166 lines
4.4 KiB
Lua
166 lines
4.4 KiB
Lua
--
|
|
-- Copyright (C) 2025-present ScyllaDB
|
|
--
|
|
-- SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
|
--
|
|
|
|
-- Produce a histogram of write times (timestamps) in the sstable(s).
|
|
--
|
|
-- Crawl over all timestamps in the data component and add them to a histogram.
|
|
-- The bucket size is a month by default; it is tunable via the BUCKET variable below.
|
|
-- The timestamp of all objects that have one are added to the histogram:
|
|
-- * cells (atomic and collection cells)
|
|
-- * tombstones (partition-tombstone, range-tombstone, row-tombstone,
|
|
--   shadowable-tombstone, cell-tombstone, collection-tombstone)
|
|
-- * row-marker
|
|
--
|
|
-- This allows determining when the data was written, provided the writer of the
|
|
-- data didn't tamper with the timestamps.
|
|
-- This produces two lines of output:
|
|
-- 1) Stats about the histogram and the collected data.
|
|
-- 2) A JSON object containing the histogram, with two arrays: "buckets" and "counts".
|
|
--
|
|
-- The JSON object can be saved to a file (e.g. histogram.json) and plotted with the following example python script:
|
|
--
|
|
-- import datetime
|
|
-- import json
|
|
-- import matplotlib.pyplot as plt # requires the matplotlib python package
|
|
--
|
|
-- with open('histogram.json', 'r') as f:
|
|
-- data = json.load(f)
|
|
--
|
|
-- x = data['buckets']
|
|
-- y = data['counts']
|
|
--
|
|
-- max_y = max(y)
|
|
--
|
|
-- x = [datetime.date.fromtimestamp(i / 1000000).strftime('%Y.%m') for i in x]
|
|
-- y = [i / max_y for i in y]
|
|
--
|
|
-- fig, ax = plt.subplots()
|
|
--
|
|
-- ax.set_xlabel('Timestamp')
|
|
-- ax.set_ylabel('Normalized cell count')
|
|
-- ax.set_title('Histogram of data write-time')
|
|
-- ax.bar(x, y)
|
|
--
|
|
-- plt.show()
|
|
|
|
-- Bucket granularity of the histogram; must be one of:
-- "years", "months", "weeks", "days", "hours".
BUCKET = "months"

-- Running totals, updated while the stream is consumed.
partitions, rows, cells, timestamps = 0, 0, 0, 0

-- Maps a bucket key (as returned by timestamp_bucket()) to the number of
-- timestamps that fell into that bucket.
histogram = {}
|
|
|
|
-- Map a write timestamp to the key of the histogram bucket it belongs to.
--
-- ts is a timestamp in microseconds. It is truncated to whole seconds,
-- broken down into local calendar fields with os.date("*t") and rounded
-- down to the start of the enclosing BUCKET ("years", "months", "weeks",
-- "days" or "hours").
--
-- Weeks are month-relative seven-day chunks starting on days 1, 8, 15, 22
-- and 29 of the month, so a week bucket never spans two months.
--
-- Returns the bucket key in microseconds. Fields left out of the table
-- passed to os.time() take that function's defaults (hour = 12, min = 0,
-- sec = 0), so every bucket coarser than "hours" is keyed at noon local
-- time of its first day.
--
-- Raises an error if BUCKET is not one of the supported units.
function timestamp_bucket(ts)
    local date = os.date("*t", ts // 1000000)
    local bucket_start_date
    if BUCKET == "years" then
        bucket_start_date = {year = date.year, month = 1, day = 1}
    elseif BUCKET == "months" then
        bucket_start_date = {year = date.year, month = date.month, day = 1}
    elseif BUCKET == "weeks" then
        -- Snap to the first day of the 7-day chunk: 1..7 -> 1, 8..14 -> 8, ...
        -- (the week *index* must not be used as the day of the month).
        local week_start_day = date.day - (date.day - 1) % 7
        bucket_start_date = {year = date.year, month = date.month, day = week_start_day}
    elseif BUCKET == "days" then
        bucket_start_date = {year = date.year, month = date.month, day = date.day}
    elseif BUCKET == "hours" then
        bucket_start_date = {year = date.year, month = date.month, day = date.day, hour = date.hour}
    else
        error("Invalid BUCKET value: " .. BUCKET)
    end

    return os.time(bucket_start_date) * 1000000
end
|
|
|
|
-- Account for a single timestamp: bump the global `timestamps` counter and
-- the count of the histogram bucket that ts falls into.
function collect_timestamp(ts)
    local bucket = timestamp_bucket(ts)

    timestamps = timestamps + 1

    -- A bucket seen for the first time starts from an implicit zero.
    histogram[bucket] = (histogram[bucket] or 0) + 1
end
|
|
|
|
-- Collect the timestamps carried by one column value.
--
-- An atomic cell contributes its own timestamp. A collection contributes the
-- timestamp of its tombstone (if it has one) plus the timestamp of every
-- element it holds. Each atomic cell or collection element also bumps the
-- global `cells` counter.
function collect_column(cell)
    if cell.type ~= "collection" then
        cells = cells + 1
        collect_timestamp(cell.timestamp)
        return
    end

    if cell.tombstone then
        collect_timestamp(cell.tombstone.timestamp)
    end

    -- Each element has its own write time; the collection itself does not
    -- carry one, so the per-element timestamp is what must be recorded.
    -- NOTE(review): assumes each entry of `values` is a {key, value} pair
    -- whose `value` is an atomic cell, as described by the scylla-sstable
    -- script API docs -- confirm.
    for _, element in ipairs(cell.values) do
        cells = cells + 1
        collect_timestamp(element.value.timestamp)
    end
end
|
|
|
|
-- Collect the timestamps of every cell in a row.
--
-- row is the `cells` table of a static or clustering row; each of its values
-- is handed to collect_column(). The global `rows` counter is bumped once per
-- call.
function collect_row(row)
    -- Count the row once, not once per column it contains.
    rows = rows + 1

    for _, cell in pairs(row) do
        collect_column(cell)
    end
end
|
|
|
|
-- Consume API hooks
|
|
|
|
-- Consume API hook, invoked with the partition start of each partition.
-- Counts the partition and records the partition tombstone's timestamp when
-- the partition has a tombstone.
function consume_partition_start(ps)
    partitions = partitions + 1

    if not ps.tombstone then
        return
    end
    collect_timestamp(ps.tombstone.timestamp)
end
|
|
|
|
-- Consume API hook for a static row: collects the timestamps of its cells.
function consume_static_row(sr)
    local static_cells = sr.cells
    collect_row(static_cells)
end
|
|
|
|
-- Consume API hook for a clustering row.
--
-- Records the timestamp of the row marker, the row tombstone and the
-- shadowable tombstone (each only when present), then collects the
-- timestamps of all cells of the row.
function consume_clustering_row(cr)
    if cr.marker then
        collect_timestamp(cr.marker.timestamp)
    end

    if cr.tombstone then
        collect_timestamp(cr.tombstone.timestamp)
    end

    -- The file header lists shadowable tombstones among the collected
    -- objects, so they are recorded here as well.
    -- NOTE(review): `shadowable_tombstone` is the attribute name given by the
    -- scylla-sstable script API docs for clustering rows -- confirm.
    if cr.shadowable_tombstone then
        collect_timestamp(cr.shadowable_tombstone.timestamp)
    end

    collect_row(cr.cells)
end
|
|
|
|
-- Consume API hook for a range tombstone change.
-- Records the tombstone's timestamp when the change carries a tombstone.
function consume_range_tombstone_change(crt)
    if not crt.tombstone then
        return
    end
    collect_timestamp(crt.tombstone.timestamp)
end
|
|
|
|
-- Consume API hook, invoked once the whole stream has been consumed.
--
-- Prints a one-line summary of the collected statistics, then writes the
-- histogram as a JSON object with two parallel arrays: "buckets" (bucket
-- keys in ascending order) and "counts" (the number of timestamps in the
-- bucket at the same index).
function consume_stream_end()
    -- The length operator only measures the sequence part of a table, and
    -- histogram is keyed by bucket timestamps, so `#histogram` cannot be used
    -- to count its entries. Gather the keys explicitly instead.
    local buckets = {}
    for bucket in pairs(histogram) do
        buckets[#buckets + 1] = bucket
    end
    -- pairs() traverses in an unspecified order; sort so that the output is
    -- chronological and both arrays are emitted in exactly the same order.
    table.sort(buckets)

    print(string.format("Histogram has %d entries, collected from %d partitions, %d rows, %d cells: %d timestamps total", #buckets, partitions, rows, cells, timestamps))

    local writer = Scylla.new_json_writer()

    writer:start_object()

    writer:key("buckets")
    writer:start_array()
    for _, bucket in ipairs(buckets) do
        writer:int(bucket)
    end
    writer:end_array()

    writer:key("counts")
    writer:start_array()
    for _, bucket in ipairs(buckets) do
        writer:int(histogram[bucket])
    end
    writer:end_array()

    writer:end_object()
end
|