scylladb/tools/scylla-sstable-scripts/writetime-histogram.lua

--
-- Copyright (C) 2025-present ScyllaDB
--
-- SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
--

-- Produce a histogram of write times (timestamps) in the sstable(s).
--
-- Crawl over all timestamps in the data component and add them to a histogram.
-- The bucket size by default is a month, tunable with the --bucket option.
-- The timestamp of all objects that have one are added to the histogram:
-- * cells (atomic and collection cells)
-- * tombstones (partition-tombstone, range-tombstone, row-tombstone,
--   shadowable-tombstone, cell-tombstone, collection-tombstone, cell-tombstone)
-- * row-marker
--
-- This allows determining when the data was written, provided the writer of the
-- data didn't mangle with the timestamps.
-- This produces two lines of output:
-- 1) Stats about the histogram and the collected data.
-- 2) A JSON object containing the histogram, with two arrays: "buckets" and "counts".
--
-- following example python script:
--
--      import datetime
--      import json
--      import matplotlib.pyplot as plt # requires the matplotlib python package
--
--      with open('histogram.json', 'r') as f:
--          data = json.load(f)
--
--      x = data['buckets']
--      y = data['counts']
--
--      max_y = max(y)
--
--      x = [datetime.date.fromtimestamp(i / 1000000).strftime('%Y.%m') for i in x]
--      y = [i / max_y for i in y]
--
--      fig, ax = plt.subplots()
--
--      ax.set_xlabel('Timestamp')
--      ax.set_ylabel('Normalized cell count')
--      ax.set_title('Histogram of data write-time')
--      ax.bar(x, y)
--
--      plt.show()

-- the unit of time to use as bucket, one of (years, months, weeks, days, hours)
BUCKET = "months"

partitions = 0
rows = 0
cells = 0
timestamps = 0
histogram = {}

function timestamp_bucket(ts)
    date = os.date("*t", ts // 1000000)
    bucket_start_date = {}
    if BUCKET == "years" then
        bucket_start_date = {year = date.year, month = 1, day = 1}
    elseif BUCKET == "months" then
        bucket_start_date = {year = date.year, month = date.month, day = 1}
    elseif BUCKET == "weeks" then
        bucket_start_date = {year = date.year, month = date.month, day = 1 + date.day // 7}
    elseif BUCKET == "days" then
        bucket_start_date = {year = date.year, month = date.month, day = date.day}
    elseif BUCKET == "hours" then
        bucket_start_date = {year = date.year, month = date.month, day = date.day, hour = date.hour}
    else
        error("Invalid BUCKET value: " .. BUCKET)
    end

    return os.time(bucket_start_date) * 1000000
end

function collect_timestamp(ts)
    ts = timestamp_bucket(ts)

    timestamps = timestamps + 1

    if histogram[ts] == nil then
        histogram[ts] = 1
    else
        histogram[ts] = histogram[ts] + 1
    end
end

function collect_column(cell)
    if cell.type == "collection" then
        if cell.tombstone then
            collect_timestamp(cell.tombstone.timestamp)
        end
        for _, v in ipairs(cell.values) do
            cells = cells + 1
            collect_timestamp(cell.timestamp)
        end
    else
        cells = cells + 1
        collect_timestamp(cell.timestamp)
    end
end

function collect_row(row)
    for name, cell in pairs(row) do
        rows = rows + 1
        collect_column(cell)
    end
end

-- Consume API hooks

function consume_partition_start(ps)
    partitions = partitions + 1
    if ps.tombstone then
        collect_timestamp(ps.tombstone.timestamp)
    end
end

function consume_static_row(sr)
    collect_row(sr.cells)
end

function consume_clustering_row(cr)
    if cr.marker then
        collect_timestamp(cr.marker.timestamp)
    end

    if cr.tombstone then
        collect_timestamp(cr.tombstone.timestamp)
    end

    collect_row(cr.cells)
end

function consume_range_tombstone_change(crt)
    if crt.tombstone then
        collect_timestamp(crt.tombstone.timestamp)
    end
end

function consume_stream_end()
    print(string.format("Histogram has %d entries, collected from %d partitions, %d rows, %d cells: %d timestamps total", #histogram, partitions, rows, cells, timestamps))

    writer = Scylla.new_json_writer()

    writer:start_object()

    writer:key("buckets")
    writer:start_array()
    for bucket, _ in pairs(histogram) do
        writer:int(bucket)
    end
    writer:end_array()

    writer:key("counts")
    writer:start_array()
    for _, count in pairs(histogram) do
        writer:int(count)
    end
    writer:end_array()

    writer:end_object()
end