Drop the AGPL license in favor of a source-available license. See the blog post [1] for details. [1] https://www.scylladb.com/2024/12/18/why-were-moving-to-a-source-available-license/
233 lines
8.2 KiB
Python
Executable File
233 lines
8.2 KiB
Python
Executable File
#!/usr/bin/python3
|
|
# -*- coding: utf-8 -*-
|
|
#
|
|
# Copyright (C) 2023-present ScyllaDB
|
|
#
|
|
|
|
#
|
|
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
|
#
|
|
|
|
import argparse
|
|
import datetime
|
|
import re
|
|
import string
|
|
import uuid
|
|
|
|
# If the option of "uuid_sstable_identifiers_enabled" is set, scylla and
|
|
# Cassandra 4.x generate sstables using timeuuid as their identifiers. But the
|
|
# string representation does not follow the formal definition defined by
|
|
# rfc4122. Instead, it uses a base36-based presentation which is used by
|
|
# Cassandra. The filename of a Data component file might look like
|
|
# "nb-3fw2_0tj4_46w3k2cpidnirvjy7k-big-Data.db". where
|
|
# "3fw2_0tj4_46w3k2cpidnirvjy7k" is the identifier for the SSTable. If this
|
|
# option is disabled, a positive integer is used instead. In comparison to
|
|
# the rfc4122 representation, the base36 encoded UUID is shorter.
|
|
#
|
|
# This tool encodes the given 64-bit least significant bits and most significant
|
|
# bits to the base36-based string representation, and decodes the string. Because
|
|
# the timeuuid encodes the timestamp in it, this tool also prints the human readable
|
|
# time encoded in the timeuuid.
|
|
|
|
# base36 alphabet
|
|
ALPHABET = string.digits + string.ascii_lowercase
|
|
# 1/10 of a microsecond
|
|
DECIMICRO_RATIO = 10_000_000
|
|
|
|
|
|
def decode(s: str) -> int:
|
|
'''
|
|
decode a string represented using base36
|
|
|
|
the input should have the most significant bit first
|
|
'''
|
|
|
|
alphabet_len = len(ALPHABET)
|
|
output = 0
|
|
for char in s:
|
|
output *= alphabet_len
|
|
try:
|
|
output += ALPHABET.index(char)
|
|
except ValueError:
|
|
raise ValueError(f'"{s}" contains a character "{char}" not in alphabet')
|
|
return output
|
|
|
|
|
|
def encode(n: int) -> str:
|
|
'''
|
|
encode an integer represented using base36
|
|
|
|
the output has the most significant bit first
|
|
'''
|
|
assert len(ALPHABET) > 1
|
|
|
|
if n < 0:
|
|
raise ValueError(f'{n} is negative')
|
|
|
|
output = ''
|
|
alphabet_len = len(ALPHABET)
|
|
while n:
|
|
n, index = divmod(n, alphabet_len)
|
|
output += ALPHABET[index]
|
|
return output[::-1]
|
|
|
|
|
|
class TimeUuid:
|
|
def __init__(self, msb: int, lsb: int) -> None:
|
|
bytes = msb.to_bytes(8, byteorder='big') + lsb.to_bytes(8, byteorder='big')
|
|
self.uuid = uuid.UUID(bytes=bytes)
|
|
|
|
@staticmethod
|
|
def decode_with_base36(s: str) -> 'TimeUuid':
|
|
'''decode an string represented using base36'''
|
|
matched = re.match(r'''(?P<days>\w{4})_
|
|
(?P<seconds>\w{4})_
|
|
(?P<decimicrosecs>\w{5})
|
|
(?P<lsb>\w{13})''', s, re.ASCII | re.VERBOSE)
|
|
if matched is None:
|
|
raise ValueError(f'malformatted uuid: "{s}"')
|
|
days = decode(matched.group('days'))
|
|
seconds = decode(matched.group('seconds'))
|
|
decimicrosecs = decode(matched.group('decimicrosecs'))
|
|
lsb = decode(matched.group('lsb'))
|
|
delta = datetime.timedelta(days=days, seconds=seconds)
|
|
timestamp = decimicrosecs + int(delta.total_seconds()) * DECIMICRO_RATIO
|
|
msb = TimeUuid._time_to_msb(timestamp)
|
|
return TimeUuid(msb, lsb)
|
|
|
|
def encode_with_base36(self) -> str:
|
|
'''encode an integer using base36 representation'''
|
|
seconds, decimicro = divmod(self.uuid.time, DECIMICRO_RATIO)
|
|
delta = datetime.timedelta(seconds=seconds)
|
|
encoded_days = encode(delta.days)
|
|
encoded_seconds = encode(delta.seconds)
|
|
encoded_decimicro = encode(decimicro)
|
|
encoded_lsb = encode(self.lsb)
|
|
return (f'{encoded_days:0>4}_'
|
|
f'{encoded_seconds:0>4}_'
|
|
f'{encoded_decimicro:0>5}'
|
|
f'{encoded_lsb:0>13}')
|
|
|
|
@staticmethod
|
|
def _time_to_msb(time: int) -> int:
|
|
# see https://datatracker.ietf.org/doc/html/rfc4122.html
|
|
time_low = (2 ** 32 - 1) & time
|
|
time >>= 32
|
|
time_mid = (2 ** 16 - 1) & time
|
|
time >>= 16
|
|
if time >> 12 != 0:
|
|
raise ValueError(f'time "{time:#016x}" is too large to fit in')
|
|
time_hi = (2 ** 12 - 1) & time
|
|
# sets the version to 1
|
|
time_hi_version = 1 << 12 | time_hi
|
|
return (time_low << 32 |
|
|
time_mid << 16 |
|
|
time_hi_version)
|
|
|
|
@property
|
|
def msb(self) -> int:
|
|
return int.from_bytes(self.uuid.bytes[:8], byteorder='big')
|
|
|
|
@property
|
|
def lsb(self) -> int:
|
|
return int.from_bytes(self.uuid.bytes[8:], byteorder='big')
|
|
|
|
# the duration between 00:00 15 Oct 1582 and UNIX epoch
|
|
# see also utils/UUID_gen.hh
|
|
UNIX_EPOCH_SINCE_GREGORIAN_DAY0 = 122192928000000000
|
|
|
|
@property
|
|
def timestamp(self) -> (datetime.datetime, int):
|
|
# UUID v1 uses a timestamp epoch derived from Gregorian calendar, so we
|
|
# need to translate the timestamp to the UNIX time
|
|
unix_time = self.uuid.time - self.UNIX_EPOCH_SINCE_GREGORIAN_DAY0
|
|
seconds, decimicro_seconds = divmod(unix_time, DECIMICRO_RATIO)
|
|
return datetime.datetime.fromtimestamp(seconds), decimicro_seconds
|
|
|
|
def print_field(self, field: str, print_in_hex: bool) -> None:
|
|
def print_num(n: int, bits: int) -> str:
|
|
if print_in_hex:
|
|
# each hex char represents 4 bits
|
|
width = bits // 4
|
|
print(f'{field} = 0x{n:0{width}x}')
|
|
else:
|
|
print(f'{field} = {n}')
|
|
|
|
if field == 'lsb':
|
|
print_num(self.lsb, 64)
|
|
elif field == 'msb':
|
|
print_num(self.msb, 64)
|
|
elif field == 'date':
|
|
datetime, _ = self.timestamp
|
|
print(f'date = {datetime}')
|
|
elif field == 'decimicro_seconds':
|
|
_, decimicro_seconds = self.timestamp
|
|
print_num(decimicro_seconds, DECIMICRO_RATIO.bit_length())
|
|
elif field == 'time':
|
|
print_num(self.uuid.time, 60)
|
|
elif field == 'node':
|
|
print_num(self.uuid.node, 48)
|
|
else:
|
|
assert False, f'unknown field: {field}'
|
|
|
|
def __str__(self) -> str:
|
|
return self.encode_with_base36()
|
|
|
|
|
|
def test_dencode_base36() -> None:
|
|
# a minimal pytest, run it with 'pytest base36-uuid.py'
|
|
# the dataset comes from test/boost/sstable_generation_test.cc
|
|
encoded_uuid = "3fw2_0tj4_46w3k2cpidnirvjy7k"
|
|
expected_msb = 0x6636ac00da8411ec
|
|
expected_lsb = 0x9abaf56e1443def0
|
|
timeuuid = TimeUuid.decode_with_base36(encoded_uuid)
|
|
assert timeuuid.msb == expected_msb
|
|
assert timeuuid.lsb == expected_lsb
|
|
assert timeuuid.encode_with_base36() == encoded_uuid
|
|
|
|
timestamp, decimicro_seconds = timeuuid.timestamp
|
|
assert timestamp == datetime.datetime(2022, 5, 23, 18, 37, 52)
|
|
assert decimicro_seconds == 7040000
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(
|
|
description="Encode and decode timeuuid using base36 representation.")
|
|
dencode_parser = parser.add_mutually_exclusive_group(required=True)
|
|
dencode_parser.add_argument('-d', '--decode',
|
|
help='Decode base36-encoded timeuuid',
|
|
metavar='UUID')
|
|
dencode_parser.add_argument('-e', '--encode',
|
|
nargs=2,
|
|
type=lambda x: int(x, 0),
|
|
help='Encode 64-bit hex MSB and LSB using base36',
|
|
metavar='N')
|
|
default_fields = ['date', 'decimicro_seconds', 'lsb']
|
|
parser.add_argument('--field',
|
|
action='append',
|
|
choices=['lsb', 'msb', 'date', 'decimicro_seconds' 'time', 'node'],
|
|
help='Field to be printed (default: {})'.format(
|
|
", ".join(default_fields)),
|
|
dest='fields')
|
|
parser.add_argument('--hex',
|
|
action=argparse.BooleanOptionalAction,
|
|
help='Format numbers in hex',
|
|
default=True,
|
|
dest='print_in_hex')
|
|
|
|
args = parser.parse_args()
|
|
if args.decode:
|
|
uuid = TimeUuid.decode_with_base36(args.decode.lower())
|
|
if args.fields:
|
|
fields = args.fields
|
|
else:
|
|
fields = default_fields
|
|
for field in fields:
|
|
uuid.print_field(field, args.print_in_hex)
|
|
else:
|
|
print(TimeUuid(*args.encode))
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|