Files
scylladb/scripts/base36-uuid.py
Kefu Chai 50332f796e script/base36-uuid.py: interpret timestamp with Gregorian calendar
UUID v1 uses an epoch derived from the Gregorian calendar, but
base36-uuid.py interprets the timestamp as if it were based on the UNIX epoch.
That is why it decodes a UUID like this:

```console
$ ./scripts/base36-uuid.py -d 3gbi_0mhs_4sjf42oac6rxqdsnyx
date = 2411-02-16 16:05:52
decimicro_seconds = 0x7ad550
lsb = 0xafe141a195fe0d59
```

even though this UUID was generated on Nov 30, 2023. So in this change,
we shift the time by the offset of the UNIX epoch measured from
the Gregorian calendar's day 0. After this change, we have:

```console
$ ./scripts/base36-uuid.py -d 3gbi_0mhs_4sjf42oac6rxqdsnyx
date = 2023-11-30 16:05:52
decimicro_seconds = 0x7ad550
lsb = 0xafe141a195fe0d59
```

see https://datatracker.ietf.org/doc/html/rfc4122#section-4.1.4

Signed-off-by: Kefu Chai <kefu.chai@scylladb.com>

Closes scylladb/scylladb#16235
2023-12-05 07:39:34 +02:00

233 lines
8.2 KiB
Python
Executable File

#!/usr/bin/python3
# -*- coding: utf-8 -*-
#
# Copyright (C) 2023-present ScyllaDB
#
#
# SPDX-License-Identifier: AGPL-3.0-or-later
#
import argparse
import datetime
import re
import string
import uuid
# If the option of "uuid_sstable_identifiers_enabled" is set, scylla and
# Cassandra 4.x generate sstables using timeuuid as their identifiers. But the
# string representation does not follow the formal definition defined by
# rfc4122. Instead, it uses the base36-based representation that is used by
# Cassandra. The filename of a Data component file might look like
# "nb-3fw2_0tj4_46w3k2cpidnirvjy7k-big-Data.db". where
# "3fw2_0tj4_46w3k2cpidnirvjy7k" is the identifier for the SSTable. If this
# option is disabled, a positive integer is used instead. In comparison to
# the rfc4122 representation, the base36 encoded UUID is shorter.
#
# This tool encodes the given 64-bit least significant bits and most significant
# bits to the base36-based string representation, and decodes the string. Because
# the timeuuid encodes the timestamp in it, this tool also prints the human readable
# time encoded in the timeuuid.
# the symbols of the base36 alphabet, ordered by the value they stand for:
# '0'-'9' are the digits 0-9, and 'a'-'z' are the digits 10-35
ALPHABET = ''.join((string.digits, string.ascii_lowercase))
# the number of decimicroseconds in a second, where a decimicrosecond is
# 1/10 of a microsecond (100 nanoseconds)
DECIMICRO_RATIO = 10 ** 7
def decode(s: str) -> int:
    '''
    convert a base36 string into the integer it represents

    the most significant digit comes first in the input. an empty string
    decodes to 0. ValueError is raised if the input contains a character
    which is not part of ALPHABET.
    '''
    base = len(ALPHABET)
    value = 0
    for digit in s:
        try:
            digit_value = ALPHABET.index(digit)
        except ValueError:
            raise ValueError(f'"{s}" contains a character "{digit}" not in alphabet')
        # shift the digits accumulated so far, and append the new one
        value = value * base + digit_value
    return value
def encode(n: int) -> str:
    '''
    encode a non-negative integer using base36

    the output has the most significant digit first and carries no padding,
    zero is encoded as a single "0". ValueError is raised if the integer is
    negative.
    '''
    alphabet_len = len(ALPHABET)
    # with a single-symbol alphabet, the loop below would never terminate
    assert alphabet_len > 1
    if n < 0:
        raise ValueError(f'{n} is negative')
    if n == 0:
        # the loop below emits no digit at all for zero, but an empty string
        # is not a valid numeral
        return ALPHABET[0]
    digits = []
    while n:
        # peel off the least significant digit first
        n, index = divmod(n, alphabet_len)
        digits.append(ALPHABET[index])
    return ''.join(reversed(digits))
class TimeUuid:
    '''
    a UUID built from its most and least significant 64 bits, which can be
    converted from and to the base36 representation of the form
    "<days>_<seconds>_<decimicrosecs><lsb>"
    '''

    # the duration between 00:00 15 Oct 1582 and UNIX epoch, in the unit of
    # decimicroseconds (100 nanoseconds)
    # see also utils/UUID_gen.hh
    UNIX_EPOCH_SINCE_GREGORIAN_DAY0 = 122192928000000000

    _SECONDS_PER_DAY = 24 * 60 * 60

    def __init__(self, msb: int, lsb: int) -> None:
        '''
        create the UUID from its most significant 64 bits and its least
        significant 64 bits. int.to_bytes() raises OverflowError if either of
        them is negative or does not fit in 64 bits.
        '''
        raw = msb.to_bytes(8, byteorder='big') + lsb.to_bytes(8, byteorder='big')
        self.uuid = uuid.UUID(bytes=raw)

    @staticmethod
    def decode_with_base36(s: str) -> 'TimeUuid':
        '''
        decode a string represented using base36

        the string is expected to be composed of 4 digits of days, 4 digits
        of seconds, 5 digits of decimicroseconds and 13 digits of the least
        significant bits. ValueError is raised if the string does not follow
        this layout, or if it contains characters which cannot be decoded.
        '''
        # use fullmatch() instead of match(), so that a string with trailing
        # garbage after the identifier is rejected instead of being truncated
        # silently
        matched = re.fullmatch(r'''(?P<days>\w{4})_
                                   (?P<seconds>\w{4})_
                                   (?P<decimicrosecs>\w{5})
                                   (?P<lsb>\w{13})''', s, re.ASCII | re.VERBOSE)
        if matched is None:
            raise ValueError(f'malformatted uuid: "{s}"')
        days = decode(matched.group('days'))
        seconds = decode(matched.group('seconds'))
        decimicrosecs = decode(matched.group('decimicrosecs'))
        lsb = decode(matched.group('lsb'))
        # stick to integers, there is no need to go through a float
        total_seconds = days * TimeUuid._SECONDS_PER_DAY + seconds
        timestamp = decimicrosecs + total_seconds * DECIMICRO_RATIO
        msb = TimeUuid._time_to_msb(timestamp)
        return TimeUuid(msb, lsb)

    def encode_with_base36(self) -> str:
        '''
        encode the UUID using the base36 representation

        the timestamp of the UUID is split into days, seconds of the day and
        decimicroseconds. each part is zero-padded to its fixed width.
        '''
        seconds, decimicro = divmod(self.uuid.time, DECIMICRO_RATIO)
        days, seconds_of_day = divmod(seconds, self._SECONDS_PER_DAY)
        encoded_days = encode(days)
        encoded_seconds = encode(seconds_of_day)
        encoded_decimicro = encode(decimicro)
        encoded_lsb = encode(self.lsb)
        return (f'{encoded_days:0>4}_'
                f'{encoded_seconds:0>4}_'
                f'{encoded_decimicro:0>5}'
                f'{encoded_lsb:0>13}')

    @staticmethod
    def _time_to_msb(time: int) -> int:
        '''
        pack a 60-bit timestamp into the most significant 64 bits of a
        version 1 UUID. ValueError is raised if the timestamp does not fit in
        60 bits.
        '''
        # see https://datatracker.ietf.org/doc/html/rfc4122.html
        if time >> 60 != 0:
            # check before slicing the timestamp, so that the error message
            # reports the timestamp itself instead of what is left after
            # shifting
            raise ValueError(f'time "{time:#016x}" is too large to fit in')
        time_low = time & (2 ** 32 - 1)
        time_mid = (time >> 32) & (2 ** 16 - 1)
        time_hi = time >> 48
        # sets the version to 1
        time_hi_version = 1 << 12 | time_hi
        return (time_low << 32 |
                time_mid << 16 |
                time_hi_version)

    @property
    def msb(self) -> int:
        '''the most significant 64 bits of the UUID'''
        return int.from_bytes(self.uuid.bytes[:8], byteorder='big')

    @property
    def lsb(self) -> int:
        '''the least significant 64 bits of the UUID'''
        return int.from_bytes(self.uuid.bytes[8:], byteorder='big')

    @property
    def timestamp(self) -> tuple[datetime.datetime, int]:
        '''
        the time encoded in the UUID, as a tuple of a datetime with the
        resolution of a second and the remaining decimicroseconds

        please note, the datetime is created using
        datetime.datetime.fromtimestamp() without a timezone, so it is a
        naive datetime expressed in the local timezone.
        '''
        # UUID v1 uses a timestamp epoch derived from Gregorian calendar, so we
        # need to translate the timestamp to the UNIX time
        unix_time = self.uuid.time - self.UNIX_EPOCH_SINCE_GREGORIAN_DAY0
        seconds, decimicro_seconds = divmod(unix_time, DECIMICRO_RATIO)
        return datetime.datetime.fromtimestamp(seconds), decimicro_seconds

    def print_field(self, field: str, print_in_hex: bool) -> None:
        '''
        print the specified field in the form of "<field> = <value>"

        field is one of 'lsb', 'msb', 'date', 'decimicro_seconds', 'time' and
        'node'. numbers are printed as zero-padded hex numbers if
        print_in_hex is true, and as decimal numbers otherwise. ValueError is
        raised if the field is unknown.
        '''
        def print_num(n: int, bits: int) -> None:
            if print_in_hex:
                # each hex char represents 4 bits
                width = bits // 4
                print(f'{field} = 0x{n:0{width}x}')
            else:
                print(f'{field} = {n}')

        if field == 'lsb':
            print_num(self.lsb, 64)
        elif field == 'msb':
            print_num(self.msb, 64)
        elif field == 'date':
            # do not name the local variable "datetime", otherwise it shadows
            # the module of the same name
            date, _ = self.timestamp
            print(f'date = {date}')
        elif field == 'decimicro_seconds':
            _, decimicro_seconds = self.timestamp
            print_num(decimicro_seconds, DECIMICRO_RATIO.bit_length())
        elif field == 'time':
            print_num(self.uuid.time, 60)
        elif field == 'node':
            print_num(self.uuid.node, 48)
        else:
            # an assert statement is stripped when python runs with -O, so
            # raise an exception explicitly
            raise ValueError(f'unknown field: {field}')

    def __str__(self) -> str:
        '''the base36 representation of the UUID'''
        return self.encode_with_base36()
def test_dencode_base36() -> None:
    '''
    check that a known identifier can be decoded, and encoded back to the
    same string
    '''
    # a minimal pytest, run it with 'pytest base36-uuid.py'
    # the dataset comes from test/boost/sstable_generation_test.cc
    encoded_uuid = "3fw2_0tj4_46w3k2cpidnirvjy7k"
    expected_msb = 0x6636ac00da8411ec
    expected_lsb = 0x9abaf56e1443def0
    # the time encoded in the UUID is 2022-05-23 10:37:52 UTC
    expected_unix_seconds = 1653302272
    expected_decimicro_seconds = 7040000

    timeuuid = TimeUuid.decode_with_base36(encoded_uuid)
    assert timeuuid.msb == expected_msb
    assert timeuuid.lsb == expected_lsb
    assert timeuuid.encode_with_base36() == encoded_uuid

    timestamp, decimicro_seconds = timeuuid.timestamp
    # TimeUuid.timestamp returns a naive datetime in the local timezone, so
    # derive the expected value from the UNIX time in the same way. a
    # hardwired wall-clock time would only match in a single timezone.
    assert timestamp == datetime.datetime.fromtimestamp(expected_unix_seconds)
    assert decimicro_seconds == expected_decimicro_seconds
def main(argv=None) -> None:
    '''
    the entry point of the command line tool

    with "--decode", the given base36-encoded timeuuid is decoded, and the
    selected fields are printed. with "--encode", the given MSB and LSB are
    encoded, and the base36 representation is printed.

    argv is the list of command line arguments to be parsed. if it is None,
    argparse reads the arguments from sys.argv.
    '''
    parser = argparse.ArgumentParser(
        description="Encode and decode timeuuid using base36 representation.")
    dencode_parser = parser.add_mutually_exclusive_group(required=True)
    dencode_parser.add_argument('-d', '--decode',
                                help='Decode base36-encoded timeuuid',
                                metavar='UUID')
    dencode_parser.add_argument('-e', '--encode',
                                nargs=2,
                                # base 0 accepts both "0x..." and decimal
                                type=lambda x: int(x, 0),
                                help='Encode 64-bit hex MSB and LSB using base36',
                                metavar='N')
    default_fields = ['date', 'decimicro_seconds', 'lsb']
    # every choice needs its own comma. without the comma, two adjacent
    # string literals are concatenated into a single bogus choice, and
    # neither of the two fields can be selected.
    field_choices = [
        'lsb',
        'msb',
        'date',
        'decimicro_seconds',
        'time',
        'node',
    ]
    parser.add_argument('--field',
                        action='append',
                        choices=field_choices,
                        help='Field to be printed (default: {})'.format(
                            ", ".join(default_fields)),
                        dest='fields')
    parser.add_argument('--hex',
                        action=argparse.BooleanOptionalAction,
                        help='Format numbers in hex',
                        default=True,
                        dest='print_in_hex')
    args = parser.parse_args(argv)
    # compare with None, so that an empty UUID is reported as malformatted
    # instead of being taken as a request to encode
    if args.decode is not None:
        # do not name the local variable "uuid", otherwise it shadows the
        # module of the same name
        timeuuid = TimeUuid.decode_with_base36(args.decode.lower())
        fields = args.fields or default_fields
        for field in fields:
            timeuuid.print_field(field, args.print_in_hex)
    else:
        print(TimeUuid(*args.encode))


if __name__ == '__main__':
    main()