From 2b8ce83eeaaa52b88e50f4734993d014ef407cda Mon Sep 17 00:00:00 2001 From: Konstantin Osipov Date: Fri, 4 Dec 2020 23:57:28 +0300 Subject: [PATCH] lists: use query timestamp for list cell values during append Scylla list cells are represented internally as a map of timeuuid => value. To append a new value to a list the coordinator generates a timeuuid reflecting the current time as key and adds a value to the map using this key. Before this patch, Scylla always generated a timeuuid for a new value, even if the query had a user-supplied or LWT timestamp. This could break LWT linearizability. User-supplied timestamps were ignored. This is reported as https://github.com/scylladb/scylla/issues/7611 A statement which appended multiple values to a list or a BATCH generated its own microsecond-resolution timeuuid for each value: BEGIN BATCH UPDATE ... SET a = a + [3] UPDATE ... SET a = a + [4] APPLY BATCH UPDATE ... SET a = a + [3, 4] To fix the bug, it's necessary to preserve monotonicity of timeuuids within a batch or multi-value append, but make sure they all use the microsecond time, as is set by LWT or user. To explain the fix, it's first necessary to recall the structure of time-based UUIDs: 60 bits: time since start of GMT epoch, year 1582, represented in 100-nanosecond units 4 bits: version 14 bits: clock sequence, a random number to avoid duplicates in case system clock is adjusted 2 bits: type 48 bits: MAC address (or other hardware address) The purpose of clockseq bits, as defined in https://tools.ietf.org/html/rfc4122#section-4.1.5, is to reduce the probability of UUID collision in case clock goes back in time or node id changes. The implementation should reset it whenever one of these events may occur. Since LWT microsecond time is guaranteed to be unique by Paxos, the RFC provisioning for clockseq and MAC slots becomes excessive.
The fix thus changes timeuuid slot content in the following way: - time component now contains the same microsecond time for all values of a statement or a batch. The time is unique and monotonic in case of LWT. Otherwise it's almost always monotonic, but may not be unique if two timestamps are created on different coordinators. - clockseq component is used to store a sequence number which is unique and monotonic for all values within the statement/batch. - to protect against time back-adjustments and duplicates if time is auto-generated, MAC component contains a random (spoof) MAC address, re-created on each restart. The address is different at each shard. The change is made for all sources of time: user, generated, LWT. Conditioning the list key generation algorithm on the source of time would unnecessarily complicate the code without increasing the quality (uniqueness) of created list keys. Since 14 bits of clockseq provide us with only 16384 distinct slots per statement or batch, 3 extra bits in nanosecond part of the time are used to extend the range to 131072 values per statement/batch. If the range is exceeded, an exception is produced. A twist on the use of clockseq to extend timeuuid uniqueness is that Scylla, like Cassandra, uses int8 compare to compare lower bits of timeuuid for ordering. The patch takes this into account and sign-complements the clockseq value to make it monotonic according to the legacy compare function.
Fixes #7611 test: unit (dev) --- cql3/lists.cc | 16 ++++++-- cql3/query_options.hh | 16 ++++++++ utils/UUID_gen.cc | 1 + utils/UUID_gen.hh | 85 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 114 insertions(+), 4 deletions(-) diff --git a/cql3/lists.cc b/cql3/lists.cc index b9d0132ea2..3bb455cc22 100644 --- a/cql3/lists.cc +++ b/cql3/lists.cc @@ -387,10 +387,18 @@ lists::do_append(shared_ptr value, collection_mutation_description appended; appended.cells.reserve(to_add.size()); for (auto&& e : to_add) { - auto uuid1 = utils::UUID_gen::get_time_UUID_bytes(); - auto uuid = bytes(reinterpret_cast(uuid1.data()), uuid1.size()); - // FIXME: can e be empty? - appended.cells.emplace_back(std::move(uuid), params.make_cell(*ltype->value_comparator(), *e, atomic_cell::collection_member::yes)); + try { + auto uuid1 = utils::UUID_gen::get_time_UUID_bytes_from_micros_and_submicros( + params.timestamp(), + params._options.next_list_append_seq()); + auto uuid = bytes(reinterpret_cast(uuid1.data()), uuid1.size()); + // FIXME: can e be empty? + appended.cells.emplace_back( + std::move(uuid), + params.make_cell(*ltype->value_comparator(), *e, atomic_cell::collection_member::yes)); + } catch (utils::timeuuid_submicro_out_of_range) { + throw exceptions::invalid_request_exception("Too many list values per single CQL statement or batch"); + } } m.set_cell(prefix, column, appended.serialize(*ltype)); } else { diff --git a/cql3/query_options.hh b/cql3/query_options.hh index 936c9fc825..43c8d1b61f 100644 --- a/cql3/query_options.hh +++ b/cql3/query_options.hh @@ -83,6 +83,16 @@ private: const specific_options _options; cql_serialization_format _cql_serialization_format; std::optional> _batch_options; + // We must use the same microsecond-precision timestamp for + // all cells created by an LWT statement or when a statement + // has a user-provided timestamp. 
In case the statement or + // a BATCH appends many values to a list, each value should + // get a unique and monotonic timeuuid. This sequence is + // used to make all time-based UUIDs: + // 1) share the same microsecond, + // 2) monotonic + // 3) unique. + mutable int _list_append_seq = 0; private: /** @@ -241,6 +251,12 @@ public: return _cql_config; } + // Generate a next unique list sequence for list append, e.g. + // a = a + [val1, val2, ...] + int next_list_append_seq() const { + return _list_append_seq++; + } + void prepare(const std::vector>& specs); private: void fill_value_views(); diff --git a/utils/UUID_gen.cc b/utils/UUID_gen.cc index cbd84a23db..4fc69e7e5c 100644 --- a/utils/UUID_gen.cc +++ b/utils/UUID_gen.cc @@ -168,6 +168,7 @@ UUID UUID_gen::get_name_UUID(const unsigned char *s, size_t len) { return get_UUID(digest); } +const thread_local int64_t UUID_gen::spoof_node = make_thread_local_node(make_random_node()); const thread_local int64_t UUID_gen::clock_seq_and_node = make_clock_seq_and_node(); thread_local const std::unique_ptr UUID_gen::instance (new UUID_gen()); diff --git a/utils/UUID_gen.hh b/utils/UUID_gen.hh index 0877637d7a..a4171747f1 100644 --- a/utils/UUID_gen.hh +++ b/utils/UUID_gen.hh @@ -50,6 +50,15 @@ namespace utils { +// Scylla uses specialized timeuuids for list keys. They use +// limited space of timeuuid clockseq component to store +// sub-microsecond time. This exception is thrown when an attempt +// is made to construct such a UUID with a sub-microsecond argument +// which is outside the available bit range. +struct timeuuid_submicro_out_of_range: public std::out_of_range { + using out_of_range::out_of_range; +}; + /** * The goods are here: www.ietf.org/rfc/rfc4122.txt. */ @@ -58,6 +67,14 @@ class UUID_gen private: // A grand day! millis at 00:00:00.000 15 Oct 1582. 
static constexpr int64_t START_EPOCH = -12219292800000L; + // A random mac address for use in timeuuids + // where we can not use clockseq to randomize the physical + // node, and prefer using a random address to a physical one + // to avoid duplicate timeuuids when system time goes back + // while scylla is restarting. Using a spoof node also helps + // avoid timeuuid duplicates when multiple nodes run on the + // same host and share the physical MAC address. + static thread_local const int64_t spoof_node; static thread_local const int64_t clock_seq_and_node; /* @@ -86,6 +103,12 @@ private: } public: + // We have only 17 timeuuid bits available to store this + // value. + static constexpr int SUBMICRO_LIMIT = (1<<17); + // UUID timestamp time component is represented in intervals + // of 1/10 of a microsecond since the beginning of GMT epoch. + using decimicroseconds = std::chrono::duration>; /** * Creates a type 1 UUID (time-based UUID). * @@ -162,6 +185,56 @@ public: assert(uuid.is_timestamp()); return uuid; } + // Generate a time-based (Version 1) UUID using + // a microsecond-precision Unix time and a unique number in + // range [0, 131072). + // Used to generate many unique, monotonic UUIDs + // sharing the same microsecond part. In lightweight + // transactions we must ensure monotonicity between all UUIDs + // which belong to one lightweight transaction and UUIDs of + // another transaction, but still need multiple distinct and + // monotonic UUIDs within the same transaction. 
+ // \throws timeuuid_submicro_out_of_range + // + static std::array + get_time_UUID_bytes_from_micros_and_submicros(int64_t when_in_micros, int submicros) { + std::array uuid_bytes; + + if (submicros < 0 || submicros >= SUBMICRO_LIMIT) { + throw timeuuid_submicro_out_of_range("timeuuid submicro component does not fit into available bits"); + } + + auto dmc = from_unix_timestamp(std::chrono::microseconds(when_in_micros)); + // We have roughly 3 extra bits we will use to increase + // sub-microsecond component range from clockseq's 2^14 to 2^17. + int64_t msb = create_time(dmc + decimicroseconds((submicros >> 14) & 0b111)); + // See RFC 4122 for details. + msb = net::hton(msb); + + std::copy_n(reinterpret_cast(&msb), sizeof(msb), uuid_bytes.data()); + + // Use 14-bit clockseq to store the rest of sub-microsecond component. + int64_t clockseq = submicros & 0b11'1111'1111'1111; + // Scylla, like Cassandra, uses signed int8 compare to + // compare lower bits of timeuuid. It means 0xA0 > 0xFF. + // Bit-xor the sign bit to "fix" the order. See also + // https://issues.apache.org/jira/browse/CASSANDRA-8730 + // and Cassandra commit 6d266253a5bdaf3a25eef14e54deb56aba9b2944 + // + // Turn 0 into -127, 1 into -126, ... and 128 into 0, ... + clockseq ^= 0b0000'0000'1000'0000; + // Least significant bits: UUID variant (1), clockseq and node. + // To protect against the system clock back-adjustment, + // use a random (spoof) node identifier. Normally this + // protection is provided by clockseq component, but we've + // just stored sub-microsecond time in it. + int64_t lsb = ((clockseq | 0b1000'0000'0000'0000) << 48) | UUID_gen::spoof_node; + lsb = net::hton(lsb); + + std::copy_n(reinterpret_cast(&lsb), sizeof(lsb), uuid_bytes.data() + sizeof(msb)); + + return uuid_bytes; + } /** validates uuid from raw bytes. 
*/ static bool is_valid_UUID(bytes raw) { @@ -273,6 +346,10 @@ public: } private: + template + static decimicroseconds from_unix_timestamp(std::chrono::duration> d) { + return d - std::chrono::milliseconds(START_EPOCH); + } /** * @param timestamp milliseconds since Unix epoch * @return @@ -370,6 +447,14 @@ private: return create_time(nanos_since); } + // std::chrono typeaware wrapper around create_time(). + // Creates a timeuuid compatible time (decimicroseconds since + // the start of GMT epoch). + template + static int64_t create_time(std::chrono::duration> d) { + return create_time(duration_cast(d).count()); + } + static int64_t create_time(uint64_t nanos_since) { uint64_t msb = 0L;