From c3da9f2bd4a767b53c58e09dbfc2642fa82a581e Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Sun, 24 May 2020 19:03:38 +0300 Subject: [PATCH] alternator: add mandatory configurable write isolation mode Alternator supports four ways in which write operations can use quorum writes or LWT or both, which we called "write isolation policies". Until this patch, Alternator defaulted to the most generally safe policy, "always_use_lwt". This default could have been overriden for each table separately, but there was no way to change this default for all tables. This patch adds a "--alternator-write-isolation" configuration option which allows changing the default. Moreover, @dorlaor asked that users must *explicitly* choose this default mode, and not get "always_use_lwt" without noticing. The previous default, "always_use_lwt" supports any workload correctly but because it uses LWT for all writes it may be disappointingly slow for users who run write-only workloads (including most benchmarks) - such users might find the slow writes so disappointing that they will drop Scylla. Conversely, a default of "forbid_rmw" will be faster and still correct, but will fail on workloads which need read-modify-write operations - and suprise users that need these operations. So Dor asked that that *none* of the write modes be made the default, and users must make an informed choice between the different write modes, rather than being disappointed by a default choice they weren't aware of. So after this patch, Scylla refuses to boot if Alternator is enabled but a "--alternator-write-isolation" option is missing. The patch also modifies the relevant documentation, adds the same option to our docker image, and the modifies the test-running script test/alternator/run to run Scylla with the old default mode (always_use_lwt), which we need because we want to test RMW operations as well. Fixes #6452 Signed-off-by: Nadav Har'El Message-Id: <20200524160338.108417-1-nyh@scylladb.com> --- alternator/executor.cc | 72 ++++++++++++++++--------- alternator/rmw_operation.hh | 4 ++ db/config.cc | 1 + db/config.hh | 2 + dist/docker/redhat/commandlineparser.py | 1 + dist/docker/redhat/scyllasetup.py | 4 ++ docs/alternator/alternator.md | 35 ++++++++---- docs/alternator/getting-started.md | 8 +-- docs/docker-hub.md | 6 +++ main.cc | 2 + test/alternator/run | 1 + 11 files changed, 98 insertions(+), 38 deletions(-) diff --git a/alternator/executor.cc b/alternator/executor.cc index 6f5f01e51a..132e219634 100644 --- a/alternator/executor.cc +++ b/alternator/executor.cc @@ -573,24 +573,61 @@ static bool validate_legal_tag_chars(std::string_view tag) { return std::all_of(tag.begin(), tag.end(), &is_legal_tag_char); } +static const std::unordered_set allowed_write_isolation_values = { + "f", "forbid", "forbid_rmw", + "a", "always", "always_use_lwt", + "o", "only_rmw_uses_lwt", + "u", "unsafe", "unsafe_rmw", +}; + static void validate_tags(const std::map& tags) { - static const std::unordered_set allowed_values = { - "f", "forbid", "forbid_rmw", - "a", "always", "always_use_lwt", - "o", "only_rmw_uses_lwt", - "u", "unsafe", "unsafe_rmw", - }; auto it = tags.find(rmw_operation::WRITE_ISOLATION_TAG_KEY); if (it != tags.end()) { std::string_view value = it->second; - elogger.warn("Allowed values count {} {}", value, allowed_values.count(value)); - if (allowed_values.count(value) == 0) { + if (allowed_write_isolation_values.count(value) == 0) { throw api_error("ValidationException", - format("Incorrect write isolation tag {}. Allowed values: {}", value, allowed_values)); + format("Incorrect write isolation tag {}. Allowed values: {}", value, allowed_write_isolation_values)); } } } +static rmw_operation::write_isolation parse_write_isolation(std::string_view value) { + if (!value.empty()) { + switch (value[0]) { + case 'f': + return rmw_operation::write_isolation::FORBID_RMW; + case 'a': + return rmw_operation::write_isolation::LWT_ALWAYS; + case 'o': + return rmw_operation::write_isolation::LWT_RMW_ONLY; + case 'u': + return rmw_operation::write_isolation::UNSAFE_RMW; + } + } + // Shouldn't happen as validate_tags() / set_default_write_isolation() + // verify allow only a closed set of values. + return rmw_operation::default_write_isolation; + +} +// This default_write_isolation is always overwritten in main.cc, which calls +// set_default_write_isolation(). +rmw_operation::write_isolation rmw_operation::default_write_isolation = + rmw_operation::write_isolation::LWT_ALWAYS; +void rmw_operation::set_default_write_isolation(std::string_view value) { + if (value.empty()) { + throw std::runtime_error("When Alternator is enabled, write " + "isolation policy must be selected, using the " + "'--alternator-write-isolation' option. " + "See docs/alternator/alternator.md for instructions."); + } + if (allowed_write_isolation_values.count(value) == 0) { + throw std::runtime_error(format("Invalid --alternator-write-isolation " + "setting '{}'. Allowed values: {}.", + value, allowed_write_isolation_values)); + } + default_write_isolation = parse_write_isolation(value); +} + // FIXME: Updating tags currently relies on updating schema, which may be subject // to races during concurrent updates of the same table. Once Scylla schema updates // are fixed, this issue will automatically get fixed as well. @@ -1203,22 +1240,9 @@ rmw_operation::write_isolation rmw_operation::get_write_isolation_for_schema(sch const auto& tags = get_tags_of_table(schema); auto it = tags.find(WRITE_ISOLATION_TAG_KEY); if (it == tags.end() || it->second.empty()) { - // By default, fall back to always enforcing LWT - return write_isolation::LWT_ALWAYS; - } - switch (it->second[0]) { - case 'f': - return write_isolation::FORBID_RMW; - case 'a': - return write_isolation::LWT_ALWAYS; - case 'o': - return write_isolation::LWT_RMW_ONLY; - case 'u': - return write_isolation::UNSAFE_RMW; - default: - // In case of an incorrect tag, fall back to the safest option: LWT_ALWAYS - return write_isolation::LWT_ALWAYS; + return default_write_isolation; } + return parse_write_isolation(it->second); } // shard_for_execute() checks whether execute() must be called on a specific diff --git a/alternator/rmw_operation.hh b/alternator/rmw_operation.hh index 948668d57e..6f4b015eae 100644 --- a/alternator/rmw_operation.hh +++ b/alternator/rmw_operation.hh @@ -63,6 +63,10 @@ public: static write_isolation get_write_isolation_for_schema(schema_ptr schema); + static write_isolation default_write_isolation; +public: + static void set_default_write_isolation(std::string_view mode); + protected: // The full request JSON rjson::value _request; diff --git a/db/config.cc b/db/config.cc index 2aa2b61524..5f1096e957 100644 --- a/db/config.cc +++ b/db/config.cc @@ -736,6 +736,7 @@ db::config::config(std::shared_ptr exts) , alternator_https_port(this, "alternator_https_port", value_status::Used, 0, "Alternator API HTTPS port") , alternator_address(this, "alternator_address", value_status::Used, "0.0.0.0", "Alternator API listening address") , alternator_enforce_authorization(this, "alternator_enforce_authorization", value_status::Used, false, "Enforce checking the authorization header for every request in Alternator") + , alternator_write_isolation(this, "alternator_write_isolation", value_status::Used, "", "Default write isolation policy for Alternator") , abort_on_ebadf(this, "abort_on_ebadf", value_status::Used, true, "Abort the server on incorrect file descriptor access. Throws exception when disabled.") , redis_port(this, "redis_port", value_status::Used, 0, "Port on which the REDIS transport listens for clients.") , redis_ssl_port(this, "redis_ssl_port", value_status::Used, 0, "Port on which the REDIS TLS native transport listens for clients.") diff --git a/db/config.hh b/db/config.hh index 96da0e74b3..c726e34b10 100644 --- a/db/config.hh +++ b/db/config.hh @@ -314,6 +314,8 @@ public: named_value alternator_https_port; named_value alternator_address; named_value alternator_enforce_authorization; + named_value alternator_write_isolation; + named_value abort_on_ebadf; named_value redis_port; diff --git a/dist/docker/redhat/commandlineparser.py b/dist/docker/redhat/commandlineparser.py index 53a39bde98..526a73e5c5 100644 --- a/dist/docker/redhat/commandlineparser.py +++ b/dist/docker/redhat/commandlineparser.py @@ -19,6 +19,7 @@ def parse(): parser.add_argument('--api-address', default=None, dest='apiAddress') parser.add_argument('--alternator-address', default=None, dest='alternatorAddress', help="Alternator API address to listen to. Defaults to listen address.") parser.add_argument('--alternator-port', default=None, dest='alternatorPort', help="Alternator API port to listen to. Disabled by default.") + parser.add_argument('--alternator-write-isolation', default=None, dest='alternatorWriteIsolation', help="Alternator default write isolation policy.") parser.add_argument('--disable-version-check', default=False, action='store_true', dest='disable_housekeeping', help="Disable version check") parser.add_argument('--authenticator', default=None, dest='authenticator', help="Set authenticator class") parser.add_argument('--authorizer', default=None, dest='authorizer', help="Set authorizer class") diff --git a/dist/docker/redhat/scyllasetup.py b/dist/docker/redhat/scyllasetup.py index de99e222c4..05667fe0ca 100644 --- a/dist/docker/redhat/scyllasetup.py +++ b/dist/docker/redhat/scyllasetup.py @@ -16,6 +16,7 @@ class ScyllaSetup: self._broadcastRpcAddress = arguments.broadcastRpcAddress self._apiAddress = arguments.apiAddress self._alternatorPort = arguments.alternatorPort + self._alternatorWriteIsolation = arguments.alternatorWriteIsolation self._smp = arguments.smp self._memory = arguments.memory self._reserveMemory = arguments.reserveMemory @@ -120,6 +121,9 @@ class ScyllaSetup: args += ["--alternator-address %s" % self._alternatorAddress] args += ["--alternator-port %s" % self._alternatorPort] + if self._alternatorWriteIsolation is not None: + args += ["--alternator-write-isolation %s" % self._alternatorWriteIsolation] + if self._authenticator is not None: args += ["--authenticator %s" % self._authenticator] diff --git a/docs/alternator/alternator.md b/docs/alternator/alternator.md index 3bff4a5aae..38be591b57 100644 --- a/docs/alternator/alternator.md +++ b/docs/alternator/alternator.md @@ -25,6 +25,14 @@ By default, Scylla listens on this port on all network interfaces. To listen only on a specific interface, pass also an "`alternator-address`" option. +As we explain below in the "Write isolation policies", Alternator has +four different choices for the implementation of writes, each with +different advantages. You should consider which of the options makes +more sense for your intended use case, and use the "`--alternator-write-isolation`" +option to choose one. There is currently no default for this option: Trying +to run Scylla with Alternator enabled without passing this option will +result in an error asking you to set it. + DynamoDB clients usually specify a single "endpoint" address, e.g., `dynamodb.us-east-1.amazonaws.com`, and a DNS server and/or load balancers distribute the connections to many different backend nodes. Alternator @@ -154,23 +162,28 @@ implemented, with the following limitations: ### Write isolation policies DynamoDB API update requests may involve a read before the write - e.g., a -_conditional_ update, or an update based on the old value of an attribute. +_conditional_ update or an update based on the old value of an attribute. The read and the write should be treated as a single transaction - protected (_isolated_) from other parallel writes to the same item. -By default, Alternator does this isolation by using Scylla's LWT (lightweight -transactions) for every write operation. However, LWT significantly slows -writes down, so Alternator supports three additional _write isolation -policies_, which can be chosen on a per-table basis and may make sense for -certain workloads as explained below. +Alternator could do this isolation by using Scylla's LWT (lightweight +transactions) for every write operation, but this significantly slows +down writes, and not necessary for workloads which don't use read-modify-write +(RMW) updates. -The write isolation policy of a table is configured by tagging the table (at -CreateTable time, or any time later with TagResource) with the key +So Alternator supports four _write isolation policies_, which can be chosen +on a per-table basis and may make sense for certain workloads as explained +below. + +A default write isolation policy **must** be chosen using the +`--alternator-write-isolation` configuration option. Additionally, the write +isolation policy for a specific table can be overriden by tagging the table +(at CreateTable time, or any time later with TagResource) with the key `system:write_isolation`, and one of the following values: - * `a`, `always`, or `always_use_lwt` - This is the default choice. - It performs every write operation - even those that do not need a read - before the write - as a lightweight transaction. + * `a`, `always`, or `always_use_lwt` - This mode performs every write + operation - even those that do not need a read before the write - as a + lightweight transaction. This is the slowest choice, but also the only choice guaranteed to work correctly for every workload. diff --git a/docs/alternator/getting-started.md b/docs/alternator/getting-started.md index 71e81b9865..f883ec6903 100644 --- a/docs/alternator/getting-started.md +++ b/docs/alternator/getting-started.md @@ -10,10 +10,12 @@ This section will guide you through the steps for setting up the cluster: nightly image by running: `docker pull scylladb/scylla-nightly:latest` 2. Follow the steps in the [Scylla official download web page](https://www.scylladb.com/download/open-source/#docker) add to every "docker run" command: `-p 8000:8000` before the image name - and `--alternator-port=8000` at the end. The "alternator-port" option - specifies on which port Scylla will listen for the (unencrypted) DynamoDB API. + and `--alternator-port=8000 --alternator-write-isolation=always` at the end. + The "alternator-port" option specifies on which port Scylla will listen for + the (unencrypted) DynamoDB API, and the "alternator-write-isolation" chooses + whether or not Alternator will use LWT for every write. For example, - `docker run --name scylla -d -p 8000:8000 scylladb/scylla-nightly:latest --alternator-port=8000 + `docker run --name scylla -d -p 8000:8000 scylladb/scylla-nightly:latest --alternator-port=8000 --alternator-write-isolation=always ## Testing Scylla's DynamoDB API support: ### Running AWS Tic Tac Toe demo app to test the cluster: diff --git a/docs/docker-hub.md b/docs/docker-hub.md index ae59d4b799..70d2f974b5 100644 --- a/docs/docker-hub.md +++ b/docs/docker-hub.md @@ -163,6 +163,12 @@ $ docker run --name some-scylla -d scylladb/scylla --alternator-port 8000 **Since: 3.2** +### `--alternator-write-isolation policy` + +The `--alternator-write-isolation` command line option chooses between four allowed write isolation policies described in docs/alternator/alternator.md. This option must be specified if Alternator is enabled - it does not have a default. + +**Since: 4.1** + ### `--broadcast-address ADDR` The `--broadcast-address` command line option configures the IP address the Scylla instance tells other Scylla nodes in the cluster to connect to. diff --git a/main.cc b/main.cc index a041fcf037..7198682d01 100644 --- a/main.cc +++ b/main.cc @@ -79,6 +79,7 @@ #include "cdc/log.hh" #include "cdc/cdc_extension.hh" #include "alternator/tags_extension.hh" +#include "alternator/rmw_operation.hh" namespace fs = std::filesystem; @@ -1094,6 +1095,7 @@ int main(int ac, char** av) { } if (cfg->alternator_port() || cfg->alternator_https_port()) { + alternator::rmw_operation::set_default_write_isolation(cfg->alternator_write_isolation()); static sharded alternator_executor; static sharded alternator_server; diff --git a/test/alternator/run b/test/alternator/run index 7d4e094e71..43ae27af77 100755 --- a/test/alternator/run +++ b/test/alternator/run @@ -83,6 +83,7 @@ ln -s "$SCYLLA" "$SCYLLA_LINK" --alternator-address $SCYLLA_IP \ $alternator_port_option \ --alternator-enforce-authorization=1 \ + --alternator-write-isolation=always_use_lwt \ --developer-mode=1 \ --ring-delay-ms 0 --collectd 0 \ --smp 2 -m 1G \