storage_proxy: do not touch all_replicas.front() if it's empty.

The list of all endpoints for a query can be empty if we have replication_factor 0 or there are no live endpoints for this token. Do not access all_replicas.front() in this case. Fixes #5935. Message-Id: <20200306192521.73486-2-kostja@scylladb.com> (cherry picked from commit 9827efe554)
cql transport: do not log broken pipe error when a client closes its side of a connection abruptly
2020-06-22 18:29:15 +03:00 · 2020-06-21 13:09:22 +03:00 · 2020-06-21 13:07:21 +03:00 · 2020-06-21 13:03:05 +03:00 · 2020-06-21 12:57:48 +03:00 · 2020-06-21 12:47:05 +03:00
2597 changed files with 19987 additions and 7200 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -22,3 +22,5 @@ resources
 .pytest_cache
 /expressions.tokens
 tags
+testlog/*
+test/*/*.reject
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -97,7 +97,7 @@ scan_scylla_source_directories(
          service
          sstables
          streaming
-          tests
+          test
          thrift
          tracing
          transport
--- a/31
+++ b/31
@@ -5,8 +5,6 @@ F: Filename, directory, or pattern for the subsystem
 ---

 AUTH
-M: Paweł Dziepak <pdziepak@scylladb.com>
-M: Duarte Nunes <duarte@scylladb.com>
 R: Calle Wilund <calle@scylladb.com>
 R: Vlad Zolotarov <vladz@scylladb.com>
 R: Jesse Haber-Kucharsky <jhaberku@scylladb.com>
@@ -14,22 +12,17 @@ F: auth/*

 CACHE
 M: Tomasz Grabiec <tgrabiec@scylladb.com>
-M: Paweł Dziepak <pdziepak@scylladb.com>
 R: Piotr Jastrzebski <piotr@scylladb.com>
 F: row_cache*
 F: *mutation*
 F: tests/mvcc*

 COMMITLOG / BATCHLOGa
-M: Paweł Dziepak <pdziepak@scylladb.com>
-M: Duarte Nunes <duarte@scylladb.com>
 R: Calle Wilund <calle@scylladb.com>
 F: db/commitlog/*
 F: db/batch*

 COORDINATOR
-M: Paweł Dziepak <pdziepak@scylladb.com>
-M: Duarte Nunes <duarte@scylladb.com>
 R: Gleb Natapov <gleb@scylladb.com>
 F: service/storage_proxy*

@@ -49,12 +42,10 @@ M: Pekka Enberg <penberg@scylladb.com>
 F: cql3/*

 COUNTERS
-M: Paweł Dziepak <pdziepak@scylladb.com>
 F: counters*
 F: tests/counter_test*

 GOSSIP
-M: Duarte Nunes <duarte@scylladb.com>
 M: Tomasz Grabiec <tgrabiec@scylladb.com>
 R: Asias He <asias@scylladb.com>
 F: gms/*
@@ -65,14 +56,11 @@ F: dist/docker/*

 LSA
 M: Tomasz Grabiec <tgrabiec@scylladb.com>
-M: Paweł Dziepak <pdziepak@scylladb.com>
 F: utils/logalloc*

 MATERIALIZED VIEWS
-M: Duarte Nunes <duarte@scylladb.com>
 M: Pekka Enberg <penberg@scylladb.com>
-R: Nadav Har'El <nyh@scylladb.com>
-R: Duarte Nunes <duarte@scylladb.com>
+M: Nadav Har'El <nyh@scylladb.com>
 F: db/view/*
 F: cql3/statements/*view*

@@ -82,14 +70,12 @@ F: dist/*

 REPAIR
 M: Tomasz Grabiec <tgrabiec@scylladb.com>
-M: Duarte Nunes <duarte@scylladb.com>
 R: Asias He <asias@scylladb.com>
 R: Nadav Har'El <nyh@scylladb.com>
 F: repair/*

 SCHEMA MANAGEMENT
 M: Tomasz Grabiec <tgrabiec@scylladb.com>
-M: Duarte Nunes <duarte@scylladb.com>
 M: Pekka Enberg <penberg@scylladb.com>
 F: db/schema_tables*
 F: db/legacy_schema_migrator*
@@ -98,15 +84,13 @@ F: schema*

 SECONDARY INDEXES
 M: Pekka Enberg <penberg@scylladb.com>
-M: Duarte Nunes <duarte@scylladb.com>
-R: Nadav Har'El <nyh@scylladb.com>
+M: Nadav Har'El <nyh@scylladb.com>
 R: Pekka Enberg <penberg@scylladb.com>
 F: db/index/*
 F: cql3/statements/*index*

 SSTABLES
 M: Tomasz Grabiec <tgrabiec@scylladb.com>
-M: Duarte Nunes <duarte@scylladb.com>
 R: Raphael S. Carvalho <raphaelsc@scylladb.com>
 R: Glauber Costa <glauber@scylladb.com>
 R: Nadav Har'El <nyh@scylladb.com>
@@ -114,18 +98,17 @@ F: sstables/*

 STREAMING
 M: Tomasz Grabiec <tgrabiec@scylladb.com>
-M: Duarte Nunes <duarte@scylladb.com>
 R: Asias He <asias@scylladb.com>
 F: streaming/*
 F: service/storage_service.*

-THRIFT TRANSPORT LAYER
-M: Duarte Nunes <duarte@scylladb.com>
-F: thrift/*
+ALTERNATOR
+M: Nadav Har'El <nyh@scylladb.com>
+F: alternator/*
+F: alternator-test/*

 THE REST
 M: Avi Kivity <avi@scylladb.com>
-M: Paweł Dziepak <pdziepak@scylladb.com>
-M: Duarte Nunes <duarte@scylladb.com>
 M: Tomasz Grabiec <tgrabiec@scylladb.com>
+M: Nadav Har'El <nyh@scylladb.com>
 F: *
--- a/README.md
+++ b/README.md
@@ -27,10 +27,10 @@ Please see [HACKING.md](HACKING.md) for detailed information on building and dev

 ```

-* run Scylla with one CPU and ./tmp as data directory
+* run Scylla with one CPU and ./tmp as work directory

 ```
-./build/release/scylla --datadir tmp --commitlog-directory tmp --smp 1
+./build/release/scylla --workdir tmp --smp 1
 ```

 * For more run options:
--- a/2
+++ b/2
@@ -1,7 +1,7 @@
 #!/bin/sh

 PRODUCT=scylla
-VERSION=3.2.5
+VERSION=3.3.4

 if test -f version
 then
--- a/alternator-test/test_authorization.py
+++ b/alternator-test/test_authorization.py
@@ -55,7 +55,7 @@ def test_expired_signature(dynamodb, test_table):
               'X-Amz-Target': 'DynamoDB_20120810.DescribeEndpoints',
               'Authorization': 'AWS4-HMAC-SHA256 Credential=alternator/2/3/4/aws4_request SignedHeaders=x-amz-date;host Signature=123'
    }
-    response = requests.post(url, headers=headers)
+    response = requests.post(url, headers=headers, verify=False)
    assert not response.ok
    assert "InvalidSignatureException" in response.text and "Signature expired" in response.text

@@ -69,6 +69,6 @@ def test_signature_too_futuristic(dynamodb, test_table):
               'X-Amz-Target': 'DynamoDB_20120810.DescribeEndpoints',
               'Authorization': 'AWS4-HMAC-SHA256 Credential=alternator/2/3/4/aws4_request SignedHeaders=x-amz-date;host Signature=123'
    }
-    response = requests.post(url, headers=headers)
+    response = requests.post(url, headers=headers, verify=False)
    assert not response.ok
    assert "InvalidSignatureException" in response.text and "Signature not yet current" in response.text
--- a/alternator-test/test_condition_expression.py
+++ b/alternator-test/test_condition_expression.py
--- a/alternator-test/test_describe_table.py
+++ b/alternator-test/test_describe_table.py
@@ -41,7 +41,6 @@ def test_describe_table_basic(test_table):

 # Test that DescribeTable correctly returns the table's schema, in
 # AttributeDefinitions and KeySchema attributes
-@pytest.mark.xfail(reason="DescribeTable does not yet return schema")
 def test_describe_table_schema(test_table):
    got = test_table.meta.client.describe_table(TableName=test_table.name)['Table']
    expected = { # Copied from test_table()'s fixture
--- a/alternator-test/test_expected.py
+++ b/alternator-test/test_expected.py
@@ -86,7 +86,6 @@ def test_update_expected_1_eq_true(test_table_s):
 # Check that set equality is checked correctly. Unlike string equality (for
 # example), it cannot be done with just naive string comparison of the JSON
 # representation, and we need to allow for any order.
-@pytest.mark.xfail(reason="bug in EQ test of sets")
 def test_update_expected_1_eq_set(test_table_s):
    p = random_string()
    # Because boto3 sorts the set values we give it, in order to generate a
@@ -171,7 +170,6 @@ def test_update_expected_1_ne_false(test_table_s):
        )

 # Tests for Expected with ComparisonOperator = "LE":
-@pytest.mark.xfail(reason="ComparisonOperator=LE in Expected not yet implemented")
 def test_update_expected_1_le(test_table_s):
    p = random_string()
    # LE should work for string, number, and binary type
@@ -308,7 +306,6 @@ def test_update_expected_1_lt(test_table_s):
        )

 # Tests for Expected with ComparisonOperator = "GE":
-@pytest.mark.xfail(reason="ComparisonOperator=GE in Expected not yet implemented")
 def test_update_expected_1_ge(test_table_s):
    p = random_string()
    # GE should work for string, number, and binary type
@@ -526,7 +523,6 @@ def test_update_expected_1_null(test_table_s):
        )

 # Tests for Expected with ComparisonOperator = "CONTAINS":
-@pytest.mark.xfail(reason="ComparisonOperator=CONTAINS in Expected not yet implemented")
 def test_update_expected_1_contains(test_table_s):
    # true cases. CONTAINS can be used for two unrelated things: check substrings
    # (in string or binary) and membership (in set or list).
@@ -609,7 +605,6 @@ def test_update_expected_1_contains(test_table_s):
        )

 # Tests for Expected with ComparisonOperator = "NOT_CONTAINS":
-@pytest.mark.xfail(reason="ComparisonOperator=NOT_CONTAINS in Expected not yet implemented")
 def test_update_expected_1_not_contains(test_table_s):
    # true cases. NOT_CONTAINS can be used for two unrelated things: check substrings
    # (in string or binary) and membership (in set or list).
@@ -699,14 +694,21 @@ def test_update_expected_1_not_contains(test_table_s):
 def test_update_expected_1_begins_with_true(test_table_s):
    p = random_string()
    test_table_s.update_item(Key={'p': p},
-        AttributeUpdates={'a': {'Value': 'hello', 'Action': 'PUT'}})
+        AttributeUpdates={'a': {'Value': 'hello', 'Action': 'PUT'},
+                          'd': {'Value': bytearray('hi there', 'utf-8'), 'Action': 'PUT'}})
    # Case where expected and update are on different attribute:
    test_table_s.update_item(Key={'p': p},
        AttributeUpdates={'b': {'Value': 3, 'Action': 'PUT'}},
        Expected={'a': {'ComparisonOperator': 'BEGINS_WITH',
                        'AttributeValueList': ['hell']}}
    )
-    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == {'p': p, 'a': 'hello', 'b': 3}
+    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['b'] == 3
+    test_table_s.update_item(Key={'p': p},
+        AttributeUpdates={'b': {'Value': 4, 'Action': 'PUT'}},
+        Expected={'d': {'ComparisonOperator': 'BEGINS_WITH',
+                        'AttributeValueList': [bytearray('hi', 'utf-8')]}}
+    )
+    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['b'] == 4
    # For BEGINS_WITH, AttributeValueList must have a single element
    with pytest.raises(ClientError, match='ValidationException'):
        test_table_s.update_item(Key={'p': p},
@@ -798,13 +800,13 @@ def test_update_expected_1_in(test_table_s):
        )

 # Tests for Expected with ComparisonOperator = "BETWEEN":
-@pytest.mark.xfail(reason="ComparisonOperator=BETWEEN in Expected not yet implemented")
 def test_update_expected_1_between(test_table_s):
    p = random_string()
    test_table_s.update_item(Key={'p': p},
        AttributeUpdates={'a': {'Value': 2, 'Action': 'PUT'},
                          'b': {'Value': 'cat', 'Action': 'PUT'},
-                          'c': {'Value': bytearray('cat', 'utf-8'), 'Action': 'PUT'}})
+                          'c': {'Value': bytearray('cat', 'utf-8'), 'Action': 'PUT'},
+                          'd': {'Value': set([2, 4, 7]), 'Action': 'PUT'}})
    # true cases:
    test_table_s.update_item(Key={'p': p},
        AttributeUpdates={'z': {'Value': 2, 'Action': 'PUT'}},
@@ -842,6 +844,10 @@ def test_update_expected_1_between(test_table_s):
            AttributeUpdates={'z': {'Value': 2, 'Action': 'PUT'}},
            Expected={'a': {'ComparisonOperator': 'BETWEEN', 'AttributeValueList': ['cat', 'dog']}}
        )
+    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
+        test_table_s.update_item(Key={'p': p},
+            AttributeUpdates={'z': {'Value': 2, 'Action': 'PUT'}},
+            Expected={'q': {'ComparisonOperator': 'BETWEEN', 'AttributeValueList': [0, 100]}})
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 6
    # The given AttributeValueList array must contain exactly two items of the
    # same type, and in the right order. Any other input is considered a validation
@@ -858,10 +864,18 @@ def test_update_expected_1_between(test_table_s):
        test_table_s.update_item(Key={'p': p},
            AttributeUpdates={'z': {'Value': 2, 'Action': 'PUT'}},
            Expected={'a': {'ComparisonOperator': 'BETWEEN', 'AttributeValueList': [4, 3]}})
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            AttributeUpdates={'z': {'Value': 2, 'Action': 'PUT'}},
+            Expected={'b': {'ComparisonOperator': 'BETWEEN', 'AttributeValueList': ['dog', 'aardvark']}})
    with pytest.raises(ClientError, match='ValidationException'):
        test_table_s.update_item(Key={'p': p},
            AttributeUpdates={'z': {'Value': 2, 'Action': 'PUT'}},
            Expected={'a': {'ComparisonOperator': 'BETWEEN', 'AttributeValueList': [4, 'dog']}})
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p},
+            AttributeUpdates={'z': {'Value': 2, 'Action': 'PUT'}},
+            Expected={'d': {'ComparisonOperator': 'BETWEEN', 'AttributeValueList': [set([1]), set([2])]}})

 ##############################################################################
 # Instead of ComparisonOperator and AttributeValueList, one can specify either
--- a/alternator-test/test_gsi.py
+++ b/alternator-test/test_gsi.py
@@ -377,7 +377,6 @@ def test_gsi_3(test_table_gsi_3):
        KeyConditions={'a': {'AttributeValueList': [items[3]['a']], 'ComparisonOperator': 'EQ'},
                       'b': {'AttributeValueList': [items[3]['b']], 'ComparisonOperator': 'EQ'}})

-@pytest.mark.xfail(reason="GSI in alternator currently have a bug on updating the second regular base column")
 def test_gsi_update_second_regular_base_column(test_table_gsi_3):
    items = [{'p': random_string(), 'a': random_string(), 'b': random_string(), 'd': random_string()} for i in range(10)]
    with test_table_gsi_3.batch_writer() as batch:
@@ -389,6 +388,34 @@ def test_gsi_update_second_regular_base_column(test_table_gsi_3):
        KeyConditions={'a': {'AttributeValueList': [items[3]['a']], 'ComparisonOperator': 'EQ'},
                       'b': {'AttributeValueList': [items[3]['b']], 'ComparisonOperator': 'EQ'}})

+# Test that when a table has a GSI, if the indexed attribute is missing, the
+# item is added to the base table but not the index.
+# This is the same feature we already tested in test_gsi_missing_attribute()
+# above, but on a different table: In that test we used test_table_gsi_2,
+# with one indexed attribute, and in this test we use test_table_gsi_3 which
+# has two base regular attributes in the view key, and more possibilities
+# of which value might be missing. Reproduces issue #6008.
+def test_gsi_missing_attribute_3(test_table_gsi_3):
+    p = random_string()
+    a = random_string()
+    b = random_string()
+    # First, add an item with a missing "a" value. It should appear in the
+    # base table, but not in the index:
+    test_table_gsi_3.put_item(Item={'p':  p, 'b': b})
+    assert test_table_gsi_3.get_item(Key={'p':  p})['Item'] == {'p': p, 'b': b}
+    # Note: with eventually consistent read, we can't really be sure that
+    # an item will "never" appear in the index. We hope that if a bug exists
+    # and such an item did appear, sometimes the delay here will be enough
+    # for the unexpected item to become visible.
+    assert not any([i['p'] == p for i in full_scan(test_table_gsi_3, IndexName='hello')])
+    # Same thing for an item with a missing "b" value:
+    test_table_gsi_3.put_item(Item={'p':  p, 'a': a})
+    assert test_table_gsi_3.get_item(Key={'p':  p})['Item'] == {'p': p, 'a': a}
+    assert not any([i['p'] == p for i in full_scan(test_table_gsi_3, IndexName='hello')])
+    # And for an item missing both:
+    test_table_gsi_3.put_item(Item={'p':  p})
+    assert test_table_gsi_3.get_item(Key={'p':  p})['Item'] == {'p': p}
+    assert not any([i['p'] == p for i in full_scan(test_table_gsi_3, IndexName='hello')])

 # A fourth scenario of GSI. Two GSIs on a single base table.
@pytest.fixture(scope="session")
@@ -477,6 +504,52 @@ def test_gsi_5(test_table_gsi_5):
        KeyConditions={'p': {'AttributeValueList': [p2], 'ComparisonOperator': 'EQ'},
                       'x': {'AttributeValueList': [x2], 'ComparisonOperator': 'EQ'}})

+# Verify that DescribeTable correctly returns the schema of both base-table
+# and secondary indexes. KeySchema is given for each of the base table and
+# indexes, and AttributeDefinitions is merged for all of them together.
+def test_gsi_5_describe_table_schema(test_table_gsi_5):
+    got = test_table_gsi_5.meta.client.describe_table(TableName=test_table_gsi_5.name)['Table']
+    # Copied from test_table_gsi_5 fixture
+    expected_base_keyschema = [
+                    { 'AttributeName': 'p', 'KeyType': 'HASH' },
+                    { 'AttributeName': 'c', 'KeyType': 'RANGE' } ]
+    expected_gsi_keyschema = [
+                    { 'AttributeName': 'p', 'KeyType': 'HASH' },
+                    { 'AttributeName': 'x', 'KeyType': 'RANGE' } ]
+    expected_all_attribute_definitions = [
+                    { 'AttributeName': 'p', 'AttributeType': 'S' },
+                    { 'AttributeName': 'c', 'AttributeType': 'S' },
+                    { 'AttributeName': 'x', 'AttributeType': 'S' } ]
+    assert got['KeySchema'] == expected_base_keyschema
+    gsis = got['GlobalSecondaryIndexes']
+    assert len(gsis) == 1
+    assert gsis[0]['KeySchema'] == expected_gsi_keyschema
+    # The list of attribute definitions may be arbitrarily reordered
+    assert multiset(got['AttributeDefinitions']) == multiset(expected_all_attribute_definitions)
+
+# Similar DescribeTable schema test for test_table_gsi_2. The peculiarity
+# in that table is that the base table has only a hash key p, and index
+# only hash hash key x; Now, while internally Scylla needs to add "p" as a
+# clustering key in the materialized view (in Scylla the view key always
+# contains the base key), when describing the table, "p" shouldn't be
+# returned as a range key, because the user didn't ask for it.
+# This test reproduces issue #5320.
+@pytest.mark.xfail(reason="GSI DescribeTable spurious range key (#5320)")
+def test_gsi_2_describe_table_schema(test_table_gsi_2):
+    got = test_table_gsi_2.meta.client.describe_table(TableName=test_table_gsi_2.name)['Table']
+    # Copied from test_table_gsi_2 fixture
+    expected_base_keyschema = [ { 'AttributeName': 'p', 'KeyType': 'HASH' } ]
+    expected_gsi_keyschema = [ { 'AttributeName': 'x', 'KeyType': 'HASH' } ]
+    expected_all_attribute_definitions = [
+                    { 'AttributeName': 'p', 'AttributeType': 'S' },
+                    { 'AttributeName': 'x', 'AttributeType': 'S' } ]
+    assert got['KeySchema'] == expected_base_keyschema
+    gsis = got['GlobalSecondaryIndexes']
+    assert len(gsis) == 1
+    assert gsis[0]['KeySchema'] == expected_gsi_keyschema
+    # The list of attribute definitions may be arbitrarily reordered
+    assert multiset(got['AttributeDefinitions']) == multiset(expected_all_attribute_definitions)
+
 # All tests above involved "ProjectionType: ALL". This test checks how
 # "ProjectionType:: KEYS_ONLY" works. We note that it projects both
 # the index's key, *and* the base table's key. So items which had different
--- a/alternator-test/test_health.py
+++ b/alternator-test/test_health.py
@@ -29,6 +29,7 @@ def test_health_works(dynamodb):
 # Test that a health check only works for the root URL ('/')
 def test_health_only_works_for_root_path(dynamodb):
    url = dynamodb.meta.client._endpoint.host
-    for suffix in ['/abc', '/..', '/-', '/index.htm', '/health']:
-        response = requests.get(url + suffix)
+    for suffix in ['/abc', '/-', '/index.htm', '/health']:
+        print(url + suffix)
+        response = requests.get(url + suffix, verify=False)
        assert response.status_code in range(400, 405)
--- a/alternator-test/test_query.py
+++ b/alternator-test/test_query.py
@@ -20,7 +20,7 @@

 import random
 import pytest
-from botocore.exceptions import ClientError
+from botocore.exceptions import ClientError, ParamValidationError
 from decimal import Decimal
 from util import random_string, random_bytes, full_query, multiset
 from boto3.dynamodb.conditions import Key, Attr
@@ -356,3 +356,161 @@ def test_query_which_key(test_table):
            'c': {'AttributeValueList': [c], 'ComparisonOperator': 'EQ'},
            'z': {'AttributeValueList': [c], 'ComparisonOperator': 'EQ'}
        })
+
+# Test the "Select" parameter of Query. The default Select mode,
+# ALL_ATTRIBUTES, returns items with all their attributes. Other modes
+# allow returning just specific attributes or just counting the results
+# without returning items at all.
+@pytest.mark.xfail(reason="Select not supported yet")
+def test_query_select(test_table_sn):
+    numbers = [Decimal(i) for i in range(10)]
+    # Insert these numbers, in random order, into one partition:
+    p = random_string()
+    items = [{'p': p, 'c': num, 'x': num} for num in random.sample(numbers, len(numbers))]
+    with test_table_sn.batch_writer() as batch:
+        for item in items:
+            batch.put_item(item)
+    # Verify that we get back the numbers in their sorted order. By default,
+    # query returns all attributes:
+    got_items = test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}})['Items']
+    got_sort_keys = [x['c'] for x in got_items]
+    assert got_sort_keys == numbers
+    got_x_attributes = [x['x'] for x in got_items]
+    assert got_x_attributes == numbers
+    # Select=ALL_ATTRIBUTES does exactly the same as the default - return
+    # all attributes:
+    got_items = test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, Select='ALL_ATTRIBUTES')['Items']
+    got_sort_keys = [x['c'] for x in got_items]
+    assert got_sort_keys == numbers
+    got_x_attributes = [x['x'] for x in got_items]
+    assert got_x_attributes == numbers
+    # Select=ALL_PROJECTED_ATTRIBUTES is not allowed on a base table (it
+    # is just for indexes, when IndexName is specified)
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, Select='ALL_PROJECTED_ATTRIBUTES')
+    # Select=SPECIFIC_ATTRIBUTES requires that either a AttributesToGet
+    # or ProjectionExpression appears, but then really does nothing:
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, Select='SPECIFIC_ATTRIBUTES')
+    got_items = test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, Select='SPECIFIC_ATTRIBUTES', AttributesToGet=['x'])['Items']
+    expected_items = [{'x': i} for i in numbers]
+    assert got_items == expected_items
+    got_items = test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, Select='SPECIFIC_ATTRIBUTES', ProjectionExpression='x')['Items']
+    assert got_items == expected_items
+    # Select=COUNT just returns a count - not any items
+    got = test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, Select='COUNT')
+    assert got['Count'] == len(numbers)
+    assert not 'Items' in got
+    # Check again that we also get a count - not just with Select=COUNT,
+    # but without Select=COUNT we also get the items:
+    got = test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}})
+    assert got['Count'] == len(numbers)
+    assert 'Items' in got
+    # Select with some unknown string generates a validation exception:
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, Select='UNKNOWN')
+
+# Test that the "Limit" parameter can be used to return only some of the
+# items in a single partition. The items returned are the first in the
+# sorted order.
+def test_query_limit(test_table_sn):
+    numbers = [Decimal(i) for i in range(10)]
+    # Insert these numbers, in random order, into one partition:
+    p = random_string()
+    items = [{'p': p, 'c': num} for num in random.sample(numbers, len(numbers))]
+    with test_table_sn.batch_writer() as batch:
+        for item in items:
+            batch.put_item(item)
+    # Verify that we get back the numbers in their sorted order.
+    # First, no Limit so we should get all numbers (we have few of them, so
+    # it all fits in the default 1MB limitation)
+    got_items = test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}})['Items']
+    got_sort_keys = [x['c'] for x in got_items]
+    assert got_sort_keys == numbers
+    # Now try a few different Limit values, and verify that the query
+    # returns exactly the first Limit sorted numbers.
+    for limit in [1, 2, 3, 7, 10, 17, 100, 10000]:
+        got_items = test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, Limit=limit)['Items']
+        assert len(got_items) == min(limit, len(numbers))
+        got_sort_keys = [x['c'] for x in got_items]
+        assert got_sort_keys == numbers[0:limit]
+    # Unfortunately, the boto3 library forbids a Limit of 0 on its own,
+    # before even sending a request, so we can't test how the server responds.
+    with pytest.raises(ParamValidationError):
+        test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, Limit=0)
+
+# In test_query_limit we tested just that Limit allows to stop the result
+# after right right number of items. Here we test that such a stopped result
+# can be resumed, via the LastEvaluatedKey/ExclusiveStartKey paging mechanism.
+def test_query_limit_paging(test_table_sn):
+    numbers = [Decimal(i) for i in range(20)]
+    # Insert these numbers, in random order, into one partition:
+    p = random_string()
+    items = [{'p': p, 'c': num} for num in random.sample(numbers, len(numbers))]
+    with test_table_sn.batch_writer() as batch:
+        for item in items:
+            batch.put_item(item)
+    # Verify that full_query() returns all these numbers, in sorted order.
+    # full_query() will do a query with the given limit, and resume it again
+    # and again until the last page.
+    for limit in [1, 2, 3, 7, 10, 17, 100, 10000]:
+        got_items = full_query(test_table_sn, KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, Limit=limit)
+        got_sort_keys = [x['c'] for x in got_items]
+        assert got_sort_keys == numbers
+
+# Test that the ScanIndexForward parameter works, and can be used to
+# return items sorted in reverse order. Combining this with Limit can
+# be used to return the last items instead of the first items of the
+# partition.
+@pytest.mark.xfail(reason="ScanIndexForward not supported yet")
+def test_query_reverse(test_table_sn):
+    numbers = [Decimal(i) for i in range(20)]
+    # Insert these numbers, in random order, into one partition:
+    p = random_string()
+    items = [{'p': p, 'c': num} for num in random.sample(numbers, len(numbers))]
+    with test_table_sn.batch_writer() as batch:
+        for item in items:
+            batch.put_item(item)
+    # Verify that we get back the numbers in their sorted order or reverse
+    # order, depending on the ScanIndexForward parameter being True or False.
+    # First, no Limit so we should get all numbers (we have few of them, so
+    # it all fits in the default 1MB limitation)
+    reversed_numbers = list(reversed(numbers))
+    got_items = test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, ScanIndexForward=True)['Items']
+    got_sort_keys = [x['c'] for x in got_items]
+    assert got_sort_keys == numbers
+    got_items = test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, ScanIndexForward=False)['Items']
+    got_sort_keys = [x['c'] for x in got_items]
+    assert got_sort_keys == reversed_numbers
+    # Now try a few different Limit values, and verify that the query
+    # returns exactly the first Limit sorted numbers - in regular or
+    # reverse order, depending on ScanIndexForward.
+    for limit in [1, 2, 3, 7, 10, 17, 100, 10000]:
+        got_items = test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, Limit=limit, ScanIndexForward=True)['Items']
+        assert len(got_items) == min(limit, len(numbers))
+        got_sort_keys = [x['c'] for x in got_items]
+        assert got_sort_keys == numbers[0:limit]
+        got_items = test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, Limit=limit, ScanIndexForward=False)['Items']
+        assert len(got_items) == min(limit, len(numbers))
+        got_sort_keys = [x['c'] for x in got_items]
+        assert got_sort_keys == reversed_numbers[0:limit]
+
+# Test that paging also works properly with reverse order
+# (ScanIndexForward=false), i.e., reverse-order queries can be resumed
+@pytest.mark.xfail(reason="ScanIndexForward not supported yet")
+def test_query_reverse_paging(test_table_sn):
+    numbers = [Decimal(i) for i in range(20)]
+    # Insert these numbers, in random order, into one partition:
+    p = random_string()
+    items = [{'p': p, 'c': num} for num in random.sample(numbers, len(numbers))]
+    with test_table_sn.batch_writer() as batch:
+        for item in items:
+            batch.put_item(item)
+    reversed_numbers = list(reversed(numbers))
+    # Verify that with ScanIndexForward=False, full_query() returns all
+    # these numbers in reversed sorted order - getting pages of Limit items
+    # at a time and resuming the query.
+    for limit in [1, 2, 3, 7, 10, 17, 100, 10000]:
+        got_items = full_query(test_table_sn, KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, ScanIndexForward=False, Limit=limit)
+        got_sort_keys = [x['c'] for x in got_items]
+        assert got_sort_keys == reversed_numbers
--- a/alternator-test/test_returnvalues.py
+++ b/alternator-test/test_returnvalues.py
@@ -0,0 +1,226 @@
+# Copyright 2019 ScyllaDB
+#
+# This file is part of Scylla.
+#
+# Scylla is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Scylla is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+
+# Tests for the ReturnValues parameter for the different update operations
+# (PutItem, UpdateItem, DeleteItem).
+
+import pytest
+from botocore.exceptions import ClientError
+from util import random_string
+
+# Test trivial support for the ReturnValues parameter in PutItem, UpdateItem
+# and DeleteItem - test that "NONE" works (and changes nothing), while a
+# completely unsupported value gives an error.
+# This test is useful to check that before the ReturnValues parameter is fully
+# implemented, it returns an error when a still-unsupported ReturnValues
+# option is attempted in the request - instead of simply being ignored.
+def test_trivial_returnvalues(test_table_s):
+    # PutItem:
+    p = random_string()
+    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
+    ret=test_table_s.put_item(Item={'p': p, 'a': 'hello'}, ReturnValues='NONE')
+    assert not 'Attributes' in ret
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.put_item(Item={'p': p, 'a': 'hello'}, ReturnValues='DOG')
+    # UpdateItem:
+    p = random_string()
+    test_table_s.put_item(Item={'p': p, 'a': 'hi', 'b': 'dog'})
+    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='NONE',
+        UpdateExpression='SET b = :val',
+        ExpressionAttributeValues={':val': 'cat'})
+    assert not 'Attributes' in ret
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p}, ReturnValues='DOG',
+            UpdateExpression='SET a = a + :val',
+            ExpressionAttributeValues={':val': 1})
+    # DeleteItem:
+    p = random_string()
+    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
+    ret=test_table_s.delete_item(Key={'p': p}, ReturnValues='NONE')
+    assert not 'Attributes' in ret
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.delete_item(Key={'p': p}, ReturnValues='DOG')
+
+# Test the ReturnValues parameter on a PutItem operation. Only two settings
+# are supported for this parameter for this operation: NONE (the default)
+# and ALL_OLD.
+@pytest.mark.xfail(reason="ReturnValues not supported")
+def test_put_item_returnvalues(test_table_s):
+    # By default, the previous value of an item is not returned:
+    p = random_string()
+    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
+    ret=test_table_s.put_item(Item={'p': p, 'a': 'hello'})
+    assert not 'Attributes' in ret
+    # Using ReturnValues=NONE is the same:
+    p = random_string()
+    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
+    ret=test_table_s.put_item(Item={'p': p, 'a': 'hello'}, ReturnValues='NONE')
+    assert not 'Attributes' in ret
+    # With ReturnValues=ALL_OLD, the old value of the item is returned
+    # in an "Attributes" attribute:
+    p = random_string()
+    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
+    ret=test_table_s.put_item(Item={'p': p, 'a': 'hello'}, ReturnValues='ALL_OLD')
+    assert ret['Attributes'] == {'p': p, 'a': 'hi'}
+    # Other ReturnValue options - UPDATED_OLD, ALL_NEW, UPDATED_NEW,
+    # are supported by other operations but not by PutItem:
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.put_item(Item={'p': p, 'a': 'hello'}, ReturnValues='UPDATED_OLD')
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.put_item(Item={'p': p, 'a': 'hello'}, ReturnValues='ALL_NEW')
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.put_item(Item={'p': p, 'a': 'hello'}, ReturnValues='UPDATED_NEW')
+    # Also, obviously, a non-supported setting "DOG" also returns in error:
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.put_item(Item={'p': p, 'a': 'hello'}, ReturnValues='DOG')
+    # The ReturnValues value is case sensitive, so while "NONE" is supported
+    # (and tested above), "none" isn't:
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.put_item(Item={'p': p, 'a': 'hello'}, ReturnValues='none')
+
+# Test the ReturnValues parameter on a DeleteItem operation. Only two settings
+# are supported for this parameter for this operation: NONE (the default)
+# and ALL_OLD.
+@pytest.mark.xfail(reason="ReturnValues not supported")
+def test_delete_item_returnvalues(test_table_s):
+    # By default, the previous value of an item is not returned:
+    p = random_string()
+    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
+    ret=test_table_s.delete_item(Key={'p': p})
+    assert not 'Attributes' in ret
+    # Using ReturnValues=NONE is the same:
+    p = random_string()
+    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
+    ret=test_table_s.delete_item(Key={'p': p}, ReturnValues='NONE')
+    assert not 'Attributes' in ret
+    # With ReturnValues=ALL_OLD, the old value of the item is returned
+    # in an "Attributes" attribute:
+    p = random_string()
+    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
+    ret=test_table_s.delete_item(Key={'p': p}, ReturnValues='ALL_OLD')
+    assert ret['Attributes'] == {'p': p, 'a': 'hi'}
+    # Other ReturnValue options - UPDATED_OLD, ALL_NEW, UPDATED_NEW,
+    # are supported by other operations but not by PutItem:
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.delete_item(Key={'p': p}, ReturnValues='UPDATE_OLD')
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.delete_item(Key={'p': p}, ReturnValues='ALL_NEW')
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.delete_item(Key={'p': p}, ReturnValues='UPDATE_NEW')
+    # Also, obviously, a non-supported setting "DOG" also returns in error:
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.delete_item(Key={'p': p}, ReturnValues='DOG')
+    # The ReturnValues value is case sensitive, so while "NONE" is supported
+    # (and tested above), "none" isn't:
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.delete_item(Key={'p': p}, ReturnValues='none')
+
+# Test the ReturnValues parameter on a UpdateItem operation. All five
+# settings are supported for this parameter for this operation: NONE
+# (the default), ALL_OLD, UPDATED_OLD, ALL_NEW and UPDATED_NEW.
+@pytest.mark.xfail(reason="ReturnValues not supported")
+def test_update_item_returnvalues(test_table_s):
+    # By default, the previous value of an item is not returned:
+    p = random_string()
+    test_table_s.put_item(Item={'p': p, 'a': 'hi', 'b': 'dog'})
+    ret=test_table_s.update_item(Key={'p': p},
+        UpdateExpression='SET b = :val',
+        ExpressionAttributeValues={':val': 'cat'})
+    assert not 'Attributes' in ret
+
+    # Using ReturnValues=NONE is the same:
+    p = random_string()
+    test_table_s.put_item(Item={'p': p, 'a': 'hi', 'b': 'dog'})
+    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='NONE',
+        UpdateExpression='SET b = :val',
+        ExpressionAttributeValues={':val': 'cat'})
+    assert not 'Attributes' in ret
+
+    # With ReturnValues=ALL_OLD, the entire old value of the item (even
+    # attributes we did not modify) is returned in an "Attributes" attribute:
+    p = random_string()
+    test_table_s.put_item(Item={'p': p, 'a': 'hi', 'b': 'dog'})
+    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='ALL_OLD',
+        UpdateExpression='SET b = :val',
+        ExpressionAttributeValues={':val': 'cat'})
+    assert ret['Attributes'] == {'p': p, 'a': 'hi', 'b': 'dog'}
+
+    # With ReturnValues=UPDATED_OLD, only the overwritten attributes of the
+    # old item are returned in an "Attributes" attribute:
+    p = random_string()
+    test_table_s.put_item(Item={'p': p, 'a': 'hi', 'b': 'dog'})
+    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='UPDATED_OLD',
+        UpdateExpression='SET b = :val, c = :val2',
+        ExpressionAttributeValues={':val': 'cat', ':val2': 'hello'})
+    assert ret['Attributes'] == {'b': 'dog'}
+    # Even if an update overwrites an attribute by the same value again,
+    # this is considered an update, and the old value (identical to the
+    # new one) is returned:
+    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='UPDATED_OLD',
+        UpdateExpression='SET b = :val',
+        ExpressionAttributeValues={':val': 'cat'})
+    assert ret['Attributes'] == {'b': 'cat'}
+    # Deleting an attribute also counts as overwriting it, of course:
+    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='UPDATED_OLD',
+        UpdateExpression='REMOVE b')
+    assert ret['Attributes'] == {'b': 'cat'}
+
+    # With ReturnValues=ALL_NEW, the entire new value of the item (including
+    # old attributes we did not modify) is returned:
+    p = random_string()
+    test_table_s.put_item(Item={'p': p, 'a': 'hi', 'b': 'dog'})
+    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='ALL_NEW',
+        UpdateExpression='SET b = :val',
+        ExpressionAttributeValues={':val': 'cat'})
+    assert ret['Attributes'] == {'p': p, 'a': 'hi', 'b': 'cat'}
+
+    # With ReturnValues=UPDATED_NEW, only the new value of the updated
+    # attributes are returned. Note that "updated attributes" means
+    # the newly set attributes - it doesn't require that these attributes
+    # have any previous values
+    p = random_string()
+    test_table_s.put_item(Item={'p': p, 'a': 'hi', 'b': 'dog'})
+    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='UPDATED_NEW',
+        UpdateExpression='SET b = :val, c = :val2',
+        ExpressionAttributeValues={':val': 'cat', ':val2': 'hello'})
+    assert ret['Attributes'] == {'b': 'cat', 'c': 'hello'}
+    # Deleting an attribute also counts as overwriting it, but the delete
+    # column is not returned in the response - so it's empty in this case.
+    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='UPDATED_NEW',
+        UpdateExpression='REMOVE b')
+    assert not 'Attributes' in ret
+    # In the above examples, UPDATED_NEW is not useful because it just
+    # returns the new values we already know from the request... UPDATED_NEW
+    # becomes more useful in read-modify-write operations:
+    p = random_string()
+    test_table_s.put_item(Item={'p': p, 'a': 1})
+    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='UPDATED_NEW',
+        UpdateExpression='SET a = a + :val',
+        ExpressionAttributeValues={':val': 1})
+    assert ret['Attributes'] == {'a': 2}
+
+    # A non-supported setting "DOG" also returns in error:
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p}, ReturnValues='DOG',
+            UpdateExpression='SET a = a + :val',
+            ExpressionAttributeValues={':val': 1})
+    # The ReturnValues value is case sensitive, so while "NONE" is supported
+    # (and tested above), "none" isn't:
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': p}, ReturnValues='none',
+            UpdateExpression='SET a = a + :val',
+            ExpressionAttributeValues={':val': 1})
--- a/alternator-test/test_scan.py
+++ b/alternator-test/test_scan.py
@@ -19,7 +19,7 @@

 import pytest
 from botocore.exceptions import ClientError
-from util import random_string, full_scan, multiset
+from util import random_string, full_scan, full_scan_and_count, multiset
 from boto3.dynamodb.conditions import Attr

 # Test that scanning works fine with/without pagination
@@ -189,3 +189,64 @@ def test_scan_with_key_equality_filtering(dynamodb, filled_test_table):
    got_items = full_scan(table, ScanFilter=scan_filter_c_and_another)
    expected_items = [item for item in items if "c" in item.keys() and "another" in item.keys() and item["c"] == "9" and item["another"] == "y"*16]
    assert multiset(expected_items) == multiset(got_items)
+
+# Test the "Select" parameter of Scan. The default Select mode,
+# ALL_ATTRIBUTES, returns items with all their attributes. Other modes
+# allow returning just specific attributes or just counting the results
+# without returning items at all.
+@pytest.mark.xfail(reason="Select not supported yet")
+def test_scan_select(filled_test_table):
+    test_table, items = filled_test_table
+    got_items = full_scan(test_table)
+    # By default, a scan returns all the items, with all their attributes:
+    # query returns all attributes:
+    got_items = full_scan(test_table)
+    assert multiset(items) == multiset(got_items)
+    # Select=ALL_ATTRIBUTES does exactly the same as the default - return
+    # all attributes:
+    got_items = full_scan(test_table, Select='ALL_ATTRIBUTES')
+    assert multiset(items) == multiset(got_items)
+    # Select=ALL_PROJECTED_ATTRIBUTES is not allowed on a base table (it
+    # is just for indexes, when IndexName is specified)
+    with pytest.raises(ClientError, match='ValidationException'):
+        full_scan(test_table, Select='ALL_PROJECTED_ATTRIBUTES')
+    # Select=SPECIFIC_ATTRIBUTES requires that either a AttributesToGet
+    # or ProjectionExpression appears, but then really does nothing beyond
+    # what AttributesToGet and ProjectionExpression already do:
+    with pytest.raises(ClientError, match='ValidationException'):
+        full_scan(test_table, Select='SPECIFIC_ATTRIBUTES')
+    wanted = ['c', 'another']
+    got_items = full_scan(test_table, Select='SPECIFIC_ATTRIBUTES', AttributesToGet=wanted)
+    expected_items = [{k: x[k] for k in wanted if k in x} for x in items]
+    assert multiset(expected_items) == multiset(got_items)
+    got_items = full_scan(test_table, Select='SPECIFIC_ATTRIBUTES', ProjectionExpression=','.join(wanted))
+    assert multiset(expected_items) == multiset(got_items)
+    # Select=COUNT just returns a count - not any items
+    (got_count, got_items) = full_scan_and_count(test_table, Select='COUNT')
+    assert got_count == len(items)
+    assert got_items == []
+    # Check that we also get a count in regular scans - not just with
+    # Select=COUNT, but without Select=COUNT we both items and count:
+    (got_count, got_items) = full_scan_and_count(test_table)
+    assert got_count == len(items)
+    assert multiset(items) == multiset(got_items)
+    # Select with some unknown string generates a validation exception:
+    with pytest.raises(ClientError, match='ValidationException'):
+        full_scan(test_table, Select='UNKNOWN')
+
+# Test parallel scan, i.e., the Segments and TotalSegments options.
+# In the following test we check that these parameters allow splitting
+# a scan into multiple parts, and that these parts are in fact disjoint,
+# and their union is the entire contents of the table. We do not actually
+# try to run these queries in *parallel* in this test.
+@pytest.mark.xfail(reason="parallel scan not supported yet")
+def test_scan_parallel(filled_test_table):
+    test_table, items = filled_test_table
+    for nsegments in [1, 2, 17]:
+        print('Testing TotalSegments={}'.format(nsegments))
+        got_items = []
+        for segment in range(nsegments):
+            got_items.extend(full_scan(test_table, TotalSegments=nsegments, Segment=segment))
+        # The following comparison verifies that each of the expected item
+        # in items was returned in one - and just one - of the segments.
+        assert multiset(items) == multiset(got_items)
--- a/alternator-test/util.py
+++ b/alternator-test/util.py
@@ -39,6 +39,26 @@ def full_scan(table, **kwargs):
        items.extend(response['Items'])
    return items

+# full_scan_and_count returns both items and count as returned by the server.
+# Note that count isn't simply len(items) - the server returns them
+# independently. e.g., with Select='COUNT' the items are not returned, but
+# count is.
+def full_scan_and_count(table, **kwargs):
+    response = table.scan(**kwargs)
+    items = []
+    count = 0
+    if 'Items' in response:
+        items.extend(response['Items'])
+    if 'Count' in response:
+        count = count + response['Count']
+    while 'LastEvaluatedKey' in response:
+        response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'], **kwargs)
+        if 'Items' in response:
+            items.extend(response['Items'])
+        if 'Count' in response:
+            count = count + response['Count']
+    return (count, items)
+
 # Utility function for fetching the entire results of a query into an array of items
 def full_query(table, **kwargs):
    response = table.query(**kwargs)
--- a/alternator/auth.cc
+++ b/alternator/auth.cc
@@ -66,8 +66,9 @@ static std::string format_time_point(db_clock::time_point tp) {
    time_t time_point_repr = db_clock::to_time_t(tp);
    std::string time_point_str;
    time_point_str.resize(17);
+    ::tm time_buf;
    // strftime prints the terminating null character as well
-    std::strftime(time_point_str.data(), time_point_str.size(), "%Y%m%dT%H%M%SZ", std::gmtime(&time_point_repr));
+    std::strftime(time_point_str.data(), time_point_str.size(), "%Y%m%dT%H%M%SZ", ::gmtime_r(&time_point_repr, &time_buf));
    time_point_str.resize(16);
    return time_point_str;
 }
--- a/alternator/conditions.cc
+++ b/alternator/conditions.cc
@@ -29,6 +29,7 @@
 #include "rjson.hh"
 #include "serialization.hh"
 #include "base64.hh"
+#include <stdexcept>

 namespace alternator {

@@ -47,7 +48,9 @@ comparison_operator_type get_comparison_operator(const rjson::value& comparison_
            {"NOT_NULL", comparison_operator_type::NOT_NULL},
            {"BETWEEN", comparison_operator_type::BETWEEN},
            {"BEGINS_WITH", comparison_operator_type::BEGINS_WITH},
-    }; //TODO: CONTAINS
+            {"CONTAINS", comparison_operator_type::CONTAINS},
+            {"NOT_CONTAINS", comparison_operator_type::NOT_CONTAINS},
+    };
    if (!comparison_operator.IsString()) {
        throw api_error("ValidationException", format("Invalid comparison operator definition {}", rjson::print(comparison_operator)));
    }
@@ -143,9 +146,44 @@ static void verify_operand_count(const rjson::value* array, const size_check& ex
    }
 }

+struct rjson_engaged_ptr_comp {
+    bool operator()(const rjson::value* p1, const rjson::value* p2) const {
+        return rjson::single_value_comp()(*p1, *p2);
+    }
+};
+
+// It's not enough to compare underlying JSON objects when comparing sets,
+// as internally they're stored in an array, and the order of elements is
+// not important in set equality. See issue #5021
+static bool check_EQ_for_sets(const rjson::value& set1, const rjson::value& set2) {
+    if (set1.Size() != set2.Size()) {
+        return false;
+    }
+    std::set<const rjson::value*, rjson_engaged_ptr_comp> set1_raw;
+    for (auto it = set1.Begin(); it != set1.End(); ++it) {
+        set1_raw.insert(&*it);
+    }
+    for (const auto& a : set2.GetArray()) {
+        if (set1_raw.count(&a) == 0) {
+            return false;
+        }
+    }
+    return true;
+}
+
 // Check if two JSON-encoded values match with the EQ relation
 static bool check_EQ(const rjson::value* v1, const rjson::value& v2) {
-    return v1 && *v1 == v2;
+    if (!v1) {
+        return false;
+    }
+    if (v1->IsObject() && v1->MemberCount() == 1 && v2.IsObject() && v2.MemberCount() == 1) {
+        auto it1 = v1->MemberBegin();
+        auto it2 = v2.MemberBegin();
+        if ((it1->name == "SS" && it2->name == "SS") || (it1->name == "NS" && it2->name == "NS") || (it1->name == "BS" && it2->name == "BS")) {
+            return check_EQ_for_sets(it1->value, it2->value);
+        }
+    }
+    return *v1 == v2;
 }

 // Check if two JSON-encoded values match with the NE relation
@@ -174,9 +212,70 @@ static bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2) {
    if (it1->name != it2->name) {
        return false;
    }
-    std::string_view val1(it1->value.GetString(), it1->value.GetStringLength());
-    std::string_view val2(it2->value.GetString(), it2->value.GetStringLength());
-    return val1.substr(0, val2.size()) == val2;
+    if (it2->name == "S") {
+        std::string_view val1(it1->value.GetString(), it1->value.GetStringLength());
+        std::string_view val2(it2->value.GetString(), it2->value.GetStringLength());
+        return val1.substr(0, val2.size()) == val2;
+    } else /* it2->name == "B" */ {
+        // TODO (optimization): Check the begins_with condition directly on
+        // the base64-encoded string, without making a decoded copy.
+        bytes val1 = base64_decode(it1->value);
+        bytes val2 = base64_decode(it2->value);
+        return val1.substr(0, val2.size()) == val2;
+    }
+}
+
+static std::string_view to_string_view(const rjson::value& v) {
+    return std::string_view(v.GetString(), v.GetStringLength());
+}
+
+static bool is_set_of(const rjson::value& type1, const rjson::value& type2) {
+    return (type2 == "S" && type1 == "SS") || (type2 == "N" && type1 == "NS") || (type2 == "B" && type1 == "BS");
+}
+
+// Check if two JSON-encoded values match with the CONTAINS relation
+static bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2) {
+    if (!v1) {
+        return false;
+    }
+    const auto& kv1 = *v1->MemberBegin();
+    const auto& kv2 = *v2.MemberBegin();
+    if (kv2.name != "S" && kv2.name != "N" &&  kv2.name != "B") {
+        throw api_error("ValidationException",
+                        format("CONTAINS operator requires a single AttributeValue of type String, Number, or Binary, "
+                               "got {} instead", kv2.name));
+    }
+    if (kv1.name == "S" && kv2.name == "S") {
+        return to_string_view(kv1.value).find(to_string_view(kv2.value)) != std::string_view::npos;
+    } else if (kv1.name == "B" && kv2.name == "B") {
+        return base64_decode(kv1.value).find(base64_decode(kv2.value)) != bytes::npos;
+    } else if (is_set_of(kv1.name, kv2.name)) {
+        for (auto i = kv1.value.Begin(); i != kv1.value.End(); ++i) {
+            if (*i == kv2.value) {
+                return true;
+            }
+        }
+    } else if (kv1.name == "L") {
+        for (auto i = kv1.value.Begin(); i != kv1.value.End(); ++i) {
+            if (!i->IsObject() || i->MemberCount() != 1) {
+                clogger.error("check_CONTAINS received a list whose element is malformed");
+                return false;
+            }
+            const auto& el = *i->MemberBegin();
+            if (el.name == kv2.name && el.value == kv2.value) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+// Check if two JSON-encoded values match with the NOT_CONTAINS relation
+static bool check_NOT_CONTAINS(const rjson::value* v1, const rjson::value& v2) {
+    if (!v1) {
+        return false;
+    }
+    return !check_CONTAINS(v1, v2);
 }

 // Check if a JSON-encoded value equals any element of an array, which must have at least one element.
@@ -221,13 +320,13 @@ bool check_compare(const rjson::value* v1, const rjson::value& v2, const Compara
    if (!v2.IsObject() || v2.MemberCount() != 1) {
        throw api_error("ValidationException",
                        format("{} requires a single AttributeValue of type String, Number, or Binary",
-                               cmp.diagnostic()));
+                               cmp.diagnostic));
    }
    const auto& kv2 = *v2.MemberBegin();
    if (kv2.name != "S" && kv2.name != "N" && kv2.name != "B") {
        throw api_error("ValidationException",
                        format("{} requires a single AttributeValue of type String, Number, or Binary",
-                               cmp.diagnostic()));
+                               cmp.diagnostic));
    }
    if (!v1 || !v1->IsObject() || v1->MemberCount() != 1) {
        return false;
@@ -237,7 +336,7 @@ bool check_compare(const rjson::value* v1, const rjson::value& v2, const Compara
        return false;
    }
    if (kv1.name == "N") {
-        return cmp(unwrap_number(*v1, cmp.diagnostic()), unwrap_number(v2, cmp.diagnostic()));
+        return cmp(unwrap_number(*v1, cmp.diagnostic), unwrap_number(v2, cmp.diagnostic));
    }
    if (kv1.name == "S") {
        return cmp(std::string_view(kv1.value.GetString(), kv1.value.GetStringLength()),
@@ -252,15 +351,80 @@ bool check_compare(const rjson::value* v1, const rjson::value& v2, const Compara

 struct cmp_lt {
    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs < rhs; }
-    const char* diagnostic() const { return "LT operator"; }
+    static constexpr const char* diagnostic = "LT operator";
+};
+
+struct cmp_le {
+    // bytes only has <, so we cannot use <=.
+    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs < rhs || lhs == rhs; }
+    static constexpr const char* diagnostic = "LE operator";
+};
+
+struct cmp_ge {
+    // bytes only has <, so we cannot use >=.
+    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return rhs < lhs || lhs == rhs; }
+    static constexpr const char* diagnostic = "GE operator";
 };

 struct cmp_gt {
-    // bytes only has <
+    // bytes only has <, so we cannot use >.
    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return rhs < lhs; }
-    const char* diagnostic() const { return "GT operator"; }
+    static constexpr const char* diagnostic = "GT operator";
 };

+// True if v is between lb and ub, inclusive.  Throws if lb > ub.
+template <typename T>
+bool check_BETWEEN(const T& v, const T& lb, const T& ub) {
+    if (ub < lb) {
+        throw api_error("ValidationException",
+                        format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
+    }
+    return cmp_ge()(v, lb) && cmp_le()(v, ub);
+}
+
+static bool check_BETWEEN(const rjson::value* v, const rjson::value& lb, const rjson::value& ub) {
+    if (!v) {
+        return false;
+    }
+    if (!v->IsObject() || v->MemberCount() != 1) {
+        throw api_error("ValidationException", format("BETWEEN operator encountered malformed AttributeValue: {}", *v));
+    }
+    if (!lb.IsObject() || lb.MemberCount() != 1) {
+        throw api_error("ValidationException", format("BETWEEN operator encountered malformed AttributeValue: {}", lb));
+    }
+    if (!ub.IsObject() || ub.MemberCount() != 1) {
+        throw api_error("ValidationException", format("BETWEEN operator encountered malformed AttributeValue: {}", ub));
+    }
+
+    const auto& kv_v = *v->MemberBegin();
+    const auto& kv_lb = *lb.MemberBegin();
+    const auto& kv_ub = *ub.MemberBegin();
+    if (kv_lb.name != kv_ub.name) {
+        throw api_error(
+                "ValidationException",
+                format("BETWEEN operator requires the same type for lower and upper bound; instead got {} and {}",
+                       kv_lb.name, kv_ub.name));
+    }
+    if (kv_v.name != kv_lb.name) { // Cannot compare different types, so v is NOT between lb and ub.
+        return false;
+    }
+    if (kv_v.name == "N") {
+        const char* diag = "BETWEEN operator";
+        return check_BETWEEN(unwrap_number(*v, diag), unwrap_number(lb, diag), unwrap_number(ub, diag));
+    }
+    if (kv_v.name == "S") {
+        return check_BETWEEN(std::string_view(kv_v.value.GetString(), kv_v.value.GetStringLength()),
+                             std::string_view(kv_lb.value.GetString(), kv_lb.value.GetStringLength()),
+                             std::string_view(kv_ub.value.GetString(), kv_ub.value.GetStringLength()));
+    }
+    if (kv_v.name == "B") {
+        return check_BETWEEN(base64_decode(kv_v.value), base64_decode(kv_lb.value), base64_decode(kv_ub.value));
+    }
+    throw api_error("ValidationException",
+        format("BETWEEN operator requires AttributeValueList elements to be of type String, Number, or Binary; instead got {}",
+               kv_lb.name));
+}
+
 // Verify one Expect condition on one attribute (whose content is "got")
 // for the verify_expected() below.
 // This function returns true or false depending on whether the condition
@@ -306,9 +470,15 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
        case comparison_operator_type::LT:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
            return check_compare(got, (*attribute_value_list)[0], cmp_lt{});
+        case comparison_operator_type::LE:
+            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
+            return check_compare(got, (*attribute_value_list)[0], cmp_le{});
        case comparison_operator_type::GT:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
            return check_compare(got, (*attribute_value_list)[0], cmp_gt{});
+        case comparison_operator_type::GE:
+            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
+            return check_compare(got, (*attribute_value_list)[0], cmp_ge{});
        case comparison_operator_type::BEGINS_WITH:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
            return check_BEGINS_WITH(got, (*attribute_value_list)[0]);
@@ -321,10 +491,17 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
        case comparison_operator_type::NOT_NULL:
            verify_operand_count(attribute_value_list, empty(), *comparison_operator);
            return check_NOT_NULL(got);
-        default:
-            // FIXME: implement all the missing types, so there will be no default here.
-            throw api_error("ValidationException", format("ComparisonOperator {} is not yet supported", *comparison_operator));
+        case comparison_operator_type::BETWEEN:
+            verify_operand_count(attribute_value_list, exact_size(2), *comparison_operator);
+            return check_BETWEEN(got, (*attribute_value_list)[0], (*attribute_value_list)[1]);
+        case comparison_operator_type::CONTAINS:
+            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
+            return check_CONTAINS(got, (*attribute_value_list)[0]);
+        case comparison_operator_type::NOT_CONTAINS:
+            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
+            return check_NOT_CONTAINS(got, (*attribute_value_list)[0]);
        }
+        throw std::logic_error(format("Internal error: corrupted operator enum: {}", int(op)));
    }
 }

--- a/alternator/conditions.hh
+++ b/alternator/conditions.hh
@@ -37,7 +37,7 @@
 namespace alternator {

 enum class comparison_operator_type {
-    EQ, NE, LE, LT, GE, GT, IN, BETWEEN, CONTAINS, IS_NULL, NOT_NULL, BEGINS_WITH
+    EQ, NE, LE, LT, GE, GT, IN, BETWEEN, CONTAINS, NOT_CONTAINS, IS_NULL, NOT_NULL, BEGINS_WITH
 };

 comparison_operator_type get_comparison_operator(const rjson::value& comparison_operator);
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -35,6 +35,7 @@
 #include "query-result-reader.hh"
 #include "cql3/selection/selection.hh"
 #include "cql3/result_set.hh"
+#include "cql3/type_json.hh"
 #include "bytes.hh"
 #include "cql3/update_parameters.hh"
 #include "server.hh"
@@ -237,17 +238,75 @@ static std::string get_string_attribute(const rjson::value& value, rjson::string
                attribute_name, value));
    }
    return attribute_value->GetString();
+}
+
+// Convenience function for getting the value of a boolean attribute, or a
+// default value if it is missing. If the attribute exists, but is not a
+// bool, a descriptive api_error is thrown.
+static bool get_bool_attribute(const rjson::value& value, rjson::string_ref_type attribute_name, bool default_return) {
+    const rjson::value* attribute_value = rjson::find(value, attribute_name);
+    if (!attribute_value) {
+        return default_return;
+    }
+    if (!attribute_value->IsBool()) {
+        throw api_error("ValidationException", format("Expected boolean value for attribute {}, got: {}",
+                attribute_name, value));
+    }
+    return attribute_value->GetBool();
+}
+
+// Convenience function for getting the value of an integer attribute, or
+// an empty optional if it is missing. If the attribute exists, but is not
+// an integer, a descriptive api_error is thrown.
+static std::optional<int> get_int_attribute(const rjson::value& value, rjson::string_ref_type attribute_name) {
+    const rjson::value* attribute_value = rjson::find(value, attribute_name);
+    if (!attribute_value)
+        return {};
+    if (!attribute_value->IsInt()) {
+        throw api_error("ValidationException", format("Expected integer value for attribute {}, got: {}",
+                attribute_name, value));
+    }
+    return attribute_value->GetInt();
+}
+
+// Sets a KeySchema object inside the given JSON parent describing the key
+// attributes of the the given schema as being either HASH or RANGE keys.
+// Additionally, adds to a given map mappings between the key attribute
+// names and their type (as a DynamoDB type string).
+static void describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map<std::string,std::string>& attribute_types) {
+    rjson::value key_schema = rjson::empty_array();
+    for (const column_definition& cdef : schema.partition_key_columns()) {
+        rjson::value key = rjson::empty_object();
+        rjson::set(key, "AttributeName", rjson::from_string(cdef.name_as_text()));
+        rjson::set(key, "KeyType", "HASH");
+        rjson::push_back(key_schema, std::move(key));
+        attribute_types[cdef.name_as_text()] = type_to_string(cdef.type);
+
+    }
+    for (const column_definition& cdef : schema.clustering_key_columns()) {
+        rjson::value key = rjson::empty_object();
+        rjson::set(key, "AttributeName", rjson::from_string(cdef.name_as_text()));
+        rjson::set(key, "KeyType", "RANGE");
+        rjson::push_back(key_schema, std::move(key));
+        attribute_types[cdef.name_as_text()] = type_to_string(cdef.type);
+        // FIXME: this "break" can avoid listing some clustering key columns
+        // we added for GSIs just because they existed in the base table -
+        // but not in all cases. We still have issue #5320. See also
+        // reproducer in test_gsi_2_describe_table_schema.
+        break;
+    }
+    rjson::set(parent, "KeySchema", std::move(key_schema));

 }

-future<json::json_return_type> executor::describe_table(client_state& client_state, std::string content) {
+future<json::json_return_type> executor::describe_table(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content) {
    _stats.api_operations.describe_table++;
    rjson::value request = rjson::parse(content);
    elogger.trace("Describing table {}", request);

    schema_ptr schema = get_table(_proxy, request);

-    tracing::add_table_name(client_state.get_trace_state(), schema->ks_name(), schema->cf_name());
+    tracing::add_table_name(trace_state, schema->ks_name(), schema->cf_name());

    rjson::value table_description = rjson::empty_object();
    rjson::set(table_description, "TableName", rjson::from_string(schema->cf_name()));
@@ -268,6 +327,11 @@ future<json::json_return_type> executor::describe_table(client_state& client_sta
    rjson::set(table_description, "BillingModeSummary", rjson::empty_object());
    rjson::set(table_description["BillingModeSummary"], "BillingMode", "PAY_PER_REQUEST");
    rjson::set(table_description["BillingModeSummary"], "LastUpdateToPayPerRequestDateTime", rjson::value(creation_date_seconds));
+
+    std::unordered_map<std::string,std::string> key_attribute_types;
+    // Add base table's KeySchema and collect types for AttributeDefinitions:
+    describe_key_schema(table_description, *schema, key_attribute_types);
+
    table& t = _proxy.get_db().local().find_column_family(schema);
    if (!t.views().empty()) {
        rjson::value gsi_array = rjson::empty_array();
@@ -282,6 +346,8 @@ future<json::json_return_type> executor::describe_table(client_state& client_sta
            }
            sstring index_name = cf_name.substr(delim_it + 1);
            rjson::set(view_entry, "IndexName", rjson::from_string(index_name));
+            // Add indexes's KeySchema and collect types for AttributeDefinitions:
+            describe_key_schema(view_entry, *vptr, key_attribute_types);
            // Local secondary indexes are marked by an extra '!' sign occurring before the ':' delimiter
            rjson::value& index_array = (delim_it > 1 && cf_name[delim_it-1] == '!') ? lsi_array : gsi_array;
            rjson::push_back(index_array, std::move(view_entry));
@@ -293,23 +359,32 @@ future<json::json_return_type> executor::describe_table(client_state& client_sta
            rjson::set(table_description, "GlobalSecondaryIndexes", std::move(gsi_array));
        }
    }
+    // Use map built by describe_key_schema() for base and indexes to produce
+    // AttributeDefinitions for all key columns:
+    rjson::value attribute_definitions = rjson::empty_array();
+    for (auto& type : key_attribute_types) {
+        rjson::value key = rjson::empty_object();
+        rjson::set(key, "AttributeName", rjson::from_string(type.first));
+        rjson::set(key, "AttributeType", rjson::from_string(type.second));
+        rjson::push_back(attribute_definitions, std::move(key));
+    }
+    rjson::set(table_description, "AttributeDefinitions", std::move(attribute_definitions));
+
+    // FIXME: still missing some response fields (issue #5026)

-    // FIXME: more attributes! Check https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/API_TableDescription.html#DDB-Type-TableDescription-TableStatus but also run a test to see what DyanmoDB really fills
-    // maybe for TableId or TableArn use  schema.id().to_sstring().c_str();
-    // Of course, the whole schema is missing!
    rjson::value response = rjson::empty_object();
    rjson::set(response, "Table", std::move(table_description));
    elogger.trace("returning {}", response);
    return make_ready_future<json::json_return_type>(make_jsonable(std::move(response)));
 }

-future<json::json_return_type> executor::delete_table(client_state& client_state, std::string content) {
+future<json::json_return_type> executor::delete_table(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content) {
    _stats.api_operations.delete_table++;
    rjson::value request = rjson::parse(content);
    elogger.trace("Deleting table {}", request);

    std::string table_name = get_table_name(request);
-    tracing::add_table_name(client_state.get_trace_state(), KEYSPACE_NAME, table_name);
+    tracing::add_table_name(trace_state, KEYSPACE_NAME, table_name);

    if (!_proxy.get_db().local().has_schema(KEYSPACE_NAME, table_name)) {
        throw api_error("ResourceNotFoundException",
@@ -406,14 +481,14 @@ static std::pair<std::string, std::string> parse_key_schema(const rjson::value&
 }


-future<json::json_return_type> executor::create_table(client_state& client_state, std::string content) {
+future<json::json_return_type> executor::create_table(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content) {
    _stats.api_operations.create_table++;
    rjson::value table_info = rjson::parse(content);
    elogger.trace("Creating table {}", table_info);
    std::string table_name = get_table_name(table_info);
    const rjson::value& attribute_definitions = table_info["AttributeDefinitions"];

-    tracing::add_table_name(client_state.get_trace_state(), KEYSPACE_NAME, table_name);
+    tracing::add_table_name(trace_state, KEYSPACE_NAME, table_name);

    schema_builder builder(KEYSPACE_NAME, table_name);
    auto [hash_key, range_key] = parse_key_schema(table_info);
@@ -656,7 +731,12 @@ static mutation make_item_mutation(const rjson::value& item, schema_ptr schema)
    // Scylla proper, to implement the operation to replace an entire
    // collection ("UPDATE .. SET x = ..") - see
    // cql3::update_parameters::make_tombstone_just_before().
-    row.apply(tombstone(ts-1, gc_clock::now()));
+    const bool use_partition_tombstone = schema->clustering_key_size() == 0;
+    if (use_partition_tombstone) {
+        m.partition().apply(tombstone(ts-1, gc_clock::now()));
+    } else {
+        row.apply(tombstone(ts-1, gc_clock::now()));
+    }
    return m;
 }

@@ -674,18 +754,24 @@ static future<std::unique_ptr<rjson::value>> maybe_get_previous_item(
        bool need_read_before_write,
        alternator::stats& stats);

-future<json::json_return_type> executor::put_item(client_state& client_state, std::string content) {
+future<json::json_return_type> executor::put_item(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content) {
    _stats.api_operations.put_item++;
    auto start_time = std::chrono::steady_clock::now();
    rjson::value update_info = rjson::parse(content);
    elogger.trace("Updating value {}", update_info);

    schema_ptr schema = get_table(_proxy, update_info);
-    tracing::add_table_name(client_state.get_trace_state(), schema->ks_name(), schema->cf_name());
+    tracing::add_table_name(trace_state, schema->ks_name(), schema->cf_name());

    if (rjson::find(update_info, "ConditionExpression")) {
        throw api_error("ValidationException", "ConditionExpression is not yet implemented in alternator");
    }
+    auto return_values = get_string_attribute(update_info, "ReturnValues", "NONE");
+    if (return_values != "NONE") {
+        // FIXME: Need to support also the ALL_OLD option. See issue #5053.
+        throw api_error("ValidationException", format("Unsupported ReturnValues={} for PutItem operation", return_values));
+    }
+
    const bool has_expected = update_info.HasMember("Expected");

    const rjson::value& item = update_info["Item"];
@@ -694,11 +780,11 @@ future<json::json_return_type> executor::put_item(client_state& client_state, st

    return maybe_get_previous_item(_proxy, client_state, schema, item, has_expected, _stats).then(
            [this, schema, has_expected,  update_info = rjson::copy(update_info), m = std::move(m),
-             &client_state, start_time] (std::unique_ptr<rjson::value> previous_item) mutable {
+             &client_state, start_time, trace_state] (std::unique_ptr<rjson::value> previous_item) mutable {
        if (has_expected) {
            verify_expected(update_info, previous_item);
        }
-        return _proxy.mutate(std::vector<mutation>{std::move(m)}, db::consistency_level::LOCAL_QUORUM, default_timeout(), client_state.get_trace_state(), empty_service_permit()).then([this, start_time] () {
+        return _proxy.mutate(std::vector<mutation>{std::move(m)}, db::consistency_level::LOCAL_QUORUM, default_timeout(), trace_state, empty_service_permit()).then([this, start_time] () {
            _stats.api_operations.put_item_latency.add(std::chrono::steady_clock::now() - start_time, _stats.api_operations.put_item_latency._count + 1);
            // Without special options on what to return, PutItem returns nothing.
            return make_ready_future<json::json_return_type>(json_string(""));
@@ -721,22 +807,32 @@ static mutation make_delete_item_mutation(const rjson::value& key, schema_ptr sc
    clustering_key ck = ck_from_json(key, schema);
    check_key(key, schema);
    mutation m(schema, pk);
-    auto& row = m.partition().clustered_row(*schema, ck);
-    row.apply(tombstone(api::new_timestamp(), gc_clock::now()));
+    const bool use_partition_tombstone = schema->clustering_key_size() == 0;
+    if (use_partition_tombstone) {
+        m.partition().apply(tombstone(api::new_timestamp(), gc_clock::now()));
+    } else {
+        auto& row = m.partition().clustered_row(*schema, ck);
+        row.apply(tombstone(api::new_timestamp(), gc_clock::now()));
+    }
    return m;
 }

-future<json::json_return_type> executor::delete_item(client_state& client_state, std::string content) {
+future<json::json_return_type> executor::delete_item(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content) {
    _stats.api_operations.delete_item++;
    auto start_time = std::chrono::steady_clock::now();
    rjson::value update_info = rjson::parse(content);

    schema_ptr schema = get_table(_proxy, update_info);
-    tracing::add_table_name(client_state.get_trace_state(), schema->ks_name(), schema->cf_name());
+    tracing::add_table_name(trace_state, schema->ks_name(), schema->cf_name());

    if (rjson::find(update_info, "ConditionExpression")) {
        throw api_error("ValidationException", "ConditionExpression is not yet implemented in alternator");
    }
+    auto return_values = get_string_attribute(update_info, "ReturnValues", "NONE");
+    if (return_values != "NONE") {
+        // FIXME: Need to support also the ALL_OLD option. See issue #5053.
+        throw api_error("ValidationException", format("Unsupported ReturnValues={} for DeleteItem operation", return_values));
+    }
    const bool has_expected = update_info.HasMember("Expected");

    const rjson::value& key = update_info["Key"];
@@ -746,11 +842,11 @@ future<json::json_return_type> executor::delete_item(client_state& client_state,

    return maybe_get_previous_item(_proxy, client_state, schema, key, has_expected, _stats).then(
            [this, schema, has_expected,  update_info = rjson::copy(update_info), m = std::move(m),
-             &client_state, start_time] (std::unique_ptr<rjson::value> previous_item) mutable {
+             &client_state, start_time, trace_state] (std::unique_ptr<rjson::value> previous_item) mutable {
        if (has_expected) {
            verify_expected(update_info, previous_item);
        }
-        return _proxy.mutate(std::vector<mutation>{std::move(m)}, db::consistency_level::LOCAL_QUORUM, default_timeout(), client_state.get_trace_state(), empty_service_permit()).then([this, start_time] () {
+        return _proxy.mutate(std::vector<mutation>{std::move(m)}, db::consistency_level::LOCAL_QUORUM, default_timeout(), trace_state, empty_service_permit()).then([this, start_time] () {
            _stats.api_operations.delete_item_latency.add(std::chrono::steady_clock::now() - start_time, _stats.api_operations.delete_item_latency._count + 1);
            // Without special options on what to return, DeleteItem returns nothing.
            return make_ready_future<json::json_return_type>(json_string(""));
@@ -783,7 +879,7 @@ struct primary_key_equal {
    }
 };

-future<json::json_return_type> executor::batch_write_item(client_state& client_state, std::string content) {
+future<json::json_return_type> executor::batch_write_item(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content) {
    _stats.api_operations.batch_write_item++;
    rjson::value batch_info = rjson::parse(content);
    rjson::value& request_items = batch_info["RequestItems"];
@@ -793,7 +889,7 @@ future<json::json_return_type> executor::batch_write_item(client_state& client_s

    for (auto it = request_items.MemberBegin(); it != request_items.MemberEnd(); ++it) {
        schema_ptr schema = get_table_from_batch_request(_proxy, it);
-        tracing::add_table_name(client_state.get_trace_state(), schema->ks_name(), schema->cf_name());
+        tracing::add_table_name(trace_state, schema->ks_name(), schema->cf_name());
        std::unordered_set<primary_key, primary_key_hash, primary_key_equal> used_keys(1, primary_key_hash{schema}, primary_key_equal{schema});
        for (auto& request : it->value.GetArray()) {
            if (!request.IsObject() || request.MemberCount() != 1) {
@@ -826,7 +922,7 @@ future<json::json_return_type> executor::batch_write_item(client_state& client_s
        }
    }

-    return _proxy.mutate(std::move(mutations), db::consistency_level::LOCAL_QUORUM, default_timeout(), client_state.get_trace_state(), empty_service_permit()).then([] () {
+    return _proxy.mutate(std::move(mutations), db::consistency_level::LOCAL_QUORUM, default_timeout(), trace_state, empty_service_permit()).then([] () {
        // Without special options on what to return, BatchWriteItem returns nothing,
        // unless there are UnprocessedItems - it's possible to just stop processing a batch
        // due to throttling. TODO(sarna): Consider UnprocessedItems when returning.
@@ -911,21 +1007,6 @@ static std::string get_item_type_string(const rjson::value& v) {
    return it->name.GetString();
 }

-// Check if a given JSON object encodes a set (i.e., it is a {"SS": [...]}, or "NS", "BS"
-// and returns set's type and a pointer to that set. If the object does not encode a set,
-// returned value is {"", nullptr}
-static const std::pair<std::string, const rjson::value*> unwrap_set(const rjson::value& v) {
-    if (!v.IsObject() || v.MemberCount() != 1) {
-        return {"", nullptr};
-    }
-    auto it = v.MemberBegin();
-    const std::string it_key = it->name.GetString();
-    if (it_key != "SS" && it_key != "BS" && it_key != "NS") {
-        return {"", nullptr};
-    }
-    return std::make_pair(it_key, &(it->value));
-}
-
 // Take two JSON-encoded list values (remember that a list value is
 // {"L": [...the actual list]}) and return the concatenation, again as
 // a list value.
@@ -944,50 +1025,6 @@ static rjson::value list_concatenate(const rjson::value& v1, const rjson::value&
    return ret;
 }

-struct single_value_rjson_comp {
-    bool operator()(const rjson::value& r1, const rjson::value& r2) const {
-        auto r1_type = r1.GetType();
-        auto r2_type = r2.GetType();
-        switch (r1_type) {
-        case rjson::type::kNullType:
-            return r1_type < r2_type;
-        case rjson::type::kFalseType:
-            return r1_type < r2_type;
-        case rjson::type::kTrueType:
-            return r1_type < r2_type;
-        case rjson::type::kObjectType:
-            throw rjson::error("Object type comparison is not supported");
-        case rjson::type::kArrayType:
-            throw rjson::error("Array type comparison is not supported");
-        case rjson::type::kStringType: {
-            const size_t r1_len = r1.GetStringLength();
-            const size_t r2_len = r2.GetStringLength();
-            size_t len = std::min(r1_len, r2_len);
-            int result = std::strncmp(r1.GetString(), r2.GetString(), len);
-            return result < 0 || (result == 0 && r1_len < r2_len);
-        }
-        case rjson::type::kNumberType: {
-            if (r1_type != r2_type) {
-                throw rjson::error("All numbers in a set should have the same type");
-            }
-            if (r1.IsDouble()) {
-                return r1.GetDouble() < r2.GetDouble();
-            } else if (r1.IsInt()) {
-                return r1.GetInt() < r2.GetInt();
-            } else if (r1.IsUint()) {
-                return r1.GetUint() < r2.GetUint();
-            } else if (r1.IsInt64()) {
-                return r1.GetInt64() < r2.GetInt64();
-            } else {
-                return r1.GetUint64() < r2.GetUint64();
-            }
-        }
-        default:
-            return false;
-        }
-    }
-};
-
 // Take two JSON-encoded set values (e.g. {"SS": [...the actual set]}) and return the sum of both sets,
 // again as a set value.
 static rjson::value set_sum(const rjson::value& v1, const rjson::value& v2) {
@@ -1000,7 +1037,7 @@ static rjson::value set_sum(const rjson::value& v1, const rjson::value& v2) {
        throw api_error("ValidationException", "UpdateExpression: ADD operation for sets must be given sets as arguments");
    }
    rjson::value sum = rjson::copy(*set1);
-    std::set<rjson::value, single_value_rjson_comp> set1_raw;
+    std::set<rjson::value, rjson::single_value_comp> set1_raw;
    for (auto it = sum.Begin(); it != sum.End(); ++it) {
        set1_raw.insert(rjson::copy(*it));
    }
@@ -1025,7 +1062,7 @@ static rjson::value set_diff(const rjson::value& v1, const rjson::value& v2) {
    if (!set1 || !set2) {
        throw api_error("ValidationException", "UpdateExpression: DELETE operation can only be performed on a set");
    }
-    std::set<rjson::value, single_value_rjson_comp> set1_raw;
+    std::set<rjson::value, rjson::single_value_comp> set1_raw;
    for (auto it = set1->Begin(); it != set1->End(); ++it) {
        set1_raw.insert(rjson::copy(*it));
    }
@@ -1384,17 +1421,22 @@ static future<std::unique_ptr<rjson::value>> maybe_get_previous_item(
 }


-future<json::json_return_type> executor::update_item(client_state& client_state, std::string content) {
+future<json::json_return_type> executor::update_item(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content) {
    _stats.api_operations.update_item++;
    auto start_time = std::chrono::steady_clock::now();
    rjson::value update_info = rjson::parse(content);
    elogger.trace("update_item {}", update_info);
    schema_ptr schema = get_table(_proxy, update_info);
-    tracing::add_table_name(client_state.get_trace_state(), schema->ks_name(), schema->cf_name());
+    tracing::add_table_name(trace_state, schema->ks_name(), schema->cf_name());

    if (rjson::find(update_info, "ConditionExpression")) {
        throw api_error("ValidationException", "ConditionExpression is not yet implemented in alternator");
    }
+    auto return_values = get_string_attribute(update_info, "ReturnValues", "NONE");
+    if (return_values != "NONE") {
+        // FIXME: Need to support also ALL_OLD, UPDATED_OLD, ALL_NEW and UPDATED_NEW options. See issue #5053.
+        throw api_error("ValidationException", format("Unsupported ReturnValues={} for UpdateItem operation", return_values));
+    }

    if (!update_info.HasMember("Key")) {
        throw api_error("ValidationException", "UpdateItem requires a Key parameter");
@@ -1441,7 +1483,7 @@ future<json::json_return_type> executor::update_item(client_state& client_state,
    return maybe_get_previous_item(_proxy, client_state, schema, pk, ck, has_update_expression, expression, has_expected, _stats).then(
            [this, schema, expression = std::move(expression), has_update_expression, ck = std::move(ck), has_expected,
             update_info = rjson::copy(update_info), m = std::move(m), attrs_collector = std::move(attrs_collector),
-             attribute_updates = rjson::copy(attribute_updates), ts, &client_state, start_time] (std::unique_ptr<rjson::value> previous_item) mutable {
+             attribute_updates = rjson::copy(attribute_updates), ts, &client_state, start_time, trace_state] (std::unique_ptr<rjson::value> previous_item) mutable {
        if (has_expected) {
            verify_expected(update_info, previous_item);
        }
@@ -1572,7 +1614,7 @@ future<json::json_return_type> executor::update_item(client_state& client_state,
        row.apply(row_marker(ts));

        elogger.trace("Applying mutation {}", m);
-        return _proxy.mutate(std::vector<mutation>{std::move(m)}, db::consistency_level::LOCAL_QUORUM, default_timeout(), client_state.get_trace_state(), empty_service_permit()).then([this, start_time] () {
+        return _proxy.mutate(std::vector<mutation>{std::move(m)}, db::consistency_level::LOCAL_QUORUM, default_timeout(), trace_state, empty_service_permit()).then([this, start_time] () {
            // Without special options on what to return, UpdateItem returns nothing.
            _stats.api_operations.update_item_latency.add(std::chrono::steady_clock::now() - start_time, _stats.api_operations.update_item_latency._count + 1);
            return make_ready_future<json::json_return_type>(json_string(""));
@@ -1599,7 +1641,7 @@ static db::consistency_level get_read_consistency(const rjson::value& request) {
    return consistent_read ? db::consistency_level::LOCAL_QUORUM : db::consistency_level::LOCAL_ONE;
 }

-future<json::json_return_type> executor::get_item(client_state& client_state, std::string content) {
+future<json::json_return_type> executor::get_item(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content) {
    _stats.api_operations.get_item++;
    auto start_time = std::chrono::steady_clock::now();
    rjson::value table_info = rjson::parse(content);
@@ -1607,7 +1649,7 @@ future<json::json_return_type> executor::get_item(client_state& client_state, st

    schema_ptr schema = get_table(_proxy, table_info);

-    tracing::add_table_name(client_state.get_trace_state(), schema->ks_name(), schema->cf_name());
+    tracing::add_table_name(trace_state, schema->ks_name(), schema->cf_name());

    rjson::value& query_key = table_info["Key"];
    db::consistency_level cl = get_read_consistency(table_info);
@@ -1642,7 +1684,7 @@ future<json::json_return_type> executor::get_item(client_state& client_state, st
    });
 }

-future<json::json_return_type> executor::batch_get_item(client_state& client_state, std::string content) {
+future<json::json_return_type> executor::batch_get_item(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content) {
    // FIXME: In this implementation, an unbounded batch size can cause
    // unbounded response JSON object to be buffered in memory, unbounded
    // parallelism of the requests, and unbounded amount of non-preemptable
@@ -1670,7 +1712,7 @@ future<json::json_return_type> executor::batch_get_item(client_state& client_sta
    for (auto it = request_items.MemberBegin(); it != request_items.MemberEnd(); ++it) {
        table_requests rs;
        rs.schema = get_table_from_batch_request(_proxy, it);
-        tracing::add_table_name(client_state.get_trace_state(), KEYSPACE_NAME, rs.schema->cf_name());
+        tracing::add_table_name(trace_state, KEYSPACE_NAME, rs.schema->cf_name());
        rs.cl = get_read_consistency(it->value);
        rs.attrs_to_get = calculate_attrs_to_get(it->value);
        auto& keys = (it->value)["Keys"];
@@ -1810,7 +1852,7 @@ static rjson::value encode_paging_state(const schema& schema, const service::pag
    for (const column_definition& cdef : schema.partition_key_columns()) {
        rjson::set_with_string_name(last_evaluated_key, cdef.name_as_text(), rjson::empty_object());
        rjson::value& key_entry = last_evaluated_key[cdef.name_as_text()];
-        rjson::set_with_string_name(key_entry, type_to_string(cdef.type), rjson::parse(cdef.type->to_json_string(*exploded_pk_it)));
+        rjson::set_with_string_name(key_entry, type_to_string(cdef.type), rjson::parse(to_json_string(*cdef.type, *exploded_pk_it)));
        ++exploded_pk_it;
    }
    auto ck = paging_state.get_clustering_key();
@@ -1820,7 +1862,7 @@ static rjson::value encode_paging_state(const schema& schema, const service::pag
        for (const column_definition& cdef : schema.clustering_key_columns()) {
            rjson::set_with_string_name(last_evaluated_key, cdef.name_as_text(), rjson::empty_object());
            rjson::value& key_entry = last_evaluated_key[cdef.name_as_text()];
-            rjson::set_with_string_name(key_entry, type_to_string(cdef.type), rjson::parse(cdef.type->to_json_string(*exploded_ck_it)));
+            rjson::set_with_string_name(key_entry, type_to_string(cdef.type), rjson::parse(to_json_string(*cdef.type, *exploded_ck_it)));
            ++exploded_ck_it;
        }
    }
@@ -1836,10 +1878,11 @@ static future<json::json_return_type> do_query(schema_ptr schema,
        db::consistency_level cl,
        ::shared_ptr<cql3::restrictions::statement_restrictions> filtering_restrictions,
        service::client_state& client_state,
-        cql3::cql_stats& cql_stats) {
+        cql3::cql_stats& cql_stats,
+        tracing::trace_state_ptr trace_state) {
    ::shared_ptr<service::pager::paging_state> paging_state = nullptr;

-    tracing::trace(client_state.get_trace_state(), "Performing a database query");
+    tracing::trace(trace_state, "Performing a database query");

    if (exclusive_start_key) {
        partition_key pk = pk_from_json(*exclusive_start_key, schema);
@@ -1856,7 +1899,7 @@ static future<json::json_return_type> do_query(schema_ptr schema,
    auto partition_slice = query::partition_slice(std::move(ck_bounds), {}, std::move(regular_columns), selection->get_query_options());
    auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, query::max_partitions);

-    auto query_state_ptr = std::make_unique<service::query_state>(client_state, empty_service_permit());
+    auto query_state_ptr = std::make_unique<service::query_state>(client_state, trace_state, empty_service_permit());

    command->slice.options.set<query::partition_slice::option::allow_short_read>();
    auto query_options = std::make_unique<cql3::query_options>(cl, infinite_timeout_config, std::vector<cql3::raw_value>{});
@@ -1888,7 +1931,7 @@ static future<json::json_return_type> do_query(schema_ptr schema,
 // 2. Filtering - by passing appropriately created restrictions to pager as a last parameter
 // 3. Proper timeouts instead of gc_clock::now() and db::no_timeout
 // 4. Implement parallel scanning via Segments
-future<json::json_return_type> executor::scan(client_state& client_state, std::string content) {
+future<json::json_return_type> executor::scan(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content) {
    _stats.api_operations.scan++;
    rjson::value request_info = rjson::parse(content);
    elogger.trace("Scanning {}", request_info);
@@ -1898,6 +1941,10 @@ future<json::json_return_type> executor::scan(client_state& client_state, std::s
    if (rjson::find(request_info, "FilterExpression")) {
        throw api_error("ValidationException", "FilterExpression is not yet implemented in alternator");
    }
+    if (get_int_attribute(request_info, "Segment") || get_int_attribute(request_info, "TotalSegments")) {
+        // FIXME: need to support parallel scan. See issue #5059.
+        throw api_error("ValidationException", "Scan Segment/TotalSegments is not yet implemented in alternator");
+    }

    rjson::value* exclusive_start_key = rjson::find(request_info, "ExclusiveStartKey");
    //FIXME(sarna): ScanFilter is deprecated in favor of FilterExpression
@@ -1921,7 +1968,7 @@ future<json::json_return_type> executor::scan(client_state& client_state, std::s
        partition_ranges = filtering_restrictions->get_partition_key_ranges(query_options);
        ck_bounds = filtering_restrictions->get_clustering_bounds(query_options);
    }
-    return do_query(schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl, std::move(filtering_restrictions), client_state, _stats.cql_stats);
+    return do_query(schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl, std::move(filtering_restrictions), client_state, _stats.cql_stats, trace_state);
 }

 static dht::partition_range calculate_pk_bound(schema_ptr schema, const column_definition& pk_cdef, comparison_operator_type op, const rjson::value& attrs) {
@@ -2044,14 +2091,14 @@ calculate_bounds(schema_ptr schema, const rjson::value& conditions) {
    return {std::move(partition_ranges), std::move(ck_bounds)};
 }

-future<json::json_return_type> executor::query(client_state& client_state, std::string content) {
+future<json::json_return_type> executor::query(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content) {
    _stats.api_operations.query++;
    rjson::value request_info = rjson::parse(content);
    elogger.trace("Querying {}", request_info);

    schema_ptr schema = get_table_or_view(_proxy, request_info);

-    tracing::add_table_name(client_state.get_trace_state(), schema->ks_name(), schema->cf_name());
+    tracing::add_table_name(trace_state, schema->ks_name(), schema->cf_name());

    rjson::value* exclusive_start_key = rjson::find(request_info, "ExclusiveStartKey");
    db::consistency_level cl = get_read_consistency(request_info);
@@ -2067,6 +2114,11 @@ future<json::json_return_type> executor::query(client_state& client_state, std::
    if (rjson::find(request_info, "FilterExpression")) {
        throw api_error("ValidationException", "FilterExpression is not yet implemented in alternator");
    }
+    bool forward = get_bool_attribute(request_info, "ScanIndexForward", true);
+    if (!forward) {
+        // FIXME: need to support the !forward (i.e., reverse sort order) case. See issue #5153.
+        throw api_error("ValidationException", "ScanIndexForward=false is not yet implemented in alternator");
+    }

    //FIXME(sarna): KeyConditions are deprecated in favor of KeyConditionExpression
    rjson::value& conditions = rjson::get(request_info, "KeyConditions");
@@ -2089,7 +2141,7 @@ future<json::json_return_type> executor::query(client_state& client_state, std::
            throw api_error("ValidationException", format("QueryFilter can only contain non-primary key attributes: Primary key attribute: {}", ck_defs.front()->name_as_text()));
        }
    }
-    return do_query(schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl, std::move(filtering_restrictions), client_state, _stats.cql_stats);
+    return do_query(schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl, std::move(filtering_restrictions), client_state, _stats.cql_stats, std::move(trace_state));
 }

 static void validate_limit(int limit) {
@@ -2198,18 +2250,20 @@ future<> executor::maybe_create_keyspace() {
    });
 }

-static void create_tracing_session(executor::client_state& client_state) {
+static tracing::trace_state_ptr create_tracing_session() {
    tracing::trace_state_props_set props;
    props.set<tracing::trace_state_props::full_tracing>();
-    client_state.create_tracing_session(tracing::trace_type::QUERY, props);
+    return tracing::tracing::get_local_tracing_instance().create_session(tracing::trace_type::QUERY, props);
 }

-void executor::maybe_trace_query(client_state& client_state, sstring_view op, sstring_view query) {
+tracing::trace_state_ptr executor::maybe_trace_query(client_state& client_state, sstring_view op, sstring_view query) {
+    tracing::trace_state_ptr trace_state;
    if (tracing::tracing::get_local_tracing_instance().trace_next_query()) {
-        create_tracing_session(client_state);
-        tracing::add_query(client_state.get_trace_state(), query);
-        tracing::begin(client_state.get_trace_state(), format("Alternator {}", op), client_state.get_client_address());
+        trace_state = create_tracing_session();
+        tracing::add_query(trace_state, query);
+        tracing::begin(trace_state, format("Alternator {}", op), client_state.get_client_address());
    }
+    return trace_state;
 }

 future<> executor::start() {
--- a/alternator/executor.hh
+++ b/alternator/executor.hh
@@ -46,26 +46,26 @@ public:

    executor(service::storage_proxy& proxy, service::migration_manager& mm) : _proxy(proxy), _mm(mm) {}

-    future<json::json_return_type> create_table(client_state& client_state, std::string content);
-    future<json::json_return_type> describe_table(client_state& client_state, std::string content);
-    future<json::json_return_type> delete_table(client_state& client_state, std::string content);
-    future<json::json_return_type> put_item(client_state& client_state, std::string content);
-    future<json::json_return_type> get_item(client_state& client_state, std::string content);
-    future<json::json_return_type> delete_item(client_state& client_state, std::string content);
-    future<json::json_return_type> update_item(client_state& client_state, std::string content);
+    future<json::json_return_type> create_table(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content);
+    future<json::json_return_type> describe_table(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content);
+    future<json::json_return_type> delete_table(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content);
+    future<json::json_return_type> put_item(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content);
+    future<json::json_return_type> get_item(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content);
+    future<json::json_return_type> delete_item(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content);
+    future<json::json_return_type> update_item(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content);
    future<json::json_return_type> list_tables(client_state& client_state, std::string content);
-    future<json::json_return_type> scan(client_state& client_state, std::string content);
+    future<json::json_return_type> scan(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content);
    future<json::json_return_type> describe_endpoints(client_state& client_state, std::string content, std::string host_header);
-    future<json::json_return_type> batch_write_item(client_state& client_state, std::string content);
-    future<json::json_return_type> batch_get_item(client_state& client_state, std::string content);
-    future<json::json_return_type> query(client_state& client_state, std::string content);
+    future<json::json_return_type> batch_write_item(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content);
+    future<json::json_return_type> batch_get_item(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content);
+    future<json::json_return_type> query(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content);

    future<> start();
    future<> stop() { return make_ready_future<>(); }

    future<> maybe_create_keyspace();

-    static void maybe_trace_query(client_state& client_state, sstring_view op, sstring_view query);
+    static tracing::trace_state_ptr maybe_trace_query(client_state& client_state, sstring_view op, sstring_view query);
 };

 }
--- a/alternator/rjson.cc
+++ b/alternator/rjson.cc
@@ -113,6 +113,58 @@ void push_back(rjson::value& base_array, rjson::value&& item) {

 }

+bool single_value_comp::operator()(const rjson::value& r1, const rjson::value& r2) const {
+   auto r1_type = r1.GetType();
+   auto r2_type = r2.GetType();
+
+   // null is the smallest type and compares with every other type, nothing is lesser than null
+   if (r1_type == rjson::type::kNullType || r2_type == rjson::type::kNullType) {
+       return r1_type < r2_type;
+   }
+   // only null, true, and false are comparable with each other, other types are not compatible
+   if (r1_type != r2_type) {
+       if (r1_type > rjson::type::kTrueType || r2_type > rjson::type::kTrueType) {
+           throw rjson::error(format("Types are not comparable: {} {}", r1, r2));
+       }
+   }
+
+   switch (r1_type) {
+   case rjson::type::kNullType:
+       // fall-through
+   case rjson::type::kFalseType:
+       // fall-through
+   case rjson::type::kTrueType:
+       return r1_type < r2_type;
+   case rjson::type::kObjectType:
+       throw rjson::error("Object type comparison is not supported");
+   case rjson::type::kArrayType:
+       throw rjson::error("Array type comparison is not supported");
+   case rjson::type::kStringType: {
+       const size_t r1_len = r1.GetStringLength();
+       const size_t r2_len = r2.GetStringLength();
+       size_t len = std::min(r1_len, r2_len);
+       int result = std::strncmp(r1.GetString(), r2.GetString(), len);
+       return result < 0 || (result == 0 && r1_len < r2_len);
+   }
+   case rjson::type::kNumberType: {
+       if (r1.IsInt() && r2.IsInt()) {
+           return r1.GetInt() < r2.GetInt();
+       } else if (r1.IsUint() && r2.IsUint()) {
+           return r1.GetUint() < r2.GetUint();
+       } else if (r1.IsInt64() && r2.IsInt64()) {
+           return r1.GetInt64() < r2.GetInt64();
+       } else if (r1.IsUint64() && r2.IsUint64()) {
+           return r1.GetUint64() < r2.GetUint64();
+       } else {
+           // it's safe to call GetDouble() on any number type
+           return r1.GetDouble() < r2.GetDouble();
+       }
+   }
+   default:
+       return false;
+   }
+}
+
 } // end namespace rjson

 std::ostream& std::operator<<(std::ostream& os, const rjson::value& v) {
--- a/alternator/rjson.hh
+++ b/alternator/rjson.hh
@@ -152,6 +152,10 @@ void set(rjson::value& base, rjson::string_ref_type name, rjson::string_ref_type
 // Throws if base_array is not a JSON array.
 void push_back(rjson::value& base_array, rjson::value&& item);

+struct single_value_comp {
+    bool operator()(const rjson::value& r1, const rjson::value& r2) const;
+};
+
 } // end namespace rjson

 namespace std {
--- a/alternator/serialization.cc
+++ b/alternator/serialization.cc
@@ -25,6 +25,7 @@
 #include "error.hh"
 #include "rapidjson/writer.h"
 #include "concrete_types.hh"
+#include "cql3/type_json.hh"

 static logging::logger slogger("alternator-serialization");

@@ -77,7 +78,7 @@ struct from_json_visitor {
    }
    // default
    void operator()(const abstract_type& t) const {
-        bo.write(t.from_json_object(Json::Value(rjson::print(v)), cql_serialization_format::internal()));
+        bo.write(from_json_object(t, Json::Value(rjson::print(v)), cql_serialization_format::internal()));
    }
 };

@@ -107,7 +108,7 @@ struct to_json_visitor {

    void operator()(const reversed_type_impl& t) const { visit(*t.underlying_type(), to_json_visitor{deserialized, type_ident, bv}); };
    void operator()(const decimal_type_impl& t) const {
-        auto s = decimal_type->to_json_string(bytes(bv));
+        auto s = to_json_string(*decimal_type, bytes(bv));
        //FIXME(sarna): unnecessary copy
        rjson::set_with_string_name(deserialized, type_ident, rjson::from_string(s));
    }
@@ -194,7 +195,7 @@ rjson::value json_key_column_value(bytes_view cell, const column_definition& col
        // FIXME: use specialized Alternator number type, not the more
        // general "decimal_type". A dedicated type can be more efficient
        // in storage space and in parsing speed.
-        auto s = decimal_type->to_json_string(bytes(cell));
+        auto s = to_json_string(*decimal_type, bytes(cell));
        return rjson::from_string(s);
    } else {
        // We shouldn't get here, we shouldn't see such key columns.
@@ -245,4 +246,16 @@ big_decimal unwrap_number(const rjson::value& v, std::string_view diagnostic) {
    return big_decimal(it->value.GetString());
 }

+const std::pair<std::string, const rjson::value*> unwrap_set(const rjson::value& v) {
+    if (!v.IsObject() || v.MemberCount() != 1) {
+        return {"", nullptr};
+    }
+    auto it = v.MemberBegin();
+    const std::string it_key = it->name.GetString();
+    if (it_key != "SS" && it_key != "BS" && it_key != "NS") {
+        return {"", nullptr};
+    }
+    return std::make_pair(it_key, &(it->value));
+}
+
 }
--- a/alternator/serialization.hh
+++ b/alternator/serialization.hh
@@ -63,4 +63,10 @@ clustering_key ck_from_json(const rjson::value& item, schema_ptr schema);
 // If v encodes a number (i.e., it is a {"N": [...]}, returns an object representing it.  Otherwise,
 // raises ValidationException with diagnostic.
 big_decimal unwrap_number(const rjson::value& v, std::string_view diagnostic);
+
+// Check if a given JSON object encodes a set (i.e., it is a {"SS": [...]}, or "NS", "BS"
+// and returns set's type and a pointer to that set. If the object does not encode a set,
+// returned value is {"", nullptr}
+const std::pair<std::string, const rjson::value*> unwrap_set(const rjson::value& v);
+
 }
--- a/alternator/server.cc
+++ b/alternator/server.cc
@@ -215,6 +215,7 @@ future<> server::verify_signature(const request& req) {
 }

 future<json::json_return_type> server::handle_api_request(std::unique_ptr<request>&& req) {
+    _executor.local()._stats.total_operations++;
    sstring target = req->get_header(TARGET);
    std::vector<std::string_view> split_target = split(target, '.');
    //NOTICE(sarna): Target consists of Dynamo API version followed by a dot '.' and operation type (e.g. CreateTable)
@@ -231,9 +232,9 @@ future<json::json_return_type> server::handle_api_request(std::unique_ptr<reques
        // We use unique_ptr because client_state cannot be moved or copied
        return do_with(std::make_unique<executor::client_state>(executor::client_state::internal_tag()), [this, callback_it = std::move(callback_it), op = std::move(op), req = std::move(req)] (std::unique_ptr<executor::client_state>& client_state) mutable {
            client_state->set_raw_keyspace(executor::KEYSPACE_NAME);
-            executor::maybe_trace_query(*client_state, op, req->content);
-            tracing::trace(client_state->get_trace_state(), op);
-            return callback_it->second(_executor.local(), *client_state, std::move(req));
+            tracing::trace_state_ptr trace_state = executor::maybe_trace_query(*client_state, op, req->content);
+            tracing::trace(trace_state, op);
+            return callback_it->second(_executor.local(), *client_state, trace_state, std::move(req)).finally([trace_state] {});
        });
    });
 }
@@ -253,21 +254,21 @@ void server::set_routes(routes& r) {
 server::server(seastar::sharded<executor>& e)
        : _executor(e), _key_cache(1024, 1min, slogger), _enforce_authorization(false)
      , _callbacks{
-        {"CreateTable", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) {
-            return e.maybe_create_keyspace().then([&e, &client_state, req = std::move(req)] { return e.create_table(client_state, req->content); }); }
+        {"CreateTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, std::unique_ptr<request> req) {
+            return e.maybe_create_keyspace().then([&e, &client_state, req = std::move(req), trace_state = std::move(trace_state)] () mutable { return e.create_table(client_state, std::move(trace_state), req->content); }); }
        },
-        {"DescribeTable", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.describe_table(client_state, req->content); }},
-        {"DeleteTable", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.delete_table(client_state, req->content); }},
-        {"PutItem", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.put_item(client_state, req->content); }},
-        {"UpdateItem", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.update_item(client_state, req->content); }},
-        {"GetItem", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.get_item(client_state, req->content); }},
-        {"DeleteItem", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.delete_item(client_state, req->content); }},
-        {"ListTables", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.list_tables(client_state, req->content); }},
-        {"Scan", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.scan(client_state, req->content); }},
-        {"DescribeEndpoints", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.describe_endpoints(client_state, req->content, req->get_header("Host")); }},
-        {"BatchWriteItem", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.batch_write_item(client_state, req->content); }},
-        {"BatchGetItem", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.batch_get_item(client_state, req->content); }},
-        {"Query", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.query(client_state, req->content); }},
+        {"DescribeTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, std::unique_ptr<request> req) { return e.describe_table(client_state, std::move(trace_state), req->content); }},
+        {"DeleteTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, std::unique_ptr<request> req) { return e.delete_table(client_state, std::move(trace_state), req->content); }},
+        {"PutItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, std::unique_ptr<request> req) { return e.put_item(client_state, std::move(trace_state), req->content); }},
+        {"UpdateItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, std::unique_ptr<request> req) { return e.update_item(client_state, std::move(trace_state), req->content); }},
+        {"GetItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, std::unique_ptr<request> req) { return e.get_item(client_state, std::move(trace_state), req->content); }},
+        {"DeleteItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, std::unique_ptr<request> req) { return e.delete_item(client_state, std::move(trace_state), req->content); }},
+        {"ListTables", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, std::unique_ptr<request> req) { return e.list_tables(client_state, req->content); }},
+        {"Scan", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, std::unique_ptr<request> req) { return e.scan(client_state, std::move(trace_state), req->content); }},
+        {"DescribeEndpoints", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, std::unique_ptr<request> req) { return e.describe_endpoints(client_state, req->content, req->get_header("Host")); }},
+        {"BatchWriteItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, std::unique_ptr<request> req) { return e.batch_write_item(client_state, std::move(trace_state), req->content); }},
+        {"BatchGetItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, std::unique_ptr<request> req) { return e.batch_get_item(client_state, std::move(trace_state), req->content); }},
+        {"Query", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, std::unique_ptr<request> req) { return e.query(client_state, std::move(trace_state), req->content); }},
    } {
 }

@@ -300,9 +301,11 @@ future<> server::init(net::inet_address addr, std::optional<uint16_t> port, std:
                slogger.info("Alternator HTTPS server listening on {} port {}", addr, *https_port);
            }
        } catch (...) {
-            slogger.warn("Failed to set up Alternator HTTP server on {} port {}, TLS port {}: {}",
+            slogger.error("Failed to set up Alternator HTTP server on {} port {}, TLS port {}: {}",
                    addr, port ? std::to_string(*port) : "OFF", https_port ? std::to_string(*https_port) : "OFF", std::current_exception());
-            throw;
+            std::throw_with_nested(std::runtime_error(
+                    format("Failed to set up Alternator HTTP server on {} port {}, TLS port {}",
+                            addr, port ? std::to_string(*port) : "OFF", https_port ? std::to_string(*https_port) : "OFF")));
        }
    });
 }
--- a/alternator/server.hh
+++ b/alternator/server.hh
@@ -31,7 +31,7 @@
 namespace alternator {

 class server {
-    using alternator_callback = std::function<future<json::json_return_type>(executor&, executor::client_state&, std::unique_ptr<request>)>;
+    using alternator_callback = std::function<future<json::json_return_type>(executor&, executor::client_state&, tracing::trace_state_ptr, std::unique_ptr<request>)>;
    using alternator_callbacks_map = std::unordered_map<std::string_view, alternator_callback>;

    seastar::httpd::http_server_control _control;
--- a/api/api-doc/cache_service.json
+++ b/api/api-doc/cache_service.json
@@ -13,7 +13,7 @@
            {
               "method":"GET",
               "summary":"get row cache save period in seconds",
-               "type":"int",
+               "type": "long",
               "nickname":"get_row_cache_save_period_in_seconds",
               "produces":[
                  "application/json"
@@ -35,7 +35,7 @@
                     "description":"row cache save period in seconds",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -48,7 +48,7 @@
            {
               "method":"GET",
               "summary":"get key cache save period in seconds",
-               "type":"int",
+               "type": "long",
               "nickname":"get_key_cache_save_period_in_seconds",
               "produces":[
                  "application/json"
@@ -70,7 +70,7 @@
                     "description":"key cache save period in seconds",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -83,7 +83,7 @@
            {
               "method":"GET",
               "summary":"get counter cache save period in seconds",
-               "type":"int",
+               "type": "long",
               "nickname":"get_counter_cache_save_period_in_seconds",
               "produces":[
                  "application/json"
@@ -105,7 +105,7 @@
                     "description":"counter cache save period in seconds",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -118,7 +118,7 @@
            {
               "method":"GET",
               "summary":"get row cache keys to save",
-               "type":"int",
+               "type": "long",
               "nickname":"get_row_cache_keys_to_save",
               "produces":[
                  "application/json"
@@ -140,7 +140,7 @@
                     "description":"row cache keys to save",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -153,7 +153,7 @@
            {
               "method":"GET",
               "summary":"get key cache keys to save",
-               "type":"int",
+               "type": "long",
               "nickname":"get_key_cache_keys_to_save",
               "produces":[
                  "application/json"
@@ -175,7 +175,7 @@
                     "description":"key cache keys to save",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -188,7 +188,7 @@
            {
               "method":"GET",
               "summary":"get counter cache keys to save",
-               "type":"int",
+               "type": "long",
               "nickname":"get_counter_cache_keys_to_save",
               "produces":[
                  "application/json"
@@ -210,7 +210,7 @@
                     "description":"counter cache keys to save",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -448,7 +448,7 @@
        {
          "method": "GET",
          "summary": "Get key entries",
-          "type": "int",
+          "type": "long",
          "nickname": "get_key_entries",
          "produces": [
            "application/json"
@@ -568,7 +568,7 @@
        {
          "method": "GET",
          "summary": "Get row entries",
-          "type": "int",
+          "type": "long",
          "nickname": "get_row_entries",
          "produces": [
            "application/json"
@@ -688,7 +688,7 @@
        {
          "method": "GET",
          "summary": "Get counter entries",
-          "type": "int",
+          "type": "long",
          "nickname": "get_counter_entries",
          "produces": [
            "application/json"
--- a/api/api-doc/column_family.json
+++ b/api/api-doc/column_family.json
@@ -121,7 +121,7 @@
                     "description":"The minimum number of sstables in queue before compaction kicks off",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -172,7 +172,7 @@
                     "description":"The maximum number of sstables in queue before compaction kicks off",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -223,7 +223,7 @@
                     "description":"The maximum number of sstables in queue before compaction kicks off",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  },
                  {
@@ -231,7 +231,7 @@
                     "description":"The minimum number of sstables in queue before compaction kicks off",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -544,7 +544,7 @@
               "summary":"sstable count for each level. empty unless leveled compaction is used",
               "type":"array",
               "items":{
-                  "type":"int"
+                  "type": "long"
               },
               "nickname":"get_sstable_count_per_level",
               "produces":[
@@ -636,7 +636,7 @@
                     "description":"Duration (in milliseconds) of monitoring operation",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  },
                  {
@@ -644,7 +644,7 @@
                    "description":"number of the top partitions to list",
                    "required":false,
                    "allowMultiple":false,
-                    "type":"int",
+                    "type": "long",
                    "paramType":"query"
                 },
                 {
@@ -652,7 +652,7 @@
                    "description":"capacity of stream summary: determines amount of resources used in query processing",
                    "required":false,
                    "allowMultiple":false,
-                    "type":"int",
+                    "type": "long",
                    "paramType":"query"
                 }
              ]
@@ -921,7 +921,7 @@
            {
               "method":"GET",
               "summary":"Get memtable switch count",
-               "type":"int",
+               "type": "long",
               "nickname":"get_memtable_switch_count",
               "produces":[
                  "application/json"
@@ -945,7 +945,7 @@
            {
               "method":"GET",
               "summary":"Get all memtable switch count",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_memtable_switch_count",
               "produces":[
                  "application/json"
@@ -1082,7 +1082,7 @@
            {
               "method":"GET",
               "summary":"Get read latency",
-               "type":"int",
+               "type": "long",
               "nickname":"get_read_latency",
               "produces":[
                  "application/json"
@@ -1235,7 +1235,7 @@
            {
               "method":"GET",
               "summary":"Get all read latency",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_read_latency",
               "produces":[
                  "application/json"
@@ -1251,7 +1251,7 @@
            {
               "method":"GET",
               "summary":"Get range latency",
-               "type":"int",
+               "type": "long",
               "nickname":"get_range_latency",
               "produces":[
                  "application/json"
@@ -1275,7 +1275,7 @@
            {
               "method":"GET",
               "summary":"Get all range latency",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_range_latency",
               "produces":[
                  "application/json"
@@ -1291,7 +1291,7 @@
            {
               "method":"GET",
               "summary":"Get write latency",
-               "type":"int",
+               "type": "long",
               "nickname":"get_write_latency",
               "produces":[
                  "application/json"
@@ -1444,7 +1444,7 @@
            {
               "method":"GET",
               "summary":"Get all write latency",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_write_latency",
               "produces":[
                  "application/json"
@@ -1460,7 +1460,7 @@
            {
               "method":"GET",
               "summary":"Get pending flushes",
-               "type":"int",
+               "type": "long",
               "nickname":"get_pending_flushes",
               "produces":[
                  "application/json"
@@ -1484,7 +1484,7 @@
            {
               "method":"GET",
               "summary":"Get all pending flushes",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_pending_flushes",
               "produces":[
                  "application/json"
@@ -1500,7 +1500,7 @@
            {
               "method":"GET",
               "summary":"Get pending compactions",
-               "type":"int",
+               "type": "long",
               "nickname":"get_pending_compactions",
               "produces":[
                  "application/json"
@@ -1524,7 +1524,7 @@
            {
               "method":"GET",
               "summary":"Get all pending compactions",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_pending_compactions",
               "produces":[
                  "application/json"
@@ -1540,7 +1540,7 @@
            {
               "method":"GET",
               "summary":"Get live ss table count",
-               "type":"int",
+               "type": "long",
               "nickname":"get_live_ss_table_count",
               "produces":[
                  "application/json"
@@ -1564,7 +1564,7 @@
            {
               "method":"GET",
               "summary":"Get all live ss table count",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_live_ss_table_count",
               "produces":[
                  "application/json"
@@ -1580,7 +1580,7 @@
            {
               "method":"GET",
               "summary":"Get live disk space used",
-               "type":"int",
+               "type": "long",
               "nickname":"get_live_disk_space_used",
               "produces":[
                  "application/json"
@@ -1604,7 +1604,7 @@
            {
               "method":"GET",
               "summary":"Get all live disk space used",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_live_disk_space_used",
               "produces":[
                  "application/json"
@@ -1620,7 +1620,7 @@
            {
               "method":"GET",
               "summary":"Get total disk space used",
-               "type":"int",
+               "type": "long",
               "nickname":"get_total_disk_space_used",
               "produces":[
                  "application/json"
@@ -1644,7 +1644,7 @@
            {
               "method":"GET",
               "summary":"Get all total disk space used",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_total_disk_space_used",
               "produces":[
                  "application/json"
@@ -2100,7 +2100,7 @@
            {
               "method":"GET",
               "summary":"Get speculative retries",
-               "type":"int",
+               "type": "long",
               "nickname":"get_speculative_retries",
               "produces":[
                  "application/json"
@@ -2124,7 +2124,7 @@
            {
               "method":"GET",
               "summary":"Get all speculative retries",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_speculative_retries",
               "produces":[
                  "application/json"
@@ -2204,7 +2204,7 @@
            {
               "method":"GET",
               "summary":"Get row cache hit out of range",
-               "type":"int",
+               "type": "long",
               "nickname":"get_row_cache_hit_out_of_range",
               "produces":[
                  "application/json"
@@ -2228,7 +2228,7 @@
            {
               "method":"GET",
               "summary":"Get all row cache hit out of range",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_row_cache_hit_out_of_range",
               "produces":[
                  "application/json"
@@ -2244,7 +2244,7 @@
            {
               "method":"GET",
               "summary":"Get row cache hit",
-               "type":"int",
+               "type": "long",
               "nickname":"get_row_cache_hit",
               "produces":[
                  "application/json"
@@ -2268,7 +2268,7 @@
            {
               "method":"GET",
               "summary":"Get all row cache hit",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_row_cache_hit",
               "produces":[
                  "application/json"
@@ -2284,7 +2284,7 @@
            {
               "method":"GET",
               "summary":"Get row cache miss",
-               "type":"int",
+               "type": "long",
               "nickname":"get_row_cache_miss",
               "produces":[
                  "application/json"
@@ -2308,7 +2308,7 @@
            {
               "method":"GET",
               "summary":"Get all row cache miss",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_row_cache_miss",
               "produces":[
                  "application/json"
@@ -2324,7 +2324,7 @@
            {
               "method":"GET",
               "summary":"Get cas prepare",
-               "type":"int",
+               "type": "long",
               "nickname":"get_cas_prepare",
               "produces":[
                  "application/json"
@@ -2348,7 +2348,7 @@
            {
               "method":"GET",
               "summary":"Get cas propose",
-               "type":"int",
+               "type": "long",
               "nickname":"get_cas_propose",
               "produces":[
                  "application/json"
@@ -2372,7 +2372,7 @@
            {
               "method":"GET",
               "summary":"Get cas commit",
-               "type":"int",
+               "type": "long",
               "nickname":"get_cas_commit",
               "produces":[
                  "application/json"
--- a/api/api-doc/compaction_manager.json
+++ b/api/api-doc/compaction_manager.json
@@ -118,7 +118,7 @@
        {
          "method": "GET",
          "summary": "Get pending tasks",
-          "type": "int",
+          "type": "long",
          "nickname": "get_pending_tasks",
          "produces": [
            "application/json"
@@ -181,7 +181,7 @@
        {
          "method": "GET",
          "summary": "Get bytes compacted",
-          "type": "int",
+          "type": "long",
          "nickname": "get_bytes_compacted",
          "produces": [
            "application/json"
@@ -197,7 +197,7 @@
         "description":"A row merged information",
         "properties":{
            "key":{
-               "type":"int",
+               "type": "long",
               "description":"The number of sstable"
            },
            "value":{
--- a/api/api-doc/failure_detector.json
+++ b/api/api-doc/failure_detector.json
@@ -110,7 +110,7 @@
            {
               "method":"GET",
               "summary":"Get count down endpoint",
-               "type":"int",
+               "type": "long",
               "nickname":"get_down_endpoint_count",
               "produces":[
                  "application/json"
@@ -126,7 +126,7 @@
            {
               "method":"GET",
               "summary":"Get count up endpoint",
-               "type":"int",
+               "type": "long",
               "nickname":"get_up_endpoint_count",
               "produces":[
                  "application/json"
@@ -180,11 +180,11 @@
                    "description": "The endpoint address"
                },
                "generation": {
-                    "type": "int",
+                    "type": "long",
                    "description": "The heart beat generation"
                },
                "version": {
-                    "type": "int",
+                    "type": "long",
                    "description": "The heart beat version"
                },
                "update_time": {
@@ -209,7 +209,7 @@
           "description": "Holds a version value for an application state",
               "properties": {
                "application_state": {
-                    "type": "int",
+                    "type": "long",
                    "description": "The application state enum index"
                },
                "value": {
@@ -217,7 +217,7 @@
                    "description": "The version value"
                },
                "version": {
-                    "type": "int",
+                    "type": "long",
                    "description": "The application state version"
                }
            }
--- a/api/api-doc/gossiper.json
+++ b/api/api-doc/gossiper.json
@@ -75,7 +75,7 @@
            {
               "method":"GET",
               "summary":"Returns files which are pending for archival attempt. Does NOT include failed archive attempts",
-               "type":"int",
+               "type": "long",
               "nickname":"get_current_generation_number",
               "produces":[
                  "application/json"
@@ -99,7 +99,7 @@
            {
               "method":"GET",
               "summary":"Get heart beat version for a node",
-               "type":"int",
+               "type": "long",
               "nickname":"get_current_heart_beat_version",
               "produces":[
                  "application/json"
--- a/api/api-doc/hinted_handoff.json
+++ b/api/api-doc/hinted_handoff.json
@@ -99,7 +99,7 @@
        {
          "method": "GET",
          "summary": "Get create hint count",
-          "type": "int",
+          "type": "long",
          "nickname": "get_create_hint_count",
          "produces": [
            "application/json"
@@ -123,7 +123,7 @@
        {
          "method": "GET",
          "summary": "Get not stored hints count",
-          "type": "int",
+          "type": "long",
          "nickname": "get_not_stored_hints_count",
          "produces": [
            "application/json"
--- a/api/api-doc/messaging_service.json
+++ b/api/api-doc/messaging_service.json
@@ -191,7 +191,7 @@
            {
               "method":"GET",
               "summary":"Get the version number",
-               "type":"int",
+               "type": "long",
               "nickname":"get_version",
               "produces":[
                  "application/json"
--- a/api/api-doc/storage_proxy.json
+++ b/api/api-doc/storage_proxy.json
@@ -105,7 +105,7 @@
            {
               "method":"GET",
               "summary":"Get the max hint window",
-               "type":"int",
+               "type": "long",
               "nickname":"get_max_hint_window",
               "produces":[
                  "application/json"
@@ -128,7 +128,7 @@
                     "description":"max hint window in ms",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -141,7 +141,7 @@
            {
               "method":"GET",
               "summary":"Get max hints in progress",
-               "type":"int",
+               "type": "long",
               "nickname":"get_max_hints_in_progress",
               "produces":[
                  "application/json"
@@ -164,7 +164,7 @@
                     "description":"max hints in progress",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -177,7 +177,7 @@
            {
               "method":"GET",
               "summary":"get hints in progress",
-               "type":"int",
+               "type": "long",
               "nickname":"get_hints_in_progress",
               "produces":[
                  "application/json"
@@ -602,7 +602,7 @@
        {
          "method": "GET",
          "summary": "Get cas write metrics",
-          "type": "int",
+          "type": "long",
          "nickname": "get_cas_write_metrics_unfinished_commit",
          "produces": [
            "application/json"
@@ -632,7 +632,7 @@
        {
          "method": "GET",
          "summary": "Get cas write metrics",
-          "type": "int",
+          "type": "long",
          "nickname": "get_cas_write_metrics_condition_not_met",
          "produces": [
            "application/json"
@@ -647,7 +647,7 @@
        {
          "method": "GET",
          "summary": "Get cas read metrics",
-          "type": "int",
+          "type": "long",
          "nickname": "get_cas_read_metrics_unfinished_commit",
          "produces": [
            "application/json"
@@ -677,7 +677,7 @@
        {
          "method": "GET",
          "summary": "Get read metrics",
-          "type": "int",
+          "type": "long",
          "nickname": "get_read_metrics_timeouts",
          "produces": [
            "application/json"
@@ -692,7 +692,7 @@
        {
          "method": "GET",
          "summary": "Get read metrics",
-          "type": "int",
+          "type": "long",
          "nickname": "get_read_metrics_unavailables",
          "produces": [
            "application/json"
@@ -827,7 +827,7 @@
        {
          "method": "GET",
          "summary": "Get range metrics",
-          "type": "int",
+          "type": "long",
          "nickname": "get_range_metrics_timeouts",
          "produces": [
            "application/json"
@@ -842,7 +842,7 @@
        {
          "method": "GET",
          "summary": "Get range metrics",
-          "type": "int",
+          "type": "long",
          "nickname": "get_range_metrics_unavailables",
          "produces": [
            "application/json"
@@ -887,7 +887,7 @@
        {
          "method": "GET",
          "summary": "Get write metrics",
-          "type": "int",
+          "type": "long",
          "nickname": "get_write_metrics_timeouts",
          "produces": [
            "application/json"
@@ -902,7 +902,7 @@
        {
          "method": "GET",
          "summary": "Get write metrics",
-          "type": "int",
+          "type": "long",
          "nickname": "get_write_metrics_unavailables",
          "produces": [
            "application/json"
@@ -1008,7 +1008,7 @@
            {
               "method":"GET",
               "summary":"Get read latency",
-               "type":"int",
+               "type": "long",
               "nickname":"get_read_latency",
               "produces":[
                  "application/json"
@@ -1040,7 +1040,7 @@
            {
               "method":"GET",
               "summary":"Get write latency",
-               "type":"int",
+               "type": "long",
               "nickname":"get_write_latency",
               "produces":[
                  "application/json"
@@ -1072,7 +1072,7 @@
            {
               "method":"GET",
               "summary":"Get range latency",
-               "type":"int",
+               "type": "long",
               "nickname":"get_range_latency",
               "produces":[
                  "application/json"
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -458,7 +458,7 @@
            {
               "method":"GET",
               "summary":"Return the generation value for this node.",
-               "type":"int",
+               "type": "long",
               "nickname":"get_current_generation_number",
               "produces":[
                  "application/json"
@@ -646,7 +646,7 @@
            {
               "method":"POST",
               "summary":"Trigger a cleanup of keys on a single keyspace",
-               "type":"int",
+               "type": "long",
               "nickname":"force_keyspace_cleanup",
               "produces":[
                  "application/json"
@@ -678,7 +678,7 @@
            {
               "method":"GET",
               "summary":"Scrub (deserialize + reserialize at the latest version, skipping bad rows if any) the given keyspace. If columnFamilies array is empty, all CFs are scrubbed. Scrubbed CFs will be snapshotted first, if disableSnapshot is false",
-               "type":"int",
+               "type": "long",
               "nickname":"scrub",
               "produces":[
                  "application/json"
@@ -726,7 +726,7 @@
            {
               "method":"GET",
               "summary":"Rewrite all sstables to the latest version. Unlike scrub, it doesn't skip bad rows and do not snapshot sstables first.",
-               "type":"int",
+               "type": "long",
               "nickname":"upgrade_sstables",
               "produces":[
                  "application/json"
@@ -800,7 +800,7 @@
               "summary":"Return an array with the ids of the currently active repairs",
               "type":"array",
               "items":{
-                  "type":"int"
+                  "type": "long"
               },
               "nickname":"get_active_repair_async",
               "produces":[
@@ -816,7 +816,7 @@
            {
               "method":"POST",
               "summary":"Invoke repair asynchronously. You can track repair progress by using the get supplying id",
-               "type":"int",
+               "type": "long",
               "nickname":"repair_async",
               "produces":[
                  "application/json"
@@ -947,7 +947,7 @@
                     "description":"The repair ID to check for status",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -1277,18 +1277,18 @@
                  },
                  {
                     "name":"dynamic_update_interval",
-                     "description":"integer, in ms (default 100)",
+                     "description":"interval in ms (default 100)",
                     "required":false,
                     "allowMultiple":false,
-                     "type":"integer",
+                     "type":"long",
                     "paramType":"query"
                  },
                  {
                     "name":"dynamic_reset_interval",
-                     "description":"integer, in ms (default 600,000)",
+                     "description":"interval in ms (default 600,000)",
                     "required":false,
                     "allowMultiple":false,
-                     "type":"integer",
+                     "type":"long",
                     "paramType":"query"
                  },
                  {
@@ -1493,7 +1493,7 @@
                     "description":"Stream throughput",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -1501,7 +1501,7 @@
            {
               "method":"GET",
               "summary":"Get stream throughput mb per sec",
-               "type":"int",
+               "type": "long",
               "nickname":"get_stream_throughput_mb_per_sec",
               "produces":[
                  "application/json"
@@ -1517,7 +1517,7 @@
            {
               "method":"GET",
               "summary":"get compaction throughput mb per sec",
-               "type":"int",
+               "type": "long",
               "nickname":"get_compaction_throughput_mb_per_sec",
               "produces":[
                  "application/json"
@@ -1539,7 +1539,7 @@
                     "description":"compaction throughput",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -1943,7 +1943,7 @@
            {
               "method":"GET",
               "summary":"Returns the threshold for warning of queries with many tombstones",
-               "type":"int",
+               "type": "long",
               "nickname":"get_tombstone_warn_threshold",
               "produces":[
                  "application/json"
@@ -1965,7 +1965,7 @@
                     "description":"tombstone debug threshold",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -1978,7 +1978,7 @@
            {
               "method":"GET",
               "summary":"",
-               "type":"int",
+               "type": "long",
               "nickname":"get_tombstone_failure_threshold",
               "produces":[
                  "application/json"
@@ -2000,7 +2000,7 @@
                     "description":"tombstone debug threshold",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -2013,7 +2013,7 @@
            {
               "method":"GET",
               "summary":"Returns the threshold for rejecting queries due to a large batch size",
-               "type":"int",
+               "type": "long",
               "nickname":"get_batch_size_failure_threshold",
               "produces":[
                  "application/json"
@@ -2035,7 +2035,7 @@
                     "description":"batch size debug threshold",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -2059,7 +2059,7 @@
                     "description":"throttle in kb",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -2072,7 +2072,7 @@
            {
               "method":"GET",
               "summary":"Get load",
-               "type":"int",
+               "type": "long",
               "nickname":"get_metrics_load",
               "produces":[
                  "application/json"
@@ -2088,7 +2088,7 @@
            {
               "method":"GET",
               "summary":"Get exceptions",
-               "type":"int",
+               "type": "long",
               "nickname":"get_exceptions",
               "produces":[
                  "application/json"
@@ -2104,7 +2104,7 @@
            {
               "method":"GET",
               "summary":"Get total hints in progress",
-               "type":"int",
+               "type": "long",
               "nickname":"get_total_hints_in_progress",
               "produces":[
                  "application/json"
@@ -2120,7 +2120,7 @@
            {
               "method":"GET",
               "summary":"Get total hints",
-               "type":"int",
+               "type": "long",
               "nickname":"get_total_hints",
               "produces":[
                  "application/json"
--- a/api/api-doc/stream_manager.json
+++ b/api/api-doc/stream_manager.json
@@ -32,7 +32,7 @@
            {
               "method":"GET",
               "summary":"Get number of active outbound streams",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_active_streams_outbound",
               "produces":[
                  "application/json"
@@ -48,7 +48,7 @@
            {
               "method":"GET",
               "summary":"Get total incoming bytes",
-               "type":"int",
+               "type": "long",
               "nickname":"get_total_incoming_bytes",
               "produces":[
                  "application/json"
@@ -72,7 +72,7 @@
            {
               "method":"GET",
               "summary":"Get all total incoming bytes",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_total_incoming_bytes",
               "produces":[
                  "application/json"
@@ -88,7 +88,7 @@
            {
               "method":"GET",
               "summary":"Get total outgoing bytes",
-               "type":"int",
+               "type": "long",
               "nickname":"get_total_outgoing_bytes",
               "produces":[
                  "application/json"
@@ -112,7 +112,7 @@
            {
               "method":"GET",
               "summary":"Get all total outgoing bytes",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_total_outgoing_bytes",
               "produces":[
                  "application/json"
@@ -154,7 +154,7 @@
               "description":"The peer"
            },
            "session_index":{
-               "type":"int",
+               "type": "long",
               "description":"The session index"
            },
            "connecting":{
@@ -211,7 +211,7 @@
               "description":"The ID"
            },
            "files":{
-               "type":"int",
+               "type": "long",
               "description":"Number of files to transfer. Can be 0 if nothing to transfer for some streaming request."
            },
            "total_size":{
@@ -242,7 +242,7 @@
               "description":"The peer address"
            },
            "session_index":{
-               "type":"int",
+               "type": "long",
               "description":"The session index"
            },
            "file_name":{
--- a/api/api-doc/system.json
+++ b/api/api-doc/system.json
@@ -52,6 +52,21 @@
            }
         ]
      },
+      {
+         "path":"/system/uptime_ms",
+         "operations":[
+            {
+               "method":"GET",
+               "summary":"Get system uptime, in milliseconds",
+               "type":"long",
+               "nickname":"get_system_uptime",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[]
+            }
+         ]
+      },
      {
         "path":"/system/logger/{name}",
         "operations":[
--- a/api/api_init.hh
+++ b/api/api_init.hh
@@ -23,6 +23,8 @@
 #include "service/storage_proxy.hh"
 #include <seastar/http/httpd.hh>

+namespace service { class load_meter; }
+
 namespace api {

 struct http_context {
@@ -31,9 +33,11 @@ struct http_context {
    httpd::http_server_control http_server;
    distributed<database>& db;
    distributed<service::storage_proxy>& sp;
+    service::load_meter& lmeter;
    http_context(distributed<database>& _db,
-            distributed<service::storage_proxy>& _sp)
-            : db(_db), sp(_sp) {
+            distributed<service::storage_proxy>& _sp,
+            service::load_meter& _lm)
+            : db(_db), sp(_sp), lmeter(_lm) {
    }
 };

--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -27,6 +27,7 @@
 #include <boost/range/adaptor/map.hpp>
 #include <boost/range/adaptor/filtered.hpp>
 #include "service/storage_service.hh"
+#include "service/load_meter.hh"
 #include "db/commitlog/commitlog.hh"
 #include "gms/gossiper.hh"
 #include "db/system_keyspace.hh"
@@ -55,26 +56,22 @@ static sstring validate_keyspace(http_context& ctx, const parameters& param) {
    throw bad_param_exception("Keyspace " + param["keyspace"] + " Does not exist");
 }

-static std::vector<ss::token_range> describe_ring(const sstring& keyspace) {
-    std::vector<ss::token_range> res;
-    for (auto d : service::get_local_storage_service().describe_ring(keyspace)) {
-        ss::token_range r;
-        r.start_token = d._start_token;
-        r.end_token = d._end_token;
-        r.endpoints = d._endpoints;
-        r.rpc_endpoints = d._rpc_endpoints;
-        for (auto det : d._endpoint_details) {
-            ss::endpoint_detail ed;
-            ed.host = det._host;
-            ed.datacenter = det._datacenter;
-            if (det._rack != "") {
-                ed.rack = det._rack;
-            }
-            r.endpoint_details.push(ed);
+static ss::token_range token_range_endpoints_to_json(const dht::token_range_endpoints& d) {
+    ss::token_range r;
+    r.start_token = d._start_token;
+    r.end_token = d._end_token;
+    r.endpoints = d._endpoints;
+    r.rpc_endpoints = d._rpc_endpoints;
+    for (auto det : d._endpoint_details) {
+        ss::endpoint_detail ed;
+        ed.host = det._host;
+        ed.datacenter = det._datacenter;
+        if (det._rack != "") {
+            ed.rack = det._rack;
        }
-        res.push_back(r);
+        r.endpoint_details.push(ed);
    }
-    return res;
+    return r;
 }

 void set_storage_service(http_context& ctx, routes& r) {
@@ -176,13 +173,13 @@ void set_storage_service(http_context& ctx, routes& r) {
        return make_ready_future<json::json_return_type>(res);
    });

-    ss::describe_any_ring.set(r, [&ctx](const_req req) {
-        return describe_ring("");
+    ss::describe_any_ring.set(r, [&ctx](std::unique_ptr<request> req) {
+        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().describe_ring(""), token_range_endpoints_to_json));
    });

-    ss::describe_ring.set(r, [&ctx](const_req req) {
-        auto keyspace = validate_keyspace(ctx, req.param);
-        return describe_ring(keyspace);
+    ss::describe_ring.set(r, [&ctx](std::unique_ptr<request> req) {
+        auto keyspace = validate_keyspace(ctx, req->param);
+        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().describe_ring(keyspace), token_range_endpoints_to_json));
    });

    ss::get_host_id_map.set(r, [](const_req req) {
@@ -195,8 +192,8 @@ void set_storage_service(http_context& ctx, routes& r) {
        return get_cf_stats(ctx, &column_family_stats::live_disk_space_used);
    });

-    ss::get_load_map.set(r, [] (std::unique_ptr<request> req) {
-        return service::get_local_storage_service().get_load_map().then([] (auto&& load_map) {
+    ss::get_load_map.set(r, [&ctx] (std::unique_ptr<request> req) {
+        return ctx.lmeter.get_load_map().then([] (auto&& load_map) {
            std::vector<ss::map_string_double> res;
            for (auto i : load_map) {
                ss::map_string_double val;
@@ -608,9 +605,7 @@ void set_storage_service(http_context& ctx, routes& r) {
    });

    ss::join_ring.set(r, [](std::unique_ptr<request> req) {
-        return service::get_local_storage_service().join_ring().then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
+        return make_ready_future<json::json_return_type>(json_void());
    });

    ss::is_joined.set(r, [] (std::unique_ptr<request> req) {
--- a/api/system.cc
+++ b/api/system.cc
@@ -30,6 +30,10 @@ namespace api {
 namespace hs = httpd::system_json;

 void set_system(http_context& ctx, routes& r) {
+    hs::get_system_uptime.set(r, [](const_req req) {
+        return std::chrono::duration_cast<std::chrono::milliseconds>(engine().uptime()).count();
+    });
+
    hs::get_all_logger_names.set(r, [](const_req req) {
        return logging::logger_registry().get_all_logger_names();
    });
--- a/atomic_cell.cc
+++ b/atomic_cell.cc
@@ -21,6 +21,7 @@

 #include "atomic_cell.hh"
 #include "atomic_cell_or_collection.hh"
+#include "counters.hh"
 #include "types.hh"

 /// LSA mirator for cells with irrelevant type
@@ -214,6 +215,61 @@ size_t atomic_cell_or_collection::external_memory_usage(const abstract_type& t)
        + imr_object_type::size_overhead + external_value_size;
 }

+std::ostream&
+operator<<(std::ostream& os, const atomic_cell_view& acv) {
+    if (acv.is_live()) {
+        return fmt_print(os, "atomic_cell{{{},ts={:d},expiry={:d},ttl={:d}}}",
+            acv.is_counter_update()
+                    ? "counter_update_value=" + to_sstring(acv.counter_update_value())
+                    : to_hex(acv.value().linearize()),
+            acv.timestamp(),
+            acv.is_live_and_has_ttl() ? acv.expiry().time_since_epoch().count() : -1,
+            acv.is_live_and_has_ttl() ? acv.ttl().count() : 0);
+    } else {
+        return fmt_print(os, "atomic_cell{{DEAD,ts={:d},deletion_time={:d}}}",
+            acv.timestamp(), acv.deletion_time().time_since_epoch().count());
+    }
+}
+
+std::ostream&
+operator<<(std::ostream& os, const atomic_cell& ac) {
+    return os << atomic_cell_view(ac);
+}
+
+std::ostream&
+operator<<(std::ostream& os, const atomic_cell_view::printer& acvp) {
+    auto& type = acvp._type;
+    auto& acv = acvp._cell;
+    if (acv.is_live()) {
+        std::ostringstream cell_value_string_builder;
+        if (type.is_counter()) {
+            if (acv.is_counter_update()) {
+                cell_value_string_builder << "counter_update_value=" << acv.counter_update_value();
+            } else {
+                cell_value_string_builder << "shards: ";
+                counter_cell_view::with_linearized(acv, [&cell_value_string_builder] (counter_cell_view& ccv) {
+                    cell_value_string_builder << ::join(", ", ccv.shards());
+                });
+            }
+        } else {
+            cell_value_string_builder << type.to_string(acv.value().linearize());
+        }
+        return fmt_print(os, "atomic_cell{{{},ts={:d},expiry={:d},ttl={:d}}}",
+            cell_value_string_builder.str(),
+            acv.timestamp(),
+            acv.is_live_and_has_ttl() ? acv.expiry().time_since_epoch().count() : -1,
+            acv.is_live_and_has_ttl() ? acv.ttl().count() : 0);
+    } else {
+        return fmt_print(os, "atomic_cell{{DEAD,ts={:d},deletion_time={:d}}}",
+            acv.timestamp(), acv.deletion_time().time_since_epoch().count());
+    }
+}
+
+std::ostream&
+operator<<(std::ostream& os, const atomic_cell::printer& acp) {
+    return operator<<(os, static_cast<const atomic_cell_view::printer&>(acp));
+}
+
 std::ostream& operator<<(std::ostream& os, const atomic_cell_or_collection::printer& p) {
    if (!p._cell._data.get()) {
        return os << "{ null atomic_cell_or_collection }";
@@ -223,9 +279,9 @@ std::ostream& operator<<(std::ostream& os, const atomic_cell_or_collection::prin
    if (dc::structure::get_member<dc::tags::flags>(p._cell._data.get()).get<dc::tags::collection>()) {
        os << "collection ";
        auto cmv = p._cell.as_collection_mutation();
-        os << to_hex(cmv.data.linearize());
+        os << collection_mutation_view::printer(*p._cdef.type, cmv);
    } else {
-        os << p._cell.as_atomic_cell(p._cdef);
+        os << atomic_cell_view::printer(*p._cdef.type, p._cell.as_atomic_cell(p._cdef));
    }
    return os << " }";
 }
--- a/atomic_cell.hh
+++ b/atomic_cell.hh
@@ -153,6 +153,14 @@ public:
    }

    friend std::ostream& operator<<(std::ostream& os, const atomic_cell_view& acv);
+
+    class printer {
+        const abstract_type& _type;
+        const atomic_cell_view& _cell;
+    public:
+        printer(const abstract_type& type, const atomic_cell_view& cell) : _type(type), _cell(cell) {}
+        friend std::ostream& operator<<(std::ostream& os, const printer& acvp);
+    };
 };

 class atomic_cell_mutable_view final : public basic_atomic_cell_view<mutable_view::yes> {
@@ -219,6 +227,12 @@ public:
    static atomic_cell make_live_uninitialized(const abstract_type& type, api::timestamp_type timestamp, size_t size);
    friend class atomic_cell_or_collection;
    friend std::ostream& operator<<(std::ostream& os, const atomic_cell& ac);
+
+    class printer : atomic_cell_view::printer {
+    public:
+        printer(const abstract_type& type, const atomic_cell_view& cell) : atomic_cell_view::printer(type, cell) {}
+        friend std::ostream& operator<<(std::ostream& os, const printer& acvp);
+    };
 };

 class column_definition;
--- a/auth/role_manager.hh
+++ b/auth/role_manager.hh
@@ -33,6 +33,7 @@

 #include "auth/resource.hh"
 #include "seastarx.hh"
+#include "exceptions/exceptions.hh"

 namespace auth {

@@ -52,9 +53,9 @@ struct role_config_update final {
 ///
 /// A logical argument error for a role-management operation.
 ///
-class roles_argument_exception : public std::invalid_argument {
+class roles_argument_exception : public exceptions::invalid_request_exception {
 public:
-    using std::invalid_argument::invalid_argument;
+    using exceptions::invalid_request_exception::invalid_request_exception;
 };

 class role_already_exists : public roles_argument_exception {
--- a/auth/service.cc
+++ b/auth/service.cc
@@ -39,7 +39,7 @@
 #include "db/consistency_level_type.hh"
 #include "exceptions/exceptions.hh"
 #include "log.hh"
-#include "service/migration_listener.hh"
+#include "service/migration_manager.hh"
 #include "utils/class_registrator.hh"
 #include "database.hh"

@@ -114,14 +114,14 @@ static future<> validate_role_exists(const service& ser, std::string_view role_n
 service::service(
        permissions_cache_config c,
        cql3::query_processor& qp,
-        ::service::migration_manager& mm,
+        ::service::migration_notifier& mn,
        std::unique_ptr<authorizer> z,
        std::unique_ptr<authenticator> a,
        std::unique_ptr<role_manager> r)
            : _permissions_cache_config(std::move(c))
            , _permissions_cache(nullptr)
            , _qp(qp)
-            , _migration_manager(mm)
+            , _mnotifier(mn)
            , _authorizer(std::move(z))
            , _authenticator(std::move(a))
            , _role_manager(std::move(r))
@@ -141,18 +141,19 @@ service::service(
 service::service(
        permissions_cache_config c,
        cql3::query_processor& qp,
+        ::service::migration_notifier& mn,
        ::service::migration_manager& mm,
        const service_config& sc)
            : service(
                      std::move(c),
                      qp,
-                      mm,
+                      mn,
                      create_object<authorizer>(sc.authorizer_java_name, qp, mm),
                      create_object<authenticator>(sc.authenticator_java_name, qp, mm),
                      create_object<role_manager>(sc.role_manager_java_name, qp, mm)) {
 }

-future<> service::create_keyspace_if_missing() const {
+future<> service::create_keyspace_if_missing(::service::migration_manager& mm) const {
    auto& db = _qp.db();

    if (!db.has_keyspace(meta::AUTH_KS)) {
@@ -166,15 +167,15 @@ future<> service::create_keyspace_if_missing() const {

        // We use min_timestamp so that default keyspace metadata will loose with any manual adjustments.
        // See issue #2129.
-        return _migration_manager.announce_new_keyspace(ksm, api::min_timestamp, false);
+        return mm.announce_new_keyspace(ksm, api::min_timestamp, false);
    }

    return make_ready_future<>();
 }

-future<> service::start() {
-    return once_among_shards([this] {
-        return create_keyspace_if_missing();
+future<> service::start(::service::migration_manager& mm) {
+    return once_among_shards([this, &mm] {
+        return create_keyspace_if_missing(mm);
    }).then([this] {
        return _role_manager->start().then([this] {
            return when_all_succeed(_authorizer->start(), _authenticator->start());
@@ -183,7 +184,7 @@ future<> service::start() {
        _permissions_cache = std::make_unique<permissions_cache>(_permissions_cache_config, *this, log);
    }).then([this] {
        return once_among_shards([this] {
-            _migration_manager.register_listener(_migration_listener.get());
+            _mnotifier.register_listener(_migration_listener.get());
            return make_ready_future<>();
        });
    });
@@ -192,9 +193,9 @@ future<> service::start() {
 future<> service::stop() {
    // Only one of the shards has the listener registered, but let's try to
    // unregister on each one just to make sure.
-    _migration_manager.unregister_listener(_migration_listener.get());
-
-    return _permissions_cache->stop().then([this] {
+    return _mnotifier.unregister_listener(_migration_listener.get()).then([this] {
+        return _permissions_cache->stop();
+    }).then([this] {
        return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop());
    });
 }
--- a/auth/service.hh
+++ b/auth/service.hh
@@ -28,6 +28,7 @@
 #include <seastar/core/future.hh>
 #include <seastar/core/sstring.hh>
 #include <seastar/util/bool_class.hh>
+#include <seastar/core/sharded.hh>

 #include "auth/authenticator.hh"
 #include "auth/authorizer.hh"
@@ -42,6 +43,7 @@ class query_processor;

 namespace service {
 class migration_manager;
+class migration_notifier;
 class migration_listener;
 }

@@ -76,13 +78,15 @@ public:
 ///
 /// All state associated with access-control is stored externally to any particular instance of this class.
 ///
-class service final {
+/// peering_sharded_service inheritance is needed to be able to access shard local authentication service
+/// given an object from another shard. Used for bouncing lwt requests to correct shard.
+class service final : public seastar::peering_sharded_service<service> {
    permissions_cache_config _permissions_cache_config;
    std::unique_ptr<permissions_cache> _permissions_cache;

    cql3::query_processor& _qp;

-    ::service::migration_manager& _migration_manager;
+    ::service::migration_notifier& _mnotifier;

    std::unique_ptr<authorizer> _authorizer;

@@ -97,7 +101,7 @@ public:
    service(
            permissions_cache_config,
            cql3::query_processor&,
-            ::service::migration_manager&,
+            ::service::migration_notifier&,
            std::unique_ptr<authorizer>,
            std::unique_ptr<authenticator>,
            std::unique_ptr<role_manager>);
@@ -110,10 +114,11 @@ public:
    service(
            permissions_cache_config,
            cql3::query_processor&,
+            ::service::migration_notifier&,
            ::service::migration_manager&,
            const service_config&);

-    future<> start();
+    future<> start(::service::migration_manager&);

    future<> stop();

@@ -159,7 +164,7 @@ public:
 private:
    future<bool> has_existing_legacy_users() const;

-    future<> create_keyspace_if_missing() const;
+    future<> create_keyspace_if_missing(::service::migration_manager& mm) const;
 };

 future<bool> has_superuser(const service&, const authenticated_user&);
--- a/build_id.cc
+++ b/build_id.cc
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+#include "build_id.hh"
+#include <fmt/printf.h>
+#include <link.h>
+#include <seastar/core/align.hh>
+#include <sstream>
+
+using namespace seastar;
+
+static const Elf64_Nhdr* get_nt_build_id(dl_phdr_info* info) {
+    auto base = info->dlpi_addr;
+    const auto* h = info->dlpi_phdr;
+    auto num_headers = info->dlpi_phnum;
+    for (int i = 0; i != num_headers; ++i, ++h) {
+        if (h->p_type != PT_NOTE) {
+            continue;
+        }
+
+        auto* p = reinterpret_cast<const char*>(base) + h->p_vaddr;
+        auto* e = p + h->p_memsz;
+        while (p != e) {
+            const auto* n = reinterpret_cast<const Elf64_Nhdr*>(p);
+            if (n->n_type == NT_GNU_BUILD_ID) {
+                return n;
+            }
+
+            p += sizeof(Elf64_Nhdr);
+
+            p += n->n_namesz;
+            p = align_up(p, 4);
+
+            p += n->n_descsz;
+            p = align_up(p, 4);
+        }
+    }
+
+    assert(0 && "no NT_GNU_BUILD_ID note");
+}
+
+static int callback(dl_phdr_info* info, size_t size, void* data) {
+    std::string& ret = *(std::string*)data;
+    std::ostringstream os;
+
+    // The first DSO is always the main program, which has an empty name.
+    assert(strlen(info->dlpi_name) == 0);
+
+    auto* n = get_nt_build_id(info);
+    auto* p = reinterpret_cast<const char*>(n);
+
+    p += sizeof(Elf64_Nhdr);
+
+    p += n->n_namesz;
+    p = align_up(p, 4);
+
+    const char* desc = p;
+    for (unsigned i = 0; i < n->n_descsz; ++i) {
+        fmt::fprintf(os, "%02x", (unsigned char)*(desc + i));
+    }
+    ret = os.str();
+    return 1;
+}
+
+std::string get_build_id() {
+    std::string ret;
+    int r = dl_iterate_phdr(callback, &ret);
+    assert(r == 1);
+    return ret;
+}
--- a/build_id.hh
+++ b/build_id.hh
@@ -0,0 +1,9 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+#pragma once
+
+#include <string>
+
+std::string get_build_id();
--- a/bytes_ostream.hh
+++ b/bytes_ostream.hh
@@ -38,6 +38,7 @@ class bytes_ostream {
 public:
    using size_type = bytes::size_type;
    using value_type = bytes::value_type;
+    using fragment_type = bytes_view;
    static constexpr size_type max_chunk_size() { return 128 * 1024; }
 private:
    static_assert(sizeof(value_type) == 1, "value_type is assumed to be one byte long");
@@ -93,6 +94,29 @@ public:
            return _current != other._current;
        }
    };
+    using const_iterator = fragment_iterator;
+
+    class output_iterator {
+    public:
+        using iterator_category = std::output_iterator_tag;
+        using difference_type = std::ptrdiff_t;
+        using value_type = bytes_ostream::value_type;
+        using pointer = bytes_ostream::value_type*;
+        using reference = bytes_ostream::value_type&;
+
+        friend class bytes_ostream;
+
+    private:
+        bytes_ostream* _ostream = nullptr;
+
+    private:
+        explicit output_iterator(bytes_ostream& os) : _ostream(&os) { }
+
+    public:
+        reference operator*() const { return *_ostream->write_place_holder(1); }
+        output_iterator& operator++() { return *this; }
+        output_iterator operator++(int) { return *this; }
+    };
 private:
    inline size_type current_space_left() const {
        if (!_current) {
@@ -289,6 +313,11 @@ public:
        return _size;
    }

+    // For the FragmentRange concept
+    size_type size_bytes() const {
+        return _size;
+    }
+
    bool empty() const {
        return _size == 0;
    }
@@ -326,6 +355,8 @@ public:
    fragment_iterator begin() const { return { _begin.get() }; }
    fragment_iterator end() const { return { nullptr }; }

+    output_iterator write_begin() { return output_iterator(*this); }
+
    boost::iterator_range<fragment_iterator> fragments() const {
        return { begin(), end() };
    }
--- a/canonical_mutation.cc
+++ b/canonical_mutation.cc
@@ -35,6 +35,7 @@
 #include "idl/uuid.dist.impl.hh"
 #include "idl/keys.dist.impl.hh"
 #include "idl/mutation.dist.impl.hh"
+#include <iostream>

 canonical_mutation::canonical_mutation(bytes data)
        : _data(std::move(data))
@@ -89,3 +90,81 @@ mutation canonical_mutation::to_mutation(schema_ptr s) const {
    }
    return m;
 }
+
+static sstring bytes_to_text(bytes_view bv) {
+    sstring ret(sstring::initialized_later(), bv.size());
+    std::copy_n(reinterpret_cast<const char*>(bv.data()), bv.size(), ret.data());
+    return ret;
+}
+
+std::ostream& operator<<(std::ostream& os, const canonical_mutation& cm) {
+    auto in = ser::as_input_stream(cm._data);
+    auto mv = ser::deserialize(in, boost::type<ser::canonical_mutation_view>());
+    column_mapping mapping = mv.mapping();
+    auto partition_view = mutation_partition_view::from_view(mv.partition());
+    fmt::print(os, "{{canonical_mutation: ");
+    fmt::print(os, "table_id {} schema_version {} ", mv.table_id(), mv.schema_version());
+    fmt::print(os, "partition_key {} ", mv.key());
+    class printing_visitor : public mutation_partition_view_virtual_visitor {
+        std::ostream& _os;
+        const column_mapping& _cm;
+        bool _first = true;
+        bool _in_row = false;
+    private:
+        void print_separator() {
+            if (!_first) {
+                fmt::print(_os, ", ");
+            }
+            _first = false;
+        }
+    public:
+        printing_visitor(std::ostream& os, const column_mapping& cm) : _os(os), _cm(cm) {}
+        virtual void accept_partition_tombstone(tombstone t) override {
+            print_separator();
+            fmt::print(_os, "partition_tombstone {}", t);
+        }
+        virtual void accept_static_cell(column_id id, atomic_cell ac) override {
+            print_separator();
+            auto&& entry = _cm.static_column_at(id);
+            fmt::print(_os, "static column {} {}", bytes_to_text(entry.name()), atomic_cell::printer(*entry.type(), ac));
+        }
+        virtual void accept_static_cell(column_id id, collection_mutation_view cmv) override {
+            print_separator();
+            auto&& entry = _cm.static_column_at(id);
+            fmt::print(_os, "static column {} {}", bytes_to_text(entry.name()), collection_mutation_view::printer(*entry.type(), cmv));
+        }
+        virtual void accept_row_tombstone(range_tombstone rt) override {
+            print_separator();
+            fmt::print(_os, "row tombstone {}", rt);
+        }
+        virtual void accept_row(position_in_partition_view pipv, row_tombstone rt, row_marker rm, is_dummy, is_continuous) override {
+            if (_in_row) {
+                fmt::print(_os, "}}, ");
+            }
+            fmt::print(_os, "{{row {} tombstone {} marker {}", pipv, rt, rm);
+            _in_row = true;
+            _first = false;
+        }
+        virtual void accept_row_cell(column_id id, atomic_cell ac) override {
+            print_separator();
+            auto&& entry = _cm.regular_column_at(id);
+            fmt::print(_os, "column {} {}", bytes_to_text(entry.name()), atomic_cell::printer(*entry.type(), ac));
+        }
+        virtual void accept_row_cell(column_id id, collection_mutation_view cmv) override {
+            print_separator();
+            auto&& entry = _cm.regular_column_at(id);
+            fmt::print(_os, "column {} {}", bytes_to_text(entry.name()), collection_mutation_view::printer(*entry.type(), cmv));
+        }
+        void finalize() {
+            if (_in_row) {
+                fmt::print(_os, "}}");
+            }
+        }
+    };
+    printing_visitor pv(os, mapping);
+    partition_view.accept(mapping, pv);
+    pv.finalize();
+    fmt::print(os, "}}");
+    return os;
+}
+
--- a/canonical_mutation.hh
+++ b/canonical_mutation.hh
@@ -26,6 +26,7 @@
 #include "database_fwd.hh"
 #include "mutation_partition_visitor.hh"
 #include "mutation_partition_serializer.hh"
+#include <iosfwd>

 // Immutable mutation form which can be read using any schema version of the same table.
 // Safe to access from other shards via const&.
@@ -52,4 +53,5 @@ public:

    const bytes& representation() const { return _data; }

+    friend std::ostream& operator<<(std::ostream& os, const canonical_mutation& cm);
 };
--- a/cdc/cdc.cc
+++ b/cdc/cdc.cc
@@ -22,6 +22,7 @@
 #include <utility>
 #include <algorithm>

+#include <boost/range/irange.hpp>
 #include <seastar/util/defer.hh>
 #include <seastar/core/thread.hh>

@@ -33,19 +34,20 @@
 #include "partition_slice_builder.hh"
 #include "schema.hh"
 #include "schema_builder.hh"
-#include "service/migration_manager.hh"
+#include "service/migration_listener.hh"
 #include "service/storage_service.hh"
 #include "types/tuple.hh"
 #include "cql3/statements/select_statement.hh"
 #include "cql3/multi_column_relation.hh"
 #include "cql3/tuples.hh"
 #include "log.hh"
+#include "json.hh"

 using locator::snitch_ptr;
 using locator::token_metadata;
 using locator::topology;
 using seastar::sstring;
-using service::migration_manager;
+using service::migration_notifier;
 using service::storage_proxy;

 namespace std {
@@ -62,6 +64,196 @@ using namespace std::chrono_literals;

 static logging::logger cdc_log("cdc");

+namespace cdc {
+static schema_ptr create_log_schema(const schema&, std::optional<utils::UUID> = {});
+static schema_ptr create_stream_description_table_schema(const schema&, std::optional<utils::UUID> = {});
+static future<> populate_desc(db_context ctx, const schema& s);
+}
+
+class cdc::cdc_service::impl : service::migration_listener::empty_listener {
+    friend cdc_service;
+    db_context _ctxt;
+    bool _stopped = false;
+public:
+    impl(db_context ctxt)
+        : _ctxt(std::move(ctxt))
+    {
+        _ctxt._migration_notifier.register_listener(this);
+    }
+    ~impl() {
+        assert(_stopped);
+    }
+
+    future<> stop() {
+        return _ctxt._migration_notifier.unregister_listener(this).then([this] {
+            _stopped = true;
+        });
+    }
+
+    void on_before_create_column_family(const schema& schema, std::vector<mutation>& mutations, api::timestamp_type timestamp) override {
+        if (schema.cdc_options().enabled()) {
+            auto& db = _ctxt._proxy.get_db().local();
+            auto logname = log_name(schema.cf_name());
+            if (!db.has_schema(schema.ks_name(), logname)) {
+                // in seastar thread
+                auto log_schema = create_log_schema(schema);
+                auto stream_desc_schema = create_stream_description_table_schema(schema);
+                auto& keyspace = db.find_keyspace(schema.ks_name());
+
+                auto log_mut = db::schema_tables::make_create_table_mutations(keyspace.metadata(), log_schema, timestamp);
+                auto stream_mut = db::schema_tables::make_create_table_mutations(keyspace.metadata(), stream_desc_schema, timestamp);
+
+                mutations.insert(mutations.end(), std::make_move_iterator(log_mut.begin()), std::make_move_iterator(log_mut.end()));
+                mutations.insert(mutations.end(), std::make_move_iterator(stream_mut.begin()), std::make_move_iterator(stream_mut.end()));
+            }
+        }
+    }
+
+    void on_before_update_column_family(const schema& new_schema, const schema& old_schema, std::vector<mutation>& mutations, api::timestamp_type timestamp) override {
+        bool is_cdc = new_schema.cdc_options().enabled();
+        bool was_cdc = old_schema.cdc_options().enabled();
+
+        // we need to create or modify the log & stream schemas iff either we changed cdc status (was != is)
+        // or if cdc is on now unconditionally, since then any actual base schema changes will affect the column 
+        // etc.
+        if (was_cdc || is_cdc) {
+            auto logname = log_name(old_schema.cf_name());
+            auto descname = desc_name(old_schema.cf_name());
+            auto& db = _ctxt._proxy.get_db().local();
+            auto& keyspace = db.find_keyspace(old_schema.ks_name());
+            auto log_schema = was_cdc ? db.find_column_family(old_schema.ks_name(), logname).schema() : nullptr;
+            auto stream_desc_schema = was_cdc ? db.find_column_family(old_schema.ks_name(), descname).schema() : nullptr;
+
+            if (!is_cdc) {
+                auto log_mut = db::schema_tables::make_drop_table_mutations(keyspace.metadata(), log_schema, timestamp);
+                auto stream_mut = db::schema_tables::make_drop_table_mutations(keyspace.metadata(), stream_desc_schema, timestamp);
+
+                mutations.insert(mutations.end(), std::make_move_iterator(log_mut.begin()), std::make_move_iterator(log_mut.end()));
+                mutations.insert(mutations.end(), std::make_move_iterator(stream_mut.begin()), std::make_move_iterator(stream_mut.end()));
+                return;
+            }
+
+            auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt);
+            auto new_stream_desc_schema = create_stream_description_table_schema(new_schema, stream_desc_schema ? std::make_optional(stream_desc_schema->id()) : std::nullopt);
+
+            auto log_mut = log_schema 
+                ? db::schema_tables::make_update_table_mutations(keyspace.metadata(), log_schema, new_log_schema, timestamp, false)
+                : db::schema_tables::make_create_table_mutations(keyspace.metadata(), new_log_schema, timestamp)
+                ;
+            auto stream_mut = stream_desc_schema 
+                ? db::schema_tables::make_update_table_mutations(keyspace.metadata(), stream_desc_schema, new_stream_desc_schema, timestamp, false)
+                : db::schema_tables::make_create_table_mutations(keyspace.metadata(), new_stream_desc_schema, timestamp)
+                ;
+
+            mutations.insert(mutations.end(), std::make_move_iterator(log_mut.begin()), std::make_move_iterator(log_mut.end()));
+            mutations.insert(mutations.end(), std::make_move_iterator(stream_mut.begin()), std::make_move_iterator(stream_mut.end()));
+        }
+    }
+
+    void on_before_drop_column_family(const schema& schema, std::vector<mutation>& mutations, api::timestamp_type timestamp) override {
+        if (schema.cdc_options().enabled()) {
+            auto logname = log_name(schema.cf_name());
+            auto descname = desc_name(schema.cf_name());
+            auto& db = _ctxt._proxy.get_db().local();
+            auto& keyspace = db.find_keyspace(schema.ks_name());
+            auto log_schema = db.find_column_family(schema.ks_name(), logname).schema();
+            auto stream_desc_schema = db.find_column_family(schema.ks_name(), descname).schema();
+
+            auto log_mut = db::schema_tables::make_drop_table_mutations(keyspace.metadata(), log_schema, timestamp);
+            auto stream_mut = db::schema_tables::make_drop_table_mutations(keyspace.metadata(), stream_desc_schema, timestamp);
+
+            mutations.insert(mutations.end(), std::make_move_iterator(log_mut.begin()), std::make_move_iterator(log_mut.end()));
+            mutations.insert(mutations.end(), std::make_move_iterator(stream_mut.begin()), std::make_move_iterator(stream_mut.end()));
+        }
+    }
+
+    void on_create_column_family(const sstring& ks_name, const sstring& cf_name) override {
+        // This callback is done on all shards. Only do the work once. 
+        if (engine().cpu_id() != 0) {
+            return; 
+        }
+        auto& db = _ctxt._proxy.get_db().local();
+        auto& cf = db.find_column_family(ks_name, cf_name);
+        auto schema = cf.schema();
+        if (schema->cdc_options().enabled()) {
+            populate_desc(_ctxt, *schema).get();
+        }
+    }
+
+    void on_update_column_family(const sstring& ks_name, const sstring& cf_name, bool columns_changed) override {
+        on_create_column_family(ks_name, cf_name);
+    }
+
+    void on_drop_column_family(const sstring& ks_name, const sstring& cf_name) override {}
+
+    future<std::tuple<std::vector<mutation>, result_callback>> augment_mutation_call(
+        lowres_clock::time_point timeout,
+        std::vector<mutation>&& mutations
+    );
+
+    template<typename Iter>
+    future<> append_mutations(Iter i, Iter e, schema_ptr s, lowres_clock::time_point, std::vector<mutation>&);
+};
+
+cdc::cdc_service::cdc_service(service::storage_proxy& proxy)
+    : cdc_service(db_context::builder(proxy).build())
+{}
+
+cdc::cdc_service::cdc_service(db_context ctxt)
+    : _impl(std::make_unique<impl>(std::move(ctxt)))
+{
+    _impl->_ctxt._proxy.set_cdc_service(this);
+}
+
+future<> cdc::cdc_service::stop() {
+    return _impl->stop();
+}
+
+cdc::cdc_service::~cdc_service() = default;
+
+cdc::options::options(const std::map<sstring, sstring>& map) {
+    if (map.find("enabled") == std::end(map)) {
+        return;
+    }
+
+    for (auto& p : map) {
+        if (p.first == "enabled") {
+            _enabled = p.second == "true";
+        } else if (p.first == "preimage") {
+            _preimage = p.second == "true";
+        } else if (p.first == "postimage") {
+            _postimage = p.second == "true";
+        } else if (p.first == "ttl") {
+            _ttl = std::stoi(p.second);
+        } else {
+            throw exceptions::configuration_exception("Invalid CDC option: " + p.first);
+        }
+    }
+}
+
+std::map<sstring, sstring> cdc::options::to_map() const {
+    if (!_enabled) {
+        return {};
+    }
+    return {
+        { "enabled", _enabled ? "true" : "false" },
+        { "preimage", _preimage ? "true" : "false" },
+        { "postimage", _postimage ? "true" : "false" },
+        { "ttl", std::to_string(_ttl) },
+    };
+}
+
+sstring cdc::options::to_sstring() const {
+    return json::to_json(to_map());
+}
+
+bool cdc::options::operator==(const options& o) const {
+    return _enabled == o._enabled && _preimage == o._preimage && _postimage == o._postimage && _ttl == o._ttl;
+}
+bool cdc::options::operator!=(const options& o) const {
+    return !(*this == o);
+}
+
 namespace cdc {

 using operation_native_type = std::underlying_type_t<operation>;
@@ -77,41 +269,8 @@ sstring desc_name(const sstring& table_name) {
    return table_name + cdc_desc_suffix;
 }

-static future<>
-remove_log(db_context ctx, const sstring& ks_name, const sstring& table_name) {
-    try {
-        return ctx._migration_manager.announce_column_family_drop(
-                ks_name, log_name(table_name), false);
-    } catch (exceptions::configuration_exception& e) {
-        // It's fine if the table does not exist.
-        return make_ready_future<>();
-    } catch (...) {
-        return make_exception_future<>(std::current_exception());
-    }
-}
-
-static future<>
-remove_desc(db_context ctx, const sstring& ks_name, const sstring& table_name) {
-    try {
-        return ctx._migration_manager.announce_column_family_drop(
-                ks_name, desc_name(table_name), false);
-    } catch (exceptions::configuration_exception& e) {
-        // It's fine if the table does not exist.
-        return make_ready_future<>();
-    } catch (...) {
-        return make_exception_future<>(std::current_exception());
-    }
-}
-
-future<>
-remove(db_context ctx, const sstring& ks_name, const sstring& table_name) {
-    return when_all(remove_log(ctx, ks_name, table_name),
-                    remove_desc(ctx, ks_name, table_name)).discard_result();
-}
-
-static future<> setup_log(db_context ctx, const schema& s) {
+static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID> uuid) {
    schema_builder b(s.ks_name(), log_name(s.cf_name()));
-    b.set_default_time_to_live(gc_clock::duration{s.cdc_options().ttl()});
    b.set_comment(sprint("CDC log for %s.%s", s.ks_name(), s.cf_name()));
    b.with_column("stream_id", uuid_type, column_kind::partition_key);
    b.with_column("time", timeuuid_type, column_kind::clustering_key);
@@ -131,17 +290,27 @@ static future<> setup_log(db_context ctx, const schema& s) {
    add_columns(s.clustering_key_columns());
    add_columns(s.static_columns(), true);
    add_columns(s.regular_columns(), true);
-    return ctx._migration_manager.announce_new_column_family(b.build(), false);
+
+    if (uuid) {
+        b.set_uuid(*uuid);
+    }
+    
+    return b.build();
 }

-static future<> setup_stream_description_table(db_context ctx, const schema& s) {
+static schema_ptr create_stream_description_table_schema(const schema& s, std::optional<utils::UUID> uuid) {
    schema_builder b(s.ks_name(), desc_name(s.cf_name()));
    b.set_comment(sprint("CDC description for %s.%s", s.ks_name(), s.cf_name()));
    b.with_column("node_ip", inet_addr_type, column_kind::partition_key);
    b.with_column("shard_id", int32_type, column_kind::partition_key);
    b.with_column("created_at", timestamp_type, column_kind::clustering_key);
    b.with_column("stream_id", uuid_type);
-    return ctx._migration_manager.announce_new_column_family(b.build(), false);
+
+    if (uuid) {
+        b.set_uuid(*uuid);
+    }
+
+    return b.build();
 }

 // This function assumes setup_stream_description_table was called on |s| before the call to this
@@ -201,22 +370,34 @@ static future<> populate_desc(db_context ctx, const schema& s) {
                             empty_service_permit());
 }

-future<> setup(db_context ctx, schema_ptr s) {
-    return seastar::async([ctx = std::move(ctx), s = std::move(s)] {
-        setup_log(ctx, *s).get();
-        auto log_guard = seastar::defer([&] { remove_log(ctx, s->ks_name(), s->cf_name()).get(); });
-        setup_stream_description_table(ctx, *s).get();
-        auto desc_guard = seastar::defer([&] { remove_desc(ctx, s->ks_name(), s->cf_name()).get(); });
-        populate_desc(ctx, *s).get();
-        desc_guard.cancel();
-        log_guard.cancel();
-    });
+db_context::builder::builder(service::storage_proxy& proxy) 
+    : _proxy(proxy) 
+{}
+
+db_context::builder& db_context::builder::with_migration_notifier(service::migration_notifier& migration_notifier) {
+    _migration_notifier = migration_notifier;
+    return *this;
+}
+
+db_context::builder& db_context::builder::with_token_metadata(locator::token_metadata& token_metadata) {
+    _token_metadata = token_metadata;
+    return *this;
+}
+
+db_context::builder& db_context::builder::with_snitch(locator::snitch_ptr& snitch) {
+    _snitch = snitch;
+    return *this;
+}
+
+db_context::builder& db_context::builder::with_partitioner(dht::i_partitioner& partitioner) {
+    _partitioner = partitioner;
+    return *this;
 }

 db_context db_context::builder::build() {
    return db_context{
        _proxy,
-        _migration_manager ? _migration_manager->get() : service::get_local_migration_manager(),
+        _migration_notifier ? _migration_notifier->get() : service::get_local_storage_service().get_migration_notifier(),
        _token_metadata ? _token_metadata->get() : service::get_local_storage_service().get_token_metadata(),
        _snitch ? _snitch->get() : locator::i_endpoint_snitch::get_local_snitch_ptr(),
        _partitioner ? _partitioner->get() : dht::global_partitioner()
@@ -234,6 +415,7 @@ private:
    bytes _decomposed_time;
    ::shared_ptr<const transformer::streams_type> _streams;
    const column_definition& _op_col;
+    ttl_opt _cdc_ttl_opt;

    clustering_key set_pk_columns(const partition_key& pk, int batch_no, mutation& m) const {
        const auto log_ck = clustering_key::from_exploded(
@@ -245,7 +427,8 @@ private:
            auto cdef = m.schema()->get_column_definition(to_bytes("_" + column.name()));
            auto value = atomic_cell::make_live(*column.type,
                                                _time.timestamp(),
-                                                bytes_view(pk_value[pos]));
+                                                bytes_view(pk_value[pos]),
+                                                _cdc_ttl_opt);
            m.set_cell(log_ck, *cdef, std::move(value));
            ++pos;
        }
@@ -253,7 +436,7 @@ private:
    }

    void set_operation(const clustering_key& ck, operation op, mutation& m) const {
-        m.set_cell(ck, _op_col, atomic_cell::make_live(*_op_col.type, _time.timestamp(), _op_col.type->decompose(operation_native_type(op))));
+        m.set_cell(ck, _op_col, atomic_cell::make_live(*_op_col.type, _time.timestamp(), _op_col.type->decompose(operation_native_type(op)), _cdc_ttl_opt));
    }

    partition_key stream_id(const net::inet_address& ip, unsigned int shard_id) const {
@@ -272,7 +455,11 @@ public:
        , _decomposed_time(timeuuid_type->decompose(_time))
        , _streams(std::move(streams))
        , _op_col(*_log_schema->get_column_definition(to_bytes("operation")))
-    {}
+    {
+        if (_schema->cdc_options().ttl()) {
+            _cdc_ttl_opt = std::chrono::seconds(_schema->cdc_options().ttl());
+        }
+    }

    // TODO: is pre-image data based on query enough. We only have actual column data. Do we need
    // more details like tombstones/ttl? Probably not but keep in mind.
@@ -304,7 +491,8 @@ public:
                        auto cdef = _log_schema->get_column_definition(to_bytes("_" + column.name()));
                        auto value = atomic_cell::make_live(*column.type,
                                                            _time.timestamp(),
-                                                            bytes_view(exploded[pos]));
+                                                            bytes_view(exploded[pos]),
+                                                            _cdc_ttl_opt);
                        res.set_cell(log_ck, *cdef, std::move(value));
                        ++pos;
                    }
@@ -360,11 +548,11 @@ public:
                for (const auto& column : _schema->clustering_key_columns()) {
                    assert (pos < ck_value.size());
                    auto cdef = _log_schema->get_column_definition(to_bytes("_" + column.name()));
-                    res.set_cell(log_ck, *cdef, atomic_cell::make_live(*column.type, _time.timestamp(), bytes_view(ck_value[pos])));
+                    res.set_cell(log_ck, *cdef, atomic_cell::make_live(*column.type, _time.timestamp(), bytes_view(ck_value[pos]), _cdc_ttl_opt));

                    if (pirow) {
                        assert(pirow->has(column.name_as_text()));
-                        res.set_cell(*pikey, *cdef, atomic_cell::make_live(*column.type, _time.timestamp(), bytes_view(ck_value[pos])));
+                        res.set_cell(*pikey, *cdef, atomic_cell::make_live(*column.type, _time.timestamp(), bytes_view(ck_value[pos]), _cdc_ttl_opt));
                    }

                    ++pos;
@@ -393,7 +581,7 @@ public:
                            }

                            values[0] = data_type_for<column_op_native_type>()->decompose(data_value(static_cast<column_op_native_type>(op)));
-                            res.set_cell(log_ck, *dst, atomic_cell::make_live(*dst->type, _time.timestamp(), tuple_type_impl::build_value(values)));
+                            res.set_cell(log_ck, *dst, atomic_cell::make_live(*dst->type, _time.timestamp(), tuple_type_impl::build_value(values), _cdc_ttl_opt));

                            if (pirow && pirow->has(cdef.name_as_text())) {
                                values[0] = data_type_for<column_op_native_type>()->decompose(data_value(static_cast<column_op_native_type>(column_op::set)));
@@ -402,7 +590,7 @@ public:

                                assert(std::addressof(res.partition().clustered_row(*_log_schema, *pikey)) != std::addressof(res.partition().clustered_row(*_log_schema, log_ck)));
                                assert(pikey->explode() != log_ck.explode());
-                                res.set_cell(*pikey, *dst, atomic_cell::make_live(*dst->type, _time.timestamp(), tuple_type_impl::build_value(values)));
+                                res.set_cell(*pikey, *dst, atomic_cell::make_live(*dst->type, _time.timestamp(), tuple_type_impl::build_value(values), _cdc_ttl_opt));
                            }
                        } else {
                            cdc_log.warn("Non-atomic cell ignored {}.{}:{}", _schema->ks_name(), _schema->cf_name(), cdef.name_as_text());
@@ -426,7 +614,6 @@ public:
    }

    future<lw_shared_ptr<cql3::untyped_result_set>> pre_image_select(
-            service::storage_proxy& proxy,
            service::client_state& client_state,
            db::consistency_level cl,
            const mutation& m)
@@ -474,10 +661,10 @@ public:
        auto partition_slice = query::partition_slice(std::move(bounds), std::move(static_columns), std::move(regular_columns), selection->get_query_options());
        auto command = ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(), partition_slice, query::max_partitions);

-        return proxy.query(_schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), empty_service_permit(), client_state)).then(
-                [this, partition_slice = std::move(partition_slice), selection = std::move(selection)] (service::storage_proxy::coordinator_query_result qr) -> lw_shared_ptr<cql3::untyped_result_set> {
+        return _ctx._proxy.query(_schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), empty_service_permit(), client_state)).then(
+                [s = _schema, partition_slice = std::move(partition_slice), selection = std::move(selection)] (service::storage_proxy::coordinator_query_result qr) -> lw_shared_ptr<cql3::untyped_result_set> {
                    cql3::selection::result_set_builder builder(*selection, gc_clock::now(), cql_serialization_format::latest());
-                    query::result_view::consume(*qr.query_result, partition_slice, cql3::selection::result_set_builder::visitor(builder, *_schema, *selection));
+                    query::result_view::consume(*qr.query_result, partition_slice, cql3::selection::result_set_builder::visitor(builder, *s, *selection));
                    auto result_set = builder.build();
                    if (!result_set || result_set->empty()) {
                        return {};
@@ -578,27 +765,71 @@ static future<::shared_ptr<transformer::streams_type>> get_streams(
    });
 }

-future<std::vector<mutation>> append_log_mutations(
-        db_context ctx,
-        schema_ptr s,
-        service::storage_proxy::clock_type::time_point timeout,
-        service::query_state& qs,
-        std::vector<mutation> muts) {
-    auto mp = ::make_lw_shared<std::vector<mutation>>(std::move(muts));
+template <typename Func>
+future<std::vector<mutation>>
+transform_mutations(std::vector<mutation>& muts, decltype(muts.size()) batch_size, Func&& f) {
+    return parallel_for_each(
+            boost::irange(static_cast<decltype(muts.size())>(0), muts.size(), batch_size),
+            std::move(f))
+        .then([&muts] () mutable { return std::move(muts); });
+}

-    return get_streams(ctx, s->ks_name(), s->cf_name(), timeout, qs).then([ctx, s = std::move(s), mp, &qs](::shared_ptr<transformer::streams_type> streams) mutable {
-        mp->reserve(2 * mp->size());
-        auto trans = make_lw_shared<transformer>(ctx, s, std::move(streams));
-        auto i = mp->begin();
-        auto e = mp->end();
-        return parallel_for_each(i, e, [ctx, &qs, trans, mp](mutation& m) {
-            return trans->pre_image_select(ctx._proxy, qs.get_client_state(), db::consistency_level::LOCAL_QUORUM, m).then([trans, mp, &m](lw_shared_ptr<cql3::untyped_result_set> rs) {
-                mp->push_back(trans->transform(m, rs.get()));
+} // namespace cdc
+
+future<std::tuple<std::vector<mutation>, cdc::result_callback>>
+cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout, std::vector<mutation>&& mutations) {
+    // we do all this because in the case of batches, we can have mixed schemas.
+    auto e = mutations.end();
+    auto i = std::find_if(mutations.begin(), e, [](const mutation& m) {
+        return m.schema()->cdc_options().enabled();
+    });
+
+    if (i == e) {
+        return make_ready_future<std::tuple<std::vector<mutation>, cdc::result_callback>>(std::make_tuple(std::move(mutations), result_callback{}));
+    }
+
+    mutations.reserve(2 * mutations.size());
+
+    return do_with(std::move(mutations), service::query_state(service::client_state::for_internal_calls(), empty_service_permit()), [this, timeout, i](std::vector<mutation>& mutations, service::query_state& qs) {
+        return transform_mutations(mutations, 1, [this, &mutations, timeout, &qs] (int idx) {
+            auto& m = mutations[idx];
+            auto s = m.schema();
+
+            if (!s->cdc_options().enabled()) {
+                return make_ready_future<>();
+            }
+            // for batches/multiple mutations this is super inefficient. either partition the mutation set by schema
+            // and re-use streams, or probably better: add a cache so this lookup is a noop on second mutation
+            return get_streams(_ctxt, s->ks_name(), s->cf_name(), timeout, qs).then([this, s = std::move(s), &qs, &mutations, idx](::shared_ptr<transformer::streams_type> streams) mutable {
+                auto& m = mutations[idx]; // should not really need because of reserve, but lets be conservative
+                transformer trans(_ctxt, s, streams);
+
+                if (!s->cdc_options().preimage()) {
+                    mutations.emplace_back(trans.transform(m));
+                    return make_ready_future<>();
+                }
+
+                // Note: further improvement here would be to coalesce the pre-image selects into one
+                // iff a batch contains several modifications to the same table. Otoh, batch is rare(?)
+                // so this is premature.
+                auto f = trans.pre_image_select(qs.get_client_state(), db::consistency_level::LOCAL_QUORUM, m);
+                return f.then([trans = std::move(trans), &mutations, idx] (lw_shared_ptr<cql3::untyped_result_set> rs) mutable {
+                    mutations.push_back(trans.transform(mutations[idx], rs.get()));
+                });
            });
-        }).then([mp] {
-            return std::move(*mp);
+        }).then([](std::vector<mutation> mutations) {
+            return make_ready_future<std::tuple<std::vector<mutation>, cdc::result_callback>>(std::make_tuple(std::move(mutations), result_callback{}));
        });
    });
 }

-} // namespace cdc
+bool cdc::cdc_service::needs_cdc_augmentation(const std::vector<mutation>& mutations) const {
+    return std::any_of(mutations.begin(), mutations.end(), [](const mutation& m) {
+        return m.schema()->cdc_options().enabled();
+    });
+}
+
+future<std::tuple<std::vector<mutation>, cdc::result_callback>>
+cdc::cdc_service::augment_mutation_call(lowres_clock::time_point timeout, std::vector<mutation>&& mutations) {
+    return _impl->augment_mutation_call(timeout, std::move(mutations));
+}
--- a/cdc/cdc.hh
+++ b/cdc/cdc.hh
@@ -33,8 +33,8 @@
 #include <seastar/core/sstring.hh>

 #include "exceptions/exceptions.hh"
-#include "json.hh"
 #include "timestamp.hh"
+#include "cdc_options.hh"

 class schema;
 using schema_ptr = seastar::lw_shared_ptr<const schema>;
@@ -48,7 +48,7 @@ class token_metadata;

 namespace service {

-class migration_manager;
+class migration_notifier;
 class storage_proxy;
 class query_state;

@@ -65,110 +65,63 @@ class partition_key;

 namespace cdc {

-class options final {
-    bool _enabled = false;
-    bool _preimage = false;
-    bool _postimage = false;
-    int _ttl = 86400; // 24h in seconds
+class db_context;
+
+// Callback to be invoked on mutation finish to fix
+// the whole bit about post-image.
+// TODO: decide on what the parameters are to be for this.
+using result_callback = std::function<future<>()>;
+
+/// \brief CDC service, responsible for schema listeners
+///
+/// CDC service will listen for schema changes and iff CDC is enabled/changed
+/// create/modify/delete corresponding log tables etc as part of the schema change. 
+///
+class cdc_service {
+    class impl;
+    std::unique_ptr<impl> _impl;
 public:
-    options() = default;
-    options(const std::map<sstring, sstring>& map) {
-        if (map.find("enabled") == std::end(map)) {
-            return;
-        }
+    future<> stop();
+    cdc_service(service::storage_proxy&);
+    cdc_service(db_context);
+    ~cdc_service();

-        for (auto& p : map) {
-            if (p.first == "enabled") {
-                _enabled = p.second == "true";
-            } else if (p.first == "preimage") {
-                _preimage = p.second == "true";
-            } else if (p.first == "postimage") {
-                _postimage = p.second == "true";
-            } else if (p.first == "ttl") {
-                _ttl = std::stoi(p.second);
-            } else {
-                throw exceptions::configuration_exception("Invalid CDC option: " + p.first);
-            }
-        }
-    }
-    std::map<sstring, sstring> to_map() const {
-        if (!_enabled) {
-            return {};
-        }
-        return {
-            { "enabled", _enabled ? "true" : "false" },
-            { "preimage", _preimage ? "true" : "false" },
-            { "postimage", _postimage ? "true" : "false" },
-            { "ttl", std::to_string(_ttl) },
-        };
-    }
-
-    sstring to_sstring() const {
-        return json::to_json(to_map());
-    }
-
-    bool enabled() const { return _enabled; }
-    bool preimage() const { return _preimage; }
-    bool postimage() const { return _postimage; }
-    int ttl() const { return _ttl; }
-
-    bool operator==(const options& o) const {
-        return _enabled == o._enabled && _preimage == o._preimage && _postimage == o._postimage && _ttl == o._ttl;
-    }
-    bool operator!=(const options& o) const {
-        return !(*this == o);
-    }
+    // If any of the mutations are cdc enabled, optionally selects preimage, and adds the
+    // appropriate augments to set the log entries.
+    // Iff post-image is enabled for any of these, a non-empty callback is also
+    // returned to be invoked post the mutation query.
+    future<std::tuple<std::vector<mutation>, result_callback>> augment_mutation_call(
+        lowres_clock::time_point timeout,
+        std::vector<mutation>&& mutations
+        );
+    bool needs_cdc_augmentation(const std::vector<mutation>&) const;
 };

 struct db_context final {
    service::storage_proxy& _proxy;
-    service::migration_manager& _migration_manager;
+    service::migration_notifier& _migration_notifier;
    locator::token_metadata& _token_metadata;
    locator::snitch_ptr& _snitch;
    dht::i_partitioner& _partitioner;

    class builder final {
        service::storage_proxy& _proxy;
-        std::optional<std::reference_wrapper<service::migration_manager>> _migration_manager;
+        std::optional<std::reference_wrapper<service::migration_notifier>> _migration_notifier;
        std::optional<std::reference_wrapper<locator::token_metadata>> _token_metadata;
        std::optional<std::reference_wrapper<locator::snitch_ptr>> _snitch;
        std::optional<std::reference_wrapper<dht::i_partitioner>> _partitioner;
    public:
-        builder(service::storage_proxy& proxy) : _proxy(proxy) { }
+        builder(service::storage_proxy& proxy);

-        builder& with_migration_manager(service::migration_manager& migration_manager) {
-            _migration_manager = migration_manager;
-            return *this;
-        }
-
-        builder& with_token_metadata(locator::token_metadata& token_metadata) {
-            _token_metadata = token_metadata;
-            return *this;
-        }
-
-        builder& with_snitch(locator::snitch_ptr& snitch) {
-            _snitch = snitch;
-            return *this;
-        }
-
-        builder& with_partitioner(dht::i_partitioner& partitioner) {
-            _partitioner = partitioner;
-            return *this;
-        }
+        builder& with_migration_notifier(service::migration_notifier& migration_notifier);
+        builder& with_token_metadata(locator::token_metadata& token_metadata);
+        builder& with_snitch(locator::snitch_ptr& snitch);
+        builder& with_partitioner(dht::i_partitioner& partitioner);

        db_context build();
    };
 };

-/// \brief Sets up CDC related tables for a given table
-///
-/// This function not only creates CDC Log and CDC Description for a given table
-/// but also populates CDC Description with a list of change streams.
-///
-/// param[in] ctx object with references to database components
-/// param[in] schema schema of a table for which CDC tables are being created
-seastar::future<> setup(db_context ctx, schema_ptr schema);
-
 // cdc log table operation
 enum class operation : int8_t {
    // note: these values will eventually be read by a third party, probably not privvy to this
@@ -182,52 +135,8 @@ enum class column_op : int8_t {
    set = 0, del = 1, add = 2,
 };

-/// \brief Deletes CDC Log and CDC Description tables for a given table
-///
-/// This function cleans up all CDC related tables created for a given table.
-/// At the moment, CDC Log and CDC Description are the only affected tables.
-/// It's ok if some/all of them don't exist.
-///
-/// \param[in] ctx object with references to database components
-/// \param[in] ks_name keyspace name of a table for which CDC tables are removed
-/// \param[in] table_name name of a table for which CDC tables are removed
-///
-/// \pre This function works correctly no matter if CDC Log and/or CDC Description
-///      exist.
-seastar::future<>
-remove(db_context ctx, const seastar::sstring& ks_name, const seastar::sstring& table_name);
-
 seastar::sstring log_name(const seastar::sstring& table_name);

 seastar::sstring desc_name(const seastar::sstring& table_name);

-/// \brief For each mutation in the set appends related CDC Log mutation
-///
-/// This function should be called with a set of mutations of a table
-/// with CDC enabled. Returned set of mutations contains all original mutations
-/// and for each original mutation appends a mutation to CDC Log that reflects
-/// the change.
-///
-/// \param[in] ctx object with references to database components
-/// \param[in] s schema of a CDC enabled table which is being modified
-/// \param[in] timeout period of time after which a request is considered timed out
-/// \param[in] qs the state of the query that's being executed
-/// \param[in] mutations set of changes of a CDC enabled table
-///
-/// \return set of mutations from input parameter with relevant CDC Log mutations appended
-///
-/// \pre CDC Log and CDC Description have to exist
-/// \pre CDC Description has to be in sync with cluster topology
-///
-/// \note At the moment, cluster topology changes are not supported
-//        so the assumption that CDC Description is in sync with cluster topology
-//        is easy to enforce. When support for cluster topology changes is added
-//        it has to make sure the assumption holds.
-seastar::future<std::vector<mutation>>append_log_mutations(
-        db_context ctx,
-        schema_ptr s,
-        lowres_clock::time_point timeout,
-        service::query_state& qs,
-        std::vector<mutation> mutations);
-
 } // namespace cdc
--- a/cdc/cdc_options.hh
+++ b/cdc/cdc_options.hh
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <map>
+#include <seastar/core/sstring.hh>
+#include "seastarx.hh"
+
+namespace cdc {
+
+class options final {
+    bool _enabled = false;
+    bool _preimage = false;
+    bool _postimage = false;
+    int _ttl = 86400; // 24h in seconds
+public:
+    options() = default;
+    options(const std::map<sstring, sstring>& map);
+
+    std::map<sstring, sstring> to_map() const;
+    sstring to_sstring() const;
+
+    bool enabled() const { return _enabled; }
+    bool preimage() const { return _preimage; }
+    bool postimage() const { return _postimage; }
+    int ttl() const { return _ttl; }
+
+    bool operator==(const options& o) const;
+    bool operator!=(const options& o) const;
+};
+
+} // namespace cdc
--- a/collection_mutation.cc
+++ b/collection_mutation.cc
@@ -32,8 +32,8 @@
 collection_mutation::collection_mutation(const abstract_type& type, collection_mutation_view v)
    : _data(imr_object_type::make(data::cell::make_collection(v.data), &type.imr_state().lsa_migrator())) {}

-collection_mutation::collection_mutation(const abstract_type& type, bytes_view v)
-    : _data(imr_object_type::make(data::cell::make_collection(v), &type.imr_state().lsa_migrator())) {}
+collection_mutation::collection_mutation(const abstract_type& type, const bytes_ostream& data)
+	: _data(imr_object_type::make(data::cell::make_collection(fragment_range_view(data)), &type.imr_state().lsa_migrator())) {}

 static collection_mutation_view get_collection_mutation_view(const uint8_t* ptr)
 {
@@ -55,51 +55,49 @@ collection_mutation_view atomic_cell_or_collection::as_collection_mutation() con
 }

 bool collection_mutation_view::is_empty() const {
-  return data.with_linearized([&] (bytes_view in) { // FIXME: we can guarantee that this is in the first fragment
-    auto has_tomb = read_simple<bool>(in);
-    return !has_tomb && read_simple<uint32_t>(in) == 0;
-  });
+    auto in = collection_mutation_input_stream(data);
+    auto has_tomb = in.read_trivial<bool>();
+    return !has_tomb && in.read_trivial<uint32_t>() == 0;
 }

 template <typename F>
-GCC6_CONCEPT(requires std::is_invocable_r_v<const data::type_info&, F, bytes_view&>)
+GCC6_CONCEPT(requires std::is_invocable_r_v<const data::type_info&, F, collection_mutation_input_stream&>)
 static bool is_any_live(const atomic_cell_value_view& data, tombstone tomb, gc_clock::time_point now, F&& read_cell_type_info) {
-  return data.with_linearized([&] (bytes_view in) {
-    auto has_tomb = read_simple<bool>(in);
+    auto in = collection_mutation_input_stream(data);
+    auto has_tomb = in.read_trivial<bool>();
    if (has_tomb) {
-        auto ts = read_simple<api::timestamp_type>(in);
-        auto ttl = read_simple<gc_clock::duration::rep>(in);
+        auto ts = in.read_trivial<api::timestamp_type>();
+        auto ttl = in.read_trivial<gc_clock::duration::rep>();
        tomb.apply(tombstone{ts, gc_clock::time_point(gc_clock::duration(ttl))});
    }

-    auto nr = read_simple<uint32_t>(in);
+    auto nr = in.read_trivial<uint32_t>();
    for (uint32_t i = 0; i != nr; ++i) {
        auto& type_info = read_cell_type_info(in);
-        auto vsize = read_simple<uint32_t>(in);
-        auto value = atomic_cell_view::from_bytes(type_info, read_simple_bytes(in, vsize));
+        auto vsize = in.read_trivial<uint32_t>();
+        auto value = atomic_cell_view::from_bytes(type_info, in.read(vsize));
        if (value.is_live(tomb, now, false)) {
            return true;
        }
    }

    return false;
-  });
 }

 bool collection_mutation_view::is_any_live(const abstract_type& type, tombstone tomb, gc_clock::time_point now) const {
    return visit(type, make_visitor(
    [&] (const collection_type_impl& ctype) {
        auto& type_info = ctype.value_comparator()->imr_state().type_info();
-        return ::is_any_live(data, tomb, now, [&type_info] (bytes_view& in) -> const data::type_info& {
-            auto key_size = read_simple<uint32_t>(in);
-            in.remove_prefix(key_size);
+        return ::is_any_live(data, tomb, now, [&type_info] (collection_mutation_input_stream& in) -> const data::type_info& {
+            auto key_size = in.read_trivial<uint32_t>();
+            in.skip(key_size);
            return type_info;
        });
    },
    [&] (const user_type_impl& utype) {
-        return ::is_any_live(data, tomb, now, [&utype] (bytes_view& in) -> const data::type_info& {
-            auto key_size = read_simple<uint32_t>(in);
-            auto key = read_simple_bytes(in, key_size);
+        return ::is_any_live(data, tomb, now, [&utype] (collection_mutation_input_stream& in) -> const data::type_info& {
+            auto key_size = in.read_trivial<uint32_t>();
+            auto key = in.read(key_size);
            return utype.type(deserialize_field_index(key))->imr_state().type_info();
        });
    },
@@ -110,26 +108,25 @@ bool collection_mutation_view::is_any_live(const abstract_type& type, tombstone
 }

 template <typename F>
-GCC6_CONCEPT(requires std::is_invocable_r_v<const data::type_info&, F, bytes_view&>)
+GCC6_CONCEPT(requires std::is_invocable_r_v<const data::type_info&, F, collection_mutation_input_stream&>)
 static api::timestamp_type last_update(const atomic_cell_value_view& data, F&& read_cell_type_info) {
-  return data.with_linearized([&] (bytes_view in) {
+    auto in = collection_mutation_input_stream(data);
    api::timestamp_type max = api::missing_timestamp;
-    auto has_tomb = read_simple<bool>(in);
+    auto has_tomb = in.read_trivial<bool>();
    if (has_tomb) {
-        max = std::max(max, read_simple<api::timestamp_type>(in));
-        (void)read_simple<gc_clock::duration::rep>(in);
+        max = std::max(max, in.read_trivial<api::timestamp_type>());
+        (void)in.read_trivial<gc_clock::duration::rep>();
    }

-    auto nr = read_simple<uint32_t>(in);
+    auto nr = in.read_trivial<uint32_t>();
    for (uint32_t i = 0; i != nr; ++i) {
        auto& type_info = read_cell_type_info(in);
-        auto vsize = read_simple<uint32_t>(in);
-        auto value = atomic_cell_view::from_bytes(type_info, read_simple_bytes(in, vsize));
+        auto vsize = in.read_trivial<uint32_t>();
+        auto value = atomic_cell_view::from_bytes(type_info, in.read(vsize));
        max = std::max(value.timestamp(), max);
    }

    return max;
-  });
 }


@@ -137,16 +134,16 @@ api::timestamp_type collection_mutation_view::last_update(const abstract_type& t
    return visit(type, make_visitor(
    [&] (const collection_type_impl& ctype) {
        auto& type_info = ctype.value_comparator()->imr_state().type_info();
-        return ::last_update(data, [&type_info] (bytes_view& in) -> const data::type_info& {
-            auto key_size = read_simple<uint32_t>(in);
-            in.remove_prefix(key_size);
+        return ::last_update(data, [&type_info] (collection_mutation_input_stream& in) -> const data::type_info& {
+            auto key_size = in.read_trivial<uint32_t>();
+            in.skip(key_size);
            return type_info;
        });
    },
    [&] (const user_type_impl& utype) {
-        return ::last_update(data, [&utype] (bytes_view& in) -> const data::type_info& {
-            auto key_size = read_simple<uint32_t>(in);
-            auto key = read_simple_bytes(in, key_size);
+        return ::last_update(data, [&utype] (collection_mutation_input_stream& in) -> const data::type_info& {
+            auto key_size = in.read_trivial<uint32_t>();
+            auto key = in.read(key_size);
            return utype.type(deserialize_field_index(key))->imr_state().type_info();
        });
    },
@@ -156,6 +153,44 @@ api::timestamp_type collection_mutation_view::last_update(const abstract_type& t
    ));
 }

+std::ostream& operator<<(std::ostream& os, const collection_mutation_view::printer& cmvp) {
+    fmt::print(os, "{{collection_mutation_view ");
+    cmvp._cmv.with_deserialized(cmvp._type, [&os, &type = cmvp._type] (const collection_mutation_view_description& cmvd) {
+        bool first = true;
+        fmt::print(os, "tombstone {}", cmvd.tomb);
+        visit(type, make_visitor(
+        [&] (const collection_type_impl& ctype) {
+            auto&& key_type = ctype.name_comparator();
+            auto&& value_type = ctype.value_comparator();
+            for (auto&& [key, value] : cmvd.cells) {
+                if (!first) {
+                    fmt::print(os, ", ");
+                }
+                fmt::print(os, "{}: {}", key_type->to_string(key), atomic_cell_view::printer(*value_type, value));
+                first = false;
+            }
+        },
+        [&] (const user_type_impl& utype) {
+            for (auto&& [raw_idx, value] : cmvd.cells) {
+                if (!first) {
+                    fmt::print(os, ", ");
+                }
+                auto idx = deserialize_field_index(raw_idx);
+                fmt::print(os, "{}: {}", utype.field_name_as_string(idx), atomic_cell_view::printer(*utype.type(idx), value));
+                first = false;
+            }
+        },
+        [&] (const abstract_type& o) {
+            // Not throwing exception in this likely-to-be debug context
+            fmt::print(os, "attempted to pretty-print collection_mutation_view_description with type {}", o.name());
+        }
+        ));
+    });
+    fmt::print(os, "}}");
+    return os;
+}
+
+
 collection_mutation_description
 collection_mutation_view_description::materialize(const abstract_type& type) const {
    collection_mutation_description m;
@@ -245,8 +280,9 @@ static collection_mutation serialize_collection_mutation(
    if (tomb) {
        size += sizeof(tomb.timestamp) + sizeof(tomb.deletion_time);
    }
-    bytes ret(bytes::initialized_later(), size);
-    bytes::iterator out = ret.begin();
+    bytes_ostream ret;
+    ret.reserve(size);
+    auto out = ret.write_begin();
    *out++ = bool(tomb);
    if (tomb) {
        write(out, tomb.timestamp);
@@ -385,19 +421,19 @@ collection_mutation difference(const abstract_type& type, collection_mutation_vi
 }

 template <typename F>
-GCC6_CONCEPT(requires std::is_invocable_r_v<std::pair<bytes_view, atomic_cell_view>, F, bytes_view&>)
+GCC6_CONCEPT(requires std::is_invocable_r_v<std::pair<bytes_view, atomic_cell_view>, F, collection_mutation_input_stream&>)
 static collection_mutation_view_description
-deserialize_collection_mutation(bytes_view in, F&& read_kv) {
+deserialize_collection_mutation(collection_mutation_input_stream& in, F&& read_kv) {
    collection_mutation_view_description ret;

-    auto has_tomb = read_simple<bool>(in);
+    auto has_tomb = in.read_trivial<bool>();
    if (has_tomb) {
-        auto ts = read_simple<api::timestamp_type>(in);
-        auto ttl = read_simple<gc_clock::duration::rep>(in);
+        auto ts = in.read_trivial<api::timestamp_type>();
+        auto ttl = in.read_trivial<gc_clock::duration::rep>();
        ret.tomb = tombstone{ts, gc_clock::time_point(gc_clock::duration(ttl))};
    }

-    auto nr = read_simple<uint32_t>(in);
+    auto nr = in.read_trivial<uint32_t>();
    ret.cells.reserve(nr);
    for (uint32_t i = 0; i != nr; ++i) {
        ret.cells.push_back(read_kv(in));
@@ -408,28 +444,28 @@ deserialize_collection_mutation(bytes_view in, F&& read_kv) {
 }

 collection_mutation_view_description
-deserialize_collection_mutation(const abstract_type& type, bytes_view in) {
+deserialize_collection_mutation(const abstract_type& type, collection_mutation_input_stream& in) {
    return visit(type, make_visitor(
    [&] (const collection_type_impl& ctype) {
        // value_comparator(), ugh
        auto& type_info = ctype.value_comparator()->imr_state().type_info();
-        return deserialize_collection_mutation(in, [&type_info] (bytes_view& in) {
+        return deserialize_collection_mutation(in, [&type_info] (collection_mutation_input_stream& in) {
            // FIXME: we could probably avoid the need for size
-            auto ksize = read_simple<uint32_t>(in);
-            auto key = read_simple_bytes(in, ksize);
-            auto vsize = read_simple<uint32_t>(in);
-            auto value = atomic_cell_view::from_bytes(type_info, read_simple_bytes(in, vsize));
+            auto ksize = in.read_trivial<uint32_t>();
+            auto key = in.read(ksize);
+            auto vsize = in.read_trivial<uint32_t>();
+            auto value = atomic_cell_view::from_bytes(type_info, in.read(vsize));
            return std::make_pair(key, value);
        });
    },
    [&] (const user_type_impl& utype) {
-        return deserialize_collection_mutation(in, [&utype] (bytes_view& in) {
+        return deserialize_collection_mutation(in, [&utype] (collection_mutation_input_stream& in) {
            // FIXME: we could probably avoid the need for size
-            auto ksize = read_simple<uint32_t>(in);
-            auto key = read_simple_bytes(in, ksize);
-            auto vsize = read_simple<uint32_t>(in);
+            auto ksize = in.read_trivial<uint32_t>();
+            auto key = in.read(ksize);
+            auto vsize = in.read_trivial<uint32_t>();
            auto value = atomic_cell_view::from_bytes(
-                    utype.type(deserialize_field_index(key))->imr_state().type_info(), read_simple_bytes(in, vsize));
+                    utype.type(deserialize_field_index(key))->imr_state().type_info(), in.read(vsize));
            return std::make_pair(key, value);
        });
    },
--- a/collection_mutation.hh
+++ b/collection_mutation.hh
@@ -26,8 +26,12 @@
 #include "gc_clock.hh"
 #include "atomic_cell.hh"
 #include "cql_serialization_format.hh"
+#include "marshal_exception.hh"
+#include "utils/linearizing_input_stream.hh"
+#include <iosfwd>

 class abstract_type;
+class bytes_ostream;
 class compaction_garbage_collector;
 class row_tombstone;

@@ -66,10 +70,13 @@ struct collection_mutation_view_description {
    collection_mutation serialize(const abstract_type&) const;
 };

+using collection_mutation_input_stream = utils::linearizing_input_stream<atomic_cell_value_view, marshal_exception>;
+
 // Given a linearized collection_mutation_view, returns an auxiliary struct allowing the inspection of each cell.
-// The struct is an observer of the data given by the collection_mutation_view and doesn't extend its lifetime.
+// The struct is an observer of the data given by the collection_mutation_view and is only valid while the
+// passed in `collection_mutation_input_stream` is alive.
 // The function needs to be given the type of stored data to reconstruct the structural information.
-collection_mutation_view_description deserialize_collection_mutation(const abstract_type&, bytes_view);
+collection_mutation_view_description deserialize_collection_mutation(const abstract_type&, collection_mutation_input_stream&);

 class collection_mutation_view {
 public:
@@ -90,10 +97,18 @@ public:
    // calls it on the corresponding description of `this`.
    template <typename F>
    inline decltype(auto) with_deserialized(const abstract_type& type, F f) const {
-        return data.with_linearized([&] (bytes_view bv) {
-            return f(deserialize_collection_mutation(type, std::move(bv)));
-        });
+        auto stream = collection_mutation_input_stream(data);
+        return f(deserialize_collection_mutation(type, stream));
    }
+
+    class printer {
+        const abstract_type& _type;
+        const collection_mutation_view& _cmv;
+    public:
+        printer(const abstract_type& type, const collection_mutation_view& cmv)
+                : _type(type), _cmv(cmv) {}
+        friend std::ostream& operator<<(std::ostream& os, const printer& cmvp);
+    };
 };

 // A serialized mutation of a collection of cells.
@@ -112,7 +127,7 @@ public:

    collection_mutation() {}
    collection_mutation(const abstract_type&, collection_mutation_view);
-    collection_mutation(const abstract_type&, bytes_view);
+    collection_mutation(const abstract_type& type, const bytes_ostream& data);
    operator collection_mutation_view() const;
 };

--- a/compound.hh
+++ b/compound.hh
@@ -74,8 +74,8 @@ private:
     *   <len(value1)><value1><len(value2)><value2>...<len(value_n)><value_n>
     *
     */
-    template<typename RangeOfSerializedComponents>
-    static void serialize_value(RangeOfSerializedComponents&& values, bytes::iterator& out) {
+    template<typename RangeOfSerializedComponents, typename CharOutputIterator>
+    static void serialize_value(RangeOfSerializedComponents&& values, CharOutputIterator& out) {
        for (auto&& val : values) {
            assert(val.size() <= std::numeric_limits<size_type>::max());
            write<size_type>(out, size_type(val.size()));
--- a/compound_compat.hh
+++ b/compound_compat.hh
@@ -248,15 +248,16 @@ private:
    static size_t size(const data_value& val) {
        return val.serialized_size();
    }
-    template<typename Value, typename = std::enable_if_t<!std::is_same<data_value, std::decay_t<Value>>::value>>
-    static void write_value(Value&& val, bytes::iterator& out) {
+    template<typename Value, typename CharOutputIterator, typename = std::enable_if_t<!std::is_same<data_value, std::decay_t<Value>>::value>>
+    static void write_value(Value&& val, CharOutputIterator& out) {
        out = std::copy(val.begin(), val.end(), out);
    }
-    static void write_value(const data_value& val, bytes::iterator& out) {
+    template <typename CharOutputIterator>
+    static void write_value(const data_value& val, CharOutputIterator& out) {
        val.serialize(out);
    }
-    template<typename RangeOfSerializedComponents>
-    static void serialize_value(RangeOfSerializedComponents&& values, bytes::iterator& out, bool is_compound) {
+    template<typename RangeOfSerializedComponents, typename CharOutputIterator>
+    static void serialize_value(RangeOfSerializedComponents&& values, CharOutputIterator& out, bool is_compound) {
        if (!is_compound) {
            auto it = values.begin();
            write_value(std::forward<decltype(*it)>(*it), out);
--- a/concrete_types.hh
+++ b/concrete_types.hh
@@ -92,14 +92,17 @@ struct duration_type_impl final : public concrete_type<cql_duration> {

 struct timestamp_type_impl final : public simple_type_impl<db_clock::time_point> {
    timestamp_type_impl();
+    static db_clock::time_point from_sstring(sstring_view s);
 };

 struct simple_date_type_impl final : public simple_type_impl<uint32_t> {
    simple_date_type_impl();
+    static uint32_t from_sstring(sstring_view s);
 };

 struct time_type_impl final : public simple_type_impl<int64_t> {
    time_type_impl();
+    static int64_t from_sstring(sstring_view s);
 };

 struct string_type_impl : public concrete_type<sstring> {
@@ -129,6 +132,7 @@ using timestamp_date_base_class = concrete_type<db_clock::time_point>;

 struct timeuuid_type_impl final : public concrete_type<utils::UUID> {
    timeuuid_type_impl();
+    static utils::UUID from_sstring(sstring_view s);
 };

 struct varint_type_impl final : public concrete_type<boost::multiprecision::cpp_int> {
@@ -137,10 +141,13 @@ struct varint_type_impl final : public concrete_type<boost::multiprecision::cpp_

 struct inet_addr_type_impl final : public concrete_type<seastar::net::inet_address> {
    inet_addr_type_impl();
+    static sstring to_sstring(const seastar::net::inet_address& addr);
+    static seastar::net::inet_address from_sstring(sstring_view s);
 };

 struct uuid_type_impl final : public concrete_type<utils::UUID> {
    uuid_type_impl();
+    static utils::UUID from_sstring(sstring_view s);
 };

 template <typename Func> using visit_ret_type = std::invoke_result_t<Func, const ascii_type_impl&>;
@@ -241,3 +248,28 @@ static inline visit_ret_type<Func> visit(const abstract_type& t, Func&& f) {
    }
    __builtin_unreachable();
 }
+
+template <typename Func> struct data_value_visitor {
+    const void* v;
+    Func& f;
+    auto operator()(const empty_type_impl& t) { return f(t, v); }
+    auto operator()(const counter_type_impl& t) { return f(t, v); }
+    auto operator()(const reversed_type_impl& t) { return f(t, v); }
+    template <typename T> auto operator()(const T& t) {
+        return f(t, reinterpret_cast<const typename T::native_type*>(v));
+    }
+};
+
+// Given an abstract_type and a void pointer to an object of that
+// type, call f with the runtime type of t and v casted to the
+// corresponding native type.
+// This takes an abstract_type and a void pointer instead of a
+// data_value to support reversed_type_impl without requiring that
+// each visitor create a new data_value just to recurse.
+template <typename Func> inline auto visit(const abstract_type& t, const void* v, Func&& f) {
+    return ::visit(t, data_value_visitor<Func>{v, f});
+}
+
+template <typename Func> inline auto visit(const data_value& v, Func&& f) {
+    return ::visit(*v.type(), v._value, f);
+}
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -25,15 +25,19 @@
 # multiple tokens per node, see http://cassandra.apache.org/doc/latest/operating
 num_tokens: 256

+# Directory where Scylla should store all its files, which are commitlog,
+# data, hints, view_hints and saved_caches subdirectories. All of these
+# subs can be overriden by the respective options below.
+# If unset, the value defaults to /var/lib/scylla
+# workdir: /var/lib/scylla
+
 # Directory where Scylla should store data on disk.
-# If not set, the default directory is /var/lib/scylla/data.
-data_file_directories:
-    - /var/lib/scylla/data
+# data_file_directories:
+#    - /var/lib/scylla/data

 # commit log.  when running on magnetic HDD, this should be a
 # separate spindle than the data directories.
-# If not set, the default directory is /var/lib/scylla/commitlog.
-commitlog_directory: /var/lib/scylla/commitlog
+# commitlog_directory: /var/lib/scylla/commitlog

 # commitlog_sync may be either "periodic" or "batch."
 #
@@ -244,6 +248,7 @@ batch_size_fail_threshold_in_kb: 50
 # experimental_features:
 #     - cdc
 #     - lwt
+#     - udf

 # The directory where hints files are stored if hinted handoff is enabled.
 # hints_directory: /var/lib/scylla/hints
@@ -262,24 +267,6 @@ batch_size_fail_threshold_in_kb: 50
 # created until it has been seen alive and gone down again.
 # max_hint_window_in_ms: 10800000 # 3 hours

-# Maximum throttle in KBs per second, per delivery thread.  This will be
-# reduced proportionally to the number of nodes in the cluster.  (If there
-# are two nodes in the cluster, each delivery thread will use the maximum
-# rate; if there are three, each will throttle to half of the maximum,
-# since we expect two nodes to be delivering hints simultaneously.)
-# hinted_handoff_throttle_in_kb: 1024
-# Number of threads with which to deliver hints;
-# Consider increasing this number when you have multi-dc deployments, since
-# cross-dc handoff tends to be slower
-# max_hints_delivery_threads: 2
-
-###################################################
-## Not currently supported, reserved for future use
-###################################################
-
-# Maximum throttle in KBs per second, total. This will be
-# reduced proportionally to the number of nodes in the cluster.
-# batchlog_replay_throttle_in_kb: 1024

 # Validity period for permissions cache (fetching permissions can be an
 # expensive operation depending on the authorizer, CassandraAuthorizer is
@@ -307,120 +294,6 @@ batch_size_fail_threshold_in_kb: 50
 #
 partitioner: org.apache.cassandra.dht.Murmur3Partitioner

-# Maximum size of the key cache in memory.
-#
-# Each key cache hit saves 1 seek and each row cache hit saves 2 seeks at the
-# minimum, sometimes more. The key cache is fairly tiny for the amount of
-# time it saves, so it's worthwhile to use it at large numbers.
-# The row cache saves even more time, but must contain the entire row,
-# so it is extremely space-intensive. It's best to only use the
-# row cache if you have hot rows or static rows.
-#
-# NOTE: if you reduce the size, you may not get you hottest keys loaded on startup.
-#
-# Default value is empty to make it "auto" (min(5% of Heap (in MB), 100MB)). Set to 0 to disable key cache.
-# key_cache_size_in_mb:
-
-# Duration in seconds after which Scylla should
-# save the key cache. Caches are saved to saved_caches_directory as
-# specified in this configuration file.
-#
-# Saved caches greatly improve cold-start speeds, and is relatively cheap in
-# terms of I/O for the key cache. Row cache saving is much more expensive and
-# has limited use.
-#
-# Default is 14400 or 4 hours.
-# key_cache_save_period: 14400
-
-# Number of keys from the key cache to save
-# Disabled by default, meaning all keys are going to be saved
-# key_cache_keys_to_save: 100
-
-# Maximum size of the row cache in memory.
-# NOTE: if you reduce the size, you may not get you hottest keys loaded on startup.
-#
-# Default value is 0, to disable row caching.
-# row_cache_size_in_mb: 0
-
-# Duration in seconds after which Scylla should
-# save the row cache. Caches are saved to saved_caches_directory as specified
-# in this configuration file.
-#
-# Saved caches greatly improve cold-start speeds, and is relatively cheap in
-# terms of I/O for the key cache. Row cache saving is much more expensive and
-# has limited use.
-#
-# Default is 0 to disable saving the row cache.
-# row_cache_save_period: 0
-
-# Number of keys from the row cache to save
-# Disabled by default, meaning all keys are going to be saved
-# row_cache_keys_to_save: 100
-
-# Maximum size of the counter cache in memory.
-#
-# Counter cache helps to reduce counter locks' contention for hot counter cells.
-# In case of RF = 1 a counter cache hit will cause Scylla to skip the read before
-# write entirely. With RF > 1 a counter cache hit will still help to reduce the duration
-# of the lock hold, helping with hot counter cell updates, but will not allow skipping
-# the read entirely. Only the local (clock, count) tuple of a counter cell is kept
-# in memory, not the whole counter, so it's relatively cheap.
-#
-# NOTE: if you reduce the size, you may not get you hottest keys loaded on startup.
-#
-# Default value is empty to make it "auto" (min(2.5% of Heap (in MB), 50MB)). Set to 0 to disable counter cache.
-# NOTE: if you perform counter deletes and rely on low gcgs, you should disable the counter cache.
-# counter_cache_size_in_mb:
-
-# Duration in seconds after which Scylla should
-# save the counter cache (keys only). Caches are saved to saved_caches_directory as
-# specified in this configuration file.
-#
-# Default is 7200 or 2 hours.
-# counter_cache_save_period: 7200
-
-# Number of keys from the counter cache to save
-# Disabled by default, meaning all keys are going to be saved
-# counter_cache_keys_to_save: 100
-
-# The off-heap memory allocator.  Affects storage engine metadata as
-# well as caches.  Experiments show that JEMAlloc saves some memory
-# than the native GCC allocator (i.e., JEMalloc is more
-# fragmentation-resistant).
-# 
-# Supported values are: NativeAllocator, JEMallocAllocator
-#
-# If you intend to use JEMallocAllocator you have to install JEMalloc as library and
-# modify cassandra-env.sh as directed in the file.
-#
-# Defaults to NativeAllocator
-# memory_allocator: NativeAllocator
-
-# saved caches
-# If not set, the default directory is /var/lib/scylla/saved_caches.
-# saved_caches_directory: /var/lib/scylla/saved_caches
-
-
-
-# For workloads with more data than can fit in memory, Scylla's
-# bottleneck will be reads that need to fetch data from
-# disk. "concurrent_reads" should be set to (16 * number_of_drives) in
-# order to allow the operations to enqueue low enough in the stack
-# that the OS and drives can reorder them. Same applies to
-# "concurrent_counter_writes", since counter writes read the current
-# values before incrementing and writing them back.
-#
-# On the other hand, since writes are almost never IO bound, the ideal
-# number of "concurrent_writes" is dependent on the number of cores in
-# your system; (8 * number_of_cores) is a good rule of thumb.
-# concurrent_reads: 32
-# concurrent_writes: 32
-# concurrent_counter_writes: 32
-
-# Total memory to use for sstable-reading buffers.  Defaults to
-# the smaller of 1/4 of heap or 512MB.
-# file_cache_size_in_mb: 512
-
 # Total space to use for commitlogs.
 #
 # If space gets above this value (it will round up to the next nearest
@@ -432,28 +305,6 @@ partitioner: org.apache.cassandra.dht.Murmur3Partitioner
 # available for Scylla.
 commitlog_total_space_in_mb: -1

-# A fixed memory pool size in MB for for SSTable index summaries. If left
-# empty, this will default to 5% of the heap size. If the memory usage of
-# all index summaries exceeds this limit, SSTables with low read rates will
-# shrink their index summaries in order to meet this limit.  However, this
-# is a best-effort process. In extreme conditions Scylla may need to use
-# more than this amount of memory.
-# index_summary_capacity_in_mb:
-
-# How frequently index summaries should be resampled.  This is done
-# periodically to redistribute memory from the fixed-size pool to sstables
-# proportional their recent read rates.  Setting to -1 will disable this
-# process, leaving existing index summaries at their current sampling level.
-# index_summary_resize_interval_in_minutes: 60
-
-# Whether to, when doing sequential writing, fsync() at intervals in
-# order to force the operating system to flush the dirty
-# buffers. Enable this to avoid sudden dirty buffer flushing from
-# impacting read latencies. Almost always a good idea on SSDs; not
-# necessarily on platters.
-# trickle_fsync: false
-# trickle_fsync_interval_in_kb: 10240
-
 # TCP port, for commands and data
 # For security reasons, you should not expose this port to the internet.  Firewall it if needed.
 # storage_port: 7000
@@ -466,91 +317,21 @@ commitlog_total_space_in_mb: -1
 # listen_interface: eth0
 # listen_interface_prefer_ipv6: false

-# Internode authentication backend, implementing IInternodeAuthenticator;
-# used to allow/disallow connections from peer nodes.
-# internode_authenticator: org.apache.cassandra.auth.AllowAllInternodeAuthenticator
-
 # Whether to start the native transport server.
 # Please note that the address on which the native transport is bound is the
 # same as the rpc_address. The port however is different and specified below.
 # start_native_transport: true

-# The maximum threads for handling requests when the native transport is used.
-# This is similar to rpc_max_threads though the default differs slightly (and
-# there is no native_transport_min_threads, idle threads will always be stopped
-# after 30 seconds).
-# native_transport_max_threads: 128
-#
 # The maximum size of allowed frame. Frame (requests) larger than this will
 # be rejected as invalid. The default is 256MB.
 # native_transport_max_frame_size_in_mb: 256

-# The maximum number of concurrent client connections.
-# The default is -1, which means unlimited.
-# native_transport_max_concurrent_connections: -1
-
-# The maximum number of concurrent client connections per source ip.
-# The default is -1, which means unlimited.
-# native_transport_max_concurrent_connections_per_ip: -1
-
 # Whether to start the thrift rpc server.
 # start_rpc: true

 # enable or disable keepalive on rpc/native connections
 # rpc_keepalive: true

-# Scylla provides two out-of-the-box options for the RPC Server:
-#
-# sync  -> One thread per thrift connection. For a very large number of clients, memory
-#          will be your limiting factor. On a 64 bit JVM, 180KB is the minimum stack size
-#          per thread, and that will correspond to your use of virtual memory (but physical memory
-#          may be limited depending on use of stack space).
-#
-# hsha  -> Stands for "half synchronous, half asynchronous." All thrift clients are handled
-#          asynchronously using a small number of threads that does not vary with the amount
-#          of thrift clients (and thus scales well to many clients). The rpc requests are still
-#          synchronous (one thread per active request). If hsha is selected then it is essential
-#          that rpc_max_threads is changed from the default value of unlimited.
-#
-# The default is sync because on Windows hsha is about 30% slower.  On Linux,
-# sync/hsha performance is about the same, with hsha of course using less memory.
-#
-# Alternatively,  can provide your own RPC server by providing the fully-qualified class name
-# of an o.a.c.t.TServerFactory that can create an instance of it.
-# rpc_server_type: sync
-
-# Uncomment rpc_min|max_thread to set request pool size limits.
-#
-# Regardless of your choice of RPC server (see above), the number of maximum requests in the
-# RPC thread pool dictates how many concurrent requests are possible (but if you are using the sync
-# RPC server, it also dictates the number of clients that can be connected at all).
-#
-# The default is unlimited and thus provides no protection against clients overwhelming the server. You are
-# encouraged to set a maximum that makes sense for you in production, but do keep in mind that
-# rpc_max_threads represents the maximum number of client requests this server may execute concurrently.
-#
-# rpc_min_threads: 16
-# rpc_max_threads: 2048
-
-# uncomment to set socket buffer sizes on rpc connections
-# rpc_send_buff_size_in_bytes:
-# rpc_recv_buff_size_in_bytes:
-
-# Uncomment to set socket buffer size for internode communication
-# Note that when setting this, the buffer size is limited by net.core.wmem_max
-# and when not setting it it is defined by net.ipv4.tcp_wmem
-# See:
-# /proc/sys/net/core/wmem_max
-# /proc/sys/net/core/rmem_max
-# /proc/sys/net/ipv4/tcp_wmem
-# /proc/sys/net/ipv4/tcp_rmem
-# and: man tcp
-# internode_send_buff_size_in_bytes:
-# internode_recv_buff_size_in_bytes:
-
-# Frame size for thrift (maximum message length).
-# thrift_framed_transport_size_in_mb: 15
-
 # Set to true to have Scylla create a hard link to each sstable
 # flushed or streamed locally in a backups/ subdirectory of the
 # keyspace data.  Removing these links is the operator's
@@ -593,30 +374,6 @@ commitlog_total_space_in_mb: -1
 # column_index_size_in_kb: 64


-# Number of simultaneous compactions to allow, NOT including
-# validation "compactions" for anti-entropy repair.  Simultaneous
-# compactions can help preserve read performance in a mixed read/write
-# workload, by mitigating the tendency of small sstables to accumulate
-# during a single long running compactions. The default is usually
-# fine and if you experience problems with compaction running too
-# slowly or too fast, you should look at
-# compaction_throughput_mb_per_sec first.
-#
-# concurrent_compactors defaults to the smaller of (number of disks,
-# number of cores), with a minimum of 2 and a maximum of 8.
-# 
-# If your data directories are backed by SSD, you should increase this
-# to the number of cores.
-#concurrent_compactors: 1
-
-# Throttles compaction to the given total throughput across the entire
-# system. The faster you insert data, the faster you need to compact in
-# order to keep the sstable count down, but in general, setting this to
-# 16 to 32 times the rate you are inserting data is more than sufficient.
-# Setting this to 0 disables throttling. Note that this account for all types
-# of compaction, including validation compaction.
-# compaction_throughput_mb_per_sec: 16
-
 # Log a warning when writing partitions larger than this value
 # compaction_large_partition_warning_threshold_mb: 1000

@@ -629,18 +386,6 @@ commitlog_total_space_in_mb: -1
 # Log a warning when row number is larger than this value
 # compaction_rows_count_warning_threshold: 100000

-# When compacting, the replacement sstable(s) can be opened before they
-# are completely written, and used in place of the prior sstables for
-# any range that has been written. This helps to smoothly transfer reads 
-# between the sstables, reducing page cache churn and keeping hot rows hot
-# sstable_preemptive_open_interval_in_mb: 50
-
-# Throttles all streaming file transfer between the datacenters,
-# this setting allows users to throttle inter dc stream throughput in addition
-# to throttling all network stream traffic as configured with
-# stream_throughput_outbound_megabits_per_sec
-# inter_dc_stream_throughput_outbound_megabits_per_sec:
-
 # How long the coordinator should wait for seq or index scans to complete
 # range_request_timeout_in_ms: 10000
 # How long the coordinator should wait for writes to complete
@@ -655,88 +400,23 @@ commitlog_total_space_in_mb: -1
 # The default timeout for other, miscellaneous operations
 # request_timeout_in_ms: 10000

-# Enable operation timeout information exchange between nodes to accurately
-# measure request timeouts.  If disabled, replicas will assume that requests
-# were forwarded to them instantly by the coordinator, which means that
-# under overload conditions we will waste that much extra time processing 
-# already-timed-out requests.
-#
-# Warning: before enabling this property make sure to ntp is installed
-# and the times are synchronized between the nodes.
-# cross_node_timeout: false
-
-# Enable socket timeout for streaming operation.
-# When a timeout occurs during streaming, streaming is retried from the start
-# of the current file. This _can_ involve re-streaming an important amount of
-# data, so you should avoid setting the value too low.
-# Default value is 0, which never timeout streams.
-# streaming_socket_timeout_in_ms: 0
-
-# controls how often to perform the more expensive part of host score
-# calculation
-# dynamic_snitch_update_interval_in_ms: 100 
-
-# controls how often to reset all host scores, allowing a bad host to
-# possibly recover
-# dynamic_snitch_reset_interval_in_ms: 600000
-
-# if set greater than zero and read_repair_chance is < 1.0, this will allow
-# 'pinning' of replicas to hosts in order to increase cache capacity.
-# The badness threshold will control how much worse the pinned host has to be
-# before the dynamic snitch will prefer other replicas over it.  This is
-# expressed as a double which represents a percentage.  Thus, a value of
-# 0.2 means Scylla would continue to prefer the static snitch values
-# until the pinned host was 20% worse than the fastest.
-# dynamic_snitch_badness_threshold: 0.1
-
-# request_scheduler -- Set this to a class that implements
-# RequestScheduler, which will schedule incoming client requests
-# according to the specific policy. This is useful for multi-tenancy
-# with a single Scylla cluster.
-# NOTE: This is specifically for requests from the client and does
-# not affect inter node communication.
-# org.apache.cassandra.scheduler.NoScheduler - No scheduling takes place
-# org.apache.cassandra.scheduler.RoundRobinScheduler - Round robin of
-# client requests to a node with a separate queue for each
-# request_scheduler_id. The scheduler is further customized by
-# request_scheduler_options as described below.
-# request_scheduler: org.apache.cassandra.scheduler.NoScheduler
-
-# Scheduler Options vary based on the type of scheduler
-# NoScheduler - Has no options
-# RoundRobin
-#  - throttle_limit -- The throttle_limit is the number of in-flight
-#                      requests per client.  Requests beyond 
-#                      that limit are queued up until
-#                      running requests can complete.
-#                      The value of 80 here is twice the number of
-#                      concurrent_reads + concurrent_writes.
-#  - default_weight -- default_weight is optional and allows for
-#                      overriding the default which is 1.
-#  - weights -- Weights are optional and will default to 1 or the
-#               overridden default_weight. The weight translates into how
-#               many requests are handled during each turn of the
-#               RoundRobin, based on the scheduler id.
-#
-# request_scheduler_options:
-#    throttle_limit: 80
-#    default_weight: 5
-#    weights:
-#      Keyspace1: 1
-#      Keyspace2: 5
-
-# request_scheduler_id -- An identifier based on which to perform
-# the request scheduling. Currently the only valid option is keyspace.
-# request_scheduler_id: keyspace
-
 # Enable or disable inter-node encryption. 
 # You must also generate keys and provide the appropriate key and trust store locations and passwords. 
-# No custom encryption options are currently enabled. The available options are:
 #
 # The available internode options are : all, none, dc, rack
 # If set to dc scylla  will encrypt the traffic between the DCs
 # If set to rack scylla  will encrypt the traffic between the racks
 #
+# SSL/TLS algorithm and ciphers used can be controlled by 
+# the priority_string parameter. Info on priority string
+# syntax and values is available at:
+#   https://gnutls.org/manual/html_node/Priority-Strings.html
+#
+# The require_client_auth parameter allows you to 
+# restrict access to service based on certificate 
+# validation. Client must provide a certificate 
+# accepted by the used trust store to connect.
+# 
 # server_encryption_options:
 #    internode_encryption: none
 #    certificate: conf/scylla.crt
--- a/configure.py
+++ b/configure.py
@@ -144,8 +144,12 @@ def flag_supported(flag, compiler):

 def gold_supported(compiler):
    src_main = 'int main(int argc, char **argv) { return 0; }'
-    if try_compile_and_link(source=src_main, flags=['-fuse-ld=gold'], compiler=compiler):
-        return '-fuse-ld=gold'
+    link_flags = ['-fuse-ld=gold']
+    if try_compile_and_link(source=src_main, flags=link_flags, compiler=compiler):
+        threads_flag = '-Wl,--threads'
+        if try_compile_and_link(source=src_main, flags=link_flags + [threads_flag], compiler=compiler):
+            link_flags.append(threads_flag)
+        return ' '.join(link_flags)
    else:
        print('Note: gold not found; using default system linker')
        return ''
@@ -257,139 +261,142 @@ modes = {
 }

 scylla_tests = [
-    'tests/mutation_test',
-    'tests/mvcc_test',
-    'tests/mutation_fragment_test',
-    'tests/flat_mutation_reader_test',
-    'tests/schema_registry_test',
-    'tests/canonical_mutation_test',
-    'tests/range_test',
-    'tests/types_test',
-    'tests/keys_test',
-    'tests/partitioner_test',
-    'tests/frozen_mutation_test',
-    'tests/serialized_action_test',
-    'tests/hint_test',
-    'tests/clustering_ranges_walker_test',
-    'tests/perf/perf_mutation',
-    'tests/lsa_async_eviction_test',
-    'tests/lsa_sync_eviction_test',
-    'tests/row_cache_alloc_stress',
-    'tests/perf_row_cache_update',
-    'tests/perf/perf_hash',
-    'tests/perf/perf_cql_parser',
-    'tests/perf/perf_simple_query',
-    'tests/perf/perf_fast_forward',
-    'tests/perf/perf_cache_eviction',
-    'tests/cache_flat_mutation_reader_test',
-    'tests/row_cache_stress_test',
-    'tests/memory_footprint',
-    'tests/perf/perf_sstable',
-    'tests/cdc_test',
-    'tests/cql_query_test',
-    'tests/user_types_test',
-    'tests/secondary_index_test',
-    'tests/json_cql_query_test',
-    'tests/filtering_test',
-    'tests/storage_proxy_test',
-    'tests/schema_change_test',
-    'tests/mutation_reader_test',
-    'tests/mutation_query_test',
-    'tests/row_cache_test',
-    'tests/test-serialization',
-    'tests/broken_sstable_test',
-    'tests/sstable_test',
-    'tests/sstable_datafile_test',
-    'tests/sstable_3_x_test',
-    'tests/sstable_mutation_test',
-    'tests/sstable_resharding_test',
-    'tests/memtable_test',
-    'tests/commitlog_test',
-    'tests/cartesian_product_test',
-    'tests/hash_test',
-    'tests/map_difference_test',
-    'tests/message',
-    'tests/gossip',
-    'tests/gossip_test',
-    'tests/compound_test',
-    'tests/config_test',
-    'tests/gossiping_property_file_snitch_test',
-    'tests/ec2_snitch_test',
-    'tests/gce_snitch_test',
-    'tests/snitch_reset_test',
-    'tests/network_topology_strategy_test',
-    'tests/query_processor_test',
-    'tests/batchlog_manager_test',
-    'tests/bytes_ostream_test',
-    'tests/UUID_test',
-    'tests/murmur_hash_test',
-    'tests/allocation_strategy_test',
-    'tests/logalloc_test',
-    'tests/log_heap_test',
-    'tests/managed_vector_test',
-    'tests/crc_test',
-    'tests/checksum_utils_test',
-    'tests/flush_queue_test',
-    'tests/dynamic_bitset_test',
-    'tests/auth_test',
-    'tests/idl_test',
-    'tests/range_tombstone_list_test',
-    'tests/anchorless_list_test',
-    'tests/database_test',
-    'tests/nonwrapping_range_test',
-    'tests/input_stream_test',
-    'tests/virtual_reader_test',
-    'tests/view_schema_test',
-    'tests/view_build_test',
-    'tests/view_complex_test',
-    'tests/counter_test',
-    'tests/cell_locker_test',
-    'tests/row_locker_test',
-    'tests/streaming_histogram_test',
-    'tests/duration_test',
-    'tests/vint_serialization_test',
-    'tests/continuous_data_consumer_test',
-    'tests/compress_test',
-    'tests/chunked_vector_test',
-    'tests/loading_cache_test',
-    'tests/castas_fcts_test',
-    'tests/big_decimal_test',
-    'tests/aggregate_fcts_test',
-    'tests/role_manager_test',
-    'tests/caching_options_test',
-    'tests/auth_resource_test',
-    'tests/cql_auth_query_test',
-    'tests/enum_set_test',
-    'tests/extensions_test',
-    'tests/cql_auth_syntax_test',
-    'tests/querier_cache',
-    'tests/limiting_data_source_test',
-    'tests/meta_test',
-    'tests/imr_test',
-    'tests/partition_data_test',
-    'tests/reusable_buffer_test',
-    'tests/mutation_writer_test',
-    'tests/observable_test',
-    'tests/transport_test',
-    'tests/fragmented_temporary_buffer_test',
-    'tests/json_test',
-    'tests/auth_passwords_test',
-    'tests/multishard_mutation_query_test',
-    'tests/top_k_test',
-    'tests/utf8_test',
-    'tests/small_vector_test',
-    'tests/data_listeners_test',
-    'tests/truncation_migration_test',
-    'tests/like_matcher_test',
-    'tests/enum_option_test',
+    'test/boost/UUID_test',
+    'test/boost/aggregate_fcts_test',
+    'test/boost/allocation_strategy_test',
+    'test/boost/anchorless_list_test',
+    'test/boost/auth_passwords_test',
+    'test/boost/auth_resource_test',
+    'test/boost/auth_test',
+    'test/boost/batchlog_manager_test',
+    'test/boost/big_decimal_test',
+    'test/boost/broken_sstable_test',
+    'test/boost/bytes_ostream_test',
+    'test/boost/cache_flat_mutation_reader_test',
+    'test/boost/caching_options_test',
+    'test/boost/canonical_mutation_test',
+    'test/boost/cartesian_product_test',
+    'test/boost/castas_fcts_test',
+    'test/boost/cdc_test',
+    'test/boost/cell_locker_test',
+    'test/boost/checksum_utils_test',
+    'test/boost/chunked_vector_test',
+    'test/boost/clustering_ranges_walker_test',
+    'test/boost/commitlog_test',
+    'test/boost/compound_test',
+    'test/boost/compress_test',
+    'test/boost/config_test',
+    'test/boost/continuous_data_consumer_test',
+    'test/boost/counter_test',
+    'test/boost/cql_auth_query_test',
+    'test/boost/cql_auth_syntax_test',
+    'test/boost/cql_query_test',
+    'test/boost/crc_test',
+    'test/boost/data_listeners_test',
+    'test/boost/database_test',
+    'test/boost/duration_test',
+    'test/boost/dynamic_bitset_test',
+    'test/boost/enum_option_test',
+    'test/boost/enum_set_test',
+    'test/boost/extensions_test',
+    'test/boost/filtering_test',
+    'test/boost/flat_mutation_reader_test',
+    'test/boost/flush_queue_test',
+    'test/boost/fragmented_temporary_buffer_test',
+    'test/boost/frozen_mutation_test',
+    'test/boost/gossip_test',
+    'test/boost/gossiping_property_file_snitch_test',
+    'test/boost/hash_test',
+    'test/boost/idl_test',
+    'test/boost/input_stream_test',
+    'test/boost/json_cql_query_test',
+    'test/boost/keys_test',
+    'test/boost/like_matcher_test',
+    'test/boost/limiting_data_source_test',
+    'test/boost/linearizing_input_stream_test',
+    'test/boost/loading_cache_test',
+    'test/boost/log_heap_test',
+    'test/boost/logalloc_test',
+    'test/boost/managed_vector_test',
+    'test/boost/map_difference_test',
+    'test/boost/memtable_test',
+    'test/boost/meta_test',
+    'test/boost/multishard_mutation_query_test',
+    'test/boost/murmur_hash_test',
+    'test/boost/mutation_fragment_test',
+    'test/boost/mutation_query_test',
+    'test/boost/mutation_reader_test',
+    'test/boost/mutation_test',
+    'test/boost/mutation_writer_test',
+    'test/boost/mvcc_test',
+    'test/boost/network_topology_strategy_test',
+    'test/boost/nonwrapping_range_test',
+    'test/boost/observable_test',
+    'test/boost/partitioner_test',
+    'test/boost/querier_cache_test',
+    'test/boost/query_processor_test',
+    'test/boost/range_test',
+    'test/boost/range_tombstone_list_test',
+    'test/boost/reusable_buffer_test',
+    'test/boost/role_manager_test',
+    'test/boost/row_cache_test',
+    'test/boost/schema_change_test',
+    'test/boost/schema_registry_test',
+    'test/boost/secondary_index_test',
+    'test/boost/serialization_test',
+    'test/boost/serialized_action_test',
+    'test/boost/small_vector_test',
+    'test/boost/snitch_reset_test',
+    'test/boost/sstable_3_x_test',
+    'test/boost/sstable_datafile_test',
+    'test/boost/sstable_mutation_test',
+    'test/boost/sstable_resharding_test',
+    'test/boost/sstable_test',
+    'test/boost/storage_proxy_test',
+    'test/boost/top_k_test',
+    'test/boost/transport_test',
+    'test/boost/truncation_migration_test',
+    'test/boost/types_test',
+    'test/boost/user_function_test',
+    'test/boost/user_types_test',
+    'test/boost/utf8_test',
+    'test/boost/view_build_test',
+    'test/boost/view_complex_test',
+    'test/boost/view_schema_test',
+    'test/boost/vint_serialization_test',
+    'test/boost/virtual_reader_test',
+    'test/manual/ec2_snitch_test',
+    'test/manual/gce_snitch_test',
+    'test/manual/gossip',
+    'test/manual/hint_test',
+    'test/manual/imr_test',
+    'test/manual/json_test',
+    'test/manual/message',
+    'test/manual/partition_data_test',
+    'test/manual/row_locker_test',
+    'test/manual/streaming_histogram_test',
+    'test/perf/perf_cache_eviction',
+    'test/perf/perf_cql_parser',
+    'test/perf/perf_fast_forward',
+    'test/perf/perf_hash',
+    'test/perf/perf_mutation',
+    'test/perf/perf_row_cache_update',
+    'test/perf/perf_simple_query',
+    'test/perf/perf_sstable',
+    'test/tools/cql_repl',
+    'test/unit/lsa_async_eviction_test',
+    'test/unit/lsa_sync_eviction_test',
+    'test/unit/memory_footprint_test',
+    'test/unit/row_cache_alloc_stress_test',
+    'test/unit/row_cache_stress_test',
 ]

 perf_tests = [
-    'tests/perf/perf_mutation_readers',
-    'tests/perf/perf_checksum',
-    'tests/perf/perf_mutation_fragment',
-    'tests/perf/perf_idl',
-    'tests/perf/perf_vint',
+    'test/perf/perf_mutation_readers',
+    'test/perf/perf_checksum',
+    'test/perf/perf_mutation_fragment',
+    'test/perf/perf_idl',
+    'test/perf/perf_vint',
 ]

 apps = [
@@ -432,8 +439,6 @@ arg_parser.add_argument('--dpdk-target', action='store', dest='dpdk_target', def
                        help='Path to DPDK SDK target location (e.g. <DPDK SDK dir>/x86_64-native-linuxapp-gcc)')
 arg_parser.add_argument('--debuginfo', action='store', dest='debuginfo', type=int, default=1,
                        help='Enable(1)/disable(0)compiler debug information generation')
-arg_parser.add_argument('--compress-exec-debuginfo', action='store', dest='compress_exec_debuginfo', type=int, default=1,
-                        help='Enable(1)/disable(0) debug information compression in executables')
 arg_parser.add_argument('--static-stdc++', dest='staticcxx', action='store_true',
                        help='Link libgcc and libstdc++ statically')
 arg_parser.add_argument('--static-thrift', dest='staticthrift', action='store_true',
@@ -456,6 +461,8 @@ arg_parser.add_argument('--enable-alloc-failure-injector', dest='alloc_failure_i
                        help='enable allocation failure injection')
 arg_parser.add_argument('--with-antlr3', dest='antlr3_exec', action='store', default=None,
                        help='path to antlr3 executable')
+arg_parser.add_argument('--with-ragel', dest='ragel_exec', action='store', default='ragel',
+        help='path to ragel executable')
 args = arg_parser.parse_args()

 defines = ['XXH_PRIVATE_API',
@@ -470,6 +477,7 @@ scylla_core = (['database.cc',
                'table.cc',
                'atomic_cell.cc',
                'collection_mutation.cc',
+                'connection_notifier.cc',
                'hashers.cc',
                'schema.cc',
                'frozen_schema.cc',
@@ -489,6 +497,7 @@ scylla_core = (['database.cc',
                'utils/buffer_input_stream.cc',
                'utils/limiting_data_source.cc',
                'utils/updateable_value.cc',
+                'utils/directories.cc',
                'mutation_partition.cc',
                'mutation_partition_view.cc',
                'mutation_partition_serializer.cc',
@@ -509,6 +518,7 @@ scylla_core = (['database.cc',
                'sstables/partition.cc',
                'sstables/compaction.cc',
                'sstables/compaction_strategy.cc',
+                'sstables/size_tiered_compaction_strategy.cc',
                'sstables/leveled_compaction_strategy.cc',
                'sstables/compaction_manager.cc',
                'sstables/integrity_checked_file_impl.cc',
@@ -519,6 +529,7 @@ scylla_core = (['database.cc',
                'transport/server.cc',
                'transport/messages/result_message.cc',
                'cdc/cdc.cc',
+                'cql3/type_json.cc',
                'cql3/abstract_marker.cc',
                'cql3/attributes.cc',
                'cql3/cf_name.cc',
@@ -530,7 +541,9 @@ scylla_core = (['database.cc',
                'cql3/sets.cc',
                'cql3/tuples.cc',
                'cql3/maps.cc',
+                'cql3/functions/user_function.cc',
                'cql3/functions/functions.cc',
+                'cql3/functions/aggregate_fcts.cc',
                'cql3/functions/castas_fcts.cc',
                'cql3/statements/cf_prop_defs.cc',
                'cql3/statements/cf_statement.cc',
@@ -539,13 +552,16 @@ scylla_core = (['database.cc',
                'cql3/statements/create_table_statement.cc',
                'cql3/statements/create_view_statement.cc',
                'cql3/statements/create_type_statement.cc',
+                'cql3/statements/create_function_statement.cc',
                'cql3/statements/drop_index_statement.cc',
                'cql3/statements/drop_keyspace_statement.cc',
                'cql3/statements/drop_table_statement.cc',
                'cql3/statements/drop_view_statement.cc',
                'cql3/statements/drop_type_statement.cc',
+                'cql3/statements/drop_function_statement.cc',
                'cql3/statements/schema_altering_statement.cc',
                'cql3/statements/ks_prop_defs.cc',
+                'cql3/statements/function_statement.cc',
                'cql3/statements/modification_statement.cc',
                'cql3/statements/cas_request.cc',
                'cql3/statements/parsed_statement.cc',
@@ -745,6 +761,7 @@ scylla_core = (['database.cc',
                'utils/ascii.cc',
                'utils/like_matcher.cc',
                'mutation_writer/timestamp_based_splitting_writer.cc',
+                'lua.cc',
                ] + [Antlr3Grammar('cql3/Cql.g')] + [Thrift('interface/cassandra.thrift', 'Cassandra')]
               )

@@ -797,6 +814,21 @@ alternator = [
       'alternator/auth.cc',
 ]

+redis = [
+        'redis/service.cc',
+        'redis/server.cc',
+        'redis/query_processor.cc',
+        'redis/protocol_parser.rl',
+        'redis/keyspace_utils.cc',
+        'redis/options.cc',
+        'redis/stats.cc',
+        'redis/mutation_utils.cc',
+        'redis/query_utils.cc',
+        'redis/abstract_command.cc',
+        'redis/command_factory.cc',
+        'redis/commands.cc',
+        ]
+
 idls = ['idl/gossip_digest.idl.hh',
        'idl/uuid.idl.hh',
        'idl/range.idl.hh',
@@ -828,72 +860,73 @@ idls = ['idl/gossip_digest.idl.hh',
 headers = find_headers('.', excluded_dirs=['idl', 'build', 'seastar', '.git'])

 scylla_tests_generic_dependencies = [
-    'tests/cql_test_env.cc',
-    'tests/test_services.cc',
+    'test/lib/cql_test_env.cc',
+    'test/lib/test_services.cc',
 ]

 scylla_tests_dependencies = scylla_core + idls + scylla_tests_generic_dependencies + [
-    'tests/cql_assertions.cc',
-    'tests/result_set_assertions.cc',
-    'tests/mutation_source_test.cc',
-    'tests/data_model.cc',
-    'tests/exception_utils.cc',
-    'tests/random_schema.cc',
+    'test/lib/cql_assertions.cc',
+    'test/lib/result_set_assertions.cc',
+    'test/lib/mutation_source_test.cc',
+    'test/lib/data_model.cc',
+    'test/lib/exception_utils.cc',
+    'test/lib/random_schema.cc',
 ]

 deps = {
-    'scylla': idls + ['main.cc', 'release.cc'] + scylla_core + api + alternator,
+    'scylla': idls + ['main.cc', 'release.cc', 'build_id.cc'] + scylla_core + api + alternator + redis,
 }

 pure_boost_tests = set([
-    'tests/map_difference_test',
-    'tests/keys_test',
-    'tests/compound_test',
-    'tests/range_tombstone_list_test',
-    'tests/anchorless_list_test',
-    'tests/nonwrapping_range_test',
-    'tests/test-serialization',
-    'tests/range_test',
-    'tests/crc_test',
-    'tests/checksum_utils_test',
-    'tests/dynamic_bitset_test',
-    'tests/idl_test',
-    'tests/cartesian_product_test',
-    'tests/streaming_histogram_test',
-    'tests/duration_test',
-    'tests/vint_serialization_test',
-    'tests/compress_test',
-    'tests/chunked_vector_test',
-    'tests/big_decimal_test',
-    'tests/caching_options_test',
-    'tests/auth_resource_test',
-    'tests/enum_set_test',
-    'tests/cql_auth_syntax_test',
-    'tests/meta_test',
-    'tests/observable_test',
-    'tests/json_test',
-    'tests/auth_passwords_test',
-    'tests/top_k_test',
-    'tests/small_vector_test',
-    'tests/like_matcher_test',
-    'tests/enum_option_test',
+    'test/boost/anchorless_list_test',
+    'test/boost/auth_passwords_test',
+    'test/boost/auth_resource_test',
+    'test/boost/big_decimal_test',
+    'test/boost/caching_options_test',
+    'test/boost/cartesian_product_test',
+    'test/boost/checksum_utils_test',
+    'test/boost/chunked_vector_test',
+    'test/boost/compound_test',
+    'test/boost/compress_test',
+    'test/boost/cql_auth_syntax_test',
+    'test/boost/crc_test',
+    'test/boost/duration_test',
+    'test/boost/dynamic_bitset_test',
+    'test/boost/enum_option_test',
+    'test/boost/enum_set_test',
+    'test/boost/idl_test',
+    'test/boost/keys_test',
+    'test/boost/like_matcher_test',
+    'test/boost/linearizing_input_stream_test',
+    'test/boost/map_difference_test',
+    'test/boost/meta_test',
+    'test/boost/nonwrapping_range_test',
+    'test/boost/observable_test',
+    'test/boost/range_test',
+    'test/boost/range_tombstone_list_test',
+    'test/boost/serialization_test',
+    'test/boost/small_vector_test',
+    'test/boost/top_k_test',
+    'test/boost/vint_serialization_test',
+    'test/manual/json_test',
+    'test/manual/streaming_histogram_test',
 ])

 tests_not_using_seastar_test_framework = set([
-    'tests/perf/perf_mutation',
-    'tests/lsa_async_eviction_test',
-    'tests/lsa_sync_eviction_test',
-    'tests/row_cache_alloc_stress',
-    'tests/perf_row_cache_update',
-    'tests/perf/perf_hash',
-    'tests/perf/perf_cql_parser',
-    'tests/message',
-    'tests/perf/perf_cache_eviction',
-    'tests/row_cache_stress_test',
-    'tests/memory_footprint',
-    'tests/gossip',
-    'tests/perf/perf_sstable',
-    'tests/small_vector_test',
+    'test/boost/small_vector_test',
+    'test/manual/gossip',
+    'test/manual/message',
+    'test/perf/perf_cache_eviction',
+    'test/perf/perf_cql_parser',
+    'test/perf/perf_hash',
+    'test/perf/perf_mutation',
+    'test/perf/perf_row_cache_update',
+    'test/perf/perf_sstable',
+    'test/unit/lsa_async_eviction_test',
+    'test/unit/lsa_sync_eviction_test',
+    'test/unit/memory_footprint_test',
+    'test/unit/row_cache_alloc_stress_test',
+    'test/unit/row_cache_stress_test',
 ]) | pure_boost_tests

 for t in tests_not_using_seastar_test_framework:
@@ -914,28 +947,29 @@ perf_tests_seastar_deps = [
 for t in perf_tests:
    deps[t] = [t + '.cc'] + scylla_tests_dependencies + perf_tests_seastar_deps

-deps['tests/sstable_test'] += ['tests/sstable_utils.cc', 'tests/normalizing_reader.cc']
-deps['tests/sstable_datafile_test'] += ['tests/sstable_utils.cc', 'tests/normalizing_reader.cc']
-deps['tests/mutation_reader_test'] += ['tests/sstable_utils.cc']
+deps['test/boost/sstable_test'] += ['test/lib/sstable_utils.cc', 'test/lib/normalizing_reader.cc']
+deps['test/boost/sstable_datafile_test'] += ['test/lib/sstable_utils.cc', 'test/lib/normalizing_reader.cc']
+deps['test/boost/mutation_reader_test'] += ['test/lib/sstable_utils.cc']

-deps['tests/bytes_ostream_test'] = ['tests/bytes_ostream_test.cc', 'utils/managed_bytes.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
-deps['tests/input_stream_test'] = ['tests/input_stream_test.cc']
-deps['tests/UUID_test'] = ['utils/UUID_gen.cc', 'tests/UUID_test.cc', 'utils/uuid.cc', 'utils/managed_bytes.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc', 'hashers.cc']
-deps['tests/murmur_hash_test'] = ['bytes.cc', 'utils/murmur_hash.cc', 'tests/murmur_hash_test.cc']
-deps['tests/allocation_strategy_test'] = ['tests/allocation_strategy_test.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
-deps['tests/log_heap_test'] = ['tests/log_heap_test.cc']
-deps['tests/anchorless_list_test'] = ['tests/anchorless_list_test.cc']
-deps['tests/perf/perf_fast_forward'] += ['release.cc']
-deps['tests/perf/perf_simple_query'] += ['release.cc']
-deps['tests/meta_test'] = ['tests/meta_test.cc']
-deps['tests/imr_test'] = ['tests/imr_test.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
-deps['tests/reusable_buffer_test'] = ['tests/reusable_buffer_test.cc']
-deps['tests/utf8_test'] = ['utils/utf8.cc', 'tests/utf8_test.cc']
-deps['tests/small_vector_test'] = ['tests/small_vector_test.cc']
-deps['tests/multishard_mutation_query_test'] += ['tests/test_table.cc']
-deps['tests/vint_serialization_test'] = ['tests/vint_serialization_test.cc', 'vint-serialization.cc', 'bytes.cc']
+deps['test/boost/bytes_ostream_test'] = ['test/boost/bytes_ostream_test.cc', 'utils/managed_bytes.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
+deps['test/boost/input_stream_test'] = ['test/boost/input_stream_test.cc']
+deps['test/boost/UUID_test'] = ['utils/UUID_gen.cc', 'test/boost/UUID_test.cc', 'utils/uuid.cc', 'utils/managed_bytes.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc', 'hashers.cc']
+deps['test/boost/murmur_hash_test'] = ['bytes.cc', 'utils/murmur_hash.cc', 'test/boost/murmur_hash_test.cc']
+deps['test/boost/allocation_strategy_test'] = ['test/boost/allocation_strategy_test.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
+deps['test/boost/log_heap_test'] = ['test/boost/log_heap_test.cc']
+deps['test/boost/anchorless_list_test'] = ['test/boost/anchorless_list_test.cc']
+deps['test/perf/perf_fast_forward'] += ['release.cc']
+deps['test/perf/perf_simple_query'] += ['release.cc']
+deps['test/boost/meta_test'] = ['test/boost/meta_test.cc']
+deps['test/manual/imr_test'] = ['test/manual/imr_test.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
+deps['test/boost/reusable_buffer_test'] = ['test/boost/reusable_buffer_test.cc']
+deps['test/boost/utf8_test'] = ['utils/utf8.cc', 'test/boost/utf8_test.cc']
+deps['test/boost/small_vector_test'] = ['test/boost/small_vector_test.cc']
+deps['test/boost/multishard_mutation_query_test'] += ['test/boost/test_table.cc']
+deps['test/boost/vint_serialization_test'] = ['test/boost/vint_serialization_test.cc', 'vint-serialization.cc', 'bytes.cc']
+deps['test/boost/linearizing_input_stream_test'] = ['test/boost/linearizing_input_stream_test.cc']

-deps['tests/duration_test'] += ['tests/exception_utils.cc']
+deps['test/boost/duration_test'] += ['test/lib/exception_utils.cc']

 deps['utils/gz/gen_crc_combine_table'] = ['utils/gz/gen_crc_combine_table.cc']

@@ -978,9 +1012,13 @@ modes['release']['cxx_ld_flags'] += ' ' + ' '.join(optimization_flags)

 gold_linker_flag = gold_supported(compiler=args.cxx)

-dbgflag = '-g' if args.debuginfo else ''
+dbgflag = '-g -gz' if args.debuginfo else ''
 tests_link_rule = 'link' if args.tests_debuginfo else 'link_stripped'

+# Strip if debuginfo is disabled, otherwise we end up with partial
+# debug info from the libraries we static link with
+regular_link_rule = 'link' if args.debuginfo else 'link_stripped'
+
 if args.so:
    args.pie = '-shared'
    args.fpie = '-fpic'
@@ -997,6 +1035,10 @@ else:
 optional_packages = [['libsystemd', 'libsystemd-daemon']]
 pkgs = []

+# Lua can be provided by lua53 package on Debian-like
+# systems and by Lua on others.
+pkgs.append('lua53' if have_pkg('lua53') else 'lua')
+

 def setup_first_pkg_of_list(pkglist):
    # The HAVE_pkg symbol is taken from the first alternative
@@ -1087,12 +1129,6 @@ scylla_release = file.read().strip()

 extra_cxxflags["release.cc"] = "-DSCYLLA_VERSION=\"\\\"" + scylla_version + "\\\"\" -DSCYLLA_RELEASE=\"\\\"" + scylla_release + "\\\"\""

-# We never compress debug info in debug mode
-modes['debug']['cxxflags'] += ' -gz'
-# We compress it by default in release mode
-flag_dest = 'cxx_ld_flags' if args.compress_exec_debuginfo else 'cxxflags'
-modes['release'][flag_dest] += ' -gz'
-
 for m in ['debug', 'release', 'sanitize']:
    modes[m]['cxxflags'] += ' ' + dbgflag

@@ -1233,6 +1269,11 @@ if args.antlr3_exec:
 else:
    antlr3_exec = "antlr3"

+if args.ragel_exec:
+    ragel_exec = args.ragel_exec
+else:
+    ragel_exec = "ragel"
+
 for mode in build_modes:
    configure_zstd(outdir, mode)

@@ -1249,6 +1290,7 @@ with open(buildfile_tmp, 'w') as f:
        cxx = {cxx}
        cxxflags = {user_cflags} {warnings} {defines}
        ldflags = {gold_linker_flag} {user_ldflags}
+        ldflags_build = {gold_linker_flag}
        libs = {libs}
        pool link_pool
            depth = {link_pool_depth}
@@ -1267,6 +1309,11 @@ with open(buildfile_tmp, 'w') as f:
            command = {ninja} -C $subdir $target
            restat = 1
            description = NINJA $out
+        rule ragel
+            # sed away a bug in ragel 7 that emits some extraneous _nfa* variables
+            # (the $$ is collapsed to a single one by ninja)
+            command = {ragel_exec} -G2 -o $out $in && sed -i -e '1h;2,$$H;$$!d;g' -re 's/static const char _nfa[^;]*;//g' $out
+            description = RAGEL $out
        rule run
            command = $in > $out
            description = GEN $out
@@ -1286,7 +1333,7 @@ with open(buildfile_tmp, 'w') as f:
            libs_{mode} = -l{fmt_lib}
            seastar_libs_{mode} = {seastar_libs}
            rule cxx.{mode}
-              command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags $cxxflags_{mode} $obj_cxxflags -c -o $out $in
+              command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags_{mode} $cxxflags $obj_cxxflags -c -o $out $in
              description = CXX $out
              depfile = $out.d
            rule link.{mode}
@@ -1297,6 +1344,10 @@ with open(buildfile_tmp, 'w') as f:
              command = $cxx  $ld_flags_{mode} -s $ldflags -o $out $in $libs $libs_{mode}
              description = LINK (stripped) $out
              pool = link_pool
+            rule link_build.{mode}
+              command = $cxx  $ld_flags_{mode} $ldflags_build -o $out $in $libs $libs_{mode}
+              description = LINK (build) $out
+              pool = link_pool
            rule ar.{mode}
              command = rm -f $out; ar cr $out $in; ranlib $out
              description = AR $out
@@ -1331,8 +1382,10 @@ with open(buildfile_tmp, 'w') as f:
        swaggers = {}
        serializers = {}
        thrifts = set()
+        ragels = {}
        antlr3_grammars = set()
        seastar_dep = 'build/{}/seastar/libseastar.a'.format(mode)
+        seastar_testing_dep = 'build/{}/seastar/libseastar_testing.a'.format(mode)
        for binary in build_artifacts:
            if binary in other:
                continue
@@ -1356,7 +1409,7 @@ with open(buildfile_tmp, 'w') as f:
                    'zstd/lib/libzstd.a',
                ]])
                objs.append('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.o')
-                if binary.startswith('tests/'):
+                if binary.startswith('test/'):
                    local_libs = '$seastar_libs_{} $libs'.format(mode)
                    if binary in pure_boost_tests:
                        local_libs += ' ' + maybe_static(args.staticboost, '-lboost_unit_test_framework')
@@ -1370,12 +1423,12 @@ with open(buildfile_tmp, 'w') as f:
                    # So we strip the tests by default; The user can very
                    # quickly re-link the test unstripped by adding a "_g"
                    # to the test name, e.g., "ninja build/release/testname_g"
-                    f.write('build $builddir/{}/{}: {}.{} {} | {}\n'.format(mode, binary, tests_link_rule, mode, str.join(' ', objs), seastar_dep))
+                    f.write('build $builddir/{}/{}: {}.{} {} | {} {}\n'.format(mode, binary, tests_link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
                    f.write('   libs = {}\n'.format(local_libs))
-                    f.write('build $builddir/{}/{}_g: link.{} {} | {}\n'.format(mode, binary, mode, str.join(' ', objs), seastar_dep))
+                    f.write('build $builddir/{}/{}_g: {}.{} {} | {} {}\n'.format(mode, binary, regular_link_rule, mode, str.join(' ', objs), seastar_dep, seastar_testing_dep))
                    f.write('   libs = {}\n'.format(local_libs))
                else:
-                    f.write('build $builddir/{}/{}: link.{} {} | {}\n'.format(mode, binary, mode, str.join(' ', objs), seastar_dep))
+                    f.write('build $builddir/{}/{}: {}.{} {} | {}\n'.format(mode, binary, regular_link_rule, mode, str.join(' ', objs), seastar_dep))
                    if has_thrift:
                        f.write('   libs =  {} {} $seastar_libs_{} $libs\n'.format(thrift_libs, maybe_static(args.staticboost, '-lboost_system'), mode))
            for src in srcs:
@@ -1388,6 +1441,9 @@ with open(buildfile_tmp, 'w') as f:
                elif src.endswith('.json'):
                    hh = '$builddir/' + mode + '/gen/' + src + '.hh'
                    swaggers[hh] = src
+                elif src.endswith('.rl'):
+                    hh = '$builddir/' + mode + '/gen/' + src.replace('.rl', '.hh')
+                    ragels[hh] = src
                elif src.endswith('.thrift'):
                    thrifts.add(src)
                elif src.endswith('.g'):
@@ -1398,7 +1454,7 @@ with open(buildfile_tmp, 'w') as f:
        compiles['$builddir/' + mode + '/utils/gz/gen_crc_combine_table.o'] = 'utils/gz/gen_crc_combine_table.cc'
        f.write('build {}: run {}\n'.format('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.cc',
                                            '$builddir/' + mode + '/utils/gz/gen_crc_combine_table'))
-        f.write('build {}: link.{} {}\n'.format('$builddir/' + mode + '/utils/gz/gen_crc_combine_table', mode,
+        f.write('build {}: link_build.{} {}\n'.format('$builddir/' + mode + '/utils/gz/gen_crc_combine_table', mode,
                                                '$builddir/' + mode + '/utils/gz/gen_crc_combine_table.o'))
        f.write('   libs = $seastar_libs_{}\n'.format(mode))
        f.write(
@@ -1416,6 +1472,7 @@ with open(buildfile_tmp, 'w') as f:
            gen_headers += g.headers('$builddir/{}/gen'.format(mode))
        gen_headers += list(swaggers.keys())
        gen_headers += list(serializers.keys())
+        gen_headers += list(ragels.keys())
        gen_headers_dep = ' '.join(gen_headers)

        for obj in compiles:
@@ -1429,6 +1486,9 @@ with open(buildfile_tmp, 'w') as f:
        for hh in serializers:
            src = serializers[hh]
            f.write('build {}: serializer {} | idl-compiler.py\n'.format(hh, src))
+        for hh in ragels:
+            src = ragels[hh]
+            f.write('build {}: ragel {}\n'.format(hh, src))
        for thrift in thrifts:
            outs = ' '.join(thrift.generated('$builddir/{}/gen'.format(mode)))
            f.write('build {}: thrift.{} {}\n'.format(outs, mode, thrift.source))
@@ -1442,9 +1502,12 @@ with open(buildfile_tmp, 'w') as f:
            for cc in grammar.sources('$builddir/{}/gen'.format(mode)):
                obj = cc.replace('.cpp', '.o')
                f.write('build {}: cxx.{} {} || {}\n'.format(obj, mode, cc, ' '.join(serializers)))
-                if cc.endswith('Parser.cpp') and has_sanitize_address_use_after_scope:
-                    # Parsers end up using huge amounts of stack space and overflowing their stack
-                    f.write('  obj_cxxflags = -fno-sanitize-address-use-after-scope\n')
+                if cc.endswith('Parser.cpp'):
+                    # Unoptimized parsers end up using huge amounts of stack space and overflowing their stack
+                    flags = '-O1'
+                    if has_sanitize_address_use_after_scope:
+                        flags += ' -fno-sanitize-address-use-after-scope'
+                    f.write('  obj_cxxflags = %s\n' % flags)
        for hh in headers:
            f.write('build $builddir/{mode}/{hh}.o: checkhh.{mode} {hh} || {gen_headers_dep}\n'.format(
                    mode=mode, hh=hh, gen_headers_dep=gen_headers_dep))
@@ -1453,7 +1516,12 @@ with open(buildfile_tmp, 'w') as f:
                .format(**locals()))
        f.write('  pool = submodule_pool\n')
        f.write('  subdir = build/{mode}/seastar\n'.format(**locals()))
-        f.write('  target = seastar seastar_testing\n'.format(**locals()))
+        f.write('  target = seastar\n'.format(**locals()))
+        f.write('build build/{mode}/seastar/libseastar_testing.a: ninja\n'
+                .format(**locals()))
+        f.write('  pool = submodule_pool\n')
+        f.write('  subdir = build/{mode}/seastar\n'.format(**locals()))
+        f.write('  target = seastar_testing\n'.format(**locals()))
        f.write('build build/{mode}/seastar/apps/iotune/iotune: ninja\n'
                .format(**locals()))
        f.write('  pool = submodule_pool\n')
@@ -1481,7 +1549,7 @@ with open(buildfile_tmp, 'w') as f:
        rule configure
          command = {python} configure.py $configure_args
          generator = 1
-        build build.ninja: configure | configure.py
+        build build.ninja: configure | configure.py SCYLLA-VERSION-GEN
        rule cscope
            command = find -name '*.[chS]' -o -name "*.cc" -o -name "*.hh" | cscope -bq -i-
            description = CSCOPE
@@ -1490,6 +1558,10 @@ with open(buildfile_tmp, 'w') as f:
            command = rm -rf build
            description = CLEAN
        build clean: clean
+        rule mode_list
+            command = echo {modes_list}
+            description = List configured modes
+        build mode_list: mode_list
        default {modes_list}
        ''').format(modes_list=' '.join(default_modes), **globals()))
    f.write(textwrap.dedent('''\
--- a/connection_notifier.cc
+++ b/connection_notifier.cc
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "connection_notifier.hh"
+#include "db/query_context.hh"
+#include "cql3/constants.hh"
+#include "database.hh"
+#include "service/storage_proxy.hh"
+
+#include <stdexcept>
+
+namespace db::system_keyspace {
+extern const char *const CLIENTS;
+}
+
+static sstring to_string(client_type ct) {
+    switch (ct) {
+        case client_type::cql: return "cql";
+        case client_type::thrift: return "thrift";
+        case client_type::alternator: return "alternator";
+        default: throw std::runtime_error("Invalid client_type");
+    }
+}
+
+future<> notify_new_client(client_data cd) {
+    // FIXME: consider prepared statement
+    const static sstring req
+            = format("INSERT INTO system.{} (address, port, client_type, shard_id, protocol_version, username) "
+                     "VALUES (?, ?, ?, ?, ?, ?);", db::system_keyspace::CLIENTS);
+    
+    return db::execute_cql(req,
+            std::move(cd.ip), cd.port, to_string(cd.ct), cd.shard_id,
+            cd.protocol_version.has_value() ? data_value(*cd.protocol_version) : data_value::make_null(int32_type),
+            cd.username.value_or("anonymous")).discard_result();
+}
+
+future<> notify_disconnected_client(gms::inet_address addr, client_type ct, int port) {
+    // FIXME: consider prepared statement
+    const static sstring req
+            = format("DELETE FROM system.{} where address=? AND port=? AND client_type=?;",
+                     db::system_keyspace::CLIENTS);
+    return db::execute_cql(req, addr.addr(), port, to_string(ct)).discard_result();
+}
+
+future<> clear_clientlist() {
+    auto& db_local = service::get_storage_proxy().local().get_db().local();
+    return db_local.truncate(
+            db_local.find_keyspace(db::system_keyspace_name()),
+            db_local.find_column_family(db::system_keyspace_name(),
+                    db::system_keyspace::CLIENTS),
+            [] { return make_ready_future<db_clock::time_point>(db_clock::now()); },
+            false /* with_snapshot */);
+}
--- a/connection_notifier.hh
+++ b/connection_notifier.hh
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#pragma once
+
+#include "gms/inet_address.hh"
+#include <seastar/core/sstring.hh>
+#include <optional>
+
+enum class client_type {
+    cql = 0,
+    thrift,
+    alternator,
+};
+
+// Representation of a row in `system.clients'. std::optionals are for nullable cells.
+struct client_data {
+    gms::inet_address ip;
+    int32_t port;
+    client_type ct;
+    int32_t shard_id;  /// ID of server-side shard which is processing the connection.
+
+    // `optional' column means that it's nullable (possibly because it's
+    // unimplemented yet). If you want to fill ("implement") any of them,
+    // remember to update the query in `notify_new_client()'.
+    std::optional<sstring> connection_stage;
+    std::optional<sstring> driver_name;
+    std::optional<sstring> driver_version;
+    std::optional<sstring> hostname;
+    std::optional<int32_t> protocol_version;
+    std::optional<sstring> ssl_cipher_suite;
+    std::optional<bool> ssl_enabled;
+    std::optional<sstring> ssl_protocol;
+    std::optional<sstring> username;
+};
+
+future<> notify_new_client(client_data cd);
+future<> notify_disconnected_client(gms::inet_address addr, client_type ct, int port);
+
+future<> clear_clientlist();
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -43,12 +43,14 @@ options {
 #include "cql3/statements/create_table_statement.hh"
 #include "cql3/statements/create_view_statement.hh"
 #include "cql3/statements/create_type_statement.hh"
+#include "cql3/statements/create_function_statement.hh"
 #include "cql3/statements/drop_type_statement.hh"
 #include "cql3/statements/alter_type_statement.hh"
 #include "cql3/statements/property_definitions.hh"
 #include "cql3/statements/drop_index_statement.hh"
 #include "cql3/statements/drop_table_statement.hh"
 #include "cql3/statements/drop_view_statement.hh"
+#include "cql3/statements/drop_function_statement.hh"
 #include "cql3/statements/truncate_statement.hh"
 #include "cql3/statements/raw/update_statement.hh"
 #include "cql3/statements/raw/insert_statement.hh"
@@ -243,10 +245,14 @@ struct uninitialized {
        return res;
    }

-    bool convert_boolean_literal(std::string_view s) {
-        std::string lower_s(s.size(), '\0');
+    sstring to_lower(std::string_view s) {
+        sstring lower_s(s.size(), '\0');
        std::transform(s.cbegin(), s.cend(), lower_s.begin(), &::tolower);
-        return lower_s == "true";
+        return lower_s;
+    }
+
+    bool convert_boolean_literal(std::string_view s) {
+        return to_lower(s) == "true";
    }

    void add_raw_update(std::vector<std::pair<::shared_ptr<cql3::column_identifier::raw>,::shared_ptr<cql3::operation::raw_update>>>& operations,
@@ -348,9 +354,9 @@ cqlStatement returns [shared_ptr<raw::parsed_statement> stmt]
    | st25=createTypeStatement         { $stmt = st25; }
    | st26=alterTypeStatement          { $stmt = st26; }
    | st27=dropTypeStatement           { $stmt = st27; }
-#if 0
    | st28=createFunctionStatement     { $stmt = st28; }
    | st29=dropFunctionStatement       { $stmt = st29; }
+#if 0
    | st30=createAggregateStatement    { $stmt = st30; }
    | st31=dropAggregateStatement      { $stmt = st31; }
 #endif
@@ -686,54 +692,56 @@ dropAggregateStatement returns [DropAggregateStatement expr]
      )?
      { $expr = new DropAggregateStatement(fn, argsTypes, argsPresent, ifExists); }
    ;
+#endif

-createFunctionStatement returns [CreateFunctionStatement expr]
+createFunctionStatement returns [shared_ptr<cql3::statements::create_function_statement> expr]
    @init {
-        boolean orReplace = false;
-        boolean ifNotExists = false;
+        bool or_replace = false;
+        bool if_not_exists = false;

-        boolean deterministic = true;
-        List<ColumnIdentifier> argsNames = new ArrayList<>();
-        List<CQL3Type.Raw> argsTypes = new ArrayList<>();
+        std::vector<shared_ptr<cql3::column_identifier>> arg_names;
+        std::vector<shared_ptr<cql3_type::raw>> arg_types;
+        bool called_on_null_input = false;
    }
-    : K_CREATE (K_OR K_REPLACE { orReplace = true; })?
-      ((K_NON { deterministic = false; })? K_DETERMINISTIC)?
-      K_FUNCTION
-      (K_IF K_NOT K_EXISTS { ifNotExists = true; })?
+    : K_CREATE
+        // "OR REPLACE" and "IF NOT EXISTS" cannot be used together
+        ((K_OR K_REPLACE { or_replace = true; } K_FUNCTION)
+         | (K_FUNCTION K_IF K_NOT K_EXISTS { if_not_exists = true; })
+         | K_FUNCTION)
      fn=functionName
      '('
        (
-          k=ident v=comparatorType { argsNames.add(k); argsTypes.add(v); }
-          ( ',' k=ident v=comparatorType { argsNames.add(k); argsTypes.add(v); } )*
+          k=ident v=comparatorType { arg_names.push_back(k); arg_types.push_back(v); }
+          ( ',' k=ident v=comparatorType { arg_names.push_back(k); arg_types.push_back(v); } )*
        )?
      ')'
+      ( (K_RETURNS K_NULL) | (K_CALLED { called_on_null_input = true; })) K_ON K_NULL K_INPUT
      K_RETURNS rt = comparatorType
      K_LANGUAGE language = IDENT
      K_AS body = STRING_LITERAL
-      { $expr = new CreateFunctionStatement(fn, $language.text.toLowerCase(), $body.text, deterministic, argsNames, argsTypes, rt, orReplace, ifNotExists); }
+      { $expr = ::make_shared<cql3::statements::create_function_statement>(std::move(fn), to_lower($language.text), $body.text, std::move(arg_names), std::move(arg_types), std::move(rt), called_on_null_input, or_replace, if_not_exists); }
    ;

-dropFunctionStatement returns [DropFunctionStatement expr]
+dropFunctionStatement returns [shared_ptr<cql3::statements::drop_function_statement> expr]
    @init {
-        boolean ifExists = false;
-        List<CQL3Type.Raw> argsTypes = new ArrayList<>();
-        boolean argsPresent = false;
+        bool if_exists = false;
+        std::vector<shared_ptr<cql3_type::raw>> arg_types;
+        bool args_present = false;
    }
    : K_DROP K_FUNCTION
-      (K_IF K_EXISTS { ifExists = true; } )?
+      (K_IF K_EXISTS { if_exists = true; } )?
      fn=functionName
      (
        '('
          (
-            v=comparatorType { argsTypes.add(v); }
-            ( ',' v=comparatorType { argsTypes.add(v); } )*
+            v=comparatorType { arg_types.push_back(v); }
+            ( ',' v=comparatorType { arg_types.push_back(v); } )*
          )?
        ')'
-        { argsPresent = true; }
+        { args_present = true; }
      )?
-      { $expr = new DropFunctionStatement(fn, argsTypes, argsPresent, ifExists); }
+      { $expr = ::make_shared<cql3::statements::drop_function_statement>(std::move(fn), std::move(arg_types), args_present, if_exists); }
    ;
-#endif

 /**
 * CREATE KEYSPACE [IF NOT EXISTS] <KEYSPACE> WITH attr1 = value1 AND attr2 = value2;
@@ -1743,8 +1751,8 @@ basic_unreserved_keyword returns [sstring str]
        | K_INITCOND
        | K_RETURNS
        | K_LANGUAGE
-        | K_NON
-        | K_DETERMINISTIC
+        | K_CALLED
+        | K_INPUT
        | K_JSON
        | K_CACHE
        | K_BYPASS
@@ -1883,11 +1891,11 @@ K_STYPE:       S T Y P E;
 K_FINALFUNC:   F I N A L F U N C;
 K_INITCOND:    I N I T C O N D;
 K_RETURNS:     R E T U R N S;
+K_CALLED:      C A L L E D;
+K_INPUT:       I N P U T;
 K_LANGUAGE:    L A N G U A G E;
-K_NON:         N O N;
 K_OR:          O R;
 K_REPLACE:     R E P L A C E;
-K_DETERMINISTIC: D E T E R M I N I S T I C;
 K_JSON:        J S O N;
 K_DEFAULT:     D E F A U L T;
 K_UNSET:       U N S E T;
--- a/cql3/abstract_marker.cc
+++ b/cql3/abstract_marker.cc
@@ -55,7 +55,7 @@ abstract_marker::abstract_marker(int32_t bind_index, ::shared_ptr<column_specifi
    , _receiver{std::move(receiver)}
 { }

-void abstract_marker::collect_marker_specification(::shared_ptr<variable_specifications> bound_names) {
+void abstract_marker::collect_marker_specification(lw_shared_ptr<variable_specifications> bound_names) {
    bound_names->add(_bind_index, _receiver);
 }

--- a/cql3/abstract_marker.hh
+++ b/cql3/abstract_marker.hh
@@ -57,7 +57,7 @@ protected:
 public:
    abstract_marker(int32_t bind_index, ::shared_ptr<column_specification>&& receiver);

-    virtual void collect_marker_specification(::shared_ptr<variable_specifications> bound_names) override;
+    virtual void collect_marker_specification(lw_shared_ptr<variable_specifications> bound_names) override;

    virtual bool contains_bind_marker() const override;

--- a/cql3/attributes.cc
+++ b/cql3/attributes.cc
@@ -120,7 +120,7 @@ int32_t attributes::get_time_to_live(const query_options& options) {
    return ttl;
 }

-void attributes::collect_marker_specification(::shared_ptr<variable_specifications> bound_names) {
+void attributes::collect_marker_specification(lw_shared_ptr<variable_specifications> bound_names) {
    if (_timestamp) {
        _timestamp->collect_marker_specification(bound_names);
    }
--- a/cql3/attributes.hh
+++ b/cql3/attributes.hh
@@ -69,7 +69,7 @@ public:

    int32_t get_time_to_live(const query_options& options);

-    void collect_marker_specification(::shared_ptr<variable_specifications> bound_names);
+    void collect_marker_specification(lw_shared_ptr<variable_specifications> bound_names);

    class raw {
    public:
--- a/cql3/column_condition.cc
+++ b/cql3/column_condition.cc
@@ -114,7 +114,7 @@ uint32_t read_and_check_list_index(const cql3::raw_value_view& key) {
 namespace cql3 {

 bool
-column_condition::uses_function(const sstring& ks_name, const sstring& function_name) {
+column_condition::uses_function(const sstring& ks_name, const sstring& function_name) const {
    if (bool(_collection_element) && _collection_element->uses_function(ks_name, function_name)) {
        return true;
    }
@@ -131,7 +131,7 @@ column_condition::uses_function(const sstring& ks_name, const sstring& function_
    return false;
 }

-void column_condition::collect_marker_specificaton(::shared_ptr<variable_specifications> bound_names) {
+void column_condition::collect_marker_specificaton(lw_shared_ptr<variable_specifications> bound_names) {
    if (_collection_element) {
        _collection_element->collect_marker_specification(bound_names);
    }
--- a/cql3/column_condition.hh
+++ b/cql3/column_condition.hh
@@ -85,9 +85,9 @@ public:
     * @param boundNames the list of column specification where to collect the
     * bind variables of this term in.
     */
-    void collect_marker_specificaton(::shared_ptr<variable_specifications> bound_names);
+    void collect_marker_specificaton(lw_shared_ptr<variable_specifications> bound_names);

-    bool uses_function(const sstring& ks_name, const sstring& function_name);
+    bool uses_function(const sstring& ks_name, const sstring& function_name) const;

    // Retrieve parameter marker values, if any, find the appropriate collection
    // element if the cell is a collection and an element access is used in the expression,
--- a/cql3/cql3_type.cc
+++ b/cql3/cql3_type.cc
@@ -31,13 +31,48 @@
 #include "types/map.hh"
 #include "types/set.hh"
 #include "types/list.hh"
+#include "concrete_types.hh"

 namespace cql3 {

+static cql3_type::kind get_cql3_kind(const abstract_type& t) {
+    struct visitor {
+        cql3_type::kind operator()(const ascii_type_impl&) { return cql3_type::kind::ASCII; }
+        cql3_type::kind operator()(const byte_type_impl&) { return cql3_type::kind::TINYINT; }
+        cql3_type::kind operator()(const bytes_type_impl&) { return cql3_type::kind::BLOB; }
+        cql3_type::kind operator()(const boolean_type_impl&) { return cql3_type::kind::BOOLEAN; }
+        cql3_type::kind operator()(const counter_type_impl&) { return cql3_type::kind::COUNTER; }
+        cql3_type::kind operator()(const decimal_type_impl&) { return cql3_type::kind::DECIMAL; }
+        cql3_type::kind operator()(const double_type_impl&) { return cql3_type::kind::DOUBLE; }
+        cql3_type::kind operator()(const duration_type_impl&) { return cql3_type::kind::DURATION; }
+        cql3_type::kind operator()(const empty_type_impl&) { return cql3_type::kind::EMPTY; }
+        cql3_type::kind operator()(const float_type_impl&) { return cql3_type::kind::FLOAT; }
+        cql3_type::kind operator()(const inet_addr_type_impl&) { return cql3_type::kind::INET; }
+        cql3_type::kind operator()(const int32_type_impl&) { return cql3_type::kind::INT; }
+        cql3_type::kind operator()(const long_type_impl&) { return cql3_type::kind::BIGINT; }
+        cql3_type::kind operator()(const short_type_impl&) { return cql3_type::kind::SMALLINT; }
+        cql3_type::kind operator()(const simple_date_type_impl&) { return cql3_type::kind::DATE; }
+        cql3_type::kind operator()(const utf8_type_impl&) { return cql3_type::kind::TEXT; }
+        cql3_type::kind operator()(const time_type_impl&) { return cql3_type::kind::TIME; }
+        cql3_type::kind operator()(const timestamp_date_base_class&) { return cql3_type::kind::TIMESTAMP; }
+        cql3_type::kind operator()(const timeuuid_type_impl&) { return cql3_type::kind::TIMEUUID; }
+        cql3_type::kind operator()(const uuid_type_impl&) { return cql3_type::kind::UUID; }
+        cql3_type::kind operator()(const varint_type_impl&) { return cql3_type::kind::VARINT; }
+        cql3_type::kind operator()(const reversed_type_impl& r) { return get_cql3_kind(*r.underlying_type()); }
+        cql3_type::kind operator()(const tuple_type_impl&) { assert(0 && "no kind for this type"); }
+        cql3_type::kind operator()(const collection_type_impl&) { assert(0 && "no kind for this type"); }
+    };
+    return visit(t, visitor{});
+}
+
+cql3_type::kind_enum_set::prepared cql3_type::get_kind() const {
+    return kind_enum_set::prepare(get_cql3_kind(*_type));
+}
+
 cql3_type cql3_type::raw::prepare(database& db, const sstring& keyspace) {
    try {
        auto&& ks = db.find_keyspace(keyspace);
-        return prepare_internal(keyspace, *ks.metadata()->user_types());
+        return prepare_internal(keyspace, ks.metadata()->user_types());
    } catch (no_such_keyspace& nsk) {
        throw exceptions::invalid_request_exception("Unknown keyspace " + keyspace);
    }
@@ -66,7 +101,7 @@ public:
    virtual cql3_type prepare(database& db, const sstring& keyspace) {
        return _type;
    }
-    cql3_type prepare_internal(const sstring&, user_types_metadata&) override {
+    cql3_type prepare_internal(const sstring&, const user_types_metadata&) override {
        return _type;
    }

@@ -123,7 +158,7 @@ public:
        return true;
    }

-    virtual cql3_type prepare_internal(const sstring& keyspace, user_types_metadata& user_types) override {
+    virtual cql3_type prepare_internal(const sstring& keyspace, const user_types_metadata& user_types) override {
        assert(_values); // "Got null values type for a collection";

        if (!is_frozen() && _values->supports_freezing() && !_values->is_frozen()) {
@@ -190,7 +225,7 @@ public:
        _frozen = true;
    }

-    virtual cql3_type prepare_internal(const sstring& keyspace, user_types_metadata& user_types) override {
+    virtual cql3_type prepare_internal(const sstring& keyspace, const user_types_metadata& user_types) override {
        if (_name.has_keyspace()) {
            // The provided keyspace is the one of the current statement this is part of. If it's different from the keyspace of
            // the UTName, we reject since we want to limit user types to their own keyspace (see #6643)
@@ -249,7 +284,7 @@ public:
        }
        _frozen = true;
    }
-    virtual cql3_type prepare_internal(const sstring& keyspace, user_types_metadata& user_types) override {
+    virtual cql3_type prepare_internal(const sstring& keyspace, const user_types_metadata& user_types) override {
        if (!is_frozen()) {
            freeze();
        }
@@ -395,14 +430,42 @@ operator<<(std::ostream& os, const cql3_type::raw& r) {
 namespace util {

 sstring maybe_quote(const sstring& identifier) {
-    static const std::regex unquoted_identifier_re("[a-z][a-z0-9_]*");
-    if (std::regex_match(identifier.begin(), identifier.end(), unquoted_identifier_re)) {
+    const auto* p = identifier.begin();
+    const auto* ep = identifier.end();
+
+    // quote empty string
+    if (__builtin_expect(p == ep, false)) {
+        return "\"\"";
+    }
+
+    // string needs no quoting if it matches [a-z][a-z0-9_]*
+    // quotes ('"') in the string are doubled
+    bool need_quotes;
+    bool has_quotes;
+    auto c = *p;
+    if ('a' <= c && c <= 'z') {
+        need_quotes = false;
+        has_quotes = false;
+    } else {
+        need_quotes = true;
+        has_quotes = (c == '"');
+    }
+    while ((++p != ep) && !has_quotes) {
+        c = *p;
+        if (!(('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || (c == '_'))) {
+            need_quotes = true;
+            has_quotes = (c == '"');
+        }
+    }
+
+    if (!need_quotes) {
        return identifier;
    }
+    if (!has_quotes) {
+        return make_sstring("\"", identifier, "\"");
+    }
    static const std::regex double_quote_re("\"");
-    std::string result = identifier;
-    std::regex_replace(result, double_quote_re, "\"\"");
-    return '"' + result + '"';
+    return '"' + std::regex_replace(identifier.c_str(), double_quote_re, "\"\"") + '"';
 }

 }
--- a/cql3/cql3_type.hh
+++ b/cql3/cql3_type.hh
@@ -81,7 +81,7 @@ public:
        virtual bool references_user_type(const sstring&) const;
        virtual std::optional<sstring> keyspace() const;
        virtual void freeze();
-        virtual cql3_type prepare_internal(const sstring& keyspace, user_types_metadata&) = 0;
+        virtual cql3_type prepare_internal(const sstring& keyspace, const user_types_metadata&) = 0;
        virtual cql3_type prepare(database& db, const sstring& keyspace);
        static shared_ptr<raw> from(cql3_type type);
        static shared_ptr<raw> user_type(ut_name name);
@@ -103,6 +103,33 @@ private:
    }

 public:
+    enum class kind : int8_t {
+        ASCII, BIGINT, BLOB, BOOLEAN, COUNTER, DECIMAL, DOUBLE, EMPTY, FLOAT, INT, SMALLINT, TINYINT, INET, TEXT, TIMESTAMP, UUID, VARINT, TIMEUUID, DATE, TIME, DURATION
+    };
+    using kind_enum = super_enum<kind,
+        kind::ASCII,
+        kind::BIGINT,
+        kind::BLOB,
+        kind::BOOLEAN,
+        kind::COUNTER,
+        kind::DECIMAL,
+        kind::DOUBLE,
+        kind::EMPTY,
+        kind::FLOAT,
+        kind::INET,
+        kind::INT,
+        kind::SMALLINT,
+        kind::TINYINT,
+        kind::TEXT,
+        kind::TIMESTAMP,
+        kind::UUID,
+        kind::VARINT,
+        kind::TIMEUUID,
+        kind::DATE,
+        kind::TIME,
+        kind::DURATION>;
+    using kind_enum_set = enum_set<kind_enum>;
+
    static thread_local cql3_type ascii;
    static thread_local cql3_type bigint;
    static thread_local cql3_type blob;
@@ -127,9 +154,7 @@ public:

    static const std::vector<cql3_type>& values();
 public:
-    using kind = abstract_type::cql3_kind;
-    using kind_enum_set = abstract_type::cql3_kind_enum_set;
-    kind_enum_set::prepared get_kind() const { return _type->get_cql3_kind(); }
+    kind_enum_set::prepared get_kind() const;
 };

 inline bool operator==(const cql3_type& a, const cql3_type& b) {
--- a/cql3/cql_statement.hh
+++ b/cql3/cql_statement.hh
@@ -72,14 +72,14 @@ public:

    timeout_config_selector get_timeout_config_selector() const { return _timeout_config_selector; }

-    virtual uint32_t get_bound_terms() = 0;
+    virtual uint32_t get_bound_terms() const = 0;

    /**
     * Perform any access verification necessary for the statement.
     *
     * @param state the current client state
     */
-    virtual future<> check_access(const service::client_state& state) = 0;
+    virtual future<> check_access(const service::client_state& state) const = 0;

    /**
     * Perform additional validation required by the statment.
@@ -87,7 +87,7 @@ public:
     *
     * @param state the current client state
     */
-    virtual void validate(service::storage_proxy& proxy, const service::client_state& state) = 0;
+    virtual void validate(service::storage_proxy& proxy, const service::client_state& state) const = 0;

    /**
     * Execute the statement and return the resulting result or null if there is no result.
@@ -96,7 +96,7 @@ public:
     * @param options options for this query (consistency, variables, pageSize, ...)
     */
    virtual future<::shared_ptr<cql_transport::messages::result_message>>
-        execute(service::storage_proxy& proxy, service::query_state& state, const query_options& options) = 0;
+        execute(service::storage_proxy& proxy, service::query_state& state, const query_options& options) const = 0;

    virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const = 0;

--- a/cql3/error_collector.hh
+++ b/cql3/error_collector.hh
@@ -55,12 +55,12 @@ class error_collector : public error_listener<RecognizerType, ExceptionBaseType>
    /**
     * The offset of the first token of the snippet.
     */
-    static const int32_t FIRST_TOKEN_OFFSET = 10;
+    static constexpr int32_t FIRST_TOKEN_OFFSET = 10;

    /**
     * The offset of the last token of the snippet.
     */
-    static const int32_t LAST_TOKEN_OFFSET = 2;
+    static constexpr int32_t LAST_TOKEN_OFFSET = 2;

    /**
     * The CQL query.
--- a/cql3/functions/abstract_function.hh
+++ b/cql3/functions/abstract_function.hh
@@ -48,6 +48,10 @@
 #include <iosfwd>
 #include <boost/functional/hash.hpp>

+namespace std {
+    std::ostream& operator<<(std::ostream& os, const std::vector<data_type>& arg_types);
+}
+
 namespace cql3 {

 namespace functions {
@@ -66,6 +70,9 @@ protected:
    }

 public:
+
+    virtual bool requires_thread() const;
+
    virtual const function_name& name() const override {
        return _name;
    }
@@ -84,15 +91,15 @@ public:
            && _return_type == x._return_type;
    }

-    virtual bool uses_function(const sstring& ks_name, const sstring& function_name) override {
+    virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override {
        return _name.keyspace == ks_name && _name.name == function_name;
    }

-    virtual bool has_reference_to(function& f) override {
+    virtual bool has_reference_to(function& f) const override {
        return false;
    }

-    virtual sstring column_name(const std::vector<sstring>& column_names) override {
+    virtual sstring column_name(const std::vector<sstring>& column_names) const override {
        return format("{}({})", _name, join(", ", column_names));
    }

@@ -103,12 +110,7 @@ inline
 void
 abstract_function::print(std::ostream& os) const {
    os << _name << " : (";
-    for (size_t i = 0; i < _arg_types.size(); ++i) {
-        if (i > 0) {
-            os << ", ";
-        }
-        os << _arg_types[i]->as_cql3_type().to_string();
-    }
+    os << _arg_types;
    os << ") -> " << _return_type->as_cql3_type().to_string();
 }

--- a/cql3/functions/aggregate_fcts.cc
+++ b/cql3/functions/aggregate_fcts.cc
@@ -0,0 +1,612 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (C) 2019 ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "utils/big_decimal.hh"
+#include "aggregate_fcts.hh"
+#include "functions.hh"
+#include "native_aggregate_function.hh"
+#include "exceptions/exceptions.hh"
+
+using namespace cql3;
+using namespace functions;
+using namespace aggregate_fcts;
+
+namespace {
+class impl_count_function : public aggregate_function::aggregate {
+    int64_t _count;
+public:
+    virtual void reset() override {
+        _count = 0;
+    }
+    virtual opt_bytes compute(cql_serialization_format sf) override {
+        return long_type->decompose(_count);
+    }
+    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
+        ++_count;
+    }
+};
+
+class count_rows_function final : public native_aggregate_function {
+public:
+    count_rows_function() : native_aggregate_function(COUNT_ROWS_FUNCTION_NAME, long_type, {}) {}
+    virtual std::unique_ptr<aggregate> new_aggregate() override {
+        return std::make_unique<impl_count_function>();
+    }
+    virtual sstring column_name(const std::vector<sstring>& column_names) const override {
+        return "count";
+    }
+};
+
+// We need a wider accumulator for sum and average,
+// since summing the inputs can overflow the input type
+template <typename T>
+struct accumulator_for;
+
+template <typename NarrowType, typename AccType>
+static NarrowType checking_narrow(AccType acc) {
+    NarrowType ret = static_cast<NarrowType>(acc);
+    if (static_cast<AccType>(ret) != acc) {
+        throw exceptions::overflow_error_exception("Sum overflow. Values should be casted to a wider type.");
+    }
+    return ret;
+}
+
+template <>
+struct accumulator_for<int8_t> {
+    using type = __int128;
+
+    static int8_t narrow(type acc) {
+        return checking_narrow<int8_t>(acc);
+    }
+};
+
+template <>
+struct accumulator_for<int16_t> {
+    using type = __int128;
+
+    static int16_t narrow(type acc) {
+        return checking_narrow<int16_t>(acc);
+    }
+};
+
+template <>
+struct accumulator_for<int32_t> {
+    using type = __int128;
+
+    static int32_t narrow(type acc) {
+        return checking_narrow<int32_t>(acc);
+    }
+};
+
+template <>
+struct accumulator_for<int64_t> {
+    using type = __int128;
+
+    static int64_t narrow(type acc) {
+        return checking_narrow<int64_t>(acc);
+    }
+};
+
+template <>
+struct accumulator_for<float> {
+    using type = float;
+
+    static auto narrow(type acc) {
+        return acc;
+    }
+};
+
+template <>
+struct accumulator_for<double> {
+    using type = double;
+
+    static auto narrow(type acc) {
+        return acc;
+    }
+};
+
+template <>
+struct accumulator_for<boost::multiprecision::cpp_int> {
+    using type = boost::multiprecision::cpp_int;
+
+    static auto narrow(type acc) {
+        return acc;
+    }
+};
+
+template <>
+struct accumulator_for<big_decimal> {
+    using type = big_decimal;
+
+    static auto narrow(type acc) {
+        return acc;
+    }
+};
+
+template <typename Type>
+class impl_sum_function_for final : public aggregate_function::aggregate {
+    using accumulator_type = typename accumulator_for<Type>::type;
+    accumulator_type _sum{};
+public:
+    virtual void reset() override {
+        _sum = {};
+    }
+    virtual opt_bytes compute(cql_serialization_format sf) override {
+        return data_type_for<Type>()->decompose(accumulator_for<Type>::narrow(_sum));
+    }
+    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
+        if (!values[0]) {
+            return;
+        }
+        _sum += value_cast<Type>(data_type_for<Type>()->deserialize(*values[0]));
+    }
+};
+
+template <typename Type>
+class sum_function_for final : public native_aggregate_function {
+public:
+    sum_function_for() : native_aggregate_function("sum", data_type_for<Type>(), { data_type_for<Type>() }) {}
+    virtual std::unique_ptr<aggregate> new_aggregate() override {
+        return std::make_unique<impl_sum_function_for<Type>>();
+    }
+};
+
+
+template <typename Type>
+static
+shared_ptr<aggregate_function>
+make_sum_function() {
+    return make_shared<sum_function_for<Type>>();
+}
+
+template <typename Type>
+class impl_div_for_avg {
+public:
+    static Type div(const typename accumulator_for<Type>::type& x, const int64_t y) {
+        return x/y;
+    }
+};
+
+template <>
+class impl_div_for_avg<big_decimal> {
+public:
+    static big_decimal div(const big_decimal& x, const int64_t y) {
+        return x.div(y, big_decimal::rounding_mode::HALF_EVEN);
+    }
+};
+
+template <typename Type>
+class impl_avg_function_for final : public aggregate_function::aggregate {
+   typename accumulator_for<Type>::type _sum{};
+   int64_t _count = 0;
+public:
+    virtual void reset() override {
+        _sum = {};
+        _count = 0;
+    }
+    virtual opt_bytes compute(cql_serialization_format sf) override {
+        Type ret{};
+        if (_count) {
+            ret = impl_div_for_avg<Type>::div(_sum, _count);
+        }
+        return data_type_for<Type>()->decompose(ret);
+    }
+    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
+        if (!values[0]) {
+            return;
+        }
+        ++_count;
+        _sum += value_cast<Type>(data_type_for<Type>()->deserialize(*values[0]));
+    }
+};
+
+template <typename Type>
+class avg_function_for final : public native_aggregate_function {
+public:
+    avg_function_for() : native_aggregate_function("avg", data_type_for<Type>(), { data_type_for<Type>() }) {}
+    virtual std::unique_ptr<aggregate> new_aggregate() override {
+        return std::make_unique<impl_avg_function_for<Type>>();
+    }
+};
+
+template <typename Type>
+static
+shared_ptr<aggregate_function>
+make_avg_function() {
+    return make_shared<avg_function_for<Type>>();
+}
+
+template <typename T>
+struct aggregate_type_for {
+    using type = T;
+};
+
+template<>
+struct aggregate_type_for<ascii_native_type> {
+    using type = ascii_native_type::primary_type;
+};
+
+template<>
+struct aggregate_type_for<simple_date_native_type> {
+    using type = simple_date_native_type::primary_type;
+};
+
+template<>
+struct aggregate_type_for<timeuuid_native_type> {
+    using type = timeuuid_native_type::primary_type;
+};
+
+template<>
+struct aggregate_type_for<time_native_type> {
+    using type = time_native_type::primary_type;
+};
+
+template <typename Type>
+const Type& max_wrapper(const Type& t1, const Type& t2) {
+    using std::max;
+    return max(t1, t2);
+}
+
+inline const net::inet_address& max_wrapper(const net::inet_address& t1, const net::inet_address& t2) {
+    using family = seastar::net::inet_address::family;
+    const size_t len =
+            (t1.in_family() == family::INET || t2.in_family() == family::INET)
+            ? sizeof(::in_addr) : sizeof(::in6_addr);
+    return std::memcmp(t1.data(), t2.data(), len) >= 0 ? t1 : t2;
+}
+
+template <typename Type>
+class impl_max_function_for final : public aggregate_function::aggregate {
+   std::optional<typename aggregate_type_for<Type>::type> _max{};
+public:
+    virtual void reset() override {
+        _max = {};
+    }
+    virtual opt_bytes compute(cql_serialization_format sf) override {
+        if (!_max) {
+            return {};
+        }
+        return data_type_for<Type>()->decompose(data_value(Type{*_max}));
+    }
+    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
+        if (!values[0]) {
+            return;
+        }
+        auto val = value_cast<typename aggregate_type_for<Type>::type>(data_type_for<Type>()->deserialize(*values[0]));
+        if (!_max) {
+            _max = val;
+        } else {
+            _max = max_wrapper(*_max, val);
+        }
+    }
+};
+
+/// The same as `impl_max_function_for' but without knowledge of `Type'.
+class impl_max_dynamic_function final : public aggregate_function::aggregate {
+    opt_bytes _max;
+public:
+    virtual void reset() override {
+        _max = {};
+    }
+    virtual opt_bytes compute(cql_serialization_format sf) override {
+        return _max.value_or(bytes{});
+    }
+    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
+        if (!values[0]) {
+            return;
+        }
+        const auto val = *values[0];
+        if (!_max || *_max < val) {
+            _max = val;
+        }
+    }
+};
+
+template <typename Type>
+class max_function_for final : public native_aggregate_function {
+public:
+    max_function_for() : native_aggregate_function("max", data_type_for<Type>(), { data_type_for<Type>() }) {}
+    virtual std::unique_ptr<aggregate> new_aggregate() override {
+        return std::make_unique<impl_max_function_for<Type>>();
+    }
+};
+
+class max_dynamic_function final : public native_aggregate_function {
+public:
+    max_dynamic_function(data_type io_type) : native_aggregate_function("max", io_type, { io_type }) {}
+    virtual std::unique_ptr<aggregate> new_aggregate() override {
+        return std::make_unique<impl_max_dynamic_function>();
+    }
+};
+
+/**
+ * Creates a MAX function for the specified type.
+ *
+ * @param inputType the function input and output type
+ * @return a MAX function for the specified type.
+ */
+template <typename Type>
+static
+shared_ptr<aggregate_function>
+make_max_function() {
+    return make_shared<max_function_for<Type>>();
+}
+
+template <typename Type>
+const Type& min_wrapper(const Type& t1, const Type& t2) {
+    using std::min;
+    return min(t1, t2);
+}
+
+inline const net::inet_address& min_wrapper(const net::inet_address& t1, const net::inet_address& t2) {
+    using family = seastar::net::inet_address::family;
+    const size_t len =
+            (t1.in_family() == family::INET || t2.in_family() == family::INET)
+            ? sizeof(::in_addr) : sizeof(::in6_addr);
+    return std::memcmp(t1.data(), t2.data(), len) <= 0 ? t1 : t2;
+}
+
+template <typename Type>
+class impl_min_function_for final : public aggregate_function::aggregate {
+   std::optional<typename aggregate_type_for<Type>::type> _min{};
+public:
+    virtual void reset() override {
+        _min = {};
+    }
+    virtual opt_bytes compute(cql_serialization_format sf) override {
+        if (!_min) {
+            return {};
+        }
+        return data_type_for<Type>()->decompose(data_value(Type{*_min}));
+    }
+    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
+        if (!values[0]) {
+            return;
+        }
+        auto val = value_cast<typename aggregate_type_for<Type>::type>(data_type_for<Type>()->deserialize(*values[0]));
+        if (!_min) {
+            _min = val;
+        } else {
+            _min = min_wrapper(*_min, val);
+        }
+    }
+};
+
+/// The same as `impl_min_function_for' but without knowledge of `Type'.
+class impl_min_dynamic_function final : public aggregate_function::aggregate {
+    opt_bytes _min;
+public:
+    virtual void reset() override {
+        _min = {};
+    }
+    virtual opt_bytes compute(cql_serialization_format sf) override {
+        return _min.value_or(bytes{});
+    }
+    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
+        if (!values[0]) {
+            return;
+        }
+        const auto val = *values[0];
+        if (!_min || val < *_min) {
+            _min = val;
+        }
+    }
+};
+
+template <typename Type>
+class min_function_for final : public native_aggregate_function {
+public:
+    min_function_for() : native_aggregate_function("min", data_type_for<Type>(), { data_type_for<Type>() }) {}
+    virtual std::unique_ptr<aggregate> new_aggregate() override {
+        return std::make_unique<impl_min_function_for<Type>>();
+    }
+};
+
+class min_dynamic_function final : public native_aggregate_function {
+public:
+    min_dynamic_function(data_type io_type) : native_aggregate_function("min", io_type, { io_type }) {}
+    virtual std::unique_ptr<aggregate> new_aggregate() override {
+        return std::make_unique<impl_min_dynamic_function>();
+    }
+};
+
+/**
+ * Creates a MIN function for the specified type.
+ *
+ * @param inputType the function input and output type
+ * @return a MIN function for the specified type.
+ */
+template <typename Type>
+static
+shared_ptr<aggregate_function>
+make_min_function() {
+    return make_shared<min_function_for<Type>>();
+}
+
+template <typename Type>
+class impl_count_function_for final : public aggregate_function::aggregate {
+   int64_t _count = 0;
+public:
+    virtual void reset() override {
+        _count = 0;
+    }
+    virtual opt_bytes compute(cql_serialization_format sf) override {
+        return long_type->decompose(_count);
+    }
+    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
+        if (!values[0]) {
+            return;
+        }
+        ++_count;
+    }
+};
+
+template <typename Type>
+class count_function_for final : public native_aggregate_function {
+public:
+    count_function_for() : native_aggregate_function("count", long_type, { data_type_for<Type>() }) {}
+    virtual std::unique_ptr<aggregate> new_aggregate() override {
+        return std::make_unique<impl_count_function_for<Type>>();
+    }
+};
+
+/**
+ * Creates a COUNT function for the specified type.
+ *
+ * @param inputType the function input type
+ * @return a COUNT function for the specified type.
+ */
+template <typename Type>
+static shared_ptr<aggregate_function> make_count_function() {
+    return make_shared<count_function_for<Type>>();
+}
+}
+
+shared_ptr<aggregate_function>
+aggregate_fcts::make_count_rows_function() {
+    return make_shared<count_rows_function>();
+}
+
+shared_ptr<aggregate_function>
+aggregate_fcts::make_max_dynamic_function(data_type io_type) {
+    return make_shared<max_dynamic_function>(io_type);
+}
+
+shared_ptr<aggregate_function>
+aggregate_fcts::make_min_dynamic_function(data_type io_type) {
+    return make_shared<min_dynamic_function>(io_type);
+}
+
+void cql3::functions::add_agg_functions(declared_t& funcs) {
+    auto declare = [&funcs] (shared_ptr<function> f) { funcs.emplace(f->name(), f); };
+
+    declare(make_count_function<int8_t>());
+    declare(make_max_function<int8_t>());
+    declare(make_min_function<int8_t>());
+
+    declare(make_count_function<int16_t>());
+    declare(make_max_function<int16_t>());
+    declare(make_min_function<int16_t>());
+
+    declare(make_count_function<int32_t>());
+    declare(make_max_function<int32_t>());
+    declare(make_min_function<int32_t>());
+
+    declare(make_count_function<int64_t>());
+    declare(make_max_function<int64_t>());
+    declare(make_min_function<int64_t>());
+
+    declare(make_count_function<boost::multiprecision::cpp_int>());
+    declare(make_max_function<boost::multiprecision::cpp_int>());
+    declare(make_min_function<boost::multiprecision::cpp_int>());
+
+    declare(make_count_function<big_decimal>());
+    declare(make_max_function<big_decimal>());
+    declare(make_min_function<big_decimal>());
+
+    declare(make_count_function<float>());
+    declare(make_max_function<float>());
+    declare(make_min_function<float>());
+
+    declare(make_count_function<double>());
+    declare(make_max_function<double>());
+    declare(make_min_function<double>());
+
+    declare(make_count_function<sstring>());
+    declare(make_max_function<sstring>());
+    declare(make_min_function<sstring>());
+
+    declare(make_count_function<ascii_native_type>());
+    declare(make_max_function<ascii_native_type>());
+    declare(make_min_function<ascii_native_type>());
+
+    declare(make_count_function<simple_date_native_type>());
+    declare(make_max_function<simple_date_native_type>());
+    declare(make_min_function<simple_date_native_type>());
+
+    declare(make_count_function<db_clock::time_point>());
+    declare(make_max_function<db_clock::time_point>());
+    declare(make_min_function<db_clock::time_point>());
+
+    declare(make_count_function<timeuuid_native_type>());
+    declare(make_max_function<timeuuid_native_type>());
+    declare(make_min_function<timeuuid_native_type>());
+
+    declare(make_count_function<time_native_type>());
+    declare(make_max_function<time_native_type>());
+    declare(make_min_function<time_native_type>());
+
+    declare(make_count_function<utils::UUID>());
+    declare(make_max_function<utils::UUID>());
+    declare(make_min_function<utils::UUID>());
+
+    declare(make_count_function<bytes>());
+    declare(make_max_function<bytes>());
+    declare(make_min_function<bytes>());
+
+    declare(make_count_function<bool>());
+    declare(make_max_function<bool>());
+    declare(make_min_function<bool>());
+
+    declare(make_count_function<net::inet_address>());
+    declare(make_max_function<net::inet_address>());
+    declare(make_min_function<net::inet_address>());
+
+    // FIXME: more count/min/max
+
+    declare(make_sum_function<int8_t>());
+    declare(make_sum_function<int16_t>());
+    declare(make_sum_function<int32_t>());
+    declare(make_sum_function<int64_t>());
+    declare(make_sum_function<float>());
+    declare(make_sum_function<double>());
+    declare(make_sum_function<boost::multiprecision::cpp_int>());
+    declare(make_sum_function<big_decimal>());
+    declare(make_avg_function<int8_t>());
+    declare(make_avg_function<int16_t>());
+    declare(make_avg_function<int32_t>());
+    declare(make_avg_function<int64_t>());
+    declare(make_avg_function<float>());
+    declare(make_avg_function<double>());
+    declare(make_avg_function<boost::multiprecision::cpp_int>());
+    declare(make_avg_function<big_decimal>());
+}
--- a/cql3/functions/aggregate_fcts.hh
+++ b/cql3/functions/aggregate_fcts.hh
@@ -41,348 +41,28 @@

 #pragma once

-#include "utils/big_decimal.hh"
 #include "aggregate_function.hh"
-#include "native_aggregate_function.hh"

 namespace cql3 {
 namespace functions {

-/**
- * Factory methods for aggregate functions.
- */
+/// Factory methods for aggregate functions.
 namespace aggregate_fcts {

-class impl_count_function : public aggregate_function::aggregate {
-    int64_t _count;
-public:
-    virtual void reset() override {
-        _count = 0;
-    }
-    virtual opt_bytes compute(cql_serialization_format sf) override {
-        return long_type->decompose(_count);
-    }
-    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
-        ++_count;
-    }
-};
-
 static const sstring COUNT_ROWS_FUNCTION_NAME = "countRows";

-class count_rows_function final : public native_aggregate_function {
-public:
-    count_rows_function() : native_aggregate_function(COUNT_ROWS_FUNCTION_NAME, long_type, {}) {}
-    virtual std::unique_ptr<aggregate> new_aggregate() override {
-        return std::make_unique<impl_count_function>();
-    }
-    virtual sstring column_name(const std::vector<sstring>& column_names) override {
-        return "count";
-    }
-};
-
-    /**
-     * The function used to count the number of rows of a result set. This function is called when COUNT(*) or COUNT(1)
-     * is specified.
-     */
-inline
+/// The function used to count the number of rows of a result set. This function is called when COUNT(*) or COUNT(1)
+/// is specified.
 shared_ptr<aggregate_function>
-make_count_rows_function() {
-    return make_shared<count_rows_function>();
-}
+make_count_rows_function();

-template <typename Type>
-class impl_sum_function_for final : public aggregate_function::aggregate {
-   Type _sum{};
-public:
-    virtual void reset() override {
-        _sum = {};
-    }
-    virtual opt_bytes compute(cql_serialization_format sf) override {
-        return data_type_for<Type>()->decompose(_sum);
-    }
-    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
-        if (!values[0]) {
-            return;
-        }
-        _sum += value_cast<Type>(data_type_for<Type>()->deserialize(*values[0]));
-    }
-};
-
-template <typename Type>
-class sum_function_for final : public native_aggregate_function {
-public:
-    sum_function_for() : native_aggregate_function("sum", data_type_for<Type>(), { data_type_for<Type>() }) {}
-    virtual std::unique_ptr<aggregate> new_aggregate() override {
-        return std::make_unique<impl_sum_function_for<Type>>();
-    }
-};
-
-
-template <typename Type>
-inline
+/// The same as `make_max_function()' but with type provided in runtime.
 shared_ptr<aggregate_function>
-make_sum_function() {
-    return make_shared<sum_function_for<Type>>();
-}
+make_max_dynamic_function(data_type io_type);

-template <typename Type>
-class impl_div_for_avg {
-public:
-    static Type div(const Type& x, const int64_t y) {
-        return x/y;
-    }
-};
-
-template <>
-class impl_div_for_avg<big_decimal> {
-public:
-    static big_decimal div(const big_decimal& x, const int64_t y) {
-        return x.div(y, big_decimal::rounding_mode::HALF_EVEN);
-    }
-};
-
-// We need a wider accumulator for average, since summing the inputs can overflow
-// the input type
-template <typename T>
-struct accumulator_for;
-
-template <>
-struct accumulator_for<int8_t> {
-    using type = __int128;
-};
-
-template <>
-struct accumulator_for<int16_t> {
-    using type = __int128;
-};
-
-template <>
-struct accumulator_for<int32_t> {
-    using type = __int128;
-};
-
-template <>
-struct accumulator_for<int64_t> {
-    using type = __int128;
-};
-
-template <>
-struct accumulator_for<float> {
-    using type = float;
-};
-
-template <>
-struct accumulator_for<double> {
-    using type = double;
-};
-
-template <>
-struct accumulator_for<boost::multiprecision::cpp_int> {
-    using type = boost::multiprecision::cpp_int;
-};
-
-template <>
-struct accumulator_for<big_decimal> {
-    using type = big_decimal;
-};
-
-template <typename Type>
-class impl_avg_function_for final : public aggregate_function::aggregate {
-   typename accumulator_for<Type>::type _sum{};
-   int64_t _count = 0;
-public:
-    virtual void reset() override {
-        _sum = {};
-        _count = 0;
-    }
-    virtual opt_bytes compute(cql_serialization_format sf) override {
-        Type ret{};
-        if (_count) {
-            ret = impl_div_for_avg<Type>::div(_sum, _count);
-        }
-        return data_type_for<Type>()->decompose(ret);
-    }
-    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
-        if (!values[0]) {
-            return;
-        }
-        ++_count;
-        _sum += value_cast<Type>(data_type_for<Type>()->deserialize(*values[0]));
-    }
-};
-
-template <typename Type>
-class avg_function_for final : public native_aggregate_function {
-public:
-    avg_function_for() : native_aggregate_function("avg", data_type_for<Type>(), { data_type_for<Type>() }) {}
-    virtual std::unique_ptr<aggregate> new_aggregate() override {
-        return std::make_unique<impl_avg_function_for<Type>>();
-    }
-};
-
-template <typename Type>
-inline
+/// The same as `make_min_function()' but with type provided in runtime.
 shared_ptr<aggregate_function>
-make_avg_function() {
-    return make_shared<avg_function_for<Type>>();
-}
-
-template <typename T>
-struct aggregate_type_for {
-    using type = T;
-};
-
-template<>
-struct aggregate_type_for<ascii_native_type> {
-    using type = ascii_native_type::primary_type;
-};
-
-template<>
-struct aggregate_type_for<simple_date_native_type> {
-    using type = simple_date_native_type::primary_type;
-};
-
-template<>
-struct aggregate_type_for<timeuuid_native_type> {
-    using type = timeuuid_native_type::primary_type;
-};
-
-template <typename Type>
-class impl_max_function_for final : public aggregate_function::aggregate {
-   std::optional<typename aggregate_type_for<Type>::type> _max{};
-public:
-    virtual void reset() override {
-        _max = {};
-    }
-    virtual opt_bytes compute(cql_serialization_format sf) override {
-        if (!_max) {
-            return {};
-        }
-        return data_type_for<Type>()->decompose(data_value(Type{*_max}));
-    }
-    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
-        if (!values[0]) {
-            return;
-        }
-        auto val = value_cast<typename aggregate_type_for<Type>::type>(data_type_for<Type>()->deserialize(*values[0]));
-        if (!_max) {
-            _max = val;
-        } else {
-            _max = std::max(*_max, val);
-        }
-    }
-};
-
-template <typename Type>
-class max_function_for final : public native_aggregate_function {
-public:
-    max_function_for() : native_aggregate_function("max", data_type_for<Type>(), { data_type_for<Type>() }) {}
-    virtual std::unique_ptr<aggregate> new_aggregate() override {
-        return std::make_unique<impl_max_function_for<Type>>();
-    }
-};
-
-    /**
-     * Creates a MAX function for the specified type.
-     *
-     * @param inputType the function input and output type
-     * @return a MAX function for the specified type.
-     */
-template <typename Type>
-shared_ptr<aggregate_function>
-make_max_function() {
-    return make_shared<max_function_for<Type>>();
-}
-
-template <typename Type>
-class impl_min_function_for final : public aggregate_function::aggregate {
-   std::optional<typename aggregate_type_for<Type>::type> _min{};
-public:
-    virtual void reset() override {
-        _min = {};
-    }
-    virtual opt_bytes compute(cql_serialization_format sf) override {
-        if (!_min) {
-            return {};
-        }
-        return data_type_for<Type>()->decompose(data_value(Type{*_min}));
-    }
-    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
-        if (!values[0]) {
-            return;
-        }
-        auto val = value_cast<typename aggregate_type_for<Type>::type>(data_type_for<Type>()->deserialize(*values[0]));
-        if (!_min) {
-            _min = val;
-        } else {
-            _min = std::min(*_min, val);
-        }
-    }
-};
-
-template <typename Type>
-class min_function_for final : public native_aggregate_function {
-public:
-    min_function_for() : native_aggregate_function("min", data_type_for<Type>(), { data_type_for<Type>() }) {}
-    virtual std::unique_ptr<aggregate> new_aggregate() override {
-        return std::make_unique<impl_min_function_for<Type>>();
-    }
-};
-
-
-    /**
-     * Creates a MIN function for the specified type.
-     *
-     * @param inputType the function input and output type
-     * @return a MIN function for the specified type.
-     */
-template <typename Type>
-shared_ptr<aggregate_function>
-make_min_function() {
-    return make_shared<min_function_for<Type>>();
-}
-
-
-template <typename Type>
-class impl_count_function_for final : public aggregate_function::aggregate {
-   int64_t _count = 0;
-public:
-    virtual void reset() override {
-        _count = 0;
-    }
-    virtual opt_bytes compute(cql_serialization_format sf) override {
-        return long_type->decompose(_count);
-    }
-    virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
-        if (!values[0]) {
-            return;
-        }
-        ++_count;
-    }
-};
-
-template <typename Type>
-class count_function_for final : public native_aggregate_function {
-public:
-    count_function_for() : native_aggregate_function("count", long_type, { data_type_for<Type>() }) {}
-    virtual std::unique_ptr<aggregate> new_aggregate() override {
-        return std::make_unique<impl_count_function_for<Type>>();
-    }
-};
-
-    /**
-     * Creates a COUNT function for the specified type.
-     *
-     * @param inputType the function input type
-     * @return a COUNT function for the specified type.
-     */
-template <typename Type>
-shared_ptr<aggregate_function>
-make_count_function() {
-    return make_shared<count_function_for<Type>>();
-}
-
+make_min_dynamic_function(data_type io_type);
 }
 }
 }
-
--- a/cql3/functions/as_json_function.hh
+++ b/cql3/functions/as_json_function.hh
@@ -44,6 +44,7 @@
 #include "cql3/functions/function.hh"
 #include "cql3/functions/scalar_function.hh"
 #include "cql3/cql3_type.hh"
+#include "cql3/type_json.hh"

 #include "bytes_ostream.hh"
 #include "types.hh"
@@ -73,6 +74,8 @@ public:
        : _selector_names(std::move(selector_names)), _selector_types(std::move(selector_types)) {
    }

+    virtual bool requires_thread() const;
+
    virtual bytes_opt execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) override {
        bytes_ostream encoded_row;
        encoded_row.write("{", 1);
@@ -90,7 +93,7 @@ public:
                encoded_row.write("\\\"", 2);
            }
            encoded_row.write("\": ", 3);
-            sstring row_sstring = _selector_types[i]->to_json_string(parameters[i]);
+            sstring row_sstring = to_json_string(*_selector_types[i], parameters[i]);
            encoded_row.write(row_sstring.c_str(), row_sstring.size());
        }
        encoded_row.write("}", 1);
@@ -110,15 +113,15 @@ public:
        return utf8_type;
    }

-    virtual bool is_pure() override {
+    virtual bool is_pure() const override {
        return true;
    }

-    virtual bool is_native() override {
+    virtual bool is_native() const override {
        return true;
    }

-    virtual bool is_aggregate() override {
+    virtual bool is_aggregate() const override {
        // Aggregates of aggregates are currently not supported, but JSON handles them
        return false;
    }
@@ -137,15 +140,15 @@ public:
        os << ") -> " << utf8_type->as_cql3_type().to_string();
    }

-    virtual bool uses_function(const sstring& ks_name, const sstring& function_name) override {
+    virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override {
        return false;
    }

-    virtual bool has_reference_to(function& f) override {
+    virtual bool has_reference_to(function& f) const override {
        return false;
    }

-    virtual sstring column_name(const std::vector<sstring>& column_names) override {
+    virtual sstring column_name(const std::vector<sstring>& column_names) const override {
        return "[json]";
    }

--- a/cql3/functions/castas_fcts.cc
+++ b/cql3/functions/castas_fcts.cc
@@ -20,7 +20,11 @@
 */

 #include "castas_fcts.hh"
+#include "concrete_types.hh"
+#include "utils/UUID_gen.hh"
 #include "cql3/functions/native_scalar_function.hh"
+#include "utils/date.h"
+#include <boost/date_time/posix_time/posix_time.hpp>

 namespace cql3 {
 namespace functions {
@@ -30,7 +34,7 @@ namespace {
 using bytes_opt = std::optional<bytes>;

 class castas_function_for : public cql3::functions::native_scalar_function {
-    castas_fctn _func;
+    cql3::functions::castas_fctn _func;
 public:
    castas_function_for(data_type to_type,
                        data_type from_type,
@@ -38,7 +42,7 @@ public:
            : native_scalar_function("castas" + to_type->as_cql3_type().to_string(), to_type, {from_type})
            , _func(func) {
    }
-    virtual bool is_pure() override {
+    virtual bool is_pure() const override {
        return true;
    }
    virtual void print(std::ostream& os) const override {
@@ -64,6 +68,289 @@ shared_ptr<function> make_castas_function(data_type to_type, data_type from_type

 } /* Anonymous Namespace */

+/*
+ * Support for CAST(. AS .) functions.
+ */
+namespace {
+
+using bytes_opt = std::optional<bytes>;
+
+template<typename ToType, typename FromType>
+std::function<data_value(data_value)> make_castas_fctn_simple() {
+    return [](data_value from) -> data_value {
+        auto val_from = value_cast<FromType>(from);
+        return static_cast<ToType>(val_from);
+    };
+}
+
+template<typename ToType>
+std::function<data_value(data_value)> make_castas_fctn_from_decimal_to_float() {
+    return [](data_value from) -> data_value {
+        auto val_from = value_cast<big_decimal>(from);
+        boost::multiprecision::cpp_int ten(10);
+        boost::multiprecision::cpp_rational r = val_from.unscaled_value();
+        r /= boost::multiprecision::pow(ten, val_from.scale());
+        return static_cast<ToType>(r);
+    };
+}
+
+static boost::multiprecision::cpp_int from_decimal_to_cppint(const data_value& from) {
+    const auto& val_from = value_cast<big_decimal>(from);
+    boost::multiprecision::cpp_int ten(10);
+    return val_from.unscaled_value() / boost::multiprecision::pow(ten, val_from.scale());
+}
+
+template<typename ToType>
+std::function<data_value(data_value)> make_castas_fctn_from_varint_to_integer() {
+    return [](data_value from) -> data_value {
+        const auto& varint = value_cast<boost::multiprecision::cpp_int>(from);
+        return static_cast<ToType>(from_varint_to_integer(varint));
+    };
+}
+
+template<typename ToType>
+std::function<data_value(data_value)> make_castas_fctn_from_decimal_to_integer() {
+    return [](data_value from) -> data_value {
+        auto varint = from_decimal_to_cppint(from);
+        return static_cast<ToType>(from_varint_to_integer(varint));
+    };
+}
+
+std::function<data_value(data_value)> make_castas_fctn_from_decimal_to_varint() {
+    return [](data_value from) -> data_value {
+        return from_decimal_to_cppint(from);
+    };
+}
+
+template<typename FromType>
+std::function<data_value(data_value)> make_castas_fctn_from_integer_to_decimal() {
+    return [](data_value from) -> data_value {
+        auto val_from = value_cast<FromType>(from);
+        return big_decimal(1, 10*static_cast<boost::multiprecision::cpp_int>(val_from));
+    };
+}
+
+template<typename FromType>
+std::function<data_value(data_value)> make_castas_fctn_from_float_to_decimal() {
+    return [](data_value from) -> data_value {
+        auto val_from = value_cast<FromType>(from);
+        return big_decimal(boost::lexical_cast<std::string>(val_from));
+    };
+}
+
+template<typename FromType>
+std::function<data_value(data_value)> make_castas_fctn_to_string() {
+    return [](data_value from) -> data_value {
+        return to_sstring(value_cast<FromType>(from));
+    };
+}
+
+std::function<data_value(data_value)> make_castas_fctn_from_varint_to_string() {
+    return [](data_value from) -> data_value {
+        return to_sstring(value_cast<boost::multiprecision::cpp_int>(from).str());
+    };
+}
+
+std::function<data_value(data_value)> make_castas_fctn_from_decimal_to_string() {
+    return [](data_value from) -> data_value {
+        return value_cast<big_decimal>(from).to_string();
+    };
+}
+
+db_clock::time_point millis_to_time_point(const int64_t millis) {
+    return db_clock::time_point{std::chrono::milliseconds(millis)};
+}
+
+simple_date_native_type time_point_to_date(const db_clock::time_point& tp) {
+    const auto epoch = boost::posix_time::from_time_t(0);
+    auto timestamp = tp.time_since_epoch().count();
+    auto time = boost::posix_time::from_time_t(0) + boost::posix_time::milliseconds(timestamp);
+    const auto diff = time.date() - epoch.date();
+    return simple_date_native_type{uint32_t(diff.days() + (1UL<<31))};
+}
+
+db_clock::time_point date_to_time_point(const uint32_t date) {
+    const auto epoch = boost::posix_time::from_time_t(0);
+    const auto target_date = epoch + boost::gregorian::days(int64_t(date) - (1UL<<31));
+    boost::posix_time::time_duration duration = target_date - epoch;
+    const auto millis = std::chrono::milliseconds(duration.total_milliseconds());
+    return db_clock::time_point(std::chrono::duration_cast<db_clock::duration>(millis));
+}
+
+std::function<data_value(data_value)> make_castas_fctn_from_timestamp_to_date() {
+    return [](data_value from) -> data_value {
+        const auto val_from = value_cast<db_clock::time_point>(from);
+        return time_point_to_date(val_from);
+    };
+}
+
+std::function<data_value(data_value)> make_castas_fctn_from_date_to_timestamp() {
+    return [](data_value from) -> data_value {
+        const auto val_from = value_cast<uint32_t>(from);
+        return date_to_time_point(val_from);
+    };
+}
+
+std::function<data_value(data_value)> make_castas_fctn_from_timeuuid_to_timestamp() {
+    return [](data_value from) -> data_value {
+        const auto val_from = value_cast<utils::UUID>(from);
+        return db_clock::time_point{db_clock::duration{utils::UUID_gen::unix_timestamp(val_from)}};
+    };
+}
+
+std::function<data_value(data_value)> make_castas_fctn_from_timeuuid_to_date() {
+    return [](data_value from) -> data_value {
+        const auto val_from = value_cast<utils::UUID>(from);
+        return time_point_to_date(millis_to_time_point(utils::UUID_gen::unix_timestamp(val_from)));
+    };
+}
+
+static std::function<data_value(data_value)> make_castas_fctn_from_dv_to_string() {
+    return [](data_value from) -> data_value {
+        return from.type()->to_string_impl(from);
+    };
+}
+
+// FIXME: Add conversions for counters, after they are fully implemented...
+
+// Map <ToType, FromType> -> castas_fctn
+using castas_fctn_key = std::pair<data_type, data_type>;
+struct castas_fctn_hash {
+    std::size_t operator()(const castas_fctn_key& x) const noexcept {
+        return boost::hash_value(x);
+    }
+};
+using castas_fctns_map = std::unordered_map<castas_fctn_key, castas_fctn, castas_fctn_hash>;
+
+// List of supported castas functions...
+thread_local castas_fctns_map castas_fctns {
+    { {byte_type, byte_type}, make_castas_fctn_simple<int8_t, int8_t>() },
+    { {byte_type, short_type}, make_castas_fctn_simple<int8_t, int16_t>() },
+    { {byte_type, int32_type}, make_castas_fctn_simple<int8_t, int32_t>() },
+    { {byte_type, long_type}, make_castas_fctn_simple<int8_t, int64_t>() },
+    { {byte_type, float_type}, make_castas_fctn_simple<int8_t, float>() },
+    { {byte_type, double_type}, make_castas_fctn_simple<int8_t, double>() },
+    { {byte_type, varint_type}, make_castas_fctn_from_varint_to_integer<int8_t>() },
+    { {byte_type, decimal_type}, make_castas_fctn_from_decimal_to_integer<int8_t>() },
+
+    { {short_type, byte_type}, make_castas_fctn_simple<int16_t, int8_t>() },
+    { {short_type, short_type}, make_castas_fctn_simple<int16_t, int16_t>() },
+    { {short_type, int32_type}, make_castas_fctn_simple<int16_t, int32_t>() },
+    { {short_type, long_type}, make_castas_fctn_simple<int16_t, int64_t>() },
+    { {short_type, float_type}, make_castas_fctn_simple<int16_t, float>() },
+    { {short_type, double_type}, make_castas_fctn_simple<int16_t, double>() },
+    { {short_type, varint_type}, make_castas_fctn_from_varint_to_integer<int16_t>() },
+    { {short_type, decimal_type}, make_castas_fctn_from_decimal_to_integer<int16_t>() },
+
+    { {int32_type, byte_type}, make_castas_fctn_simple<int32_t, int8_t>() },
+    { {int32_type, short_type}, make_castas_fctn_simple<int32_t, int16_t>() },
+    { {int32_type, int32_type}, make_castas_fctn_simple<int32_t, int32_t>() },
+    { {int32_type, long_type}, make_castas_fctn_simple<int32_t, int64_t>() },
+    { {int32_type, float_type}, make_castas_fctn_simple<int32_t, float>() },
+    { {int32_type, double_type}, make_castas_fctn_simple<int32_t, double>() },
+    { {int32_type, varint_type}, make_castas_fctn_from_varint_to_integer<int32_t>() },
+    { {int32_type, decimal_type}, make_castas_fctn_from_decimal_to_integer<int32_t>() },
+
+    { {long_type, byte_type}, make_castas_fctn_simple<int64_t, int8_t>() },
+    { {long_type, short_type}, make_castas_fctn_simple<int64_t, int16_t>() },
+    { {long_type, int32_type}, make_castas_fctn_simple<int64_t, int32_t>() },
+    { {long_type, long_type}, make_castas_fctn_simple<int64_t, int64_t>() },
+    { {long_type, float_type}, make_castas_fctn_simple<int64_t, float>() },
+    { {long_type, double_type}, make_castas_fctn_simple<int64_t, double>() },
+    { {long_type, varint_type}, make_castas_fctn_from_varint_to_integer<int64_t>() },
+    { {long_type, decimal_type}, make_castas_fctn_from_decimal_to_integer<int64_t>() },
+
+    { {float_type, byte_type}, make_castas_fctn_simple<float, int8_t>() },
+    { {float_type, short_type}, make_castas_fctn_simple<float, int16_t>() },
+    { {float_type, int32_type}, make_castas_fctn_simple<float, int32_t>() },
+    { {float_type, long_type}, make_castas_fctn_simple<float, int64_t>() },
+    { {float_type, float_type}, make_castas_fctn_simple<float, float>() },
+    { {float_type, double_type}, make_castas_fctn_simple<float, double>() },
+    { {float_type, varint_type}, make_castas_fctn_simple<float, boost::multiprecision::cpp_int>() },
+    { {float_type, decimal_type}, make_castas_fctn_from_decimal_to_float<float>() },
+
+    { {double_type, byte_type}, make_castas_fctn_simple<double, int8_t>() },
+    { {double_type, short_type}, make_castas_fctn_simple<double, int16_t>() },
+    { {double_type, int32_type}, make_castas_fctn_simple<double, int32_t>() },
+    { {double_type, long_type}, make_castas_fctn_simple<double, int64_t>() },
+    { {double_type, float_type}, make_castas_fctn_simple<double, float>() },
+    { {double_type, double_type}, make_castas_fctn_simple<double, double>() },
+    { {double_type, varint_type}, make_castas_fctn_simple<double, boost::multiprecision::cpp_int>() },
+    { {double_type, decimal_type}, make_castas_fctn_from_decimal_to_float<double>() },
+
+    { {varint_type, byte_type}, make_castas_fctn_simple<boost::multiprecision::cpp_int, int8_t>() },
+    { {varint_type, short_type}, make_castas_fctn_simple<boost::multiprecision::cpp_int, int16_t>() },
+    { {varint_type, int32_type}, make_castas_fctn_simple<boost::multiprecision::cpp_int, int32_t>() },
+    { {varint_type, long_type}, make_castas_fctn_simple<boost::multiprecision::cpp_int, int64_t>() },
+    { {varint_type, float_type}, make_castas_fctn_simple<boost::multiprecision::cpp_int, float>() },
+    { {varint_type, double_type}, make_castas_fctn_simple<boost::multiprecision::cpp_int, double>() },
+    { {varint_type, varint_type}, make_castas_fctn_simple<boost::multiprecision::cpp_int, boost::multiprecision::cpp_int>() },
+    { {varint_type, decimal_type}, make_castas_fctn_from_decimal_to_varint() },
+
+    { {decimal_type, byte_type}, make_castas_fctn_from_integer_to_decimal<int8_t>() },
+    { {decimal_type, short_type}, make_castas_fctn_from_integer_to_decimal<int16_t>() },
+    { {decimal_type, int32_type}, make_castas_fctn_from_integer_to_decimal<int32_t>() },
+    { {decimal_type, long_type}, make_castas_fctn_from_integer_to_decimal<int64_t>() },
+    { {decimal_type, float_type}, make_castas_fctn_from_float_to_decimal<float>() },
+    { {decimal_type, double_type}, make_castas_fctn_from_float_to_decimal<double>() },
+    { {decimal_type, varint_type}, make_castas_fctn_from_integer_to_decimal<boost::multiprecision::cpp_int>() },
+    { {decimal_type, decimal_type}, make_castas_fctn_simple<big_decimal, big_decimal>() },
+
+    { {ascii_type, byte_type}, make_castas_fctn_to_string<int8_t>() },
+    { {ascii_type, short_type}, make_castas_fctn_to_string<int16_t>() },
+    { {ascii_type, int32_type}, make_castas_fctn_to_string<int32_t>() },
+    { {ascii_type, long_type}, make_castas_fctn_to_string<int64_t>() },
+    { {ascii_type, float_type}, make_castas_fctn_to_string<float>() },
+    { {ascii_type, double_type}, make_castas_fctn_to_string<double>() },
+    { {ascii_type, varint_type}, make_castas_fctn_from_varint_to_string() },
+    { {ascii_type, decimal_type}, make_castas_fctn_from_decimal_to_string() },
+
+    { {utf8_type, byte_type}, make_castas_fctn_to_string<int8_t>() },
+    { {utf8_type, short_type}, make_castas_fctn_to_string<int16_t>() },
+    { {utf8_type, int32_type}, make_castas_fctn_to_string<int32_t>() },
+    { {utf8_type, long_type}, make_castas_fctn_to_string<int64_t>() },
+    { {utf8_type, float_type}, make_castas_fctn_to_string<float>() },
+    { {utf8_type, double_type}, make_castas_fctn_to_string<double>() },
+    { {utf8_type, varint_type}, make_castas_fctn_from_varint_to_string() },
+    { {utf8_type, decimal_type}, make_castas_fctn_from_decimal_to_string() },
+
+    { {simple_date_type, timestamp_type}, make_castas_fctn_from_timestamp_to_date() },
+    { {simple_date_type, timeuuid_type}, make_castas_fctn_from_timeuuid_to_date() },
+
+    { {timestamp_type, simple_date_type}, make_castas_fctn_from_date_to_timestamp() },
+    { {timestamp_type, timeuuid_type}, make_castas_fctn_from_timeuuid_to_timestamp() },
+
+    { {ascii_type, timestamp_type}, make_castas_fctn_from_dv_to_string() },
+    { {ascii_type, simple_date_type}, make_castas_fctn_from_dv_to_string() },
+    { {ascii_type, time_type}, make_castas_fctn_from_dv_to_string() },
+    { {ascii_type, timeuuid_type}, make_castas_fctn_from_dv_to_string() },
+    { {ascii_type, uuid_type}, make_castas_fctn_from_dv_to_string() },
+    { {ascii_type, boolean_type}, make_castas_fctn_from_dv_to_string() },
+    { {ascii_type, inet_addr_type}, make_castas_fctn_from_dv_to_string() },
+    { {ascii_type, ascii_type}, make_castas_fctn_simple<sstring, sstring>() },
+
+    { {utf8_type, timestamp_type}, make_castas_fctn_from_dv_to_string() },
+    { {utf8_type, simple_date_type}, make_castas_fctn_from_dv_to_string() },
+    { {utf8_type, time_type}, make_castas_fctn_from_dv_to_string() },
+    { {utf8_type, timeuuid_type}, make_castas_fctn_from_dv_to_string() },
+    { {utf8_type, uuid_type}, make_castas_fctn_from_dv_to_string() },
+    { {utf8_type, boolean_type}, make_castas_fctn_from_dv_to_string() },
+    { {utf8_type, inet_addr_type}, make_castas_fctn_from_dv_to_string() },
+    { {utf8_type, ascii_type}, make_castas_fctn_simple<sstring, sstring>() },
+    { {utf8_type, utf8_type}, make_castas_fctn_simple<sstring, sstring>() },
+};
+
+} /* Anonymous Namespace */
+
+castas_fctn get_castas_fctn(data_type to_type, data_type from_type) {
+    auto it_candidate = castas_fctns.find(castas_fctn_key{to_type, from_type});
+    if (it_candidate == castas_fctns.end()) {
+        throw exceptions::invalid_request_exception(format("{} cannot be cast to {}", from_type->name(), to_type->name()));
+    }
+
+    return it_candidate->second;
+}
+
 shared_ptr<function> castas_functions::get(data_type to_type, const std::vector<shared_ptr<cql3::selection::selector>>& provided_args, schema_ptr s) {
    if (provided_args.size() != 1) {
        throw exceptions::invalid_request_exception("Invalid CAST expression");
--- a/cql3/functions/castas_fcts.hh
+++ b/cql3/functions/castas_fcts.hh
@@ -54,6 +54,14 @@
 namespace cql3 {
 namespace functions {

+/*
+ * Support for CAST(. AS .) functions.
+ */
+
+using castas_fctn = std::function<data_value(data_value)>;
+
+castas_fctn get_castas_fctn(data_type to_type, data_type from_type);
+
 class castas_functions {
 public:
    static shared_ptr<function> get(data_type to_type, const std::vector<shared_ptr<cql3::selection::selector>>& provided_args, schema_ptr s);
--- a/cql3/functions/function.hh
+++ b/cql3/functions/function.hh
@@ -62,25 +62,27 @@ public:
     *
     * @return <code>true</code> if the function is a pure function, <code>false</code> otherwise.
     */
-    virtual bool is_pure() = 0;
+    virtual bool is_pure() const = 0;

    /**
     * Checks whether the function is a native/hard coded one or not.
     *
     * @return <code>true</code> if the function is a native/hard coded one, <code>false</code> otherwise.
     */
-    virtual bool is_native() = 0;
+    virtual bool is_native() const = 0;
+
+    virtual bool requires_thread() const = 0;

    /**
     * Checks whether the function is an aggregate function or not.
     *
     * @return <code>true</code> if the function is an aggregate function, <code>false</code> otherwise.
     */
-    virtual bool is_aggregate() = 0;
+    virtual bool is_aggregate() const = 0;

    virtual void print(std::ostream& os) const = 0;
-    virtual bool uses_function(const sstring& ks_name, const sstring& function_name) = 0;
-    virtual bool has_reference_to(function& f) = 0;
+    virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const = 0;
+    virtual bool has_reference_to(function& f) const = 0;

    /**
     * Returns the name of the function to use within a ResultSet.
@@ -88,7 +90,7 @@ public:
     * @param column_names the names of the columns used to call the function
     * @return the name of the function to use within a ResultSet
     */
-    virtual sstring column_name(const std::vector<sstring>& column_names) = 0;
+    virtual sstring column_name(const std::vector<sstring>& column_names) const = 0;

    friend class function_call;
    friend std::ostream& operator<<(std::ostream& os, const function& f);
--- a/cql3/functions/function_call.hh
+++ b/cql3/functions/function_call.hh
@@ -57,7 +57,7 @@ public:
            : _fun(std::move(fun)), _terms(std::move(terms)) {
    }
    virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override;
-    virtual void collect_marker_specification(shared_ptr<variable_specifications> bound_names) override;
+    virtual void collect_marker_specification(lw_shared_ptr<variable_specifications> bound_names) override;
    virtual shared_ptr<terminal> bind(const query_options& options) override;
    virtual cql3::raw_value_view bind_and_get(const query_options& options) override;
 private:
--- a/cql3/functions/functions.cc
+++ b/cql3/functions/functions.cc
@@ -28,18 +28,42 @@
 #include "cql3/lists.hh"
 #include "cql3/constants.hh"
 #include "cql3/user_types.hh"
+#include "cql3/type_json.hh"
 #include "database.hh"
 #include "types/map.hh"
 #include "types/set.hh"
 #include "types/list.hh"
 #include "types/user.hh"
 #include "concrete_types.hh"
+#include "as_json_function.hh"
+
+namespace std {
+std::ostream& operator<<(std::ostream& os, const std::vector<data_type>& arg_types) {
+    for (size_t i = 0; i < arg_types.size(); ++i) {
+        if (i > 0) {
+            os << ", ";
+        }
+        os << arg_types[i]->as_cql3_type().to_string();
+    }
+    return os;
+}
+}

 namespace cql3 {
 namespace functions {

+static logging::logger log("cql3_fuctions");
+
+bool abstract_function::requires_thread() const { return false; }
+
+bool as_json_function::requires_thread() const { return false; }
+
 thread_local std::unordered_multimap<function_name, shared_ptr<function>> functions::_declared = init();

+void functions::clear_functions() {
+    functions::_declared = init();
+}
+
 std::unordered_multimap<function_name, shared_ptr<function>>
 functions::init() {
    std::unordered_multimap<function_name, shared_ptr<function>> ret;
@@ -78,90 +102,10 @@ functions::init() {
        declare(make_to_blob_function(type.get_type()));
        declare(make_from_blob_function(type.get_type()));
    }
-    declare(aggregate_fcts::make_count_function<int8_t>());
-    declare(aggregate_fcts::make_max_function<int8_t>());
-    declare(aggregate_fcts::make_min_function<int8_t>());
-
-    declare(aggregate_fcts::make_count_function<int16_t>());
-    declare(aggregate_fcts::make_max_function<int16_t>());
-    declare(aggregate_fcts::make_min_function<int16_t>());
-
-    declare(aggregate_fcts::make_count_function<int32_t>());
-    declare(aggregate_fcts::make_max_function<int32_t>());
-    declare(aggregate_fcts::make_min_function<int32_t>());
-
-    declare(aggregate_fcts::make_count_function<int64_t>());
-    declare(aggregate_fcts::make_max_function<int64_t>());
-    declare(aggregate_fcts::make_min_function<int64_t>());
-
-    declare(aggregate_fcts::make_count_function<boost::multiprecision::cpp_int>());
-    declare(aggregate_fcts::make_max_function<boost::multiprecision::cpp_int>());
-    declare(aggregate_fcts::make_min_function<boost::multiprecision::cpp_int>());
-
-    declare(aggregate_fcts::make_count_function<big_decimal>());
-    declare(aggregate_fcts::make_max_function<big_decimal>());
-    declare(aggregate_fcts::make_min_function<big_decimal>());
-
-    declare(aggregate_fcts::make_count_function<float>());
-    declare(aggregate_fcts::make_max_function<float>());
-    declare(aggregate_fcts::make_min_function<float>());
-
-    declare(aggregate_fcts::make_count_function<double>());
-    declare(aggregate_fcts::make_max_function<double>());
-    declare(aggregate_fcts::make_min_function<double>());
-
-    declare(aggregate_fcts::make_count_function<sstring>());
-    declare(aggregate_fcts::make_max_function<sstring>());
-    declare(aggregate_fcts::make_min_function<sstring>());
-
-    declare(aggregate_fcts::make_count_function<ascii_native_type>());
-    declare(aggregate_fcts::make_max_function<ascii_native_type>());
-    declare(aggregate_fcts::make_min_function<ascii_native_type>());
-
-    declare(aggregate_fcts::make_count_function<simple_date_native_type>());
-    declare(aggregate_fcts::make_max_function<simple_date_native_type>());
-    declare(aggregate_fcts::make_min_function<simple_date_native_type>());
-
-    declare(aggregate_fcts::make_count_function<db_clock::time_point>());
-    declare(aggregate_fcts::make_max_function<db_clock::time_point>());
-    declare(aggregate_fcts::make_min_function<db_clock::time_point>());
-
-    declare(aggregate_fcts::make_count_function<timeuuid_native_type>());
-    declare(aggregate_fcts::make_max_function<timeuuid_native_type>());
-    declare(aggregate_fcts::make_min_function<timeuuid_native_type>());
-
-    declare(aggregate_fcts::make_count_function<utils::UUID>());
-    declare(aggregate_fcts::make_max_function<utils::UUID>());
-    declare(aggregate_fcts::make_min_function<utils::UUID>());
-
-    declare(aggregate_fcts::make_count_function<bytes>());
-    declare(aggregate_fcts::make_max_function<bytes>());
-    declare(aggregate_fcts::make_min_function<bytes>());
-
-    declare(aggregate_fcts::make_count_function<bool>());
-    declare(aggregate_fcts::make_max_function<bool>());
-    declare(aggregate_fcts::make_min_function<bool>());
-
-    // FIXME: more count/min/max

    declare(make_varchar_as_blob_fct());
    declare(make_blob_as_varchar_fct());
-    declare(aggregate_fcts::make_sum_function<int8_t>());
-    declare(aggregate_fcts::make_sum_function<int16_t>());
-    declare(aggregate_fcts::make_sum_function<int32_t>());
-    declare(aggregate_fcts::make_sum_function<int64_t>());
-    declare(aggregate_fcts::make_sum_function<float>());
-    declare(aggregate_fcts::make_sum_function<double>());
-    declare(aggregate_fcts::make_sum_function<boost::multiprecision::cpp_int>());
-    declare(aggregate_fcts::make_sum_function<big_decimal>());
-    declare(aggregate_fcts::make_avg_function<int8_t>());
-    declare(aggregate_fcts::make_avg_function<int16_t>());
-    declare(aggregate_fcts::make_avg_function<int32_t>());
-    declare(aggregate_fcts::make_avg_function<int64_t>());
-    declare(aggregate_fcts::make_avg_function<float>());
-    declare(aggregate_fcts::make_avg_function<double>());
-    declare(aggregate_fcts::make_avg_function<boost::multiprecision::cpp_int>());
-    declare(aggregate_fcts::make_avg_function<big_decimal>());
+    add_agg_functions(ret);

    // also needed for smp:
 #if 0
@@ -170,6 +114,33 @@ functions::init() {
    return ret;
 }

+void functions::add_function(shared_ptr<function> func) {
+    if (find(func->name(), func->arg_types())) {
+        throw std::logic_error(format("duplicated function {}", func));
+    }
+    _declared.emplace(func->name(), func);
+}
+
+template <typename F>
+void functions::with_udf_iter(const function_name& name, const std::vector<data_type>& arg_types, F&& f) {
+    auto i = find_iter(name, arg_types);
+    if (i == _declared.end() || i->second->is_native()) {
+        log.error("attempted to remove or alter non existent user defined function {}({})", name, arg_types);
+        return;
+    }
+    f(i);
+}
+
+void functions::replace_function(shared_ptr<function> func) {
+    with_udf_iter(func->name(), func->arg_types(), [func] (functions::declared_t::iterator i) {
+        i->second = std::move(func);
+    });
+}
+
+void functions::remove_function(const function_name& name, const std::vector<data_type>& arg_types) {
+    with_udf_iter(name, arg_types, [] (functions::declared_t::iterator i) { _declared.erase(i); });
+}
+
 shared_ptr<column_specification>
 functions::make_arg_spec(const sstring& receiver_ks, const sstring& receiver_cf,
        const function& fun, size_t i) {
@@ -191,7 +162,7 @@ shared_ptr<function>
 make_to_json_function(data_type t) {
    return make_native_scalar_function<true>("tojson", utf8_type, {t},
            [t](cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
-        return utf8_type->decompose(t->to_json_string(parameters[0]));
+        return utf8_type->decompose(to_json_string(*t, parameters[0]));
    });
 }

@@ -203,7 +174,7 @@ make_from_json_function(database& db, const sstring& keyspace, data_type t) {
        Json::Value json_value = json::to_json_value(utf8_type->to_string(parameters[0].value()));
        bytes_opt parsed_json_value;
        if (!json_value.isNull()) {
-            parsed_json_value.emplace(t->from_json_object(json_value, sf));
+            parsed_json_value.emplace(from_json_object(*t, json_value, sf));
        }
        return parsed_json_value;
    });
@@ -221,6 +192,8 @@ functions::get(database& db,
    static const function_name TOKEN_FUNCTION_NAME = function_name::native_function("token");
    static const function_name TO_JSON_FUNCTION_NAME = function_name::native_function("tojson");
    static const function_name FROM_JSON_FUNCTION_NAME = function_name::native_function("fromjson");
+    static const function_name MIN_FUNCTION_NAME = function_name::native_function("min");
+    static const function_name MAX_FUNCTION_NAME = function_name::native_function("max");

    if (name.has_keyspace()
                ? name == TOKEN_FUNCTION_NAME
@@ -253,6 +226,40 @@ functions::get(database& db,
        return make_from_json_function(db, keyspace, receiver->type);
    }

+    if (name.has_keyspace()
+                ? name == MIN_FUNCTION_NAME
+                : name.name == MIN_FUNCTION_NAME.name) {
+        if (provided_args.size() != 1) {
+            throw exceptions::invalid_request_exception("min() operates on 1 argument at a time");
+        }
+        selection::selector *sp = dynamic_cast<selection::selector*>(provided_args[0].get());
+        if (!sp) {
+            throw exceptions::invalid_request_exception("min() is only valid in SELECT clause");
+        }
+        const data_type arg_type = sp->get_type();
+        if (arg_type->is_collection() || arg_type->is_tuple() || arg_type->is_user_type()) {
+            // `min()' function is created on demand for arguments of compound types.
+            return aggregate_fcts::make_min_dynamic_function(arg_type);
+        }
+    }
+
+    if (name.has_keyspace()
+                ? name == MAX_FUNCTION_NAME
+                : name.name == MAX_FUNCTION_NAME.name) {
+        if (provided_args.size() != 1) {
+            throw exceptions::invalid_request_exception("max() operates on 1 argument at a time");
+        }
+        selection::selector *sp = dynamic_cast<selection::selector*>(provided_args[0].get());
+        if (!sp) {
+            throw exceptions::invalid_request_exception("max() is only valid in SELECT clause");
+        }
+        const data_type arg_type = sp->get_type();
+        if (arg_type->is_collection() || arg_type->is_tuple() || arg_type->is_user_type()) {
+            // `max()' function is created on demand for arguments of compound types.
+            return aggregate_fcts::make_max_dynamic_function(arg_type);
+        }
+    }
+
    std::vector<shared_ptr<function>> candidates;
    auto&& add_declared = [&] (function_name fn) {
        auto&& fns = _declared.equal_range(fn);
@@ -310,23 +317,30 @@ functions::get(database& db,
    return std::move(compatibles[0]);
 }

-std::vector<shared_ptr<function>>
+boost::iterator_range<functions::declared_t::iterator>
 functions::find(const function_name& name) {
-    auto range = _declared.equal_range(name);
-    std::vector<shared_ptr<function>> ret;
-    for (auto i = range.first; i != range.second; ++i) {
-        ret.push_back(i->second);
+    assert(name.has_keyspace()); // : "function name not fully qualified";
+    auto pair = _declared.equal_range(name);
+    return boost::make_iterator_range(pair.first, pair.second);
+}
+
+functions::declared_t::iterator
+functions::find_iter(const function_name& name, const std::vector<data_type>& arg_types) {
+    auto range = find(name);
+    auto i = std::find_if(range.begin(), range.end(), [&] (const std::pair<const function_name, shared_ptr<function>>& d) {
+        return type_equals(d.second->arg_types(), arg_types);
+    });
+    if (i == range.end()) {
+        return _declared.end();
    }
-    return ret;
+    return i;
 }

 shared_ptr<function>
 functions::find(const function_name& name, const std::vector<data_type>& arg_types) {
-    assert(name.has_keyspace()); // : "function name not fully qualified";
-    for (auto&& f : find(name)) {
-        if (type_equals(f->arg_types(), arg_types)) {
-            return f;
-        }
+    auto i = find_iter(name, arg_types);
+    if (i != _declared.end()) {
+        return i->second;
    }
    return {};
 }
@@ -396,15 +410,7 @@ functions::match_arguments(database& db, const sstring& keyspace,

 bool
 functions::type_equals(const std::vector<data_type>& t1, const std::vector<data_type>& t2) {
-#if 0
-    if (t1.size() != t2.size())
-        return false;
-    for (int i = 0; i < t1.size(); i ++)
-        if (!typeEquals(t1.get(i), t2.get(i)))
-            return false;
-    return true;
-#endif
-    abort();
+    return t1 == t2;
 }

 bool
@@ -413,7 +419,7 @@ function_call::uses_function(const sstring& ks_name, const sstring& function_nam
 }

 void
-function_call::collect_marker_specification(shared_ptr<variable_specifications> bound_names) {
+function_call::collect_marker_specification(lw_shared_ptr<variable_specifications> bound_names) {
    for (auto&& t : _terms) {
        t->collect_marker_specification(bound_names);
    }
--- a/cql3/functions/functions.hh
+++ b/cql3/functions/functions.hh
@@ -58,9 +58,12 @@
 namespace cql3 {

 namespace functions {
+    using declared_t = std::unordered_multimap<function_name, shared_ptr<function>>;
+    void add_agg_functions(declared_t& funcs);

 class functions {
-    static thread_local std::unordered_multimap<function_name, shared_ptr<function>> _declared;
+    using declared_t = cql3::functions::declared_t;
+    static thread_local declared_t _declared;
 private:
    static std::unordered_multimap<function_name, shared_ptr<function>> init();
 public:
@@ -86,9 +89,17 @@ public:
        const std::vector<shared_ptr<assignment_testable>> args(std::begin(provided_args), std::end(provided_args));
        return get(db, keyspace, name, args, receiver_ks, receiver_cf, receiver);
    }
-    static std::vector<shared_ptr<function>> find(const function_name& name);
+    static boost::iterator_range<declared_t::iterator> find(const function_name& name);
+    static declared_t::iterator find_iter(const function_name& name, const std::vector<data_type>& arg_types);
    static shared_ptr<function> find(const function_name& name, const std::vector<data_type>& arg_types);
+    static void clear_functions();
+    static void add_function(shared_ptr<function>);
+    static void replace_function(shared_ptr<function>);
+    static void remove_function(const function_name& name, const std::vector<data_type>& arg_types);
 private:
+    template <typename F>
+    static void with_udf_iter(const function_name& name, const std::vector<data_type>& arg_types, F&& f);
+
    // This method and matchArguments are somewhat duplicate, but this method allows us to provide more precise errors in the common
    // case where there is no override for a given function. This is thus probably worth the minor code duplication.
    static void validate_types(database& db,
@@ -102,50 +113,6 @@ private:
            const std::vector<shared_ptr<assignment_testable>>& provided_args,
            const sstring& receiver_ks,
            const sstring& receiver_cf);
-#if 0
-    // This is *not* thread safe but is only called in SchemaTables that is synchronized.
-    public static void addFunction(AbstractFunction fun)
-    {
-        // We shouldn't get there unless that function don't exist
-        assert find(fun.name(), fun.argTypes()) == null;
-        declare(fun);
-    }
-
-    // Same remarks than for addFunction
-    public static void removeFunction(FunctionName name, List<AbstractType<?>> argsTypes)
-    {
-        Function old = find(name, argsTypes);
-        assert old != null && !old.isNative();
-        declared.remove(old.name(), old);
-    }
-
-    // Same remarks than for addFunction
-    public static void replaceFunction(AbstractFunction fun)
-    {
-        removeFunction(fun.name(), fun.argTypes());
-        addFunction(fun);
-    }
-
-    public static List<Function> getReferencesTo(Function old)
-    {
-        List<Function> references = new ArrayList<>();
-        for (Function function : declared.values())
-            if (function.hasReferenceTo(old))
-                references.add(function);
-        return references;
-    }
-
-    public static Collection<Function> all()
-    {
-        return declared.values();
-    }
-
-    public static boolean typeEquals(AbstractType<?> t1, AbstractType<?> t2)
-    {
-        return t1.asCQL3Type().toString().equals(t2.asCQL3Type().toString());
-    }
-
-#endif

    static bool type_equals(const std::vector<data_type>& t1, const std::vector<data_type>& t2);

--- a/cql3/functions/native_aggregate_function.hh
+++ b/cql3/functions/native_aggregate_function.hh
@@ -59,7 +59,7 @@ protected:
    }

 public:
-    virtual bool is_aggregate() override final {
+    virtual bool is_aggregate() const override final {
        return true;
    }
 };
--- a/cql3/functions/native_function.hh
+++ b/cql3/functions/native_function.hh
@@ -58,11 +58,11 @@ protected:

 public:
    // Most of our functions are pure, the other ones should override this
-    virtual bool is_pure() override {
+    virtual bool is_pure() const override {
        return true;
    }

-    virtual bool is_native() override {
+    virtual bool is_native() const override {
        return true;
    }
 };
--- a/cql3/functions/native_scalar_function.hh
+++ b/cql3/functions/native_scalar_function.hh
@@ -58,7 +58,7 @@ protected:
    }

 public:
-    virtual bool is_aggregate() override {
+    virtual bool is_aggregate() const override {
        return false;
    }
 };
@@ -74,7 +74,7 @@ public:
            : native_scalar_function(std::move(name), std::move(return_type), std::move(arg_types))
            , _func(std::forward<Func>(func)) {
    }
-    virtual bool is_pure() override {
+    virtual bool is_pure() const override {
        return Pure;
    }
    virtual bytes_opt execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) override {
--- a/cql3/functions/time_uuid_fcts.hh
+++ b/cql3/functions/time_uuid_fcts.hh
@@ -41,6 +41,7 @@

 #pragma once

+#include "castas_fcts.hh"
 #include "native_scalar_function.hh"
 #include "utils/UUID_gen.hh"
 #include <boost/uuid/uuid.hpp>
@@ -61,16 +62,6 @@ make_now_fct() {
    });
 }

-static int64_t get_valid_timestamp(const data_value& ts_obj) {
-    auto ts = value_cast<db_clock::time_point>(ts_obj);
-    int64_t ms = ts.time_since_epoch().count();
-    auto nanos_since = utils::UUID_gen::make_nanos_since(ms);
-    if (!utils::UUID_gen::is_valid_nanos_since(nanos_since)) {
-        throw exceptions::server_exception(format("{}: timestamp is out of range. Must be in milliseconds since epoch", ms));
-    }
-    return ms;
-}
-
 inline
 shared_ptr<function>
 make_min_timeuuid_fct() {
@@ -84,7 +75,8 @@ make_min_timeuuid_fct() {
        if (ts_obj.is_null()) {
            return {};
        }
-        auto uuid = utils::UUID_gen::min_time_UUID(get_valid_timestamp(ts_obj));
+        auto ts = value_cast<db_clock::time_point>(ts_obj);
+        auto uuid = utils::UUID_gen::min_time_UUID(ts.time_since_epoch().count());
        return {timeuuid_type->decompose(uuid)};
    });
 }
@@ -94,6 +86,7 @@ shared_ptr<function>
 make_max_timeuuid_fct() {
    return make_native_scalar_function<true>("maxtimeuuid", timeuuid_type, { timestamp_type },
            [] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
+        // FIXME: should values be a vector<optional<bytes>>?
        auto& bb = values[0];
        if (!bb) {
            return {};
@@ -102,22 +95,12 @@ make_max_timeuuid_fct() {
        if (ts_obj.is_null()) {
            return {};
        }
-        auto uuid = utils::UUID_gen::max_time_UUID(get_valid_timestamp(ts_obj));
+        auto ts = value_cast<db_clock::time_point>(ts_obj);
+        auto uuid = utils::UUID_gen::max_time_UUID(ts.time_since_epoch().count());
        return {timeuuid_type->decompose(uuid)};
    });
 }

-inline utils::UUID get_valid_timeuuid(bytes raw) {
-    if (!utils::UUID_gen::is_valid_UUID(raw)) {
-        throw exceptions::server_exception(format("invalid timeuuid: size={}", raw.size()));
-    }
-    auto uuid = utils::UUID_gen::get_UUID(raw);
-    if (!uuid.is_timestamp()) {
-        throw exceptions::server_exception(format("{}: Not a timeuuid: version={}", uuid, uuid.version()));
-    }
-    return uuid;
-}
-
 inline
 shared_ptr<function>
 make_date_of_fct() {
@@ -128,7 +111,7 @@ make_date_of_fct() {
        if (!bb) {
            return {};
        }
-        auto ts = db_clock::time_point(db_clock::duration(UUID_gen::unix_timestamp(get_valid_timeuuid(*bb))));
+        auto ts = db_clock::time_point(db_clock::duration(UUID_gen::unix_timestamp(UUID_gen::get_UUID(*bb))));
        return {timestamp_type->decompose(ts)};
    });
 }
@@ -143,7 +126,7 @@ make_unix_timestamp_of_fct() {
        if (!bb) {
            return {};
        }
-        return {long_type->decompose(UUID_gen::unix_timestamp(get_valid_timeuuid(*bb)))};
+        return {long_type->decompose(UUID_gen::unix_timestamp(UUID_gen::get_UUID(*bb)))};
    });
 }

@@ -194,7 +177,7 @@ make_timeuuidtodate_fct() {
        if (!bb) {
            return {};
        }
-        auto ts = db_clock::time_point(db_clock::duration(UUID_gen::unix_timestamp(get_valid_timeuuid(*bb))));
+        auto ts = db_clock::time_point(db_clock::duration(UUID_gen::unix_timestamp(UUID_gen::get_UUID(*bb))));
        auto to_simple_date = get_castas_fctn(simple_date_type, timestamp_type);
        return {simple_date_type->decompose(to_simple_date(ts))};
    });
@@ -229,7 +212,7 @@ make_timeuuidtotimestamp_fct() {
        if (!bb) {
            return {};
        }
-        auto ts = db_clock::time_point(db_clock::duration(UUID_gen::unix_timestamp(get_valid_timeuuid(*bb))));
+        auto ts = db_clock::time_point(db_clock::duration(UUID_gen::unix_timestamp(UUID_gen::get_UUID(*bb))));
        return {timestamp_type->decompose(ts)};
    });
 }
@@ -263,12 +246,14 @@ make_timeuuidtounixtimestamp_fct() {
        if (!bb) {
            return {};
        }
-        return {long_type->decompose(UUID_gen::unix_timestamp(get_valid_timeuuid(*bb)))};
+        return {long_type->decompose(UUID_gen::unix_timestamp(UUID_gen::get_UUID(*bb)))};
    });
 }

 inline bytes time_point_to_long(const data_value& v) {
-    return data_value(get_valid_timestamp(v)).serialize();
+    auto since_epoch = value_cast<db_clock::time_point>(v).time_since_epoch();
+    int64_t ms = std::chrono::duration_cast<std::chrono::milliseconds>(since_epoch).count();
+    return serialized(ms);
 }

 inline
--- a/cql3/functions/user_function.cc
+++ b/cql3/functions/user_function.cc
@@ -0,0 +1,63 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "user_function.hh"
+#include "lua.hh"
+
+namespace cql3 {
+namespace functions {
+user_function::user_function(function_name name, std::vector<data_type> arg_types, std::vector<sstring> arg_names,
+        sstring body, sstring language, data_type return_type, bool called_on_null_input, sstring bitcode,
+        lua::runtime_config cfg)
+    : abstract_function(std::move(name), std::move(arg_types), std::move(return_type)),
+      _arg_names(std::move(arg_names)), _body(std::move(body)), _language(std::move(language)),
+      _called_on_null_input(called_on_null_input), _bitcode(std::move(bitcode)),
+      _cfg(std::move(cfg)) {}
+
+bool user_function::is_pure() const { return true; }
+
+bool user_function::is_native() const { return false; }
+
+bool user_function::is_aggregate() const { return false; }
+
+bool user_function::requires_thread() const { return true; }
+
+bytes_opt user_function::execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) {
+    const auto& types = arg_types();
+    if (parameters.size() != types.size()) {
+        throw std::logic_error("Wrong number of parameters");
+    }
+
+    std::vector<data_value> values;
+    values.reserve(parameters.size());
+    for (int i = 0, n = types.size(); i != n; ++i) {
+        const data_type& type = types[i];
+        const bytes_opt& bytes = parameters[i];
+        if (!bytes && !_called_on_null_input) {
+            return std::nullopt;
+        }
+        values.push_back(bytes ? type->deserialize(*bytes) : data_value::make_null(type));
+    }
+
+    return lua::run_script(lua::bitcode_view{_bitcode}, values, return_type(), _cfg).get0();
+}
+}
+}
--- a/cql3/functions/user_function.hh
+++ b/cql3/functions/user_function.hh
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "abstract_function.hh"
+#include "scalar_function.hh"
+#include "lua.hh"
+
+namespace cql3 {
+namespace functions {
+
+class user_function final : public abstract_function, public scalar_function {
+    std::vector<sstring> _arg_names;
+    sstring _body;
+    sstring _language;
+    bool _called_on_null_input;
+    sstring _bitcode;
+
+    // FIXME: We should not need a copy in each function. It is here
+    // because user_function::execute is only passed the
+    // cql_serialization_format and the runtime arguments.  We could
+    // avoid it by having a runtime->execute(user_function) instead,
+    // but that is a large refactoring. We could also store a
+    // lua_runtime in a thread_local variable, but that is one extra
+    // global.
+    lua::runtime_config _cfg;
+
+public:
+    user_function(function_name name, std::vector<data_type> arg_types, std::vector<sstring> arg_names, sstring body,
+            sstring language, data_type return_type, bool called_on_null_input, sstring bitcode,
+            lua::runtime_config cfg);
+
+    const std::vector<sstring>& arg_names() const { return _arg_names; }
+
+    const sstring& body() const { return _body; }
+
+    const sstring& language() const { return _language; }
+
+    bool called_on_null_input() const { return _called_on_null_input; }
+
+    virtual bool is_pure() const override;
+    virtual bool is_native() const override;
+    virtual bool is_aggregate() const override;
+    virtual bool requires_thread() const override;
+    virtual bytes_opt execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) override;
+};
+
+}
+}
--- a/cql3/lists.cc
+++ b/cql3/lists.cc
@@ -202,7 +202,7 @@ lists::delayed_value::contains_bind_marker() const {
 }

 void
-lists::delayed_value::collect_marker_specification(shared_ptr<variable_specifications> bound_names) {
+lists::delayed_value::collect_marker_specification(lw_shared_ptr<variable_specifications> bound_names) {
 }

 shared_ptr<terminal>
@@ -244,7 +244,7 @@ lists::marker::bind(const query_options& options) {
    }
 }

-constexpr const db_clock::time_point lists::precision_time::REFERENCE_TIME;
+constexpr db_clock::time_point lists::precision_time::REFERENCE_TIME;
 thread_local lists::precision_time lists::precision_time::_last = {db_clock::time_point::max(), 0};

 lists::precision_time
@@ -280,12 +280,12 @@ lists::setter::execute(mutation& m, const clustering_key_prefix& prefix, const u
 }

 bool
-lists::setter_by_index::requires_read() {
+lists::setter_by_index::requires_read() const {
    return true;
 }

 void
-lists::setter_by_index::collect_marker_specification(shared_ptr<variable_specifications> bound_names) {
+lists::setter_by_index::collect_marker_specification(lw_shared_ptr<variable_specifications> bound_names) {
    operation::collect_marker_specification(bound_names);
    _idx->collect_marker_specification(std::move(bound_names));
 }
@@ -337,7 +337,7 @@ lists::setter_by_index::execute(mutation& m, const clustering_key_prefix& prefix
 }

 bool
-lists::setter_by_uuid::requires_read() {
+lists::setter_by_uuid::requires_read() const {
    return false;
 }

@@ -437,7 +437,7 @@ lists::prepender::execute(mutation& m, const clustering_key_prefix& prefix, cons
 }

 bool
-lists::discarder::requires_read() {
+lists::discarder::requires_read() const {
    return true;
 }

@@ -490,7 +490,7 @@ lists::discarder::execute(mutation& m, const clustering_key_prefix& prefix, cons
 }

 bool
-lists::discarder_by_index::requires_read() {
+lists::discarder_by_index::requires_read() const {
    return true;
 }

--- a/cql3/lists.hh
+++ b/cql3/lists.hh
@@ -104,7 +104,7 @@ public:
                : _elements(std::move(elements)) {
        }
        virtual bool contains_bind_marker() const override;
-        virtual void collect_marker_specification(shared_ptr<variable_specifications> bound_names);
+        virtual void collect_marker_specification(lw_shared_ptr<variable_specifications> bound_names);
        virtual shared_ptr<terminal> bind(const query_options& options) override;
    };

@@ -158,8 +158,8 @@ public:
        setter_by_index(const column_definition& column, shared_ptr<term> idx, shared_ptr<term> t)
            : operation(column, std::move(t)), _idx(std::move(idx)) {
        }
-        virtual bool requires_read() override;
-        virtual void collect_marker_specification(shared_ptr<variable_specifications> bound_names);
+        virtual bool requires_read() const override;
+        virtual void collect_marker_specification(lw_shared_ptr<variable_specifications> bound_names);
        virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
    };

@@ -168,7 +168,7 @@ public:
        setter_by_uuid(const column_definition& column, shared_ptr<term> idx, shared_ptr<term> t)
            : setter_by_index(column, std::move(idx), std::move(t)) {
        }
-        virtual bool requires_read() override;
+        virtual bool requires_read() const override;
        virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
    };

@@ -195,7 +195,7 @@ public:
        discarder(const column_definition& column, shared_ptr<term> t)
                : operation(column, std::move(t)) {
        }
-        virtual bool requires_read() override;
+        virtual bool requires_read() const override;
        virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
    };

@@ -204,7 +204,7 @@ public:
        discarder_by_index(const column_definition& column, shared_ptr<term> idx)
                : operation(column, std::move(idx)) {
        }
-        virtual bool requires_read() override;
+        virtual bool requires_read() const override;
        virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params);
    };
 };
--- a/cql3/maps.cc
+++ b/cql3/maps.cc
@@ -218,7 +218,7 @@ maps::delayed_value::contains_bind_marker() const {
 }

 void
-maps::delayed_value::collect_marker_specification(shared_ptr<variable_specifications> bound_names) {
+maps::delayed_value::collect_marker_specification(lw_shared_ptr<variable_specifications> bound_names) {
 }

 shared_ptr<terminal>
@@ -293,7 +293,7 @@ maps::setter::execute(mutation& m, const clustering_key_prefix& row_key, const u
 }

 void
-maps::setter_by_key::collect_marker_specification(shared_ptr<variable_specifications> bound_names) {
+maps::setter_by_key::collect_marker_specification(lw_shared_ptr<variable_specifications> bound_names) {
    operation::collect_marker_specification(bound_names);
    _k->collect_marker_specification(bound_names);
 }
--- a/cql3/maps.hh
+++ b/cql3/maps.hh
@@ -98,7 +98,7 @@ public:
                : _comparator(std::move(comparator)), _elements(std::move(elements)) {
        }
        virtual bool contains_bind_marker() const override;
-        virtual void collect_marker_specification(shared_ptr<variable_specifications> bound_names) override;
+        virtual void collect_marker_specification(lw_shared_ptr<variable_specifications> bound_names) override;
        shared_ptr<terminal> bind(const query_options& options);
    };

@@ -126,7 +126,7 @@ public:
        setter_by_key(const column_definition& column, shared_ptr<term> k, shared_ptr<term> t)
            : operation(column, std::move(t)), _k(std::move(k)) {
        }
-        virtual void collect_marker_specification(shared_ptr<variable_specifications> bound_names) override;
+        virtual void collect_marker_specification(lw_shared_ptr<variable_specifications> bound_names) override;
        virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
    };

--- a/cql3/multi_column_relation.hh
+++ b/cql3/multi_column_relation.hh
@@ -138,7 +138,7 @@ public:

 protected:
    virtual shared_ptr<restrictions::restriction> new_EQ_restriction(database& db, schema_ptr schema,
-                                                                     shared_ptr<variable_specifications> bound_names) override {
+                                                                     lw_shared_ptr<variable_specifications> bound_names) override {
        auto rs = receivers(db, schema);
        std::vector<::shared_ptr<column_specification>> col_specs(rs.size());
        std::transform(rs.begin(), rs.end(), col_specs.begin(), [] (auto cs) {
@@ -149,7 +149,7 @@ protected:
    }

    virtual shared_ptr<restrictions::restriction> new_IN_restriction(database& db, schema_ptr schema,
-                                                                     shared_ptr<variable_specifications> bound_names) override {
+                                                                     lw_shared_ptr<variable_specifications> bound_names) override {
        auto rs = receivers(db, schema);
        std::vector<::shared_ptr<column_specification>> col_specs(rs.size());
        std::transform(rs.begin(), rs.end(), col_specs.begin(), [] (auto cs) {
@@ -172,7 +172,7 @@ protected:
    }

    virtual shared_ptr<restrictions::restriction> new_slice_restriction(database& db, schema_ptr schema,
-                                                                        shared_ptr<variable_specifications> bound_names,
+                                                                        lw_shared_ptr<variable_specifications> bound_names,
                                                                        statements::bound bound, bool inclusive) override {
        auto rs = receivers(db, schema);
        std::vector<::shared_ptr<column_specification>> col_specs(rs.size());
@@ -184,12 +184,12 @@ protected:
    }

    virtual shared_ptr<restrictions::restriction> new_contains_restriction(database& db, schema_ptr schema,
-                                                                           shared_ptr<variable_specifications> bound_names, bool is_key) override {
+                                                                           lw_shared_ptr<variable_specifications> bound_names, bool is_key) override {
        throw exceptions::invalid_request_exception(format("{} cannot be used for Multi-column relations", get_operator()));
    }

    virtual ::shared_ptr<restrictions::restriction> new_LIKE_restriction(
-            database& db, schema_ptr schema, ::shared_ptr<variable_specifications> bound_names) override {
+            database& db, schema_ptr schema, lw_shared_ptr<variable_specifications> bound_names) override {
        throw exceptions::invalid_request_exception("LIKE cannot be used for Multi-column relations");
    }

@@ -202,7 +202,7 @@ protected:

    virtual shared_ptr<term> to_term(const std::vector<shared_ptr<column_specification>>& receivers,
                                     ::shared_ptr<term::raw> raw, database& db, const sstring& keyspace,
-                                     ::shared_ptr<variable_specifications> bound_names) override {
+                                     lw_shared_ptr<variable_specifications> bound_names) override {
        auto as_multi_column_raw = dynamic_pointer_cast<term::multi_column_raw>(raw);
        auto t = as_multi_column_raw->prepare(db, keyspace, receivers);
        t->collect_marker_specification(bound_names);
--- a/cql3/operation.hh
+++ b/cql3/operation.hh
@@ -115,7 +115,7 @@ public:
    * @return whether the operation requires a read of the previous value to be executed
    * (only lists setterByIdx, discard and discardByIdx requires that).
    */
-    virtual bool requires_read() {
+    virtual bool requires_read() const {
        return false;
    }

@@ -125,7 +125,7 @@ public:
     * @param bound_names the list of column specification where to collect the
     * bind variables of this term in.
     */
-    virtual void collect_marker_specification(::shared_ptr<variable_specifications> bound_names) {
+    virtual void collect_marker_specification(lw_shared_ptr<variable_specifications> bound_names) {
        if (_t) {
            _t->collect_marker_specification(bound_names);
        }
--- a/cql3/query_options.hh
+++ b/cql3/query_options.hh
@@ -237,6 +237,10 @@ public:
        return _names;
    }

+    const std::vector<cql3::raw_value_view>& get_values() const noexcept {
+        return _value_views;
+    }
+
    const cql_config& get_cql_config() const {
        return _cql_config;
    }
--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -85,10 +85,11 @@ public:
    }
 };

-query_processor::query_processor(service::storage_proxy& proxy, database& db, query_processor::memory_config mcfg)
+query_processor::query_processor(service::storage_proxy& proxy, database& db, service::migration_notifier& mn, query_processor::memory_config mcfg)
        : _migration_subscriber{std::make_unique<migration_subscriber>(this)}
        , _proxy(proxy)
        , _db(db)
+        , _mnotifier(mn)
        , _internal_state(new internal_state())
        , _prepared_cache(prep_cache_log, mcfg.prepared_statment_cache_size)
        , _authorized_prepared_cache(std::min(std::chrono::milliseconds(_db.get_config().permissions_validity_in_ms()),
@@ -96,14 +97,22 @@ query_processor::query_processor(service::storage_proxy& proxy, database& db, qu
                                     std::chrono::milliseconds(_db.get_config().permissions_update_interval_in_ms()),
                                     mcfg.authorized_prepared_cache_size, authorized_prepared_statements_cache_log) {
    namespace sm = seastar::metrics;
+    namespace stm = statements;
    using clevel = db::consistency_level;
    sm::label cl_label("consistency_level");

+    sm::label who_label("who");  // Who queried system tables
+    const auto user_who_label_instance = who_label("user");
+    const auto internal_who_label_instance = who_label("internal");
+
+    sm::label ks_label("ks");
+    const auto system_ks_label_instance = ks_label("system");
+
    std::vector<sm::metric_definition> qp_group;
    qp_group.push_back(sm::make_derive(
        "statements_prepared",
        _stats.prepare_invocations,
-        sm::description("Counts a total number of parsed CQL requests.")));
+        sm::description("Counts the total number of parsed CQL requests.")));
    for (auto cl = size_t(clevel::MIN_VALUE); cl <= size_t(clevel::MAX_VALUE); ++cl) {
        qp_group.push_back(
            sm::make_derive(
@@ -123,97 +132,219 @@ query_processor::query_processor(service::storage_proxy& proxy, database& db, qu
            {
                    sm::make_derive(
                            "reads",
-                            _cql_stats.statements[size_t(statement_type::SELECT)],
-                            sm::description("Counts a total number of CQL read requests.")),
+                            sm::description("Counts the total number of CQL SELECT requests."),
+                            [this] {
+                                // Reads fall into `cond_selector::NO_CONDITIONS' pigeonhole
+                                return _cql_stats.query_cnt(source_selector::USER, ks_selector::SYSTEM, cond_selector::NO_CONDITIONS, stm::statement_type::SELECT)
+                                        + _cql_stats.query_cnt(source_selector::USER, ks_selector::NONSYSTEM, cond_selector::NO_CONDITIONS, stm::statement_type::SELECT)
+                                        + _cql_stats.query_cnt(source_selector::INTERNAL, ks_selector::SYSTEM, cond_selector::NO_CONDITIONS, stm::statement_type::SELECT)
+                                        + _cql_stats.query_cnt(source_selector::INTERNAL, ks_selector::NONSYSTEM, cond_selector::NO_CONDITIONS, stm::statement_type::SELECT);
+                            }),

                    sm::make_derive(
                            "inserts",
-                            _cql_stats.statements[size_t(statement_type::INSERT)],
-                            sm::description("Counts a total number of CQL INSERT requests without conditions."),
-                            {non_cas_label_instance}),
-
+                            sm::description("Counts the total number of CQL INSERT requests with/without conditions."),
+                            {non_cas_label_instance},
+                            [this] {
+                                return _cql_stats.query_cnt(source_selector::USER, ks_selector::SYSTEM, cond_selector::NO_CONDITIONS, stm::statement_type::INSERT)
+                                        + _cql_stats.query_cnt(source_selector::USER, ks_selector::NONSYSTEM, cond_selector::NO_CONDITIONS, stm::statement_type::INSERT)
+                                        + _cql_stats.query_cnt(source_selector::INTERNAL, ks_selector::SYSTEM, cond_selector::NO_CONDITIONS, stm::statement_type::INSERT)
+                                        + _cql_stats.query_cnt(source_selector::INTERNAL, ks_selector::NONSYSTEM, cond_selector::NO_CONDITIONS, stm::statement_type::INSERT);
+                            }),
                    sm::make_derive(
                            "inserts",
-                            _cql_stats.cas_statements[size_t(statement_type::INSERT)],
-                            sm::description("Counts a total number of CQL INSERT requests with conditions."),
-                            {cas_label_instance}),
+                            sm::description("Counts the total number of CQL INSERT requests with/without conditions."),
+                            {cas_label_instance},
+                            [this] {
+                                return _cql_stats.query_cnt(source_selector::USER, ks_selector::SYSTEM, cond_selector::WITH_CONDITIONS, stm::statement_type::INSERT)
+                                        + _cql_stats.query_cnt(source_selector::USER, ks_selector::NONSYSTEM, cond_selector::WITH_CONDITIONS, stm::statement_type::INSERT)
+                                        + _cql_stats.query_cnt(source_selector::INTERNAL, ks_selector::SYSTEM, cond_selector::WITH_CONDITIONS, stm::statement_type::INSERT)
+                                        + _cql_stats.query_cnt(source_selector::INTERNAL, ks_selector::NONSYSTEM, cond_selector::WITH_CONDITIONS, stm::statement_type::INSERT);
+                            }),

                    sm::make_derive(
                            "updates",
-                            _cql_stats.statements[size_t(statement_type::UPDATE)],
-                            sm::description("Counts a total number of CQL UPDATE requests without conditions."),
-                            {non_cas_label_instance}),
-
+                            sm::description("Counts the total number of CQL UPDATE requests with/without conditions."),
+                            {non_cas_label_instance},
+                            [this] {
+                                return _cql_stats.query_cnt(source_selector::USER, ks_selector::SYSTEM, cond_selector::NO_CONDITIONS, stm::statement_type::UPDATE)
+                                        + _cql_stats.query_cnt(source_selector::USER, ks_selector::NONSYSTEM, cond_selector::NO_CONDITIONS, stm::statement_type::UPDATE)
+                                        + _cql_stats.query_cnt(source_selector::INTERNAL, ks_selector::SYSTEM, cond_selector::NO_CONDITIONS, stm::statement_type::UPDATE)
+                                        + _cql_stats.query_cnt(source_selector::INTERNAL, ks_selector::NONSYSTEM, cond_selector::NO_CONDITIONS, stm::statement_type::UPDATE);
+                            }),
                    sm::make_derive(
                            "updates",
-                            _cql_stats.cas_statements[size_t(statement_type::UPDATE)],
-                            sm::description("Counts a total number of CQL UPDATE requests with conditions."),
-                            {cas_label_instance}),
+                            sm::description("Counts the total number of CQL UPDATE requests with/without conditions."),
+                            {cas_label_instance},
+                            [this] {
+                                return _cql_stats.query_cnt(source_selector::USER, ks_selector::SYSTEM, cond_selector::WITH_CONDITIONS, stm::statement_type::UPDATE)
+                                        + _cql_stats.query_cnt(source_selector::USER, ks_selector::NONSYSTEM, cond_selector::WITH_CONDITIONS, stm::statement_type::UPDATE)
+                                        + _cql_stats.query_cnt(source_selector::INTERNAL, ks_selector::SYSTEM, cond_selector::WITH_CONDITIONS, stm::statement_type::UPDATE)
+                                        + _cql_stats.query_cnt(source_selector::INTERNAL, ks_selector::NONSYSTEM, cond_selector::WITH_CONDITIONS, stm::statement_type::UPDATE);
+                            }),

                    sm::make_derive(
                            "deletes",
-                            _cql_stats.statements[size_t(statement_type::DELETE)],
-                            sm::description("Counts a total number of CQL DELETE requests without conditions."),
-                            {non_cas_label_instance}),
-
+                            sm::description("Counts the total number of CQL DELETE requests with/without conditions."),
+                            {non_cas_label_instance},
+                            [this] {
+                                return _cql_stats.query_cnt(source_selector::USER, ks_selector::SYSTEM, cond_selector::NO_CONDITIONS, stm::statement_type::DELETE)
+                                        + _cql_stats.query_cnt(source_selector::USER, ks_selector::NONSYSTEM, cond_selector::NO_CONDITIONS, stm::statement_type::DELETE)
+                                        + _cql_stats.query_cnt(source_selector::INTERNAL, ks_selector::SYSTEM, cond_selector::NO_CONDITIONS, stm::statement_type::DELETE)
+                                        + _cql_stats.query_cnt(source_selector::INTERNAL, ks_selector::NONSYSTEM, cond_selector::NO_CONDITIONS, stm::statement_type::DELETE);
+                            }),
                    sm::make_derive(
                            "deletes",
-                            _cql_stats.cas_statements[size_t(statement_type::DELETE)],
-                            sm::description("Counts a total number of CQL DELETE requests with conditions."),
-                            {cas_label_instance}),
+                            sm::description("Counts the total number of CQL DELETE requests with/without conditions."),
+                            {cas_label_instance},
+                            [this] {
+                                return _cql_stats.query_cnt(source_selector::USER, ks_selector::SYSTEM, cond_selector::WITH_CONDITIONS, stm::statement_type::DELETE)
+                                        + _cql_stats.query_cnt(source_selector::USER, ks_selector::NONSYSTEM, cond_selector::WITH_CONDITIONS, stm::statement_type::DELETE)
+                                        + _cql_stats.query_cnt(source_selector::INTERNAL, ks_selector::SYSTEM, cond_selector::WITH_CONDITIONS, stm::statement_type::DELETE)
+                                        + _cql_stats.query_cnt(source_selector::INTERNAL, ks_selector::NONSYSTEM, cond_selector::WITH_CONDITIONS, stm::statement_type::DELETE);
+                            }),
+
+                    sm::make_derive(
+                            "reads_per_ks",
+                            // Reads fall into `cond_selector::NO_CONDITIONS' pigeonhole
+                            _cql_stats.query_cnt(source_selector::USER, ks_selector::SYSTEM, cond_selector::NO_CONDITIONS, stm::statement_type::SELECT),
+                            sm::description("Counts the number of CQL SELECT requests executed on particular keyspaces. "
+                                            "Label `who' indicates where the reqs come from (clients or DB internals)"),
+                            {user_who_label_instance, system_ks_label_instance}),
+                    sm::make_derive(
+                            "reads_per_ks",
+                            _cql_stats.query_cnt(source_selector::INTERNAL, ks_selector::SYSTEM, cond_selector::NO_CONDITIONS, stm::statement_type::SELECT),
+                            sm::description("Counts the number of CQL SELECT requests executed on particular keyspaces. "
+                                            "Label `who' indicates where the reqs come from (clients or DB internals)"),
+                            {internal_who_label_instance, system_ks_label_instance}),
+
+                    sm::make_derive(
+                            "inserts_per_ks",
+                            _cql_stats.query_cnt(source_selector::USER, ks_selector::SYSTEM, cond_selector::NO_CONDITIONS, stm::statement_type::INSERT),
+                            sm::description("Counts the number of CQL INSERT requests executed on particular keyspaces. "
+                                            "Label `who' indicates where the reqs come from (clients or DB internals)."),
+                            {user_who_label_instance, system_ks_label_instance, non_cas_label_instance}),
+                    sm::make_derive(
+                            "inserts_per_ks",
+                            _cql_stats.query_cnt(source_selector::INTERNAL, ks_selector::SYSTEM, cond_selector::NO_CONDITIONS, stm::statement_type::INSERT),
+                            sm::description("Counts the number of CQL INSERT requests executed on particular keyspaces. "
+                                            "Label `who' indicates where the reqs come from (clients or DB internals)."),
+                            {internal_who_label_instance, system_ks_label_instance, non_cas_label_instance}),
+                    sm::make_derive(
+                            "inserts_per_ks",
+                            _cql_stats.query_cnt(source_selector::USER, ks_selector::SYSTEM, cond_selector::WITH_CONDITIONS, stm::statement_type::INSERT),
+                            sm::description("Counts the number of CQL INSERT requests executed on particular keyspaces. "
+                                            "Label `who' indicates where the reqs come from (clients or DB internals)."),
+                            {user_who_label_instance, system_ks_label_instance, cas_label_instance}),
+                    sm::make_derive(
+                            "inserts_per_ks",
+                            _cql_stats.query_cnt(source_selector::INTERNAL, ks_selector::SYSTEM, cond_selector::WITH_CONDITIONS, stm::statement_type::INSERT),
+                            sm::description("Counts the number of CQL INSERT requests executed on particular keyspaces. "
+                                            "Label `who' indicates where the reqs come from (clients or DB internals)."),
+                            {internal_who_label_instance, system_ks_label_instance, cas_label_instance}),
+
+                    sm::make_derive(
+                            "updates_per_ks",
+                            _cql_stats.query_cnt(source_selector::USER, ks_selector::SYSTEM, cond_selector::NO_CONDITIONS, stm::statement_type::UPDATE),
+                            sm::description("Counts the number of CQL UPDATE requests executed on particular keyspaces. "
+                                            "Label `who' indicates where the reqs come from (clients or DB internals)"),
+                            {user_who_label_instance, system_ks_label_instance, non_cas_label_instance}),
+                    sm::make_derive(
+                            "updates_per_ks",
+                            _cql_stats.query_cnt(source_selector::INTERNAL, ks_selector::SYSTEM, cond_selector::NO_CONDITIONS, stm::statement_type::UPDATE),
+                            sm::description("Counts the number of CQL UPDATE requests executed on particular keyspaces. "
+                                            "Label `who' indicates where the reqs come from (clients or DB internals)"),
+                            {internal_who_label_instance, system_ks_label_instance, non_cas_label_instance}),
+                    sm::make_derive(
+                            "updates_per_ks",
+                            _cql_stats.query_cnt(source_selector::USER, ks_selector::SYSTEM, cond_selector::WITH_CONDITIONS, stm::statement_type::UPDATE),
+                            sm::description("Counts the number of CQL UPDATE requests executed on particular keyspaces. "
+                                            "Label `who' indicates where the reqs come from (clients or DB internals)"),
+                            {user_who_label_instance, system_ks_label_instance, cas_label_instance}),
+                    sm::make_derive(
+                            "updates_per_ks",
+                            _cql_stats.query_cnt(source_selector::INTERNAL, ks_selector::SYSTEM, cond_selector::WITH_CONDITIONS, stm::statement_type::UPDATE),
+                            sm::description("Counts the number of CQL UPDATE requests executed on particular keyspaces. "
+                                            "Label `who' indicates where the reqs come from (clients or DB internals)"),
+                            {internal_who_label_instance, system_ks_label_instance, cas_label_instance}),
+
+                    sm::make_derive(
+                            "deletes_per_ks",
+                            _cql_stats.query_cnt(source_selector::USER, ks_selector::SYSTEM, cond_selector::NO_CONDITIONS, stm::statement_type::DELETE),
+                            sm::description("Counts the number of CQL DELETE requests executed on particular keyspaces. "
+                                            "Label `who' indicates where the reqs come from (clients or DB internals)"),
+                            {user_who_label_instance, system_ks_label_instance, non_cas_label_instance}),
+                    sm::make_derive(
+                            "deletes_per_ks",
+                            _cql_stats.query_cnt(source_selector::INTERNAL, ks_selector::SYSTEM, cond_selector::NO_CONDITIONS, stm::statement_type::DELETE),
+                            sm::description("Counts the number of CQL DELETE requests executed on particular keyspaces. "
+                                            "Label `who' indicates where the reqs come from (clients or DB internals)"),
+                            {internal_who_label_instance, system_ks_label_instance, non_cas_label_instance}),
+                    sm::make_derive(
+                            "deletes_per_ks",
+                            _cql_stats.query_cnt(source_selector::USER, ks_selector::SYSTEM, cond_selector::WITH_CONDITIONS, stm::statement_type::DELETE),
+                            sm::description("Counts the number of CQL DELETE requests executed on particular keyspaces. "
+                                            "Label `who' indicates where the reqs come from (clients or DB internals)"),
+                            {user_who_label_instance, system_ks_label_instance, cas_label_instance}),
+                    sm::make_derive(
+                            "deletes_per_ks",
+                            _cql_stats.query_cnt(source_selector::INTERNAL, ks_selector::SYSTEM, cond_selector::WITH_CONDITIONS, stm::statement_type::DELETE),
+                            sm::description("Counts the number of CQL DELETE requests executed on particular keyspaces. "
+                                            "Label `who' indicates where the reqs come from (clients or DB internals)"),
+                            {internal_who_label_instance, system_ks_label_instance, cas_label_instance}),

                    sm::make_derive(
                            "batches",
                            _cql_stats.batches,
-                            sm::description("Counts a total number of CQL BATCH requests without conditions."),
+                            sm::description("Counts the total number of CQL BATCH requests without conditions."),
                            {non_cas_label_instance}),

                    sm::make_derive(
                            "batches",
                            _cql_stats.cas_batches,
-                            sm::description("Counts a total number of CQL BATCH requests with conditions."),
+                            sm::description("Counts the total number of CQL BATCH requests with conditions."),
                            {cas_label_instance}),

                    sm::make_derive(
                            "statements_in_batches",
                            _cql_stats.statements_in_batches,
-                            sm::description("Counts a total number of sub-statements in CQL BATCH requests without conditions."),
+                            sm::description("Counts the total number of sub-statements in CQL BATCH requests without conditions."),
                            {non_cas_label_instance}),

                    sm::make_derive(
                            "statements_in_batches",
                            _cql_stats.statements_in_cas_batches,
-                            sm::description("Counts a total number of sub-statements in CQL BATCH requests with conditions."),
+                            sm::description("Counts the total number of sub-statements in CQL BATCH requests with conditions."),
                            {cas_label_instance}),

                    sm::make_derive(
                            "batches_pure_logged",
                            _cql_stats.batches_pure_logged,
                            sm::description(
-                                    "Counts a total number of LOGGED batches that were executed as LOGGED batches.")),
+                                    "Counts the total number of LOGGED batches that were executed as LOGGED batches.")),

                    sm::make_derive(
                            "batches_pure_unlogged",
                            _cql_stats.batches_pure_unlogged,
                            sm::description(
-                                    "Counts a total number of UNLOGGED batches that were executed as UNLOGGED "
+                                    "Counts the total number of UNLOGGED batches that were executed as UNLOGGED "
                                    "batches.")),

                    sm::make_derive(
                            "batches_unlogged_from_logged",
                            _cql_stats.batches_unlogged_from_logged,
-                            sm::description("Counts a total number of LOGGED batches that were executed as UNLOGGED "
+                            sm::description("Counts the total number of LOGGED batches that were executed as UNLOGGED "
                                            "batches.")),

                    sm::make_derive(
                            "rows_read",
                            _cql_stats.rows_read,
-                            sm::description("Counts a total number of rows read during CQL requests.")),
+                            sm::description("Counts the total number of rows read during CQL requests.")),

                    sm::make_derive(
                            "prepared_cache_evictions",
                            [] { return prepared_statements_cache::shard_stats().prepared_cache_evictions; },
-                            sm::description("Counts a number of prepared statements cache entries evictions.")),
+                            sm::description("Counts the number of prepared statements cache entries evictions.")),

                    sm::make_gauge(
                            "prepared_cache_size",
@@ -228,58 +359,63 @@ query_processor::query_processor(service::storage_proxy& proxy, database& db, qu
                    sm::make_derive(
                            "secondary_index_creates",
                            _cql_stats.secondary_index_creates,
-                            sm::description("Counts a total number of CQL CREATE INDEX requests.")),
+                            sm::description("Counts the total number of CQL CREATE INDEX requests.")),

                    sm::make_derive(
                            "secondary_index_drops",
                            _cql_stats.secondary_index_drops,
-                            sm::description("Counts a total number of CQL DROP INDEX requests.")),
+                            sm::description("Counts the total number of CQL DROP INDEX requests.")),

                    // secondary_index_reads total count is also included in all cql reads
                    sm::make_derive(
                            "secondary_index_reads",
                            _cql_stats.secondary_index_reads,
-                            sm::description("Counts a total number of CQL read requests performed using secondary indexes.")),
+                            sm::description("Counts the total number of CQL read requests performed using secondary indexes.")),

                    // secondary_index_rows_read total count is also included in all cql rows read
                    sm::make_derive(
                            "secondary_index_rows_read",
                            _cql_stats.secondary_index_rows_read,
-                            sm::description("Counts a total number of rows read during CQL requests performed using secondary indexes.")),
+                            sm::description("Counts the total number of rows read during CQL requests performed using secondary indexes.")),

                    // read requests that required ALLOW FILTERING
                    sm::make_derive(
                            "filtered_read_requests",
                            _cql_stats.filtered_reads,
-                            sm::description("Counts a total number of CQL read requests that required ALLOW FILTERING. See filtered_rows_read_total to compare how many rows needed to be filtered.")),
+                            sm::description("Counts the total number of CQL read requests that required ALLOW FILTERING. See filtered_rows_read_total to compare how many rows needed to be filtered.")),

                    // rows read with filtering enabled (because ALLOW FILTERING was required)
                    sm::make_derive(
                            "filtered_rows_read_total",
                            _cql_stats.filtered_rows_read_total,
-                            sm::description("Counts a total number of rows read during CQL requests that required ALLOW FILTERING. See filtered_rows_matched_total and filtered_rows_dropped_total for information how accurate filtering queries are.")),
+                            sm::description("Counts the total number of rows read during CQL requests that required ALLOW FILTERING. See filtered_rows_matched_total and filtered_rows_dropped_total for information how accurate filtering queries are.")),

                    // rows read with filtering enabled and accepted by the filter
                    sm::make_derive(
                            "filtered_rows_matched_total",
                            _cql_stats.filtered_rows_matched_total,
-                            sm::description("Counts a number of rows read during CQL requests that required ALLOW FILTERING and accepted by the filter. Number similar to filtered_rows_read_total indicates that filtering is accurate.")),
+                            sm::description("Counts the number of rows read during CQL requests that required ALLOW FILTERING and accepted by the filter. Number similar to filtered_rows_read_total indicates that filtering is accurate.")),

                    // rows read with filtering enabled and rejected by the filter
                    sm::make_derive(
                            "filtered_rows_dropped_total",
                            [this]() {return _cql_stats.filtered_rows_read_total - _cql_stats.filtered_rows_matched_total;},
-                            sm::description("Counts a number of rows read during CQL requests that required ALLOW FILTERING and dropped by the filter. Number similar to filtered_rows_read_total indicates that filtering is not accurate and might cause performance degradation.")),
+                            sm::description("Counts the number of rows read during CQL requests that required ALLOW FILTERING and dropped by the filter. Number similar to filtered_rows_read_total indicates that filtering is not accurate and might cause performance degradation.")),
+
+                    sm::make_derive(
+                            "select_bypass_caches",
+                            _cql_stats.select_bypass_caches,
+                            sm::description("Counts the number of SELECT statements with BYPASS CACHE option.")),

                    sm::make_derive(
                            "authorized_prepared_statements_cache_evictions",
                            [] { return authorized_prepared_statements_cache::shard_stats().authorized_prepared_statements_cache_evictions; },
-                            sm::description("Counts a number of authenticated prepared statements cache entries evictions.")),
+                            sm::description("Counts the number of authenticated prepared statements cache entries evictions.")),

                    sm::make_gauge(
                            "authorized_prepared_statements_cache_size",
                            [this] { return _authorized_prepared_cache.size(); },
-                            sm::description("A number of entries in the authenticated prepared statements cache.")),
+                            sm::description("Number of entries in the authenticated prepared statements cache.")),

                    sm::make_gauge(
                            "user_prepared_auth_cache_footprint",
@@ -289,24 +425,34 @@ query_processor::query_processor(service::storage_proxy& proxy, database& db, qu
                    sm::make_counter(
                            "reverse_queries",
                            _cql_stats.reverse_queries,
-                            sm::description("Counts number of CQL SELECT requests with ORDER BY DESC.")),
+                            sm::description("Counts the number of CQL SELECT requests with reverse ORDER BY order.")),

                    sm::make_counter(
                            "unpaged_select_queries",
-                            _cql_stats.unpaged_select_queries,
-                            sm::description("Counts number of unpaged CQL SELECT requests.")),
+                            [this] {
+                                return _cql_stats.unpaged_select_queries(ks_selector::NONSYSTEM)
+                                        + _cql_stats.unpaged_select_queries(ks_selector::SYSTEM);
+                            },
+                            sm::description("Counts the total number of unpaged CQL SELECT requests.")),
+
+                    sm::make_counter(
+                            "unpaged_select_queries_per_ks",
+                            _cql_stats.unpaged_select_queries(ks_selector::SYSTEM),
+                            sm::description("Counts the number of unpaged CQL SELECT requests against particular keyspaces."),
+                            {system_ks_label_instance})

            });

-    service::get_local_migration_manager().register_listener(_migration_subscriber.get());
+    _mnotifier.register_listener(_migration_subscriber.get());
 }

 query_processor::~query_processor() {
 }

 future<> query_processor::stop() {
-    service::get_local_migration_manager().unregister_listener(_migration_subscriber.get());
-    return _authorized_prepared_cache.stop().finally([this] { return _prepared_cache.stop(); });
+    return _mnotifier.unregister_listener(_migration_subscriber.get()).then([this] {
+        return _authorized_prepared_cache.stop().finally([this] { return _prepared_cache.stop(); });
+    });
 }

 future<::shared_ptr<result_message>>
@@ -484,7 +630,7 @@ query_options query_processor::make_internal_options(
        const std::initializer_list<data_value>& values,
        db::consistency_level cl,
        const timeout_config& timeout_config,
-        int32_t page_size) {
+        int32_t page_size) const {
    if (p->bound_names.size() != values.size()) {
        throw std::invalid_argument(
                format("Invalid number of values. Expecting {:d} but got {:d}", p->bound_names.size(), values.size()));
--- a/cql3/query_processor.hh
+++ b/cql3/query_processor.hh
@@ -57,7 +57,7 @@
 #include "cql3/untyped_result_set.hh"
 #include "exceptions/exceptions.hh"
 #include "log.hh"
-#include "service/migration_manager.hh"
+#include "service/migration_listener.hh"
 #include "service/query_state.hh"
 #include "transport/messages/result_message.hh"

@@ -109,6 +109,7 @@ private:
    std::unique_ptr<migration_subscriber> _migration_subscriber;
    service::storage_proxy& _proxy;
    database& _db;
+    service::migration_notifier& _mnotifier;

    struct stats {
        uint64_t prepare_invocations = 0;
@@ -142,7 +143,7 @@ public:

    static ::shared_ptr<statements::raw::parsed_statement> parse_statement(const std::string_view& query);

-    query_processor(service::storage_proxy& proxy, database& db, memory_config mcfg);
+    query_processor(service::storage_proxy& proxy, database& db, service::migration_notifier& mn, memory_config mcfg);

    ~query_processor();

@@ -158,15 +159,15 @@ public:
        return _cql_stats;
    }

-    statements::prepared_statement::checked_weak_ptr get_prepared(const auth::authenticated_user* user_ptr, const prepared_cache_key_type& key) {
-        if (user_ptr) {
-            auto it = _authorized_prepared_cache.find(*user_ptr, key);
+    statements::prepared_statement::checked_weak_ptr get_prepared(const std::optional<auth::authenticated_user>& user, const prepared_cache_key_type& key) {
+        if (user) {
+            auto it = _authorized_prepared_cache.find(*user, key);
            if (it != _authorized_prepared_cache.end()) {
                try {
                    return it->get()->checked_weak_from_this();
                } catch (seastar::checked_ptr_is_null_exception&) {
                    // If the prepared statement got invalidated - remove the corresponding authorized_prepared_statements_cache entry as well.
-                    _authorized_prepared_cache.remove(*user_ptr, key);
+                    _authorized_prepared_cache.remove(*user, key);
                }
            }
        }
@@ -325,7 +326,7 @@ private:
            const std::initializer_list<data_value>&,
            db::consistency_level,
            const timeout_config& timeout_config,
-            int32_t page_size = -1);
+            int32_t page_size = -1) const;

    future<::shared_ptr<cql_transport::messages::result_message>>
    process_authorized_statement(const ::shared_ptr<cql_statement> statement, service::query_state& query_state, const query_options& options);
--- a/cql3/relation.hh
+++ b/cql3/relation.hh
@@ -139,7 +139,7 @@ public:
     * @return the <code>Restriction</code> corresponding to this <code>Relation</code>
     * @throws InvalidRequestException if this <code>Relation</code> is not valid
     */
-    virtual ::shared_ptr<restrictions::restriction> to_restriction(database& db, schema_ptr schema, ::shared_ptr<variable_specifications> bound_names) final {
+    virtual ::shared_ptr<restrictions::restriction> to_restriction(database& db, schema_ptr schema, lw_shared_ptr<variable_specifications> bound_names) final {
        if (_relation_type == operator_type::EQ) {
            return new_EQ_restriction(db, schema, bound_names);
        } else if (_relation_type == operator_type::LT) {
@@ -182,7 +182,7 @@ public:
     * @throws InvalidRequestException if the relation cannot be converted into an EQ restriction.
     */
    virtual ::shared_ptr<restrictions::restriction> new_EQ_restriction(database& db, schema_ptr schema,
-        ::shared_ptr<variable_specifications> bound_names) = 0;
+        lw_shared_ptr<variable_specifications> bound_names) = 0;

    /**
     * Creates a new IN restriction instance.
@@ -193,7 +193,7 @@ public:
     * @throws InvalidRequestException if the relation cannot be converted into an IN restriction.
     */
    virtual ::shared_ptr<restrictions::restriction> new_IN_restriction(database& db, schema_ptr schema,
-        ::shared_ptr<variable_specifications> bound_names) = 0;
+        lw_shared_ptr<variable_specifications> bound_names) = 0;

    /**
     * Creates a new Slice restriction instance.
@@ -206,7 +206,7 @@ public:
     * @throws InvalidRequestException if the <code>Relation</code> is not valid
     */
    virtual ::shared_ptr<restrictions::restriction> new_slice_restriction(database& db, schema_ptr schema,
-        ::shared_ptr<variable_specifications> bound_names,
+        lw_shared_ptr<variable_specifications> bound_names,
        statements::bound bound,
        bool inclusive) = 0;

@@ -220,13 +220,13 @@ public:
     * @throws InvalidRequestException if the <code>Relation</code> is not valid
     */
    virtual ::shared_ptr<restrictions::restriction> new_contains_restriction(database& db, schema_ptr schema,
-        ::shared_ptr<variable_specifications> bound_names, bool isKey) = 0;
+        lw_shared_ptr<variable_specifications> bound_names, bool isKey) = 0;

    /**
     * Creates a new LIKE restriction instance.
     */
    virtual ::shared_ptr<restrictions::restriction> new_LIKE_restriction(database& db, schema_ptr schema,
-        ::shared_ptr<variable_specifications> bound_names) = 0;
+        lw_shared_ptr<variable_specifications> bound_names) = 0;

    /**
     * Renames an identifier in this Relation, if applicable.
@@ -253,7 +253,7 @@ protected:
                                       ::shared_ptr<term::raw> raw,
                                       database& db,
                                       const sstring& keyspace,
-                                       ::shared_ptr<variable_specifications> boundNames) = 0;
+                                       lw_shared_ptr<variable_specifications> boundNames) = 0;

    /**
     * Converts the specified <code>Raw</code> terms into a <code>Term</code>s.
@@ -269,7 +269,7 @@ protected:
                                             const std::vector<::shared_ptr<term::raw>>& raws,
                                             database& db,
                                             const sstring& keyspace,
-                                             ::shared_ptr<variable_specifications> boundNames) {
+                                             lw_shared_ptr<variable_specifications> boundNames) {
        std::vector<::shared_ptr<term>> terms;
        for (auto&& r : raws) {
            terms.emplace_back(to_term(receivers, r, db, keyspace, boundNames));
--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
@@ -176,7 +176,7 @@ statement_restrictions::statement_restrictions(database& db,
        schema_ptr schema,
        statements::statement_type type,
        const std::vector<::shared_ptr<relation>>& where_clause,
-        ::shared_ptr<variable_specifications> bound_names,
+        lw_shared_ptr<variable_specifications> bound_names,
        bool selects_only_static_columns,
        bool select_a_collection,
        bool for_view,
--- a/Show More
+++ b/Show More