From fcd15b5cd4835aa1ca6e2f11ffe78ee1b8857115 Mon Sep 17 00:00:00 2001 From: Dawid Pawlik Date: Mon, 27 Apr 2026 14:39:45 +0200 Subject: [PATCH] index: add `fulltext_index` custom index implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce `fulltext_index`, a new `custom_index` subclass for full-text search (FTS). The index validates that the target column is a text type (text, varchar, or ascii) and supports two WITH OPTIONS keys: - 'analyzer': one of standard, english, german, french, spanish, italian, portuguese, russian, chinese, japanese, korean, simple, whitespace; - 'positions': boolean controlling whether term positions are stored (required for phrase queries). `view_should_exist()` returns false — no backing materialized view is created, matching the CDC-backed pattern used by `vector_index`. Fixes: SCYLLADB-1517 --- configure.py | 1 + index/CMakeLists.txt | 1 + index/fulltext_index.cc | 135 +++++++++++++++++++++++++++++++ index/fulltext_index.hh | 43 ++++++++++ index/secondary_index_manager.cc | 2 + 5 files changed, 182 insertions(+) create mode 100644 index/fulltext_index.cc create mode 100644 index/fulltext_index.hh diff --git a/configure.py b/configure.py index 9c0979027a..d63b73cca5 100755 --- a/configure.py +++ b/configure.py @@ -1163,6 +1163,7 @@ scylla_core = (['message/messaging_service.cc', 'index/secondary_index_manager.cc', 'index/secondary_index.cc', 'index/vector_index.cc', + 'index/fulltext_index.cc', 'index/index_option_utils.cc', 'utils/UUID_gen.cc', 'utils/i_filter.cc', diff --git a/index/CMakeLists.txt b/index/CMakeLists.txt index d4d72c232f..18d6e35b8b 100644 --- a/index/CMakeLists.txt +++ b/index/CMakeLists.txt @@ -5,6 +5,7 @@ target_sources(index PRIVATE secondary_index.cc secondary_index_manager.cc + fulltext_index.cc index_option_utils.cc vector_index.cc) target_include_directories(index diff --git a/index/fulltext_index.cc b/index/fulltext_index.cc new file mode 100644 index 0000000000..54cc57e27e ---
/dev/null +++ b/index/fulltext_index.cc @@ -0,0 +1,135 @@ +/* + * Copyright 2026-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1 + */ + +#include "cql3/statements/index_target.hh" +#include "cql3/util.hh" +#include "exceptions/exceptions.hh" +#include "schema/schema.hh" +#include "index/fulltext_index.hh" +#include "index/index_option_utils.hh" +#include "index/secondary_index.hh" +#include "index/secondary_index_manager.hh" +#include "utils/UUID_gen.hh" +#include "utils/managed_string.hh" +#include +#include + +namespace secondary_index { + +// Supported text analyzers for fulltext indexing. +// This list corresponds to analyzers expected to be provided +// by the backend search engine (Tantivy). +static const std::vector analyzer_values = { + "standard", "english", "german", "french", "spanish", "italian", "portuguese", "russian", "chinese", "japanese", "korean", "simple", "whitespace"}; + +const static std::unordered_map> fulltext_index_options = { + // 'analyzer' specifies the built-in text analyzer to use for tokenization. + {"analyzer", std::bind_front(util::validate_enumerated_option, analyzer_values)}, + // 'positions' controls whether token positions are stored in the index. + // Required for phrase queries. Set to false to save space. 
+ {"positions", std::bind_front(util::validate_enumerated_option, util::boolean_values)}, +}; + +bool fulltext_index::view_should_exist() const { + return false; +} + +std::optional fulltext_index::describe(const index_metadata& im, const schema& base_schema) const { + static const std::unordered_set system_options = { + cql3::statements::index_target::target_option_name, + db::index::secondary_index::custom_class_option_name, + db::index::secondary_index::index_version_option_name, + }; + + auto target = im.options().at(cql3::statements::index_target::target_option_name); + auto target_column = cql3::statements::index_target::column_name_from_target_string(target); + + fragmented_ostringstream os; + os << "CREATE CUSTOM INDEX " << cql3::util::maybe_quote(im.name()) << " ON " << cql3::util::maybe_quote(base_schema.ks_name()) << "." + << cql3::util::maybe_quote(base_schema.cf_name()) << "(" << cql3::util::maybe_quote(target_column) << ")" + << " USING 'fulltext_index'"; + + // Collect user-provided options (excluding system keys like target, class_name, index_version). 
+ std::map user_options; + for (const auto& [key, value] : im.options()) { + if (!system_options.contains(key)) { + user_options.emplace(key, value); + } + } + if (!user_options.empty()) { + os << " WITH OPTIONS = {"; + bool first = true; + for (const auto& [key, value] : user_options) { + if (!first) { + os << ", "; + } + os << "'" << key << "': '" << value << "'"; + first = false; + } + os << "}"; + } + + return cql3::description{ + .keyspace = base_schema.ks_name(), + .type = "index", + .name = im.name(), + .create_statement = std::move(os).to_managed_string(), + }; +} + +void fulltext_index::check_target(const schema& schema, const std::vector<::shared_ptr>& targets) const { + using cql3::statements::index_target; + + if (targets.size() != 1) { + throw exceptions::invalid_request_exception("Fulltext index must have exactly one target column"); + } + + auto& target = targets[0]; + if (!std::holds_alternative(target->value)) { + throw exceptions::invalid_request_exception("Fulltext index target must be a single column"); + } + + auto& column = std::get(target->value); + auto c_name = column->to_string(); + auto const* c_def = schema.get_column_definition(column->name()); + if (c_def == nullptr) { + throw exceptions::invalid_request_exception(format("Column {} not found in schema", c_name)); + } + + auto kind = c_def->type->get_kind(); + if (kind != abstract_type::kind::utf8 && kind != abstract_type::kind::ascii) { + throw exceptions::invalid_request_exception( + format("Fulltext index is only supported on text, varchar, or ascii columns, but column {} has an incompatible type", c_name)); + } +} + +void fulltext_index::check_index_options(const cql3::statements::index_specific_prop_defs& properties) const { + for (auto option : properties.get_raw_options()) { + auto it = fulltext_index_options.find(option.first); + if (it == fulltext_index_options.end()) { + throw exceptions::invalid_request_exception(format("Unsupported option {} for fulltext index", 
option.first)); + } + it->second(index_type_name(), option.first, option.second); + } +} + +void fulltext_index::validate(const schema& schema, const cql3::statements::index_specific_prop_defs& properties, + const std::vector<::shared_ptr>& targets, const gms::feature_service&, const data_dictionary::database&) const { + check_target(schema, targets); + check_index_options(properties); +} + +utils::UUID fulltext_index::index_version(const schema& schema) { + return utils::UUID_gen::get_time_UUID(); +} + +std::unique_ptr fulltext_index_factory() { + return std::make_unique(); +} + +} // namespace secondary_index diff --git a/index/fulltext_index.hh b/index/fulltext_index.hh new file mode 100644 index 0000000000..8ac7aff00b --- /dev/null +++ b/index/fulltext_index.hh @@ -0,0 +1,43 @@ +/* + * Copyright 2026-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1 + */ + +#pragma once + +#include "schema/schema.hh" + +#include "data_dictionary/data_dictionary.hh" +#include "cql3/statements/index_target.hh" +#include "index/secondary_index_manager.hh" + +#include + +namespace secondary_index { + +class fulltext_index : public custom_index { +public: + std::string_view index_type_name() const override { + return "fulltext"; + } + + fulltext_index() = default; + ~fulltext_index() override = default; + std::optional describe(const index_metadata& im, const schema& base_schema) const override; + bool view_should_exist() const override; + void validate(const schema& schema, const cql3::statements::index_specific_prop_defs& properties, + const std::vector<::shared_ptr>& targets, const gms::feature_service& fs, + const data_dictionary::database& db) const override; + utils::UUID index_version(const schema& schema) override; + +private: + void check_target(const schema& schema, const std::vector<::shared_ptr>& targets) const; + void check_index_options(const cql3::statements::index_specific_prop_defs& properties) const; +}; + 
+std::unique_ptr fulltext_index_factory(); + +} // namespace secondary_index diff --git a/index/secondary_index_manager.cc b/index/secondary_index_manager.cc index 1483c0bef1..93acb991a0 100644 --- a/index/secondary_index_manager.cc +++ b/index/secondary_index_manager.cc @@ -17,6 +17,7 @@ #include "index/secondary_index_manager.hh" #include "index/secondary_index.hh" +#include "index/fulltext_index.hh" #include "index/vector_index.hh" #include "cql3/expr/expression.hh" @@ -211,6 +212,7 @@ std::optional()>> secondary_index_ma std::transform(lower_class_name.begin(), lower_class_name.end(), lower_class_name.begin(), ::tolower); const static std::unordered_map()>> classes = { + {"fulltext_index", fulltext_index_factory}, {"vector_index", vector_index_factory}, };