mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-13 03:12:13 +00:00
index: add fulltext_index custom index implementation
Introduce `fulltext_index`, a new `custom_index` subclass for full-text search (FTS).
The index validates that the target column is a text type (text, varchar, or ascii) and supports two WITH OPTIONS keys:
- 'analyzer': one of standard, english, german, french, spanish, italian, portuguese, russian, chinese, japanese, korean, simple, whitespace
- 'positions': boolean controlling whether term positions are stored

`view_should_exist()` returns false — no backing materialized view is created, matching the CDC-backed pattern used by `vector_index`.

Fixes: SCYLLADB-1517
This commit is contained in:
@@ -1163,6 +1163,7 @@ scylla_core = (['message/messaging_service.cc',
|
||||
'index/secondary_index_manager.cc',
|
||||
'index/secondary_index.cc',
|
||||
'index/vector_index.cc',
|
||||
'index/fulltext_index.cc',
|
||||
'index/index_option_utils.cc',
|
||||
'utils/UUID_gen.cc',
|
||||
'utils/i_filter.cc',
|
||||
|
||||
@@ -5,6 +5,7 @@ target_sources(index
|
||||
PRIVATE
|
||||
secondary_index.cc
|
||||
secondary_index_manager.cc
|
||||
fulltext_index.cc
|
||||
index_option_utils.cc
|
||||
vector_index.cc)
|
||||
target_include_directories(index
|
||||
|
||||
135
index/fulltext_index.cc
Normal file
135
index/fulltext_index.cc
Normal file
@@ -0,0 +1,135 @@
|
||||
/*
|
||||
* Copyright 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
||||
*/
|
||||
|
||||
#include "cql3/statements/index_target.hh"
|
||||
#include "cql3/util.hh"
|
||||
#include "exceptions/exceptions.hh"
|
||||
#include "schema/schema.hh"
|
||||
#include "index/fulltext_index.hh"
|
||||
#include "index/index_option_utils.hh"
|
||||
#include "index/secondary_index.hh"
|
||||
#include "index/secondary_index_manager.hh"
|
||||
#include "utils/UUID_gen.hh"
|
||||
#include "utils/managed_string.hh"
|
||||
#include <seastar/core/sstring.hh>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
|
||||
namespace secondary_index {
|
||||
|
||||
// Supported text analyzers for fulltext indexing.
|
||||
// This list corresponds to analyzers expected to be provided
|
||||
// by the backend search engine (Tantivy).
|
||||
static const std::vector<sstring> analyzer_values = {
|
||||
"standard", "english", "german", "french", "spanish", "italian", "portuguese", "russian", "chinese", "japanese", "korean", "simple", "whitespace"};
|
||||
|
||||
const static std::unordered_map<sstring, std::function<void(std::string_view, const sstring&, const sstring&)>> fulltext_index_options = {
|
||||
// 'analyzer' specifies the built-in text analyzer to use for tokenization.
|
||||
{"analyzer", std::bind_front(util::validate_enumerated_option, analyzer_values)},
|
||||
// 'positions' controls whether token positions are stored in the index.
|
||||
// Required for phrase queries. Set to false to save space.
|
||||
{"positions", std::bind_front(util::validate_enumerated_option, util::boolean_values)},
|
||||
};
|
||||
|
||||
// A fulltext index has no backing materialized view, so this always
// reports that no view should exist.
bool fulltext_index::view_should_exist() const {
    return false;
}
|
||||
|
||||
std::optional<cql3::description> fulltext_index::describe(const index_metadata& im, const schema& base_schema) const {
|
||||
static const std::unordered_set<sstring> system_options = {
|
||||
cql3::statements::index_target::target_option_name,
|
||||
db::index::secondary_index::custom_class_option_name,
|
||||
db::index::secondary_index::index_version_option_name,
|
||||
};
|
||||
|
||||
auto target = im.options().at(cql3::statements::index_target::target_option_name);
|
||||
auto target_column = cql3::statements::index_target::column_name_from_target_string(target);
|
||||
|
||||
fragmented_ostringstream os;
|
||||
os << "CREATE CUSTOM INDEX " << cql3::util::maybe_quote(im.name()) << " ON " << cql3::util::maybe_quote(base_schema.ks_name()) << "."
|
||||
<< cql3::util::maybe_quote(base_schema.cf_name()) << "(" << cql3::util::maybe_quote(target_column) << ")"
|
||||
<< " USING 'fulltext_index'";
|
||||
|
||||
// Collect user-provided options (excluding system keys like target, class_name, index_version).
|
||||
std::map<sstring, sstring> user_options;
|
||||
for (const auto& [key, value] : im.options()) {
|
||||
if (!system_options.contains(key)) {
|
||||
user_options.emplace(key, value);
|
||||
}
|
||||
}
|
||||
if (!user_options.empty()) {
|
||||
os << " WITH OPTIONS = {";
|
||||
bool first = true;
|
||||
for (const auto& [key, value] : user_options) {
|
||||
if (!first) {
|
||||
os << ", ";
|
||||
}
|
||||
os << "'" << key << "': '" << value << "'";
|
||||
first = false;
|
||||
}
|
||||
os << "}";
|
||||
}
|
||||
|
||||
return cql3::description{
|
||||
.keyspace = base_schema.ks_name(),
|
||||
.type = "index",
|
||||
.name = im.name(),
|
||||
.create_statement = std::move(os).to_managed_string(),
|
||||
};
|
||||
}
|
||||
|
||||
void fulltext_index::check_target(const schema& schema, const std::vector<::shared_ptr<cql3::statements::index_target>>& targets) const {
|
||||
using cql3::statements::index_target;
|
||||
|
||||
if (targets.size() != 1) {
|
||||
throw exceptions::invalid_request_exception("Fulltext index must have exactly one target column");
|
||||
}
|
||||
|
||||
auto& target = targets[0];
|
||||
if (!std::holds_alternative<index_target::single_column>(target->value)) {
|
||||
throw exceptions::invalid_request_exception("Fulltext index target must be a single column");
|
||||
}
|
||||
|
||||
auto& column = std::get<index_target::single_column>(target->value);
|
||||
auto c_name = column->to_string();
|
||||
auto const* c_def = schema.get_column_definition(column->name());
|
||||
if (c_def == nullptr) {
|
||||
throw exceptions::invalid_request_exception(format("Column {} not found in schema", c_name));
|
||||
}
|
||||
|
||||
auto kind = c_def->type->get_kind();
|
||||
if (kind != abstract_type::kind::utf8 && kind != abstract_type::kind::ascii) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
format("Fulltext index is only supported on text, varchar, or ascii columns, but column {} has an incompatible type", c_name));
|
||||
}
|
||||
}
|
||||
|
||||
// Validates the raw WITH OPTIONS entries supplied for a fulltext index.
//
// Every option name must be a key of fulltext_index_options; the validator
// registered for that key is then invoked with the index type name, the
// option name and the option value.
//
// Throws exceptions::invalid_request_exception for an unknown option name;
// whatever the per-option validator throws propagates unchanged.
void fulltext_index::check_index_options(const cql3::statements::index_specific_prop_defs& properties) const {
    // Bind each entry by const reference: iterating by value would copy the
    // key and value strings on every iteration for no benefit.
    for (const auto& [name, value] : properties.get_raw_options()) {
        const auto validator = fulltext_index_options.find(name);
        if (validator == fulltext_index_options.end()) {
            throw exceptions::invalid_request_exception(format("Unsupported option {} for fulltext index", name));
        }
        validator->second(index_type_name(), name, value);
    }
}
|
||||
|
||||
// Validates a fulltext index definition: the target is checked first, then
// the index options. The feature service and database arguments are not used.
//
// Any exception raised by check_target() or check_index_options() propagates.
void fulltext_index::validate(const schema& schema,
        const cql3::statements::index_specific_prop_defs& properties,
        const std::vector<::shared_ptr<cql3::statements::index_target>>& targets,
        const gms::feature_service& /*fs*/,
        const data_dictionary::database& /*db*/) const {
    check_target(schema, targets);
    check_index_options(properties);
}
|
||||
|
||||
// Returns the version identifier for the index: the value produced by
// utils::UUID_gen::get_time_UUID(). The schema argument is not consulted.
utils::UUID fulltext_index::index_version(const schema& /*schema*/) {
    return utils::UUID_gen::get_time_UUID();
}
|
||||
|
||||
std::unique_ptr<secondary_index::custom_index> fulltext_index_factory() {
|
||||
return std::make_unique<fulltext_index>();
|
||||
}
|
||||
|
||||
} // namespace secondary_index
|
||||
43
index/fulltext_index.hh
Normal file
43
index/fulltext_index.hh
Normal file
@@ -0,0 +1,43 @@
|
||||
/*
|
||||
* Copyright 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "schema/schema.hh"
|
||||
|
||||
#include "data_dictionary/data_dictionary.hh"
|
||||
#include "cql3/statements/index_target.hh"
|
||||
#include "index/secondary_index_manager.hh"
|
||||
|
||||
#include <vector>
|
||||
|
||||
namespace secondary_index {
|
||||
|
||||
class fulltext_index : public custom_index {
|
||||
public:
|
||||
std::string_view index_type_name() const override {
|
||||
return "fulltext";
|
||||
}
|
||||
|
||||
fulltext_index() = default;
|
||||
~fulltext_index() override = default;
|
||||
std::optional<cql3::description> describe(const index_metadata& im, const schema& base_schema) const override;
|
||||
bool view_should_exist() const override;
|
||||
void validate(const schema& schema, const cql3::statements::index_specific_prop_defs& properties,
|
||||
const std::vector<::shared_ptr<cql3::statements::index_target>>& targets, const gms::feature_service& fs,
|
||||
const data_dictionary::database& db) const override;
|
||||
utils::UUID index_version(const schema& schema) override;
|
||||
|
||||
private:
|
||||
void check_target(const schema& schema, const std::vector<::shared_ptr<cql3::statements::index_target>>& targets) const;
|
||||
void check_index_options(const cql3::statements::index_specific_prop_defs& properties) const;
|
||||
};
|
||||
|
||||
std::unique_ptr<secondary_index::custom_index> fulltext_index_factory();
|
||||
|
||||
} // namespace secondary_index
|
||||
@@ -17,6 +17,7 @@
|
||||
|
||||
#include "index/secondary_index_manager.hh"
|
||||
#include "index/secondary_index.hh"
|
||||
#include "index/fulltext_index.hh"
|
||||
#include "index/vector_index.hh"
|
||||
|
||||
#include "cql3/expr/expression.hh"
|
||||
@@ -211,6 +212,7 @@ std::optional<std::function<std::unique_ptr<custom_index>()>> secondary_index_ma
|
||||
std::transform(lower_class_name.begin(), lower_class_name.end(), lower_class_name.begin(), ::tolower);
|
||||
|
||||
const static std::unordered_map<std::string_view, std::function<std::unique_ptr<custom_index>()>> classes = {
|
||||
{"fulltext_index", fulltext_index_factory},
|
||||
{"vector_index", vector_index_factory},
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user