You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
132 lines
3.7 KiB
132 lines
3.7 KiB
#include "search_engine.h"
|
|
#include <filesystem>
|
|
#include <fstream>
|
|
|
|
// Dictionary and model files handed to the cppjieba::Jieba constructor.
// The paths are relative, so they resolve against the process's current
// working directory.
constexpr const char* DICT_PATH      = "dict/jieba.dict.utf8";
constexpr const char* HMM_PATH       = "dict/hmm_model.utf8";
constexpr const char* USER_DICT_PATH = "dict/user.dict.utf8";
constexpr const char* IDF_PATH       = "dict/idf.utf8";
constexpr const char* STOP_WORD_PATH = "dict/stop_words.utf8";
|
|
|
|
/// Builds the three components the engine works with: the Xapian index,
/// a term generator, and the Jieba segmenter.
///
/// @param db_path  Location of the Xapian database; opened with
///                 DB_CREATE_OR_OPEN, so a missing database is created.
SearchEngine::SearchEngine(const std::string& db_path) {
    // Xapian index: open the existing database or create a new one.
    db_ = std::make_unique<Xapian::WritableDatabase>(db_path, Xapian::DB_CREATE_OR_OPEN);

    // Term generator used by addDocument() to index segmented text.
    term_gen_ = std::make_unique<Xapian::TermGenerator>();

    // Jieba segmenter, loaded from the dictionary files named at file scope.
    jieba_ = std::make_unique<cppjieba::Jieba>(
        DICT_PATH, HMM_PATH, USER_DICT_PATH, IDF_PATH, STOP_WORD_PATH);
}
|
|
|
|
// Defaulted destructor, defined out-of-line in this translation unit.
// db_, term_gen_ and jieba_ are assigned from std::make_unique in the
// constructor, so they are presumably std::unique_ptr members that release
// their objects here — confirm against search_engine.h.
SearchEngine::~SearchEngine() = default;
|
|
|
|
void SearchEngine::addDocument(const std::string& path, const std::string& content) {
|
|
Xapian::Document doc;
|
|
term_gen_->set_document(doc);
|
|
|
|
// 设置文档路径
|
|
doc.add_value(1, path);
|
|
|
|
// 对内容分词并索引
|
|
std::string tokens = tokenize(content);
|
|
term_gen_->index_text(tokens);
|
|
|
|
// 存储原始内容用于生成摘要
|
|
doc.set_data(content);
|
|
|
|
// 添加到数据库
|
|
db_->add_document(doc);
|
|
}
|
|
|
|
std::vector<SearchResult> SearchEngine::search(
|
|
const std::string& query_str,
|
|
const SearchOptions& options
|
|
) {
|
|
std::vector<SearchResult> results;
|
|
|
|
Xapian::QueryParser qp;
|
|
qp.set_database(*db_);
|
|
|
|
// 添加文件类型过滤
|
|
if (!options.file_type.empty()) {
|
|
std::string type_query = "type:" + options.file_type;
|
|
query_str = query_str + " AND " + type_query;
|
|
}
|
|
|
|
Xapian::Query query = qp.parse_query(
|
|
tokenize(query_str),
|
|
Xapian::QueryParser::FLAG_BOOLEAN |
|
|
Xapian::QueryParser::FLAG_PHRASE
|
|
);
|
|
|
|
Xapian::Enquire enquire(*db_);
|
|
enquire.set_query(query);
|
|
|
|
// 设置排序
|
|
switch (options.sort_field) {
|
|
case SortField::PATH:
|
|
enquire.set_sort_by_value(1, options.sort_order == SortOrder::DESC);
|
|
break;
|
|
case SortField::SIZE:
|
|
enquire.set_sort_by_value(2, options.sort_order == SortOrder::DESC);
|
|
break;
|
|
case SortField::MODIFIED_TIME:
|
|
enquire.set_sort_by_value(3, options.sort_order == SortOrder::DESC);
|
|
break;
|
|
default:
|
|
// 默认按相关度排序
|
|
break;
|
|
}
|
|
|
|
// 使用偏移量和限制实现分页
|
|
Xapian::MSet matches = enquire.get_mset(
|
|
options.offset,
|
|
options.limit
|
|
);
|
|
|
|
// 处理结果
|
|
Highlighter highlighter;
|
|
std::vector<std::string> keywords;
|
|
jieba_->Cut(query_str, keywords);
|
|
|
|
for (auto it = matches.begin(); it != matches.end(); ++it) {
|
|
const Xapian::Document& doc = it.get_document();
|
|
std::string content = doc.get_data();
|
|
|
|
SearchResult result;
|
|
result.path = doc.get_value(1);
|
|
result.score = it.get_weight();
|
|
|
|
// 生成高亮片段
|
|
auto spans = highlighter.find_keyword_positions(content, keywords);
|
|
result.snippet = highlighter.extract_snippet(content, spans);
|
|
result.positions = spans;
|
|
|
|
results.push_back(std::move(result));
|
|
}
|
|
|
|
return results;
|
|
}
|
|
|
|
/// Segments `text` with Jieba and joins the pieces into a single string.
///
/// @param text  Text to segment.
/// @return The tokens in order, each followed by one space (so a non-empty
///         result ends with a trailing space; no tokens gives "").
std::string SearchEngine::tokenize(const std::string& text) {
    std::vector<std::string> segments;
    // Third argument presumably enables cppjieba's HMM mode — confirm.
    jieba_->Cut(text, segments, true);

    std::string joined;
    for (auto it = segments.cbegin(); it != segments.cend(); ++it) {
        joined.append(*it);
        joined.push_back(' ');
    }
    return joined;
}
|
|
|
|
/// Commits pending changes by forwarding to commit() on the underlying
/// writable database.
void SearchEngine::commit() {
    db_->commit();
}
|