// You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
//
// 132 lines
// 3.7 KiB

#include "search_engine.h"
#include <filesystem>
#include <fstream>
const char* const DICT_PATH = "dict/jieba.dict.utf8";
const char* const HMM_PATH = "dict/hmm_model.utf8";
const char* const USER_DICT_PATH = "dict/user.dict.utf8";
const char* const IDF_PATH = "dict/idf.utf8";
const char* const STOP_WORD_PATH = "dict/stop_words.utf8";
SearchEngine::SearchEngine(const std::string& db_path) {
// 初始化 Xapian 数据库
db_ = std::make_unique<Xapian::WritableDatabase>(
db_path, Xapian::DB_CREATE_OR_OPEN
);
term_gen_ = std::make_unique<Xapian::TermGenerator>();
// 初始化结巴分词
jieba_ = std::make_unique<cppjieba::Jieba>(
DICT_PATH,
HMM_PATH,
USER_DICT_PATH,
IDF_PATH,
STOP_WORD_PATH
);
}
// Compiler-generated destructor, defined out of line in this source file.
// NOTE(review): presumably kept here so the smart-pointer members can be
// destroyed where their pointee types are complete — confirm against the header.
SearchEngine::~SearchEngine() = default;
void SearchEngine::addDocument(const std::string& path, const std::string& content) {
Xapian::Document doc;
term_gen_->set_document(doc);
// 设置文档路径
doc.add_value(1, path);
// 对内容分词并索引
std::string tokens = tokenize(content);
term_gen_->index_text(tokens);
// 存储原始内容用于生成摘要
doc.set_data(content);
// 添加到数据库
db_->add_document(doc);
}
std::vector<SearchResult> SearchEngine::search(
const std::string& query_str,
const SearchOptions& options
) {
std::vector<SearchResult> results;
Xapian::QueryParser qp;
qp.set_database(*db_);
// 添加文件类型过滤
if (!options.file_type.empty()) {
std::string type_query = "type:" + options.file_type;
query_str = query_str + " AND " + type_query;
}
Xapian::Query query = qp.parse_query(
tokenize(query_str),
Xapian::QueryParser::FLAG_BOOLEAN |
Xapian::QueryParser::FLAG_PHRASE
);
Xapian::Enquire enquire(*db_);
enquire.set_query(query);
// 设置排序
switch (options.sort_field) {
case SortField::PATH:
enquire.set_sort_by_value(1, options.sort_order == SortOrder::DESC);
break;
case SortField::SIZE:
enquire.set_sort_by_value(2, options.sort_order == SortOrder::DESC);
break;
case SortField::MODIFIED_TIME:
enquire.set_sort_by_value(3, options.sort_order == SortOrder::DESC);
break;
default:
// 默认按相关度排序
break;
}
// 使用偏移量和限制实现分页
Xapian::MSet matches = enquire.get_mset(
options.offset,
options.limit
);
// 处理结果
Highlighter highlighter;
std::vector<std::string> keywords;
jieba_->Cut(query_str, keywords);
for (auto it = matches.begin(); it != matches.end(); ++it) {
const Xapian::Document& doc = it.get_document();
std::string content = doc.get_data();
SearchResult result;
result.path = doc.get_value(1);
result.score = it.get_weight();
// 生成高亮片段
auto spans = highlighter.find_keyword_positions(content, keywords);
result.snippet = highlighter.extract_snippet(content, spans);
result.positions = spans;
results.push_back(std::move(result));
}
return results;
}
// Segments `text` with the jieba segmenter and returns the tokens joined into
// one string. Every token, including the last, is followed by a single space,
// so a non-empty result always ends in ' '.
std::string SearchEngine::tokenize(const std::string& text) {
    std::vector<std::string> segments;
    jieba_->Cut(text, segments, true);

    std::string joined;
    for (const std::string& segment : segments) {
        joined += segment;
        joined += " ";
    }
    return joined;
}
// Forwards to commit() on the underlying Xapian database. addDocument() does
// not call this itself, so callers invoke it after adding documents.
void SearchEngine::commit() {
db_->commit();
}