#include "search_engine.h"

// NOTE(review): the names of the two system headers that followed the project
// header were lost when this file was extracted (only bare `#include` tokens
// survived). They are assumed to be the Xapian and cppjieba umbrella headers;
// confirm against the build before merging.
#include <cppjieba/Jieba.hpp>
#include <xapian.h>

#include <memory>
#include <string>
#include <utility>
#include <vector>

// Dictionary files handed to the jieba segmenter, relative to the working
// directory of the process.
const char* const DICT_PATH = "dict/jieba.dict.utf8";
const char* const HMM_PATH = "dict/hmm_model.utf8";
const char* const USER_DICT_PATH = "dict/user.dict.utf8";
const char* const IDF_PATH = "dict/idf.utf8";
const char* const STOP_WORD_PATH = "dict/stop_words.utf8";

namespace {

// Xapian value slots used by this file.
// NOTE(review): only kPathSlot is written by addDocument() below; sorting by
// size or modification time relies on those slots being populated elsewhere.
constexpr Xapian::valueno kPathSlot = 1;
constexpr Xapian::valueno kSizeSlot = 2;
constexpr Xapian::valueno kModifiedTimeSlot = 3;

// Returns true when `token` is empty or consists only of ASCII whitespace.
bool isBlank(const std::string& token) {
    return token.find_first_not_of(" \t\r\n") == std::string::npos;
}

}  // namespace

/// @brief Opens (creating if necessary) the Xapian database at `db_path` and
///        loads the jieba dictionaries.
/// @param db_path Filesystem path of the Xapian database.
// NOTE(review): template arguments were stripped from the extracted source;
// the concrete types below are inferred from how each member is used
// (DB_CREATE_OR_OPEN / add_document / commit, set_document / index_text, Cut).
SearchEngine::SearchEngine(const std::string& db_path) {
    // Xapian index and term generator.
    db_ = std::make_unique<Xapian::WritableDatabase>(
        db_path,
        Xapian::DB_CREATE_OR_OPEN
    );
    term_gen_ = std::make_unique<Xapian::TermGenerator>();

    // Jieba word segmenter.
    jieba_ = std::make_unique<cppjieba::Jieba>(
        DICT_PATH,
        HMM_PATH,
        USER_DICT_PATH,
        IDF_PATH,
        STOP_WORD_PATH
    );
}

SearchEngine::~SearchEngine() = default;

/// @brief Indexes one document.
/// @param path    Stored in value slot kPathSlot and returned as
///                SearchResult::path by search().
/// @param content Segmented with jieba and indexed; also stored verbatim as
///                the document data so search() can build snippets from it.
void SearchEngine::addDocument(const std::string& path, const std::string& content) {
    Xapian::Document doc;
    term_gen_->set_document(doc);

    // Document path.
    doc.add_value(kPathSlot, path);

    // Segment the content, then index the space-separated tokens.
    const std::string tokens = tokenize(content);
    term_gen_->index_text(tokens);

    // Keep the original text for snippet generation.
    doc.set_data(content);

    db_->add_document(doc);
}

/// @brief Runs a query against the index and returns highlighted results.
/// @param query_str Query text as entered by the caller.
/// @param options   Optional file-type filter, sort field/order, and the
///                  offset/limit pair used for paging.
/// @return One SearchResult per match in the requested page, carrying the
///         stored path, the match weight, a snippet and the keyword spans.
std::vector<SearchResult> SearchEngine::search(
    const std::string& query_str,
    const SearchOptions& options
) {
    Xapian::QueryParser qp;
    qp.set_database(*db_);

    // Build the filtered query in a local copy: `query_str` is a const
    // reference and must not be assigned to.
    std::string effective_query = query_str;
    if (!options.file_type.empty()) {
        // NOTE(review): no "type" prefix is registered on the parser and
        // addDocument() indexes no matching term, so this clause is unlikely
        // to act as a real filter. A proper fix needs an index-side boolean
        // term plus QueryParser::add_boolean_prefix().
        effective_query += " AND type:";
        effective_query += options.file_type;
    }

    const Xapian::Query query = qp.parse_query(
        tokenize(effective_query),
        Xapian::QueryParser::FLAG_BOOLEAN |
        Xapian::QueryParser::FLAG_PHRASE
    );

    Xapian::Enquire enquire(*db_);
    enquire.set_query(query);

    // Sorting: a value slot when requested, otherwise Xapian's default
    // relevance ordering.
    const bool reverse = (options.sort_order == SortOrder::DESC);
    switch (options.sort_field) {
        case SortField::PATH:
            enquire.set_sort_by_value(kPathSlot, reverse);
            break;
        case SortField::SIZE:
            enquire.set_sort_by_value(kSizeSlot, reverse);
            break;
        case SortField::MODIFIED_TIME:
            enquire.set_sort_by_value(kModifiedTimeSlot, reverse);
            break;
        default:
            break;
    }

    // Paging via offset and limit.
    Xapian::MSet matches = enquire.get_mset(options.offset, options.limit);

    // Highlight keywords come from the caller's original text, not from
    // `effective_query`; otherwise "AND", "type" and the file type would be
    // highlighted too. Whitespace-only tokens are dropped because they carry
    // no meaning as highlight terms.
    std::vector<std::string> segments;
    jieba_->Cut(query_str, segments);
    std::vector<std::string> keywords;
    keywords.reserve(segments.size());
    for (auto& segment : segments) {
        if (!isBlank(segment)) {
            keywords.push_back(std::move(segment));
        }
    }

    Highlighter highlighter;
    std::vector<SearchResult> results;
    results.reserve(matches.size());

    for (auto it = matches.begin(); it != matches.end(); ++it) {
        const Xapian::Document doc = it.get_document();
        const std::string content = doc.get_data();

        SearchResult result;
        result.path = doc.get_value(kPathSlot);
        result.score = it.get_weight();

        // Locate keyword occurrences, then cut a snippet around them.
        auto spans = highlighter.find_keyword_positions(content, keywords);
        result.snippet = highlighter.extract_snippet(content, spans);
        result.positions = std::move(spans);

        results.push_back(std::move(result));
    }

    return results;
}

/// @brief Segments `text` with jieba and joins the pieces with spaces.
/// @param text Text to segment.
/// @return Every segment followed by a single space (so a non-empty result
///         ends with a trailing space).
std::string SearchEngine::tokenize(const std::string& text) {
    std::vector<std::string> words;
    jieba_->Cut(text, words, true);

    std::string result;
    for (const auto& word : words) {
        result += word;
        result += ' ';
    }
    return result;
}

/// @brief Commits pending changes to the Xapian database.
void SearchEngine::commit() {
    db_->commit();
}