diff --git a/search_engine.cpp b/search_engine.cpp new file mode 100644 index 0000000..6e05296 --- /dev/null +++ b/search_engine.cpp @@ -0,0 +1,132 @@ +#include "search_engine.h" +#include +#include + +const char* const DICT_PATH = "dict/jieba.dict.utf8"; +const char* const HMM_PATH = "dict/hmm_model.utf8"; +const char* const USER_DICT_PATH = "dict/user.dict.utf8"; +const char* const IDF_PATH = "dict/idf.utf8"; +const char* const STOP_WORD_PATH = "dict/stop_words.utf8"; + +SearchEngine::SearchEngine(const std::string& db_path) { + // 初始化 Xapian 数据库 + db_ = std::make_unique( + db_path, Xapian::DB_CREATE_OR_OPEN + ); + + term_gen_ = std::make_unique(); + + // 初始化结巴分词 + jieba_ = std::make_unique( + DICT_PATH, + HMM_PATH, + USER_DICT_PATH, + IDF_PATH, + STOP_WORD_PATH + ); +} + +SearchEngine::~SearchEngine() = default; + +void SearchEngine::addDocument(const std::string& path, const std::string& content) { + Xapian::Document doc; + term_gen_->set_document(doc); + + // 设置文档路径 + doc.add_value(1, path); + + // 对内容分词并索引 + std::string tokens = tokenize(content); + term_gen_->index_text(tokens); + + // 存储原始内容用于生成摘要 + doc.set_data(content); + + // 添加到数据库 + db_->add_document(doc); +} + +std::vector SearchEngine::search( + const std::string& query_str, + const SearchOptions& options +) { + std::vector results; + + Xapian::QueryParser qp; + qp.set_database(*db_); + + // 添加文件类型过滤 + if (!options.file_type.empty()) { + std::string type_query = "type:" + options.file_type; + query_str = query_str + " AND " + type_query; + } + + Xapian::Query query = qp.parse_query( + tokenize(query_str), + Xapian::QueryParser::FLAG_BOOLEAN | + Xapian::QueryParser::FLAG_PHRASE + ); + + Xapian::Enquire enquire(*db_); + enquire.set_query(query); + + // 设置排序 + switch (options.sort_field) { + case SortField::PATH: + enquire.set_sort_by_value(1, options.sort_order == SortOrder::DESC); + break; + case SortField::SIZE: + enquire.set_sort_by_value(2, options.sort_order == SortOrder::DESC); + break; + case SortField::MODIFIED_TIME: + enquire.set_sort_by_value(3, options.sort_order == SortOrder::DESC); + break; + default: + // 默认按相关度排序 + break; + } + + // 使用偏移量和限制实现分页 + Xapian::MSet matches = enquire.get_mset( + options.offset, + options.limit + ); + + // 处理结果 + Highlighter highlighter; + std::vector keywords; + jieba_->Cut(query_str, keywords); + + for (auto it = matches.begin(); it != matches.end(); ++it) { + const Xapian::Document& doc = it.get_document(); + std::string content = doc.get_data(); + + SearchResult result; + result.path = doc.get_value(1); + result.score = it.get_weight(); + + // 生成高亮片段 + auto spans = highlighter.find_keyword_positions(content, keywords); + result.snippet = highlighter.extract_snippet(content, spans); + result.positions = spans; + + results.push_back(std::move(result)); + } + + return results; +} + +std::string SearchEngine::tokenize(const std::string& text) { + std::vector words; + jieba_->Cut(text, words, true); + + std::string result; + for (const auto& word : words) { + result += word + " "; + } + return result; +} + +void SearchEngine::commit() { + db_->commit(); +} \ No newline at end of file