diff --git a/index.cpp b/index.cpp new file mode 100644 index 0000000..115978a --- /dev/null +++ b/index.cpp @@ -0,0 +1,88 @@ +use anyhow::Result; +use jieba_rs::Jieba; +use std::path::Path; +use tantivy::{ + collector::TopDocs, + doc, + query::QueryParser, + schema::{Schema, STORED, TEXT}, + Index, IndexWriter, +}; + +pub struct SearchEngine { + index: Index, + writer: IndexWriter, + jieba: Jieba, +} + +impl SearchEngine { + pub fn new(index_path: &Path) -> Result { + let mut schema_builder = Schema::builder(); + + // 定义索引结构 + schema_builder.add_text_field("path", TEXT | STORED); + schema_builder.add_text_field("content", TEXT); + schema_builder.add_text_field("file_type", TEXT | STORED); + schema_builder.add_date_field("modified", STORED); + schema_builder.add_u64_field("size", STORED); + + let schema = schema_builder.build(); + let index = Index::create_in_dir(index_path, schema)?; + let writer = index.writer(50_000_000)?; // 50MB buffer + + Ok(Self { + index, + writer, + jieba: Jieba::new(), + }) + } + + pub async fn add_document(&mut self, path: &Path, content: &str) -> Result<()> { + let schema = self.index.schema(); + let path_field = schema.get_field("path").unwrap(); + let content_field = schema.get_field("content").unwrap(); + + // 使用结巴分词处理内容 + let tokens = self.jieba.cut_for_search(content); + + self.writer.add_document(doc!( + path_field => path.to_string_lossy().to_string(), + content_field => tokens.join(" ") + ))?; + + Ok(()) + } + + pub async fn search(&self, query: &str, limit: usize) -> Result> { + let reader = self.index.reader()?; + let searcher = reader.searcher(); + + let schema = self.index.schema(); + let content_field = schema.get_field("content").unwrap(); + + let query_parser = QueryParser::for_index(&self.index, vec![content_field]); + + // 处理布尔查询 + let query = query_parser.parse_query(query)?; + let top_docs = searcher.search(&query, &TopDocs::with_limit(limit))?; + + let mut results = Vec::new(); + for (_score, doc_address) in top_docs { + let doc = searcher.doc(doc_address)?; + results.push(SearchResult { + path: doc.get_first(schema.get_field("path").unwrap()).unwrap().text().unwrap().to_string(), + snippet: String::new(), // TODO: 实现片段提取 + positions: vec![], // TODO: 实现位置信息 + }); + } + + Ok(results) + } +} + +#[derive(Debug, serde::Serialize)] +pub struct SearchResult { + pub path: String, + pub snippet: String, + pub positions: Vec, +} \ No newline at end of file