parent 006471b2e2
commit d8e31033fc
@@ -0,0 +1,88 @@
use anyhow::Result;
use jieba_rs::Jieba;
use std::path::Path;
use tantivy::{
    collector::TopDocs,
    doc,
    query::QueryParser,
    schema::{Schema, STORED, TEXT},
    Index, IndexWriter,
};

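/// Full-text search engine: tantivy handles indexing and querying,
/// while jieba segments Chinese text before it is indexed.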
pub struct SearchEngine {
    index: Index,
    writer: IndexWriter,
    jieba: Jieba,
}

impl SearchEngine {
    pub fn new(index_path: &Path) -> Result<Self> {
        let mut schema_builder = Schema::builder();

        // Define the index schema
        schema_builder.add_text_field("path", TEXT | STORED);
        schema_builder.add_text_field("content", TEXT);
        schema_builder.add_text_field("file_type", TEXT | STORED);
        schema_builder.add_date_field("modified", STORED);
        schema_builder.add_u64_field("size", STORED);

        let schema = schema_builder.build();
        // create_in_dir errors if an index already exists in this
        // directory, so callers must supply a fresh one
        let index = Index::create_in_dir(index_path, schema)?;
        let writer = index.writer(50_000_000)?; // 50 MB indexing buffer

        Ok(Self {
            index,
            writer,
            jieba: Jieba::new(),
        })
    }

    pub async fn add_document(&mut self, path: &Path, content: &str) -> Result<()> {
        let schema = self.index.schema();
        let path_field = schema.get_field("path").unwrap();
        let content_field = schema.get_field("content").unwrap();

        // Segment the content with jieba (search-engine mode) and re-join
        // with spaces, so tantivy's default tokenizer indexes Chinese
        // text word by word
        let tokens = self.jieba.cut_for_search(content);

        self.writer.add_document(doc!(
            path_field => path.to_string_lossy().to_string(),
            content_field => tokens.join(" ")
        ))?;

        // The document only becomes visible to searches after commit()
        Ok(())
    }

    pub async fn search(&self, query: &str, limit: usize) -> Result<Vec<SearchResult>> {
        let reader = self.index.reader()?;
        let searcher = reader.searcher();

        let schema = self.index.schema();
        let content_field = schema.get_field("content").unwrap();
        let path_field = schema.get_field("path").unwrap();

        let query_parser = QueryParser::for_index(&self.index, vec![content_field]);

        // Parse the query string; boolean operators (e.g. AND, OR) are
        // handled by tantivy's query parser
        let query = query_parser.parse_query(query)?;
        let top_docs = searcher.search(&query, &TopDocs::with_limit(limit))?;

        let mut results = Vec::new();
        for (_score, doc_address) in top_docs {
            let doc = searcher.doc(doc_address)?;
            let path = doc
                .get_first(path_field)
                .and_then(|value| value.text())
                .unwrap_or_default()
                .to_string();
            results.push(SearchResult {
                path,
                snippet: String::new(), // TODO: implement snippet extraction
                positions: vec![],      // TODO: implement position information
            });
        }

        Ok(results)
    }
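
    /// Flush pending additions so they become visible to searches.
    /// Note: nothing else in this file calls IndexWriter::commit, and
    /// uncommitted documents are never returned by search; callers
    /// should batch several add_document calls, then commit once.
    pub fn commit(&mut self) -> Result<()> {
        self.writer.commit()?;
        Ok(())
    }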
}

#[derive(Debug, serde::Serialize)]
pub struct SearchResult {
    pub path: String,
    pub snippet: String,
    pub positions: Vec<usize>,
}
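
// A minimal usage sketch, not part of the original commit: it assumes
// the crate also depends on tokio (for the async test runtime) and
// tempfile (for a scratch index directory).
#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn index_and_search() -> Result<()> {
        let dir = tempfile::tempdir()?;
        let mut engine = SearchEngine::new(dir.path())?;

        engine
            .add_document(Path::new("/notes/rust.md"), "Rust 是一门系统编程语言")
            .await?;
        engine.commit()?; // nothing is searchable until this runs

        // Latin-script terms round-trip reliably; Chinese queries would
        // also need jieba segmentation before parsing, since the default
        // query tokenizer does not split CJK text the way jieba does.
        let results = engine.search("Rust", 10).await?;
        assert!(!results.is_empty());
        Ok(())
    }
}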