parent
006471b2e2
commit
d8e31033fc
@ -0,0 +1,88 @@
|
|||||||
|
use anyhow::Result;
|
||||||
|
use jieba_rs::Jieba;
|
||||||
|
use std::path::Path;
|
||||||
|
use tantivy::{
|
||||||
|
collector::TopDocs,
|
||||||
|
doc,
|
||||||
|
query::QueryParser,
|
||||||
|
schema::{Schema, STORED, TEXT},
|
||||||
|
Index, IndexWriter,
|
||||||
|
};
|
||||||
|
|
||||||
|
pub struct SearchEngine {
|
||||||
|
index: Index,
|
||||||
|
writer: IndexWriter,
|
||||||
|
jieba: Jieba,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SearchEngine {
|
||||||
|
pub fn new(index_path: &Path) -> Result<Self> {
|
||||||
|
let mut schema_builder = Schema::builder();
|
||||||
|
|
||||||
|
// 定义索引结构
|
||||||
|
schema_builder.add_text_field("path", TEXT | STORED);
|
||||||
|
schema_builder.add_text_field("content", TEXT);
|
||||||
|
schema_builder.add_text_field("file_type", TEXT | STORED);
|
||||||
|
schema_builder.add_date_field("modified", STORED);
|
||||||
|
schema_builder.add_u64_field("size", STORED);
|
||||||
|
|
||||||
|
let schema = schema_builder.build();
|
||||||
|
let index = Index::create_in_dir(index_path, schema)?;
|
||||||
|
let writer = index.writer(50_000_000)?; // 50MB buffer
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
index,
|
||||||
|
writer,
|
||||||
|
jieba: Jieba::new(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn add_document(&mut self, path: &Path, content: &str) -> Result<()> {
|
||||||
|
let schema = self.index.schema();
|
||||||
|
let path_field = schema.get_field("path").unwrap();
|
||||||
|
let content_field = schema.get_field("content").unwrap();
|
||||||
|
|
||||||
|
// 使用结巴分词处理内容
|
||||||
|
let tokens = self.jieba.cut_for_search(content);
|
||||||
|
|
||||||
|
self.writer.add_document(doc!(
|
||||||
|
path_field => path.to_string_lossy().to_string(),
|
||||||
|
content_field => tokens.join(" ")
|
||||||
|
))?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn search(&self, query: &str, limit: usize) -> Result<Vec<SearchResult>> {
|
||||||
|
let reader = self.index.reader()?;
|
||||||
|
let searcher = reader.searcher();
|
||||||
|
|
||||||
|
let schema = self.index.schema();
|
||||||
|
let content_field = schema.get_field("content").unwrap();
|
||||||
|
|
||||||
|
let query_parser = QueryParser::for_index(&self.index, vec![content_field]);
|
||||||
|
|
||||||
|
// 处理布尔查询
|
||||||
|
let query = query_parser.parse_query(query)?;
|
||||||
|
let top_docs = searcher.search(&query, &TopDocs::with_limit(limit))?;
|
||||||
|
|
||||||
|
let mut results = Vec::new();
|
||||||
|
for (_score, doc_address) in top_docs {
|
||||||
|
let doc = searcher.doc(doc_address)?;
|
||||||
|
results.push(SearchResult {
|
||||||
|
path: doc.get_first(schema.get_field("path").unwrap()).unwrap().text().unwrap().to_string(),
|
||||||
|
snippet: String::new(), // TODO: 实现片段提取
|
||||||
|
positions: vec![], // TODO: 实现位置信息
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(results)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A single hit returned by `SearchEngine::search`.
#[derive(Debug, serde::Serialize)]
pub struct SearchResult {
    // Stored path of the matching document, as indexed.
    pub path: String,
    // Excerpt around the match — currently always empty (extraction is a
    // TODO in `search`).
    pub snippet: String,
    // Match positions — currently always empty (also a TODO in `search`).
    pub positions: Vec<usize>,
}
|
Loading…
Reference in new issue