|
|
|
|
@ -1,150 +1,184 @@
|
|
|
|
|
# 导入 Django 字符串处理工具:确保字符串编码兼容
|
|
|
|
|
from django.utils.encoding import force_str
|
|
|
|
|
# 导入 Elasticsearch DSL 工具:构建 Elasticsearch 查询语句
|
|
|
|
|
from elasticsearch_dsl import Q
|
|
|
|
|
# 导入 Haystack 核心类:实现自定义搜索后端、查询和引擎
|
|
|
|
|
from haystack.backends import BaseEngine, BaseSearchBackend, BaseSearchQuery, log_query
|
|
|
|
|
from haystack.forms import ModelSearchForm
|
|
|
|
|
from haystack.models import SearchResult
|
|
|
|
|
from haystack.utils import log as logging
|
|
|
|
|
from haystack.forms import ModelSearchForm # Haystack 基础搜索表单
|
|
|
|
|
from haystack.models import SearchResult # Haystack 搜索结果封装类
|
|
|
|
|
from haystack.utils import log as logging # Haystack 日志工具
|
|
|
|
|
|
|
|
|
|
# 导入项目自定义的 Elasticsearch 文档和管理器:关联博客文章模型
|
|
|
|
|
from blog.documents import ArticleDocument, ArticleDocumentManager
|
|
|
|
|
from blog.models import Article
|
|
|
|
|
from blog.models import Article # 博客核心文章模型
|
|
|
|
|
|
|
|
|
|
# 初始化日志对象:记录搜索相关日志(如查询语句、错误信息)
|
|
|
|
|
# Module-level logger obtained through Haystack's logging helper; the search
# backend below uses it to record incoming query strings.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 自定义 Elasticsearch 搜索后端:实现 Haystack 与 Elasticsearch 的底层交互
|
|
|
|
|
class ElasticSearchBackend(BaseSearchBackend):
    """Haystack search backend that keeps blog articles in Elasticsearch.

    Index maintenance is delegated to ``ArticleDocumentManager``; queries are
    issued through ``ArticleDocument.search()``.
    """

    def __init__(self, connection_alias, **connection_options):
        """Initialise the base backend and the article document manager.

        Args:
            connection_alias: Connection alias, forwarded to the base class.
            **connection_options: Extra options, forwarded unchanged.
        """
        super().__init__(connection_alias, **connection_options)
        # Every index write (create / update / rebuild) goes through this manager.
        self.manager = ArticleDocumentManager()
        # Enable spelling suggestions for this backend.
        self.include_spelling = True

    def _get_models(self, iterable):
        """Convert model instances into Elasticsearch documents.

        Falls back to every ``Article`` when ``iterable`` is empty or its
        first element is falsy (e.g. the ``[None]`` produced by ``clear``).

        Args:
            iterable: Indexable collection of model instances (may be empty).

        Returns:
            Whatever ``ArticleDocumentManager.convert_to_doc`` returns for the
            selected models.
        """
        models = iterable if iterable and iterable[0] else Article.objects.all()
        return self.manager.convert_to_doc(models)

    def _create(self, models):
        """Create the index and load it with documents built from ``models``."""
        self.manager.create_index()
        docs = self._get_models(models)
        self.manager.rebuild(docs)

    def _delete(self, models):
        """Call ``delete()`` on every item of ``models`` and return ``True``."""
        for m in models:
            m.delete()
        return True

    def _rebuild(self, models):
        """Push documents for ``models`` (or for all articles) to the index."""
        models = models if models else Article.objects.all()
        docs = self._get_models(models)
        self.manager.update_docs(docs)

    def update(self, index, iterable, commit=True):
        """Update the documents that correspond to ``iterable``.

        ``index`` and ``commit`` are accepted for Haystack API compatibility
        and are not used by this implementation.
        """
        docs = self._get_models(iterable)
        self.manager.update_docs(docs)

    def remove(self, obj_or_string, commit=True):
        """Remove the document(s) that correspond to ``obj_or_string``.

        Args:
            obj_or_string: Object to remove; a falsy value removes the
                documents of every article (see ``_get_models``).
            commit: Accepted so that callers passing ``commit=...`` do not
                fail with ``TypeError``; not used by this implementation.
        """
        docs = self._get_models([obj_or_string])
        self._delete(docs)

    def clear(self, models=None, commit=True):
        """Remove every article document from the index.

        ``models`` and ``commit`` are accepted for API compatibility only.
        """
        self.remove(None)

    @staticmethod
    def get_suggestion(query: str) -> str:
        """Return suggested search terms for ``query``.

        Runs a ``match`` query on the ``body`` field together with a term
        suggester. For each suggestion entry the first option is used; when
        an entry has no options its original text is kept.

        Args:
            query: Raw search text.

        Returns:
            The chosen terms joined by single spaces.
        """
        search = (
            ArticleDocument.search()
            .query("match", body=query)
            .suggest('suggest_search', query, term={'field': 'body'})
            .execute()
        )

        keywords = []
        for suggest in search.suggest.suggest_search:
            if suggest["options"]:
                keywords.append(suggest["options"][0]["text"])
            else:
                keywords.append(suggest["text"])

        return ' '.join(keywords)

    @log_query
    def search(self, query_string, **kwargs):
        """Run a search and return the result dictionary.

        Args:
            query_string: Text to search for.
            **kwargs: ``start_offset`` (default ``0``) and ``end_offset``
                (default ``None``) select the slice of results to fetch;
                other keys are ignored.

        Returns:
            dict with keys ``results`` (list of ``SearchResult``), ``hits``
            (total match count), ``facets`` (always empty) and
            ``spelling_suggestion`` (the suggested query when it differs from
            ``query_string``, otherwise ``None``).
        """
        # Lazy %-formatting: no eager concatenation, and no TypeError when
        # query_string is not a str.
        logger.info('search query_string:%s', query_string)

        start_offset = kwargs.get('start_offset', 0)
        end_offset = kwargs.get('end_offset')

        # "is_suggest" is set externally (by the search form in this module);
        # when it is absent or falsy the raw query is used as-is.
        if getattr(self, "is_suggest", None):
            suggestion = self.get_suggestion(query_string)
        else:
            suggestion = query_string

        # Match the suggestion against body or title.
        q = Q(
            'bool',
            should=[Q('match', body=suggestion), Q('match', title=suggestion)],
            minimum_should_match="70%",
        )

        # Restrict to status 'p' and type 'a', skip the document source
        # (only id and score are read below) and apply the requested slice.
        search = (
            ArticleDocument.search()
            .query('bool', filter=[q])
            .filter('term', status='p')
            .filter('term', type='a')
            .source(False)[start_offset:end_offset]
        )

        results = search.execute()

        # NOTE(review): depending on the Elasticsearch version, ``total`` is
        # either a plain int or an object exposing ``.value`` - confirm against
        # the deployed version. Both shapes are normalised to the count here.
        total = results['hits'].total
        hits = getattr(total, 'value', total)

        raw_results = [
            SearchResult('blog', 'Article', raw_result['_id'], raw_result['_score'])
            for raw_result in results['hits']['hits']
        ]

        facets = {}
        spelling_suggestion = None if query_string == suggestion else suggestion

        return {
            'results': raw_results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 自定义 Elasticsearch 查询类:处理查询参数解析、格式清洗等
|
|
|
|
|
class ElasticSearchQuery(BaseSearchQuery):
    """Search query class paired with ``ElasticSearchBackend``."""

    def _convert_datetime(self, date):
        """Format a date or datetime as a ``YYYYmmddHHMMSS`` string.

        Values without an ``hour`` attribute (plain dates) get ``000000`` as
        the time part.
        """
        if hasattr(date, 'hour'):
            return force_str(date.strftime('%Y%m%d%H%M%S'))
        return force_str(date.strftime('%Y%m%d000000'))

    def clean(self, query_fragment):
        """Sanitise user input before it is handed to the backend.

        Words listed in ``self.backend.RESERVED_WORDS`` are lower-cased, and
        any word containing one of ``self.backend.RESERVED_CHARACTERS`` is
        wrapped in single quotes.

        Args:
            query_fragment: Raw query text.

        Returns:
            The cleaned words joined by single spaces.
        """
        reserved_words = self.backend.RESERVED_WORDS
        reserved_chars = self.backend.RESERVED_CHARACTERS

        cleaned_words = []
        for word in query_fragment.split():
            if word in reserved_words:
                word = word.lower()

            # Quote at most once, however many reserved characters the word
            # contains (a per-character loop without an early exit would
            # wrap the word repeatedly).
            if any(char in word for char in reserved_chars):
                word = "'%s'" % word

            cleaned_words.append(word)

        return ' '.join(cleaned_words)

    def build_query_fragment(self, field, filter_type, value):
        """Return ``value.query_string``; ``field`` and ``filter_type`` are ignored."""
        return value.query_string

    def get_count(self):
        """Return the number of results from ``get_results()``, or 0 if none."""
        results = self.get_results()
        return len(results) if results else 0

    def get_spelling_suggestion(self, preferred_query=None):
        """Return the stored spelling suggestion; ``preferred_query`` is ignored."""
        return self._spelling_suggestion

    def build_params(self, spelling_query=None):
        """Return the base class's search parameters unchanged."""
        return super().build_params(spelling_query=spelling_query)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 自定义搜索表单:扩展 Haystack 基础表单,支持“是否启用拼写建议”的控制
|
|
|
|
|
class ElasticSearchModelSearchForm(ModelSearchForm):
    """Model search form that lets the request toggle suggestion-based search."""

    def search(self):
        """Record the ``is_suggest`` preference on the backend, then search.

        Suggestions stay enabled unless the submitted ``is_suggest`` value is
        exactly ``"no"``.

        Returns:
            The result of the parent form's ``search()``.
        """
        suggest_enabled = self.data.get("is_suggest") != "no"
        backend = self.searchqueryset.query.backend
        backend.is_suggest = suggest_enabled
        return super().search()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 自定义 Elasticsearch 搜索引擎:关联后端和查询类,供 Haystack 调用
|
|
|
|
|
class ElasticSearchEngine(BaseEngine):
    """Haystack engine that pairs the custom backend with its query class."""

    # Query class used to build and clean searches.
    query = ElasticSearchQuery
    # Backend class that talks to Elasticsearch.
    backend = ElasticSearchBackend
|