|
|
|
|
@ -1,8 +1,3 @@
|
|
|
|
|
# Django博客系统Elasticsearch搜索引擎集成模块
|
|
|
|
|
# 功能:基于Haystack框架封装Elasticsearch全文检索能力,专为博客文章搜索设计
|
|
|
|
|
# 核心能力包括:文章索引的创建/更新/删除、标题+正文全文搜索、搜索推荐词生成、搜索结果格式化
|
|
|
|
|
# 适配Haystack接口规范,可直接集成到Django项目,提供高效、可扩展的文章检索服务
|
|
|
|
|
|
|
|
|
|
from django.utils.encoding import force_str
|
|
|
|
|
from elasticsearch_dsl import Q
|
|
|
|
|
from haystack.backends import BaseEngine, BaseSearchBackend, BaseSearchQuery, log_query
|
|
|
|
|
from haystack.forms import ModelSearchForm
|
|
|
|
|
from haystack.models import SearchResult
|
|
|
|
|
from haystack.utils import log as logging
|
|
|
|
|
|
|
|
|
|
# 导入文章Elasticsearch文档定义、文档管理器及文章模型
|
|
|
|
|
from blog.documents import ArticleDocument, ArticleDocumentManager
|
|
|
|
|
from blog.models import Article
|
|
|
|
|
|
|
|
|
|
# 初始化日志记录器,记录搜索过程中的关键操作和异常
|
|
|
|
|
# Module-level logger named after this module, obtained through Haystack's
# logging shim (imported above as ``logging``).
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ElasticSearchBackend(BaseSearchBackend):
    """Haystack search backend that stores and queries blog articles in
    Elasticsearch.

    Index maintenance is delegated to ``ArticleDocumentManager``; queries are
    built with ``ArticleDocument.search()``.
    """

    def __init__(self, connection_alias, **connection_options):
        """Initialise the backend.

        :param connection_alias: Haystack connection alias, forwarded to the
            base class.
        :param connection_options: extra connection options, forwarded
            unchanged to the base class.
        """
        super().__init__(connection_alias, **connection_options)
        # Manager object that performs the actual index operations.
        self.manager = ArticleDocumentManager()
        # Flag read by Haystack to enable spelling suggestions.
        self.include_spelling = True

    def _get_models(self, iterable):
        """Convert model instances into index documents.

        :param iterable: sequence of model instances. When it is empty, or
            its first element is falsy (e.g. ``[None]``), every ``Article``
            is used instead.
        :return: the result of ``self.manager.convert_to_doc`` for the
            selected models.
        """
        models = iterable if iterable and iterable[0] else Article.objects.all()
        return self.manager.convert_to_doc(models)

    def _create(self, models):
        """Create the index and rebuild it from ``models``.

        :param models: model instances to index (falls back to all articles,
            see ``_get_models``).
        """
        self.manager.create_index()
        docs = self._get_models(models)
        self.manager.rebuild(docs)

    def _delete(self, models):
        """Call ``delete()`` on every given object.

        :param models: objects to delete; ``remove`` passes the documents
            produced by ``_get_models``.
        :return: always ``True``.
        """
        for m in models:
            m.delete()
        return True

    def _rebuild(self, models):
        """Update the index documents for ``models``.

        :param models: model instances to update; all articles when falsy.
        """
        models = models if models else Article.objects.all()
        docs = self.manager.convert_to_doc(models)
        self.manager.update_docs(docs)

    def update(self, index, iterable, commit=True):
        """Haystack interface: add or update documents in the index.

        :param index: unused by this implementation.
        :param iterable: model instances to (re)index.
        :param commit: unused by this implementation.
        """
        docs = self._get_models(iterable)
        self.manager.update_docs(docs)

    def remove(self, obj_or_string):
        """Haystack interface: remove an object from the index.

        :param obj_or_string: object to remove. It is wrapped in a list and
            converted via ``_get_models``; a falsy value therefore selects
            every article.
            NOTE(review): whether ``convert_to_doc`` accepts identifier
            strings is not visible here - confirm.
        """
        docs = self._get_models([obj_or_string])
        self._delete(docs)

    def clear(self, models=None, commit=True):
        """Haystack interface: clear the index.

        :param models: unused by this implementation.
        :param commit: unused by this implementation.
        """
        # Passing None makes _get_models fall back to all articles, so every
        # article document is deleted.
        self.remove(None)

    @staticmethod
    def get_suggestion(query: str) -> str:
        """Return suggested search terms for ``query``.

        Runs a term suggester on the ``body`` field. For each suggestion
        entry the first option is used; entries without options keep their
        original text.

        :param query: the raw search string.
        :return: the chosen terms joined by single spaces.
        """
        response = (
            ArticleDocument.search()
            .query("match", body=query)
            .suggest('suggest_search', query, term={'field': 'body'})
            .execute()
        )

        keywords = []
        for suggest in response.suggest.suggest_search:
            if suggest["options"]:
                keywords.append(suggest["options"][0]["text"])
            else:
                keywords.append(suggest["text"])

        return ' '.join(keywords)

    @log_query
    def search(self, query_string, **kwargs):
        """Haystack interface: execute a search.

        :param query_string: the search string.
        :param kwargs: extra options; only ``start_offset`` and
            ``end_offset`` are read (used to slice the result set).
        :return: dict with ``results`` (list of ``SearchResult``), ``hits``,
            ``facets`` (always empty) and ``spelling_suggestion`` (``None``
            when the suggestion equals ``query_string``).
        """
        logger.info('search query_string:%s', query_string)

        start_offset = kwargs.get('start_offset')
        end_offset = kwargs.get('end_offset')

        # ``is_suggest`` is set on the backend by ElasticSearchModelSearchForm;
        # when absent or falsy the raw query string is searched as-is.
        if getattr(self, "is_suggest", None):
            suggestion = self.get_suggestion(query_string)
        else:
            suggestion = query_string

        # Match the term against body or title.
        q = Q('bool',
              should=[Q('match', body=suggestion), Q('match', title=suggestion)],
              minimum_should_match="70%")

        # presumably status 'p' = published and type 'a' = article - confirm
        # against the Article model. source(False) skips document bodies;
        # only ids and scores are needed below.
        search = ArticleDocument.search() \
            .query('bool', filter=[q]) \
            .filter('term', status='p') \
            .filter('term', type='a') \
            .source(False)[start_offset: end_offset]

        results = search.execute()
        # NOTE(review): on newer Elasticsearch versions ``total`` may be an
        # object rather than a plain int - confirm against the deployed stack.
        hits = results['hits'].total
        raw_results = []

        for raw_result in results['hits']['hits']:
            app_label = 'blog'
            model_name = 'Article'
            additional_fields = {}

            result = SearchResult(
                app_label,
                model_name,
                raw_result['_id'],
                raw_result['_score'],
                **additional_fields)
            raw_results.append(result)

        facets = {}
        spelling_suggestion = None if query_string == suggestion else suggestion

        return {
            'results': raw_results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ElasticSearchQuery(BaseSearchQuery):
    """Haystack query class used with ``ElasticSearchBackend``.

    Sanitises user input and passes the raw query string through to the
    backend.
    """

    def _convert_datetime(self, date):
        """Format a date or datetime as a ``YYYYMMDDHHMMSS`` string.

        :param date: object with ``strftime``; objects without an ``hour``
            attribute (plain dates) get ``000000`` as the time part.
        :return: the formatted string.
        """
        if hasattr(date, 'hour'):
            return force_str(date.strftime('%Y%m%d%H%M%S'))
        return force_str(date.strftime('%Y%m%d000000'))

    def clean(self, query_fragment):
        """Sanitise user input before it is handed to the backend.

        Words listed in ``self.backend.RESERVED_WORDS`` are lower-cased and
        words containing any of ``self.backend.RESERVED_CHARACTERS`` are
        wrapped in single quotes.

        NOTE(review): the ``break`` and ``cleaned_words.append(word)`` lines
        were not visible in the reviewed text (hidden between diff hunks) and
        were restored from upstream Haystack's ``clean()`` - confirm.

        :param query_fragment: raw query text.
        :return: the cleaned words joined by single spaces.
        """
        cleaned_words = []

        for word in query_fragment.split():
            if word in self.backend.RESERVED_WORDS:
                word = word.lower()

            for char in self.backend.RESERVED_CHARACTERS:
                if char in word:
                    word = "'%s'" % word
                    # Quote once only, even if several reserved characters
                    # occur in the same word.
                    break

            cleaned_words.append(word)

        return ' '.join(cleaned_words)

    def build_query_fragment(self, field, filter_type, value):
        """Return the raw query string carried by ``value``.

        :param field: unused.
        :param filter_type: unused.
        :param value: object exposing a ``query_string`` attribute.
        :return: ``value.query_string``.
        """
        return value.query_string

    def get_count(self):
        """Return the number of results from ``get_results()`` (0 if none)."""
        results = self.get_results()
        return len(results) if results else 0

    def get_spelling_suggestion(self, preferred_query=None):
        """Return the stored spelling suggestion.

        :param preferred_query: unused.
        :return: ``self._spelling_suggestion``.
        """
        return self._spelling_suggestion

    def build_params(self, spelling_query=None):
        """Build the backend search parameters via the base class.

        :param spelling_query: forwarded to the base implementation.
        :return: the parameter dict produced by the base class.
        """
        return super().build_params(spelling_query=spelling_query)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ElasticSearchModelSearchForm(ModelSearchForm):
    """Search form that lets the request toggle suggestion-based searching."""

    def search(self):
        """Run the search after setting the backend's suggestion switch.

        The backend's ``is_suggest`` flag is enabled unless the submitted
        ``is_suggest`` value is exactly ``"no"``.

        :return: the result of the parent form's ``search()``.
        """
        self.searchqueryset.query.backend.is_suggest = self.data.get("is_suggest") != "no"
        return super().search()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ElasticSearchEngine(BaseEngine):
    """Haystack engine entry point pairing the Elasticsearch backend with
    its query class."""

    backend = ElasticSearchBackend
    query = ElasticSearchQuery
|