Update elasticsearch_backend.py

6 months ago · f526a8d64a
parent 2d5359b2c7
commit f526a8d64a
1 changed file with 96 additions and 55 deletions
--- a/src/DjangoBlog-master/djangoblog/elasticsearch_backend.py
+++ b/src/DjangoBlog-master/djangoblog/elasticsearch_backend.py
@@ -1,150 +1,184 @@
+# 导入 Django 字符串处理工具：确保字符串编码兼容
 from django.utils.encoding import force_str
+# 导入 Elasticsearch DSL 工具：构建 Elasticsearch 查询语句
 from elasticsearch_dsl import Q
+# 导入 Haystack 核心类：实现自定义搜索后端、查询和引擎
 from haystack.backends import BaseEngine, BaseSearchBackend, BaseSearchQuery, log_query
-from haystack.forms import ModelSearchForm
-from haystack.models import SearchResult
-from haystack.utils import log as logging
+from haystack.forms import ModelSearchForm  # Haystack 基础搜索表单
+from haystack.models import SearchResult  # Haystack 搜索结果封装类
+from haystack.utils import log as logging  # Haystack 日志工具

+# 导入项目自定义的 Elasticsearch 文档和管理器：关联博客文章模型
 from blog.documents import ArticleDocument, ArticleDocumentManager
-from blog.models import Article
+from blog.models import Article  # 博客核心文章模型

+# 初始化日志对象：记录搜索相关日志（如查询语句、错误信息）
 logger = logging.getLogger(__name__)


+# 自定义 Elasticsearch 搜索后端：实现 Haystack 与 Elasticsearch 的底层交互
 class ElasticSearchBackend(BaseSearchBackend):
    def __init__(self, connection_alias, **connection_options):
+        # 调用父类构造方法，初始化 Haystack 基础搜索后端
        super(
            ElasticSearchBackend,
            self).__init__(
            connection_alias,
            **connection_options)
+        # 初始化文章文档管理器：负责 Elasticsearch 索引的创建、更新、删除
        self.manager = ArticleDocumentManager()
+        # 启用拼写建议功能：用于返回搜索关键词的推荐词
        self.include_spelling = True

+    # 辅助方法：将模型实例转换为 Elasticsearch 文档（Document）
    def _get_models(self, iterable):
+        # 若传入空列表，默认获取所有文章；否则使用传入的模型实例
        models = iterable if iterable and iterable[0] else Article.objects.all()
+        # 通过文档管理器将模型转换为 Elasticsearch 可识别的文档
        docs = self.manager.convert_to_doc(models)
        return docs

+    # 初始化索引：创建 Elasticsearch 索引并批量添加文档
    def _create(self, models):
-        self.manager.create_index()
-        docs = self._get_models(models)
-        self.manager.rebuild(docs)
+        self.manager.create_index()  # 创建 Elasticsearch 索引结构
+        docs = self._get_models(models)  # 转换模型为文档
+        self.manager.rebuild(docs)  # 批量写入文档到索引

+    # 删除索引中的文档：根据模型实例删除对应 Elasticsearch 记录
    def _delete(self, models):
        for m in models:
-            m.delete()
+            m.delete()  # 调用文档的 delete 方法，删除 Elasticsearch 中的对应记录
        return True

+    # 重建索引：全量更新 Elasticsearch 中的文档（覆盖旧数据）
    def _rebuild(self, models):
+        # 若未指定模型，默认获取所有文章
        models = models if models else Article.objects.all()
-        docs = self.manager.convert_to_doc(models)
-        self.manager.update_docs(docs)
+        docs = self._get_models(models)  # 转换模型为文档
+        self.manager.update_docs(docs)  # 批量更新文档到索引

+    # Haystack 标准方法：增量更新索引（更新指定模型对应的文档）
    def update(self, index, iterable, commit=True):
+        models = self._get_models(iterable)  # 转换模型为文档
+        self.manager.update_docs(models)  # 增量更新文档

-        models = self._get_models(iterable)
-        self.manager.update_docs(models)
-
+    # Haystack 标准方法：移除单个模型对应的索引记录
    def remove(self, obj_or_string):
-        models = self._get_models([obj_or_string])
-        self._delete(models)
+        models = self._get_models([obj_or_string])  # 转换为文档
+        self._delete(models)  # 删除文档

+    # Haystack 标准方法：清空索引（删除所有相关记录）
    def clear(self, models=None, commit=True):
-        self.remove(None)
+        self.remove(None)  # 调用 remove 方法清空索引

    @staticmethod
    def get_suggestion(query: str) -> str:
-        """获取推荐词, 如果没有找到添加原搜索词"""
-
+        """
+        生成搜索关键词的推荐词（基于 Elasticsearch 拼写建议功能）
+        若未找到推荐词，返回原查询词
+        """
+        # 构建 Elasticsearch 查询：匹配文章内容，并启用拼写建议
        search = ArticleDocument.search() \
            .query("match", body=query) \
            .suggest('suggest_search', query, term={'field': 'body'}) \
-            .execute()
+            .execute()  # 执行查询

        keywords = []
+        # 提取 Elasticsearch 返回的建议词
        for suggest in search.suggest.suggest_search:
-            if suggest["options"]:
+            if suggest["options"]:  # 若有推荐词，取第一个
                keywords.append(suggest["options"][0]["text"])
-            else:
+            else:  # 若无推荐词，保留原查询词
                keywords.append(suggest["text"])

-        return ' '.join(keywords)
+        return ' '.join(keywords)  # 拼接推荐词为字符串返回

+    # Haystack 核心搜索方法：执行搜索并返回结果（带日志记录装饰器）
    @log_query
    def search(self, query_string, **kwargs):
-        logger.info('search query_string:' + query_string)
+        logger.info('search query_string:' + query_string)  # 记录查询关键词

-        start_offset = kwargs.get('start_offset')
-        end_offset = kwargs.get('end_offset')
+        # 获取分页参数：起始偏移量和结束偏移量（用于分页）
+        start_offset = kwargs.get('start_offset', 0)
+        end_offset = kwargs.get('end_offset')  # 若为 None，Elasticsearch 会返回默认数量结果

-        # 推荐词搜索
+        # 生成推荐词：根据 is_suggest 标识判断是否需要拼写建议
        if getattr(self, "is_suggest", None):
            suggestion = self.get_suggestion(query_string)
        else:
-            suggestion = query_string
+            suggestion = query_string  # 不需要建议则使用原查询词

+        # 构建 Elasticsearch 查询条件（布尔查询）
+        # 1. 匹配条件：标题或内容包含推荐词，匹配度最低 70%
        q = Q('bool',
              should=[Q('match', body=suggestion), Q('match', title=suggestion)],
              minimum_should_match="70%")

+        # 构建完整搜索请求：
+        # - 过滤条件：使用上面的 q 匹配结果，且文章状态为“已发布”（status='p'）、类型为“文章”（type='a'）
+        # - 不返回文档源数据（source=False）：仅获取 ID 和得分，减少数据传输
+        # - 分页：按 start_offset 和 end_offset 截取结果
        search = ArticleDocument.search() \
                     .query('bool', filter=[q]) \
                     .filter('term', status='p') \
                     .filter('term', type='a') \
                     .source(False)[start_offset: end_offset]

+        # 执行搜索，获取 Elasticsearch 返回结果
        results = search.execute()
-        hits = results['hits'].total
-        raw_results = []
+        hits = results['hits'].total  # 匹配到的总结果数
+        raw_results = []  # 存储 Haystack 标准格式的搜索结果
+
+        # 解析 Elasticsearch 原始结果，封装为 Haystack 的 SearchResult 格式
        for raw_result in results['hits']['hits']:
-            app_label = 'blog'
-            model_name = 'Article'
-            additional_fields = {}
+            app_label = 'blog'  # 模型所属应用
+            model_name = 'Article'  # 模型名称
+            additional_fields = {}  # 额外字段（此处无额外信息，留空）

+            # 实例化 SearchResult：封装应用名、模型名、文档ID、匹配得分等信息
            result_class = SearchResult
-
            result = result_class(
                app_label,
                model_name,
-                raw_result['_id'],
-                raw_result['_score'],
+                raw_result['_id'],  # Elasticsearch 中文档的 ID
+                raw_result['_score'],  # 匹配得分（用于排序）
                **additional_fields)
            raw_results.append(result)
+
+        # 搜索结果元数据：分面（无分面需求，留空）、拼写建议
        facets = {}
+        # 若推荐词与原查询词不同，返回推荐词；否则为 None
        spelling_suggestion = None if query_string == suggestion else suggestion

+        # 返回 Haystack 标准格式的搜索结果
        return {
-            'results': raw_results,
-            'hits': hits,
-            'facets': facets,
-            'spelling_suggestion': spelling_suggestion,
+            'results': raw_results,  # 封装后的搜索结果列表
+            'hits': hits,  # 总匹配数
+            'facets': facets,  # 分面数据（空）
+            'spelling_suggestion': spelling_suggestion,  # 拼写建议
        }


+# 自定义 Elasticsearch 查询类：处理查询参数解析、格式清洗等
 class ElasticSearchQuery(BaseSearchQuery):
+    # 转换日期格式：适配 Elasticsearch 的日期查询需求
    def _convert_datetime(self, date):
-        if hasattr(date, 'hour'):
+        if hasattr(date, 'hour'):  # 若为datetime（含时分秒），格式化为年月日时分秒
            return force_str(date.strftime('%Y%m%d%H%M%S'))
-        else:
+        else:  # 若为date（仅年月日），补全时分秒为000000
            return force_str(date.strftime('%Y%m%d000000'))

+    # 清洗查询词：处理 Haystack 保留词和特殊字符，避免查询语法错误
    def clean(self, query_fragment):
-        """
-        Provides a mechanism for sanitizing user input before presenting the
-        value to the backend.
-
-        Whoosh 1.X differs here in that you can no longer use a backslash
-        to escape reserved characters. Instead, the whole word should be
-        quoted.
-        """
-        words = query_fragment.split()
+        words = query_fragment.split()  # 拆分查询词为单词列表
        cleaned_words = []

        for word in words:
+            # 处理 Haystack 保留词（如 AND、OR），转为小写（避免语法冲突）
            if word in self.backend.RESERVED_WORDS:
                word = word.replace(word, word.lower())

+            # 处理特殊字符（如 +、-、*）：包含特殊字符的单词用引号包裹
            for char in self.backend.RESERVED_CHARACTERS:
                if char in word:
                    word = "'%s'" % word
@@ -152,32 +186,39 @@ class ElasticSearchQuery(BaseSearchQuery):

            cleaned_words.append(word)

-        return ' '.join(cleaned_words)
+        return ' '.join(cleaned_words)  # 拼接清洗后的查询词

+    # 构建查询片段：适配自定义查询逻辑（此处直接返回查询字符串）
    def build_query_fragment(self, field, filter_type, value):
        return value.query_string

+    # 获取搜索结果总数：通过 get_results 结果长度计算
    def get_count(self):
        results = self.get_results()
        return len(results) if results else 0

+    # 获取拼写建议：返回后端生成的推荐词
    def get_spelling_suggestion(self, preferred_query=None):
        return self._spelling_suggestion

+    # 构建搜索参数：继承父类逻辑，可自定义扩展参数
    def build_params(self, spelling_query=None):
        kwargs = super(ElasticSearchQuery, self).build_params(spelling_query=spelling_query)
        return kwargs


+# 自定义搜索表单：扩展 Haystack 基础表单，支持“是否启用拼写建议”的控制
 class ElasticSearchModelSearchForm(ModelSearchForm):
-
    def search(self):
-        # 是否建议搜索
+        # 根据请求参数（is_suggest）设置后端是否启用拼写建议
+        # 若 is_suggest = "no"，则不启用；否则启用
        self.searchqueryset.query.backend.is_suggest = self.data.get("is_suggest") != "no"
+        # 调用父类 search 方法，执行搜索并返回结果
        sqs = super().search()
        return sqs


+# 自定义 Elasticsearch 搜索引擎：关联后端和查询类，供 Haystack 调用
 class ElasticSearchEngine(BaseEngine):
-    backend = ElasticSearchBackend
-    query = ElasticSearchQuery
+    backend = ElasticSearchBackend  # 绑定自定义搜索后端
+    query = ElasticSearchQuery      # 绑定自定义查询类