From e0af0d00dc651e99c93af3feb458b82ff51f57d2 Mon Sep 17 00:00:00 2001
From: LY <yantong@qq.com>
Date: Sat, 18 Oct 2025 18:24:17 +0800
Subject: [PATCH] =?UTF-8?q?ly=5F=E7=AC=AC=E4=BA=94=E5=91=A8=E6=B3=A8?=
 =?UTF-8?q?=E9=87=8A?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/whoosh_cn_backend.py | 1120 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 1120 insertions(+)
 create mode 100644 src/whoosh_cn_backend.py
diff --git a/src/whoosh_cn_backend.py b/src/whoosh_cn_backend.py
new file mode 100644
index 0000000..44964f3
--- /dev/null
+++ b/src/whoosh_cn_backend.py
@@ -0,0 +1,1120 @@
+# encoding: utf-8
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+# 导入必要模块：JSON处理、文件操作、正则、线程、警告等
+import json
+import os
+import re
+import shutil
+import threading
+import warnings
+
+import six  # 兼容Python 2/3
+from django.conf import settings
+from django.core.exceptions import ImproperlyConfigured  # Django配置异常
+from datetime import datetime
+from django.utils.encoding import force_str  # 字符串编码处理
+# 导入Haystack核心模块：引擎、后端、查询、结果等基础类
+from haystack.backends import BaseEngine, BaseSearchBackend, BaseSearchQuery, EmptyResults, log_query
+from haystack.constants import DJANGO_CT, DJANGO_ID, ID  # Haystack常量（模型类型、ID等）
+from haystack.exceptions import MissingDependency, SearchBackendError, SkipDocument  # Haystack异常
+from haystack.inputs import Clean, Exact, PythonData, Raw  # Haystack查询输入类型
+from haystack.models import SearchResult  # Haystack搜索结果模型
+from haystack.utils import get_identifier, get_model_ct  # Haystack工具函数（获取唯一标识、模型类型）
+from haystack.utils import log as logging  # Haystack日志
+from haystack.utils.app_loading import haystack_get_model  # Haystack模型加载工具
+from jieba.analyse import ChineseAnalyzer  # 结巴中文分词器（用于中文搜索）
+# 导入Whoosh核心模块：索引、分析器、字段、存储、高亮、查询解析、搜索结果等
+from whoosh import index
+from whoosh.analysis import StemmingAnalyzer  # Whoosh英文词干分析器
+from whoosh.fields import BOOLEAN, DATETIME, IDLIST, KEYWORD, NGRAM, NGRAMWORDS, NUMERIC, Schema, TEXT  # Whoosh字段类型
+from whoosh.fields import ID as WHOOSH_ID  # Whoosh ID字段（避免与Haystack的ID冲突）
+from whoosh.filedb.filestore import FileStorage, RamStorage  # Whoosh文件存储/内存存储
+from whoosh.highlight import ContextFragmenter, HtmlFormatter  # Whoosh高亮相关
+from whoosh.highlight import highlight as whoosh_highlight  # Whoosh高亮函数
+from whoosh.qparser import QueryParser  # Whoosh查询解析器
+from whoosh.searching import ResultsPage  # Whoosh分页结果
+from whoosh.writing import AsyncWriter  # Whoosh异步写入器（提高写入效率）
+
+
+# Dependency guard for Whoosh. NOTE(review): `from whoosh import index` earlier in this file runs first, so this ImportError branch looks unreachable -- confirm.
+try:
+    import whoosh
+except ImportError:
+    raise MissingDependency(
+        "The 'whoosh' backend requires the installation of 'Whoosh'. Please refer to the documentation.")
+
+# Require Whoosh 2.5.0 or newer (a missing __version__ attribute is rejected as well).
+if not hasattr(whoosh, '__version__') or whoosh.__version__ < (2, 5, 0):
+    raise MissingDependency(
+        "The 'whoosh' backend requires version 2.5.0 or greater.")
+
+
+# Matches "YYYY-MM-DDTHH:MM:SS" with an optional ".fff" to ".ffffff" fraction (optionally followed by "Z"); raw string avoids invalid-escape warnings.
+DATETIME_REGEX = re.compile(
+    r'^(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})T(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})(\.\d{3,6}Z?)?$')
+# Thread-local holder for the shared in-memory store; each thread fills its own RAM_STORE lazily.
+LOCALS = threading.local()
+LOCALS.RAM_STORE = None
+
+
+class WhooshHtmlFormatter(HtmlFormatter):
+    """
+    HtmlFormatter subclass with a minimal highlight template.
+    Only ``template`` is overridden, so each match renders as a bare <tag>text</tag> pair.
+    Presumably this keeps highlight output consistent with other Haystack backends -- confirm.
+    """
+    template = '<%(tag)s>%(t)s</%(tag)s>'
+
+
+class WhooshSearchBackend(BaseSearchBackend):
+    """
+    Whoosh search backend: a Haystack BaseSearchBackend subclass.
+    Implements index setup, update, removal and searching on top of Whoosh.
+    Text fields are analysed with jieba's ChineseAnalyzer.
+    """
+    # Reserved query keywords; removed from query text before spelling suggestions are built.
+    RESERVED_WORDS = (
+        'AND',
+        'NOT',
+        'OR',
+        'TO',
+    )
+
+    # Reserved query characters; removed from query text before spelling suggestions are built.
+    # '\\' is listed first. NOTE(review): presumably so an escaping pass handles backslashes before the rest -- confirm.
+    RESERVED_CHARACTERS = (
+        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
+        '[', ']', '^', '"', '~', '*', '?', ':', '.',
+    )
+
+    def __init__(self, connection_alias, **connection_options):
+        """
+        Configure the backend from the Haystack connection settings.
+
+        :param connection_alias: alias of the Haystack connection this backend serves
+        :param connection_options: connection settings; PATH, STORAGE and POST_LIMIT are read here
+        :raises ImproperlyConfigured: if file storage is selected but PATH is missing
+        """
+        super(WhooshSearchBackend, self).__init__(connection_alias, **connection_options)
+        self.setup_complete = False  # setup() runs lazily on first use
+        # connection_options is a dict, so read it with .get(); getattr() on a dict
+        # always returned the default and silently ignored a configured POST_LIMIT.
+        self.post_limit = connection_options.get('POST_LIMIT', 128 * 1024 * 1024)
+        # Directory that holds the index files (required for file storage).
+        self.path = connection_options.get('PATH')
+
+        # Any STORAGE value other than 'file' selects the in-memory store.
+        self.use_file_storage = connection_options.get('STORAGE', 'file') == 'file'
+
+        # File storage cannot work without a directory.
+        if self.use_file_storage and not self.path:
+            raise ImproperlyConfigured(
+                "You must specify a 'PATH' in your settings for connection '%s'." % connection_alias)
+
+        self.log = logging.getLogger('haystack')
+
+    def setup(self):
+        """
+        Lazily create the storage, schema, query parser and Whoosh index.
+        Called by the other methods on first use; sets ``setup_complete`` when done.
+        """
+        from haystack import connections  # imported here rather than at module level
+        new_index = False  # True when the index directory had to be created below
+
+        # File storage: create the missing directory and remember the index is brand new.
+        if self.use_file_storage and not os.path.exists(self.path):
+            os.makedirs(self.path)
+            new_index = True
+
+        # Fail early if the index directory is not writable.
+        if self.use_file_storage and not os.access(self.path, os.W_OK):
+            raise IOError(
+                "The path to your Whoosh index '%s' is not writable for the current user/group." % self.path)
+
+        # Pick the storage implementation: on-disk or in-memory.
+        if self.use_file_storage:
+            self.storage = FileStorage(self.path)
+        else:
+            global LOCALS
+            # Reuse one RamStorage per thread (LOCALS is a threading.local).
+            if getattr(LOCALS, 'RAM_STORE', None) is None:
+                LOCALS.RAM_STORE = RamStorage()
+            self.storage = LOCALS.RAM_STORE
+
+        # Build the Whoosh schema from every field of the unified Haystack index.
+        unified_index = connections[self.connection_alias].get_unified_index()
+        self.content_field_name, self.schema = self.build_schema(unified_index.all_searchfields())
+        # Query parser that defaults to the main content field.
+        self.parser = QueryParser(self.content_field_name, schema=self.schema)
+
+        # New directory: create the index; otherwise open it, creating it if it turns out to be empty.
+        if new_index is True:
+            self.index = self.storage.create_index(self.schema)
+        else:
+            try:
+                self.index = self.storage.open_index(schema=self.schema)
+            except index.EmptyIndexError:
+                self.index = self.storage.create_index(self.schema)
+
+        # Mark the backend as ready.
+        self.setup_complete = True
+
+    def build_schema(self, fields):
+        """
+        Translate Haystack search fields into a Whoosh Schema.
+        :param fields: mapping of field name to Haystack field object (iterated with .items())
+        :return: (content_field_name, schema); content_field_name is '' if no field has document=True
+        """
+        # Every document carries these three identification fields.
+        schema_fields = {
+            ID: WHOOSH_ID(stored=True, unique=True),  # unique document identifier
+            DJANGO_CT: WHOOSH_ID(stored=True),       # content type of the model
+            DJANGO_ID: WHOOSH_ID(stored=True),       # primary key of the model instance
+        }
+        # Remember how many built-in fields exist so we can tell whether any were added.
+        initial_key_count = len(schema_fields)
+        content_field_name = ''  # index name of the field flagged document=True
+
+        # Map each Haystack field onto the matching Whoosh field type.
+        for field_name, field_class in fields.items():
+            index_fieldname = field_class.index_fieldname  # name used inside the index
+            # Multi-valued fields.
+            if field_class.is_multivalued:
+                if not field_class.indexed:
+                    # Not flagged as indexed: mapped to an IDLIST field.
+                    schema_fields[index_fieldname] = IDLIST(stored=True, field_boost=field_class.boost)
+                else:
+                    # Indexed: comma-separated, scorable KEYWORD field.
+                    schema_fields[index_fieldname] = KEYWORD(
+                        stored=True, commas=True, scorable=True, field_boost=field_class.boost)
+            # Date and datetime fields.
+            elif field_class.field_type in ['date', 'datetime']:
+                schema_fields[index_fieldname] = DATETIME(stored=field_class.stored, sortable=True)
+            # Integer fields.
+            elif field_class.field_type == 'integer':
+                schema_fields[index_fieldname] = NUMERIC(
+                    stored=field_class.stored, numtype=int, field_boost=field_class.boost)
+            # Float fields.
+            elif field_class.field_type == 'float':
+                schema_fields[index_fieldname] = NUMERIC(
+                    stored=field_class.stored, numtype=float, field_boost=field_class.boost)
+            # Boolean fields.
+            elif field_class.field_type == 'boolean':
+                # No field_boost is passed here -- presumably BOOLEAN does not accept one; confirm.
+                schema_fields[index_fieldname] = BOOLEAN(stored=field_class.stored)
+            # N-gram fields (3 to 15 characters).
+            elif field_class.field_type == 'ngram':
+                schema_fields[index_fieldname] = NGRAM(
+                    minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost)
+            # Edge n-gram fields: word n-grams of 2 to 15 characters taken at the start.
+            elif field_class.field_type == 'edge_ngram':
+                schema_fields[index_fieldname] = NGRAMWORDS(
+                    minsize=2, maxsize=15, at='start', stored=field_class.stored, field_boost=field_class.boost)
+            # Everything else is indexed as text.
+            else:
+                # Analysed with jieba's ChineseAnalyzer; always stored and sortable.
+                schema_fields[index_fieldname] = TEXT(
+                    stored=True, analyzer=ChineseAnalyzer(), field_boost=field_class.boost, sortable=True)
+
+            # The document=True field becomes the default search field.
+            if field_class.document is True:
+                content_field_name = index_fieldname
+                # Turn on the spelling flag for that field.
+                schema_fields[index_fieldname].spelling = True
+
+        # Only the built-in fields are present: no search index declared any field.
+        if len(schema_fields) <= initial_key_count:
+            raise SearchBackendError(
+                "No fields were found in any search_indexes. Please correct this before attempting to search.")
+
+        # Build the Schema from the collected fields.
+        return (content_field_name, Schema(**schema_fields))
+
+    def update(self, index, iterable, commit=True):
+        """
+        Add or refresh the documents for ``iterable`` in the Whoosh index.
+
+        :param index: Haystack search index whose ``full_prepare`` builds each document
+        :param iterable: objects to index; any iterable works, including generators
+        :param commit: accepted for interface compatibility; processed batches are always committed
+        """
+        if not self.setup_complete:
+            self.setup()
+
+        self.index = self.index.refresh()
+        writer = AsyncWriter(self.index)
+        # Count handled objects instead of calling len(iterable), which raised
+        # TypeError for generators only after all the work had been done.
+        seen = 0
+
+        for obj in iterable:
+            seen += 1
+            try:
+                doc = index.full_prepare(obj)
+            except SkipDocument:
+                self.log.debug(u"Indexing for object `%s` skipped", obj)
+            else:
+                # Convert every value into the form handed to Whoosh.
+                for key in doc:
+                    doc[key] = self._from_python(doc[key])
+
+                # A document-level boost is dropped before writing.
+                if 'boost' in doc:
+                    del doc['boost']
+
+                try:
+                    # Replaces the document with the same unique ID, or adds it.
+                    writer.update_document(**doc)
+                except Exception as e:
+                    if not self.silently_fail:
+                        raise
+                    self.log.error(
+                        u"%s while preparing object for update" % e.__class__.__name__,
+                        exc_info=True,
+                        extra={"data": {"index": index, "object": get_identifier(obj)}})
+
+        # Commit when something was processed; otherwise cancel so the unused
+        # writer does not stay open on the index.
+        if seen:
+            writer.commit()
+        else:
+            writer.cancel()
+
+    def remove(self, obj_or_string, commit=True):
+        """
+        Delete the document for one object from the Whoosh index.
+        :param obj_or_string: model instance or identifier string, resolved via get_identifier
+        :param commit: not used by this implementation; kept for interface compatibility
+        """
+        if not self.setup_complete:
+            self.setup()
+
+        # Work on a fresh view of the index.
+        self.index = self.index.refresh()
+        # Identifier stored in the ID field of the document to delete.
+        whoosh_id = get_identifier(obj_or_string)
+
+        try:
+            # Build a quoted query on the ID field and delete whatever it matches.
+            delete_query = self.parser.parse(u'%s:"%s"' % (ID, whoosh_id))
+            self.index.delete_by_query(q=delete_query)
+        except Exception as e:
+            if not self.silently_fail:
+                raise
+            # silently_fail is set: log the failure instead of raising.
+            self.log.error(
+                "Failed to remove document '%s' from Whoosh: %s",
+                whoosh_id, e, exc_info=True)
+
+    def clear(self, models=None, commit=True):
+        """
+        Remove documents from the index: everything, or only those of the given models.
+        :param models: list/tuple of model classes, or None to wipe the whole index
+        :param commit: not used by this implementation; kept for interface compatibility
+        """
+        if not self.setup_complete:
+            self.setup()
+
+        # Work on a fresh view of the index.
+        self.index = self.index.refresh()
+
+        # models, when given, must be a list or tuple (checked with assert, so skipped under -O).
+        if models is not None:
+            assert isinstance(models, (list, tuple))
+
+        try:
+            # No models given: drop and recreate the whole index.
+            if models is None:
+                self.delete_index()
+            # Otherwise delete only the documents of the listed models.
+            else:
+                models_to_delete = []
+                # One "<DJANGO_CT>:<content type>" clause per model.
+                for model in models:
+                    models_to_delete.append(u"%s:%s" % (DJANGO_CT, get_model_ct(model)))
+                # OR the clauses together and delete everything that matches.
+                delete_query = self.parser.parse(u" OR ".join(models_to_delete))
+                self.index.delete_by_query(q=delete_query)
+        except Exception as e:
+            if not self.silently_fail:
+                raise
+            # silently_fail is set: log the failure instead of raising.
+            if models is not None:
+                self.log.error(
+                    "Failed to clear Whoosh index of models '%s': %s",
+                    ','.join(models_to_delete), e, exc_info=True)
+            else:
+                self.log.error("Failed to clear Whoosh index: %s", e, exc_info=True)
+
+    def delete_index(self):
+        """
+        Discard the index storage (directory or RAM store contents) and rebuild it via setup().
+        Called by clear() when no models are given.
+        """
+        # File storage: remove the index directory if it exists.
+        if self.use_file_storage and os.path.exists(self.path):
+            shutil.rmtree(self.path)
+        # RAM storage: empty the store. NOTE(review): relies on self.storage, which only setup() assigns.
+        elif not self.use_file_storage:
+            self.storage.clean()
+
+        # Recreate the storage and an empty index.
+        self.setup()
+
+    def optimize(self):
+        """
+        Run Whoosh's optimize() on a refreshed index.
+        Presumably this merges index segments to speed up searching -- see the Whoosh docs.
+        """
+        if not self.setup_complete:
+            self.setup()
+
+        # Refresh first so the optimisation acts on the latest index state.
+        self.index = self.index.refresh()
+        self.index.optimize()
+
+    def calculate_page(self, start_offset=0, end_offset=None):
+        """
+        Translate Haystack slice offsets into Whoosh paging arguments.
+        The returned page number is 1-based: the zero-based page index is incremented.
+        :param start_offset: zero-based index of the first wanted result (None means 0)
+        :param end_offset: exclusive end index; None means 1000000, values <= 0 become 1
+        :return: (page_num, page_length)
+        """
+        # Normalise the end of the slice.
+        if end_offset is None:
+            end_offset = 1000000
+        elif end_offset <= 0:
+            end_offset = 1
+
+        # Normalise the start of the slice.
+        if start_offset is None:
+            start_offset = 0
+
+        # The page length is the slice width; it may be zero or negative for odd inputs.
+        page_length = end_offset - start_offset
+        if page_length > 0:
+            # int() truncates: a start inside a page maps to that page.
+            zero_based_page = int(start_offset / page_length)
+        else:
+            zero_based_page = 0
+
+        return zero_based_page + 1, page_length
+
+    @log_query
+    def search(
+            self,
+            query_string,
+            sort_by=None,
+            start_offset=0,
+            end_offset=None,
+            fields='',
+            highlight=False,
+            facets=None,
+            date_facets=None,
+            query_facets=None,
+            narrow_queries=None,
+            spelling_query=None,
+            within=None,
+            dwithin=None,
+            distance_point=None,
+            models=None,
+            limit_to_registered_models=None,
+            result_class=None,
+            **kwargs):
+        """
+        Run a query against the Whoosh index and return Haystack-style results.
+
+        Both searchers opened here are closed on every exit path (try/finally).
+        :param query_string: query text; empty and one-character (non-'*') queries yield no results
+        :param sort_by: field names, '-' prefix for descending; all must share one direction
+        :param start_offset/end_offset: result slice, converted by calculate_page
+        :param highlight: attach highlighted fragments of the content field to each result
+        :param facets/date_facets/query_facets: unsupported; a warning is issued if given
+        :param narrow_queries: set of extra query strings every result must also match
+        :param models: restrict results to these model classes
+        :param result_class: class used for each result (defaults to SearchResult)
+        :return: dict with 'results' and 'hits'; most paths also add 'spelling_suggestion'
+        :raises SearchBackendError: if sort_by mixes ascending and descending fields
+        """
+        if not self.setup_complete:
+            self.setup()
+
+        # An empty query matches nothing.
+        if len(query_string) == 0:
+            return {'results': [], 'hits': 0}
+
+        query_string = force_str(query_string)
+
+        # A single character other than the '*' wildcard is not searched.
+        if len(query_string) <= 1 and query_string != u'*':
+            return {'results': [], 'hits': 0}
+
+        # Sorting: all fields must share one direction and only the first field is used.
+        reverse = False
+        if sort_by is not None:
+            sort_by_list = []
+            reverse_counter = 0
+
+            # Count the descending ('-' prefixed) fields.
+            for order_by in sort_by:
+                if order_by.startswith('-'):
+                    reverse_counter += 1
+
+            # Some, but not all, fields are descending: reject the mixed request.
+            if reverse_counter and reverse_counter != len(sort_by):
+                raise SearchBackendError("Whoosh requires all order_by fields to use the same sort direction")
+
+            # Strip the '-' markers; the first field decides the direction.
+            for order_by in sort_by:
+                if order_by.startswith('-'):
+                    sort_by_list.append(order_by[1:])
+                    if len(sort_by_list) == 1:
+                        reverse = True
+                else:
+                    sort_by_list.append(order_by)
+                    if len(sort_by_list) == 1:
+                        reverse = False
+
+            # Only the first sort field is handed to Whoosh.
+            sort_by = sort_by_list[0]
+
+        # Faceting is not implemented by this backend; warn instead of failing.
+        if facets is not None:
+            warnings.warn("Whoosh does not handle faceting.", Warning, stacklevel=2)
+        if date_facets is not None:
+            warnings.warn("Whoosh does not handle date faceting.", Warning, stacklevel=2)
+        if query_facets is not None:
+            warnings.warn("Whoosh does not handle query faceting.", Warning, stacklevel=2)
+
+        # Results of the narrowing queries, intersected as they are collected.
+        narrowed_results = None
+        self.index = self.index.refresh()
+
+        # Default for limit_to_registered_models comes from the Django settings.
+        if limit_to_registered_models is None:
+            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)
+
+        # Restrict by the explicit models, or by every registered model when so configured.
+        model_choices = []
+        if models and len(models):
+            model_choices = sorted(get_model_ct(model) for model in models)
+        elif limit_to_registered_models:
+            model_choices = self.build_models_list()
+
+        if len(model_choices) > 0:
+            if narrow_queries is None:
+                narrow_queries = set()
+            # One OR query over the content types, applied as another narrowing query.
+            model_query = ' OR '.join(['%s:%s' % (DJANGO_CT, rm) for rm in model_choices])
+            narrow_queries.add(model_query)
+
+        # Both searchers are closed in the finally block, so the early returns
+        # below no longer leave them open.
+        narrow_searcher = None
+        searcher = None
+        try:
+            # Run every narrowing query and keep the intersection of their hits.
+            if narrow_queries is not None:
+                narrow_searcher = self.index.searcher()
+                for nq in narrow_queries:
+                    nq_parsed = self.parser.parse(force_str(nq))
+                    recent_narrowed = narrow_searcher.search(nq_parsed, limit=None)
+
+                    # A narrowing query without matches makes the final result empty.
+                    if len(recent_narrowed) <= 0:
+                        return {'results': [], 'hits': 0}
+
+                    # Intersect with what the earlier narrowing queries matched.
+                    if narrowed_results:
+                        narrowed_results.filter(recent_narrowed)
+                    else:
+                        narrowed_results = recent_narrowed
+
+            # Refresh again before the main query.
+            self.index = self.index.refresh()
+
+            # Empty index: nothing to search, but a spelling suggestion is still offered.
+            if not self.index.doc_count():
+                spelling_suggestion = self.create_spelling_suggestion(spelling_query or query_string) if self.include_spelling else None
+                return {'results': [], 'hits': 0, 'spelling_suggestion': spelling_suggestion}
+
+            # Main query.
+            searcher = self.index.searcher()
+            try:
+                parsed_query = self.parser.parse(query_string)
+            except Exception:
+                # Unparseable query: re-raise unless configured to fail silently.
+                if not self.silently_fail:
+                    raise
+                return {'results': [], 'hits': 0, 'spelling_suggestion': None}
+
+            # The parser produced no query at all.
+            if parsed_query is None:
+                return {'results': [], 'hits': 0, 'spelling_suggestion': None}
+
+            # Convert the offsets into a 1-based page number and a page length.
+            page_num, page_length = self.calculate_page(start_offset, end_offset)
+
+            # Keyword arguments for Searcher.search_page.
+            search_kwargs = {
+                'pagelen': page_length,
+                'sortedby': sort_by,
+                'reverse': reverse
+            }
+            # Only documents that passed every narrowing query may be returned.
+            if narrowed_results is not None:
+                search_kwargs['filter'] = narrowed_results
+
+            # Fetch the requested page.
+            try:
+                raw_page = searcher.search_page(parsed_query, page_num, **search_kwargs)
+            except ValueError:
+                # search_page rejected the request (presumably an invalid page number).
+                if not self.silently_fail:
+                    raise
+                return {'results': [], 'hits': 0, 'spelling_suggestion': None}
+
+            # A page number lower than the requested one means that page does not exist.
+            if raw_page.pagenum < page_num:
+                spelling_suggestion = self.create_spelling_suggestion(spelling_query or query_string) if self.include_spelling else None
+                return {'results': [], 'hits': 0, 'spelling_suggestion': spelling_suggestion}
+
+            # Convert the hits while the searchers are still open.
+            return self._process_results(
+                raw_page,
+                highlight=highlight,
+                query_string=query_string,
+                spelling_query=spelling_query,
+                result_class=result_class)
+        finally:
+            # Release the searchers on every exit path, including exceptions.
+            if searcher is not None:
+                searcher.close()
+            if narrow_searcher is not None:
+                narrow_searcher.close()
+
+    def more_like_this(
+            self,
+            model_instance,
+            additional_query_string=None,
+            start_offset=0,
+            end_offset=None,
+            models=None,
+            limit_to_registered_models=None,
+            result_class=None,
+            **kwargs):
+        """
+        Find documents similar to the indexed document of ``model_instance``.
+
+        Similarity comes from Whoosh's ``more_like_this`` on the main content field.
+        Searchers are closed on every exit path; ``searcher`` stays None when the
+        index is empty, so closing it can no longer hit an unbound local.
+        :param model_instance: model object whose indexed document is the reference
+        :param additional_query_string: extra narrowing query (ignored when empty or '*')
+        :param start_offset/end_offset: result slice, converted by calculate_page
+        :param models: restrict results to these model classes
+        :param result_class: class used for each result (defaults to SearchResult)
+        :return: result dict with the same layout as search()
+        """
+        if not self.setup_complete:
+            self.setup()
+
+        # Not used below; kept so objects without _meta still fail the same way.
+        model_klass = model_instance._meta.concrete_model
+        # Similarity is computed on the main content field.
+        field_name = self.content_field_name
+        narrow_queries = set()
+        narrowed_results = None
+        self.index = self.index.refresh()
+
+        if limit_to_registered_models is None:
+            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)
+
+        model_choices = []
+        if models and len(models):
+            model_choices = sorted(get_model_ct(model) for model in models)
+        elif limit_to_registered_models:
+            model_choices = self.build_models_list()
+
+        if len(model_choices) > 0:
+            model_query = ' OR '.join(['%s:%s' % (DJANGO_CT, rm) for rm in model_choices])
+            narrow_queries.add(model_query)
+
+        # The caller's extra query narrows the results too ('*' means no restriction).
+        if additional_query_string and additional_query_string != '*':
+            narrow_queries.add(additional_query_string)
+
+        narrow_searcher = None
+        searcher = None
+        try:
+            # Run the narrowing queries (if any) and intersect their hits.
+            if narrow_queries:
+                narrow_searcher = self.index.searcher()
+                for nq in narrow_queries:
+                    nq_parsed = self.parser.parse(force_str(nq))
+                    recent_narrowed = narrow_searcher.search(nq_parsed, limit=None)
+
+                    # A narrowing query without matches makes the final result empty.
+                    if len(recent_narrowed) <= 0:
+                        return {'results': [], 'hits': 0}
+
+                    if narrowed_results:
+                        narrowed_results.filter(recent_narrowed)
+                    else:
+                        narrowed_results = recent_narrowed
+
+            page_num, page_length = self.calculate_page(start_offset, end_offset)
+
+            self.index = self.index.refresh()
+            raw_results = EmptyResults()  # used when the index or the reference lookup is empty
+
+            if self.index.doc_count():
+                searcher = self.index.searcher()
+                # Look up the reference object's own document by its identifier.
+                query = "%s:%s" % (ID, get_identifier(model_instance))
+                parsed_query = self.parser.parse(query)
+                results = searcher.search(parsed_query)
+
+                # Ask Whoosh for documents similar to the first match.
+                if len(results):
+                    raw_results = results[0].more_like_this(field_name, top=end_offset)
+
+                # Keep only documents that also passed the narrowing queries.
+                if narrowed_results is not None and hasattr(raw_results, 'filter'):
+                    raw_results.filter(narrowed_results)
+
+            try:
+                raw_page = ResultsPage(raw_results, page_num, page_length)
+            except ValueError:
+                if not self.silently_fail:
+                    raise
+                return {'results': [], 'hits': 0, 'spelling_suggestion': None}
+
+            # A page number lower than the requested one means that page does not exist.
+            if raw_page.pagenum < page_num:
+                return {'results': [], 'hits': 0, 'spelling_suggestion': None}
+
+            return self._process_results(raw_page, result_class=result_class)
+        finally:
+            if searcher is not None:
+                searcher.close()
+            if narrow_searcher is not None:
+                narrow_searcher.close()
+
+    def _process_results(
+            self,
+            raw_page,
+            highlight=False,
+            query_string='',
+            spelling_query=None,
+            result_class=None):
+        """
+        Convert a Whoosh results page into the dict structure Haystack expects.
+        :param raw_page: iterable page of Whoosh hits that also provides score(offset)
+        :param highlight: when true, attach highlighted fragments of the content field
+        :param query_string/spelling_query: text used for highlighting and the spelling suggestion
+        :return: dict with 'results', 'hits', 'facets' and 'spelling_suggestion'
+        """
+        from haystack import connections  # imported here rather than at module level
+        results = []  # result_class instances, in page order
+        hits = len(raw_page)  # starts at len(raw_page); reduced for every skipped hit
+
+        # Fall back to Haystack's SearchResult.
+        if result_class is None:
+            result_class = SearchResult
+
+        # No facets are computed here, so the facets entry stays empty.
+        facets = {}
+        spelling_suggestion = None
+        # Unified index and the models it knows about.
+        unified_index = connections[self.connection_alias].get_unified_index()
+        indexed_models = unified_index.get_indexed_models()
+
+        # Turn every raw hit into a result object.
+        for doc_offset, raw_result in enumerate(raw_page):
+            # Relevance score of this hit (0 when none is reported).
+            score = raw_page.score(doc_offset) or 0
+            # The stored content type has the form "<app_label>.<model_name>".
+            app_label, model_name = raw_result[DJANGO_CT].split('.')
+            additional_fields = {}  # stored field values passed on to the result
+            # Resolve the model class for that content type.
+            model = haystack_get_model(app_label, model_name)
+
+            # Hits for unknown or unindexed models are skipped.
+            if model and model in indexed_models:
+                # Look the model's index up once per hit instead of once per stored field.
+                model_index = unified_index.get_index(model)
+                # Convert every stored field back into a Python value.
+                for key, value in raw_result.items():
+                    string_key = str(key)
+
+                    # Declared fields are converted by the field object itself.
+                    if string_key in model_index.fields and hasattr(model_index.fields[string_key], 'convert'):
+                        field = model_index.fields[string_key]
+                        # Multi-valued fields were stored as comma-separated text.
+                        if field.is_multivalued:
+                            if value is None or len(value) == 0:
+                                additional_fields[string_key] = []
+                            else:
+                                additional_fields[string_key] = value.split(',')
+                        else:
+                            # Single-valued: let the field convert the value.
+                            additional_fields[string_key] = field.convert(value)
+                    else:
+                        # Fields the index does not declare go through _to_python.
+                        additional_fields[string_key] = self._to_python(value)
+
+                # Already covered by the positional arguments below, so drop them from the keywords.
+                del additional_fields[DJANGO_CT]
+                del additional_fields[DJANGO_ID]
+
+                # Optional highlighting of the main content field.
+                if highlight:
+                    # NOTE(review): highlighting tokenises with StemmingAnalyzer although text is indexed with ChineseAnalyzer -- confirm Chinese terms highlight.
+                    sa = StemmingAnalyzer()
+                    # Wrap matches in <em> tags.
+                    formatter = WhooshHtmlFormatter('em')
+                    # Terms to highlight: the analysed tokens of the query string.
+                    terms = [token.text for token in sa(query_string)]
+
+                    # Highlight the stored content value ('' when it is missing).
+                    content_value = additional_fields.get(self.content_field_name, '')
+                    whoosh_highlighted = whoosh_highlight(
+                        content_value,
+                        terms,
+                        sa,
+                        ContextFragmenter(),  # fragmenter that picks the excerpts
+                        formatter
+                    )
+                    # Expose the fragments under 'highlighted', keyed by the content field name.
+                    additional_fields['highlighted'] = {self.content_field_name: [whoosh_highlighted]}
+
+                # Build the result object.
+                result = result_class(
+                    app_label,
+                    model_name,
+                    raw_result[DJANGO_ID],  # primary key stored with the document
+                    score,
+                    **additional_fields
+                )
+                results.append(result)
+            else:
+                # Skipped hit: keep the hit count in step with the results list.
+                hits -= 1
+
+        # Add a spelling suggestion when the backend is configured for it.
+        if self.include_spelling:
+            spelling_suggestion = self.create_spelling_suggestion(spelling_query or query_string)
+
+        # Assemble the response.
+        return {
+            'results': results,
+            'hits': hits,
+            'facets': facets,
+            'spelling_suggestion': spelling_suggestion,
+        }
+
+    def create_spelling_suggestion(self, query_string):
+        """
+        Build a spelling suggestion for ``query_string`` from the content field's terms.
+
+        :param query_string: the user's query text
+        :return: suggested words joined by spaces ('' when nothing is suggested),
+                 or None for an empty query
+        """
+        # Bail out before opening a reader: an empty query has nothing to correct.
+        if not query_string:
+            return None
+
+        cleaned_query = force_str(query_string)
+        # Strip reserved words and characters (plain substring replacement).
+        # NOTE(review): this also removes 'OR'/'TO' inside longer upper-case words -- confirm intended.
+        for rev_word in self.RESERVED_WORDS:
+            cleaned_query = cleaned_query.replace(rev_word, '')
+        for rev_char in self.RESERVED_CHARACTERS:
+            cleaned_query = cleaned_query.replace(rev_char, '')
+
+        suggested_words = []
+        reader = self.index.reader()
+        try:
+            corrector = reader.corrector(self.content_field_name)
+            for word in cleaned_query.split():
+                # Keep only the best suggestion for each word.
+                suggestions = corrector.suggest(word, limit=1)
+                if len(suggestions) > 0:
+                    suggested_words.append(suggestions[0])
+        finally:
+            # The reader used to be left open after every call.
+            reader.close()
+
+        return ' '.join(suggested_words)
+
+    def _from_python(self, value):
+        """
+        Convert a Python value into the form this backend passes on to Whoosh.
+
+        date/datetime -> ISO-8601 text (a bare date is taken as midnight), bool -> 'true'/'false',
+        list/tuple -> comma-joined text, int/float -> unchanged, anything else -> force_str(value).
+        NOTE(review): temporal values are returned as str, so callers cannot call datetime
+        methods such as ``strftime`` on the result -- confirm this is what the callers expect.
+        """
+        # Anything exposing strftime is handled as a date or datetime.
+        if hasattr(value, 'strftime'):
+            if hasattr(value, 'hour'):
+                return value.isoformat()
+            # No 'hour' attribute: build a midnight datetime from the date parts first.
+            return datetime(value.year, value.month, value.day, 0, 0, 0).isoformat()
+        # bool is checked ahead of the numeric branch because bool is a subclass of int.
+        if isinstance(value, bool):
+            return 'true' if value else 'false'
+        if isinstance(value, (list, tuple)):
+            # Each element is stringified individually, then joined with commas.
+            return u','.join(force_str(item) for item in value)
+        # Numbers pass through untouched.
+        if isinstance(value, (six.integer_types, float)):
+            return value
+        # Fallback: plain text conversion.
+        return force_str(value)
+
+    def _to_python(self, value):
+        """
+        Convert a value read back from Whoosh into a native Python value.
+
+        'true'/'false' become booleans, strings matching DATETIME_REGEX become datetime objects,
+        JSON text that decodes to a container or number is returned decoded, and anything else
+        is returned unchanged.
+        :param value: value to convert (strings get the special handling described above)
+        :return: the converted value, or ``value`` itself when no rule applies
+        """
+        # The literal strings 'true' / 'false' map to booleans.
+        if value == 'true':
+            return True
+        elif value == 'false':
+            return False
+
+        # Non-empty strings are checked against the datetime pattern.
+        if value and isinstance(value, six.string_types):
+            possible_datetime = DATETIME_REGEX.search(value)
+            if possible_datetime:
+                # Convert every named group of the match to int.
+                date_values = possible_datetime.groupdict()
+                for dk, dv in date_values.items():
+                    date_values[dk] = int(dv)
+                return datetime(
+                    date_values['year'],
+                    date_values['month'],
+                    date_values['day'],
+                    date_values['hour'],
+                    date_values['minute'],
+                    date_values['second']
+                )
+
+        # Best-effort JSON decoding.
+        try:
+            converted_value = json.loads(value)
+            # Accept only containers and numbers; decoded strings and None fall through.
+            if isinstance(converted_value, (list, tuple, set, dict, six.integer_types, float, complex)):
+                return converted_value
+        except Exception:
+            # Not JSON. Catch Exception, not BaseException, so KeyboardInterrupt/SystemExit propagate.
+            pass
+        return value
+
+
+class WhooshSearchQuery(BaseSearchQuery):
+    """
+    Haystack query class for the Whoosh backend.
+    Builds Whoosh query-string fragments from field names, filter types and values.
+    """
+    def _convert_datetime(self, date):
+        """
+        Format a date or datetime as the 14-digit text YYYYMMDDHHMMSS used in queries here.
+        :param date: object exposing ``strftime`` (a date or datetime, not a string)
+        :return: formatted text; values without an ``hour`` attribute get 000000 as time part
+        """
+        if hasattr(date, 'hour'):
+            # Full datetime: keep the time of day.
+            return force_str(date.strftime('%Y%m%d%H%M%S'))
+        else:
+            # Plain date: pad the time part with zeros.
+            return force_str(date.strftime('%Y%m%d000000'))
+
+    def clean(self, query_fragment):
+        """
+        Neutralise reserved words and characters in a user-supplied query fragment.
+        Reserved words are lower-cased; words containing a reserved character are single-quoted.
+        :param query_fragment: raw query text
+        :return: cleaned words joined by single spaces
+        """
+        words = query_fragment.split()
+        cleaned_words = []
+
+        for word in words:
+            # Lower-case reserved words (presumably so they are no longer parsed as operators).
+            if word in self.backend.RESERVED_WORDS:
+                word = word.lower()
+
+            # Quote the whole word as soon as one reserved character is found in it.
+            for char in self.backend.RESERVED_CHARACTERS:
+                if char in word:
+                    word = "'%s'" % word
+                    break
+
+            cleaned_words.append(word)
+
+        # split() collapsed whitespace runs, so words are re-joined with single spaces.
+        return ' '.join(cleaned_words)
+
+    def build_query_fragment(self, field, filter_type, value):
+        """
+        Build the Whoosh query fragment for one (field, filter_type, value) filter.
+        Dates/datetimes in gt/gte/lt/lte, 'in' and 'range' filters are formatted from the raw object.
+        :param field: field name; 'content' is emitted without a field prefix
+        :param filter_type: key of ``filter_types`` below, or 'in' / 'range'
+        :param value: plain value or Haystack input type (has ``input_type_name``)
+        :return: query fragment text, prefixed with ``<index field name>:`` unless field is 'content'
+        """
+        from haystack import connections  # local import, resolved at call time
+        query_frag = ''  # fragment being built
+        is_datetime = False  # True when a bare date/datetime value was passed
+
+        # Plain values (no ``input_type_name``) are wrapped in an input type first.
+        if not hasattr(value, 'input_type_name'):
+            # Objects exposing ``values_list`` (querysets) are materialised into a list.
+            if hasattr(value, 'values_list'):
+                value = list(value)
+            # Remember date/datetime values so they can be formatted for Whoosh later.
+            if hasattr(value, 'strftime'):
+                is_datetime = True
+            # Strings (other than a single space) go through Clean.
+            if isinstance(value, six.string_types) and value != ' ':
+                value = Clean(value)
+            # Everything else is passed along as PythonData.
+            else:
+                value = PythonData(value)
+
+        # Let the input type prepare the value for this query.
+        prepared_value = value.prepare(self)
+        raw_value = prepared_value  # unconverted copy: _convert_datetime needs strftime
+        # Scalars are converted with the backend's _from_python; collections are left as-is.
+        if not isinstance(prepared_value, (set, list, tuple)):
+            prepared_value = self.backend._from_python(prepared_value)
+
+        # 'content' gets no field prefix.
+        if field == 'content':
+            index_fieldname = ''
+        else:
+            # Resolve the name this field has in the index via the unified index.
+            index_fieldname = u'%s:' % connections[self._using].get_unified_index().get_index_fieldname(field)
+
+        # Query templates per filter type.
+        filter_types = {
+            'content': '%s',          # value as-is
+            'contains': '*%s*',       # wildcard on both sides
+            'endswith': "*%s",        # leading wildcard
+            'startswith': "%s*",      # trailing wildcard
+            'exact': '%s',            # value as-is (quoting is left to the Exact input type)
+            'gt': "{%s to}",          # exclusive lower bound
+            'gte': "[%s to]",         # inclusive lower bound
+            'lt': "{to %s}",          # exclusive upper bound
+            'lte': "[to %s]",         # inclusive upper bound
+            'fuzzy': u'%s~',          # fuzzy term
+        }
+
+        # Input types with post_process False (e.g. Raw) are used verbatim.
+        if value.post_process is False:
+            query_frag = prepared_value
+        else:
+            # Text-style filters.
+            if filter_type in ['content', 'contains', 'startswith', 'endswith', 'fuzzy']:
+                # An Exact input is already prepared; use it unchanged.
+                if value.input_type_name == 'exact':
+                    query_frag = prepared_value
+                else:
+                    # Split string values on spaces into individual terms.
+                    terms = []
+                    if isinstance(prepared_value, six.string_types):
+                        possible_values = prepared_value.split(' ')
+                    else:
+                        # Non-string value: format dates from the raw object, keep others as one term.
+                        if is_datetime is True:
+                            prepared_value = self._convert_datetime(raw_value)
+                        possible_values = [prepared_value]
+
+                    # Apply the filter template to every term.
+                    for possible_value in possible_values:
+                        term = filter_types[filter_type] % self.backend._from_python(possible_value)
+                        terms.append(term)
+
+                    # One term is used directly; several are AND-ed inside parentheses.
+                    if len(terms) == 1:
+                        query_frag = terms[0]
+                    else:
+                        query_frag = u"(%s)" % " AND ".join(terms)
+            # 'in': match any of the given values.
+            elif filter_type == 'in':
+                in_options = []
+                for possible_value in prepared_value:
+                    is_dt = False
+                    # Detect dates/datetimes on the original element.
+                    if hasattr(possible_value, 'strftime'):
+                        is_dt = True
+                    # Dates are formatted from the original element: _from_python returns text for them.
+                    pv = self.backend._from_python(possible_value)
+                    if is_dt is True:
+                        pv = self._convert_datetime(possible_value)
+                    # Strings are double-quoted; dates and other values are used bare.
+                    if isinstance(pv, six.string_types) and not is_dt:
+                        in_options.append('"%s"' % pv)
+                    else:
+                        in_options.append('%s' % pv)
+                # OR the options together inside parentheses.
+                query_frag = "(%s)" % " OR ".join(in_options)
+            # 'range': range between two values.
+            elif filter_type == 'range':
+                # Convert both bounds; date bounds are formatted from the original objects.
+                start = self.backend._from_python(prepared_value[0])
+                end = self.backend._from_python(prepared_value[1])
+                # (_from_python's text result has no strftime, so it must not be passed on.)
+                if hasattr(prepared_value[0], 'strftime'):
+                    start = self._convert_datetime(prepared_value[0])
+                if hasattr(prepared_value[1], 'strftime'):
+                    end = self._convert_datetime(prepared_value[1])
+                # Bracketed "[start to end]" range.
+                query_frag = u"[%s to %s]" % (start, end)
+            # 'exact' filter.
+            elif filter_type == 'exact':
+                # An Exact input is already prepared; use it unchanged.
+                if value.input_type_name == 'exact':
+                    query_frag = prepared_value
+                else:
+                    # Any other input is run through Exact before templating.
+                    prepared_value = Exact(prepared_value).prepare(self)
+                    query_frag = filter_types[filter_type] % prepared_value
+            # Remaining filter types (gt, gte, lt, lte).
+            else:
+                # Dates are formatted from the raw object (prepared_value is already text here).
+                if is_datetime is True:
+                    prepared_value = self._convert_datetime(raw_value)
+                # Apply the filter template.
+                query_frag = filter_types[filter_type] % prepared_value
+
+        # Unless the value is Raw, wrap a non-empty fragment that neither starts with '(' nor ends with ')'.
+        if len(query_frag) and not isinstance(value, Raw):
+            if not query_frag.startswith('(') and not query_frag.endswith(')'):
+                query_frag = "(%s)" % query_frag
+
+        # Prefix with the field name, e.g. "title:(python)".
+        return u"%s%s" % (index_fieldname, query_frag)
+
+
+class WhooshEngine(BaseEngine):
+    """
+    Haystack engine definition that pairs the Whoosh backend with its query class.
+    Subclasses Haystack's BaseEngine and only sets the two class attributes below.
+    """
+    backend = WhooshSearchBackend  # search backend class used by this engine
+    query = WhooshSearchQuery      # query class used by this engine
\ No newline at end of file