From e0af0d00dc651e99c93af3feb458b82ff51f57d2 Mon Sep 17 00:00:00 2001 From: LY Date: Sat, 18 Oct 2025 18:24:17 +0800 Subject: [PATCH] =?UTF-8?q?ly=5F=E7=AC=AC=E4=BA=94=E5=91=A8=E6=B3=A8?= =?UTF-8?q?=E9=87=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/whoosh_cn_backend.py | 1120 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 1120 insertions(+) create mode 100644 src/whoosh_cn_backend.py diff --git a/src/whoosh_cn_backend.py b/src/whoosh_cn_backend.py new file mode 100644 index 0000000..44964f3 --- /dev/null +++ b/src/whoosh_cn_backend.py @@ -0,0 +1,1120 @@ +# encoding: utf-8 + +from __future__ import absolute_import, division, print_function, unicode_literals + +# 导入必要模块:JSON处理、文件操作、正则、线程、警告等 +import json +import os +import re +import shutil +import threading +import warnings + +import six # 兼容Python 2/3 +from django.conf import settings +from django.core.exceptions import ImproperlyConfigured # Django配置异常 +from datetime import datetime +from django.utils.encoding import force_str # 字符串编码处理 +# 导入Haystack核心模块:引擎、后端、查询、结果等基础类 +from haystack.backends import BaseEngine, BaseSearchBackend, BaseSearchQuery, EmptyResults, log_query +from haystack.constants import DJANGO_CT, DJANGO_ID, ID # Haystack常量(模型类型、ID等) +from haystack.exceptions import MissingDependency, SearchBackendError, SkipDocument # Haystack异常 +from haystack.inputs import Clean, Exact, PythonData, Raw # Haystack查询输入类型 +from haystack.models import SearchResult # Haystack搜索结果模型 +from haystack.utils import get_identifier, get_model_ct # Haystack工具函数(获取唯一标识、模型类型) +from haystack.utils import log as logging # Haystack日志 +from haystack.utils.app_loading import haystack_get_model # Haystack模型加载工具 +from jieba.analyse import ChineseAnalyzer # 结巴中文分词器(用于中文搜索) +# 导入Whoosh核心模块:索引、分析器、字段、存储、高亮、查询解析、搜索结果等 +from whoosh import index +from whoosh.analysis import StemmingAnalyzer # Whoosh英文词干分析器 +from whoosh.fields import BOOLEAN, DATETIME, IDLIST, KEYWORD, NGRAM, NGRAMWORDS, NUMERIC, Schema, TEXT # Whoosh字段类型 +from whoosh.fields import ID as WHOOSH_ID # Whoosh ID字段(避免与Haystack的ID冲突) +from whoosh.filedb.filestore import FileStorage, RamStorage # Whoosh文件存储/内存存储 +from whoosh.highlight import ContextFragmenter, HtmlFormatter # Whoosh高亮相关 +from whoosh.highlight import highlight as whoosh_highlight # Whoosh高亮函数 +from whoosh.qparser import QueryParser # Whoosh查询解析器 +from whoosh.searching import ResultsPage # Whoosh分页结果 +from whoosh.writing import AsyncWriter # Whoosh异步写入器(提高写入效率) + + +# 检查Whoosh依赖是否安装 +try: + import whoosh +except ImportError: + raise MissingDependency( + "The 'whoosh' backend requires the installation of 'Whoosh'. Please refer to the documentation.") + +# 检查Whoosh版本(要求2.5.0及以上) +if not hasattr(whoosh, '__version__') or whoosh.__version__ < (2, 5, 0): + raise MissingDependency( + "The 'whoosh' backend requires version 2.5.0 or greater.") + + +# 正则表达式:匹配ISO格式的日期时间字符串(用于Whoosh与Python datetime转换) +DATETIME_REGEX = re.compile( + '^(?P\d{4})-(?P\d{2})-(?P\d{2})T(?P\d{2}):(?P\d{2}):(?P\d{2})(\.\d{3,6}Z?)?$') +# 线程本地存储:用于共享内存存储(RamStorage),避免多线程冲突 +LOCALS = threading.local() +LOCALS.RAM_STORE = None + + +class WhooshHtmlFormatter(HtmlFormatter): + """ + 自定义Whoosh HTML高亮格式化器 + 简化默认格式,确保与其他搜索后端(如Solr、Elasticsearch)的高亮结果格式一致 + 使用标签包裹高亮文本(默认格式) + """ + template = '<%(tag)s>%(t)s' + + +class WhooshSearchBackend(BaseSearchBackend): + """ + Whoosh搜索后端实现类:继承自Haystack的BaseSearchBackend + 负责与Whoosh交互,实现索引创建、更新、删除、搜索等核心功能 + 支持中文分词(基于结巴分词) + """ + # Whoosh保留关键字(搜索时需特殊处理,避免语法错误) + RESERVED_WORDS = ( + 'AND', + 'NOT', + 'OR', + 'TO', + ) + + # Whoosh保留字符(搜索时需转义或处理,避免语法错误) + # '\\'需放在首位,防止覆盖其他斜杠替换 + RESERVED_CHARACTERS = ( + '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}', + '[', ']', '^', '"', '~', '*', '?', ':', '.', + ) + + def __init__(self, connection_alias, **connection_options): + """ + 初始化Whoosh搜索后端 + :param connection_alias: 连接别名(来自Haystack配置) + :param connection_options: 连接参数(如索引路径、存储类型等) + """ + super(WhooshSearchBackend, self).__init__(connection_alias, **connection_options) + self.setup_complete = False # 初始化完成标记(延迟初始化) + self.use_file_storage = True # 默认使用文件存储(FileStorage) + # POST请求大小限制(默认128MB) + self.post_limit = getattr(connection_options, 'POST_LIMIT', 128 * 1024 * 1024) + # 索引存储路径(从配置中获取) + self.path = connection_options.get('PATH') + + # 检查存储类型:若配置为非文件存储(如内存),则使用RamStorage + if connection_options.get('STORAGE', 'file') != 'file': + self.use_file_storage = False + + # 若使用文件存储但未配置路径,抛出配置异常 + if self.use_file_storage and not self.path: + raise ImproperlyConfigured( + "You must specify a 'PATH' in your settings for connection '%s'." % connection_alias) + + # 初始化日志记录器 + self.log = logging.getLogger('haystack') + + def setup(self): + """ + 延迟初始化:创建索引存储和Schema,初始化Whoosh索引 + 避免项目启动时立即加载,仅在首次使用搜索功能时执行 + """ + from haystack import connections # 延迟导入,避免循环导入 + new_index = False # 是否为新创建的索引(首次初始化) + + # 若使用文件存储且路径不存在,创建目录并标记为新索引 + if self.use_file_storage and not os.path.exists(self.path): + os.makedirs(self.path) + new_index = True + + # 检查文件存储路径是否可写 + if self.use_file_storage and not os.access(self.path, os.W_OK): + raise IOError( + "The path to your Whoosh index '%s' is not writable for the current user/group." % self.path) + + # 初始化存储:文件存储或内存存储 + if self.use_file_storage: + self.storage = FileStorage(self.path) + else: + global LOCALS + # 内存存储共享(线程本地存储,避免多线程重复创建) + if getattr(LOCALS, 'RAM_STORE', None) is None: + LOCALS.RAM_STORE = RamStorage() + self.storage = LOCALS.RAM_STORE + + # 构建Whoosh Schema(索引结构):从Haystack统一索引获取字段 + unified_index = connections[self.connection_alias].get_unified_index() + self.content_field_name, self.schema = self.build_schema(unified_index.all_searchfields()) + # 初始化查询解析器(基于内容字段和Schema) + self.parser = QueryParser(self.content_field_name, schema=self.schema) + + # 若为新索引,创建索引;否则打开现有索引(不存在则创建) + if new_index is True: + self.index = self.storage.create_index(self.schema) + else: + try: + self.index = self.storage.open_index(schema=self.schema) + except index.EmptyIndexError: + self.index = self.storage.create_index(self.schema) + + # 标记初始化完成 + self.setup_complete = True + + def build_schema(self, fields): + """ + 构建Whoosh Schema(索引结构):将Haystack字段映射为Whoosh字段类型 + :param fields: Haystack统一索引中的所有字段(dict,key为字段名,value为字段类) + :return: (content_field_name, schema):内容字段名(主搜索字段)、Whoosh Schema对象 + """ + # 初始化Schema字段:包含Haystack默认字段(ID、模型类型、模型ID) + schema_fields = { + ID: WHOOSH_ID(stored=True, unique=True), # 文档唯一ID(Haystack标识) + DJANGO_CT: WHOOSH_ID(stored=True), # 模型类型(如blog.Article) + DJANGO_ID: WHOOSH_ID(stored=True), # 模型主键ID + } + # 初始字段数量(用于后续检查是否有有效字段) + initial_key_count = len(schema_fields) + content_field_name = '' # 主内容字段名(标记为document=True的字段) + + # 遍历Haystack字段,映射为对应的Whoosh字段 + for field_name, field_class in fields.items(): + index_fieldname = field_class.index_fieldname # 索引中的实际字段名 + # 处理多值字段(如标签、分类) + if field_class.is_multivalued: + if not field_class.indexed: + # 非索引多值字段:使用IDLIST(存储但不索引) + schema_fields[index_fieldname] = IDLIST(stored=True, field_boost=field_class.boost) + else: + # 索引多值字段:使用KEYWORD(逗号分隔,可索引、可排序) + schema_fields[index_fieldname] = KEYWORD( + stored=True, commas=True, scorable=True, field_boost=field_class.boost) + # 处理日期/日期时间字段 + elif field_class.field_type in ['date', 'datetime']: + schema_fields[index_fieldname] = DATETIME(stored=field_class.stored, sortable=True) + # 处理整数字段 + elif field_class.field_type == 'integer': + schema_fields[index_fieldname] = NUMERIC( + stored=field_class.stored, numtype=int, field_boost=field_class.boost) + # 处理浮点数字段 + elif field_class.field_type == 'float': + schema_fields[index_fieldname] = NUMERIC( + stored=field_class.stored, numtype=float, field_boost=field_class.boost) + # 处理布尔字段 + elif field_class.field_type == 'boolean': + # Whoosh BOOLEAN字段不支持boost(2.5.0+版本) + schema_fields[index_fieldname] = BOOLEAN(stored=field_class.stored) + # 处理NGram字段(适用于模糊搜索,如拼音、部分匹配) + elif field_class.field_type == 'ngram': + schema_fields[index_fieldname] = NGRAM( + minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost) + # 处理Edge NGram字段(适用于前缀匹配,如搜索"py"匹配"Python") + elif field_class.field_type == 'edge_ngram': + schema_fields[index_fieldname] = NGRAMWORDS( + minsize=2, maxsize=15, at='start', stored=field_class.stored, field_boost=field_class.boost) + # 默认字段类型:文本字段(支持中文分词) + else: + # 替换默认的StemmingAnalyzer(英文词干)为ChineseAnalyzer(结巴中文分词) + schema_fields[index_fieldname] = TEXT( + stored=True, analyzer=ChineseAnalyzer(), field_boost=field_class.boost, sortable=True) + + # 标记主内容字段(document=True的字段,用于默认搜索) + if field_class.document is True: + content_field_name = index_fieldname + # 启用拼写检查(仅主内容字段支持) + schema_fields[index_fieldname].spelling = True + + # 检查是否有有效字段(若仅包含初始字段,说明未配置任何搜索字段) + if len(schema_fields) <= initial_key_count: + raise SearchBackendError( + "No fields were found in any search_indexes. Please correct this before attempting to search.") + + # 创建并返回Whoosh Schema + return (content_field_name, Schema(**schema_fields)) + + def update(self, index, iterable, commit=True): + """ + 更新索引:将模型对象批量添加/更新到Whoosh索引 + :param index: Haystack索引对象(对应某个模型的索引配置) + :param iterable: 模型对象迭代器(需索引的对象列表) + :param commit: 是否立即提交(此处强制提交,避免锁问题) + """ + # 若未初始化,先执行setup + if not self.setup_complete: + self.setup() + + # 刷新索引(确保获取最新状态) + self.index = self.index.refresh() + # 使用异步写入器(提高批量写入效率,避免阻塞) + writer = AsyncWriter(self.index) + + # 遍历对象,处理并写入索引 + for obj in iterable: + try: + # 准备文档数据(调用Haystack索引的full_prepare方法,处理字段值) + doc = index.full_prepare(obj) + except SkipDocument: + # 跳过无需索引的对象(如草稿文章) + self.log.debug(u"Indexing for object `%s` skipped", obj) + else: + # 转换文档值为Whoosh支持的格式(如datetime转字符串、布尔值转'true'/'false') + for key in doc: + doc[key] = self._from_python(doc[key]) + + # Whoosh 2.5.0+不支持文档级boost,删除该字段 + if 'boost' in doc: + del doc['boost'] + + try: + # 更新文档:若ID存在则更新,不存在则新增 + writer.update_document(**doc) + except Exception as e: + # 若设置为静默失败,则仅记录日志;否则抛出异常 + if not self.silently_fail: + raise + # 记录错误日志(包含对象标识,避免编码问题) + self.log.error( + u"%s while preparing object for update" % e.__class__.__name__, + exc_info=True, + extra={"data": {"index": index, "object": get_identifier(obj)}}) + + # 批量写入后强制提交(Whoosh需提交才会持久化) + if len(iterable) > 0: + writer.commit() + + def remove(self, obj_or_string, commit=True): + """ + 删除索引:从Whoosh索引中删除指定模型对象 + :param obj_or_string: 模型对象或对象唯一标识(get_identifier返回值) + :param commit: 是否立即提交(Whoosh删除后自动提交,此处参数仅为兼容) + """ + if not self.setup_complete: + self.setup() + + # 刷新索引 + self.index = self.index.refresh() + # 获取对象的唯一标识(用于Whoosh查询删除) + whoosh_id = get_identifier(obj_or_string) + + try: + # 构造查询:根据ID删除文档 + delete_query = self.parser.parse(u'%s:"%s"' % (ID, whoosh_id)) + self.index.delete_by_query(q=delete_query) + except Exception as e: + if not self.silently_fail: + raise + # 记录删除失败日志 + self.log.error( + "Failed to remove document '%s' from Whoosh: %s", + whoosh_id, e, exc_info=True) + + def clear(self, models=None, commit=True): + """ + 清空索引:删除指定模型的所有索引,或清空整个索引 + :param models: 模型列表(如[Article, Comment]),为None则清空所有 + :param commit: 是否立即提交(Whoosh删除后自动提交) + """ + if not self.setup_complete: + self.setup() + + # 刷新索引 + self.index = self.index.refresh() + + # 验证models参数是否为列表/元组 + if models is not None: + assert isinstance(models, (list, tuple)) + + try: + # 清空整个索引(效率更高:直接删除索引文件/内存存储) + if models is None: + self.delete_index() + # 仅清空指定模型的索引 + else: + models_to_delete = [] + # 遍历模型,生成模型类型查询条件(如DJANGO_CT:blog.Article) + for model in models: + models_to_delete.append(u"%s:%s" % (DJANGO_CT, get_model_ct(model))) + # 构造OR查询,删除所有匹配模型的文档 + delete_query = self.parser.parse(u" OR ".join(models_to_delete)) + self.index.delete_by_query(q=delete_query) + except Exception as e: + if not self.silently_fail: + raise + # 记录清空失败日志 + if models is not None: + self.log.error( + "Failed to clear Whoosh index of models '%s': %s", + ','.join(models_to_delete), e, exc_info=True) + else: + self.log.error("Failed to clear Whoosh index: %s", e, exc_info=True) + + def delete_index(self): + """ + 彻底删除索引:删除索引存储(文件或内存),并重新初始化 + 比clear更彻底,适用于重建索引场景 + """ + # 文件存储:删除索引目录 + if self.use_file_storage and os.path.exists(self.path): + shutil.rmtree(self.path) + # 内存存储:清空存储 + elif not self.use_file_storage: + self.storage.clean() + + # 重新初始化索引(创建新的空索引) + self.setup() + + def optimize(self): + """ + 优化索引:整理Whoosh索引文件,提高搜索效率 + Whoosh会合并小索引段,减少磁盘IO + """ + if not self.setup_complete: + self.setup() + + # 刷新并优化索引 + self.index = self.index.refresh() + self.index.optimize() + + def calculate_page(self, start_offset=0, end_offset=None): + """ + 计算分页参数:将Haystack的start/end偏移量转换为Whoosh的页码和页长 + Whoosh使用页码(1-based)和页长,而非偏移量 + :param start_offset: 起始偏移量(从0开始) + :param end_offset: 结束偏移量(不包含) + :return: (page_num, page_length):页码、页长 + """ + # 处理end_offset为0或负数的情况(避免Whoosh报错) + if end_offset is not None and end_offset <= 0: + end_offset = 1 + + # 初始化默认值 + page_num = 0 + if end_offset is None: + end_offset = 1000000 # 默认最大页长(获取所有结果) + if start_offset is None: + start_offset = 0 + + # 计算页长(end - start)和页码(start / 页长,向上取整) + page_length = end_offset - start_offset + if page_length and page_length > 0: + page_num = int(start_offset / page_length) + + # Whoosh页码为1-based,故加1 + page_num += 1 + return page_num, page_length + + @log_query + def search( + self, + query_string, + sort_by=None, + start_offset=0, + end_offset=None, + fields='', + highlight=False, + facets=None, + date_facets=None, + query_facets=None, + narrow_queries=None, + spelling_query=None, + within=None, + dwithin=None, + distance_point=None, + models=None, + limit_to_registered_models=None, + result_class=None, + **kwargs): + """ + 核心搜索方法:执行查询并返回处理后的结果 + 支持分页、排序、高亮、过滤模型等功能 + :param query_string: 搜索关键词 + :param sort_by: 排序字段列表(如['-pub_time', 'title']) + :param start_offset/end_offset: 分页偏移量 + :param highlight: 是否开启结果高亮 + :param models: 限制搜索的模型列表 + :param result_class: 搜索结果类(默认SearchResult) + :return: 搜索结果字典(含results列表、hits总数、facets、拼写建议等) + """ + # 初始化检查 + if not self.setup_complete: + self.setup() + + # 空查询字符串返回空结果 + if len(query_string) == 0: + return {'results': [], 'hits': 0} + + # 转换查询字符串为Unicode(兼容Python 2) + query_string = force_str(query_string) + + # 单字符查询(非通配符)返回空结果(通常为停用词,无意义) + if len(query_string) <= 1 and query_string != u'*': + return {'results': [], 'hits': 0} + + # 处理排序:Whoosh要求所有排序字段方向一致(均升序或均降序) + reverse = False # 是否倒序(默认升序) + if sort_by is not None: + sort_by_list = [] + reverse_counter = 0 # 倒序字段计数 + + # 统计倒序字段数量 + for order_by in sort_by: + if order_by.startswith('-'): + reverse_counter += 1 + + # Whoosh不支持混合排序方向,抛出异常 + if reverse_counter and reverse_counter != len(sort_by): + raise SearchBackendError("Whoosh requires all order_by fields to use the same sort direction") + + # 提取排序字段(去掉'-'符号),确定排序方向 + for order_by in sort_by: + if order_by.startswith('-'): + sort_by_list.append(order_by[1:]) + if len(sort_by_list) == 1: + reverse = True + else: + sort_by_list.append(order_by) + if len(sort_by_list) == 1: + reverse = False + + # Whoosh仅支持单个排序字段,取第一个 + sort_by = sort_by_list[0] + + # Whoosh不支持分面搜索(facets),给出警告 + if facets is not None: + warnings.warn("Whoosh does not handle faceting.", Warning, stacklevel=2) + if date_facets is not None: + warnings.warn("Whoosh does not handle date faceting.", Warning, stacklevel=2) + if query_facets is not None: + warnings.warn("Whoosh does not handle query faceting.", Warning, stacklevel=2) + + # 处理过滤查询(narrow_queries):限制搜索结果范围 + narrowed_results = None # 过滤后的结果集 + self.index = self.index.refresh() + + # 处理模型过滤:限制仅搜索指定模型或已注册模型 + if limit_to_registered_models is None: + # 从配置获取默认值(是否仅搜索已注册模型) + limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True) + + model_choices = [] + if models and len(models): + # 限制搜索指定模型(如[Article]) + model_choices = sorted(get_model_ct(model) for model in models) + elif limit_to_registered_models: + # 限制搜索所有已注册模型(通过Haystack路由获取) + model_choices = self.build_models_list() + + # 将模型过滤添加到narrow_queries + if len(model_choices) > 0: + if narrow_queries is None: + narrow_queries = set() + # 构造OR查询:匹配任一模型类型 + model_query = ' OR '.join(['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]) + narrow_queries.add(model_query) + + # 执行过滤查询:获取符合所有narrow_queries的结果集 + narrow_searcher = None + if narrow_queries is not None: + narrow_searcher = self.index.searcher() + for nq in narrow_queries: + # 解析过滤查询并执行(获取所有匹配结果) + nq_parsed = self.parser.parse(force_str(nq)) + recent_narrowed = narrow_searcher.search(nq_parsed, limit=None) + + # 若任一过滤条件无结果,直接返回空结果 + if len(recent_narrowed) <= 0: + return {'results': [], 'hits': 0} + + # 合并过滤结果(交集) + if narrowed_results: + narrowed_results.filter(recent_narrowed) + else: + narrowed_results = recent_narrowed + + # 刷新索引,准备执行主搜索 + self.index = self.index.refresh() + + # 若索引为空,返回空结果(含拼写建议) + if not self.index.doc_count(): + spelling_suggestion = self.create_spelling_suggestion(spelling_query or query_string) if self.include_spelling else None + return {'results': [], 'hits': 0, 'spelling_suggestion': spelling_suggestion} + + # 执行主搜索 + searcher = self.index.searcher() + try: + # 解析查询字符串 + parsed_query = self.parser.parse(query_string) + except Exception: + # 无效查询(如语法错误),返回空结果 + if not self.silently_fail: + raise + return {'results': [], 'hits': 0, 'spelling_suggestion': None} + + # 无效查询(如仅停用词),返回空结果 + if parsed_query is None: + return {'results': [], 'hits': 0, 'spelling_suggestion': None} + + # 计算分页参数 + page_num, page_length = self.calculate_page(start_offset, end_offset) + + # 构造搜索参数 + search_kwargs = { + 'pagelen': page_length, # 页长 + 'sortedby': sort_by, # 排序字段 + 'reverse': reverse # 是否倒序 + } + # 应用过滤结果(仅返回过滤后的子集) + if narrowed_results is not None: + search_kwargs['filter'] = narrowed_results + + # 执行搜索并获取分页结果 + try: + raw_page = searcher.search_page(parsed_query, page_num, **search_kwargs) + except ValueError: + # 页码超出范围(如请求第10页但仅5页),返回空结果 + if not self.silently_fail: + raise + return {'results': [], 'hits': 0, 'spelling_suggestion': None} + + # Whoosh 2.5.1+ bug:页码超出时返回错误页码,需检查 + if raw_page.pagenum < page_num: + spelling_suggestion = self.create_spelling_suggestion(spelling_query or query_string) if self.include_spelling else None + return {'results': [], 'hits': 0, 'spelling_suggestion': spelling_suggestion} + + # 处理搜索结果(转换为Haystack SearchResult,添加高亮等) + results = self._process_results( + raw_page, + highlight=highlight, + query_string=query_string, + spelling_query=spelling_query, + result_class=result_class) + + # 关闭搜索器(释放资源) + searcher.close() + if hasattr(narrow_searcher, 'close'): + narrow_searcher.close() + + return results + + def more_like_this( + self, + model_instance, + additional_query_string=None, + start_offset=0, + end_offset=None, + models=None, + limit_to_registered_models=None, + result_class=None, + **kwargs): + """ + 相似结果搜索:根据指定模型对象,查找相似的文档 + 基于Whoosh的more_like_this功能,分析主内容字段的相似度 + :param model_instance: 参考模型对象(如某篇文章) + :return: 相似结果字典(结构同search方法) + """ + if not self.setup_complete: + self.setup() + + # 获取模型的实际类(排除延迟加载模型) + model_klass = model_instance._meta.concrete_model + # 主内容字段名(用于相似度分析) + field_name = self.content_field_name + # 过滤查询和结果集 + narrow_queries = set() + narrowed_results = None + self.index = self.index.refresh() + + # 处理模型过滤(同search方法) + if limit_to_registered_models is None: + limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True) + + model_choices = [] + if models and len(models): + model_choices = sorted(get_model_ct(model) for model in models) + elif limit_to_registered_models: + model_choices = self.build_models_list() + + # 添加模型过滤条件 + if len(model_choices) > 0: + if narrow_queries is None: + narrow_queries = set() + model_query = ' OR '.join(['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]) + narrow_queries.add(model_query) + + # 添加额外过滤条件(如关键词过滤) + if additional_query_string and additional_query_string != '*': + narrow_queries.add(additional_query_string) + + # 执行过滤查询(同search方法) + narrow_searcher = None + if narrow_queries is not None: + narrow_searcher = self.index.searcher() + for nq in narrow_queries: + nq_parsed = self.parser.parse(force_str(nq)) + recent_narrowed = narrow_searcher.search(nq_parsed, limit=None) + + if len(recent_narrowed) <= 0: + return {'results': [], 'hits': 0} + + if narrowed_results: + narrowed_results.filter(recent_narrowed) + else: + narrowed_results = recent_narrowed + + # 计算分页参数 + page_num, page_length = self.calculate_page(start_offset, end_offset) + + # 刷新索引,执行相似搜索 + self.index = self.index.refresh() + raw_results = EmptyResults() # 默认空结果 + + if self.index.doc_count(): + searcher = self.index.searcher() + # 构造查询:获取参考对象的索引文档 + query = "%s:%s" % (ID, get_identifier(model_instance)) + parsed_query = self.parser.parse(query) + results = searcher.search(parsed_query) + + # 若找到参考文档,获取相似结果 + if len(results): + # 基于主内容字段查找相似文档,限制最大数量为end_offset + raw_results = results[0].more_like_this(field_name, top=end_offset) + + # 应用过滤结果 + if narrowed_results is not None and hasattr(raw_results, 'filter'): + raw_results.filter(narrowed_results) + + # 处理分页结果 + try: + raw_page = ResultsPage(raw_results, page_num, page_length) + except ValueError: + if not self.silently_fail: + raise + return {'results': [], 'hits': 0, 'spelling_suggestion': None} + + # 检查页码有效性 + if raw_page.pagenum < page_num: + return {'results': [], 'hits': 0, 'spelling_suggestion': None} + + # 处理结果并关闭搜索器 + results = self._process_results(raw_page, result_class=result_class) + searcher.close() + if hasattr(narrow_searcher, 'close'): + narrow_searcher.close() + + return results + + def _process_results( + self, + raw_page, + highlight=False, + query_string='', + spelling_query=None, + result_class=None): + """ + 处理搜索结果:将Whoosh原始结果转换为Haystack SearchResult格式 + 支持高亮、字段类型转换、拼写建议等 + :param raw_page: Whoosh ResultsPage对象(分页原始结果) + :param highlight: 是否开启高亮 + :return: 处理后的结果字典 + """ + from haystack import connections # 延迟导入 + results = [] # 最终结果列表(SearchResult对象) + hits = len(raw_page) # 总命中数(当前页) + + # 结果类默认值(Haystack SearchResult) + if result_class is None: + result_class = SearchResult + + # 初始化分面和拼写建议(Whoosh不支持分面,故为空) + facets = {} + spelling_suggestion = None + # 获取Haystack统一索引和已索引模型 + unified_index = connections[self.connection_alias].get_unified_index() + indexed_models = unified_index.get_indexed_models() + + # 遍历原始结果,转换为SearchResult + for doc_offset, raw_result in enumerate(raw_page): + # 获取文档得分(相关性) + score = raw_page.score(doc_offset) or 0 + # 提取模型类型(如blog.Article)并拆分应用标签和模型名 + app_label, model_name = raw_result[DJANGO_CT].split('.') + additional_fields = {} # 额外字段(除默认字段外的其他字段) + # 加载模型类 + model = haystack_get_model(app_label, model_name) + + # 仅处理已索引的模型 + if model and model in indexed_models: + # 遍历原始结果的所有字段,转换为Python原生类型 + for key, value in raw_result.items(): + string_key = str(key) + # 获取模型对应的Haystack索引 + index = unified_index.get_index(model) + + # 若字段在索引中定义,使用索引的convert方法转换值 + if string_key in index.fields and hasattr(index.fields[string_key], 'convert'): + field = index.fields[string_key] + # 处理多值字段(如KEYWORD类型,逗号分隔字符串转列表) + if field.is_multivalued: + if value is None or len(value) == 0: + additional_fields[string_key] = [] + else: + additional_fields[string_key] = value.split(',') + else: + # 单值字段:使用索引的convert方法转换 + additional_fields[string_key] = field.convert(value) + else: + # 未定义的字段:直接转换为Python类型 + additional_fields[string_key] = self._to_python(value) + + # 删除默认字段(DJANGO_CT、DJANGO_ID),避免重复 + del additional_fields[DJANGO_CT] + del additional_fields[DJANGO_ID] + + # 处理结果高亮 + if highlight: + # 使用英文词干分析器解析查询关键词(用于高亮匹配) + sa = StemmingAnalyzer() + # 自定义高亮格式化器(标签) + formatter = WhooshHtmlFormatter('em') + # 提取查询关键词的词干(如"running"→"run") + terms = [token.text for token in sa(query_string)] + + # 对主内容字段执行高亮 + content_value = additional_fields.get(self.content_field_name, '') + whoosh_highlighted = whoosh_highlight( + content_value, + terms, + sa, + ContextFragmenter(), # 上下文片段生成器(显示关键词前后内容) + formatter + ) + # 将高亮结果添加到额外字段 + additional_fields['highlighted'] = {self.content_field_name: [whoosh_highlighted]} + + # 创建SearchResult对象并添加到结果列表 + result = result_class( + app_label, + model_name, + raw_result[DJANGO_ID], # 模型主键ID + score, + **additional_fields + ) + results.append(result) + else: + # 跳过未索引的模型,减少命中数 + hits -= 1 + + # 生成拼写建议(若开启拼写检查) + if self.include_spelling: + spelling_suggestion = self.create_spelling_suggestion(spelling_query or query_string) + + # 返回处理后的结果字典 + return { + 'results': results, + 'hits': hits, + 'facets': facets, + 'spelling_suggestion': spelling_suggestion, + } + + def create_spelling_suggestion(self, query_string): + """ + 生成拼写建议:基于Whoosh的拼写检查功能,推荐可能的正确关键词 + :param query_string: 原始查询关键词 + :return: 拼写建议字符串(如"pytho"→"python") + """ + spelling_suggestion = None + # 获取索引阅读器和拼写校正器(基于主内容字段) + reader = self.index.reader() + corrector = reader.corrector(self.content_field_name) + cleaned_query = force_str(query_string) + + # 空查询返回None + if not query_string: + return spelling_suggestion + + # 清理查询字符串:移除Whoosh保留词和字符 + for rev_word in self.RESERVED_WORDS: + cleaned_query = cleaned_query.replace(rev_word, '') + for rev_char in self.RESERVED_CHARACTERS: + cleaned_query = cleaned_query.replace(rev_char, '') + + # 拆分关键词,逐个生成建议 + query_words = cleaned_query.split() + suggested_words = [] + for word in query_words: + # 获取每个词的最佳建议(限制1个) + suggestions = corrector.suggest(word, limit=1) + if len(suggestions) > 0: + suggested_words.append(suggestions[0]) + + # 拼接建议词为字符串 + spelling_suggestion = ' '.join(suggested_words) + return spelling_suggestion + + def _from_python(self, value): + """ + Python类型转换为Whoosh支持的格式(如datetime→字符串、布尔→'true'/'false') + 参考pysolr的转换逻辑,确保兼容性 + :param value: Python原生类型值 + :return: Whoosh支持的字符串/数值类型 + """ + # 处理日期时间:转换为ISO格式字符串(Whoosh DATETIME字段支持) + if hasattr(value, 'strftime'): + # 若仅为日期(无时间),补充时间为00:00:00 + if not hasattr(value, 'hour'): + value = datetime(value.year, value.month, value.day, 0, 0, 0) + value = value.isoformat() + # 处理布尔值:转换为'true'/'false'字符串 + elif isinstance(value, bool): + value = 'true' if value else 'false' + # 处理列表/元组:转换为逗号分隔字符串(Whoosh KEYWORD字段支持) + elif isinstance(value, (list, tuple)): + value = u','.join([force_str(v) for v in value]) + # 数值类型(整数、浮点数):保持不变(Whoosh NUMERIC字段支持) + elif isinstance(value, (six.integer_types, float)): + pass + # 其他类型:转换为字符串 + else: + value = force_str(value) + return value + + def _to_python(self, value): + """ + Whoosh返回值转换为Python原生类型(如字符串→datetime、'true'→True) + 参考pysolr的转换逻辑,确保兼容性 + :param value: Whoosh返回的字符串/数值 + :return: Python原生类型值 + """ + # 处理布尔值 + if value == 'true': + return True + elif value == 'false': + return False + + # 处理日期时间字符串(匹配ISO格式) + if value and isinstance(value, six.string_types): + possible_datetime = DATETIME_REGEX.search(value) + if possible_datetime: + # 提取日期时间组件并转换为整数 + date_values = possible_datetime.groupdict() + for dk, dv in date_values.items(): + date_values[dk] = int(dv) + # 创建datetime对象 + return datetime( + date_values['year'], + date_values['month'], + date_values['day'], + date_values['hour'], + date_values['minute'], + date_values['second'] + ) + + # 尝试JSON解析(处理列表、字典等复杂类型) + try: + converted_value = json.loads(value) + # 仅保留Python内置类型(列表、元组、集合、字典、数值等) + if isinstance(converted_value, (list, tuple, set, dict, six.integer_types, float, complex)): + return converted_value + except BaseException: + # JSON解析失败(如语法错误),跳过 + pass + + # 默认返回原始值 + return value + + +class WhooshSearchQuery(BaseSearchQuery): + """ + Whoosh搜索查询类:继承自Haystack的BaseSearchQuery + 负责构建Whoosh兼容的查询字符串,处理过滤条件、排序等 + """ + def _convert_datetime(self, date): + """ + 转换日期时间为Whoosh范围查询格式(如20240520143000) + :param date: datetime/date对象 + :return: 格式化字符串 + """ + if hasattr(date, 'hour'): + # 日期时间:格式为YYYYMMDDHHMMSS + return force_str(date.strftime('%Y%m%d%H%M%S')) + else: + # 仅日期:时间部分补000000 + return force_str(date.strftime('%Y%m%d000000')) + + def clean(self, query_fragment): + """ + 清理查询片段:处理Whoosh保留词和字符,避免语法错误 + Whoosh 1.X+不支持反斜杠转义,需用引号包裹含保留字符的词 + :param query_fragment: 原始查询片段 + :return: 清理后的查询片段 + """ + words = query_fragment.split() + cleaned_words = [] + + for word in words: + # 处理保留词:转换为小写(Whoosh保留词区分大小写,小写不视为保留词) + if word in self.backend.RESERVED_WORDS: + word = word.lower() + + # 处理保留字符:若词中含保留字符,用单引号包裹 + for char in self.backend.RESERVED_CHARACTERS: + if char in word: + word = "'%s'" % word + break + + cleaned_words.append(word) + + # 拼接清理后的词为查询片段 + return ' '.join(cleaned_words) + + def build_query_fragment(self, field, filter_type, value): + """ + 构建查询片段:根据字段、过滤类型、值,生成Whoosh兼容的查询字符串 + 支持精确匹配、模糊匹配、范围查询等多种过滤类型 + :param field: 字段名(如'title'、'content') + :param filter_type: 过滤类型(如'exact'、'contains'、'range') + :param value: 过滤值(如'Python'、[2024-01-01, 2024-05-01]) + :return: 构建后的查询片段字符串 + """ + from haystack import connections # 延迟导入 + query_frag = '' # 最终查询片段 + is_datetime = False # 是否为日期时间类型 + + # 处理非InputType值(如普通字符串、列表、datetime对象) + if not hasattr(value, 'input_type_name'): + # 处理ValuesListQuerySet:转换为列表 + if hasattr(value, 'values_list'): + value = list(value) + # 检查是否为日期时间类型 + if hasattr(value, 'strftime'): + is_datetime = True + # 字符串值:默认使用Clean输入类型(清理特殊字符) + if isinstance(value, six.string_types) and value != ' ': + value = Clean(value) + # 其他类型:使用PythonData输入类型(直接传递值) + else: + value = PythonData(value) + + # 准备查询值(调用InputType的prepare方法,如Exact会添加引号) + prepared_value = value.prepare(self) + + # 转换值为Whoosh支持的格式(如列表→逗号分隔字符串) + if not isinstance(prepared_value, (set, list, tuple)): + prepared_value = self.backend._from_python(prepared_value) + + # 处理"content"字段(Haystack保留字段,代表"所有字段",无需指定字段名) + if field == 'content': + index_fieldname = '' + else: + # 获取字段在索引中的实际名称(支持字段别名) + index_fieldname = u'%s:' % connections[self._using].get_unified_index().get_index_fieldname(field) + + # Whoosh查询模板:不同过滤类型对应的查询格式 + filter_types = { + 'content': '%s', # 全文搜索(无字段名) + 'contains': '*%s*', # 包含匹配(如*Python*) + 'endswith': "*%s", # 后缀匹配(如*thon) + 'startswith': "%s*", # 前缀匹配(如Pyth*) + 'exact': '%s', # 精确匹配(如"Python") + 'gt': "{%s to}", # 大于(如{20240101 to}) + 'gte': "[%s to]", # 大于等于(如[20240101 to]) + 'lt': "{to %s}", # 小于(如{to 20240101}) + 'lte': "[to %s]", # 小于等于(如[to 20240101]) + 'fuzzy': u'%s~', # 模糊匹配(如Pytho~) + } + + # 处理无需后处理的值(如Raw输入类型,直接使用原始值) + if value.post_process is False: + query_frag = prepared_value + else: + # 处理文本匹配类过滤类型(content、contains、startswith等) + if filter_type in ['content', 'contains', 'startswith', 'endswith', 'fuzzy']: + # 精确匹配输入类型(Exact):直接使用准备好的值(含引号) + if value.input_type_name == 'exact': + query_frag = prepared_value + else: + # 拆分值为多个术语(如空格分隔的关键词) + terms = [] + if isinstance(prepared_value, six.string_types): + possible_values = prepared_value.split(' ') + else: + # 非字符串值(如datetime):转换为Whoosh格式 + if is_datetime is True: + prepared_value = self._convert_datetime(prepared_value) + possible_values = [prepared_value] + + # 为每个术语应用过滤模板 + for possible_value in possible_values: + term = filter_types[filter_type] % self.backend._from_python(possible_value) + terms.append(term) + + # 拼接术语(单个术语直接返回,多个术语用AND连接并加括号) + if len(terms) == 1: + query_frag = terms[0] + else: + query_frag = u"(%s)" % " AND ".join(terms) + # 处理IN过滤类型(匹配多个值中的任一) + elif filter_type == 'in': + in_options = [] + for possible_value in prepared_value: + is_dt = False + # 检查是否为日期时间类型 + if hasattr(possible_value, 'strftime'): + is_dt = True + # 转换值为Whoosh格式 + pv = self.backend._from_python(possible_value) + if is_dt is True: + pv = self._convert_datetime(pv) + # 字符串值加引号,其他值直接使用 + if isinstance(pv, six.string_types) and not is_dt: + in_options.append('"%s"' % pv) + else: + in_options.append('%s' % pv) + # 用OR连接所有选项并加括号(如("a" OR "b" OR "c")) + query_frag = "(%s)" % " OR ".join(in_options) + # 处理RANGE过滤类型(范围匹配) + elif filter_type == 'range': + # 提取范围的起始和结束值 + start = self.backend._from_python(prepared_value[0]) + end = self.backend._from_python(prepared_value[1]) + # 转换日期时间类型为Whoosh格式 + if hasattr(prepared_value[0], 'strftime'): + start = self._convert_datetime(start) + if hasattr(prepared_value[1], 'strftime'): + end = self._convert_datetime(end) + # 范围查询格式(如[20240101 to 20240501]) + query_frag = u"[%s to %s]" % (start, end) + # 处理EXACT过滤类型(精确匹配) + elif filter_type == 'exact': + # 精确匹配输入类型:直接使用准备好的值 + if value.input_type_name == 'exact': + query_frag = prepared_value + else: + # 其他输入类型:转换为Exact格式(加引号) + prepared_value = Exact(prepared_value).prepare(self) + query_frag = filter_types[filter_type] % prepared_value + # 其他过滤类型(如gt、gte等) + else: + # 日期时间类型转换为Whoosh格式 + if is_datetime is True: + prepared_value = self._convert_datetime(prepared_value) + # 应用过滤模板 + query_frag = filter_types[filter_type] % prepared_value + + # 非Raw输入类型:若查询片段无括号,添加括号(确保逻辑正确) + if len(query_frag) and not isinstance(value, Raw): + if not query_frag.startswith('(') and not query_frag.endswith(')'): + query_frag = "(%s)" % query_frag + + # 拼接字段名和查询片段(如"title:(Python)") + return u"%s%s" % (index_fieldname, query_frag) + + +class WhooshEngine(BaseEngine): + """ + Whoosh搜索引擎类:继承自Haystack的BaseEngine + 绑定Whoosh搜索后端和查询类,供Haystack调用 + """ + backend = WhooshSearchBackend # 关联Whoosh搜索后端 + query = WhooshSearchQuery # 关联Whoosh搜索查询 \ No newline at end of file