|
|
# Zxy 导入未来支持模块,确保代码在 Python 2 和 3 中的兼容性
|
|
|
# encoding: utf-8
|
|
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
|
|
|
|
# Zxy 导入标准库模块
|
|
|
import json
|
|
|
import os
|
|
|
import re
|
|
|
import shutil
|
|
|
import threading
|
|
|
import warnings
|
|
|
|
|
|
# Zxy 导入第三方库
|
|
|
import six
|
|
|
from django.conf import settings
|
|
|
from django.core.exceptions import ImproperlyConfigured
|
|
|
from datetime import datetime
|
|
|
from django.utils.encoding import force_str
|
|
|
from haystack.backends import BaseEngine, BaseSearchBackend, BaseSearchQuery, EmptyResults, log_query
|
|
|
from haystack.constants import DJANGO_CT, DJANGO_ID, ID
|
|
|
from haystack.exceptions import MissingDependency, SearchBackendError, SkipDocument
|
|
|
from haystack.inputs import Clean, Exact, PythonData, Raw
|
|
|
from haystack.models import SearchResult
|
|
|
from haystack.utils import get_identifier, get_model_ct
|
|
|
from haystack.utils import log as logging
|
|
|
from haystack.utils.app_loading import haystack_get_model
|
|
|
from jieba.analyse import ChineseAnalyzer
|
|
|
from whoosh import index
|
|
|
from whoosh.analysis import StemmingAnalyzer
|
|
|
from whoosh.fields import BOOLEAN, DATETIME, IDLIST, KEYWORD, NGRAM, NGRAMWORDS, NUMERIC, Schema, TEXT
|
|
|
from whoosh.fields import ID as WHOOSH_ID
|
|
|
from whoosh.filedb.filestore import FileStorage, RamStorage
|
|
|
from whoosh.highlight import ContextFragmenter, HtmlFormatter
|
|
|
from whoosh.highlight import highlight as whoosh_highlight
|
|
|
from whoosh.qparser import QueryParser
|
|
|
from whoosh.searching import ResultsPage
|
|
|
from whoosh.writing import AsyncWriter
|
|
|
|
|
|
# Zxy 尝试导入 whoosh 库,如果失败则抛出依赖缺失异常
|
|
|
# Turn a missing Whoosh installation into Haystack's MissingDependency
# error so the failure names the backend that needs it.
try:
    import whoosh
except ImportError:
    raise MissingDependency(
        "The 'whoosh' backend requires the installation of 'Whoosh'. Please refer to the documentation.")

# Refuse to run when the installed package exposes no ``__version__`` or
# reports a version lower than (2, 5, 0).
if not hasattr(whoosh, '__version__') or whoosh.__version__ < (2, 5, 0):
    raise MissingDependency(
        "The 'whoosh' backend requires version 2.5.0 or greater.")
|
|
|
|
|
|
# Zxy 定义一个正则表达式,用于匹配 ISO 8601 格式的日期时间字符串
|
|
|
# Matches timestamps of the form ``YYYY-MM-DDTHH:MM:SS`` with an optional
# fractional part of 3-6 digits and an optional trailing ``Z``. Only the six
# named groups are captured by name; the fractional part is discarded.
# The pattern is a raw string so that ``\d`` and ``\.`` reach the regex
# engine verbatim instead of being treated as (invalid) string escapes.
DATETIME_REGEX = re.compile(
    r'^(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})T(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})(\.\d{3,6}Z?)?$')

# Thread-local holder for the in-memory (RAM) index storage.
LOCALS = threading.local()

# Attributes on a ``threading.local`` are per thread, so this only initialises
# ``RAM_STORE`` for the thread that imports the module; readers in other
# threads must use ``getattr(LOCALS, 'RAM_STORE', None)``.
LOCALS.RAM_STORE = None
|
|
|
|
|
|
|
|
|
# Zxy 定义一个自定义的 HTML 格式化器,继承自 Whoosh 的 HtmlFormatter
|
|
|
class WhooshHtmlFormatter(HtmlFormatter):
    """
    This is a HtmlFormatter simpler than the whoosh.HtmlFormatter.

    We use it to have consistent results across backends. Specifically,
    Solr, Xapian and Elasticsearch are using this formatting.
    """
    # Markup emitted for each highlighted term: the text (``t``) wrapped in
    # a plain ``tag`` element with no attributes.
    template = '<%(tag)s>%(t)s</%(tag)s>'
|
|
|
|
|
|
|
|
|
# Zxy 定义 Whoosh 搜索后端类,继承自 Haystack 的 BaseSearchBackend
|
|
|
class WhooshSearchBackend(BaseSearchBackend):
|
|
|
# Zxy 定义 Whoosh 保留的关键字列表,这些关键字在查询中有特殊含义
|
|
|
# Upper-case operator words of the query language. ``clean()`` lower-cases
# them in user input and ``create_spelling_suggestion()`` drops them.
RESERVED_WORDS = (
    'AND',
    'NOT',
    'OR',
    'TO',
)

# Characters with special meaning in query strings. A word containing any of
# them is quoted by ``clean()``; spelling suggestions strip them entirely.
# The '\\' must come first, so as not to overwrite the other slash
# replacements.
RESERVED_CHARACTERS = (
    '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
    '[', ']', '^', '"', '~', '*', '?', ':', '.',
)
|
|
|
|
|
|
# Zxy 初始化方法,设置连接别名和连接选项
|
|
|
def __init__(self, connection_alias, **connection_options):
    """
    Configure the backend from one Haystack connection definition.

    The index itself is not opened here; that is deferred to ``setup()``.

    :param connection_alias: Name of the connection. Forwarded to the base
        class and quoted in the configuration error below.
    :param connection_options: Connection settings. Keys read here:
        ``POST_LIMIT`` (default 128 MiB), ``PATH`` (index directory) and
        ``STORAGE`` (default ``'file'``; any other value selects in-memory
        storage).
    :raises ImproperlyConfigured: if file storage is selected but no
        ``PATH`` was given.
    """
    super(WhooshSearchBackend, self).__init__(
        connection_alias, **connection_options)

    self.setup_complete = False

    # ``connection_options`` is a dict, so its keys must be read with
    # ``.get()``. The previous ``getattr()`` lookup never saw the key and
    # silently fell back to the default every time.
    self.post_limit = connection_options.get(
        'POST_LIMIT', 128 * 1024 * 1024)
    self.path = connection_options.get('PATH')

    # Anything other than the literal 'file' means "keep the index in RAM".
    self.use_file_storage = (
        connection_options.get('STORAGE', 'file') == 'file')

    if self.use_file_storage and not self.path:
        raise ImproperlyConfigured(
            "You must specify a 'PATH' in your settings for connection '%s'." %
            connection_alias)

    self.log = logging.getLogger('haystack')
|
|
|
|
|
|
|
|
|
# Zxy 设置搜索引擎,延迟加载直到需要时才执行
|
|
|
def setup(self):
    """
    Open (or create) the index; called lazily by the public methods.

    Picks the storage (directory on disk or the thread-local RAM store),
    builds the schema and query parser from the connection's unified
    index, then opens the index, creating it when the directory was just
    made or when the storage holds no index yet.

    :raises IOError: if the index directory is not writable.
    """
    from haystack import connections

    created_directory = False

    if self.use_file_storage:
        if not os.path.exists(self.path):
            os.makedirs(self.path)
            created_directory = True

        if not os.access(self.path, os.W_OK):
            raise IOError(
                "The path to your Whoosh index '%s' is not writable for the current user/group." %
                self.path)

        self.storage = FileStorage(self.path)
    else:
        # One RAM store per thread, created on first use.
        ram_store = getattr(LOCALS, 'RAM_STORE', None)

        if ram_store is None:
            ram_store = RamStorage()
            LOCALS.RAM_STORE = ram_store

        self.storage = ram_store

    unified_index = connections[self.connection_alias].get_unified_index()
    self.content_field_name, self.schema = self.build_schema(
        unified_index.all_searchfields())
    self.parser = QueryParser(self.content_field_name, schema=self.schema)

    if created_directory:
        self.index = self.storage.create_index(self.schema)
    else:
        try:
            self.index = self.storage.open_index(schema=self.schema)
        except index.EmptyIndexError:
            # The storage exists but holds no index yet.
            self.index = self.storage.create_index(self.schema)

    self.setup_complete = True
|
|
|
|
|
|
# Zxy 根据 Haystack 的搜索字段构建 Whoosh 的索引模式
|
|
|
def build_schema(self, fields):
    """
    Translate Haystack search fields into a Whoosh ``Schema``.

    :param fields: Mapping whose values are Haystack field objects; the
        keys are not used.
    :returns: ``(content_field_name, schema)`` where ``content_field_name``
        is the index field name of the field flagged ``document=True``
        (``''`` when there is none).
    :raises SearchBackendError: if ``fields`` contributed nothing beyond
        the three built-in identifier fields.
    """
    schema_fields = {
        ID: WHOOSH_ID(stored=True, unique=True),
        DJANGO_CT: WHOOSH_ID(stored=True),
        DJANGO_ID: WHOOSH_ID(stored=True),
    }
    # Remember how many fields are built in so we can tell later whether
    # the caller's fields added anything.
    builtin_field_total = len(schema_fields)
    content_field_name = ''

    for _unused_name, field_class in fields.items():
        column = field_class.index_fieldname

        if field_class.is_multivalued:
            if field_class.indexed is False:
                whoosh_field = IDLIST(
                    stored=True, field_boost=field_class.boost)
            else:
                whoosh_field = KEYWORD(
                    stored=True, commas=True, scorable=True,
                    field_boost=field_class.boost)
        elif field_class.field_type in ['date', 'datetime']:
            whoosh_field = DATETIME(
                stored=field_class.stored, sortable=True)
        elif field_class.field_type == 'integer':
            whoosh_field = NUMERIC(
                stored=field_class.stored, numtype=int,
                field_boost=field_class.boost)
        elif field_class.field_type == 'float':
            whoosh_field = NUMERIC(
                stored=field_class.stored, numtype=float,
                field_boost=field_class.boost)
        elif field_class.field_type == 'boolean':
            # Field boost isn't supported on BOOLEAN as of 1.8.2.
            whoosh_field = BOOLEAN(stored=field_class.stored)
        elif field_class.field_type == 'ngram':
            whoosh_field = NGRAM(
                minsize=3, maxsize=15, stored=field_class.stored,
                field_boost=field_class.boost)
        elif field_class.field_type == 'edge_ngram':
            whoosh_field = NGRAMWORDS(
                minsize=2, maxsize=15, at='start',
                stored=field_class.stored,
                field_boost=field_class.boost)
        else:
            # Everything else is free text, tokenised with jieba's
            # ChineseAnalyzer rather than Whoosh's StemmingAnalyzer.
            whoosh_field = TEXT(
                stored=True, analyzer=ChineseAnalyzer(),
                field_boost=field_class.boost, sortable=True)

        schema_fields[column] = whoosh_field

        if field_class.document is True:
            content_field_name = column
            whoosh_field.spelling = True

    # Fail with a clear message instead of building an unusable schema.
    if len(schema_fields) <= builtin_field_total:
        raise SearchBackendError(
            "No fields were found in any search_indexes. Please correct this before attempting to search.")

    return (content_field_name, Schema(**schema_fields))
|
|
|
|
|
|
# Zxy 更新索引,将可迭代对象中的每个对象添加或更新到 Whoosh 索引中
|
|
|
def update(self, index, iterable, commit=True):
    """
    Add or refresh the documents for every object in ``iterable``.

    :param index: Search index object; only ``index.full_prepare(obj)`` is
        used here, to turn each object into a document dict.
    :param iterable: Objects to index. Any iterable works, including
        generators (its length is no longer taken).
    :param commit: Accepted for API compatibility and ignored: the writer
        is always committed when at least one object was seen.
    :raises Exception: whatever ``update_document`` raises, unless
        ``self.silently_fail`` is set, in which case it is logged.
    """
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    writer = AsyncWriter(self.index)
    saw_objects = False

    try:
        for obj in iterable:
            saw_objects = True

            try:
                doc = index.full_prepare(obj)
            except SkipDocument:
                self.log.debug(u"Indexing for object `%s` skipped", obj)
                continue

            # Normalise every value into something Whoosh accepts.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            # Document boosts aren't supported in Whoosh 2.5.0+.
            if 'boost' in doc:
                del doc['boost']

            try:
                writer.update_document(**doc)
            except Exception as e:
                if not self.silently_fail:
                    raise

                # Log the identifier rather than the object itself, to
                # avoid encoding errors while formatting the log record.
                self.log.error(
                    u"%s while preparing object for update" %
                    e.__class__.__name__,
                    exc_info=True,
                    extra={
                        "data": {
                            "index": index,
                            "object": get_identifier(obj)}})
    except BaseException:
        # Do not leave a half-used writer (and its index lock) behind when
        # the error propagates to the caller.
        writer.cancel()
        raise

    if saw_objects:
        # For now, commit no matter what, as we run into locking issues
        # otherwise.
        writer.commit()
    else:
        # Nothing to write: release the writer instead of abandoning it.
        writer.cancel()
|
|
|
|
|
|
# Zxy 从索引中移除一个对象
|
|
|
def remove(self, obj_or_string, commit=True):
    """
    Delete the document that belongs to ``obj_or_string`` from the index.

    :param obj_or_string: Passed to ``get_identifier()`` to obtain the
        document id.
    :param commit: Not used by this method.
    :raises Exception: any failure while parsing or deleting, unless
        ``self.silently_fail`` is set, in which case it is logged.
    """
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    whoosh_id = get_identifier(obj_or_string)

    try:
        # The id is quoted so it is matched as one exact value.
        id_query = self.parser.parse(u'%s:"%s"' % (ID, whoosh_id))
        self.index.delete_by_query(q=id_query)
    except Exception as e:
        if not self.silently_fail:
            raise

        self.log.error(
            "Failed to remove document '%s' from Whoosh: %s",
            whoosh_id,
            e,
            exc_info=True)
|
|
|
|
|
|
# Zxy 清空索引,可以清空所有内容或指定模型的内容
|
|
|
def clear(self, models=None, commit=True):
    """
    Empty the index, either completely or for the given models only.

    :param models: ``None`` to wipe the whole index via ``delete_index()``,
        or a list/tuple of models whose documents are deleted by query.
    :param commit: Not used by this method.
    :raises AssertionError: if ``models`` is neither ``None`` nor a
        list/tuple.
    :raises Exception: any failure while clearing, unless
        ``self.silently_fail`` is set, in which case it is logged.
    """
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    clear_everything = models is None

    if not clear_everything:
        assert isinstance(models, (list, tuple))

    try:
        if clear_everything:
            self.delete_index()
        else:
            models_to_delete = []

            for model in models:
                content_type = get_model_ct(model)
                models_to_delete.append(
                    u"%s:%s" % (DJANGO_CT, content_type))

            # One query matching any of the requested content types.
            combined_query = u" OR ".join(models_to_delete)
            self.index.delete_by_query(q=self.parser.parse(combined_query))
    except Exception as e:
        if not self.silently_fail:
            raise

        if clear_everything:
            self.log.error(
                "Failed to clear Whoosh index: %s", e, exc_info=True)
        else:
            self.log.error(
                "Failed to clear Whoosh index of models '%s': %s",
                ','.join(models_to_delete),
                e,
                exc_info=True)
|
|
|
|
|
|
# Zxy 物理删除整个索引
|
|
|
def delete_index(self):
    """
    Throw the whole index away and rebuild an empty one via ``setup()``.

    Per the Whoosh mailing list, if wiping out everything from the index,
    it's much more efficient to simply delete the index files.
    """
    if not self.use_file_storage:
        # In-memory index: ask the storage to clean itself.
        self.storage.clean()
    elif os.path.exists(self.path):
        # On-disk index: remove the directory tree in one go.
        shutil.rmtree(self.path)

    # Recreate the storage and an empty index.
    self.setup()
|
|
|
|
|
|
# Zxy 优化索引,合并索引段以提高搜索性能
|
|
|
def optimize(self):
    """
    Refresh the index handle and run Whoosh's ``optimize()`` on it.

    Runs ``setup()`` first when the backend has not been initialised yet.
    """
    if not self.setup_complete:
        self.setup()

    refreshed_index = self.index.refresh()
    self.index = refreshed_index
    refreshed_index.optimize()
|
|
|
|
|
|
# Zxy 根据偏移量计算 Whoosh 分页所需的页码和每页大小
|
|
|
def calculate_page(self, start_offset=0, end_offset=None):
    """
    Convert a ``[start_offset, end_offset)`` slice into Whoosh paging values.

    :param start_offset: First result wanted; ``None`` is treated as 0.
    :param end_offset: One past the last result wanted; ``None`` means
        1000000, and values <= 0 are bumped to 1 so Whoosh does not choke.
    :returns: ``(page_num, page_length)`` with a 1-based ``page_num``.
        ``page_length`` is ``end_offset - start_offset`` and is returned
        as-is, even when it is zero or negative (the page is then 1).
    """
    if end_offset is None:
        end_offset = 1000000
    elif end_offset <= 0:
        end_offset = 1

    if start_offset is None:
        start_offset = 0

    page_length = end_offset - start_offset

    if page_length > 0:
        zero_based_page = int(start_offset / page_length)
    else:
        zero_based_page = 0

    # Whoosh pages are numbered from 1.
    return zero_based_page + 1, page_length
|
|
|
|
|
|
# Zxy 执行搜索查询的核心方法
|
|
|
@log_query
def search(
        self,
        query_string,
        sort_by=None,
        start_offset=0,
        end_offset=None,
        fields='',
        highlight=False,
        facets=None,
        date_facets=None,
        query_facets=None,
        narrow_queries=None,
        spelling_query=None,
        within=None,
        dwithin=None,
        distance_point=None,
        models=None,
        limit_to_registered_models=None,
        result_class=None,
        **kwargs):
    """
    Run ``query_string`` against the index and return a result dict.

    :param query_string: Query in Whoosh query syntax. Empty queries and
        single-character queries other than ``*`` return no results.
    :param sort_by: Iterable of field names, each optionally prefixed with
        ``-`` for descending order. All entries must share one direction;
        only the first field is handed to Whoosh.
    :param start_offset: Start of the requested slice (see
        ``calculate_page``).
    :param end_offset: End of the requested slice (see ``calculate_page``).
    :param highlight: When true, results carry a ``highlighted`` entry.
    :param facets: Unsupported; a warning is emitted when given.
    :param date_facets: Unsupported; a warning is emitted when given.
    :param query_facets: Unsupported; a warning is emitted when given.
    :param narrow_queries: Extra queries every result must also match.
        The caller's collection is copied, never modified.
    :param spelling_query: Text to base the spelling suggestion on instead
        of ``query_string``.
    :param models: Restrict results to these models.
    :param limit_to_registered_models: Restrict results to the models from
        ``build_models_list()``; defaults to the
        ``HAYSTACK_LIMIT_TO_REGISTERED_MODELS`` setting (``True``).
    :param result_class: Class used to build each result.
    :param fields: Not used by this method.
    :param within: Not used by this method.
    :param dwithin: Not used by this method.
    :param distance_point: Not used by this method.
    :returns: dict with ``results`` and ``hits``; most paths also include
        ``spelling_suggestion`` and, on success, ``facets``.
    :raises SearchBackendError: if ``sort_by`` mixes sort directions.
    :raises ValueError: from Whoosh paging, unless ``self.silently_fail``.
    """
    if not self.setup_complete:
        self.setup()

    # A zero length query should return no results.
    if len(query_string) == 0:
        return {
            'results': [],
            'hits': 0,
        }

    query_string = force_str(query_string)

    # A one-character query (non-wildcard) gets nabbed by a stopwords
    # filter and should yield zero results.
    if len(query_string) <= 1 and query_string != u'*':
        return {
            'results': [],
            'hits': 0,
        }

    reverse = False

    if sort_by is not None:
        # Reversing is all-or-nothing in Whoosh, so every field has to
        # point the same way.
        descending_flags = [
            order_by.startswith('-') for order_by in sort_by]

        if any(descending_flags) and not all(descending_flags):
            raise SearchBackendError("Whoosh requires all order_by fields"
                                     " to use the same sort direction")

        sort_by_list = [
            order_by[1:] if is_descending else order_by
            for order_by, is_descending in zip(sort_by, descending_flags)]

        if sort_by_list:
            reverse = descending_flags[0]
            # Only a single sort field is passed on to ``search_page``.
            sort_by = sort_by_list[0]
        else:
            # An empty ordering means "no explicit sort" instead of an
            # IndexError.
            sort_by = None

    if facets is not None:
        warnings.warn(
            "Whoosh does not handle faceting.",
            Warning,
            stacklevel=2)

    if date_facets is not None:
        warnings.warn(
            "Whoosh does not handle date faceting.",
            Warning,
            stacklevel=2)

    if query_facets is not None:
        warnings.warn(
            "Whoosh does not handle query faceting.",
            Warning,
            stacklevel=2)

    narrowed_results = None
    self.index = self.index.refresh()

    if limit_to_registered_models is None:
        limit_to_registered_models = getattr(
            settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

    if models and len(models):
        model_choices = sorted(get_model_ct(model) for model in models)
    elif limit_to_registered_models:
        # Narrow the results to the models this router handles.
        model_choices = self.build_models_list()
    else:
        model_choices = []

    # Work on a private copy so the caller's set is never modified.
    if narrow_queries is None:
        narrow_queries = set()
    else:
        narrow_queries = set(narrow_queries)

    if len(model_choices) > 0:
        narrow_queries.add(' OR '.join(
            ['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

    narrow_searcher = None
    searcher = None

    # Every exit below, early or not, goes through the ``finally`` clause so
    # that no searcher is left open.
    try:
        if narrow_queries:
            # Potentially expensive, but Whoosh offers no other way.
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(
                    self.parser.parse(force_str(nq)), limit=None)

                # One empty narrowing query empties the whole result.
                if len(recent_narrowed_results) <= 0:
                    return {
                        'results': [],
                        'hits': 0,
                    }

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        self.index = self.index.refresh()

        if not self.index.doc_count():
            # Empty index: nothing to search, but a spelling suggestion
            # may still be requested.
            spelling_suggestion = None

            if self.include_spelling:
                spelling_suggestion = self.create_spelling_suggestion(
                    spelling_query or query_string)

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': spelling_suggestion,
            }

        searcher = self.index.searcher()
        parsed_query = self.parser.parse(query_string)

        # In the event of an invalid/stopworded query, recover gracefully.
        if parsed_query is None:
            return {
                'results': [],
                'hits': 0,
            }

        page_num, page_length = self.calculate_page(
            start_offset, end_offset)

        search_kwargs = {
            'pagelen': page_length,
            'sortedby': sort_by,
            'reverse': reverse,
        }

        if narrowed_results is not None:
            search_kwargs['filter'] = narrowed_results

        try:
            raw_page = searcher.search_page(
                parsed_query,
                page_num,
                **search_kwargs
            )
        except ValueError:
            if not self.silently_fail:
                raise

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        # Because as of Whoosh 2.5.1, it will return the wrong page of
        # results if you request something too high.
        if raw_page.pagenum < page_num:
            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        # The results are fully converted before the searchers close.
        return self._process_results(
            raw_page,
            highlight=highlight,
            query_string=query_string,
            spelling_query=spelling_query,
            result_class=result_class)
    finally:
        if searcher is not None:
            searcher.close()

        if narrow_searcher is not None:
            narrow_searcher.close()
|
|
|
|
|
|
# Zxy 实现“更多类似于此”功能,根据给定模型实例查找相似文档
|
|
|
def more_like_this(
        self,
        model_instance,
        additional_query_string=None,
        start_offset=0,
        end_offset=None,
        models=None,
        limit_to_registered_models=None,
        result_class=None,
        **kwargs):
    """
    Return documents similar to the indexed document of ``model_instance``.

    :param model_instance: Object whose identifier (``get_identifier``)
        locates the reference document in the index.
    :param additional_query_string: Extra query that results must match;
        ignored when empty or ``'*'``.
    :param start_offset: Start of the requested slice (see
        ``calculate_page``).
    :param end_offset: End of the requested slice; also passed to Whoosh
        as ``top``.
    :param models: Restrict results to these models.
    :param limit_to_registered_models: Restrict results to the models from
        ``build_models_list()``; defaults to the
        ``HAYSTACK_LIMIT_TO_REGISTERED_MODELS`` setting (``True``).
    :param result_class: Class used to build each result.
    :returns: dict with ``results`` and ``hits``; most paths also include
        ``spelling_suggestion`` and, on success, ``facets``.
    :raises ValueError: from Whoosh paging, unless ``self.silently_fail``.
    """
    if not self.setup_complete:
        self.setup()

    field_name = self.content_field_name
    narrow_queries = set()
    narrowed_results = None
    self.index = self.index.refresh()

    if limit_to_registered_models is None:
        limit_to_registered_models = getattr(
            settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

    if models and len(models):
        model_choices = sorted(get_model_ct(model) for model in models)
    elif limit_to_registered_models:
        # Narrow the results to the models this router handles.
        model_choices = self.build_models_list()
    else:
        model_choices = []

    if len(model_choices) > 0:
        narrow_queries.add(' OR '.join(
            ['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

    if additional_query_string and additional_query_string != '*':
        narrow_queries.add(additional_query_string)

    narrow_searcher = None
    # Stays ``None`` when the index is empty, so the cleanup below never
    # touches an unbound name.
    searcher = None

    # Every exit below goes through ``finally`` so no searcher stays open.
    try:
        if narrow_queries:
            # Potentially expensive, but Whoosh offers no other way.
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(
                    self.parser.parse(force_str(nq)), limit=None)

                # One empty narrowing query empties the whole result.
                if len(recent_narrowed_results) <= 0:
                    return {
                        'results': [],
                        'hits': 0,
                    }

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        page_num, page_length = self.calculate_page(
            start_offset, end_offset)

        self.index = self.index.refresh()
        raw_results = EmptyResults()

        if self.index.doc_count():
            # Locate the reference document by its identifier.
            query = "%s:%s" % (ID, get_identifier(model_instance))
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query)
            results = searcher.search(parsed_query)

            if len(results):
                raw_results = results[0].more_like_this(
                    field_name, top=end_offset)

            # Apply the narrowing only to result objects that support it.
            if narrowed_results is not None and hasattr(
                    raw_results, 'filter'):
                raw_results.filter(narrowed_results)

        try:
            raw_page = ResultsPage(raw_results, page_num, page_length)
        except ValueError:
            if not self.silently_fail:
                raise

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        # Because as of Whoosh 2.5.1, it will return the wrong page of
        # results if you request something too high.
        if raw_page.pagenum < page_num:
            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        # The results are fully converted before the searchers close.
        return self._process_results(raw_page, result_class=result_class)
    finally:
        if searcher is not None:
            searcher.close()

        if narrow_searcher is not None:
            narrow_searcher.close()
|
|
|
|
|
|
|
|
|
# Zxy 处理 Whoosh 返回的原始搜索结果,转换为 Haystack 标准格式
|
|
|
def _process_results(
        self,
        raw_page,
        highlight=False,
        query_string='',
        spelling_query=None,
        result_class=None):
    """
    Convert a page of Whoosh hits into Haystack's result dictionary.

    :param raw_page: Page of hits; it is iterated, measured with ``len()``
        and asked for ``score(offset)``.
    :param highlight: When true, add a ``highlighted`` entry holding the
        highlighted content field to each result.
    :param query_string: Original query; source of the highlight terms and
        the fallback text for the spelling suggestion.
    :param spelling_query: Preferred text for the spelling suggestion.
    :param result_class: Class instantiated per hit; defaults to
        ``SearchResult``.
    :returns: dict with ``results``, ``hits``, ``facets`` (always empty)
        and ``spelling_suggestion``.
    """
    from haystack import connections

    results = []

    # Take the hit count before walking the page; hits that belong to
    # unregistered models are subtracted as they are skipped.
    hits = len(raw_page)

    if result_class is None:
        result_class = SearchResult

    facets = {}
    spelling_suggestion = None
    unified_index = connections[self.connection_alias].get_unified_index()
    indexed_models = unified_index.get_indexed_models()

    for doc_offset, raw_result in enumerate(raw_page):
        score = raw_page.score(doc_offset) or 0
        app_label, model_name = raw_result[DJANGO_CT].split('.')
        model = haystack_get_model(app_label, model_name)

        if not (model and model in indexed_models):
            hits -= 1
            continue

        # Looked up once per hit; it does not depend on the stored field
        # being converted. Named ``search_index`` to avoid shadowing the
        # imported whoosh ``index`` module.
        search_index = unified_index.get_index(model)
        additional_fields = {}

        for key, value in raw_result.items():
            string_key = str(key)
            field = None

            if string_key in search_index.fields:
                field = search_index.fields[string_key]

            if field is not None and hasattr(field, 'convert'):
                if field.is_multivalued:
                    # Multi-valued fields are stored comma-joined.
                    if value is None or len(value) == 0:
                        additional_fields[string_key] = []
                    else:
                        additional_fields[string_key] = value.split(',')
                else:
                    additional_fields[string_key] = field.convert(value)
            else:
                additional_fields[string_key] = self._to_python(value)

        # Internal bookkeeping fields are passed positionally below, not
        # as extra keyword fields.
        del additional_fields[DJANGO_CT]
        del additional_fields[DJANGO_ID]

        if highlight:
            # NOTE(review): TEXT fields are indexed with ChineseAnalyzer
            # but highlight terms are produced with StemmingAnalyzer;
            # confirm that this mismatch is intended.
            sa = StemmingAnalyzer()
            formatter = WhooshHtmlFormatter('em')
            terms = [token.text for token in sa(query_string)]

            whoosh_result = whoosh_highlight(
                additional_fields.get(self.content_field_name),
                terms,
                sa,
                ContextFragmenter(),
                formatter
            )
            additional_fields['highlighted'] = {
                self.content_field_name: [whoosh_result],
            }

        results.append(result_class(
            app_label,
            model_name,
            raw_result[DJANGO_ID],
            score,
            **additional_fields))

    if self.include_spelling:
        spelling_suggestion = self.create_spelling_suggestion(
            spelling_query or query_string)

    return {
        'results': results,
        'hits': hits,
        'facets': facets,
        'spelling_suggestion': spelling_suggestion,
    }
|
|
|
|
|
|
# Zxy 根据查询字符串创建拼写建议
|
|
|
def create_spelling_suggestion(self, query_string):
    """
    Build a spelling suggestion for ``query_string``.

    Reserved characters are stripped, reserved operator words are dropped,
    and each remaining word is replaced by the corrector's best suggestion
    for the content field. Words without a suggestion are omitted.

    :param query_string: Raw user query.
    :returns: ``None`` for an empty query; otherwise the suggested words
        joined by single spaces (possibly ``''``).
    """
    # Nothing to suggest for an empty query; bail out before opening a
    # reader that would then never be used.
    if not query_string:
        return None

    cleaned_query = force_str(query_string)

    for rev_char in self.RESERVED_CHARACTERS:
        cleaned_query = cleaned_query.replace(rev_char, '')

    # Drop reserved operators as whole words only. Substring replacement
    # would mangle ordinary words ('ORANGE' -> 'ANGE', 'NOTE' -> 'E').
    query_words = [
        word for word in cleaned_query.split()
        if word not in self.RESERVED_WORDS]

    if not query_words:
        return ''

    suggested_words = []
    reader = self.index.reader()

    try:
        corrector = reader.corrector(self.content_field_name)

        for word in query_words:
            suggestions = corrector.suggest(word, limit=1)

            if len(suggestions) > 0:
                suggested_words.append(suggestions[0])
    finally:
        # The reader was opened for this call only; always release it.
        reader.close()

    return ' '.join(suggested_words)
|
|
|
|
|
|
# Zxy 将 Python 值转换为 Whoosh 可用的字符串格式
|
|
|
def _from_python(self, value):
    """
    Converts Python values to a form Whoosh can index.

    Code courtesy of pysolr.

    Objects with ``strftime`` pass through unchanged when they also have
    ``hour``; otherwise they are widened to a midnight ``datetime``.
    Booleans become ``'true'``/``'false'``, lists and tuples become a
    comma-joined string, ints and floats are left alone, and anything else
    goes through ``force_str``.
    """
    if hasattr(value, 'strftime'):
        if hasattr(value, 'hour'):
            return value

        # Date only: pin the time to midnight.
        return datetime(value.year, value.month, value.day, 0, 0, 0)

    # Must be tested before the numeric branch: bool is a subclass of int.
    if isinstance(value, bool):
        return 'true' if value else 'false'

    if isinstance(value, (list, tuple)):
        return u','.join(force_str(item) for item in value)

    if isinstance(value, (six.integer_types, float)):
        # Leave it alone.
        return value

    return force_str(value)
|
|
|
|
|
|
# Zxy 将 Whoosh 的值转换为原生 Python 值
|
|
|
def _to_python(self, value):
    """
    Converts values from Whoosh to native Python values.

    A port of the same method in pysolr, as they deal with data the same way.

    ``'true'``/``'false'`` become booleans, strings matching
    ``DATETIME_REGEX`` become ``datetime`` objects (fractional seconds are
    dropped), and values that parse as a JSON list, dict or number are
    returned decoded. Everything else is returned unchanged.
    """
    if value == 'true':
        return True
    elif value == 'false':
        return False

    if value and isinstance(value, six.string_types):
        possible_datetime = DATETIME_REGEX.search(value)

        if possible_datetime:
            date_values = {
                name: int(text)
                for name, text in possible_datetime.groupdict().items()}

            return datetime(
                date_values['year'],
                date_values['month'],
                date_values['day'],
                date_values['hour'],
                date_values['minute'],
                date_values['second'])

    try:
        # Attempt to use json to load the values.
        converted_value = json.loads(value)
    except Exception:
        # Not JSON (or not even a string): keep the raw value. Catching
        # ``Exception`` rather than ``BaseException`` lets
        # KeyboardInterrupt and SystemExit propagate.
        return value

    # Only trust the decoded value for the common built-in types.
    if isinstance(
            converted_value,
            (list, tuple, set, dict, six.integer_types, float, complex)):
        return converted_value

    return value
|
|
|
|
|
|
# Zxy 定义 Whoosh 搜索查询类,继承自 Haystack 的 BaseSearchQuery
|
|
|
class WhooshSearchQuery(BaseSearchQuery):
|
|
|
# Zxy 将日期时间对象转换为 Whoosh 查询所需的字符串格式
|
|
|
def _convert_datetime(self, date):
    """
    Format ``date`` as a 14-digit ``YYYYMMDDHHMMSS`` string.

    Values without an ``hour`` attribute get ``000000`` as the time part.
    """
    has_time = hasattr(date, 'hour')
    pattern = '%Y%m%d%H%M%S' if has_time else '%Y%m%d000000'
    return force_str(date.strftime(pattern))
|
|
|
|
|
|
# Zxy 清理查询片段,转义 Whoosh 的保留字符
|
|
|
def clean(self, query_fragment):
|
|
|
"""
|
|
|
Provides a mechanism for sanitizing user input before presenting the
|
|
|
value to the backend.
|
|
|
|
|
|
Whoosh 1.X differs here in that you can no longer use a backslash
|
|
|
to escape reserved characters. Instead, the whole word should be
|
|
|
quoted.
|
|
|
"""
|
|
|
# Zxy 将查询片段按空格分割成单词
|
|
|
words = query_fragment.split()
|
|
|
cleaned_words = []
|
|
|
|
|
|
# Zxy 遍历每个单词进行清理
|
|
|
for word in words:
|
|
|
# Zxy 如果是保留字,则转为小写
|
|
|
if word in self.backend.RESERVED_WORDS:
|
|
|
word = word.replace(word, word.lower())
|
|
|
|
|
|
# Zxy 如果包含保留字符,则用单引号将整个单词括起来
|
|
|
for char in self.backend.RESERVED_CHARACTERS:
|
|
|
if char in word:
|
|
|
word = "'%s'" % word
|
|
|
break
|
|
|
|
|
|
cleaned_words.append(word)
|
|
|
|
|
|
# Zxy 将清理后的单词重新组合
|
|
|
return ' '.join(cleaned_words)
|
|
|
|
|
|
# Zxy 构建查询片段,根据字段、过滤类型和值生成 Whoosh 查询语法
|
|
|
def build_query_fragment(self, field, filter_type, value):
|
|
|
# Zxy 导入 haystack 连接管理器
|
|
|
from haystack import connections
|
|
|
query_frag = ''
|
|
|
is_datetime = False
|
|
|
|
|
|
# Zxy 如果值没有 input_type_name 属性,则进行类型推断
|
|
|
if not hasattr(value, 'input_type_name'):
|
|
|
# Handle when we've got a ``ValuesListQuerySet``...
|
|
|
if hasattr(value, 'values_list'):
|
|
|
value = list(value)
|
|
|
|
|
|
if hasattr(value, 'strftime'):
|
|
|
is_datetime = True
|
|
|
|
|
|
if isinstance(value, six.string_types) and value != ' ':
|
|
|
# It's not an ``InputType``. Assume ``Clean``.
|
|
|
value = Clean(value)
|
|
|
else:
|
|
|
value = PythonData(value)
|
|
|
|
|
|
# Zxy 使用 InputType 准备查询值
|
|
|
prepared_value = value.prepare(self)
|
|
|
|
|
|
# Zxy 如果准备好的值不是集合类型,则转换为 Whoosh 可用的格式
|
|
|
if not isinstance(prepared_value, (set, list, tuple)):
|
|
|
# Then convert whatever we get back to what pysolr wants if needed.
|
|
|
prepared_value = self.backend._from_python(prepared_value)
|
|
|
|
|
|
# 'content' is a special reserved word, much like 'pk' in
|
|
|
# Django's ORM layer. It indicates 'no special field'.
|
|
|
# Zxy 'content' 是特殊字段,代表所有可搜索内容
|
|
|
if field == 'content':
|
|
|
index_fieldname = ''
|
|
|
else:
|
|
|
# Zxy 获取字段在索引中的真实名称
|
|
|
index_fieldname = u'%s:' % connections[self._using].get_unified_index(
|
|
|
).get_index_fieldname(field)
|
|
|
|
|
|
# Zxy 定义不同过滤类型对应的 Whoosh 查询模板
|
|
|
filter_types = {
|
|
|
'content': '%s',
|
|
|
'contains': '*%s*',
|
|
|
'endswith': "*%s",
|
|
|
'startswith': "%s*",
|
|
|
'exact': '%s',
|
|
|
'gt': "{%s to}",
|
|
|
'gte': "[%s to]",
|
|
|
'lt': "{to %s}",
|
|
|
'lte': "[to %s]",
|
|
|
'fuzzy': u'%s~',
|
|
|
}
|
|
|
# Zxy 如果值不需要后处理,则直接使用
|
|
|
if value.post_process is False:
|
|
|
query_frag = prepared_value
|
|
|
else:
|
|
|
# Zxy 根据不同的过滤类型构建查询片段
|
|
|
if filter_type in [
|
|
|
'content',
|
|
|
'contains',
|
|
|
'startswith',
|
|
|
'endswith',
|
|
|
'fuzzy']:
|
|
|
# Zxy 如果输入类型是精确匹配,则直接使用值
|
|
|
if value.input_type_name == 'exact':
|
|
|
query_frag = prepared_value
|
|
|
else:
|
|
|
# Iterate over terms & incorportate the converted form of
|
|
|
# each into the query.
|
|
|
terms = []
|
|
|
|
|
|
if isinstance(prepared_value, six.string_types):
|
|
|
possible_values = prepared_value.split(' ')
|
|
|
else:
|
|
|
if is_datetime is True:
|
|
|
prepared_value = self._convert_datetime(
|
|
|
prepared_value)
|
|
|
|
|
|
possible_values = [prepared_value]
|
|
|
|
|
|
for possible_value in possible_values:
|
|
|
terms.append(
|
|
|
filter_types[filter_type] %
|
|
|
self.backend._from_python(possible_value))
|
|
|
|
|
|
if len(terms) == 1:
|
|
|
query_frag = terms[0]
|
|
|
else:
|
|
|
query_frag = u"(%s)" % " AND ".join(terms)
|
|
|
# Zxy 处理 'in' 过滤类型
|
|
|
elif filter_type == 'in':
|
|
|
in_options = []
|
|
|
|
|
|
for possible_value in prepared_value:
|
|
|
is_datetime = False
|
|
|
|
|
|
if hasattr(possible_value, 'strftime'):
|
|
|
is_datetime = True
|
|
|
|
|
|
pv = self.backend._from_python(possible_value)
|
|
|
|
|
|
if is_datetime is True:
|
|
|
pv = self._convert_datetime(pv)
|
|
|
|
|
|
if isinstance(pv, six.string_types) and not is_datetime:
|
|
|
in_options.append('"%s"' % pv)
|
|
|
else:
|
|
|
in_options.append('%s' % pv)
|
|
|
|
|
|
query_frag = "(%s)" % " OR ".join(in_options)
|
|
|
# Zxy 处理 'range' 过滤类型
|
|
|
elif filter_type == 'range':
|
|
|
start = self.backend._from_python(prepared_value[0])
|
|
|
end = self.backend._from_python(prepared_value[1])
|
|
|
|
|
|
if hasattr(prepared_value[0], 'strftime'):
|
|
|
start = self._convert_datetime(start)
|
|
|
|
|
|
if hasattr(prepared_value[1], 'strftime'):
|
|
|
end = self._convert_datetime(end)
|
|
|
|
|
|
query_frag = u"[%s to %s]" % (start, end)
|
|
|
# Zxy 处理 'exact' 过滤类型
|
|
|
elif filter_type == 'exact':
|
|
|
if value.input_type_name == 'exact':
|
|
|
query_frag = prepared_value
|
|
|
else:
|
|
|
prepared_value = Exact(prepared_value).prepare(self)
|
|
|
query_frag = filter_types[filter_type] % prepared_value
|
|
|
else:
|
|
|
# Zxy 处理其他类型(如 gt, gte, lt, lte)
|
|
|
if is_datetime is True:
|
|
|
prepared_value = self._convert_datetime(prepared_value)
|
|
|
|
|
|
query_frag = filter_types[filter_type] % prepared_value
|
|
|
|
|
|
# Zxy 如果查询片段不为空且不是原始查询,则用括号括起来
|
|
|
if len(query_frag) and not isinstance(value, Raw):
|
|
|
if not query_frag.startswith('(') and not query_frag.endswith(')'):
|
|
|
query_frag = "(%s)" % query_frag
|
|
|
return u"%s%s" % (index_fieldname, query_frag)
|
|
|
|
|
|
# if not filter_type in ('in', 'range'):
|
|
|
# # 'in' is a bit of a special case, as we don't want to
|
|
|
# # convert a valid list/tuple to string. Defer handling it
|
|
|
# # until later...
|
|
|
# value = self.backend._from_python(value)
|
|
|
|
|
|
# Zxy 定义 Whoosh 引擎类,继承自 Haystack 的 BaseEngine
|
|
|
class WhooshEngine(BaseEngine):
    """
    Haystack engine that pairs the Whoosh backend with its query class.
    """

    # Search backend class for this engine.
    backend = WhooshSearchBackend

    # Query class for this engine.
    query = WhooshSearchQuery
|