You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
git-test/src/DjangoBlog-master/djangoblog/whoosh_cn_backend.py

1257 lines
50 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# Zxy 导入未来支持模块,确保代码在 Python 2 和 3 中的兼容性
# encoding: utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
# Zxy 导入标准库模块
import json
import os
import re
import shutil
import threading
import warnings
# Zxy 导入第三方库
import six
from django.conf import settings
from django.core.exceptions import ImproperlyConfigured
from datetime import datetime
from django.utils.encoding import force_str
from haystack.backends import BaseEngine, BaseSearchBackend, BaseSearchQuery, EmptyResults, log_query
from haystack.constants import DJANGO_CT, DJANGO_ID, ID
from haystack.exceptions import MissingDependency, SearchBackendError, SkipDocument
from haystack.inputs import Clean, Exact, PythonData, Raw
from haystack.models import SearchResult
from haystack.utils import get_identifier, get_model_ct
from haystack.utils import log as logging
from haystack.utils.app_loading import haystack_get_model
from jieba.analyse import ChineseAnalyzer
from whoosh import index
from whoosh.analysis import StemmingAnalyzer
from whoosh.fields import BOOLEAN, DATETIME, IDLIST, KEYWORD, NGRAM, NGRAMWORDS, NUMERIC, Schema, TEXT
from whoosh.fields import ID as WHOOSH_ID
from whoosh.filedb.filestore import FileStorage, RamStorage
from whoosh.highlight import ContextFragmenter, HtmlFormatter
from whoosh.highlight import highlight as whoosh_highlight
from whoosh.qparser import QueryParser
from whoosh.searching import ResultsPage
from whoosh.writing import AsyncWriter
# Translate a missing Whoosh installation into Haystack's MissingDependency.
# NOTE(review): the ``from whoosh import ...`` statements earlier in this file
# run before this guard, so a missing package would already have raised a
# plain ImportError there -- confirm whether this guard should move above them.
try:
    import whoosh
except ImportError:
    raise MissingDependency(
        "The 'whoosh' backend requires the installation of 'Whoosh'. Please refer to the documentation.")

# Only continue when Whoosh exposes a version tuple that is at least 2.5.0.
if not (hasattr(whoosh, '__version__') and whoosh.__version__ >= (2, 5, 0)):
    raise MissingDependency(
        "The 'whoosh' backend requires version 2.5.0 or greater.")
# Matches timestamps of the form ``YYYY-MM-DDTHH:MM:SS`` with an optional
# fractional-second part (3 to 6 digits, optionally followed by ``Z``).
# Written as a raw string so ``\d`` and ``\.`` reach the regex engine verbatim
# instead of being treated as (invalid) string escape sequences.
DATETIME_REGEX = re.compile(
    r'^(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})T(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})(\.\d{3,6}Z?)?$')
# Thread-local container, so every thread keeps its own copy of the data stored on it.
LOCALS = threading.local()
# Slot for the in-memory (RAM) index storage; starts out as None.
# This assignment only runs in the thread that imports the module, which is
# why setup() reads the attribute through getattr() with a None default.
LOCALS.RAM_STORE = None
# Zxy 定义一个自定义的 HTML 格式化器,继承自 Whoosh 的 HtmlFormatter
class WhooshHtmlFormatter(HtmlFormatter):
    """
    This is a HtmlFormatter simpler than the whoosh.HtmlFormatter.
    We use it to have consistent results across backends. Specifically,
    Solr, Xapian and Elasticsearch are using this formatting.
    """
    # Markup emitted around each highlighted term: the bare tag with no extra attributes.
    template = '<%(tag)s>%(t)s</%(tag)s>'
# Zxy 定义 Whoosh 搜索后端类,继承自 Haystack 的 BaseSearchBackend
class WhooshSearchBackend(BaseSearchBackend):
# Zxy 定义 Whoosh 保留的关键字列表,这些关键字在查询中有特殊含义
RESERVED_WORDS = (
    'AND',
    'NOT',
    'OR',
    'TO',
)
# Characters treated as reserved by this backend; they are stripped from
# spelling queries and trigger quoting in WhooshSearchQuery.clean().
# The '\\' must come first, so as not to overwrite the other slash
# replacements.
RESERVED_CHARACTERS = (
    '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
    '[', ']', '^', '"', '~', '*', '?', ':', '.',
)
# Zxy 初始化方法,设置连接别名和连接选项
def __init__(self, connection_alias, **connection_options):
    """
    Record the connection settings; the index itself is opened lazily by setup().

    :param connection_alias: name of the Haystack connection this backend serves.
    :param connection_options: the connection's settings. Keys read here:
        ``POST_LIMIT`` (default 128 MiB), ``PATH`` (index directory) and
        ``STORAGE`` (``'file'`` by default; any other value selects RAM storage).
    :raises ImproperlyConfigured: when file storage is selected without ``PATH``.
    """
    super(WhooshSearchBackend, self).__init__(
        connection_alias, **connection_options)

    # setup() flips this once the schema and index are ready.
    self.setup_complete = False
    self.use_file_storage = True

    # ``connection_options`` is a plain dict, so the setting must be read with
    # .get(); getattr() looks for an *attribute* and therefore always fell
    # back to the default, silently ignoring a configured POST_LIMIT.
    self.post_limit = connection_options.get('POST_LIMIT', 128 * 1024 * 1024)
    self.path = connection_options.get('PATH')

    if connection_options.get('STORAGE', 'file') != 'file':
        self.use_file_storage = False

    if self.use_file_storage and not self.path:
        raise ImproperlyConfigured(
            "You must specify a 'PATH' in your settings for connection '%s'." %
            connection_alias)

    self.log = logging.getLogger('haystack')
# Zxy 设置搜索引擎,延迟加载直到需要时才执行
def setup(self):
    """
    Open (or create) the Whoosh index on first use.

    Chooses file or RAM storage, builds the schema from the connection's
    unified index, creates the query parser, and finally opens the index,
    creating it when the directory is new or the storage holds no index.

    :raises IOError: when the configured index directory is not writable.
    """
    from haystack import connections

    directory_created = False

    if self.use_file_storage:
        # Create the index directory on demand and remember that we did so.
        if not os.path.exists(self.path):
            os.makedirs(self.path)
            directory_created = True

        if not os.access(self.path, os.W_OK):
            raise IOError(
                "The path to your Whoosh index '%s' is not writable for the current user/group." %
                self.path)

        self.storage = FileStorage(self.path)
    else:
        # RAM storage is kept on the thread-local so it is created only once per thread.
        if getattr(LOCALS, 'RAM_STORE', None) is None:
            LOCALS.RAM_STORE = RamStorage()
        self.storage = LOCALS.RAM_STORE

    unified_index = connections[self.connection_alias].get_unified_index()
    self.content_field_name, self.schema = self.build_schema(
        unified_index.all_searchfields())
    self.parser = QueryParser(self.content_field_name, schema=self.schema)

    if directory_created:
        self.index = self.storage.create_index(self.schema)
    else:
        try:
            self.index = self.storage.open_index(schema=self.schema)
        except index.EmptyIndexError:
            # The storage exists but holds no index yet.
            self.index = self.storage.create_index(self.schema)

    self.setup_complete = True
# Zxy 根据 Haystack 的搜索字段构建 Whoosh 的索引模式
def build_schema(self, fields):
    """
    Translate Haystack search fields into a Whoosh ``Schema``.

    :param fields: mapping of field name to Haystack field object.
    :returns: ``(content_field_name, schema)``; ``content_field_name`` is the
        index name of the field flagged ``document=True`` ('' when none is).
    :raises SearchBackendError: when ``fields`` contributes no schema entries.
    """
    schema_fields = {
        ID: WHOOSH_ID(stored=True, unique=True),
        DJANGO_CT: WHOOSH_ID(stored=True),
        DJANGO_ID: WHOOSH_ID(stored=True),
    }
    # Number of built-in entries, used below to detect "no user fields".
    builtin_count = len(schema_fields)
    content_field_name = ''

    for _name, field_class in fields.items():
        target = field_class.index_fieldname

        if field_class.is_multivalued:
            if field_class.indexed is False:
                whoosh_field = IDLIST(
                    stored=True, field_boost=field_class.boost)
            else:
                whoosh_field = KEYWORD(
                    stored=True, commas=True, scorable=True,
                    field_boost=field_class.boost)
        elif field_class.field_type in ['date', 'datetime']:
            whoosh_field = DATETIME(
                stored=field_class.stored, sortable=True)
        elif field_class.field_type == 'integer':
            whoosh_field = NUMERIC(
                stored=field_class.stored, numtype=int,
                field_boost=field_class.boost)
        elif field_class.field_type == 'float':
            whoosh_field = NUMERIC(
                stored=field_class.stored, numtype=float,
                field_boost=field_class.boost)
        elif field_class.field_type == 'boolean':
            # Field boost isn't supported on BOOLEAN as of 1.8.2.
            whoosh_field = BOOLEAN(stored=field_class.stored)
        elif field_class.field_type == 'ngram':
            whoosh_field = NGRAM(
                minsize=3, maxsize=15, stored=field_class.stored,
                field_boost=field_class.boost)
        elif field_class.field_type == 'edge_ngram':
            whoosh_field = NGRAMWORDS(
                minsize=2, maxsize=15, at='start',
                stored=field_class.stored,
                field_boost=field_class.boost)
        else:
            # Everything else is free text, tokenised with the jieba Chinese analyzer.
            whoosh_field = TEXT(
                stored=True, analyzer=ChineseAnalyzer(),
                field_boost=field_class.boost, sortable=True)

        schema_fields[target] = whoosh_field

        if field_class.document is True:
            # The document field becomes the default search field and gets spelling support.
            content_field_name = target
            whoosh_field.spelling = True

    if len(schema_fields) <= builtin_count:
        raise SearchBackendError(
            "No fields were found in any search_indexes. Please correct this before attempting to search.")

    return (content_field_name, Schema(**schema_fields))
# Zxy 更新索引,将可迭代对象中的每个对象添加或更新到 Whoosh 索引中
def update(self, index, iterable, commit=True):
    """
    Add or replace the index documents for every object in ``iterable``.

    :param index: the Haystack ``SearchIndex`` whose ``full_prepare`` builds
        each document.
    :param iterable: objects to index. Any iterable is accepted, including
        generators (the method no longer calls ``len()`` on it).
    :param commit: accepted for API compatibility; the writer is always
        committed when at least one object was received.
    :raises Exception: whatever the writer raises, unless ``silently_fail`` is set.
    """
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    writer = AsyncWriter(self.index)

    # Tracks "iterable was non-empty" without requiring it to support len().
    received_any = False

    for obj in iterable:
        received_any = True

        try:
            doc = index.full_prepare(obj)
        except SkipDocument:
            self.log.debug(u"Indexing for object `%s` skipped", obj)
        else:
            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            # Document boosts aren't supported in Whoosh 2.5.0+.
            if 'boost' in doc:
                del doc['boost']

            try:
                writer.update_document(**doc)
            except Exception as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual
                # object to avoid the possibility of that generating encoding
                # errors while processing the log message.
                self.log.error(
                    u"%s while preparing object for update" %
                    e.__class__.__name__,
                    exc_info=True,
                    extra={
                        "data": {
                            "index": index,
                            "object": get_identifier(obj)}})

    if received_any:
        # For now, commit no matter what, as we run into locking issues
        # otherwise.
        writer.commit()
# Zxy 从索引中移除一个对象
def remove(self, obj_or_string, commit=True):
    """
    Delete the document that belongs to ``obj_or_string`` from the index.

    :param obj_or_string: a model instance or an identifier string, resolved
        through ``get_identifier``.
    :param commit: accepted for API compatibility; not read by this method.
    :raises Exception: whatever the delete raises, unless ``silently_fail`` is set.
    """
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    whoosh_id = get_identifier(obj_or_string)

    try:
        # Quote the identifier so it is matched as a single term.
        id_query = u'%s:"%s"' % (ID, whoosh_id)
        self.index.delete_by_query(q=self.parser.parse(id_query))
    except Exception as e:
        if not self.silently_fail:
            raise

        self.log.error(
            "Failed to remove document '%s' from Whoosh: %s",
            whoosh_id,
            e,
            exc_info=True)
# Zxy 清空索引,可以清空所有内容或指定模型的内容
def clear(self, models=None, commit=True):
    """
    Remove documents from the index.

    :param models: list or tuple of model classes whose documents should be
        removed; ``None`` wipes the whole index via delete_index().
    :param commit: accepted for API compatibility; not read by this method.
    :raises Exception: whatever the deletion raises, unless ``silently_fail`` is set.
    """
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()

    wipe_everything = models is None

    if not wipe_everything:
        assert isinstance(models, (list, tuple))

    try:
        if wipe_everything:
            self.delete_index()
        else:
            # One ``django_ct:<app.model>`` clause per model, OR-ed together.
            models_to_delete = []

            for model in models:
                models_to_delete.append(
                    u"%s:%s" % (DJANGO_CT, get_model_ct(model)))

            ct_query = u" OR ".join(models_to_delete)
            self.index.delete_by_query(q=self.parser.parse(ct_query))
    except Exception as e:
        if not self.silently_fail:
            raise

        if wipe_everything:
            self.log.error(
                "Failed to clear Whoosh index: %s", e, exc_info=True)
        else:
            self.log.error(
                "Failed to clear Whoosh index of models '%s': %s",
                ','.join(models_to_delete),
                e,
                exc_info=True)
# Zxy 物理删除整个索引
def delete_index(self):
    """
    Drop the whole index and recreate it empty.

    File storage removes the index directory; RAM storage is cleaned in
    place. setup() is then called to build a fresh index.
    """
    # Per the Whoosh mailing list, if wiping out everything from the index,
    # it's much more efficient to simply delete the index files.
    if self.use_file_storage:
        if os.path.exists(self.path):
            shutil.rmtree(self.path)
    else:
        self.storage.clean()

    # Recreate everything.
    self.setup()
# Zxy 优化索引,合并索引段以提高搜索性能
def optimize(self):
    """Refresh the index handle and run Whoosh's ``optimize()`` on it."""
    if not self.setup_complete:
        self.setup()

    refreshed = self.index.refresh()
    self.index = refreshed
    refreshed.optimize()
# Zxy 根据偏移量计算 Whoosh 分页所需的页码和每页大小
def calculate_page(self, start_offset=0, end_offset=None):
    """
    Convert a ``[start_offset, end_offset)`` slice into Whoosh paging values.

    :param start_offset: index of the first wanted result; ``None`` means 0.
    :param end_offset: index one past the last wanted result; ``None`` means
        1000000, and values of 0 or less are raised to 1.
    :returns: ``(page_num, page_length)`` where ``page_num`` is 1-based.
    """
    if end_offset is None:
        end_offset = 1000000
    elif end_offset <= 0:
        # Prevent against Whoosh throwing an error. Requires an end_offset
        # greater than 0.
        end_offset = 1

    if start_offset is None:
        start_offset = 0

    page_length = end_offset - start_offset

    if page_length > 0:
        # Zero-based page index, shifted by one for Whoosh's 1-based paging.
        page_num = int(start_offset / page_length) + 1
    else:
        page_num = 1

    return page_num, page_length
# Zxy 执行搜索查询的核心方法
@log_query
def search(
        self,
        query_string,
        sort_by=None,
        start_offset=0,
        end_offset=None,
        fields='',
        highlight=False,
        facets=None,
        date_facets=None,
        query_facets=None,
        narrow_queries=None,
        spelling_query=None,
        within=None,
        dwithin=None,
        distance_point=None,
        models=None,
        limit_to_registered_models=None,
        result_class=None,
        **kwargs):
    """
    Run ``query_string`` against the index and return one page of results.

    :param query_string: query in Whoosh query syntax; empty or
        single-character queries (other than ``*``) return no results.
    :param sort_by: list of field names, each optionally prefixed with ``-``;
        all entries must share one direction and only the first is used.
    :param start_offset: index of the first wanted result.
    :param end_offset: index one past the last wanted result.
    :param highlight: when true, add highlighted content to each result.
    :param facets: unsupported; a warning is emitted when given.
    :param date_facets: unsupported; a warning is emitted when given.
    :param query_facets: unsupported; a warning is emitted when given.
    :param narrow_queries: set of query strings whose matches filter the main
        query. The model restriction is added to this set in place.
    :param spelling_query: alternative text for the spelling suggestion.
    :param models: restrict results to these model classes.
    :param limit_to_registered_models: overrides the
        ``HAYSTACK_LIMIT_TO_REGISTERED_MODELS`` setting when not ``None``.
    :param result_class: class used to build each result.
    :returns: dict with ``results`` and ``hits``; most paths also carry
        ``spelling_suggestion`` and (from _process_results) ``facets``.
    :raises SearchBackendError: when ``sort_by`` mixes sort directions.
    :raises ValueError: from Whoosh paging, unless ``silently_fail`` is set.

    ``fields``, ``within``, ``dwithin``, ``distance_point`` and ``kwargs``
    are accepted but not read by this method.
    """
    if not self.setup_complete:
        self.setup()

    # A zero length query should return no results.
    if len(query_string) == 0:
        return {
            'results': [],
            'hits': 0,
        }

    query_string = force_str(query_string)

    # A one-character query (non-wildcard) gets nabbed by a stopwords
    # filter and should yield zero results.
    if len(query_string) <= 1 and query_string != u'*':
        return {
            'results': [],
            'hits': 0,
        }

    reverse = False

    if sort_by is not None:
        # Determine if we need to reverse the results and if Whoosh can
        # handle what it's being asked to sort by.
        sort_by_list = []
        reverse_counter = 0

        for order_by in sort_by:
            if order_by.startswith('-'):
                reverse_counter += 1

        if reverse_counter and reverse_counter != len(sort_by):
            raise SearchBackendError("Whoosh requires all order_by fields"
                                     " to use the same sort direction")

        for order_by in sort_by:
            if order_by.startswith('-'):
                sort_by_list.append(order_by[1:])

                if len(sort_by_list) == 1:
                    reverse = True
            else:
                sort_by_list.append(order_by)

                if len(sort_by_list) == 1:
                    reverse = False

        # Only the first field is handed to search_page.
        sort_by = sort_by_list[0]

    if facets is not None:
        warnings.warn(
            "Whoosh does not handle faceting.",
            Warning,
            stacklevel=2)

    if date_facets is not None:
        warnings.warn(
            "Whoosh does not handle date faceting.",
            Warning,
            stacklevel=2)

    if query_facets is not None:
        warnings.warn(
            "Whoosh does not handle query faceting.",
            Warning,
            stacklevel=2)

    narrowed_results = None
    self.index = self.index.refresh()

    if limit_to_registered_models is None:
        limit_to_registered_models = getattr(
            settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

    if models and len(models):
        model_choices = sorted(get_model_ct(model) for model in models)
    elif limit_to_registered_models:
        # Using narrow queries, limit the results to only models handled
        # with the current routers.
        model_choices = self.build_models_list()
    else:
        model_choices = []

    if len(model_choices) > 0:
        if narrow_queries is None:
            narrow_queries = set()

        narrow_queries.add(' OR '.join(
            ['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

    narrow_searcher = None
    searcher = None

    # Every exit below (early return, normal return or exception) passes
    # through the ``finally`` clause so that no searcher is left open.
    try:
        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in
            # Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(
                    self.parser.parse(force_str(nq)), limit=None)

                # An empty narrowing result means the final answer is empty too.
                if len(recent_narrowed_results) <= 0:
                    return {
                        'results': [],
                        'hits': 0,
                    }

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        self.index = self.index.refresh()

        if self.index.doc_count():
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query_string)

            # In the event of an invalid/stopworded query, recover gracefully.
            if parsed_query is None:
                return {
                    'results': [],
                    'hits': 0,
                }

            page_num, page_length = self.calculate_page(
                start_offset, end_offset)

            search_kwargs = {
                'pagelen': page_length,
                'sortedby': sort_by,
                'reverse': reverse,
            }

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None:
                search_kwargs['filter'] = narrowed_results

            try:
                raw_page = searcher.search_page(
                    parsed_query,
                    page_num,
                    **search_kwargs
                )
            except ValueError:
                if not self.silently_fail:
                    raise

                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            # Because as of Whoosh 2.5.1, it will return the wrong page of
            # results if you request something too high. :(
            if raw_page.pagenum < page_num:
                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            # Results must be converted while the searchers are still open.
            return self._process_results(
                raw_page,
                highlight=highlight,
                query_string=query_string,
                spelling_query=spelling_query,
                result_class=result_class)

        # Empty index: no results, but a spelling suggestion may still apply.
        spelling_suggestion = None

        if self.include_spelling:
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(
                    spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(
                    query_string)

        return {
            'results': [],
            'hits': 0,
            'spelling_suggestion': spelling_suggestion,
        }
    finally:
        if searcher is not None:
            searcher.close()

        if narrow_searcher is not None:
            narrow_searcher.close()
# Zxy 实现“更多类似于此”功能,根据给定模型实例查找相似文档
def more_like_this(
        self,
        model_instance,
        additional_query_string=None,
        start_offset=0,
        end_offset=None,
        models=None,
        limit_to_registered_models=None,
        result_class=None,
        **kwargs):
    """
    Return documents similar to the one indexed for ``model_instance``.

    :param model_instance: the object whose indexed document is the seed.
    :param additional_query_string: extra query used to narrow the results;
        ignored when empty or ``'*'``.
    :param start_offset: index of the first wanted result.
    :param end_offset: index one past the last wanted result; also passed to
        Whoosh as the ``top`` limit of the similarity search.
    :param models: restrict results to these model classes.
    :param limit_to_registered_models: overrides the
        ``HAYSTACK_LIMIT_TO_REGISTERED_MODELS`` setting when not ``None``.
    :param result_class: class used to build each result.
    :returns: dict with ``results`` and ``hits``; most paths also carry
        ``spelling_suggestion`` and (from _process_results) ``facets``.
    :raises ValueError: from Whoosh paging, unless ``silently_fail`` is set.
    """
    if not self.setup_complete:
        self.setup()

    # Similarity is computed on the main document field.
    field_name = self.content_field_name
    narrow_queries = set()
    narrowed_results = None
    self.index = self.index.refresh()

    if limit_to_registered_models is None:
        limit_to_registered_models = getattr(
            settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

    if models and len(models):
        model_choices = sorted(get_model_ct(model) for model in models)
    elif limit_to_registered_models:
        # Using narrow queries, limit the results to only models handled
        # with the current routers.
        model_choices = self.build_models_list()
    else:
        model_choices = []

    if len(model_choices) > 0:
        narrow_queries.add(' OR '.join(
            ['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

    if additional_query_string and additional_query_string != '*':
        narrow_queries.add(additional_query_string)

    narrow_searcher = None
    # Stays None when the index is empty; the cleanup below must cope with
    # that (the previous unconditional ``searcher.close()`` raised NameError).
    searcher = None

    # Every exit below passes through ``finally`` so no searcher is left open.
    try:
        # Potentially expensive? I don't see another way to do it in Whoosh...
        narrow_searcher = self.index.searcher()

        for nq in narrow_queries:
            recent_narrowed_results = narrow_searcher.search(
                self.parser.parse(force_str(nq)), limit=None)

            # An empty narrowing result means the final answer is empty too.
            if len(recent_narrowed_results) <= 0:
                return {
                    'results': [],
                    'hits': 0,
                }

            if narrowed_results:
                narrowed_results.filter(recent_narrowed_results)
            else:
                narrowed_results = recent_narrowed_results

        page_num, page_length = self.calculate_page(start_offset, end_offset)

        self.index = self.index.refresh()
        raw_results = EmptyResults()

        if self.index.doc_count():
            # Look up the seed document by its Haystack identifier.
            query = "%s:%s" % (ID, get_identifier(model_instance))
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query)
            results = searcher.search(parsed_query)

            if len(results):
                raw_results = results[0].more_like_this(
                    field_name, top=end_offset)

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None and hasattr(raw_results, 'filter'):
                raw_results.filter(narrowed_results)

        try:
            raw_page = ResultsPage(raw_results, page_num, page_length)
        except ValueError:
            if not self.silently_fail:
                raise

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        # Because as of Whoosh 2.5.1, it will return the wrong page of
        # results if you request something too high. :(
        if raw_page.pagenum < page_num:
            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        # Results must be converted while the searchers are still open.
        return self._process_results(raw_page, result_class=result_class)
    finally:
        if searcher is not None:
            searcher.close()

        if narrow_searcher is not None:
            narrow_searcher.close()
# Zxy 处理 Whoosh 返回的原始搜索结果,转换为 Haystack 标准格式
def _process_results(
        self,
        raw_page,
        highlight=False,
        query_string='',
        spelling_query=None,
        result_class=None):
    """
    Convert a Whoosh results page into Haystack's result dictionary.

    :param raw_page: page of Whoosh hits; must support ``len()``, iteration
        and ``score(offset)``.
    :param highlight: when true, add a ``highlighted`` entry for the content
        field to every result.
    :param query_string: the user's query; source of the highlight terms and
        the fallback text for the spelling suggestion.
    :param spelling_query: preferred text for the spelling suggestion.
    :param result_class: class used to build each result; defaults to
        ``SearchResult``.
    :returns: dict with ``results``, ``hits``, ``facets`` (always empty) and
        ``spelling_suggestion``.
    """
    from haystack import connections
    results = []

    # It's important to grab the hits first before slicing. Otherwise, this
    # can cause pagination failures.
    hits = len(raw_page)

    if result_class is None:
        result_class = SearchResult

    facets = {}
    spelling_suggestion = None
    unified_index = connections[self.connection_alias].get_unified_index()
    indexed_models = unified_index.get_indexed_models()

    # Highlight analyzer and terms depend only on ``query_string``; they are
    # built on first use and shared by every result on the page.
    # NOTE(review): highlighting tokenises with StemmingAnalyzer although
    # build_schema() indexes text with ChineseAnalyzer -- confirm intended.
    highlight_analyzer = None
    highlight_terms = None

    for doc_offset, raw_result in enumerate(raw_page):
        score = raw_page.score(doc_offset) or 0
        app_label, model_name = raw_result[DJANGO_CT].split('.')
        additional_fields = {}
        model = haystack_get_model(app_label, model_name)

        if not (model and model in indexed_models):
            # Hits for unregistered models are dropped from the count.
            hits -= 1
            continue

        # The SearchIndex is the same for every field of this hit.
        model_index = unified_index.get_index(model)

        for key, value in raw_result.items():
            string_key = str(key)

            if string_key in model_index.fields and hasattr(
                    model_index.fields[string_key], 'convert'):
                index_field = model_index.fields[string_key]

                # Special-cased due to the nature of KEYWORD fields.
                if index_field.is_multivalued:
                    if value is None or len(value) == 0:
                        additional_fields[string_key] = []
                    else:
                        additional_fields[string_key] = value.split(',')
                else:
                    additional_fields[string_key] = index_field.convert(value)
            else:
                additional_fields[string_key] = self._to_python(value)

        # Haystack's bookkeeping fields are passed positionally below.
        del (additional_fields[DJANGO_CT])
        del (additional_fields[DJANGO_ID])

        if highlight:
            if highlight_terms is None:
                highlight_analyzer = StemmingAnalyzer()
                highlight_terms = [
                    token.text for token in highlight_analyzer(query_string)]

            whoosh_result = whoosh_highlight(
                additional_fields.get(self.content_field_name),
                highlight_terms,
                highlight_analyzer,
                ContextFragmenter(),
                WhooshHtmlFormatter('em')
            )
            additional_fields['highlighted'] = {
                self.content_field_name: [whoosh_result],
            }

        result = result_class(
            app_label,
            model_name,
            raw_result[DJANGO_ID],
            score,
            **additional_fields)
        results.append(result)

    if self.include_spelling:
        if spelling_query:
            spelling_suggestion = self.create_spelling_suggestion(
                spelling_query)
        else:
            spelling_suggestion = self.create_spelling_suggestion(
                query_string)

    return {
        'results': results,
        'hits': hits,
        'facets': facets,
        'spelling_suggestion': spelling_suggestion,
    }
# Zxy 根据查询字符串创建拼写建议
def create_spelling_suggestion(self, query_string):
    """
    Build a spelling suggestion for ``query_string``.

    Reserved characters are stripped, standalone reserved words are dropped,
    and each remaining word is replaced by the corrector's best suggestion;
    words without a suggestion are omitted.

    :param query_string: the text to correct.
    :returns: the suggested words joined by spaces (possibly ``''``), or
        ``None`` when ``query_string`` is empty.
    """
    # Nothing to correct: return before opening an index reader.
    if not query_string:
        return None

    cleaned_query = force_str(query_string)

    for rev_char in self.RESERVED_CHARACTERS:
        cleaned_query = cleaned_query.replace(rev_char, '')

    # Drop reserved words as whole tokens only. Removing them by substring
    # replacement would also mangle ordinary words ("ORANGE" -> "ANGE").
    query_words = [
        word for word in cleaned_query.split()
        if word not in self.RESERVED_WORDS
    ]

    suggested_words = []
    reader = self.index.reader()

    try:
        corrector = reader.corrector(self.content_field_name)

        for word in query_words:
            suggestions = corrector.suggest(word, limit=1)

            if len(suggestions) > 0:
                suggested_words.append(suggestions[0])
    finally:
        # The reader is opened only for this call, so release it here.
        reader.close()

    return ' '.join(suggested_words)
# Zxy 将 Python 值转换为 Whoosh 可用的字符串格式
def _from_python(self, value):
"""
Converts Python values to a string for Whoosh.
Code courtesy of pysolr.
"""
# Zxy 处理日期时间对象
if hasattr(value, 'strftime'):
# Zxy 如果只有日期没有时间,则将时间部分设为 0
if not hasattr(value, 'hour'):
value = datetime(value.year, value.month, value.day, 0, 0, 0)
# Zxy 处理布尔值
elif isinstance(value, bool):
if value:
value = 'true'
else:
value = 'false'
# Zxy 处理列表或元组,用逗号连接
elif isinstance(value, (list, tuple)):
value = u','.join([force_str(v) for v in value])
# Zxy 处理数字,保持原样
elif isinstance(value, (six.integer_types, float)):
# Leave it alone.
pass
else:
# Zxy 其他类型强制转换为字符串
value = force_str(value)
return value
# Zxy 将 Whoosh 的值转换为原生 Python 值
def _to_python(self, value):
"""
Converts values from Whoosh to native Python values.
A port of the same method in pysolr, as they deal with data the same way.
"""
# Zxy 处理布尔字符串
if value == 'true':
return True
elif value == 'false':
return False
# Zxy 尝试解析日期时间字符串
if value and isinstance(value, six.string_types):
possible_datetime = DATETIME_REGEX.search(value)
if possible_datetime:
date_values = possible_datetime.groupdict()
for dk, dv in date_values.items():
date_values[dk] = int(dv)
return datetime(
date_values['year'],
date_values['month'],
date_values['day'],
date_values['hour'],
date_values['minute'],
date_values['second'])
# Zxy 尝试使用 json 解复杂数据类型
try:
# Attempt to use json to load the values.
converted_value = json.loads(value)
# Try to handle most built-in types.
if isinstance(
converted_value,
(list,
tuple,
set,
dict,
six.integer_types,
float,
complex)):
return converted_value
except BaseException:
# If it fails (SyntaxError or its ilk) or we don't trust it,
# continue on.
pass
# Zxy 如果都无法转换,则返回原始值
return value
# Zxy 定义 Whoosh 搜索查询类,继承自 Haystack 的 BaseSearchQuery
class WhooshSearchQuery(BaseSearchQuery):
# Zxy 将日期时间对象转换为 Whoosh 查询所需的字符串格式
def _convert_datetime(self, date):
    """
    Format a date or datetime as the ``YYYYMMDDHHMMSS`` text used in queries.

    Values without an ``hour`` attribute (plain dates) get ``000000`` as
    their time part.
    """
    pattern = '%Y%m%d%H%M%S' if hasattr(date, 'hour') else '%Y%m%d000000'
    return force_str(date.strftime(pattern))
# Zxy 清理查询片段,转义 Whoosh 的保留字符
def clean(self, query_fragment):
"""
Provides a mechanism for sanitizing user input before presenting the
value to the backend.
Whoosh 1.X differs here in that you can no longer use a backslash
to escape reserved characters. Instead, the whole word should be
quoted.
"""
# Zxy 将查询片段按空格分割成单词
words = query_fragment.split()
cleaned_words = []
# Zxy 遍历每个单词进行清理
for word in words:
# Zxy 如果是保留字,则转为小写
if word in self.backend.RESERVED_WORDS:
word = word.replace(word, word.lower())
# Zxy 如果包含保留字符,则用单引号将整个单词括起来
for char in self.backend.RESERVED_CHARACTERS:
if char in word:
word = "'%s'" % word
break
cleaned_words.append(word)
# Zxy 将清理后的单词重新组合
return ' '.join(cleaned_words)
# Zxy 构建查询片段,根据字段、过滤类型和值生成 Whoosh 查询语法
def build_query_fragment(self, field, filter_type, value):
    """
    Build the Whoosh query text for a single ``field``/``filter_type``/``value`` filter.

    :param field: Haystack field name; ``'content'`` means "no specific field".
    :param filter_type: lookup name such as ``'contains'``, ``'in'``,
        ``'range'``, ``'exact'``, ``'gt'``; names other than ``'in'`` and
        ``'range'`` are looked up in the ``filter_types`` templates below.
    :param value: a raw Python value or an object exposing
        ``input_type_name``, ``prepare`` and ``post_process``.
    :returns: the query fragment, prefixed with ``<index_fieldname>:`` unless
        ``field`` is ``'content'``.
    """
    # Imported locally, as the other methods of this module do.
    from haystack import connections
    query_frag = ''
    is_datetime = False
    # Values without ``input_type_name`` are plain Python values that still need wrapping.
    if not hasattr(value, 'input_type_name'):
        # Handle when we've got a ``ValuesListQuerySet``...
        if hasattr(value, 'values_list'):
            value = list(value)
        if hasattr(value, 'strftime'):
            is_datetime = True
        if isinstance(value, six.string_types) and value != ' ':
            # It's not an ``InputType``. Assume ``Clean``.
            value = Clean(value)
        else:
            value = PythonData(value)
    # Let the input type prepare the value for this query.
    prepared_value = value.prepare(self)
    # Scalars are converted right away; collections are handled per filter type below.
    if not isinstance(prepared_value, (set, list, tuple)):
        # Then convert whatever we get back to what pysolr wants if needed.
        prepared_value = self.backend._from_python(prepared_value)
    # 'content' is a special reserved word, much like 'pk' in
    # Django's ORM layer. It indicates 'no special field'.
    if field == 'content':
        index_fieldname = ''
    else:
        # Resolve the field's name inside the index and use it as the prefix.
        index_fieldname = u'%s:' % connections[self._using].get_unified_index(
        ).get_index_fieldname(field)
    # Query templates for the filter types that are filled in with ``%``.
    filter_types = {
        'content': '%s',
        'contains': '*%s*',
        'endswith': "*%s",
        'startswith': "%s*",
        'exact': '%s',
        'gt': "{%s to}",
        'gte': "[%s to]",
        'lt': "{to %s}",
        'lte': "[to %s]",
        'fuzzy': u'%s~',
    }
    # Values that opt out of post-processing are used exactly as prepared.
    if value.post_process is False:
        query_frag = prepared_value
    else:
        # Term-style lookups: applied to each whitespace-separated term.
        if filter_type in [
                'content',
                'contains',
                'startswith',
                'endswith',
                'fuzzy']:
            # An exact input is used as prepared, without splitting into terms.
            if value.input_type_name == 'exact':
                query_frag = prepared_value
            else:
                # Iterate over terms & incorportate the converted form of
                # each into the query.
                terms = []
                if isinstance(prepared_value, six.string_types):
                    possible_values = prepared_value.split(' ')
                else:
                    if is_datetime is True:
                        prepared_value = self._convert_datetime(
                            prepared_value)
                    possible_values = [prepared_value]
                for possible_value in possible_values:
                    terms.append(
                        filter_types[filter_type] %
                        self.backend._from_python(possible_value))
                if len(terms) == 1:
                    query_frag = terms[0]
                else:
                    query_frag = u"(%s)" % " AND ".join(terms)
        # 'in': OR together every option; non-datetime strings are double-quoted.
        elif filter_type == 'in':
            in_options = []
            for possible_value in prepared_value:
                is_datetime = False
                if hasattr(possible_value, 'strftime'):
                    is_datetime = True
                pv = self.backend._from_python(possible_value)
                if is_datetime is True:
                    pv = self._convert_datetime(pv)
                if isinstance(pv, six.string_types) and not is_datetime:
                    in_options.append('"%s"' % pv)
                else:
                    in_options.append('%s' % pv)
            query_frag = "(%s)" % " OR ".join(in_options)
        # 'range': inclusive range built from the first two prepared values.
        elif filter_type == 'range':
            start = self.backend._from_python(prepared_value[0])
            end = self.backend._from_python(prepared_value[1])
            if hasattr(prepared_value[0], 'strftime'):
                start = self._convert_datetime(start)
            if hasattr(prepared_value[1], 'strftime'):
                end = self._convert_datetime(end)
            query_frag = u"[%s to %s]" % (start, end)
        # 'exact': non-exact inputs are re-prepared through ``Exact`` first.
        elif filter_type == 'exact':
            if value.input_type_name == 'exact':
                query_frag = prepared_value
            else:
                prepared_value = Exact(prepared_value).prepare(self)
                query_frag = filter_types[filter_type] % prepared_value
        else:
            # Remaining filter types (gt, gte, lt, lte) use their template directly.
            if is_datetime is True:
                prepared_value = self._convert_datetime(prepared_value)
            query_frag = filter_types[filter_type] % prepared_value
    # Wrap non-empty, non-Raw fragments in parentheses, but only when the
    # fragment neither starts with '(' nor ends with ')'.
    if len(query_frag) and not isinstance(value, Raw):
        if not query_frag.startswith('(') and not query_frag.endswith(')'):
            query_frag = "(%s)" % query_frag
    return u"%s%s" % (index_fieldname, query_frag)
# if not filter_type in ('in', 'range'):
# # 'in' is a bit of a special case, as we don't want to
# # convert a valid list/tuple to string. Defer handling it
# # until later...
# value = self.backend._from_python(value)
# Zxy 定义 Whoosh 引擎类,继承自 Haystack 的 BaseEngine
class WhooshEngine(BaseEngine):
    # Backend and query classes that make up this engine.
    backend = WhooshSearchBackend
    query = WhooshSearchQuery