|
|
|
|
@ -1,5 +1,19 @@
|
|
|
|
|
# encoding: utf-8
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
Whoosh中文搜索后端模块
|
|
|
|
|
|
|
|
|
|
本模块提供了基于Whoosh搜索引擎的中文全文搜索功能,专门针对Django Haystack框架进行定制。
|
|
|
|
|
集成了jieba中文分词器,支持中文文本的高效索引和搜索。
|
|
|
|
|
|
|
|
|
|
主要特性:
|
|
|
|
|
- 中文分词支持(使用jieba)
|
|
|
|
|
- 高性能索引和搜索
|
|
|
|
|
- 拼写建议和查询高亮
|
|
|
|
|
- 多字段类型支持(文本、数字、日期等)
|
|
|
|
|
- 与Django Haystack框架深度集成
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
@ -40,30 +54,39 @@ except ImportError:
|
|
|
|
|
raise MissingDependency(
|
|
|
|
|
"The 'whoosh' backend requires the installation of 'Whoosh'. Please refer to the documentation.")
|
|
|
|
|
|
|
|
|
|
# Handle minimum requirement: refuse to load against a Whoosh build that
# either exposes no ``__version__`` or reports a version older than 2.5.0.
if not hasattr(whoosh, '__version__') or whoosh.__version__ < (2, 5, 0):
    raise MissingDependency(
        "The 'whoosh' backend requires version 2.5.0 or greater.")
|
|
|
|
|
|
|
|
|
|
# Bubble up the correct error.
|
|
|
|
|
|
|
|
|
|
# Regular expression used to recognise date/time strings of the form
# "YYYY-MM-DDTHH:MM:SS", optionally followed by 3-6 fractional-second digits
# and a trailing "Z". The named groups capture the individual components.
# A raw string is used so that the regex escapes (\d, \.) are not parsed as
# (invalid) Python string escape sequences.
DATETIME_REGEX = re.compile(
    r'^(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})T(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})(\.\d{3,6}Z?)?$')
|
|
|
|
|
|
|
|
|
|
# Thread-local holder for the storage used by the in-memory (non-file) index.
# NOTE: attributes of a ``threading.local`` are per-thread, so the assignment
# below only initialises ``RAM_STORE`` for the thread that imports this module.
LOCALS = threading.local()
LOCALS.RAM_STORE = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class WhooshHtmlFormatter(HtmlFormatter):
    """
    Simplified HTML highlight formatter.

    This is a HtmlFormatter simpler than the whoosh.HtmlFormatter.
    We use it to have consistent results across backends. Specifically,
    Solr, Xapian and Elasticsearch are using this formatting.
    """
    # Markup template with ``tag`` and ``t`` placeholders; presumably filled in
    # by the base ``HtmlFormatter`` for each highlighted term -- confirm
    # against the Whoosh highlight module.
    template = '<%(tag)s>%(t)s</%(tag)s>'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
# Word reserved by Whoosh for special use.
|
|
|
|
|
"""
|
|
|
|
|
Whoosh搜索后端实现
|
|
|
|
|
|
|
|
|
|
继承自Haystack的BaseSearchBackend,提供Whoosh搜索引擎的核心功能。
|
|
|
|
|
支持文件存储和内存存储两种方式。
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
# Whoosh保留关键字
|
|
|
|
|
RESERVED_WORDS = (
|
|
|
|
|
'AND',
|
|
|
|
|
'NOT',
|
|
|
|
|
@ -71,15 +94,20 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
'TO',
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# Characters reserved by Whoosh for special use.
|
|
|
|
|
# The '\\' must come first, so as not to overwrite the other slash
|
|
|
|
|
# replacements.
|
|
|
|
|
# Whoosh保留字符
|
|
|
|
|
RESERVED_CHARACTERS = (
|
|
|
|
|
'\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
|
|
|
|
|
'[', ']', '^', '"', '~', '*', '?', ':', '.',
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def __init__(self, connection_alias, **connection_options):
|
|
|
|
|
"""
|
|
|
|
|
初始化Whoosh搜索后端
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
connection_alias: 连接别名
|
|
|
|
|
**connection_options: 连接配置选项
|
|
|
|
|
"""
|
|
|
|
|
super(
|
|
|
|
|
WhooshSearchBackend,
|
|
|
|
|
self).__init__(
|
|
|
|
|
@ -93,9 +121,11 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
128 * 1024 * 1024)
|
|
|
|
|
self.path = connection_options.get('PATH')
|
|
|
|
|
|
|
|
|
|
# 检查存储类型
|
|
|
|
|
if connection_options.get('STORAGE', 'file') != 'file':
|
|
|
|
|
self.use_file_storage = False
|
|
|
|
|
|
|
|
|
|
# 文件存储必须指定路径
|
|
|
|
|
if self.use_file_storage and not self.path:
|
|
|
|
|
raise ImproperlyConfigured(
|
|
|
|
|
"You must specify a 'PATH' in your settings for connection '%s'." %
|
|
|
|
|
@ -105,21 +135,26 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
|
|
|
|
|
def setup(self):
|
|
|
|
|
"""
|
|
|
|
|
Defers loading until needed.
|
|
|
|
|
初始化设置
|
|
|
|
|
|
|
|
|
|
延迟加载,在需要时进行初始化。
|
|
|
|
|
创建或打开索引,构建schema。
|
|
|
|
|
"""
|
|
|
|
|
from haystack import connections
|
|
|
|
|
new_index = False
|
|
|
|
|
|
|
|
|
|
# Make sure the index is there.
|
|
|
|
|
# 确保索引目录存在
|
|
|
|
|
if self.use_file_storage and not os.path.exists(self.path):
|
|
|
|
|
os.makedirs(self.path)
|
|
|
|
|
new_index = True
|
|
|
|
|
|
|
|
|
|
# 检查目录写入权限
|
|
|
|
|
if self.use_file_storage and not os.access(self.path, os.W_OK):
|
|
|
|
|
raise IOError(
|
|
|
|
|
"The path to your Whoosh index '%s' is not writable for the current user/group." %
|
|
|
|
|
self.path)
|
|
|
|
|
|
|
|
|
|
# 初始化存储
|
|
|
|
|
if self.use_file_storage:
|
|
|
|
|
self.storage = FileStorage(self.path)
|
|
|
|
|
else:
|
|
|
|
|
@ -130,10 +165,12 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
|
|
|
|
|
self.storage = LOCALS.RAM_STORE
|
|
|
|
|
|
|
|
|
|
# 构建schema和解析器
|
|
|
|
|
self.content_field_name, self.schema = self.build_schema(
|
|
|
|
|
connections[self.connection_alias].get_unified_index().all_searchfields())
|
|
|
|
|
self.parser = QueryParser(self.content_field_name, schema=self.schema)
|
|
|
|
|
|
|
|
|
|
# 创建或打开索引
|
|
|
|
|
if new_index is True:
|
|
|
|
|
self.index = self.storage.create_index(self.schema)
|
|
|
|
|
else:
|
|
|
|
|
@ -145,18 +182,30 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
self.setup_complete = True
|
|
|
|
|
|
|
|
|
|
def build_schema(self, fields):
|
|
|
|
|
"""
|
|
|
|
|
构建Whoosh schema
|
|
|
|
|
|
|
|
|
|
根据字段定义创建Whoosh索引schema。
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
fields: 字段定义字典
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
tuple: (内容字段名, schema对象)
|
|
|
|
|
"""
|
|
|
|
|
# 基础字段
|
|
|
|
|
schema_fields = {
|
|
|
|
|
ID: WHOOSH_ID(stored=True, unique=True),
|
|
|
|
|
DJANGO_CT: WHOOSH_ID(stored=True),
|
|
|
|
|
DJANGO_ID: WHOOSH_ID(stored=True),
|
|
|
|
|
}
|
|
|
|
|
# Grab the number of keys that are hard-coded into Haystack.
|
|
|
|
|
# We'll use this to (possibly) fail slightly more gracefully later.
|
|
|
|
|
initial_key_count = len(schema_fields)
|
|
|
|
|
content_field_name = ''
|
|
|
|
|
|
|
|
|
|
# 处理每个字段
|
|
|
|
|
for field_name, field_class in fields.items():
|
|
|
|
|
if field_class.is_multivalued:
|
|
|
|
|
# 多值字段
|
|
|
|
|
if field_class.indexed is False:
|
|
|
|
|
schema_fields[field_class.index_fieldname] = IDLIST(
|
|
|
|
|
stored=True, field_boost=field_class.boost)
|
|
|
|
|
@ -164,35 +213,42 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
schema_fields[field_class.index_fieldname] = KEYWORD(
|
|
|
|
|
stored=True, commas=True, scorable=True, field_boost=field_class.boost)
|
|
|
|
|
elif field_class.field_type in ['date', 'datetime']:
|
|
|
|
|
# 日期时间字段
|
|
|
|
|
schema_fields[field_class.index_fieldname] = DATETIME(
|
|
|
|
|
stored=field_class.stored, sortable=True)
|
|
|
|
|
elif field_class.field_type == 'integer':
|
|
|
|
|
# 整数字段
|
|
|
|
|
schema_fields[field_class.index_fieldname] = NUMERIC(
|
|
|
|
|
stored=field_class.stored, numtype=int, field_boost=field_class.boost)
|
|
|
|
|
elif field_class.field_type == 'float':
|
|
|
|
|
# 浮点数字段
|
|
|
|
|
schema_fields[field_class.index_fieldname] = NUMERIC(
|
|
|
|
|
stored=field_class.stored, numtype=float, field_boost=field_class.boost)
|
|
|
|
|
elif field_class.field_type == 'boolean':
|
|
|
|
|
# Field boost isn't supported on BOOLEAN as of 1.8.2.
|
|
|
|
|
# 布尔字段
|
|
|
|
|
schema_fields[field_class.index_fieldname] = BOOLEAN(
|
|
|
|
|
stored=field_class.stored)
|
|
|
|
|
elif field_class.field_type == 'ngram':
|
|
|
|
|
# N-gram字段
|
|
|
|
|
schema_fields[field_class.index_fieldname] = NGRAM(
|
|
|
|
|
minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost)
|
|
|
|
|
elif field_class.field_type == 'edge_ngram':
|
|
|
|
|
# 边缘N-gram字段
|
|
|
|
|
schema_fields[field_class.index_fieldname] = NGRAMWORDS(minsize=2, maxsize=15, at='start',
|
|
|
|
|
stored=field_class.stored,
|
|
|
|
|
field_boost=field_class.boost)
|
|
|
|
|
else:
|
|
|
|
|
# 文本字段 - 使用中文分析器
|
|
|
|
|
# schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost, sortable=True)
|
|
|
|
|
schema_fields[field_class.index_fieldname] = TEXT(
|
|
|
|
|
stored=True, analyzer=ChineseAnalyzer(), field_boost=field_class.boost, sortable=True)
|
|
|
|
|
|
|
|
|
|
# 标记内容字段
|
|
|
|
|
if field_class.document is True:
|
|
|
|
|
content_field_name = field_class.index_fieldname
|
|
|
|
|
schema_fields[field_class.index_fieldname].spelling = True
|
|
|
|
|
|
|
|
|
|
# Fail more gracefully than relying on the backend to die if no fields
|
|
|
|
|
# are found.
|
|
|
|
|
# 检查是否有有效字段
|
|
|
|
|
if len(schema_fields) <= initial_key_count:
|
|
|
|
|
raise SearchBackendError(
|
|
|
|
|
"No fields were found in any search_indexes. Please correct this before attempting to search.")
|
|
|
|
|
@ -200,6 +256,14 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
return (content_field_name, Schema(**schema_fields))
|
|
|
|
|
|
|
|
|
|
def update(self, index, iterable, commit=True):
|
|
|
|
|
"""
|
|
|
|
|
更新索引
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
index: 搜索索引
|
|
|
|
|
iterable: 可迭代对象
|
|
|
|
|
commit: 是否提交更改
|
|
|
|
|
"""
|
|
|
|
|
if not self.setup_complete:
|
|
|
|
|
self.setup()
|
|
|
|
|
|
|
|
|
|
@ -212,12 +276,11 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
except SkipDocument:
|
|
|
|
|
self.log.debug(u"Indexing for object `%s` skipped", obj)
|
|
|
|
|
else:
|
|
|
|
|
# Really make sure it's unicode, because Whoosh won't have it any
|
|
|
|
|
# other way.
|
|
|
|
|
# 确保所有值为unicode
|
|
|
|
|
for key in doc:
|
|
|
|
|
doc[key] = self._from_python(doc[key])
|
|
|
|
|
|
|
|
|
|
# Document boosts aren't supported in Whoosh 2.5.0+.
|
|
|
|
|
# Whoosh 2.5.0+不支持文档boost
|
|
|
|
|
if 'boost' in doc:
|
|
|
|
|
del doc['boost']
|
|
|
|
|
|
|
|
|
|
@ -227,9 +290,6 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
if not self.silently_fail:
|
|
|
|
|
raise
|
|
|
|
|
|
|
|
|
|
# We'll log the object identifier but won't include the actual object
|
|
|
|
|
# to avoid the possibility of that generating encoding errors while
|
|
|
|
|
# processing the log message:
|
|
|
|
|
self.log.error(
|
|
|
|
|
u"%s while preparing object for update" %
|
|
|
|
|
e.__class__.__name__,
|
|
|
|
|
@ -239,12 +299,18 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
"index": index,
|
|
|
|
|
"object": get_identifier(obj)}})
|
|
|
|
|
|
|
|
|
|
# 提交更改
|
|
|
|
|
if len(iterable) > 0:
|
|
|
|
|
# For now, commit no matter what, as we run into locking issues
|
|
|
|
|
# otherwise.
|
|
|
|
|
writer.commit()
|
|
|
|
|
|
|
|
|
|
def remove(self, obj_or_string, commit=True):
|
|
|
|
|
"""
|
|
|
|
|
移除文档
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
obj_or_string: 对象或标识符
|
|
|
|
|
commit: 是否提交更改
|
|
|
|
|
"""
|
|
|
|
|
if not self.setup_complete:
|
|
|
|
|
self.setup()
|
|
|
|
|
|
|
|
|
|
@ -267,6 +333,13 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
exc_info=True)
|
|
|
|
|
|
|
|
|
|
def clear(self, models=None, commit=True):
|
|
|
|
|
"""
|
|
|
|
|
清空索引
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
models: 要清空的模型列表
|
|
|
|
|
commit: 是否提交更改
|
|
|
|
|
"""
|
|
|
|
|
if not self.setup_complete:
|
|
|
|
|
self.setup()
|
|
|
|
|
|
|
|
|
|
@ -304,17 +377,27 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
"Failed to clear Whoosh index: %s", e, exc_info=True)
|
|
|
|
|
|
|
|
|
|
def delete_index(self):
    """
    Delete the whole index and recreate it.

    When file storage is in use and ``self.path`` exists on disk, the index
    directory is removed outright; when file storage is not in use,
    ``self.storage.clean()`` is called instead. In every case ``self.setup()``
    is then invoked to recreate everything.
    """
    # Per the Whoosh mailing list, if wiping out everything from the index,
    # it's much more efficient to simply delete the index files.
    if self.use_file_storage and os.path.exists(self.path):
        shutil.rmtree(self.path)
    elif not self.use_file_storage:
        self.storage.clean()

    # Recreate everything.
    self.setup()
|
|
|
|
|
|
|
|
|
|
def optimize(self):
|
|
|
|
|
"""
|
|
|
|
|
优化索引
|
|
|
|
|
|
|
|
|
|
提高搜索性能。
|
|
|
|
|
"""
|
|
|
|
|
if not self.setup_complete:
|
|
|
|
|
self.setup()
|
|
|
|
|
|
|
|
|
|
@ -322,12 +405,21 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
self.index.optimize()
|
|
|
|
|
|
|
|
|
|
def calculate_page(self, start_offset=0, end_offset=None):
|
|
|
|
|
# Prevent against Whoosh throwing an error. Requires an end_offset
|
|
|
|
|
# greater than 0.
|
|
|
|
|
"""
|
|
|
|
|
计算分页参数
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
start_offset: 起始偏移量
|
|
|
|
|
end_offset: 结束偏移量
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
tuple: (页码, 页大小)
|
|
|
|
|
"""
|
|
|
|
|
# 防止Whoosh错误
|
|
|
|
|
if end_offset is not None and end_offset <= 0:
|
|
|
|
|
end_offset = 1
|
|
|
|
|
|
|
|
|
|
# Determine the page.
|
|
|
|
|
# 确定页码
|
|
|
|
|
page_num = 0
|
|
|
|
|
|
|
|
|
|
if end_offset is None:
|
|
|
|
|
@ -341,7 +433,7 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
if page_length and page_length > 0:
|
|
|
|
|
page_num = int(start_offset / page_length)
|
|
|
|
|
|
|
|
|
|
# Increment because Whoosh uses 1-based page numbers.
|
|
|
|
|
# Whoosh使用1-based页码
|
|
|
|
|
page_num += 1
|
|
|
|
|
return page_num, page_length
|
|
|
|
|
|
|
|
|
|
@ -366,10 +458,15 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
limit_to_registered_models=None,
|
|
|
|
|
result_class=None,
|
|
|
|
|
**kwargs):
|
|
|
|
|
"""
|
|
|
|
|
执行搜索查询
|
|
|
|
|
|
|
|
|
|
核心搜索方法,处理各种搜索参数和选项。
|
|
|
|
|
"""
|
|
|
|
|
if not self.setup_complete:
|
|
|
|
|
self.setup()
|
|
|
|
|
|
|
|
|
|
# A zero length query should return no results.
|
|
|
|
|
# 空查询返回无结果
|
|
|
|
|
if len(query_string) == 0:
|
|
|
|
|
return {
|
|
|
|
|
'results': [],
|
|
|
|
|
@ -378,8 +475,7 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
|
|
|
|
|
query_string = force_str(query_string)
|
|
|
|
|
|
|
|
|
|
# A one-character query (non-wildcard) gets nabbed by a stopwords
|
|
|
|
|
# filter and should yield zero results.
|
|
|
|
|
# 单字符查询(非通配符)返回无结果
|
|
|
|
|
if len(query_string) <= 1 and query_string != u'*':
|
|
|
|
|
return {
|
|
|
|
|
'results': [],
|
|
|
|
|
@ -388,10 +484,8 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
|
|
|
|
|
reverse = False
|
|
|
|
|
|
|
|
|
|
# 处理排序
|
|
|
|
|
if sort_by is not None:
|
|
|
|
|
# Determine if we need to reverse the results and if Whoosh can
|
|
|
|
|
# handle what it's being asked to sort by. Reversing is an
|
|
|
|
|
# all-or-nothing action, unfortunately.
|
|
|
|
|
sort_by_list = []
|
|
|
|
|
reverse_counter = 0
|
|
|
|
|
|
|
|
|
|
@ -399,6 +493,7 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
if order_by.startswith('-'):
|
|
|
|
|
reverse_counter += 1
|
|
|
|
|
|
|
|
|
|
# Whoosh要求所有排序字段方向一致
|
|
|
|
|
if reverse_counter and reverse_counter != len(sort_by):
|
|
|
|
|
raise SearchBackendError("Whoosh requires all order_by fields"
|
|
|
|
|
" to use the same sort direction")
|
|
|
|
|
@ -406,17 +501,16 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
for order_by in sort_by:
|
|
|
|
|
if order_by.startswith('-'):
|
|
|
|
|
sort_by_list.append(order_by[1:])
|
|
|
|
|
|
|
|
|
|
if len(sort_by_list) == 1:
|
|
|
|
|
reverse = True
|
|
|
|
|
else:
|
|
|
|
|
sort_by_list.append(order_by)
|
|
|
|
|
|
|
|
|
|
if len(sort_by_list) == 1:
|
|
|
|
|
reverse = False
|
|
|
|
|
|
|
|
|
|
sort_by = sort_by_list[0]
|
|
|
|
|
|
|
|
|
|
# Whoosh不支持facet功能
|
|
|
|
|
if facets is not None:
|
|
|
|
|
warnings.warn(
|
|
|
|
|
"Whoosh does not handle faceting.",
|
|
|
|
|
@ -438,6 +532,7 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
narrowed_results = None
|
|
|
|
|
self.index = self.index.refresh()
|
|
|
|
|
|
|
|
|
|
# 模型限制处理
|
|
|
|
|
if limit_to_registered_models is None:
|
|
|
|
|
limit_to_registered_models = getattr(
|
|
|
|
|
settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)
|
|
|
|
|
@ -445,12 +540,11 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
if models and len(models):
|
|
|
|
|
model_choices = sorted(get_model_ct(model) for model in models)
|
|
|
|
|
elif limit_to_registered_models:
|
|
|
|
|
# Using narrow queries, limit the results to only models handled
|
|
|
|
|
# with the current routers.
|
|
|
|
|
model_choices = self.build_models_list()
|
|
|
|
|
else:
|
|
|
|
|
model_choices = []
|
|
|
|
|
|
|
|
|
|
# 构建窄查询
|
|
|
|
|
if len(model_choices) > 0:
|
|
|
|
|
if narrow_queries is None:
|
|
|
|
|
narrow_queries = set()
|
|
|
|
|
@ -460,9 +554,8 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
|
|
|
|
|
narrow_searcher = None
|
|
|
|
|
|
|
|
|
|
# 处理窄查询
|
|
|
|
|
if narrow_queries is not None:
|
|
|
|
|
# Potentially expensive? I don't see another way to do it in
|
|
|
|
|
# Whoosh...
|
|
|
|
|
narrow_searcher = self.index.searcher()
|
|
|
|
|
|
|
|
|
|
for nq in narrow_queries:
|
|
|
|
|
@ -482,11 +575,12 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
|
|
|
|
|
self.index = self.index.refresh()
|
|
|
|
|
|
|
|
|
|
# 执行搜索
|
|
|
|
|
if self.index.doc_count():
|
|
|
|
|
searcher = self.index.searcher()
|
|
|
|
|
parsed_query = self.parser.parse(query_string)
|
|
|
|
|
|
|
|
|
|
# In the event of an invalid/stopworded query, recover gracefully.
|
|
|
|
|
# 处理无效查询
|
|
|
|
|
if parsed_query is None:
|
|
|
|
|
return {
|
|
|
|
|
'results': [],
|
|
|
|
|
@ -502,7 +596,7 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
'reverse': reverse,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
# Handle the case where the results have been narrowed.
|
|
|
|
|
# 应用窄查询过滤
|
|
|
|
|
if narrowed_results is not None:
|
|
|
|
|
search_kwargs['filter'] = narrowed_results
|
|
|
|
|
|
|
|
|
|
@ -522,8 +616,7 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
'spelling_suggestion': None,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
# Because as of Whoosh 2.5.1, it will return the wrong page of
|
|
|
|
|
# results if you request something too high. :(
|
|
|
|
|
# 检查页码有效性
|
|
|
|
|
if raw_page.pagenum < page_num:
|
|
|
|
|
return {
|
|
|
|
|
'results': [],
|
|
|
|
|
@ -531,6 +624,7 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
'spelling_suggestion': None,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
# 处理搜索结果
|
|
|
|
|
results = self._process_results(
|
|
|
|
|
raw_page,
|
|
|
|
|
highlight=highlight,
|
|
|
|
|
@ -544,6 +638,7 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
|
|
|
|
|
return results
|
|
|
|
|
else:
|
|
|
|
|
# 无文档时的处理
|
|
|
|
|
if self.include_spelling:
|
|
|
|
|
if spelling_query:
|
|
|
|
|
spelling_suggestion = self.create_spelling_suggestion(
|
|
|
|
|
@ -570,18 +665,21 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
limit_to_registered_models=None,
|
|
|
|
|
result_class=None,
|
|
|
|
|
**kwargs):
|
|
|
|
|
"""
|
|
|
|
|
查找相似文档
|
|
|
|
|
|
|
|
|
|
基于给定模型实例查找相似内容。
|
|
|
|
|
"""
|
|
|
|
|
if not self.setup_complete:
|
|
|
|
|
self.setup()
|
|
|
|
|
|
|
|
|
|
# Deferred models will have a different class ("RealClass_Deferred_fieldname")
|
|
|
|
|
# which won't be in our registry:
|
|
|
|
|
model_klass = model_instance._meta.concrete_model
|
|
|
|
|
|
|
|
|
|
field_name = self.content_field_name
|
|
|
|
|
narrow_queries = set()
|
|
|
|
|
narrowed_results = None
|
|
|
|
|
self.index = self.index.refresh()
|
|
|
|
|
|
|
|
|
|
# 模型限制处理
|
|
|
|
|
if limit_to_registered_models is None:
|
|
|
|
|
limit_to_registered_models = getattr(
|
|
|
|
|
settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)
|
|
|
|
|
@ -589,12 +687,11 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
if models and len(models):
|
|
|
|
|
model_choices = sorted(get_model_ct(model) for model in models)
|
|
|
|
|
elif limit_to_registered_models:
|
|
|
|
|
# Using narrow queries, limit the results to only models handled
|
|
|
|
|
# with the current routers.
|
|
|
|
|
model_choices = self.build_models_list()
|
|
|
|
|
else:
|
|
|
|
|
model_choices = []
|
|
|
|
|
|
|
|
|
|
# 构建查询
|
|
|
|
|
if len(model_choices) > 0:
|
|
|
|
|
if narrow_queries is None:
|
|
|
|
|
narrow_queries = set()
|
|
|
|
|
@ -607,9 +704,8 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
|
|
|
|
|
narrow_searcher = None
|
|
|
|
|
|
|
|
|
|
# 处理窄查询
|
|
|
|
|
if narrow_queries is not None:
|
|
|
|
|
# Potentially expensive? I don't see another way to do it in
|
|
|
|
|
# Whoosh...
|
|
|
|
|
narrow_searcher = self.index.searcher()
|
|
|
|
|
|
|
|
|
|
for nq in narrow_queries:
|
|
|
|
|
@ -632,6 +728,7 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
self.index = self.index.refresh()
|
|
|
|
|
raw_results = EmptyResults()
|
|
|
|
|
|
|
|
|
|
# 执行相似文档搜索
|
|
|
|
|
if self.index.doc_count():
|
|
|
|
|
query = "%s:%s" % (ID, get_identifier(model_instance))
|
|
|
|
|
searcher = self.index.searcher()
|
|
|
|
|
@ -642,7 +739,7 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
raw_results = results[0].more_like_this(
|
|
|
|
|
field_name, top=end_offset)
|
|
|
|
|
|
|
|
|
|
# Handle the case where the results have been narrowed.
|
|
|
|
|
# 应用窄查询过滤
|
|
|
|
|
if narrowed_results is not None and hasattr(raw_results, 'filter'):
|
|
|
|
|
raw_results.filter(narrowed_results)
|
|
|
|
|
|
|
|
|
|
@ -658,8 +755,7 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
'spelling_suggestion': None,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
# Because as of Whoosh 2.5.1, it will return the wrong page of
|
|
|
|
|
# results if you request something too high. :(
|
|
|
|
|
# 检查页码有效性
|
|
|
|
|
if raw_page.pagenum < page_num:
|
|
|
|
|
return {
|
|
|
|
|
'results': [],
|
|
|
|
|
@ -667,6 +763,7 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
'spelling_suggestion': None,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
# 处理结果
|
|
|
|
|
results = self._process_results(raw_page, result_class=result_class)
|
|
|
|
|
searcher.close()
|
|
|
|
|
|
|
|
|
|
@ -682,11 +779,15 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
query_string='',
|
|
|
|
|
spelling_query=None,
|
|
|
|
|
result_class=None):
|
|
|
|
|
"""
|
|
|
|
|
处理搜索结果
|
|
|
|
|
|
|
|
|
|
将Whoosh原始结果转换为Haystack格式。
|
|
|
|
|
"""
|
|
|
|
|
from haystack import connections
|
|
|
|
|
results = []
|
|
|
|
|
|
|
|
|
|
# It's important to grab the hits first before slicing. Otherwise, this
|
|
|
|
|
# can cause pagination failures.
|
|
|
|
|
# 获取命中数
|
|
|
|
|
hits = len(raw_page)
|
|
|
|
|
|
|
|
|
|
if result_class is None:
|
|
|
|
|
@ -697,6 +798,7 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
unified_index = connections[self.connection_alias].get_unified_index()
|
|
|
|
|
indexed_models = unified_index.get_indexed_models()
|
|
|
|
|
|
|
|
|
|
# 处理每个结果
|
|
|
|
|
for doc_offset, raw_result in enumerate(raw_page):
|
|
|
|
|
score = raw_page.score(doc_offset) or 0
|
|
|
|
|
app_label, model_name = raw_result[DJANGO_CT].split('.')
|
|
|
|
|
@ -704,13 +806,14 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
model = haystack_get_model(app_label, model_name)
|
|
|
|
|
|
|
|
|
|
if model and model in indexed_models:
|
|
|
|
|
# 处理字段值
|
|
|
|
|
for key, value in raw_result.items():
|
|
|
|
|
index = unified_index.get_index(model)
|
|
|
|
|
string_key = str(key)
|
|
|
|
|
|
|
|
|
|
if string_key in index.fields and hasattr(
|
|
|
|
|
index.fields[string_key], 'convert'):
|
|
|
|
|
# Special-cased due to the nature of KEYWORD fields.
|
|
|
|
|
# 多值字段特殊处理
|
|
|
|
|
if index.fields[string_key].is_multivalued:
|
|
|
|
|
if value is None or len(value) == 0:
|
|
|
|
|
additional_fields[string_key] = []
|
|
|
|
|
@ -723,9 +826,11 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
else:
|
|
|
|
|
additional_fields[string_key] = self._to_python(value)
|
|
|
|
|
|
|
|
|
|
# 移除系统字段
|
|
|
|
|
del (additional_fields[DJANGO_CT])
|
|
|
|
|
del (additional_fields[DJANGO_ID])
|
|
|
|
|
|
|
|
|
|
# 高亮处理
|
|
|
|
|
if highlight:
|
|
|
|
|
sa = StemmingAnalyzer()
|
|
|
|
|
formatter = WhooshHtmlFormatter('em')
|
|
|
|
|
@ -742,6 +847,7 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
self.content_field_name: [whoosh_result],
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
# 创建结果对象
|
|
|
|
|
result = result_class(
|
|
|
|
|
app_label,
|
|
|
|
|
model_name,
|
|
|
|
|
@ -752,6 +858,7 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
else:
|
|
|
|
|
hits -= 1
|
|
|
|
|
|
|
|
|
|
# 拼写建议
|
|
|
|
|
if self.include_spelling:
|
|
|
|
|
if spelling_query:
|
|
|
|
|
spelling_suggestion = self.create_spelling_suggestion(
|
|
|
|
|
@ -768,6 +875,15 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
def create_spelling_suggestion(self, query_string):
|
|
|
|
|
"""
|
|
|
|
|
创建拼写建议
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
query_string: 查询字符串
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
str: 拼写建议
|
|
|
|
|
"""
|
|
|
|
|
spelling_suggestion = None
|
|
|
|
|
reader = self.index.reader()
|
|
|
|
|
corrector = reader.corrector(self.content_field_name)
|
|
|
|
|
@ -776,14 +892,14 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
if not query_string:
|
|
|
|
|
return spelling_suggestion
|
|
|
|
|
|
|
|
|
|
# Clean the string.
|
|
|
|
|
# 清理查询字符串
|
|
|
|
|
for rev_word in self.RESERVED_WORDS:
|
|
|
|
|
cleaned_query = cleaned_query.replace(rev_word, '')
|
|
|
|
|
|
|
|
|
|
for rev_char in self.RESERVED_CHARACTERS:
|
|
|
|
|
cleaned_query = cleaned_query.replace(rev_char, '')
|
|
|
|
|
|
|
|
|
|
# Break it down.
|
|
|
|
|
# 分词并获取建议
|
|
|
|
|
query_words = cleaned_query.split()
|
|
|
|
|
suggested_words = []
|
|
|
|
|
|
|
|
|
|
@ -798,22 +914,29 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
|
|
|
|
|
def _from_python(self, value):
|
|
|
|
|
"""
|
|
|
|
|
Converts Python values to a string for Whoosh.
|
|
|
|
|
Python值转换为Whoosh字符串
|
|
|
|
|
|
|
|
|
|
Code courtesy of pysolr.
|
|
|
|
|
Args:
|
|
|
|
|
value: Python值
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
str: Whoosh格式字符串
|
|
|
|
|
"""
|
|
|
|
|
if hasattr(value, 'strftime'):
|
|
|
|
|
# 日期时间处理
|
|
|
|
|
if not hasattr(value, 'hour'):
|
|
|
|
|
value = datetime(value.year, value.month, value.day, 0, 0, 0)
|
|
|
|
|
elif isinstance(value, bool):
|
|
|
|
|
# 布尔值处理
|
|
|
|
|
if value:
|
|
|
|
|
value = 'true'
|
|
|
|
|
else:
|
|
|
|
|
value = 'false'
|
|
|
|
|
elif isinstance(value, (list, tuple)):
|
|
|
|
|
# 列表元组处理
|
|
|
|
|
value = u','.join([force_str(v) for v in value])
|
|
|
|
|
elif isinstance(value, (six.integer_types, float)):
|
|
|
|
|
# Leave it alone.
|
|
|
|
|
# 数字类型保持原样
|
|
|
|
|
pass
|
|
|
|
|
else:
|
|
|
|
|
value = force_str(value)
|
|
|
|
|
@ -821,15 +944,20 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
|
|
|
|
|
def _to_python(self, value):
|
|
|
|
|
"""
|
|
|
|
|
Converts values from Whoosh to native Python values.
|
|
|
|
|
Whoosh值转换为Python值
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
value: Whoosh值
|
|
|
|
|
|
|
|
|
|
A port of the same method in pysolr, as they deal with data the same way.
|
|
|
|
|
Returns:
|
|
|
|
|
object: Python值
|
|
|
|
|
"""
|
|
|
|
|
if value == 'true':
|
|
|
|
|
return True
|
|
|
|
|
elif value == 'false':
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
# 日期时间解析
|
|
|
|
|
if value and isinstance(value, six.string_types):
|
|
|
|
|
possible_datetime = DATETIME_REGEX.search(value)
|
|
|
|
|
|
|
|
|
|
@ -847,11 +975,10 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
date_values['minute'],
|
|
|
|
|
date_values['second'])
|
|
|
|
|
|
|
|
|
|
# JSON解析尝试
|
|
|
|
|
try:
|
|
|
|
|
# Attempt to use json to load the values.
|
|
|
|
|
converted_value = json.loads(value)
|
|
|
|
|
|
|
|
|
|
# Try to handle most built-in types.
|
|
|
|
|
if isinstance(
|
|
|
|
|
converted_value,
|
|
|
|
|
(list,
|
|
|
|
|
@ -863,15 +990,28 @@ class WhooshSearchBackend(BaseSearchBackend):
|
|
|
|
|
complex)):
|
|
|
|
|
return converted_value
|
|
|
|
|
except BaseException:
|
|
|
|
|
# If it fails (SyntaxError or its ilk) or we don't trust it,
|
|
|
|
|
# continue on.
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
return value
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class WhooshSearchQuery(BaseSearchQuery):
|
|
|
|
|
"""
|
|
|
|
|
Whoosh搜索查询构建器
|
|
|
|
|
|
|
|
|
|
负责构建Whoosh搜索引擎的查询语句。
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
def _convert_datetime(self, date):
|
|
|
|
|
"""
|
|
|
|
|
日期时间转换
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
date: 日期时间对象
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
str: 格式化字符串
|
|
|
|
|
"""
|
|
|
|
|
if hasattr(date, 'hour'):
|
|
|
|
|
return force_str(date.strftime('%Y%m%d%H%M%S'))
|
|
|
|
|
else:
|
|
|
|
|
@ -879,20 +1019,25 @@ class WhooshSearchQuery(BaseSearchQuery):
|
|
|
|
|
|
|
|
|
|
def clean(self, query_fragment):
|
|
|
|
|
"""
|
|
|
|
|
Provides a mechanism for sanitizing user input before presenting the
|
|
|
|
|
value to the backend.
|
|
|
|
|
清理查询片段
|
|
|
|
|
|
|
|
|
|
对用户输入进行清理和转义处理。
|
|
|
|
|
|
|
|
|
|
Whoosh 1.X differs here in that you can no longer use a backslash
|
|
|
|
|
to escape reserved characters. Instead, the whole word should be
|
|
|
|
|
quoted.
|
|
|
|
|
Args:
|
|
|
|
|
query_fragment: 查询片段
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
str: 清理后的查询字符串
|
|
|
|
|
"""
|
|
|
|
|
words = query_fragment.split()
|
|
|
|
|
cleaned_words = []
|
|
|
|
|
|
|
|
|
|
for word in words:
|
|
|
|
|
# 保留字转为小写
|
|
|
|
|
if word in self.backend.RESERVED_WORDS:
|
|
|
|
|
word = word.replace(word, word.lower())
|
|
|
|
|
|
|
|
|
|
# 保留字符用引号包围
|
|
|
|
|
for char in self.backend.RESERVED_CHARACTERS:
|
|
|
|
|
if char in word:
|
|
|
|
|
word = "'%s'" % word
|
|
|
|
|
@ -903,12 +1048,23 @@ class WhooshSearchQuery(BaseSearchQuery):
|
|
|
|
|
return ' '.join(cleaned_words)
|
|
|
|
|
|
|
|
|
|
def build_query_fragment(self, field, filter_type, value):
|
|
|
|
|
"""
|
|
|
|
|
构建查询片段
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
field: 字段名
|
|
|
|
|
filter_type: 过滤器类型
|
|
|
|
|
value: 字段值
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
str: 查询片段
|
|
|
|
|
"""
|
|
|
|
|
from haystack import connections
|
|
|
|
|
query_frag = ''
|
|
|
|
|
is_datetime = False
|
|
|
|
|
|
|
|
|
|
# 值类型处理
|
|
|
|
|
if not hasattr(value, 'input_type_name'):
|
|
|
|
|
# Handle when we've got a ``ValuesListQuerySet``...
|
|
|
|
|
if hasattr(value, 'values_list'):
|
|
|
|
|
value = list(value)
|
|
|
|
|
|
|
|
|
|
@ -916,26 +1072,24 @@ class WhooshSearchQuery(BaseSearchQuery):
|
|
|
|
|
is_datetime = True
|
|
|
|
|
|
|
|
|
|
if isinstance(value, six.string_types) and value != ' ':
|
|
|
|
|
# It's not an ``InputType``. Assume ``Clean``.
|
|
|
|
|
value = Clean(value)
|
|
|
|
|
else:
|
|
|
|
|
value = PythonData(value)
|
|
|
|
|
|
|
|
|
|
# Prepare the query using the InputType.
|
|
|
|
|
# 准备值
|
|
|
|
|
prepared_value = value.prepare(self)
|
|
|
|
|
|
|
|
|
|
if not isinstance(prepared_value, (set, list, tuple)):
|
|
|
|
|
# Then convert whatever we get back to what pysolr wants if needed.
|
|
|
|
|
prepared_value = self.backend._from_python(prepared_value)
|
|
|
|
|
|
|
|
|
|
# 'content' is a special reserved word, much like 'pk' in
|
|
|
|
|
# Django's ORM layer. It indicates 'no special field'.
|
|
|
|
|
# 字段名处理
|
|
|
|
|
if field == 'content':
|
|
|
|
|
index_fieldname = ''
|
|
|
|
|
else:
|
|
|
|
|
index_fieldname = u'%s:' % connections[self._using].get_unified_index(
|
|
|
|
|
).get_index_fieldname(field)
|
|
|
|
|
|
|
|
|
|
# 过滤器类型映射
|
|
|
|
|
filter_types = {
|
|
|
|
|
'content': '%s',
|
|
|
|
|
'contains': '*%s*',
|
|
|
|
|
@ -949,6 +1103,7 @@ class WhooshSearchQuery(BaseSearchQuery):
|
|
|
|
|
'fuzzy': u'%s~',
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
# 查询片段构建
|
|
|
|
|
if value.post_process is False:
|
|
|
|
|
query_frag = prepared_value
|
|
|
|
|
else:
|
|
|
|
|
@ -961,8 +1116,6 @@ class WhooshSearchQuery(BaseSearchQuery):
|
|
|
|
|
if value.input_type_name == 'exact':
|
|
|
|
|
query_frag = prepared_value
|
|
|
|
|
else:
|
|
|
|
|
# Iterate over terms & incorportate the converted form of
|
|
|
|
|
# each into the query.
|
|
|
|
|
terms = []
|
|
|
|
|
|
|
|
|
|
if isinstance(prepared_value, six.string_types):
|
|
|
|
|
@ -1026,19 +1179,19 @@ class WhooshSearchQuery(BaseSearchQuery):
|
|
|
|
|
|
|
|
|
|
query_frag = filter_types[filter_type] % prepared_value
|
|
|
|
|
|
|
|
|
|
# 添加括号
|
|
|
|
|
if len(query_frag) and not isinstance(value, Raw):
|
|
|
|
|
if not query_frag.startswith('(') and not query_frag.endswith(')'):
|
|
|
|
|
query_frag = "(%s)" % query_frag
|
|
|
|
|
|
|
|
|
|
return u"%s%s" % (index_fieldname, query_frag)
|
|
|
|
|
|
|
|
|
|
# if not filter_type in ('in', 'range'):
|
|
|
|
|
# # 'in' is a bit of a special case, as we don't want to
|
|
|
|
|
# # convert a valid list/tuple to string. Defer handling it
|
|
|
|
|
# # until later...
|
|
|
|
|
# value = self.backend._from_python(value)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class WhooshEngine(BaseEngine):
    """
    Haystack engine definition for the Whoosh backend.

    Declares which backend class and which query class this engine uses.
    """
    # Search backend implementation used by this engine.
    backend = WhooshSearchBackend
    # Query builder class paired with the backend above.
    query = WhooshSearchQuery
|