@ -1,7 +1,9 @@
# encoding: utf-8
# encoding: utf-8
# 导入Python 2/3兼容性支持
from __future__ import absolute_import , division , print_function , unicode_literals
from __future__ import absolute_import , division , print_function , unicode_literals
# 导入标准库模块
import json
import json
import os
import os
import re
import re
@ -9,20 +11,35 @@ import shutil
import threading
import threading
import warnings
import warnings
# 导入Python 2/3兼容性库
import six
import six
# 导入Django配置模块
from django . conf import settings
from django . conf import settings
# 导入Django配置异常
from django . core . exceptions import ImproperlyConfigured
from django . core . exceptions import ImproperlyConfigured
# 导入日期时间模块
from datetime import datetime
from datetime import datetime
# 导入字符串强制转换工具
from django . utils . encoding import force_str
from django . utils . encoding import force_str
# 导入Haystack搜索引擎基类
from haystack . backends import BaseEngine , BaseSearchBackend , BaseSearchQuery , EmptyResults , log_query
from haystack . backends import BaseEngine , BaseSearchBackend , BaseSearchQuery , EmptyResults , log_query
# 导入Haystack常量
from haystack . constants import DJANGO_CT , DJANGO_ID , ID
from haystack . constants import DJANGO_CT , DJANGO_ID , ID
# 导入Haystack异常
from haystack . exceptions import MissingDependency , SearchBackendError , SkipDocument
from haystack . exceptions import MissingDependency , SearchBackendError , SkipDocument
# 导入Haystack输入类型
from haystack . inputs import Clean , Exact , PythonData , Raw
from haystack . inputs import Clean , Exact , PythonData , Raw
# 导入Haystack搜索结果模型
from haystack . models import SearchResult
from haystack . models import SearchResult
# 导入Haystack工具函数
from haystack . utils import get_identifier , get_model_ct
from haystack . utils import get_identifier , get_model_ct
# 导入Haystack日志工具
from haystack . utils import log as logging
from haystack . utils import log as logging
# 导入Haystack模型加载工具
from haystack . utils . app_loading import haystack_get_model
from haystack . utils . app_loading import haystack_get_model
# 导入结巴中文分词器
from jieba . analyse import ChineseAnalyzer
from jieba . analyse import ChineseAnalyzer
# 导入Whoosh搜索库相关模块
from whoosh import index
from whoosh import index
from whoosh . analysis import StemmingAnalyzer
from whoosh . analysis import StemmingAnalyzer
from whoosh . fields import BOOLEAN , DATETIME , IDLIST , KEYWORD , NGRAM , NGRAMWORDS , NUMERIC , Schema , TEXT
from whoosh . fields import BOOLEAN , DATETIME , IDLIST , KEYWORD , NGRAM , NGRAMWORDS , NUMERIC , Schema , TEXT
@ -34,36 +51,40 @@ from whoosh.qparser import QueryParser
from whoosh . searching import ResultsPage
from whoosh . searching import ResultsPage
from whoosh . writing import AsyncWriter
from whoosh . writing import AsyncWriter
# 尝试导入Whoosh库, 如果失败则抛出缺失依赖异常
try :
try :
import whoosh
import whoosh
except ImportError :
except ImportError :
raise MissingDependency (
raise MissingDependency (
" The ' whoosh ' backend requires the installation of ' Whoosh ' . Please refer to the documentation. " )
" The ' whoosh ' backend requires the installation of ' Whoosh ' . Please refer to the documentation. " )
# Handle minimum requirement.
# 处理最低版本要求
if not hasattr ( whoosh , ' __version__ ' ) or whoosh . __version__ < ( 2 , 5 , 0 ) :
if not hasattr ( whoosh , ' __version__ ' ) or whoosh . __version__ < ( 2 , 5 , 0 ) :
raise MissingDependency (
raise MissingDependency (
" The ' whoosh ' backend requires version 2.5.0 or greater. " )
" The ' whoosh ' backend requires version 2.5.0 or greater. " )
# Matches timestamps of the form "YYYY-MM-DDTHH:MM:SS", optionally followed
# by a fractional part of 3 to 6 digits and an optional trailing "Z". The
# date/time components are exposed as named groups (year, month, day, hour,
# minute, second).
# Written as a raw string so the regex escapes (\d, \.) reach the regex
# engine untouched instead of being parsed as (invalid) string escapes.
DATETIME_REGEX = re.compile(
    r'^(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})T(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})(\.\d{3,6}Z?)?$')

# Thread-local namespace for the RAM storage handle. RAM_STORE starts out as
# None here; being a threading.local attribute, this assignment is only
# visible in the thread that imports the module.
LOCALS = threading.local()
LOCALS.RAM_STORE = None
# 自定义Whoosh HTML格式化器类
class WhooshHtmlFormatter(HtmlFormatter):
    """
    A simpler HtmlFormatter than the one shipped as whoosh.HtmlFormatter.

    It exists so highlighted output looks the same across backends:
    Solr, Xapian and Elasticsearch all use this formatting.
    """
    # Highlight markup: the matched text wrapped in a bare tag, with no
    # spaces inside the angle brackets and no attributes.
    template = '<%(tag)s>%(t)s</%(tag)s>'
# Whoosh搜索后端类, 继承自BaseSearchBackend
class WhooshSearchBackend ( BaseSearchBackend ) :
class WhooshSearchBackend ( BaseSearchBackend ) :
# Words reserved by Whoosh for special use.
RESERVED_WORDS = (
RESERVED_WORDS = (
' AND ' ,
' AND ' ,
' NOT ' ,
' NOT ' ,
@ -71,38 +92,47 @@ class WhooshSearchBackend(BaseSearchBackend):
' TO ' ,
' TO ' ,
)
)
# Characters reserved by Whoosh for special use.
# The '\\' must come first, so as not to overwrite the other slash
# replacements.
RESERVED_CHARACTERS = (
RESERVED_CHARACTERS = (
' \\ ' , ' + ' , ' - ' , ' && ' , ' || ' , ' ! ' , ' ( ' , ' ) ' , ' { ' , ' } ' ,
' \\ ' , ' + ' , ' - ' , ' && ' , ' || ' , ' ! ' , ' ( ' , ' ) ' , ' { ' , ' } ' ,
' [ ' , ' ] ' , ' ^ ' , ' " ' , ' ~ ' , ' * ' , ' ? ' , ' : ' , ' . ' ,
' [ ' , ' ] ' , ' ^ ' , ' " ' , ' ~ ' , ' * ' , ' ? ' , ' : ' , ' . ' ,
)
)
# 初始化方法
def __init__(self, connection_alias, **connection_options):
    """
    Configure the backend from one Haystack connection's settings.

    No index is touched here; ``setup_complete`` starts as False and the
    actual loading is deferred to ``setup()``.

    Options read from ``connection_options``:

    * ``PATH`` -- location of the index; mandatory when file storage is
      in use.
    * ``STORAGE`` -- ``'file'`` (the default) keeps file storage enabled;
      any other value disables it.
    * ``POST_LIMIT`` -- stored on ``self.post_limit``; defaults to
      128 * 1024 * 1024.

    :param connection_alias: name of the connection, passed to the parent
        class and used in the error message below.
    :param connection_options: the connection's settings as keyword
        arguments, also passed through to the parent class.
    :raises ImproperlyConfigured: if file storage is selected but no
        ``PATH`` was given.
    """
    super(WhooshSearchBackend, self).__init__(
        connection_alias, **connection_options)

    self.setup_complete = False
    self.use_file_storage = True

    # connection_options is a dict (collected from **kwargs), so the
    # setting must be looked up by key. The former getattr() call looked
    # for an *attribute* named POST_LIMIT, never found one, and therefore
    # always fell back to the default, ignoring the configured value.
    self.post_limit = connection_options.get(
        'POST_LIMIT', 128 * 1024 * 1024)
    self.path = connection_options.get('PATH')

    if connection_options.get('STORAGE', 'file') != 'file':
        self.use_file_storage = False

    # File storage cannot work without knowing where the index lives.
    if self.use_file_storage and not self.path:
        raise ImproperlyConfigured(
            "You must specify a 'PATH' in your settings for connection '%s'." %
            connection_alias)

    self.log = logging.getLogger('haystack')
# 设置方法,延迟加载直到需要时
def setup ( self ) :
def setup ( self ) :
"""
"""
Defers loading until needed .
Defers loading until needed .
@ -110,16 +140,18 @@ class WhooshSearchBackend(BaseSearchBackend):
from haystack import connections
from haystack import connections
new_index = False
new_index = False
# Make sure the index is there.
# 确保索引目录存在
if self . use_file_storage and not os . path . exists ( self . path ) :
if self . use_file_storage and not os . path . exists ( self . path ) :
os . makedirs ( self . path )
os . makedirs ( self . path )
new_index = True
new_index = True
# 检查索引目录是否可写
if self . use_file_storage and not os . access ( self . path , os . W_OK ) :
if self . use_file_storage and not os . access ( self . path , os . W_OK ) :
raise IOError (
raise IOError (
" The path to your Whoosh index ' %s ' is not writable for the current user/group. " %
" The path to your Whoosh index ' %s ' is not writable for the current user/group. " %
self . path )
self . path )
# 根据存储类型选择存储方式
if self . use_file_storage :
if self . use_file_storage :
self . storage = FileStorage ( self . path )
self . storage = FileStorage ( self . path )
else :
else :
@ -130,10 +162,13 @@ class WhooshSearchBackend(BaseSearchBackend):
self . storage = LOCALS . RAM_STORE
self . storage = LOCALS . RAM_STORE
# 构建schema和内容字段名
self . content_field_name , self . schema = self . build_schema (
self . content_field_name , self . schema = self . build_schema (
connections [ self . connection_alias ] . get_unified_index ( ) . all_searchfields ( ) )
connections [ self . connection_alias ] . get_unified_index ( ) . all_searchfields ( ) )
# 创建查询解析器
self . parser = QueryParser ( self . content_field_name , schema = self . schema )
self . parser = QueryParser ( self . content_field_name , schema = self . schema )
# 创建或打开索引
if new_index is True :
if new_index is True :
self . index = self . storage . create_index ( self . schema )
self . index = self . storage . create_index ( self . schema )
else :
else :
@ -142,19 +177,22 @@ class WhooshSearchBackend(BaseSearchBackend):
except index . EmptyIndexError :
except index . EmptyIndexError :
self . index = self . storage . create_index ( self . schema )
self . index = self . storage . create_index ( self . schema )
# 标记设置完成
self . setup_complete = True
self . setup_complete = True
# 构建schema的方法
def build_schema ( self , fields ) :
def build_schema ( self , fields ) :
# 基础schema字段
schema_fields = {
schema_fields = {
ID : WHOOSH_ID ( stored = True , unique = True ) ,
ID : WHOOSH_ID ( stored = True , unique = True ) ,
DJANGO_CT : WHOOSH_ID ( stored = True ) ,
DJANGO_CT : WHOOSH_ID ( stored = True ) ,
DJANGO_ID : WHOOSH_ID ( stored = True ) ,
DJANGO_ID : WHOOSH_ID ( stored = True ) ,
}
}
# Grab the number of keys that are hard-coded into Haystack.
# 获取硬编码到Haystack中的键数量
# We'll use this to (possibly) fail slightly more gracefully later.
initial_key_count = len ( schema_fields )
initial_key_count = len ( schema_fields )
content_field_name = ' '
content_field_name = ' '
# 遍历所有字段, 构建schema
for field_name , field_class in fields . items ( ) :
for field_name , field_class in fields . items ( ) :
if field_class . is_multivalued :
if field_class . is_multivalued :
if field_class . indexed is False :
if field_class . indexed is False :
@ -173,7 +211,7 @@ class WhooshSearchBackend(BaseSearchBackend):
schema_fields [ field_class . index_fieldname ] = NUMERIC (
schema_fields [ field_class . index_fieldname ] = NUMERIC (
stored = field_class . stored , numtype = float , field_boost = field_class . boost )
stored = field_class . stored , numtype = float , field_boost = field_class . boost )
elif field_class . field_type == ' boolean ' :
elif field_class . field_type == ' boolean ' :
# Field boost isn't supported on BOOLEAN as of 1.8.2.
# 在Whoosh 1.8.2中, BOOLEAN字段不支持字段提升
schema_fields [ field_class . index_fieldname ] = BOOLEAN (
schema_fields [ field_class . index_fieldname ] = BOOLEAN (
stored = field_class . stored )
stored = field_class . stored )
elif field_class . field_type == ' ngram ' :
elif field_class . field_type == ' ngram ' :
@ -184,40 +222,43 @@ class WhooshSearchBackend(BaseSearchBackend):
stored = field_class . stored ,
stored = field_class . stored ,
field_boost = field_class . boost )
field_boost = field_class . boost )
else :
else :
# 使用中文分词器处理文本字段
# schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost, sortable=True)
# schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost, sortable=True)
schema_fields [ field_class . index_fieldname ] = TEXT (
schema_fields [ field_class . index_fieldname ] = TEXT (
stored = True , analyzer = ChineseAnalyzer ( ) , field_boost = field_class . boost , sortable = True )
stored = True , analyzer = ChineseAnalyzer ( ) , field_boost = field_class . boost , sortable = True )
# 标记内容字段
if field_class . document is True :
if field_class . document is True :
content_field_name = field_class . index_fieldname
content_field_name = field_class . index_fieldname
schema_fields [ field_class . index_fieldname ] . spelling = True
schema_fields [ field_class . index_fieldname ] . spelling = True
# Fail more gracefully than relying on the backend to die if no fields
# 如果没有找到字段,优雅地失败
# are found.
if len ( schema_fields ) < = initial_key_count :
if len ( schema_fields ) < = initial_key_count :
raise SearchBackendError (
raise SearchBackendError (
" No fields were found in any search_indexes. Please correct this before attempting to search. " )
" No fields were found in any search_indexes. Please correct this before attempting to search. " )
return ( content_field_name , Schema ( * * schema_fields ) )
return ( content_field_name , Schema ( * * schema_fields ) )
# 更新索引的方法
def update ( self , index , iterable , commit = True ) :
def update ( self , index , iterable , commit = True ) :
if not self . setup_complete :
if not self . setup_complete :
self . setup ( )
self . setup ( )
self . index = self . index . refresh ( )
self . index = self . index . refresh ( )
# 使用异步写入器
writer = AsyncWriter ( self . index )
writer = AsyncWriter ( self . index )
# 遍历所有对象,准备并更新文档
for obj in iterable :
for obj in iterable :
try :
try :
doc = index . full_prepare ( obj )
doc = index . full_prepare ( obj )
except SkipDocument :
except SkipDocument :
self . log . debug ( u " Indexing for object ` %s ` skipped " , obj )
self . log . debug ( u " Indexing for object ` %s ` skipped " , obj )
else :
else :
# Really make sure it's unicode, because Whoosh won't have it any
# 确保所有值都是Unicode, 因为Whoosh只接受Unicode
# other way.
for key in doc :
for key in doc :
doc [ key ] = self . _from_python ( doc [ key ] )
doc [ key ] = self . _from_python ( doc [ key ] )
# Document boosts aren't supported in Whoosh 2.5.0+.
# Whoosh 2.5.0+不支持文档提升
if ' boost ' in doc :
if ' boost ' in doc :
del doc [ ' boost ' ]
del doc [ ' boost ' ]
@ -227,9 +268,7 @@ class WhooshSearchBackend(BaseSearchBackend):
if not self . silently_fail :
if not self . silently_fail :
raise
raise
# We'll log the object identifier but won't include the actual object
# 记录对象标识符但不包含实际对象,以避免生成编码错误
# to avoid the possibility of that generating encoding errors while
# processing the log message:
self . log . error (
self . log . error (
u " %s while preparing object for update " %
u " %s while preparing object for update " %
e . __class__ . __name__ ,
e . __class__ . __name__ ,
@ -239,11 +278,12 @@ class WhooshSearchBackend(BaseSearchBackend):
" index " : index ,
" index " : index ,
" object " : get_identifier ( obj ) } } )
" object " : get_identifier ( obj ) } } )
# 如果有对象需要处理,提交写入
if len ( iterable ) > 0 :
if len ( iterable ) > 0 :
# For now, commit no matter what, as we run into locking issues
# 暂时无论如何都提交,否则会遇到锁定问题
# otherwise.
writer . commit ( )
writer . commit ( )
# 移除文档的方法
def remove ( self , obj_or_string , commit = True ) :
def remove ( self , obj_or_string , commit = True ) :
if not self . setup_complete :
if not self . setup_complete :
self . setup ( )
self . setup ( )
@ -252,6 +292,7 @@ class WhooshSearchBackend(BaseSearchBackend):
whoosh_id = get_identifier ( obj_or_string )
whoosh_id = get_identifier ( obj_or_string )
try :
try :
# 通过查询删除文档
self . index . delete_by_query (
self . index . delete_by_query (
q = self . parser . parse (
q = self . parser . parse (
u ' %s : " %s " ' %
u ' %s : " %s " ' %
@ -266,6 +307,7 @@ class WhooshSearchBackend(BaseSearchBackend):
e ,
e ,
exc_info = True )
exc_info = True )
# 清空索引的方法
def clear ( self , models = None , commit = True ) :
def clear ( self , models = None , commit = True ) :
if not self . setup_complete :
if not self . setup_complete :
self . setup ( )
self . setup ( )
@ -277,6 +319,7 @@ class WhooshSearchBackend(BaseSearchBackend):
try :
try :
if models is None :
if models is None :
# 如果未指定模型,删除整个索引
self . delete_index ( )
self . delete_index ( )
else :
else :
models_to_delete = [ ]
models_to_delete = [ ]
@ -286,6 +329,7 @@ class WhooshSearchBackend(BaseSearchBackend):
u " %s : %s " %
u " %s : %s " %
( DJANGO_CT , get_model_ct ( model ) ) )
( DJANGO_CT , get_model_ct ( model ) ) )
# 通过查询删除指定模型的文档
self . index . delete_by_query (
self . index . delete_by_query (
q = self . parser . parse (
q = self . parser . parse (
u " OR " . join ( models_to_delete ) ) )
u " OR " . join ( models_to_delete ) ) )
@ -303,17 +347,18 @@ class WhooshSearchBackend(BaseSearchBackend):
self . log . error (
self . log . error (
" Failed to clear Whoosh index: %s " , e , exc_info = True )
" Failed to clear Whoosh index: %s " , e , exc_info = True )
# 删除索引的方法
def delete_index ( self ) :
def delete_index ( self ) :
# Per the Whoosh mailing list, if wiping out everything from the index,
# 根据Whoosh邮件列表, 如果要清除索引中的所有内容, 直接删除索引文件更高效
# it's much more efficient to simply delete the index files.
if self . use_file_storage and os . path . exists ( self . path ) :
if self . use_file_storage and os . path . exists ( self . path ) :
shutil . rmtree ( self . path )
shutil . rmtree ( self . path )
elif not self . use_file_storage :
elif not self . use_file_storage :
self . storage . clean ( )
self . storage . clean ( )
# Recreate everything.
# 重新创建所有内容
self . setup ( )
self . setup ( )
# 优化索引的方法
def optimize ( self ) :
def optimize ( self ) :
if not self . setup_complete :
if not self . setup_complete :
self . setup ( )
self . setup ( )
@ -321,13 +366,13 @@ class WhooshSearchBackend(BaseSearchBackend):
self . index = self . index . refresh ( )
self . index = self . index . refresh ( )
self . index . optimize ( )
self . index . optimize ( )
# 计算分页的方法
def calculate_page ( self , start_offset = 0 , end_offset = None ) :
def calculate_page ( self , start_offset = 0 , end_offset = None ) :
# Prevent against Whoosh throwing an error. Requires an end_offset
# 防止Whoosh抛出错误。需要end_offset大于0
# greater than 0.
if end_offset is not None and end_offset < = 0 :
if end_offset is not None and end_offset < = 0 :
end_offset = 1
end_offset = 1
# Determine the page.
# 确定页码
page_num = 0
page_num = 0
if end_offset is None :
if end_offset is None :
@ -341,10 +386,11 @@ class WhooshSearchBackend(BaseSearchBackend):
if page_length and page_length > 0 :
if page_length and page_length > 0 :
page_num = int ( start_offset / page_length )
page_num = int ( start_offset / page_length )
# Increment because Whoosh uses 1-based page numbers.
# 递增, 因为Whoosh使用基于1的页码
page_num + = 1
page_num + = 1
return page_num , page_length
return page_num , page_length
# 搜索方法,使用日志查询装饰器
@log_query
@log_query
def search (
def search (
self ,
self ,
@ -369,7 +415,7 @@ class WhooshSearchBackend(BaseSearchBackend):
if not self . setup_complete :
if not self . setup_complete :
self . setup ( )
self . setup ( )
# A zero length query should return no results.
# 零长度查询应该返回无结果
if len ( query_string ) == 0 :
if len ( query_string ) == 0 :
return {
return {
' results ' : [ ] ,
' results ' : [ ] ,
@ -378,8 +424,7 @@ class WhooshSearchBackend(BaseSearchBackend):
query_string = force_str ( query_string )
query_string = force_str ( query_string )
# A one-character query (non-wildcard) gets nabbed by a stopwords
# 单字符查询(非通配符)被停用词过滤器捕获,应该返回零结果
# filter and should yield zero results.
if len ( query_string ) < = 1 and query_string != u ' * ' :
if len ( query_string ) < = 1 and query_string != u ' * ' :
return {
return {
' results ' : [ ] ,
' results ' : [ ] ,
@ -389,9 +434,7 @@ class WhooshSearchBackend(BaseSearchBackend):
reverse = False
reverse = False
if sort_by is not None :
if sort_by is not None :
# Determine if we need to reverse the results and if Whoosh can
# 确定是否需要反转结果以及Whoosh是否可以处理被要求排序的内容
# handle what it's being asked to sort by. Reversing is an
# all-or-nothing action, unfortunately.
sort_by_list = [ ]
sort_by_list = [ ]
reverse_counter = 0
reverse_counter = 0
@ -417,6 +460,7 @@ class WhooshSearchBackend(BaseSearchBackend):
sort_by = sort_by_list [ 0 ]
sort_by = sort_by_list [ 0 ]
# 处理不支持的功能警告
if facets is not None :
if facets is not None :
warnings . warn (
warnings . warn (
" Whoosh does not handle faceting. " ,
" Whoosh does not handle faceting. " ,
@ -438,19 +482,21 @@ class WhooshSearchBackend(BaseSearchBackend):
narrowed_results = None
narrowed_results = None
self . index = self . index . refresh ( )
self . index = self . index . refresh ( )
# 限制到注册模型的设置
if limit_to_registered_models is None :
if limit_to_registered_models is None :
limit_to_registered_models = getattr (
limit_to_registered_models = getattr (
settings , ' HAYSTACK_LIMIT_TO_REGISTERED_MODELS ' , True )
settings , ' HAYSTACK_LIMIT_TO_REGISTERED_MODELS ' , True )
# 处理模型选择
if models and len ( models ) :
if models and len ( models ) :
model_choices = sorted ( get_model_ct ( model ) for model in models )
model_choices = sorted ( get_model_ct ( model ) for model in models )
elif limit_to_registered_models :
elif limit_to_registered_models :
# Using narrow queries, limit the results to only models handled
# 使用窄查询,将结果限制为当前路由器处理的模型
# with the current routers.
model_choices = self . build_models_list ( )
model_choices = self . build_models_list ( )
else :
else :
model_choices = [ ]
model_choices = [ ]
# 构建窄查询
if len ( model_choices ) > 0 :
if len ( model_choices ) > 0 :
if narrow_queries is None :
if narrow_queries is None :
narrow_queries = set ( )
narrow_queries = set ( )
@ -460,9 +506,9 @@ class WhooshSearchBackend(BaseSearchBackend):
narrow_searcher = None
narrow_searcher = None
# 处理窄查询
if narrow_queries is not None :
if narrow_queries is not None :
# Potentially expensive? I don't see another way to do it in
# 可能很昂贵? 在Whoosh中我没有看到其他方法...
# Whoosh...
narrow_searcher = self . index . searcher ( )
narrow_searcher = self . index . searcher ( )
for nq in narrow_queries :
for nq in narrow_queries :
@ -482,11 +528,12 @@ class WhooshSearchBackend(BaseSearchBackend):
self . index = self . index . refresh ( )
self . index = self . index . refresh ( )
# 执行搜索
if self . index . doc_count ( ) :
if self . index . doc_count ( ) :
searcher = self . index . searcher ( )
searcher = self . index . searcher ( )
parsed_query = self . parser . parse ( query_string )
parsed_query = self . parser . parse ( query_string )
# In the event of an invalid/stopworded query, recover gracefully.
# 如果查询无效/包含停用词,优雅地恢复
if parsed_query is None :
if parsed_query is None :
return {
return {
' results ' : [ ] ,
' results ' : [ ] ,
@ -502,11 +549,12 @@ class WhooshSearchBackend(BaseSearchBackend):
' reverse ' : reverse ,
' reverse ' : reverse ,
}
}
# Handle the case where the results have been narrowed.
# 处理结果被缩小的情况
if narrowed_results is not None :
if narrowed_results is not None :
search_kwargs [ ' filter ' ] = narrowed_results
search_kwargs [ ' filter ' ] = narrowed_results
try :
try :
# 执行分页搜索
raw_page = searcher . search_page (
raw_page = searcher . search_page (
parsed_query ,
parsed_query ,
page_num ,
page_num ,
@ -522,8 +570,7 @@ class WhooshSearchBackend(BaseSearchBackend):
' spelling_suggestion ' : None ,
' spelling_suggestion ' : None ,
}
}
# Because as of Whoosh 2.5.1, it will return the wrong page of
# 因为Whoosh 2.5.1中,如果请求的页码太高,它会返回错误的页面
# results if you request something too high. :(
if raw_page . pagenum < page_num :
if raw_page . pagenum < page_num :
return {
return {
' results ' : [ ] ,
' results ' : [ ] ,
@ -531,6 +578,7 @@ class WhooshSearchBackend(BaseSearchBackend):
' spelling_suggestion ' : None ,
' spelling_suggestion ' : None ,
}
}
# 处理搜索结果
results = self . _process_results (
results = self . _process_results (
raw_page ,
raw_page ,
highlight = highlight ,
highlight = highlight ,
@ -544,6 +592,7 @@ class WhooshSearchBackend(BaseSearchBackend):
return results
return results
else :
else :
# 处理空索引的情况
if self . include_spelling :
if self . include_spelling :
if spelling_query :
if spelling_query :
spelling_suggestion = self . create_spelling_suggestion (
spelling_suggestion = self . create_spelling_suggestion (
@ -560,6 +609,7 @@ class WhooshSearchBackend(BaseSearchBackend):
' spelling_suggestion ' : spelling_suggestion ,
' spelling_suggestion ' : spelling_suggestion ,
}
}
# 更多类似此内容的方法(推荐相关)
def more_like_this (
def more_like_this (
self ,
self ,
model_instance ,
model_instance ,
@ -573,8 +623,7 @@ class WhooshSearchBackend(BaseSearchBackend):
if not self . setup_complete :
if not self . setup_complete :
self . setup ( )
self . setup ( )
# Deferred models will have a different class ("RealClass_Deferred_fieldname")
# 延迟模型会有不同的类,不在我们的注册表中
# which won't be in our registry:
model_klass = model_instance . _meta . concrete_model
model_klass = model_instance . _meta . concrete_model
field_name = self . content_field_name
field_name = self . content_field_name
@ -582,19 +631,21 @@ class WhooshSearchBackend(BaseSearchBackend):
narrowed_results = None
narrowed_results = None
self . index = self . index . refresh ( )
self . index = self . index . refresh ( )
# 限制到注册模型的设置
if limit_to_registered_models is None :
if limit_to_registered_models is None :
limit_to_registered_models = getattr (
limit_to_registered_models = getattr (
settings , ' HAYSTACK_LIMIT_TO_REGISTERED_MODELS ' , True )
settings , ' HAYSTACK_LIMIT_TO_REGISTERED_MODELS ' , True )
# 处理模型选择
if models and len ( models ) :
if models and len ( models ) :
model_choices = sorted ( get_model_ct ( model ) for model in models )
model_choices = sorted ( get_model_ct ( model ) for model in models )
elif limit_to_registered_models :
elif limit_to_registered_models :
# Using narrow queries, limit the results to only models handled
# 使用窄查询,将结果限制为当前路由器处理的模型
# with the current routers.
model_choices = self . build_models_list ( )
model_choices = self . build_models_list ( )
else :
else :
model_choices = [ ]
model_choices = [ ]
# 构建窄查询
if len ( model_choices ) > 0 :
if len ( model_choices ) > 0 :
if narrow_queries is None :
if narrow_queries is None :
narrow_queries = set ( )
narrow_queries = set ( )
@ -607,9 +658,8 @@ class WhooshSearchBackend(BaseSearchBackend):
narrow_searcher = None
narrow_searcher = None
# 处理窄查询
if narrow_queries is not None :
if narrow_queries is not None :
# Potentially expensive? I don't see another way to do it in
# Whoosh...
narrow_searcher = self . index . searcher ( )
narrow_searcher = self . index . searcher ( )
for nq in narrow_queries :
for nq in narrow_queries :
@ -632,6 +682,7 @@ class WhooshSearchBackend(BaseSearchBackend):
self . index = self . index . refresh ( )
self . index = self . index . refresh ( )
raw_results = EmptyResults ( )
raw_results = EmptyResults ( )
# 执行更多类似此内容的搜索
if self . index . doc_count ( ) :
if self . index . doc_count ( ) :
query = " %s : %s " % ( ID , get_identifier ( model_instance ) )
query = " %s : %s " % ( ID , get_identifier ( model_instance ) )
searcher = self . index . searcher ( )
searcher = self . index . searcher ( )
@ -642,7 +693,7 @@ class WhooshSearchBackend(BaseSearchBackend):
raw_results = results [ 0 ] . more_like_this (
raw_results = results [ 0 ] . more_like_this (
field_name , top = end_offset )
field_name , top = end_offset )
# Handle the case where the results have been narrowed.
# 处理结果被缩小的情况
if narrowed_results is not None and hasattr ( raw_results , ' filter ' ) :
if narrowed_results is not None and hasattr ( raw_results , ' filter ' ) :
raw_results . filter ( narrowed_results )
raw_results . filter ( narrowed_results )
@ -658,8 +709,7 @@ class WhooshSearchBackend(BaseSearchBackend):
' spelling_suggestion ' : None ,
' spelling_suggestion ' : None ,
}
}
# Because as of Whoosh 2.5.1, it will return the wrong page of
# 因为Whoosh 2.5.1中,如果请求的页码太高,它会返回错误的页面
# results if you request something too high. :(
if raw_page . pagenum < page_num :
if raw_page . pagenum < page_num :
return {
return {
' results ' : [ ] ,
' results ' : [ ] ,
@ -667,6 +717,7 @@ class WhooshSearchBackend(BaseSearchBackend):
' spelling_suggestion ' : None ,
' spelling_suggestion ' : None ,
}
}
# 处理结果
results = self . _process_results ( raw_page , result_class = result_class )
results = self . _process_results ( raw_page , result_class = result_class )
searcher . close ( )
searcher . close ( )
@ -675,6 +726,7 @@ class WhooshSearchBackend(BaseSearchBackend):
return results
return results
# 处理搜索结果的方法
def _process_results (
def _process_results (
self ,
self ,
raw_page ,
raw_page ,
@ -685,8 +737,7 @@ class WhooshSearchBackend(BaseSearchBackend):
from haystack import connections
from haystack import connections
results = [ ]
results = [ ]
# It's important to grab the hits first before slicing. Otherwise, this
# 在切片之前先获取命中数很重要
# can cause pagination failures.
hits = len ( raw_page )
hits = len ( raw_page )
if result_class is None :
if result_class is None :
@ -697,6 +748,7 @@ class WhooshSearchBackend(BaseSearchBackend):
unified_index = connections [ self . connection_alias ] . get_unified_index ( )
unified_index = connections [ self . connection_alias ] . get_unified_index ( )
indexed_models = unified_index . get_indexed_models ( )
indexed_models = unified_index . get_indexed_models ( )
# 处理每个搜索结果
for doc_offset , raw_result in enumerate ( raw_page ) :
for doc_offset , raw_result in enumerate ( raw_page ) :
score = raw_page . score ( doc_offset ) or 0
score = raw_page . score ( doc_offset ) or 0
app_label , model_name = raw_result [ DJANGO_CT ] . split ( ' . ' )
app_label , model_name = raw_result [ DJANGO_CT ] . split ( ' . ' )
@ -710,7 +762,7 @@ class WhooshSearchBackend(BaseSearchBackend):
if string_key in index . fields and hasattr (
if string_key in index . fields and hasattr (
index . fields [ string_key ] , ' convert ' ) :
index . fields [ string_key ] , ' convert ' ) :
# Special-cased due to the nature of KEYWORD fields.
# 由于KEYWORD字段的性质, 需要特殊处理
if index . fields [ string_key ] . is_multivalued :
if index . fields [ string_key ] . is_multivalued :
if value is None or len ( value ) == 0 :
if value is None or len ( value ) == 0 :
additional_fields [ string_key ] = [ ]
additional_fields [ string_key ] = [ ]
@ -723,9 +775,11 @@ class WhooshSearchBackend(BaseSearchBackend):
else :
else :
additional_fields [ string_key ] = self . _to_python ( value )
additional_fields [ string_key ] = self . _to_python ( value )
# 删除系统字段
del ( additional_fields [ DJANGO_CT ] )
del ( additional_fields [ DJANGO_CT ] )
del ( additional_fields [ DJANGO_ID ] )
del ( additional_fields [ DJANGO_ID ] )
# 处理高亮显示
if highlight :
if highlight :
sa = StemmingAnalyzer ( )
sa = StemmingAnalyzer ( )
formatter = WhooshHtmlFormatter ( ' em ' )
formatter = WhooshHtmlFormatter ( ' em ' )
@ -742,6 +796,7 @@ class WhooshSearchBackend(BaseSearchBackend):
self . content_field_name : [ whoosh_result ] ,
self . content_field_name : [ whoosh_result ] ,
}
}
# 创建搜索结果对象
result = result_class (
result = result_class (
app_label ,
app_label ,
model_name ,
model_name ,
@ -752,6 +807,7 @@ class WhooshSearchBackend(BaseSearchBackend):
else :
else :
hits - = 1
hits - = 1
# 处理拼写建议
if self . include_spelling :
if self . include_spelling :
if spelling_query :
if spelling_query :
spelling_suggestion = self . create_spelling_suggestion (
spelling_suggestion = self . create_spelling_suggestion (
@ -767,6 +823,7 @@ class WhooshSearchBackend(BaseSearchBackend):
' spelling_suggestion ' : spelling_suggestion ,
' spelling_suggestion ' : spelling_suggestion ,
}
}
# 创建拼写建议的方法
def create_spelling_suggestion ( self , query_string ) :
def create_spelling_suggestion ( self , query_string ) :
spelling_suggestion = None
spelling_suggestion = None
reader = self . index . reader ( )
reader = self . index . reader ( )
@ -776,17 +833,19 @@ class WhooshSearchBackend(BaseSearchBackend):
if not query_string :
if not query_string :
return spelling_suggestion
return spelling_suggestion
# Clean the string.
# 清理字符串,移除保留字
for rev_word in self . RESERVED_WORDS :
for rev_word in self . RESERVED_WORDS :
cleaned_query = cleaned_query . replace ( rev_word , ' ' )
cleaned_query = cleaned_query . replace ( rev_word , ' ' )
# 清理字符串,移除保留字符
for rev_char in self . RESERVED_CHARACTERS :
for rev_char in self . RESERVED_CHARACTERS :
cleaned_query = cleaned_query . replace ( rev_char , ' ' )
cleaned_query = cleaned_query . replace ( rev_char , ' ' )
# Break it down.
# 分解查询词
query_words = cleaned_query . split ( )
query_words = cleaned_query . split ( )
suggested_words = [ ]
suggested_words = [ ]
# 为每个查询词获取建议
for word in query_words :
for word in query_words :
suggestions = corrector . suggest ( word , limit = 1 )
suggestions = corrector . suggest ( word , limit = 1 )
@ -796,6 +855,7 @@ class WhooshSearchBackend(BaseSearchBackend):
spelling_suggestion = ' ' . join ( suggested_words )
spelling_suggestion = ' ' . join ( suggested_words )
return spelling_suggestion
return spelling_suggestion
# Python值转换为Whoosh字符串的方法
def _from_python ( self , value ) :
def _from_python ( self , value ) :
"""
"""
Converts Python values to a string for Whoosh .
Converts Python values to a string for Whoosh .
@ -813,12 +873,13 @@ class WhooshSearchBackend(BaseSearchBackend):
elif isinstance ( value , ( list , tuple ) ) :
elif isinstance ( value , ( list , tuple ) ) :
value = u ' , ' . join ( [ force_str ( v ) for v in value ] )
value = u ' , ' . join ( [ force_str ( v ) for v in value ] )
elif isinstance ( value , ( six . integer_types , float ) ) :
elif isinstance ( value , ( six . integer_types , float ) ) :
# Leave it alone.
# 保持原样
pass
pass
else :
else :
value = force_str ( value )
value = force_str ( value )
return value
return value
# Whoosh值转换为Python值的方法
def _to_python ( self , value ) :
def _to_python ( self , value ) :
"""
"""
Converts values from Whoosh to native Python values .
Converts values from Whoosh to native Python values .
@ -848,10 +909,10 @@ class WhooshSearchBackend(BaseSearchBackend):
date_values [ ' second ' ] )
date_values [ ' second ' ] )
try :
try :
# Attempt to use json to load the values.
# 尝试使用json加载值
converted_value = json . loads ( value )
converted_value = json . loads ( value )
# Try to handle most built-in types.
# 尝试处理大多数内置类型
if isinstance (
if isinstance (
converted_value ,
converted_value ,
( list ,
( list ,
@ -863,20 +924,22 @@ class WhooshSearchBackend(BaseSearchBackend):
complex ) ) :
complex ) ) :
return converted_value
return converted_value
except BaseException :
except BaseException :
# If it fails (SyntaxError or its ilk) or we don't trust it,
# continue on.
pass
pass
return value
return value
# Whoosh搜索查询类, 继承自BaseSearchQuery
class WhooshSearchQuery ( BaseSearchQuery ) :
class WhooshSearchQuery ( BaseSearchQuery ) :
def _convert_datetime(self, date):
    """
    Render a date or datetime as a compact, digits-only timestamp string.

    :param date: An object providing ``strftime``. Values that also expose
        an ``hour`` attribute are formatted with their time of day.
    :returns: ``force_str`` of the ``YYYYMMDDHHMMSS`` rendering; the time
        portion is the literal ``000000`` when ``date`` has no ``hour``
        attribute.
    """
    if hasattr(date, 'hour'):
        return force_str(date.strftime('%Y%m%d%H%M%S'))
    # No time-of-day on this value: pad the time portion with zeros so both
    # branches produce a string of the same width.
    return force_str(date.strftime('%Y%m%d000000'))
# 清理查询片段的方法
def clean ( self , query_fragment ) :
def clean ( self , query_fragment ) :
"""
"""
Provides a mechanism for sanitizing user input before presenting the
Provides a mechanism for sanitizing user input before presenting the
@ -902,13 +965,14 @@ class WhooshSearchQuery(BaseSearchQuery):
return ' ' . join ( cleaned_words )
return ' ' . join ( cleaned_words )
# 构建查询片段的方法
def build_query_fragment ( self , field , filter_type , value ) :
def build_query_fragment ( self , field , filter_type , value ) :
from haystack import connections
from haystack import connections
query_frag = ' '
query_frag = ' '
is_datetime = False
is_datetime = False
if not hasattr ( value , ' input_type_name ' ) :
if not hasattr ( value , ' input_type_name ' ) :
# Handle when we've got a ``ValuesListQuerySet`` ...
# 处理当我们有``ValuesListQuerySet``时 ...
if hasattr ( value , ' values_list ' ) :
if hasattr ( value , ' values_list ' ) :
value = list ( value )
value = list ( value )
@ -916,26 +980,26 @@ class WhooshSearchQuery(BaseSearchQuery):
is_datetime = True
is_datetime = True
if isinstance ( value , six . string_types ) and value != ' ' :
if isinstance ( value , six . string_types ) and value != ' ' :
# It's not an ``InputType``. Assume ``Clean``.
# 不是``InputType``。假设是``Clean``。
value = Clean ( value )
value = Clean ( value )
else :
else :
value = PythonData ( value )
value = PythonData ( value )
# Prepare the query using the InputType.
# 使用InputType准备查询
prepared_value = value . prepare ( self )
prepared_value = value . prepare ( self )
if not isinstance ( prepared_value , ( set , list , tuple ) ) :
if not isinstance ( prepared_value , ( set , list , tuple ) ) :
# Then convert whatever we get back to what Whoosh wants if needed.
prepared_value = self . backend . _from_python ( prepared_value )
prepared_value = self . backend . _from_python ( prepared_value )
# 'content' is a special reserved word, much like 'pk' in
# Django's ORM layer. It indicates 'no special field'.
if field == ' content ' :
if field == ' content ' :
index_fieldname = ' '
index_fieldname = ' '
else :
else :
index_fieldname = u ' %s : ' % connections [ self . _using ] . get_unified_index (
index_fieldname = u ' %s : ' % connections [ self . _using ] . get_unified_index (
) . get_index_fieldname ( field )
) . get_index_fieldname ( field )
# 过滤器类型映射
filter_types = {
filter_types = {
' content ' : ' %s ' ,
' content ' : ' %s ' ,
' contains ' : ' * %s * ' ,
' contains ' : ' * %s * ' ,
@ -949,6 +1013,7 @@ class WhooshSearchQuery(BaseSearchQuery):
' fuzzy ' : u ' %s ~ ' ,
' fuzzy ' : u ' %s ~ ' ,
}
}
# 构建查询片段
if value . post_process is False :
if value . post_process is False :
query_frag = prepared_value
query_frag = prepared_value
else :
else :
@ -961,8 +1026,7 @@ class WhooshSearchQuery(BaseSearchQuery):
if value . input_type_name == ' exact ' :
if value . input_type_name == ' exact ' :
query_frag = prepared_value
query_frag = prepared_value
else :
else :
# Iterate over terms & incorporate the converted form of
# each into the query.
terms = [ ]
terms = [ ]
if isinstance ( prepared_value , six . string_types ) :
if isinstance ( prepared_value , six . string_types ) :
@ -1026,19 +1090,17 @@ class WhooshSearchQuery(BaseSearchQuery):
query_frag = filter_types [ filter_type ] % prepared_value
query_frag = filter_types [ filter_type ] % prepared_value
# 添加括号
if len ( query_frag ) and not isinstance ( value , Raw ) :
if len ( query_frag ) and not isinstance ( value , Raw ) :
if not query_frag . startswith ( ' ( ' ) and not query_frag . endswith ( ' ) ' ) :
if not query_frag . startswith ( ' ( ' ) and not query_frag . endswith ( ' ) ' ) :
query_frag = " ( %s ) " % query_frag
query_frag = " ( %s ) " % query_frag
return u " %s %s " % ( index_fieldname , query_frag )
return u " %s %s " % ( index_fieldname , query_frag )
# if not filter_type in ('in', 'range'):
# # 'in' is a bit of a special case, as we don't want to
# # convert a valid list/tuple to string. Defer handling it
# # until later...
# value = self.backend._from_python(value)
class WhooshEngine(BaseEngine):
    """
    Engine definition that pairs the Whoosh search backend with its
    matching query class.
    """

    # Backend class used by this engine.
    backend = WhooshSearchBackend
    # Query class used by this engine.
    query = WhooshSearchQuery