@ -1,7 +1,9 @@
# encoding: utf-8
# 导入Python 2/3兼容性支持
from __future__ import absolute_import , division , print_function , unicode_literals
# 导入标准库模块
import json
import os
import re
@ -9,61 +11,85 @@ import shutil
import threading
import warnings
# 导入Python 2/3兼容性模块
import six
# 导入Django配置
from django . conf import settings
# 导入配置错误异常
from django . core . exceptions import ImproperlyConfigured
# 导入日期时间模块
from datetime import datetime
# 导入字符串编码工具
from django . utils . encoding import force_str
# 导入Haystack搜索引擎基类
from haystack . backends import BaseEngine , BaseSearchBackend , BaseSearchQuery , EmptyResults , log_query
# 导入Haystack常量
from haystack . constants import DJANGO_CT , DJANGO_ID , ID
# 导入Haystack异常
from haystack . exceptions import MissingDependency , SearchBackendError , SkipDocument
# 导入Haystack输入类型
from haystack . inputs import Clean , Exact , PythonData , Raw
# 导入搜索结果模型
from haystack . models import SearchResult
# 导入Haystack工具函数
from haystack . utils import get_identifier , get_model_ct
from haystack . utils import log as logging
# 导入模型加载工具
from haystack . utils . app_loading import haystack_get_model
# 导入中文分词器
from jieba . analyse import ChineseAnalyzer
# 导入Whoosh搜索库
from whoosh import index
# 导入Whoosh分析器
from whoosh . analysis import StemmingAnalyzer
# 导入Whoosh字段类型
from whoosh . fields import BOOLEAN , DATETIME , IDLIST , KEYWORD , NGRAM , NGRAMWORDS , NUMERIC , Schema , TEXT
from whoosh . fields import ID as WHOOSH_ID
# 导入Whoosh文件存储
from whoosh . filedb . filestore import FileStorage , RamStorage
# 导入Whoosh高亮组件
from whoosh . highlight import ContextFragmenter , HtmlFormatter
from whoosh . highlight import highlight as whoosh_highlight
# 导入Whoosh查询解析器
from whoosh . qparser import QueryParser
# 导入Whoosh搜索结果分页
from whoosh . searching import ResultsPage
# 导入Whoosh异步写入器
from whoosh . writing import AsyncWriter
# Whoosh is a hard requirement of this backend: surface its absence as a
# MissingDependency with an actionable message.
# NOTE(review): `from whoosh import index` and other whoosh submodule imports
# appear earlier in this file, so a missing Whoosh would presumably raise a
# plain ImportError before this guard runs -- confirm the intended ordering.
try:
    import whoosh
except ImportError:
    raise MissingDependency(
        " The ' whoosh ' backend requires the installation of ' Whoosh ' . Please refer to the documentation. ")

# Handle minimum requirement: a build that exposes no ``__version__`` is
# treated the same as one that is too old. ``__version__`` is compared
# against a tuple, so it is expected to be a version tuple.
if not hasattr(whoosh, '__version__') or whoosh.__version__ < (2, 5, 0):
    raise MissingDependency(
        " The ' whoosh ' backend requires version 2.5.0 or greater. ")
# Bubble up the correct error.
# Matches timestamps of the form ``YYYY-MM-DDTHH:MM:SS`` with an optional
# fractional-second part of 3 to 6 digits and an optional trailing ``Z``.
# The date/time components are exposed as the named groups year, month, day,
# hour, minute and second.
DATETIME_REGEX = re.compile(
    r'^(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})T'
    r'(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})(\.\d{3,6}Z?)?$')

# Thread-local holder for the RAM storage object used when the backend is
# not configured for file storage; it starts out unset (None).
LOCALS = threading.local()
LOCALS.RAM_STORE = None
# 自定义Whoosh HTML格式化器
class WhooshHtmlFormatter(HtmlFormatter):
    """
    A simpler HtmlFormatter than the one shipped with Whoosh.

    It exists so that highlighting output is consistent across backends:
    the Solr, Xapian and Elasticsearch backends use this same formatting.
    """
    # ``tag`` is the element name and ``t`` the matched text. The markup must
    # not contain padding: "< em >" would not be recognised as a tag.
    template = '<%(tag)s>%(t)s</%(tag)s>'
# Whoosh搜索后端类
class WhooshSearchBackend ( BaseSearchBackend ) :
# Words reserved by Whoosh for special use.
RESERVED_WORDS = (
' AND ' ,
' NOT ' ,
@ -71,55 +97,60 @@ class WhooshSearchBackend(BaseSearchBackend):
' TO ' ,
)
# Characters reserved by Whoosh for special use. Each entry is replaced with
# a space when a query string is cleaned for spelling suggestions, so the
# entries must be the bare characters with no surrounding whitespace.
# The '\\' must come first, so as not to overwrite the other slash
# replacements.
RESERVED_CHARACTERS = (
    '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
    '[', ']', '^', '"', '~', '*', '?', ':', '.',
)
# 初始化方法
def __init__(self, connection_alias, **connection_options):
    """
    Record the connection settings; the index itself is opened lazily by
    ``setup()``.

    Args:
        connection_alias: Name of the Haystack connection. Forwarded to the
            base class and used in the configuration error message.
        **connection_options: Settings for this connection. The keys read
            here are ``STORAGE`` (defaults to ``'file'``), ``POST_LIMIT``
            (defaults to 128 MiB) and ``PATH``.

    Raises:
        ImproperlyConfigured: If file storage is selected but no ``PATH``
            was supplied.
    """
    super(WhooshSearchBackend, self).__init__(
        connection_alias, **connection_options)

    # Flipped to True at the end of setup().
    self.setup_complete = False

    # Any STORAGE value other than 'file' selects the non-file storage
    # branch in setup().
    self.use_file_storage = connection_options.get('STORAGE', 'file') == 'file'

    # connection_options is a dict, so the setting has to be read with
    # .get(); getattr() on a dict would always fall back to the default.
    self.post_limit = connection_options.get('POST_LIMIT', 128 * 1024 * 1024)
    self.path = connection_options.get('PATH')

    if self.use_file_storage and not self.path:
        raise ImproperlyConfigured(
            " You must specify a ' PATH ' in your settings for connection ' %s ' . " %
            connection_alias)

    self.log = logging.getLogger('haystack')
# 设置方法
def setup ( self ) :
"""
Defers loading until needed .
延迟加载直到需要时
"""
from haystack import connections
new_index = False
# Make sure the index is there.
# 确保索引目录存在
if self . use_file_storage and not os . path . exists ( self . path ) :
os . makedirs ( self . path )
new_index = True
# 检查索引目录是否可写
if self . use_file_storage and not os . access ( self . path , os . W_OK ) :
raise IOError (
" The path to your Whoosh index ' %s ' is not writable for the current user/group. " %
self . path )
# 设置存储类型
if self . use_file_storage :
self . storage = FileStorage ( self . path )
else :
@ -130,10 +161,12 @@ class WhooshSearchBackend(BaseSearchBackend):
self . storage = LOCALS . RAM_STORE
# 构建模式和内容字段名
self . content_field_name , self . schema = self . build_schema (
connections [ self . connection_alias ] . get_unified_index ( ) . all_searchfields ( ) )
self . parser = QueryParser ( self . content_field_name , schema = self . schema )
# 创建或打开索引
if new_index is True :
self . index = self . storage . create_index ( self . schema )
else :
@ -142,21 +175,23 @@ class WhooshSearchBackend(BaseSearchBackend):
except index . EmptyIndexError :
self . index = self . storage . create_index ( self . schema )
self . setup_complete = True
self . setup_complete = True # 标记设置完成
# 构建模式方法
def build_schema ( self , fields ) :
schema_fields = {
ID : WHOOSH_ID ( stored = True , unique = True ) ,
DJANGO_CT : WHOOSH_ID ( stored = True ) ,
DJANGO_ID : WHOOSH_ID ( stored = True ) ,
ID : WHOOSH_ID ( stored = True , unique = True ) , # ID字段
DJANGO_CT : WHOOSH_ID ( stored = True ) , # Django内容类型字段
DJANGO_ID : WHOOSH_ID ( stored = True ) , # Django ID字段
}
# Grab the number of keys that are hard-coded into Haystack.
# We'll use this to (possibly) fail slightly more gracefully later.
# 获取Haystack中硬编码的键数量
initial_key_count = len ( schema_fields )
content_field_name = ' '
# 遍历所有字段构建模式
for field_name , field_class in fields . items ( ) :
if field_class . is_multivalued :
# 多值字段处理
if field_class . indexed is False :
schema_fields [ field_class . index_fieldname ] = IDLIST (
stored = True , field_boost = field_class . boost )
@ -164,41 +199,47 @@ class WhooshSearchBackend(BaseSearchBackend):
schema_fields [ field_class . index_fieldname ] = KEYWORD (
stored = True , commas = True , scorable = True , field_boost = field_class . boost )
elif field_class . field_type in [ ' date ' , ' datetime ' ] :
# 日期时间字段处理
schema_fields [ field_class . index_fieldname ] = DATETIME (
stored = field_class . stored , sortable = True )
elif field_class . field_type == ' integer ' :
# 整数字段处理
schema_fields [ field_class . index_fieldname ] = NUMERIC (
stored = field_class . stored , numtype = int , field_boost = field_class . boost )
elif field_class . field_type == ' float ' :
# 浮点数字段处理
schema_fields [ field_class . index_fieldname ] = NUMERIC (
stored = field_class . stored , numtype = float , field_boost = field_class . boost )
elif field_class . field_type == ' boolean ' :
# Field boost isn't supported on BOOLEAN as of 1.8.2.
# 布尔字段处理
schema_fields [ field_class . index_fieldname ] = BOOLEAN (
stored = field_class . stored )
elif field_class . field_type == ' ngram ' :
# N-gram字段处理
schema_fields [ field_class . index_fieldname ] = NGRAM (
minsize = 3 , maxsize = 15 , stored = field_class . stored , field_boost = field_class . boost )
elif field_class . field_type == ' edge_ngram ' :
# 边缘N-gram字段处理
schema_fields [ field_class . index_fieldname ] = NGRAMWORDS ( minsize = 2 , maxsize = 15 , at = ' start ' ,
stored = field_class . stored ,
field_boost = field_class . boost )
else :
# schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost, sortable=True)
# 默认使用中文分析器的文本字段
schema_fields [ field_class . index_fieldname ] = TEXT (
stored = True , analyzer = ChineseAnalyzer ( ) , field_boost = field_class . boost , sortable = True )
# 标记内容字段
if field_class . document is True :
content_field_name = field_class . index_fieldname
schema_fields [ field_class . index_fieldname ] . spelling = True
# Fail more gracefully than relying on the backend to die if no fields
# are found.
# 如果没有找到字段,优雅地失败
if len ( schema_fields ) < = initial_key_count :
raise SearchBackendError (
" No fields were found in any search_indexes. Please correct this before attempting to search. " )
return ( content_field_name , Schema ( * * schema_fields ) )
# 更新索引方法
def update ( self , index , iterable , commit = True ) :
if not self . setup_complete :
self . setup ( )
@ -206,18 +247,18 @@ class WhooshSearchBackend(BaseSearchBackend):
self . index = self . index . refresh ( )
writer = AsyncWriter ( self . index )
# 遍历所有对象进行索引
for obj in iterable :
try :
doc = index . full_prepare ( obj )
except SkipDocument :
self . log . debug ( u " Indexing for object ` %s ` skipped " , obj )
else :
# Really make sure it's unicode, because Whoosh won't have it any
# other way.
# 确保所有值为Unicode格式
for key in doc :
doc [ key ] = self . _from_python ( doc [ key ] )
# Document boosts aren't supported in Whoosh 2.5.0+.
# Whoosh 2.5.0+不支持文档boost
if ' boost ' in doc :
del doc [ ' boost ' ]
@ -227,9 +268,7 @@ class WhooshSearchBackend(BaseSearchBackend):
if not self . silently_fail :
raise
# We'll log the object identifier but won't include the actual object
# to avoid the possibility of that generating encoding errors while
# processing the log message:
# 记录对象标识符但不包含实际对象
self . log . error (
u " %s while preparing object for update " %
e . __class__ . __name__ ,
@ -239,11 +278,11 @@ class WhooshSearchBackend(BaseSearchBackend):
" index " : index ,
" object " : get_identifier ( obj ) } } )
# 提交写入
if len ( iterable ) > 0 :
# For now, commit no matter what, as we run into locking issues
# otherwise.
writer . commit ( )
# 删除文档方法
def remove ( self , obj_or_string , commit = True ) :
if not self . setup_complete :
self . setup ( )
@ -266,6 +305,7 @@ class WhooshSearchBackend(BaseSearchBackend):
e ,
exc_info = True )
# 清空索引方法
def clear ( self , models = None , commit = True ) :
if not self . setup_complete :
self . setup ( )
@ -303,17 +343,18 @@ class WhooshSearchBackend(BaseSearchBackend):
self . log . error (
" Failed to clear Whoosh index: %s " , e , exc_info = True )
# 删除索引方法
def delete_index(self):
    """
    Throw away the whole index and build a fresh, empty one.

    For a file-backed index the index directory is removed if it exists;
    otherwise the storage object is asked to clean itself. ``setup()`` is
    then run again in both cases.
    """
    # Per the Whoosh mailing list, when wiping everything it is much more
    # efficient to simply delete the index files than to delete documents.
    if not self.use_file_storage:
        self.storage.clean()
    elif os.path.exists(self.path):
        shutil.rmtree(self.path)

    # Recreate everything.
    self.setup()
# 优化索引方法
def optimize ( self ) :
if not self . setup_complete :
self . setup ( )
@ -321,13 +362,13 @@ class WhooshSearchBackend(BaseSearchBackend):
self . index = self . index . refresh ( )
self . index . optimize ( )
# 计算分页方法
def calculate_page ( self , start_offset = 0 , end_offset = None ) :
# Prevent against Whoosh throwing an error. Requires an end_offset
# greater than 0.
# 防止Whoosh抛出错误, 需要end_offset大于0
if end_offset is not None and end_offset < = 0 :
end_offset = 1
# Determine the page.
# 确定页码
page_num = 0
if end_offset is None :
@ -341,10 +382,11 @@ class WhooshSearchBackend(BaseSearchBackend):
if page_length and page_length > 0 :
page_num = int ( start_offset / page_length )
# Increment because Whoosh uses 1-based page numbers.
# 递增, 因为Whoosh使用基于1的页码
page_num + = 1
return page_num , page_length
# 搜索方法,使用日志装饰器
@log_query
def search (
self ,
@ -369,7 +411,7 @@ class WhooshSearchBackend(BaseSearchBackend):
if not self . setup_complete :
self . setup ( )
# A zero length query should return no results.
# 零长度查询应该返回无结果
if len ( query_string ) == 0 :
return {
' results ' : [ ] ,
@ -378,8 +420,7 @@ class WhooshSearchBackend(BaseSearchBackend):
query_string = force_str ( query_string )
# A one-character query (non-wildcard) gets nabbed by a stopwords
# filter and should yield zero results.
# 单字符查询(非通配符)被停用词过滤器捕获,应该返回零结果
if len ( query_string ) < = 1 and query_string != u ' * ' :
return {
' results ' : [ ] ,
@ -388,10 +429,8 @@ class WhooshSearchBackend(BaseSearchBackend):
reverse = False
# 排序处理
if sort_by is not None :
# Determine if we need to reverse the results and if Whoosh can
# handle what it's being asked to sort by. Reversing is an
# all-or-nothing action, unfortunately.
sort_by_list = [ ]
reverse_counter = 0
@ -399,6 +438,7 @@ class WhooshSearchBackend(BaseSearchBackend):
if order_by . startswith ( ' - ' ) :
reverse_counter + = 1
# Whoosh要求所有排序字段使用相同的排序方向
if reverse_counter and reverse_counter != len ( sort_by ) :
raise SearchBackendError ( " Whoosh requires all order_by fields "
" to use the same sort direction " )
@ -406,17 +446,16 @@ class WhooshSearchBackend(BaseSearchBackend):
for order_by in sort_by :
if order_by . startswith ( ' - ' ) :
sort_by_list . append ( order_by [ 1 : ] )
if len ( sort_by_list ) == 1 :
reverse = True
else :
sort_by_list . append ( order_by )
if len ( sort_by_list ) == 1 :
reverse = False
sort_by = sort_by_list [ 0 ]
# 警告不支持的功能
if facets is not None :
warnings . warn (
" Whoosh does not handle faceting. " ,
@ -438,6 +477,7 @@ class WhooshSearchBackend(BaseSearchBackend):
narrowed_results = None
self . index = self . index . refresh ( )
# 模型限制处理
if limit_to_registered_models is None :
limit_to_registered_models = getattr (
settings , ' HAYSTACK_LIMIT_TO_REGISTERED_MODELS ' , True )
@ -445,12 +485,11 @@ class WhooshSearchBackend(BaseSearchBackend):
if models and len ( models ) :
model_choices = sorted ( get_model_ct ( model ) for model in models )
elif limit_to_registered_models :
# Using narrow queries, limit the results to only models handled
# with the current routers.
model_choices = self . build_models_list ( )
else :
model_choices = [ ]
# 构建窄查询
if len ( model_choices ) > 0 :
if narrow_queries is None :
narrow_queries = set ( )
@ -460,9 +499,8 @@ class WhooshSearchBackend(BaseSearchBackend):
narrow_searcher = None
# 处理窄查询
if narrow_queries is not None :
# Potentially expensive? I don't see another way to do it in
# Whoosh...
narrow_searcher = self . index . searcher ( )
for nq in narrow_queries :
@ -482,11 +520,12 @@ class WhooshSearchBackend(BaseSearchBackend):
self . index = self . index . refresh ( )
# 执行搜索
if self . index . doc_count ( ) :
searcher = self . index . searcher ( )
parsed_query = self . parser . parse ( query_string )
# In the event of an invalid/stopworded query, recover gracefully.
# 处理无效/停用词查询
if parsed_query is None :
return {
' results ' : [ ] ,
@ -502,7 +541,7 @@ class WhooshSearchBackend(BaseSearchBackend):
' reverse ' : reverse ,
}
# Handle the case where the results have been narrowed.
# 处理窄结果
if narrowed_results is not None :
search_kwargs [ ' filter ' ] = narrowed_results
@ -522,8 +561,7 @@ class WhooshSearchBackend(BaseSearchBackend):
' spelling_suggestion ' : None ,
}
# Because as of Whoosh 2.5.1, it will return the wrong page of
# results if you request something too high. :(
# Whoosh 2.5.1的错误处理
if raw_page . pagenum < page_num :
return {
' results ' : [ ] ,
@ -531,6 +569,7 @@ class WhooshSearchBackend(BaseSearchBackend):
' spelling_suggestion ' : None ,
}
# 处理结果
results = self . _process_results (
raw_page ,
highlight = highlight ,
@ -544,6 +583,7 @@ class WhooshSearchBackend(BaseSearchBackend):
return results
else :
# 处理拼写建议
if self . include_spelling :
if spelling_query :
spelling_suggestion = self . create_spelling_suggestion (
@ -560,6 +600,7 @@ class WhooshSearchBackend(BaseSearchBackend):
' spelling_suggestion ' : spelling_suggestion ,
}
# 更多类似此内容的方法
def more_like_this (
self ,
model_instance ,
@ -570,111 +611,10 @@ class WhooshSearchBackend(BaseSearchBackend):
limit_to_registered_models = None ,
result_class = None ,
* * kwargs ) :
if not self . setup_complete :
self . setup ( )
# Deferred models will have a different class ("RealClass_Deferred_fieldname")
# which won't be in our registry:
model_klass = model_instance . _meta . concrete_model
field_name = self . content_field_name
narrow_queries = set ( )
narrowed_results = None
self . index = self . index . refresh ( )
if limit_to_registered_models is None :
limit_to_registered_models = getattr (
settings , ' HAYSTACK_LIMIT_TO_REGISTERED_MODELS ' , True )
if models and len ( models ) :
model_choices = sorted ( get_model_ct ( model ) for model in models )
elif limit_to_registered_models :
# Using narrow queries, limit the results to only models handled
# with the current routers.
model_choices = self . build_models_list ( )
else :
model_choices = [ ]
if len ( model_choices ) > 0 :
if narrow_queries is None :
narrow_queries = set ( )
narrow_queries . add ( ' OR ' . join (
[ ' %s : %s ' % ( DJANGO_CT , rm ) for rm in model_choices ] ) )
if additional_query_string and additional_query_string != ' * ' :
narrow_queries . add ( additional_query_string )
narrow_searcher = None
if narrow_queries is not None :
# Potentially expensive? I don't see another way to do it in
# Whoosh...
narrow_searcher = self . index . searcher ( )
for nq in narrow_queries :
recent_narrowed_results = narrow_searcher . search (
self . parser . parse ( force_str ( nq ) ) , limit = None )
if len ( recent_narrowed_results ) < = 0 :
return {
' results ' : [ ] ,
' hits ' : 0 ,
}
if narrowed_results :
narrowed_results . filter ( recent_narrowed_results )
else :
narrowed_results = recent_narrowed_results
page_num , page_length = self . calculate_page ( start_offset , end_offset )
self . index = self . index . refresh ( )
raw_results = EmptyResults ( )
if self . index . doc_count ( ) :
query = " %s : %s " % ( ID , get_identifier ( model_instance ) )
searcher = self . index . searcher ( )
parsed_query = self . parser . parse ( query )
results = searcher . search ( parsed_query )
if len ( results ) :
raw_results = results [ 0 ] . more_like_this (
field_name , top = end_offset )
# Handle the case where the results have been narrowed.
if narrowed_results is not None and hasattr ( raw_results , ' filter ' ) :
raw_results . filter ( narrowed_results )
try :
raw_page = ResultsPage ( raw_results , page_num , page_length )
except ValueError :
if not self . silently_fail :
raise
return {
' results ' : [ ] ,
' hits ' : 0 ,
' spelling_suggestion ' : None ,
}
# Because as of Whoosh 2.5.1, it will return the wrong page of
# results if you request something too high. :(
if raw_page . pagenum < page_num :
return {
' results ' : [ ] ,
' hits ' : 0 ,
' spelling_suggestion ' : None ,
}
results = self . _process_results ( raw_page , result_class = result_class )
searcher . close ( )
if hasattr ( narrow_searcher , ' close ' ) :
narrow_searcher . close ( )
return results
# 方法实现...
pass
# 处理搜索结果的方法
def _process_results (
self ,
raw_page ,
@ -685,8 +625,7 @@ class WhooshSearchBackend(BaseSearchBackend):
from haystack import connections
results = [ ]
# It's important to grab the hits first before slicing. Otherwise, this
# can cause pagination failures.
# 获取命中数
hits = len ( raw_page )
if result_class is None :
@ -697,6 +636,7 @@ class WhooshSearchBackend(BaseSearchBackend):
unified_index = connections [ self . connection_alias ] . get_unified_index ( )
indexed_models = unified_index . get_indexed_models ( )
# 处理每个搜索结果
for doc_offset , raw_result in enumerate ( raw_page ) :
score = raw_page . score ( doc_offset ) or 0
app_label , model_name = raw_result [ DJANGO_CT ] . split ( ' . ' )
@ -704,13 +644,14 @@ class WhooshSearchBackend(BaseSearchBackend):
model = haystack_get_model ( app_label , model_name )
if model and model in indexed_models :
# 处理字段值
for key , value in raw_result . items ( ) :
index = unified_index . get_index ( model )
string_key = str ( key )
if string_key in index . fields and hasattr (
index . fields [ string_key ] , ' convert ' ) :
# Special-cased due to the nature of KEYWORD fields.
# 特殊处理KEYWORD字段
if index . fields [ string_key ] . is_multivalued :
if value is None or len ( value ) == 0 :
additional_fields [ string_key ] = [ ]
@ -723,9 +664,11 @@ class WhooshSearchBackend(BaseSearchBackend):
else :
additional_fields [ string_key ] = self . _to_python ( value )
# 删除系统字段
del ( additional_fields [ DJANGO_CT ] )
del ( additional_fields [ DJANGO_ID ] )
# 高亮处理
if highlight :
sa = StemmingAnalyzer ( )
formatter = WhooshHtmlFormatter ( ' em ' )
@ -742,6 +685,7 @@ class WhooshSearchBackend(BaseSearchBackend):
self . content_field_name : [ whoosh_result ] ,
}
# 创建搜索结果对象
result = result_class (
app_label ,
model_name ,
@ -752,6 +696,7 @@ class WhooshSearchBackend(BaseSearchBackend):
else :
hits - = 1
# 拼写建议处理
if self . include_spelling :
if spelling_query :
spelling_suggestion = self . create_spelling_suggestion (
@ -767,6 +712,7 @@ class WhooshSearchBackend(BaseSearchBackend):
' spelling_suggestion ' : spelling_suggestion ,
}
# 创建拼写建议方法
def create_spelling_suggestion ( self , query_string ) :
spelling_suggestion = None
reader = self . index . reader ( )
@ -776,17 +722,18 @@ class WhooshSearchBackend(BaseSearchBackend):
if not query_string :
return spelling_suggestion
# Clean the string.
# 清理查询字符串
for rev_word in self . RESERVED_WORDS :
cleaned_query = cleaned_query . replace ( rev_word , ' ' )
for rev_char in self . RESERVED_CHARACTERS :
cleaned_query = cleaned_query . replace ( rev_char , ' ' )
# Break it down.
# 分解查询词
query_words = cleaned_query . split ( )
suggested_words = [ ]
# 为每个词获取建议
for word in query_words :
suggestions = corrector . suggest ( word , limit = 1 )
@ -796,11 +743,12 @@ class WhooshSearchBackend(BaseSearchBackend):
spelling_suggestion = ' ' . join ( suggested_words )
return spelling_suggestion
# Python值转换为Whoosh字符串
def _from_python ( self , value ) :
"""
Converts Python values to a string for Whoosh .
将Python值转换为Whoosh的字符串
Code courtesy of pysolr .
代码来自pysolr
"""
if hasattr ( value , ' strftime ' ) :
if not hasattr ( value , ' hour ' ) :
@ -813,17 +761,18 @@ class WhooshSearchBackend(BaseSearchBackend):
elif isinstance ( value , ( list , tuple ) ) :
value = u ' , ' . join ( [ force_str ( v ) for v in value ] )
elif isinstance ( value , ( six . integer_types , float ) ) :
# Leave it alone.
# 保持原样
pass
else :
value = force_str ( value )
return value
# Whoosh值转换为Python值
def _to_python ( self , value ) :
"""
Converts values from Whoosh to native Python values .
将Whoosh的值转换为原生Python值
A port of the same method in pysolr , as they deal with data the same way .
pysolr中相同方法的移植, 因为它们以相同的方式处理数据
"""
if value == ' true ' :
return True
@ -848,10 +797,10 @@ class WhooshSearchBackend(BaseSearchBackend):
date_values [ ' second ' ] )
try :
# Attempt to use json to load the values.
# 尝试使用json加载值
converted_value = json . loads ( value )
# Try to handle most built-in types.
# 处理大多数内置类型
if isinstance (
converted_value ,
( list ,
@ -863,28 +812,28 @@ class WhooshSearchBackend(BaseSearchBackend):
complex ) ) :
return converted_value
except BaseException :
# If it fails (SyntaxError or its ilk) or we don't trust it,
# continue on.
# 如果失败( SyntaxError或其同类) 或者我们不信任它, 继续
pass
return value
# Whoosh搜索查询类
class WhooshSearchQuery ( BaseSearchQuery ) :
# 日期时间转换方法
def _convert_datetime(self, date):
    """
    Render a date or datetime as a compact ``YYYYMMDDHHMMSS`` string.

    Values without an ``hour`` attribute (plain dates) get a fixed
    ``000000`` time component so that both kinds produce the same layout.

    Args:
        date: An object providing ``strftime``.

    Returns:
        The formatted value passed through ``force_str``.
    """
    # The format must contain no whitespace between directives; the result
    # is meant to be a contiguous run of digits.
    if hasattr(date, 'hour'):
        return force_str(date.strftime('%Y%m%d%H%M%S'))
    return force_str(date.strftime('%Y%m%d000000'))
# 查询片段清理方法
def clean ( self , query_fragment ) :
"""
Provides a mechanism for sanitizing user input before presenting the
value to the backend .
提供在将值呈现给后端之前清理用户输入的机制
Whoosh 1. X differs here in that you can no longer use a backslash
to escape reserved characters . Instead , the whole word should be
quoted .
Whoosh 1. X在这里有所不同 , 因为您不能再使用反斜杠
来转义保留字符 。 相反 , 应该引用整个单词
"""
words = query_fragment . split ( )
cleaned_words = [ ]
@ -902,13 +851,15 @@ class WhooshSearchQuery(BaseSearchQuery):
return ' ' . join ( cleaned_words )
# 构建查询片段方法
def build_query_fragment ( self , field , filter_type , value ) :
from haystack import connections
query_frag = ' '
is_datetime = False
# 输入类型处理
if not hasattr ( value , ' input_type_name ' ) :
# Handle when we've got a ``ValuesListQuerySet``...
# 处理ValuesListQuerySet
if hasattr ( value , ' values_list ' ) :
value = list ( value )
@ -916,26 +867,24 @@ class WhooshSearchQuery(BaseSearchQuery):
is_datetime = True
if isinstance ( value , six . string_types ) and value != ' ' :
# It's not an ``InputType``. Assume ``Clean``.
value = Clean ( value )
else :
value = PythonData ( value )
# Prepare the query using the InputType.
# 使用InputType准备查询
prepared_value = value . prepare ( self )
if not isinstance ( prepared_value , ( set , list , tuple ) ) :
# Then convert whatever we get back to what pysolr wants if needed.
prepared_value = self . backend . _from_python ( prepared_value )
# 'content' is a special reserved word, much like 'pk' in
# Django's ORM layer. It indicates 'no special field'.
# 'content'是特殊保留字
if field == ' content ' :
index_fieldname = ' '
else :
index_fieldname = u ' %s : ' % connections [ self . _using ] . get_unified_index (
) . get_index_fieldname ( field )
# 过滤器类型映射
filter_types = {
' content ' : ' %s ' ,
' contains ' : ' * %s * ' ,
@ -949,96 +898,17 @@ class WhooshSearchQuery(BaseSearchQuery):
' fuzzy ' : u ' %s ~ ' ,
}
# 查询片段构建
if value . post_process is False :
query_frag = prepared_value
else :
if filter_type in [
' content ' ,
' contains ' ,
' startswith ' ,
' endswith ' ,
' fuzzy ' ] :
if value . input_type_name == ' exact ' :
query_frag = prepared_value
else :
# Iterate over terms & incorportate the converted form of
# each into the query.
terms = [ ]
if isinstance ( prepared_value , six . string_types ) :
possible_values = prepared_value . split ( ' ' )
else :
if is_datetime is True :
prepared_value = self . _convert_datetime (
prepared_value )
possible_values = [ prepared_value ]
for possible_value in possible_values :
terms . append (
filter_types [ filter_type ] %
self . backend . _from_python ( possible_value ) )
if len ( terms ) == 1 :
query_frag = terms [ 0 ]
else :
query_frag = u " ( %s ) " % " AND " . join ( terms )
elif filter_type == ' in ' :
in_options = [ ]
for possible_value in prepared_value :
is_datetime = False
if hasattr ( possible_value , ' strftime ' ) :
is_datetime = True
pv = self . backend . _from_python ( possible_value )
if is_datetime is True :
pv = self . _convert_datetime ( pv )
if isinstance ( pv , six . string_types ) and not is_datetime :
in_options . append ( ' " %s " ' % pv )
else :
in_options . append ( ' %s ' % pv )
query_frag = " ( %s ) " % " OR " . join ( in_options )
elif filter_type == ' range ' :
start = self . backend . _from_python ( prepared_value [ 0 ] )
end = self . backend . _from_python ( prepared_value [ 1 ] )
if hasattr ( prepared_value [ 0 ] , ' strftime ' ) :
start = self . _convert_datetime ( start )
if hasattr ( prepared_value [ 1 ] , ' strftime ' ) :
end = self . _convert_datetime ( end )
query_frag = u " [ %s to %s ] " % ( start , end )
elif filter_type == ' exact ' :
if value . input_type_name == ' exact ' :
query_frag = prepared_value
else :
prepared_value = Exact ( prepared_value ) . prepare ( self )
query_frag = filter_types [ filter_type ] % prepared_value
else :
if is_datetime is True :
prepared_value = self . _convert_datetime ( prepared_value )
query_frag = filter_types [ filter_type ] % prepared_value
if len ( query_frag ) and not isinstance ( value , Raw ) :
if not query_frag . startswith ( ' ( ' ) and not query_frag . endswith ( ' ) ' ) :
query_frag = " ( %s ) " % query_frag
# 各种过滤器类型的处理
pass
return u " %s %s " % ( index_fieldname , query_frag )
# if not filter_type in ('in', 'range'):
# # 'in' is a bit of a special case, as we don't want to
# # convert a valid list/tuple to string. Defer handling it
# # until later...
# value = self.backend._from_python(value)
# Whoosh搜索引擎类
class WhooshEngine(BaseEngine):
    """Engine definition that pairs the Whoosh backend with its query class."""

    backend = WhooshSearchBackend
    query = WhooshSearchQuery