@ -1,5 +1,19 @@
# encoding: utf-8
"""
Whoosh中文搜索后端模块
本模块提供了基于Whoosh搜索引擎的中文全文搜索功能 , 专门针对Django Haystack框架进行定制 。
集成了jieba中文分词器 , 支持中文文本的高效索引和搜索 。
主要特性 :
- 中文分词支持 ( 使用jieba )
- 高性能索引和搜索
- 拼写建议和查询高亮
- 多字段类型支持 ( 文本 、 数字 、 日期等 )
- 与Django Haystack框架深度集成
"""
from __future__ import absolute_import , division , print_function , unicode_literals
import json
@ -40,30 +54,39 @@ except ImportError:
raise MissingDependency (
" The ' whoosh ' backend requires the installation of ' Whoosh ' . Please refer to the documentation. " )
# Handle minimum requirement: the backend refuses to load unless the
# installed Whoosh exposes ``__version__`` and it is at least (2, 5, 0).
if not hasattr ( whoosh , ' __version__ ' ) or whoosh . __version__ < ( 2 , 5 , 0 ) :
    raise MissingDependency (
        " The ' whoosh ' backend requires version 2.5.0 or greater. " )
# Regular expression used to recognise date/time strings: named groups for
# year, month, day, hour, minute and second, followed by an optional
# fractional-seconds suffix.
DATETIME_REGEX = re . compile (
    ' ^(?P<year> \ d {4} )-(?P<month> \ d {2} )-(?P<day> \ d {2} )T(?P<hour> \ d {2} ):(?P<minute> \ d {2} ):(?P<second> \ d {2} )( \ . \ d { 3,6}Z?)?$ ' )
# Thread-local holder for the in-memory (RAM) index storage; starts out empty.
LOCALS = threading . local ( )
LOCALS . RAM_STORE = None
class WhooshHtmlFormatter ( HtmlFormatter ) :
    """
    This is a HtmlFormatter simpler than the whoosh.HtmlFormatter.

    We use it to have consistent highlighting results across backends.
    Specifically, Solr, Xapian and Elasticsearch are using this formatting.
    """
    # Minimal markup template: only the tag name and the text are substituted,
    # with no additional attributes.
    template = ' < %(tag)s > %(t)s </ %(tag)s > '
class WhooshSearchBackend ( BaseSearchBackend ) :
# Word reserved by Whoosh for special use.
"""
Whoosh搜索后端实现
继承自Haystack的BaseSearchBackend , 提供Whoosh搜索引擎的核心功能 。
支持文件存储和内存存储两种方式 。
"""
# Whoosh保留关键字
RESERVED_WORDS = (
' AND ' ,
' NOT ' ,
@ -71,15 +94,20 @@ class WhooshSearchBackend(BaseSearchBackend):
' TO ' ,
)
# Characters reserved by Whoosh for special use.
# The '\\' must come first, so as not to overwrite the other slash
# replacements.
# Whoosh保留字符
RESERVED_CHARACTERS = (
' \\ ' , ' + ' , ' - ' , ' && ' , ' || ' , ' ! ' , ' ( ' , ' ) ' , ' { ' , ' } ' ,
' [ ' , ' ] ' , ' ^ ' , ' " ' , ' ~ ' , ' * ' , ' ? ' , ' : ' , ' . ' ,
)
def __init__ ( self , connection_alias , * * connection_options ) :
"""
初始化Whoosh搜索后端
Args :
connection_alias : 连接别名
* * connection_options : 连接配置选项
"""
super (
WhooshSearchBackend ,
self ) . __init__ (
@ -93,9 +121,11 @@ class WhooshSearchBackend(BaseSearchBackend):
128 * 1024 * 1024 )
self . path = connection_options . get ( ' PATH ' )
# 检查存储类型
if connection_options . get ( ' STORAGE ' , ' file ' ) != ' file ' :
self . use_file_storage = False
# 文件存储必须指定路径
if self . use_file_storage and not self . path :
raise ImproperlyConfigured (
" You must specify a ' PATH ' in your settings for connection ' %s ' . " %
@ -105,21 +135,26 @@ class WhooshSearchBackend(BaseSearchBackend):
def setup ( self ) :
"""
Defers loading until needed .
初始化设置
延迟加载 , 在需要时进行初始化 。
创建或打开索引 , 构建schema 。
"""
from haystack import connections
new_index = False
# Make sure the index is there.
# 确保索引目录存在
if self . use_file_storage and not os . path . exists ( self . path ) :
os . makedirs ( self . path )
new_index = True
# 检查目录写入权限
if self . use_file_storage and not os . access ( self . path , os . W_OK ) :
raise IOError (
" The path to your Whoosh index ' %s ' is not writable for the current user/group. " %
self . path )
# 初始化存储
if self . use_file_storage :
self . storage = FileStorage ( self . path )
else :
@ -130,10 +165,12 @@ class WhooshSearchBackend(BaseSearchBackend):
self . storage = LOCALS . RAM_STORE
# 构建schema和解析器
self . content_field_name , self . schema = self . build_schema (
connections [ self . connection_alias ] . get_unified_index ( ) . all_searchfields ( ) )
self . parser = QueryParser ( self . content_field_name , schema = self . schema )
# 创建或打开索引
if new_index is True :
self . index = self . storage . create_index ( self . schema )
else :
@ -145,18 +182,30 @@ class WhooshSearchBackend(BaseSearchBackend):
self . setup_complete = True
def build_schema ( self , fields ) :
"""
构建Whoosh schema
根据字段定义创建Whoosh索引schema 。
Args :
fields : 字段定义字典
Returns :
tuple : ( 内容字段名 , schema对象 )
"""
# 基础字段
schema_fields = {
ID : WHOOSH_ID ( stored = True , unique = True ) ,
DJANGO_CT : WHOOSH_ID ( stored = True ) ,
DJANGO_ID : WHOOSH_ID ( stored = True ) ,
}
# Grab the number of keys that are hard-coded into Haystack.
# We'll use this to (possibly) fail slightly more gracefully later.
initial_key_count = len ( schema_fields )
content_field_name = ' '
# 处理每个字段
for field_name , field_class in fields . items ( ) :
if field_class . is_multivalued :
# 多值字段
if field_class . indexed is False :
schema_fields [ field_class . index_fieldname ] = IDLIST (
stored = True , field_boost = field_class . boost )
@ -164,35 +213,42 @@ class WhooshSearchBackend(BaseSearchBackend):
schema_fields [ field_class . index_fieldname ] = KEYWORD (
stored = True , commas = True , scorable = True , field_boost = field_class . boost )
elif field_class . field_type in [ ' date ' , ' datetime ' ] :
# 日期时间字段
schema_fields [ field_class . index_fieldname ] = DATETIME (
stored = field_class . stored , sortable = True )
elif field_class . field_type == ' integer ' :
# 整数字段
schema_fields [ field_class . index_fieldname ] = NUMERIC (
stored = field_class . stored , numtype = int , field_boost = field_class . boost )
elif field_class . field_type == ' float ' :
# 浮点数字段
schema_fields [ field_class . index_fieldname ] = NUMERIC (
stored = field_class . stored , numtype = float , field_boost = field_class . boost )
elif field_class . field_type == ' boolean ' :
# Field boost isn't supported on BOOLEAN as of 1.8.2.
# 布尔字段
schema_fields [ field_class . index_fieldname ] = BOOLEAN (
stored = field_class . stored )
elif field_class . field_type == ' ngram ' :
# N-gram字段
schema_fields [ field_class . index_fieldname ] = NGRAM (
minsize = 3 , maxsize = 15 , stored = field_class . stored , field_boost = field_class . boost )
elif field_class . field_type == ' edge_ngram ' :
# 边缘N-gram字段
schema_fields [ field_class . index_fieldname ] = NGRAMWORDS ( minsize = 2 , maxsize = 15 , at = ' start ' ,
stored = field_class . stored ,
field_boost = field_class . boost )
else :
# 文本字段 - 使用中文分析器
# schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost, sortable=True)
schema_fields [ field_class . index_fieldname ] = TEXT (
stored = True , analyzer = ChineseAnalyzer ( ) , field_boost = field_class . boost , sortable = True )
# 标记内容字段
if field_class . document is True :
content_field_name = field_class . index_fieldname
schema_fields [ field_class . index_fieldname ] . spelling = True
# Fail more gracefully than relying on the backend to die if no fields
# are found.
# 检查是否有有效字段
if len ( schema_fields ) < = initial_key_count :
raise SearchBackendError (
" No fields were found in any search_indexes. Please correct this before attempting to search. " )
@ -200,6 +256,14 @@ class WhooshSearchBackend(BaseSearchBackend):
return ( content_field_name , Schema ( * * schema_fields ) )
def update ( self , index , iterable , commit = True ) :
"""
更新索引
Args :
index : 搜索索引
iterable : 可迭代对象
commit : 是否提交更改
"""
if not self . setup_complete :
self . setup ( )
@ -212,12 +276,11 @@ class WhooshSearchBackend(BaseSearchBackend):
except SkipDocument :
self . log . debug ( u " Indexing for object ` %s ` skipped " , obj )
else :
# Really make sure it's unicode, because Whoosh won't have it any
# other way.
# 确保所有值为unicode
for key in doc :
doc [ key ] = self . _from_python ( doc [ key ] )
# Document boosts aren't supported in Whoosh 2.5.0+.
# Whoosh 2.5.0+不支持文档boost
if ' boost ' in doc :
del doc [ ' boost ' ]
@ -227,9 +290,6 @@ class WhooshSearchBackend(BaseSearchBackend):
if not self . silently_fail :
raise
# We'll log the object identifier but won't include the actual object
# to avoid the possibility of that generating encoding errors while
# processing the log message:
self . log . error (
u " %s while preparing object for update " %
e . __class__ . __name__ ,
@ -239,12 +299,18 @@ class WhooshSearchBackend(BaseSearchBackend):
" index " : index ,
" object " : get_identifier ( obj ) } } )
# 提交更改
if len ( iterable ) > 0 :
# For now, commit no matter what, as we run into locking issues
# otherwise.
writer . commit ( )
def remove ( self , obj_or_string , commit = True ) :
"""
移除文档
Args :
obj_or_string : 对象或标识符
commit : 是否提交更改
"""
if not self . setup_complete :
self . setup ( )
@ -267,6 +333,13 @@ class WhooshSearchBackend(BaseSearchBackend):
exc_info = True )
def clear ( self , models = None , commit = True ) :
"""
清空索引
Args :
models : 要清空的模型列表
commit : 是否提交更改
"""
if not self . setup_complete :
self . setup ( )
@ -304,17 +377,27 @@ class WhooshSearchBackend(BaseSearchBackend):
" Failed to clear Whoosh index: %s " , e , exc_info = True )
def delete_index(self):
    """
    Throw away the whole index and set up a fresh one.

    Per the Whoosh mailing list, when everything is being wiped it is
    much more efficient to delete the index files than to remove the
    documents individually.
    """
    if self.use_file_storage:
        # File storage: remove the index directory, if it is there.
        if os.path.exists(self.path):
            shutil.rmtree(self.path)
    else:
        # In-memory storage: let the storage object clean itself.
        self.storage.clean()

    # Recreate everything.
    self.setup()
def optimize ( self ) :
"""
优化索引
提高搜索性能 。
"""
if not self . setup_complete :
self . setup ( )
@ -322,12 +405,21 @@ class WhooshSearchBackend(BaseSearchBackend):
self . index . optimize ( )
def calculate_page ( self , start_offset = 0 , end_offset = None ) :
# Prevent against Whoosh throwing an error. Requires an end_offset
# greater than 0.
"""
计算分页参数
Args :
start_offset : 起始偏移量
end_offset : 结束偏移量
Returns :
tuple : ( 页码 , 页大小 )
"""
# 防止Whoosh错误
if end_offset is not None and end_offset < = 0 :
end_offset = 1
# Determine the page.
# 确定页码
page_num = 0
if end_offset is None :
@ -341,7 +433,7 @@ class WhooshSearchBackend(BaseSearchBackend):
if page_length and page_length > 0 :
page_num = int ( start_offset / page_length )
# Increment because Whoosh uses 1-based page numbers.
# Whoosh使用1-based页码
page_num + = 1
return page_num , page_length
@ -366,10 +458,15 @@ class WhooshSearchBackend(BaseSearchBackend):
limit_to_registered_models = None ,
result_class = None ,
* * kwargs ) :
"""
执行搜索查询
核心搜索方法 , 处理各种搜索参数和选项 。
"""
if not self . setup_complete :
self . setup ( )
# A zero length query should return no results.
# 空查询返回无结果
if len ( query_string ) == 0 :
return {
' results ' : [ ] ,
@ -378,8 +475,7 @@ class WhooshSearchBackend(BaseSearchBackend):
query_string = force_str ( query_string )
# A one-character query (non-wildcard) gets nabbed by a stopwords
# filter and should yield zero results.
# 单字符查询(非通配符)返回无结果
if len ( query_string ) < = 1 and query_string != u ' * ' :
return {
' results ' : [ ] ,
@ -388,10 +484,8 @@ class WhooshSearchBackend(BaseSearchBackend):
reverse = False
# 处理排序
if sort_by is not None :
# Determine if we need to reverse the results and if Whoosh can
# handle what it's being asked to sort by. Reversing is an
# all-or-nothing action, unfortunately.
sort_by_list = [ ]
reverse_counter = 0
@ -399,6 +493,7 @@ class WhooshSearchBackend(BaseSearchBackend):
if order_by . startswith ( ' - ' ) :
reverse_counter + = 1
# Whoosh要求所有排序字段方向一致
if reverse_counter and reverse_counter != len ( sort_by ) :
raise SearchBackendError ( " Whoosh requires all order_by fields "
" to use the same sort direction " )
@ -406,17 +501,16 @@ class WhooshSearchBackend(BaseSearchBackend):
for order_by in sort_by :
if order_by . startswith ( ' - ' ) :
sort_by_list . append ( order_by [ 1 : ] )
if len ( sort_by_list ) == 1 :
reverse = True
else :
sort_by_list . append ( order_by )
if len ( sort_by_list ) == 1 :
reverse = False
sort_by = sort_by_list [ 0 ]
# Whoosh不支持facet功能
if facets is not None :
warnings . warn (
" Whoosh does not handle faceting. " ,
@ -438,6 +532,7 @@ class WhooshSearchBackend(BaseSearchBackend):
narrowed_results = None
self . index = self . index . refresh ( )
# 模型限制处理
if limit_to_registered_models is None :
limit_to_registered_models = getattr (
settings , ' HAYSTACK_LIMIT_TO_REGISTERED_MODELS ' , True )
@ -445,12 +540,11 @@ class WhooshSearchBackend(BaseSearchBackend):
if models and len ( models ) :
model_choices = sorted ( get_model_ct ( model ) for model in models )
elif limit_to_registered_models :
# Using narrow queries, limit the results to only models handled
# with the current routers.
model_choices = self . build_models_list ( )
else :
model_choices = [ ]
# 构建窄查询
if len ( model_choices ) > 0 :
if narrow_queries is None :
narrow_queries = set ( )
@ -460,9 +554,8 @@ class WhooshSearchBackend(BaseSearchBackend):
narrow_searcher = None
# 处理窄查询
if narrow_queries is not None :
# Potentially expensive? I don't see another way to do it in
# Whoosh...
narrow_searcher = self . index . searcher ( )
for nq in narrow_queries :
@ -482,11 +575,12 @@ class WhooshSearchBackend(BaseSearchBackend):
self . index = self . index . refresh ( )
# 执行搜索
if self . index . doc_count ( ) :
searcher = self . index . searcher ( )
parsed_query = self . parser . parse ( query_string )
# In the event of an invalid/stopworded query, recover gracefully.
# 处理无效查询
if parsed_query is None :
return {
' results ' : [ ] ,
@ -502,7 +596,7 @@ class WhooshSearchBackend(BaseSearchBackend):
' reverse ' : reverse ,
}
# Handle the case where the results have been narrowed.
# 应用窄查询过滤
if narrowed_results is not None :
search_kwargs [ ' filter ' ] = narrowed_results
@ -522,8 +616,7 @@ class WhooshSearchBackend(BaseSearchBackend):
' spelling_suggestion ' : None ,
}
# Because as of Whoosh 2.5.1, it will return the wrong page of
# results if you request something too high. :(
# 检查页码有效性
if raw_page . pagenum < page_num :
return {
' results ' : [ ] ,
@ -531,6 +624,7 @@ class WhooshSearchBackend(BaseSearchBackend):
' spelling_suggestion ' : None ,
}
# 处理搜索结果
results = self . _process_results (
raw_page ,
highlight = highlight ,
@ -544,6 +638,7 @@ class WhooshSearchBackend(BaseSearchBackend):
return results
else :
# 无文档时的处理
if self . include_spelling :
if spelling_query :
spelling_suggestion = self . create_spelling_suggestion (
@ -570,18 +665,21 @@ class WhooshSearchBackend(BaseSearchBackend):
limit_to_registered_models = None ,
result_class = None ,
* * kwargs ) :
"""
查找相似文档
基于给定模型实例查找相似内容 。
"""
if not self . setup_complete :
self . setup ( )
# Deferred models will have a different class ("RealClass_Deferred_fieldname")
# which won't be in our registry:
model_klass = model_instance . _meta . concrete_model
field_name = self . content_field_name
narrow_queries = set ( )
narrowed_results = None
self . index = self . index . refresh ( )
# 模型限制处理
if limit_to_registered_models is None :
limit_to_registered_models = getattr (
settings , ' HAYSTACK_LIMIT_TO_REGISTERED_MODELS ' , True )
@ -589,12 +687,11 @@ class WhooshSearchBackend(BaseSearchBackend):
if models and len ( models ) :
model_choices = sorted ( get_model_ct ( model ) for model in models )
elif limit_to_registered_models :
# Using narrow queries, limit the results to only models handled
# with the current routers.
model_choices = self . build_models_list ( )
else :
model_choices = [ ]
# 构建查询
if len ( model_choices ) > 0 :
if narrow_queries is None :
narrow_queries = set ( )
@ -607,9 +704,8 @@ class WhooshSearchBackend(BaseSearchBackend):
narrow_searcher = None
# 处理窄查询
if narrow_queries is not None :
# Potentially expensive? I don't see another way to do it in
# Whoosh...
narrow_searcher = self . index . searcher ( )
for nq in narrow_queries :
@ -632,6 +728,7 @@ class WhooshSearchBackend(BaseSearchBackend):
self . index = self . index . refresh ( )
raw_results = EmptyResults ( )
# 执行相似文档搜索
if self . index . doc_count ( ) :
query = " %s : %s " % ( ID , get_identifier ( model_instance ) )
searcher = self . index . searcher ( )
@ -642,7 +739,7 @@ class WhooshSearchBackend(BaseSearchBackend):
raw_results = results [ 0 ] . more_like_this (
field_name , top = end_offset )
# Handle the case where the results have been narrowed.
# 应用窄查询过滤
if narrowed_results is not None and hasattr ( raw_results , ' filter ' ) :
raw_results . filter ( narrowed_results )
@ -658,8 +755,7 @@ class WhooshSearchBackend(BaseSearchBackend):
' spelling_suggestion ' : None ,
}
# Because as of Whoosh 2.5.1, it will return the wrong page of
# results if you request something too high. :(
# 检查页码有效性
if raw_page . pagenum < page_num :
return {
' results ' : [ ] ,
@ -667,6 +763,7 @@ class WhooshSearchBackend(BaseSearchBackend):
' spelling_suggestion ' : None ,
}
# 处理结果
results = self . _process_results ( raw_page , result_class = result_class )
searcher . close ( )
@ -682,11 +779,15 @@ class WhooshSearchBackend(BaseSearchBackend):
query_string = ' ' ,
spelling_query = None ,
result_class = None ) :
"""
处理搜索结果
将Whoosh原始结果转换为Haystack格式 。
"""
from haystack import connections
results = [ ]
# It's important to grab the hits first before slicing. Otherwise, this
# can cause pagination failures.
# 获取命中数
hits = len ( raw_page )
if result_class is None :
@ -697,6 +798,7 @@ class WhooshSearchBackend(BaseSearchBackend):
unified_index = connections [ self . connection_alias ] . get_unified_index ( )
indexed_models = unified_index . get_indexed_models ( )
# 处理每个结果
for doc_offset , raw_result in enumerate ( raw_page ) :
score = raw_page . score ( doc_offset ) or 0
app_label , model_name = raw_result [ DJANGO_CT ] . split ( ' . ' )
@ -704,13 +806,14 @@ class WhooshSearchBackend(BaseSearchBackend):
model = haystack_get_model ( app_label , model_name )
if model and model in indexed_models :
# 处理字段值
for key , value in raw_result . items ( ) :
index = unified_index . get_index ( model )
string_key = str ( key )
if string_key in index . fields and hasattr (
index . fields [ string_key ] , ' convert ' ) :
# Special-cased due to the nature of KEYWORD fields.
# 多值字段特殊处理
if index . fields [ string_key ] . is_multivalued :
if value is None or len ( value ) == 0 :
additional_fields [ string_key ] = [ ]
@ -723,9 +826,11 @@ class WhooshSearchBackend(BaseSearchBackend):
else :
additional_fields [ string_key ] = self . _to_python ( value )
# 移除系统字段
del ( additional_fields [ DJANGO_CT ] )
del ( additional_fields [ DJANGO_ID ] )
# 高亮处理
if highlight :
sa = StemmingAnalyzer ( )
formatter = WhooshHtmlFormatter ( ' em ' )
@ -742,6 +847,7 @@ class WhooshSearchBackend(BaseSearchBackend):
self . content_field_name : [ whoosh_result ] ,
}
# 创建结果对象
result = result_class (
app_label ,
model_name ,
@ -752,6 +858,7 @@ class WhooshSearchBackend(BaseSearchBackend):
else :
hits - = 1
# 拼写建议
if self . include_spelling :
if spelling_query :
spelling_suggestion = self . create_spelling_suggestion (
@ -768,6 +875,15 @@ class WhooshSearchBackend(BaseSearchBackend):
}
def create_spelling_suggestion ( self , query_string ) :
"""
创建拼写建议
Args :
query_string : 查询字符串
Returns :
str : 拼写建议
"""
spelling_suggestion = None
reader = self . index . reader ( )
corrector = reader . corrector ( self . content_field_name )
@ -776,14 +892,14 @@ class WhooshSearchBackend(BaseSearchBackend):
if not query_string :
return spelling_suggestion
# Clean the string.
# 清理查询字符串
for rev_word in self . RESERVED_WORDS :
cleaned_query = cleaned_query . replace ( rev_word , ' ' )
for rev_char in self . RESERVED_CHARACTERS :
cleaned_query = cleaned_query . replace ( rev_char , ' ' )
# Break it down.
# 分词并获取建议
query_words = cleaned_query . split ( )
suggested_words = [ ]
@ -798,22 +914,29 @@ class WhooshSearchBackend(BaseSearchBackend):
def _from_python ( self , value ) :
"""
Converts Python values to a string for Whoosh .
Python值转换为Whoosh字符串
Code courtesy of pysolr .
Args :
value : Python值
Returns :
str : Whoosh格式字符串
"""
if hasattr ( value , ' strftime ' ) :
# 日期时间处理
if not hasattr ( value , ' hour ' ) :
value = datetime ( value . year , value . month , value . day , 0 , 0 , 0 )
elif isinstance ( value , bool ) :
# 布尔值处理
if value :
value = ' true '
else :
value = ' false '
elif isinstance ( value , ( list , tuple ) ) :
# 列表元组处理
value = u ' , ' . join ( [ force_str ( v ) for v in value ] )
elif isinstance ( value , ( six . integer_types , float ) ) :
# Leave it alone.
# 数字类型保持原样
pass
else :
value = force_str ( value )
@ -821,15 +944,20 @@ class WhooshSearchBackend(BaseSearchBackend):
def _to_python ( self , value ) :
"""
Converts values from Whoosh to native Python values .
Whoosh值转换为Python值
Args :
value : Whoosh值
A port of the same method in pysolr , as they deal with data the same way .
Returns :
object : Python值
"""
if value == ' true ' :
return True
elif value == ' false ' :
return False
# 日期时间解析
if value and isinstance ( value , six . string_types ) :
possible_datetime = DATETIME_REGEX . search ( value )
@ -847,11 +975,10 @@ class WhooshSearchBackend(BaseSearchBackend):
date_values [ ' minute ' ] ,
date_values [ ' second ' ] )
# JSON解析尝试
try :
# Attempt to use json to load the values.
converted_value = json . loads ( value )
# Try to handle most built-in types.
if isinstance (
converted_value ,
( list ,
@ -863,15 +990,28 @@ class WhooshSearchBackend(BaseSearchBackend):
complex ) ) :
return converted_value
except BaseException :
# If it fails (SyntaxError or its ilk) or we don't trust it,
# continue on.
pass
return value
class WhooshSearchQuery ( BaseSearchQuery ) :
"""
Whoosh搜索查询构建器
负责构建Whoosh搜索引擎的查询语句 。
"""
def _convert_datetime ( self , date ) :
"""
日期时间转换
Args :
date : 日期时间对象
Returns :
str : 格式化字符串
"""
if hasattr ( date , ' hour ' ) :
return force_str ( date . strftime ( ' % Y % m %d % H % M % S ' ) )
else :
@ -879,20 +1019,25 @@ class WhooshSearchQuery(BaseSearchQuery):
def clean ( self , query_fragment ) :
"""
Provides a mechanism for sanitizing user input before presenting the
value to the backend .
清理查询片段
对用户输入进行清理和转义处理 。
Whoosh 1. X differs here in that you can no longer use a backslash
to escape reserved characters . Instead , the whole word should be
quoted .
Args :
query_fragment : 查询片段
Returns :
str : 清理后的查询字符串
"""
words = query_fragment . split ( )
cleaned_words = [ ]
for word in words :
# 保留字转为小写
if word in self . backend . RESERVED_WORDS :
word = word . replace ( word , word . lower ( ) )
# 保留字符用引号包围
for char in self . backend . RESERVED_CHARACTERS :
if char in word :
word = " ' %s ' " % word
@ -903,12 +1048,23 @@ class WhooshSearchQuery(BaseSearchQuery):
return ' ' . join ( cleaned_words )
def build_query_fragment ( self , field , filter_type , value ) :
"""
构建查询片段
Args :
field : 字段名
filter_type : 过滤器类型
value : 字段值
Returns :
str : 查询片段
"""
from haystack import connections
query_frag = ' '
is_datetime = False
# 值类型处理
if not hasattr ( value , ' input_type_name ' ) :
# Handle when we've got a ``ValuesListQuerySet``...
if hasattr ( value , ' values_list ' ) :
value = list ( value )
@ -916,26 +1072,24 @@ class WhooshSearchQuery(BaseSearchQuery):
is_datetime = True
if isinstance ( value , six . string_types ) and value != ' ' :
# It's not an ``InputType``. Assume ``Clean``.
value = Clean ( value )
else :
value = PythonData ( value )
# Prepare the query using the InputType.
# 准备值
prepared_value = value . prepare ( self )
if not isinstance ( prepared_value , ( set , list , tuple ) ) :
# Then convert whatever we get back to what pysolr wants if needed.
prepared_value = self . backend . _from_python ( prepared_value )
# 'content' is a special reserved word, much like 'pk' in
# Django's ORM layer. It indicates 'no special field'.
# 字段名处理
if field == ' content ' :
index_fieldname = ' '
else :
index_fieldname = u ' %s : ' % connections [ self . _using ] . get_unified_index (
) . get_index_fieldname ( field )
# 过滤器类型映射
filter_types = {
' content ' : ' %s ' ,
' contains ' : ' * %s * ' ,
@ -949,6 +1103,7 @@ class WhooshSearchQuery(BaseSearchQuery):
' fuzzy ' : u ' %s ~ ' ,
}
# 查询片段构建
if value . post_process is False :
query_frag = prepared_value
else :
@ -961,8 +1116,6 @@ class WhooshSearchQuery(BaseSearchQuery):
if value . input_type_name == ' exact ' :
query_frag = prepared_value
else :
# Iterate over terms & incorportate the converted form of
# each into the query.
terms = [ ]
if isinstance ( prepared_value , six . string_types ) :
@ -1026,19 +1179,19 @@ class WhooshSearchQuery(BaseSearchQuery):
query_frag = filter_types [ filter_type ] % prepared_value
# 添加括号
if len ( query_frag ) and not isinstance ( value , Raw ) :
if not query_frag . startswith ( ' ( ' ) and not query_frag . endswith ( ' ) ' ) :
query_frag = " ( %s ) " % query_frag
return u " %s %s " % ( index_fieldname , query_frag )
# if not filter_type in ('in', 'range'):
# # 'in' is a bit of a special case, as we don't want to
# # convert a valid list/tuple to string. Defer handling it
# # until later...
# value = self.backend._from_python(value)
class WhooshEngine(BaseEngine):
    """
    Haystack engine definition for Whoosh.

    Pairs the Whoosh search backend with the Whoosh query builder.
    """
    backend = WhooshSearchBackend
    query = WhooshSearchQuery