diff --git a/src/代码注释/djangoblog/whoosh_cn_backend.py b/src/代码注释/djangoblog/whoosh_cn_backend.py new file mode 100644 index 0000000..b99f83d --- /dev/null +++ b/src/代码注释/djangoblog/whoosh_cn_backend.py @@ -0,0 +1,1073 @@ +# encoding: utf-8 +# 文件编码声明 + +from __future__ import absolute_import, division, print_function, unicode_literals +# 兼容Python 2和3的导入 + +import json +import os +import re +import shutil +import threading +import warnings + +import six +from django.conf import settings +from django.core.exceptions import ImproperlyConfigured +from datetime import datetime +from django.utils.encoding import force_str +from haystack.backends import BaseEngine, BaseSearchBackend, BaseSearchQuery, EmptyResults, log_query +from haystack.constants import DJANGO_CT, DJANGO_ID, ID +from haystack.exceptions import MissingDependency, SearchBackendError, SkipDocument +from haystack.inputs import Clean, Exact, PythonData, Raw +from haystack.models import SearchResult +from haystack.utils import get_identifier, get_model_ct +from haystack.utils import log as logging +from haystack.utils.app_loading import haystack_get_model +from jieba.analyse import ChineseAnalyzer # 中文分词器 +from whoosh import index +from whoosh.analysis import StemmingAnalyzer +from whoosh.fields import BOOLEAN, DATETIME, IDLIST, KEYWORD, NGRAM, NGRAMWORDS, NUMERIC, Schema, TEXT +from whoosh.fields import ID as WHOOSH_ID +from whoosh.filedb.filestore import FileStorage, RamStorage +from whoosh.highlight import ContextFragmenter, HtmlFormatter +from whoosh.highlight import highlight as whoosh_highlight +from whoosh.qparser import QueryParser +from whoosh.searching import ResultsPage +from whoosh.writing import AsyncWriter + +try: + import whoosh +except ImportError: + # 如果Whoosh没有安装,抛出缺失依赖异常 + raise MissingDependency( + "The 'whoosh' backend requires the installation of 'Whoosh'. Please refer to the documentation.") + +# 处理最低版本要求 +if not hasattr(whoosh, '__version__') or whoosh.__version__ < (2, 5, 0): + raise MissingDependency( + "The 'whoosh' backend requires version 2.5.0 or greater.") + +# 日期时间正则表达式,用于解析日期时间字符串 +DATETIME_REGEX = re.compile( + '^(?P\d{4})-(?P\d{2})-(?P\d{2})T(?P\d{2}):(?P\d{2}):(?P\d{2})(\.\d{3,6}Z?)?$') +LOCALS = threading.local() # 线程本地存储 +LOCALS.RAM_STORE = None + + +class WhooshHtmlFormatter(HtmlFormatter): + """ + 简化的Whoosh HTML格式化器 + 我们使用它来在不同后端之间获得一致的结果。 + 具体来说,Solr、Xapian和Elasticsearch都使用这种格式化。 + """ + template = '<%(tag)s>%(t)s' + + +class WhooshSearchBackend(BaseSearchBackend): + """Whoosh搜索引擎后端实现""" + + # Whoosh保留的特殊用途单词 + RESERVED_WORDS = ( + 'AND', + 'NOT', + 'OR', + 'TO', + ) + + # Whoosh保留的特殊用途字符 + # '\\'必须放在前面,以免覆盖其他斜杠替换 + RESERVED_CHARACTERS = ( + '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}', + '[', ']', '^', '"', '~', '*', '?', ':', '.', + ) + + def __init__(self, connection_alias, **connection_options): + super( + WhooshSearchBackend, + self).__init__( + connection_alias, + **connection_options) + self.setup_complete = False # 设置完成标志 + self.use_file_storage = True # 是否使用文件存储 + self.post_limit = getattr( + connection_options, + 'POST_LIMIT', + 128 * 1024 * 1024) # 帖子大小限制,默认128MB + self.path = connection_options.get('PATH') # 索引存储路径 + + if connection_options.get('STORAGE', 'file') != 'file': + self.use_file_storage = False # 不使用文件存储,使用内存存储 + + if self.use_file_storage and not self.path: + raise ImproperlyConfigured( + "You must specify a 'PATH' in your settings for connection '%s'." % + connection_alias) + + self.log = logging.getLogger('haystack') # 日志记录器 + + def setup(self): + """ + 延迟加载,直到需要时才设置 + """ + from haystack import connections + new_index = False # 是否创建新索引 + + # 确保索引目录存在 + if self.use_file_storage and not os.path.exists(self.path): + os.makedirs(self.path) + new_index = True + + if self.use_file_storage and not os.access(self.path, os.W_OK): + raise IOError( + "The path to your Whoosh index '%s' is not writable for the current user/group." % + self.path) + + # 选择存储类型:文件存储或内存存储 + if self.use_file_storage: + self.storage = FileStorage(self.path) + else: + global LOCALS + + if getattr(LOCALS, 'RAM_STORE', None) is None: + LOCALS.RAM_STORE = RamStorage() + + self.storage = LOCALS.RAM_STORE + + # 构建schema和内容字段名 + self.content_field_name, self.schema = self.build_schema( + connections[self.connection_alias].get_unified_index().all_searchfields()) + self.parser = QueryParser(self.content_field_name, schema=self.schema) # 查询解析器 + + # 创建或打开索引 + if new_index is True: + self.index = self.storage.create_index(self.schema) + else: + try: + self.index = self.storage.open_index(schema=self.schema) + except index.EmptyIndexError: + self.index = self.storage.create_index(self.schema) + + self.setup_complete = True # 标记设置完成 + + def build_schema(self, fields): + """构建Whoosh索引schema""" + schema_fields = { + ID: WHOOSH_ID(stored=True, unique=True), # 唯一标识符 + DJANGO_CT: WHOOSH_ID(stored=True), # Django内容类型 + DJANGO_ID: WHOOSH_ID(stored=True), # Django对象ID + } + # 获取Haystack硬编码的键数量 + initial_key_count = len(schema_fields) + content_field_name = '' # 内容字段名 + + for field_name, field_class in fields.items(): + if field_class.is_multivalued: # 多值字段 + if field_class.indexed is False: + schema_fields[field_class.index_fieldname] = IDLIST( + stored=True, field_boost=field_class.boost) + else: + schema_fields[field_class.index_fieldname] = KEYWORD( + stored=True, commas=True, scorable=True, field_boost=field_class.boost) + elif field_class.field_type in ['date', 'datetime']: # 日期时间字段 + schema_fields[field_class.index_fieldname] = DATETIME( + stored=field_class.stored, sortable=True) + elif field_class.field_type == 'integer': # 整数字段 + schema_fields[field_class.index_fieldname] = NUMERIC( + stored=field_class.stored, numtype=int, field_boost=field_class.boost) + elif field_class.field_type == 'float': # 浮点数字段 + schema_fields[field_class.index_fieldname] = NUMERIC( + stored=field_class.stored, numtype=float, field_boost=field_class.boost) + elif field_class.field_type == 'boolean': # 布尔字段 + # Field boost在1.8.2版本中不支持BOOLEAN + schema_fields[field_class.index_fieldname] = BOOLEAN( + stored=field_class.stored) + elif field_class.field_type == 'ngram': # N-gram字段 + schema_fields[field_class.index_fieldname] = NGRAM( + minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost) + elif field_class.field_type == 'edge_ngram': # 边缘N-gram字段 + schema_fields[field_class.index_fieldname] = NGRAMWORDS(minsize=2, maxsize=15, at='start', + stored=field_class.stored, + field_boost=field_class.boost) + else: + # 默认使用中文分析器的文本字段 + # 原代码使用StemmingAnalyzer,现改为ChineseAnalyzer以支持中文分词 + schema_fields[field_class.index_fieldname] = TEXT( + stored=True, analyzer=ChineseAnalyzer(), field_boost=field_class.boost, sortable=True) + if field_class.document is True: # 主文档字段 + content_field_name = field_class.index_fieldname + schema_fields[field_class.index_fieldname].spelling = True # 启用拼写建议 + + # 如果没有找到字段,优雅地失败 + if len(schema_fields) <= initial_key_count: + raise SearchBackendError( + "No fields were found in any search_indexes. Please correct this before attempting to search.") + + return (content_field_name, Schema(**schema_fields)) + + def update(self, index, iterable, commit=True): + """更新索引文档""" + if not self.setup_complete: + self.setup() + + self.index = self.index.refresh() # 刷新索引 + writer = AsyncWriter(self.index) # 异步写入器 + + for obj in iterable: + try: + doc = index.full_prepare(obj) # 准备文档 + except SkipDocument: + self.log.debug(u"Indexing for object `%s` skipped", obj) + else: + # 确保所有值都是Unicode,因为Whoosh只接受Unicode + for key in doc: + doc[key] = self._from_python(doc[key]) + + # Whoosh 2.5.0+不支持文档boost + if 'boost' in doc: + del doc['boost'] + + try: + writer.update_document(**doc) # 更新文档 + except Exception as e: + if not self.silently_fail: + raise + + # 记录对象标识符但不包含实际对象,避免处理日志消息时产生编码错误 + self.log.error( + u"%s while preparing object for update" % + e.__class__.__name__, + exc_info=True, + extra={ + "data": { + "index": index, + "object": get_identifier(obj)}}) + + if len(iterable) > 0: + # 目前无论如何都要提交,否则会遇到锁定问题 + writer.commit() + + def remove(self, obj_or_string, commit=True): + """从索引中移除文档""" + if not self.setup_complete: + self.setup() + + self.index = self.index.refresh() + whoosh_id = get_identifier(obj_or_string) + + try: + # 通过查询删除文档 + self.index.delete_by_query( + q=self.parser.parse( + u'%s:"%s"' % + (ID, whoosh_id))) + except Exception as e: + if not self.silently_fail: + raise + + self.log.error( + "Failed to remove document '%s' from Whoosh: %s", + whoosh_id, + e, + exc_info=True) + + def clear(self, models=None, commit=True): + """清空索引""" + if not self.setup_complete: + self.setup() + + self.index = self.index.refresh() + + if models is not None: + assert isinstance(models, (list, tuple)) + + try: + if models is None: + self.delete_index() # 完全删除索引 + else: + models_to_delete = [] + + for model in models: + models_to_delete.append( + u"%s:%s" % + (DJANGO_CT, get_model_ct(model))) + + # 通过查询删除指定模型的文档 + self.index.delete_by_query( + q=self.parser.parse( + u" OR ".join(models_to_delete))) + except Exception as e: + if not self.silently_fail: + raise + + if models is not None: + self.log.error( + "Failed to clear Whoosh index of models '%s': %s", + ','.join(models_to_delete), + e, + exc_info=True) + else: + self.log.error( + "Failed to clear Whoosh index: %s", e, exc_info=True) + + def delete_index(self): + """删除整个索引""" + # 根据Whoosh邮件列表,如果要清除索引中的所有内容,直接删除索引文件更高效 + if self.use_file_storage and os.path.exists(self.path): + shutil.rmtree(self.path) + elif not self.use_file_storage: + self.storage.clean() + + # 重新创建所有内容 + self.setup() + + def optimize(self): + """优化索引""" + if not self.setup_complete: + self.setup() + + self.index = self.index.refresh() + self.index.optimize() + + def calculate_page(self, start_offset=0, end_offset=None): + """计算分页信息""" + # 防止Whoosh抛出错误。需要end_offset大于0 + if end_offset is not None and end_offset <= 0: + end_offset = 1 + + # 确定页码 + page_num = 0 + + if end_offset is None: + end_offset = 1000000 # 默认大数 + + if start_offset is None: + start_offset = 0 + + page_length = end_offset - start_offset # 页面长度 + + if page_length and page_length > 0: + page_num = int(start_offset / page_length) + + # 递增,因为Whoosh使用基于1的页码 + page_num += 1 + return page_num, page_length + + @log_query # 记录查询日志的装饰器 + def search( + self, + query_string, + sort_by=None, + start_offset=0, + end_offset=None, + fields='', + highlight=False, + facets=None, + date_facets=None, + query_facets=None, + narrow_queries=None, + spelling_query=None, + within=None, + dwithin=None, + distance_point=None, + models=None, + limit_to_registered_models=None, + result_class=None, + **kwargs): + """执行搜索查询""" + if not self.setup_complete: + self.setup() + + # 零长度查询应该返回无结果 + if len(query_string) == 0: + return { + 'results': [], + 'hits': 0, + } + + query_string = force_str(query_string) # 确保查询字符串是字符串 + + # 单字符查询(非通配符)会被停用词过滤器捕获,应该返回零结果 + if len(query_string) <= 1 and query_string != u'*': + return { + 'results': [], + 'hits': 0, + } + + reverse = False # 是否反转排序 + + if sort_by is not None: + # 确定是否需要反转结果以及Whoosh是否可以处理被要求排序的字段 + # 反转是一个全有或全无的操作 + sort_by_list = [] + reverse_counter = 0 + + for order_by in sort_by: + if order_by.startswith('-'): # 降序排序 + reverse_counter += 1 + + if reverse_counter and reverse_counter != len(sort_by): + raise SearchBackendError("Whoosh requires all order_by fields" + " to use the same sort direction") + + for order_by in sort_by: + if order_by.startswith('-'): + sort_by_list.append(order_by[1:]) # 移除负号 + + if len(sort_by_list) == 1: + reverse = True + else: + sort_by_list.append(order_by) + + if len(sort_by_list) == 1: + reverse = False + + sort_by = sort_by_list[0] # Whoosh只支持单字段排序 + + # Whoosh不支持分面搜索,发出警告 + if facets is not None: + warnings.warn( + "Whoosh does not handle faceting.", + Warning, + stacklevel=2) + + if date_facets is not None: + warnings.warn( + "Whoosh does not handle date faceting.", + Warning, + stacklevel=2) + + if query_facets is not None: + warnings.warn( + "Whoosh does not handle query faceting.", + Warning, + stacklevel=2) + + narrowed_results = None + self.index = self.index.refresh() + + # 限制到注册的模型 + if limit_to_registered_models is None: + limit_to_registered_models = getattr( + settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True) + + # 构建模型选择列表 + if models and len(models): + model_choices = sorted(get_model_ct(model) for model in models) + elif limit_to_registered_models: + # 使用窄查询,将结果限制为当前路由器处理的模型 + model_choices = self.build_models_list() + else: + model_choices = [] + + # 如果有模型选择,添加到窄查询中 + if len(model_choices) > 0: + if narrow_queries is None: + narrow_queries = set() + + narrow_queries.add(' OR '.join( + ['%s:%s' % (DJANGO_CT, rm) for rm in model_choices])) + + narrow_searcher = None + + # 处理窄查询 + if narrow_queries is not None: + # 可能很昂贵?在Whoosh中没有看到其他方法... + narrow_searcher = self.index.searcher() + + for nq in narrow_queries: + recent_narrowed_results = narrow_searcher.search( + self.parser.parse(force_str(nq)), limit=None) + + if len(recent_narrowed_results) <= 0: + return { + 'results': [], + 'hits': 0, + } + + if narrowed_results: + narrowed_results.filter(recent_narrowed_results) + else: + narrowed_results = recent_narrowed_results + + self.index = self.index.refresh() + + # 如果索引中有文档,执行搜索 + if self.index.doc_count(): + searcher = self.index.searcher() # 创建搜索器 + parsed_query = self.parser.parse(query_string) # 解析查询 + + # 如果查询无效/包含停用词,优雅地恢复 + if parsed_query is None: + return { + 'results': [], + 'hits': 0, + } + + page_num, page_length = self.calculate_page( + start_offset, end_offset) + + search_kwargs = { + 'pagelen': page_length, # 页面长度 + 'sortedby': sort_by, # 排序字段 + 'reverse': reverse, # 是否反转 + } + + # 处理结果已被窄化的情况 + if narrowed_results is not None: + search_kwargs['filter'] = narrowed_results + + try: + # 执行分页搜索 + raw_page = searcher.search_page( + parsed_query, + page_num, + **search_kwargs + ) + except ValueError: + if not self.silently_fail: + raise + + return { + 'results': [], + 'hits': 0, + 'spelling_suggestion': None, + } + + # 由于Whoosh 2.5.1的问题,如果请求的页码过高,它会返回错误的页面 + if raw_page.pagenum < page_num: + return { + 'results': [], + 'hits': 0, + 'spelling_suggestion': None, + } + + # 处理搜索结果 + results = self._process_results( + raw_page, + highlight=highlight, + query_string=query_string, + spelling_query=spelling_query, + result_class=result_class) + searcher.close() # 关闭搜索器 + + if hasattr(narrow_searcher, 'close'): + narrow_searcher.close() + + return results + else: + # 如果没有文档,处理拼写建议 + if self.include_spelling: + if spelling_query: + spelling_suggestion = self.create_spelling_suggestion( + spelling_query) + else: + spelling_suggestion = self.create_spelling_suggestion( + query_string) + else: + spelling_suggestion = None + + return { + 'results': [], + 'hits': 0, + 'spelling_suggestion': spelling_suggestion, + } + +def more_like_this( + self, + model_instance, + additional_query_string=None, + start_offset=0, + end_offset=None, + models=None, + limit_to_registered_models=None, + result_class=None, + **kwargs): + """查找相似文档(基于内容的推荐)""" + if not self.setup_complete: + self.setup() + + # 延迟模型会有不同的类名("RealClass_Deferred_fieldname"),不在我们的注册表中 + model_klass = model_instance._meta.concrete_model # 获取具体模型类 + + field_name = self.content_field_name + narrow_queries = set() + narrowed_results = None + self.index = self.index.refresh() + + if limit_to_registered_models is None: + limit_to_registered_models = getattr( + settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True) + + # 构建模型选择列表 + if models and len(models): + model_choices = sorted(get_model_ct(model) for model in models) + elif limit_to_registered_models: + # 使用窄查询,将结果限制为当前路由器处理的模型 + model_choices = self.build_models_list() + else: + model_choices = [] + + if len(model_choices) > 0: + if narrow_queries is None: + narrow_queries = set() + + narrow_queries.add(' OR '.join( + ['%s:%s' % (DJANGO_CT, rm) for rm in model_choices])) + + # 添加额外的查询字符串 + if additional_query_string and additional_query_string != '*': + narrow_queries.add(additional_query_string) + + narrow_searcher = None + + # 处理窄查询 + if narrow_queries is not None: + # 可能很昂贵?在Whoosh中没有看到其他方法... + narrow_searcher = self.index.searcher() + + for nq in narrow_queries: + recent_narrowed_results = narrow_searcher.search( + self.parser.parse(force_str(nq)), limit=None) + + if len(recent_narrowed_results) <= 0: + return { + 'results': [], + 'hits': 0, + } + + if narrowed_results: + narrowed_results.filter(recent_narrowed_results) + else: + narrowed_results = recent_narrowed_results + + page_num, page_length = self.calculate_page(start_offset, end_offset) + + self.index = self.index.refresh() + raw_results = EmptyResults() # 空结果集 + + if self.index.doc_count(): + # 构建查询:查找指定模型实例 + query = "%s:%s" % (ID, get_identifier(model_instance)) + searcher = self.index.searcher() + parsed_query = self.parser.parse(query) + results = searcher.search(parsed_query) + + if len(results): + # 使用Whoosh的more_like_this功能查找相似文档 + raw_results = results[0].more_like_this( + field_name, top=end_offset) + + # 处理结果已被窄化的情况 + if narrowed_results is not None and hasattr(raw_results, 'filter'): + raw_results.filter(narrowed_results) + + try: + raw_page = ResultsPage(raw_results, page_num, page_length) + except ValueError: + if not self.silently_fail: + raise + + return { + 'results': [], + 'hits': 0, + 'spelling_suggestion': None, + } + + # 由于Whoosh 2.5.1的问题,如果请求的页码过高,它会返回错误的页面 + if raw_page.pagenum < page_num: + return { + 'results': [], + 'hits': 0, + 'spelling_suggestion': None, + } + + results = self._process_results(raw_page, result_class=result_class) + searcher.close() + + if hasattr(narrow_searcher, 'close'): + narrow_searcher.close() + + return results + +def _process_results( + self, + raw_page, + highlight=False, + query_string='', + spelling_query=None, + result_class=None): + """处理原始搜索结果,转换为SearchResult对象""" + from haystack import connections + results = [] + + # 在切片之前先获取命中数很重要,否则可能导致分页失败 + hits = len(raw_page) + + if result_class is None: + result_class = SearchResult + + facets = {} # 分面数据(Whoosh不支持) + spelling_suggestion = None # 拼写建议 + unified_index = connections[self.connection_alias].get_unified_index() + indexed_models = unified_index.get_indexed_models() # 已索引的模型 + + # 处理每个搜索结果 + for doc_offset, raw_result in enumerate(raw_page): + score = raw_page.score(doc_offset) or 0 # 相关性分数 + app_label, model_name = raw_result[DJANGO_CT].split('.') # 解析应用标签和模型名 + additional_fields = {} + model = haystack_get_model(app_label, model_name) # 获取模型类 + + if model and model in indexed_models: + # 处理每个字段的值 + for key, value in raw_result.items(): + index = unified_index.get_index(model) + string_key = str(key) + + if string_key in index.fields and hasattr( + index.fields[string_key], 'convert'): + # 由于KEYWORD字段的性质需要特殊处理 + if index.fields[string_key].is_multivalued: # 多值字段 + if value is None or len(value) == 0: + additional_fields[string_key] = [] + else: + additional_fields[string_key] = value.split(',') # 逗号分隔的值 + else: + additional_fields[string_key] = index.fields[string_key].convert( + value) # 使用字段的转换方法 + else: + additional_fields[string_key] = self._to_python(value) # 转换为Python类型 + + # 删除内部使用的字段 + del (additional_fields[DJANGO_CT]) + del (additional_fields[DJANGO_ID]) + + # 如果需要高亮显示 + if highlight: + sa = StemmingAnalyzer() # 词干分析器 + formatter = WhooshHtmlFormatter('em') # HTML格式化器 + terms = [token.text for token in sa(query_string)] # 提取查询词 + + # 执行高亮 + whoosh_result = whoosh_highlight( + additional_fields.get(self.content_field_name), # 内容字段 + terms, # 查询词 + sa, # 分析器 + ContextFragmenter(), # 上下文片段生成器 + formatter # 格式化器 + ) + additional_fields['highlighted'] = { + self.content_field_name: [whoosh_result], # 高亮结果 + } + + # 创建搜索结果对象 + result = result_class( + app_label, + model_name, + raw_result[DJANGO_ID], # 对象ID + score, # 相关性分数 + **additional_fields) # 额外字段 + results.append(result) + else: + hits -= 1 # 减少命中数(如果模型不在索引中) + + # 生成拼写建议 + if self.include_spelling: + if spelling_query: + spelling_suggestion = self.create_spelling_suggestion( + spelling_query) + else: + spelling_suggestion = self.create_spelling_suggestion( + query_string) + + return { + 'results': results, # 搜索结果列表 + 'hits': hits, # 命中数 + 'facets': facets, # 分面数据 + 'spelling_suggestion': spelling_suggestion, # 拼写建议 + } + +def create_spelling_suggestion(self, query_string): + """创建拼写建议""" + spelling_suggestion = None + reader = self.index.reader() # 索引读取器 + corrector = reader.corrector(self.content_field_name) # 拼写校正器 + cleaned_query = force_str(query_string) + + if not query_string: + return spelling_suggestion + + # 清理字符串:移除保留词 + for rev_word in self.RESERVED_WORDS: + cleaned_query = cleaned_query.replace(rev_word, '') + + # 清理字符串:移除保留字符 + for rev_char in self.RESERVED_CHARACTERS: + cleaned_query = cleaned_query.replace(rev_char, '') + + # 分解查询词 + query_words = cleaned_query.split() + suggested_words = [] + + # 为每个词获取拼写建议 + for word in query_words: + suggestions = corrector.suggest(word, limit=1) # 获取第一个建议 + + if len(suggestions) > 0: + suggested_words.append(suggestions[0]) # 使用建议词 + else: + suggested_words.append(word) # 如果没有建议,使用原词 + + spelling_suggestion = ' '.join(suggested_words) + return spelling_suggestion + +def _from_python(self, value): + """ + 将Python值转换为Whoosh使用的字符串 + + 代码源自pysolr + """ + if hasattr(value, 'strftime'): # 日期时间对象 + if not hasattr(value, 'hour'): # 如果没有时间部分 + value = datetime(value.year, value.month, value.day, 0, 0, 0) # 设置为午夜 + elif isinstance(value, bool): # 布尔值 + if value: + value = 'true' + else: + value = 'false' + elif isinstance(value, (list, tuple)): # 列表或元组 + value = u','.join([force_str(v) for v in value]) # 转换为逗号分隔的字符串 + elif isinstance(value, (six.integer_types, float)): # 数字类型 + # 保持原样 + pass + else: + value = force_str(value) # 转换为字符串 + return value + +def _to_python(self, value): + """ + 将Whoosh的值转换为原生Python值 + + 移植自pysolr中的相同方法,因为它们处理数据的方式相同 + """ + if value == 'true': # 布尔真值 + return True + elif value == 'false': # 布尔假值 + return False + + if value and isinstance(value, six.string_types): + possible_datetime = DATETIME_REGEX.search(value) # 尝试匹配日期时间格式 + + if possible_datetime: + date_values = possible_datetime.groupdict() + + for dk, dv in date_values.items(): + date_values[dk] = int(dv) # 转换为整数 + + return datetime( # 返回datetime对象 + date_values['year'], + date_values['month'], + date_values['day'], + date_values['hour'], + date_values['minute'], + date_values['second']) + + try: + # 尝试使用json加载值 + converted_value = json.loads(value) + + # 尝试处理大多数内置类型 + if isinstance( + converted_value, + (list, + tuple, + set, + dict, + six.integer_types, + float, + complex)): + return converted_value + except BaseException: + # 如果失败(SyntaxError或其同类)或者我们不信任它,继续处理 + pass + + return value # 返回原始值 + + +class WhooshSearchQuery(BaseSearchQuery): + """Whoosh搜索查询构建器""" + + def _convert_datetime(self, date): + """转换日期时间为Whoosh格式""" + if hasattr(date, 'hour'): # 包含时间的日期时间 + return force_str(date.strftime('%Y%m%d%H%M%S')) + else: # 仅日期 + return force_str(date.strftime('%Y%m%d000000')) + + def clean(self, query_fragment): + """ + 在将值呈现给后端之前,提供清理用户输入的机制 + + Whoosh 1.X在这里有所不同,不再使用反斜杠转义保留字符。 + 相反,应该引用整个单词。 + """ + words = query_fragment.split() + cleaned_words = [] + + for word in words: + if word in self.backend.RESERVED_WORDS: # 保留词 + word = word.replace(word, word.lower()) # 转换为小写 + + for char in self.backend.RESERVED_CHARACTERS: # 保留字符 + if char in word: + word = "'%s'" % word # 用引号包围单词 + break + + cleaned_words.append(word) + + return ' '.join(cleaned_words) # 重新组合为字符串 + + def build_query_fragment(self, field, filter_type, value): + """构建查询片段""" + from haystack import connections + query_frag = '' + is_datetime = False + + if not hasattr(value, 'input_type_name'): + # 处理ValuesListQuerySet... + if hasattr(value, 'values_list'): + value = list(value) + + if hasattr(value, 'strftime'): + is_datetime = True + + if isinstance(value, six.string_types) and value != ' ': + # 不是InputType,假设是Clean + value = Clean(value) + else: + value = PythonData(value) + + # 使用InputType准备查询 + prepared_value = value.prepare(self) + + if not isinstance(prepared_value, (set, list, tuple)): + # 然后根据需要将我们得到的任何内容转换为pysolr需要的格式 + prepared_value = self.backend._from_python(prepared_value) + + # 'content'是一个特殊的保留词,类似于Django ORM层中的'pk' + # 它表示"没有特殊字段" + if field == 'content': + index_fieldname = '' # 无字段名 + else: + index_fieldname = u'%s:' % connections[self._using].get_unified_index( + ).get_index_fieldname(field) # 获取索引字段名 + + # 过滤器类型映射 + filter_types = { + 'content': '%s', # 内容搜索 + 'contains': '*%s*', # 包含 + 'endswith': "*%s", # 以...结尾 + 'startswith': "%s*", # 以...开头 + 'exact': '%s', # 精确匹配 + 'gt': "{%s to}", # 大于 + 'gte': "[%s to]", # 大于等于 + 'lt': "{to %s}", # 小于 + 'lte': "[to %s]", # 小于等于 + 'fuzzy': u'%s~', # 模糊搜索 + } + + if value.post_process is False: # 不进行后处理 + query_frag = prepared_value + else: + if filter_type in [ + 'content', + 'contains', + 'startswith', + 'endswith', + 'fuzzy']: + if value.input_type_name == 'exact': # 精确输入类型 + query_frag = prepared_value + else: + # 遍历术语并将每个术语的转换形式合并到查询中 + terms = [] + + if isinstance(prepared_value, six.string_types): + possible_values = prepared_value.split(' ') # 按空格分割 + else: + if is_datetime is True: + prepared_value = self._convert_datetime( + prepared_value) # 转换日期时间 + + possible_values = [prepared_value] + + for possible_value in possible_values: + terms.append( + filter_types[filter_type] % + self.backend._from_python(possible_value)) # 应用过滤器 + + if len(terms) == 1: + query_frag = terms[0] + else: + query_frag = u"(%s)" % " AND ".join(terms) # 使用AND连接 + elif filter_type == 'in': # 在...中 + in_options = [] + + for possible_value in prepared_value: + is_datetime = False + + if hasattr(possible_value, 'strftime'): + is_datetime = True + + pv = self.backend._from_python(possible_value) + + if is_datetime is True: + pv = self._convert_datetime(pv) # 转换日期时间 + + if isinstance(pv, six.string_types) and not is_datetime: + in_options.append('"%s"' % pv) # 字符串用引号包围 + else: + in_options.append('%s' % pv) + + query_frag = "(%s)" % " OR ".join(in_options) # 使用OR连接 + elif filter_type == 'range': # 范围查询 + start = self.backend._from_python(prepared_value[0]) # 起始值 + end = self.backend._from_python(prepared_value[1]) # 结束值 + + if hasattr(prepared_value[0], 'strftime'): + start = self._convert_datetime(start) # 转换起始日期时间 + + if hasattr(prepared_value[1], 'strftime'): + end = self._convert_datetime(end) # 转换结束日期时间 + + query_frag = u"[%s to %s]" % (start, end) # 范围格式 + elif filter_type == 'exact': # 精确匹配 + if value.input_type_name == 'exact': + query_frag = prepared_value + else: + prepared_value = Exact(prepared_value).prepare(self) # 准备精确值 + query_frag = filter_types[filter_type] % prepared_value + else: + if is_datetime is True: + prepared_value = self._convert_datetime(prepared_value) # 转换日期时间 + + query_frag = filter_types[filter_type] % prepared_value # 应用过滤器 + + # 添加括号(如果不是原始值) + if len(query_frag) and not isinstance(value, Raw): + if not query_frag.startswith('(') and not query_frag.endswith(')'): + query_frag = "(%s)" % query_frag + + return u"%s%s" % (index_fieldname, query_frag) # 组合字段名和查询片段 + + # 注释掉的代码:处理'in'和'range'之外的情况 + # if not filter_type in ('in', 'range'): + # # 'in'是一个特殊情况,因为我们不想将有效的列表/元组转换为字符串 + # # 推迟处理... + # value = self.backend._from_python(value) + + +class WhooshEngine(BaseEngine): + """Whoosh搜索引擎""" + backend = WhooshSearchBackend # 指定后端类 + query = WhooshSearchQuery # 指定查询类 \ No newline at end of file