# DjangoBlog/djangoblog/whoosh_cn_backend.py

#gq:
# encoding: utf-8

from __future__ import absolute_import, division, print_function, unicode_literals
# 未来兼容导入：保证代码在 Python2/3 下行为一致

import json
import os
import re
import shutil
import threading
import warnings

import six
from django.conf import settings
from django.core.exceptions import ImproperlyConfigured
from datetime import datetime
from django.utils.encoding import force_str
from haystack.backends import BaseEngine, BaseSearchBackend, BaseSearchQuery, EmptyResults, log_query
from haystack.constants import DJANGO_CT, DJANGO_ID, ID
from haystack.exceptions import MissingDependency, SearchBackendError, SkipDocument
from haystack.inputs import Clean, Exact, PythonData, Raw
from haystack.models import SearchResult
from haystack.utils import get_identifier, get_model_ct
from haystack.utils import log as logging
from haystack.utils.app_loading import haystack_get_model
from jieba.analyse import ChineseAnalyzer          # 中文分词器
from whoosh import index
from whoosh.analysis import StemmingAnalyzer
from whoosh.fields import BOOLEAN, DATETIME, IDLIST, KEYWORD, NGRAM, NGRAMWORDS, NUMERIC, Schema, TEXT
from whoosh.fields import ID as WHOOSH_ID
from whoosh.filedb.filestore import FileStorage, RamStorage
from whoosh.highlight import ContextFragmenter, HtmlFormatter
from whoosh.highlight import highlight as whoosh_highlight
from whoosh.qparser import QueryParser
from whoosh.searching import ResultsPage
from whoosh.writing import AsyncWriter

# Translate a missing Whoosh installation into Haystack's MissingDependency.
# NOTE(review): the ``from whoosh import ...`` lines above run before this
# guard, so a missing package would already have raised a plain ImportError
# there -- confirm whether those imports should move below this block.
try:
    import whoosh
except ImportError:
    raise MissingDependency(
        "The 'whoosh' backend requires the installation of 'Whoosh'. Please refer to the documentation.")

# Handle minimum requirement: ``whoosh.__version__`` is compared against the
# tuple (2, 5, 0); a module without ``__version__`` is rejected as well.
if not hasattr(whoosh, '__version__') or whoosh.__version__ < (2, 5, 0):
    raise MissingDependency(
        "The 'whoosh' backend requires version 2.5.0 or greater.")

# Bubble up the correct error.

# Regex used to recognise datetime strings stored in the Whoosh index, e.g.
# "2024-01-31T12:30:05" with an optional fractional-second / "Z" suffix.
# Written as a raw string so that "\d" and "\." reach the regex engine
# untouched instead of being (invalid) string escape sequences, which newer
# Python versions warn about.
DATETIME_REGEX = re.compile(
    r'^(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})T(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})(\.\d{3,6}Z?)?$')

# Thread-local storage used to share one RamStorage per thread when the
# backend runs in in-memory mode. The attribute set here only exists in the
# importing thread; other threads fall back to ``getattr(..., None)`` in setup().
LOCALS = threading.local()
LOCALS.RAM_STORE = None


class WhooshHtmlFormatter(HtmlFormatter):
    """
    Custom HTML formatter for highlighted fragments.

    Overrides the template so each highlighted term is wrapped in a bare
    tag with no attributes, e.g. ``<em>term</em>``. The stated intent is to
    keep the highlight markup consistent with the other Haystack backends
    (Solr, Elasticsearch, ...).
    """
    template = '<%(tag)s>%(t)s</%(tag)s>'


class WhooshSearchBackend(BaseSearchBackend):
    """
    Haystack search backend that stores and queries documents in a Whoosh index.
    """
    # Reserved words of the Whoosh query language; they must be escaped or
    # avoided when they appear in user-supplied query text.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Reserved characters of the Whoosh query language; these need the same
    # treatment as the reserved words.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':', '.',
    )

    def __init__(self, connection_alias, **connection_options):
        """
        Initialise the backend from one ``HAYSTACK_CONNECTIONS`` entry.

        :param connection_alias: key of the connection in ``HAYSTACK_CONNECTIONS``
        :param connection_options: settings dict of that connection; the keys
            read here are ``PATH``, ``STORAGE`` and ``POST_LIMIT``
        :raises ImproperlyConfigured: if file storage is selected but no
            ``PATH`` is configured
        """
        super(WhooshSearchBackend, self).__init__(connection_alias, **connection_options)
        # The index itself is created/opened lazily by setup().
        self.setup_complete = False
        # Anything other than STORAGE == 'file' (the default) selects in-memory storage.
        self.use_file_storage = connection_options.get('STORAGE', 'file') == 'file'
        # Commit buffer size, 128 MB by default. connection_options is a plain
        # dict, so the key has to be read with .get(); getattr() on a dict
        # never sees its keys and would always yield the default.
        self.post_limit = connection_options.get('POST_LIMIT', 128 * 1024 * 1024)
        # Directory that holds the index files.
        self.path = connection_options.get('PATH')

        # A path is mandatory when the index lives on disk.
        if self.use_file_storage and not self.path:
            raise ImproperlyConfigured(
                "You must specify a 'PATH' in your settings for connection '%s'." % connection_alias)

        self.log = logging.getLogger('haystack')

    def setup(self):
        """
        Create or open the Whoosh index on first use.

        Picks the storage (directory on disk or a thread-local RamStorage),
        builds the schema and query parser from the unified index's search
        fields, then opens the index, creating it when the directory was just
        made or when no index exists in the storage yet.

        :raises IOError: if the index directory is not writable
        """
        from haystack import connections

        directory_created = False

        if self.use_file_storage:
            # Create the index directory on demand.
            if not os.path.exists(self.path):
                os.makedirs(self.path)
                directory_created = True

            # The directory must be writable by the current user.
            if not os.access(self.path, os.W_OK):
                raise IOError(
                    "The path to your Whoosh index '%s' is not writable for the current user/group." % self.path)

            self.storage = FileStorage(self.path)
        else:
            # In-memory mode: one RamStorage shared per thread.
            ram_store = getattr(LOCALS, 'RAM_STORE', None)
            if ram_store is None:
                ram_store = RamStorage()
                LOCALS.RAM_STORE = ram_store
            self.storage = ram_store

        # Derive the Whoosh schema and the parser for the document field.
        unified_index = connections[self.connection_alias].get_unified_index()
        self.content_field_name, self.schema = self.build_schema(
            unified_index.all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if directory_created:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                # Storage exists but holds no index yet.
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        """
        Translate Haystack search fields into a Whoosh ``Schema``.

        :param fields: mapping of field name to Haystack field instance
        :return: tuple ``(content_field_name, Schema)``; the name is that of
            the field flagged ``document=True`` (empty string if none is)
        :raises SearchBackendError: if ``fields`` contributes no schema field
        """
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),   # unique document identifier
            DJANGO_CT: WHOOSH_ID(stored=True),         # "app_label.model_name"
            DJANGO_ID: WHOOSH_ID(stored=True),         # model primary key
        }
        builtin_field_count = len(schema_fields)
        content_field_name = ''

        for _name, field_class in fields.items():
            if field_class.is_multivalued:
                # Multi-valued: IDLIST when not indexed, KEYWORD otherwise.
                if field_class.indexed is False:
                    whoosh_field = IDLIST(
                        stored=True, field_boost=field_class.boost)
                else:
                    whoosh_field = KEYWORD(
                        stored=True, commas=True, scorable=True, field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                whoosh_field = DATETIME(
                    stored=field_class.stored, sortable=True)
            elif field_class.field_type == 'integer':
                whoosh_field = NUMERIC(
                    stored=field_class.stored, numtype=int, field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                whoosh_field = NUMERIC(
                    stored=field_class.stored, numtype=float, field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # BOOLEAN is built without a field_boost.
                whoosh_field = BOOLEAN(
                    stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                whoosh_field = NGRAM(
                    minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                whoosh_field = NGRAMWORDS(
                    minsize=2, maxsize=15, at='start',
                    stored=field_class.stored, field_boost=field_class.boost)
            else:
                # Everything else becomes TEXT analysed with jieba's ChineseAnalyzer.
                whoosh_field = TEXT(
                    stored=True, analyzer=ChineseAnalyzer(),
                    field_boost=field_class.boost, sortable=True)

            schema_fields[field_class.index_fieldname] = whoosh_field

            # The document field doubles as the spelling source.
            if field_class.document is True:
                content_field_name = field_class.index_fieldname
                whoosh_field.spelling = True

        # Only the three built-in keys present: nothing to search on.
        if not len(schema_fields) > builtin_field_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search.")

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        """
        Add or update the documents for a batch of model instances.

        :param index: SearchIndex instance used to prepare each object
        :param iterable: model instances to index; any iterable is accepted,
            including generators that do not support ``len()``
        :param commit: accepted for interface compatibility; the writer is
            committed whenever at least one object was supplied
        :raises Exception: whatever preparing or writing a document raises,
            unless ``silently_fail`` is set for write errors
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)
        saw_objects = False

        try:
            for obj in iterable:
                saw_objects = True
                try:
                    doc = index.full_prepare(obj)
                except SkipDocument:
                    self.log.debug(u"Indexing for object `%s` skipped", obj)
                    continue

                # Convert every value to something Whoosh can index.
                for key in doc:
                    doc[key] = self._from_python(doc[key])

                # A document-level 'boost' key is not passed on to Whoosh.
                doc.pop('boost', None)

                try:
                    writer.update_document(**doc)
                except Exception as e:
                    if not self.silently_fail:
                        raise
                    self.log.error(
                        u"%s while preparing object for update" % e.__class__.__name__,
                        exc_info=True,
                        extra={"data": {"index": index, "object": get_identifier(obj)}})
        except Exception:
            # Do not leave a half-filled writer (and its lock) behind.
            writer.cancel()
            raise

        if saw_objects:
            writer.commit()
        else:
            # Nothing was written: release the writer instead of abandoning it.
            writer.cancel()

    def remove(self, obj_or_string, commit=True):
        """
        Delete the single document identified by ``obj_or_string``.

        :param obj_or_string: model instance or identifier string, resolved
            through ``get_identifier``
        :param commit: accepted for interface compatibility; not used here
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        whoosh_id = get_identifier(obj_or_string)

        try:
            delete_query = self.parser.parse(u'%s:"%s"' % (ID, whoosh_id))
            self.index.delete_by_query(q=delete_query)
        except Exception as e:
            if self.silently_fail:
                self.log.error(
                    "Failed to remove document '%s' from Whoosh: %s",
                    whoosh_id, e, exc_info=True)
            else:
                raise

    def clear(self, models=None, commit=True):
        """
        Remove every document, or only the documents of the given models.

        :param models: ``None`` wipes the whole index; a list/tuple of model
            classes deletes only their documents
        :param commit: accepted for interface compatibility; not used here
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()

        wipe_everything = models is None
        if not wipe_everything:
            assert isinstance(models, (list, tuple))

        try:
            if wipe_everything:
                # Drop the index directory / RAM storage and rebuild it.
                self.delete_index()
            else:
                models_to_delete = []
                models_to_delete.extend(
                    u"%s:%s" % (DJANGO_CT, get_model_ct(model)) for model in models)
                # One OR query removes all requested content types at once.
                self.index.delete_by_query(
                    q=self.parser.parse(u" OR ".join(models_to_delete)))
        except Exception as e:
            if not self.silently_fail:
                raise
            if wipe_everything:
                self.log.error("Failed to clear Whoosh index: %s", e, exc_info=True)
            else:
                self.log.error(
                    "Failed to clear Whoosh index of models '%s': %s",
                    ','.join(models_to_delete), e, exc_info=True)

    def delete_index(self):
        """
        Physically discard the index (directory tree or RAM storage) and run
        setup() again so a fresh, empty index is available.
        """
        if not self.use_file_storage:
            self.storage.clean()
        elif os.path.exists(self.path):
            shutil.rmtree(self.path)

        self.setup()

    def optimize(self):
        """
        Refresh the index and ask Whoosh to optimize it.

        Intended as a manual maintenance step; it can take a while.
        """
        if not self.setup_complete:
            self.setup()

        refreshed_index = self.index.refresh()
        self.index = refreshed_index
        refreshed_index.optimize()

    def calculate_page(self, start_offset=0, end_offset=None):
        """
        把 Django 风格的分页起止偏移换算成 whoosh 的页码+每页条数
        """
        if end_offset is not None and end_offset <= 0:
            end_offset = 1
        page_num = 0
        if end_offset is None:
            end_offset = 1000000
        if start_offset is None:
            start_offset = 0
        page_length = end_offset - start_offset
        if page_length > 0:
            page_num = int(start_offset / page_length)
        # whoosh 页码从 1 开始
        page_num += 1
        return page_num, page_length

    @log_query
    def search(self, query_string, sort_by=None, start_offset=0, end_offset=None,
               fields='', highlight=False, facets=None, date_facets=None,
               query_facets=None, narrow_queries=None, spelling_query=None,
               within=None, dwithin=None, distance_point=None, models=None,
               limit_to_registered_models=None, result_class=None, **kwargs):
        """
        Run a query against the index.

        :param query_string: Whoosh query string; empty or single-character
            (other than ``*``) queries yield no results
        :param sort_by: list of field names, ``-`` prefix for descending; all
            entries must share one direction and only the first is applied
        :param start_offset: / :param end_offset: slice of results wanted
        :param highlight: add highlighted fragments of the document field
        :param narrow_queries: set of extra query strings that restrict the
            result set; the caller's set is not modified
        :param spelling_query: alternative text for spelling suggestions
        :param models: restrict results to these model classes
        :param limit_to_registered_models: restrict to indexed models;
            defaults to ``HAYSTACK_LIMIT_TO_REGISTERED_MODELS`` (True)
        :param result_class: class used to wrap each hit
        :return: dict with at least ``results`` and ``hits``
        :raises SearchBackendError: if ``sort_by`` mixes sort directions

        ``facets``, ``date_facets`` and ``query_facets`` only trigger a
        warning; the remaining parameters are accepted but not used here.
        """
        if not self.setup_complete:
            self.setup()

        # An empty query matches nothing.
        if len(query_string) == 0:
            return {'results': [], 'hits': 0}

        query_string = force_str(query_string)

        # A single non-wildcard character is short-circuited to "no results".
        if len(query_string) <= 1 and query_string != u'*':
            return {'results': [], 'hits': 0}

        reverse = False
        if sort_by is not None:
            descending = [order_by.startswith('-') for order_by in sort_by]
            if any(descending) and not all(descending):
                raise SearchBackendError("Whoosh requires all order_by fields"
                                         " to use the same sort direction")
            sort_by_list = [
                order_by[1:] if order_by.startswith('-') else order_by
                for order_by in sort_by]
            reverse = any(descending)
            # Only the first field is handed to Whoosh; an empty list means
            # "no explicit ordering" instead of an IndexError.
            sort_by = sort_by_list[0] if sort_by_list else None

        # Faceting is not implemented for this backend; warn only.
        if facets is not None:
            warnings.warn("Whoosh does not handle faceting.", Warning, stacklevel=2)
        if date_facets is not None:
            warnings.warn("Whoosh does not handle date faceting.", Warning, stacklevel=2)
        if query_facets is not None:
            warnings.warn("Whoosh does not handle query faceting.", Warning, stacklevel=2)

        narrowed_results = None
        self.index = self.index.refresh()

        # Work out which content types the results are limited to.
        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            # Copy so the caller's set is left untouched.
            narrow_queries = set() if narrow_queries is None else set(narrow_queries)
            narrow_queries.add(' OR '.join(
                ['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

        narrow_searcher = None
        searcher = None
        try:
            if narrow_queries is not None:
                # Run each narrowing query and intersect the hits.
                narrow_searcher = self.index.searcher()
                for nq in narrow_queries:
                    recent_narrowed_results = narrow_searcher.search(
                        self.parser.parse(force_str(nq)), limit=None)
                    if len(recent_narrowed_results) <= 0:
                        return {'results': [], 'hits': 0}
                    if narrowed_results:
                        narrowed_results.filter(recent_narrowed_results)
                    else:
                        narrowed_results = recent_narrowed_results

            self.index = self.index.refresh()

            if not self.index.doc_count():
                # Empty index: no hits, only an optional spelling suggestion.
                spelling_suggestion = None
                if self.include_spelling:
                    spelling_suggestion = self.create_spelling_suggestion(
                        spelling_query or query_string)
                return {'results': [], 'hits': 0,
                        'spelling_suggestion': spelling_suggestion}

            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query_string)
            if parsed_query is None:  # nothing searchable left in the query
                return {'results': [], 'hits': 0}

            page_num, page_length = self.calculate_page(start_offset, end_offset)
            search_kwargs = {'pagelen': page_length,
                             'sortedby': sort_by, 'reverse': reverse}
            if narrowed_results is not None:
                search_kwargs['filter'] = narrowed_results

            try:
                raw_page = searcher.search_page(parsed_query, page_num, **search_kwargs)
            except ValueError:
                if not self.silently_fail:
                    raise
                return {'results': [], 'hits': 0, 'spelling_suggestion': None}

            # A page number lower than requested means the request ran past
            # the last page; treat that as "no results".
            if raw_page.pagenum < page_num:
                return {'results': [], 'hits': 0, 'spelling_suggestion': None}

            return self._process_results(
                raw_page, highlight=highlight, query_string=query_string,
                spelling_query=spelling_query, result_class=result_class)
        finally:
            # Close the searchers on every exit path, including early returns.
            if searcher is not None:
                searcher.close()
            if narrow_searcher is not None:
                narrow_searcher.close()

    def more_like_this(self, model_instance, additional_query_string=None,
                       start_offset=0, end_offset=None, models=None,
                       limit_to_registered_models=None, result_class=None, **kwargs):
        """
        Find documents similar to the one indexed for ``model_instance``.

        :param model_instance: instance whose indexed document is the seed
        :param additional_query_string: extra query that narrows the results
            (ignored when empty or ``*``)
        :param start_offset: / :param end_offset: slice of results wanted;
            ``end_offset`` is also passed to Whoosh as ``top``
        :param models: restrict results to these model classes
        :param limit_to_registered_models: restrict to indexed models;
            defaults to ``HAYSTACK_LIMIT_TO_REGISTERED_MODELS`` (True)
        :param result_class: class used to wrap each hit
        :return: dict with at least ``results`` and ``hits``
        """
        if not self.setup_complete:
            self.setup()

        field_name = self.content_field_name
        narrow_queries = set()
        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            narrow_queries.add(' OR '.join(
                ['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

        if additional_query_string and additional_query_string != '*':
            narrow_queries.add(additional_query_string)

        # Both searchers start as None so the cleanup below is safe even when
        # the index is empty and no main searcher is ever opened.
        narrow_searcher = None
        searcher = None
        try:
            if narrow_queries:
                narrow_searcher = self.index.searcher()
                for nq in narrow_queries:
                    recent_narrowed_results = narrow_searcher.search(
                        self.parser.parse(force_str(nq)), limit=None)
                    if len(recent_narrowed_results) <= 0:
                        return {'results': [], 'hits': 0}
                    if narrowed_results:
                        narrowed_results.filter(recent_narrowed_results)
                    else:
                        narrowed_results = recent_narrowed_results

            page_num, page_length = self.calculate_page(start_offset, end_offset)
            self.index = self.index.refresh()
            raw_results = EmptyResults()

            if self.index.doc_count():
                query = "%s:%s" % (ID, get_identifier(model_instance))
                searcher = self.index.searcher()
                parsed_query = self.parser.parse(query)
                results = searcher.search(parsed_query)
                if len(results):
                    # Use the first hit for the instance as the seed document.
                    raw_results = results[0].more_like_this(
                        field_name, top=end_offset)
                if narrowed_results is not None and hasattr(raw_results, 'filter'):
                    raw_results.filter(narrowed_results)

            try:
                raw_page = ResultsPage(raw_results, page_num, page_length)
            except ValueError:
                if not self.silently_fail:
                    raise
                return {'results': [], 'hits': 0, 'spelling_suggestion': None}

            # A lower page number than requested means we ran past the end.
            if raw_page.pagenum < page_num:
                return {'results': [], 'hits': 0, 'spelling_suggestion': None}

            return self._process_results(raw_page, result_class=result_class)
        finally:
            # Close the searchers on every exit path, including early returns.
            if searcher is not None:
                searcher.close()
            if narrow_searcher is not None:
                narrow_searcher.close()

    def _process_results(self, raw_page, highlight=False, query_string='',
                         spelling_query=None, result_class=None):
        """
        Convert a Whoosh results page into Haystack's result dictionary.

        :param raw_page: Whoosh page of hits to convert
        :param highlight: when true, add a ``highlighted`` entry holding the
            highlighted document field to each result
        :param query_string: original query, used for highlighting and as the
            default spelling text
        :param spelling_query: alternative text for the spelling suggestion
        :param result_class: class used to wrap each hit (``SearchResult`` by
            default)
        :return: dict with ``results``, ``hits``, ``facets`` and
            ``spelling_suggestion``; hits whose model is not indexed are
            dropped and subtracted from ``hits``
        """
        from haystack import connections
        results = []
        # Total taken up front, then reduced for every hit that is skipped.
        hits = len(raw_page)

        if result_class is None:
            result_class = SearchResult

        facets = {}
        spelling_suggestion = None
        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        # The analyzer and query terms are the same for every hit, so they
        # are built once instead of once per result.
        highlight_analyzer = None
        highlight_terms = None
        if highlight:
            highlight_analyzer = StemmingAnalyzer()
            highlight_terms = [token.text for token in highlight_analyzer(query_string)]

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            model = haystack_get_model(app_label, model_name)

            if not (model and model in indexed_models):
                # Model is unknown or not indexed: do not count the hit.
                hits -= 1
                continue

            # Looked up once per hit; named so it does not shadow the
            # module-level ``index`` import.
            index_fields = unified_index.get_index(model).fields
            additional_fields = {}

            for key, value in raw_result.items():
                string_key = str(key)
                if string_key in index_fields and hasattr(index_fields[string_key], 'convert'):
                    search_field = index_fields[string_key]
                    if search_field.is_multivalued:
                        # Multi-valued fields are stored comma-joined.
                        if value is None or len(value) == 0:
                            additional_fields[string_key] = []
                        else:
                            additional_fields[string_key] = value.split(',')
                    else:
                        additional_fields[string_key] = search_field.convert(value)
                else:
                    additional_fields[string_key] = self._to_python(value)

            # These two are passed positionally to the result class below.
            del additional_fields[DJANGO_CT]
            del additional_fields[DJANGO_ID]

            if highlight:
                formatter = WhooshHtmlFormatter('em')
                whoosh_result = whoosh_highlight(
                    additional_fields.get(self.content_field_name),
                    highlight_terms, highlight_analyzer, ContextFragmenter(), formatter)
                additional_fields['highlighted'] = {
                    self.content_field_name: [whoosh_result],
                }

            results.append(result_class(app_label, model_name,
                                        raw_result[DJANGO_ID], score,
                                        **additional_fields))

        if self.include_spelling:
            spelling_suggestion = self.create_spelling_suggestion(
                spelling_query or query_string)

        return {'results': results, 'hits': hits,
                'facets': facets, 'spelling_suggestion': spelling_suggestion}

    def create_spelling_suggestion(self, query_string):
        """
        Build a spelling suggestion for ``query_string`` from the index.

        Reserved words and characters are stripped, then each remaining word
        is replaced by the corrector's best suggestion; words without a
        suggestion are dropped.

        :param query_string: text to correct
        :return: space-joined suggested words, or ``None`` for an empty query
        """
        # Nothing to correct: return before opening a reader at all.
        if not query_string:
            return None

        cleaned_query = force_str(query_string)

        # Strip reserved words and characters of the query language.
        for rev_word in self.RESERVED_WORDS:
            cleaned_query = cleaned_query.replace(rev_word, '')
        for rev_char in self.RESERVED_CHARACTERS:
            cleaned_query = cleaned_query.replace(rev_char, '')

        suggested_words = []
        reader = self.index.reader()
        try:
            corrector = reader.corrector(self.content_field_name)
            # Correct word by word, keeping only the top suggestion.
            for word in cleaned_query.split():
                suggestions = corrector.suggest(word, limit=1)
                if len(suggestions) > 0:
                    suggested_words.append(suggestions[0])
        finally:
            # The reader is opened just for this call; always release it.
            reader.close()

        return ' '.join(suggested_words)

    def _from_python(self, value):
        """
        Convert a Python value into the form handed to Whoosh.

        Date-like objects stay datetimes (plain dates become midnight
        datetimes), booleans become ``'true'``/``'false'``, lists and tuples
        become comma-joined text, numbers pass through unchanged and
        everything else is converted with ``force_str``.
        """
        if hasattr(value, 'strftime'):
            if hasattr(value, 'hour'):
                return value
            # Date without a time component: promote to midnight.
            return datetime(value.year, value.month, value.day, 0, 0, 0)

        # bool is tested before the numeric types because it is an int subclass.
        if isinstance(value, bool):
            return 'true' if value else 'false'

        if isinstance(value, (list, tuple)):
            return u','.join(force_str(item) for item in value)

        if isinstance(value, (six.integer_types, float)):
            return value

        return force_str(value)

    def _to_python(self, value):
        """
        Convert a value stored in Whoosh back into a Python object.

        ``'true'``/``'false'`` become booleans, strings matching
        ``DATETIME_REGEX`` become ``datetime`` objects, and values that parse
        as JSON containers or numbers are returned decoded. Anything else is
        returned unchanged.
        """
        if value == 'true':
            return True
        elif value == 'false':
            return False

        if value and isinstance(value, six.string_types):
            possible_datetime = DATETIME_REGEX.search(value)
            if possible_datetime:
                date_values = {
                    part: int(text)
                    for part, text in possible_datetime.groupdict().items()}
                return datetime(date_values['year'], date_values['month'],
                                date_values['day'], date_values['hour'],
                                date_values['minute'], date_values['second'])

        try:
            converted_value = json.loads(value)
        except Exception:
            # Best effort only: not JSON (or not even a string), keep the raw
            # value. KeyboardInterrupt/SystemExit are deliberately not caught.
            return value

        if isinstance(converted_value, (list, tuple, set, dict,
                                        six.integer_types, float, complex)):
            return converted_value
        return value


class WhooshSearchQuery(BaseSearchQuery):
    """
    Translates Haystack filter/exclude conditions into Whoosh query syntax.
    """

    def _convert_datetime(self, date):
        """
        Format a date or datetime as a ``YYYYMMDDHHMMSS`` string for use in
        Whoosh queries; objects without an ``hour`` get ``000000`` as time.
        """
        pattern = '%Y%m%d%H%M%S' if hasattr(date, 'hour') else '%Y%m%d000000'
        return force_str(date.strftime(pattern))

    def clean(self, query_fragment):
        """
        转义保留字/符；whoosh 1.x 不再支持反斜杠转义，需用引号包裹
        """
        words = query_fragment.split()
        cleaned_words = []
        for word in words:
            if word in self.backend.RESERVED_WORDS:
                word = word.lower()  # 保留字小写化
            for char in self.backend.RESERVED_CHARACTERS:
                if char in word:
                    word = "'%s'" % word
                    break
            cleaned_words.append(word)
        return ' '.join(cleaned_words)

    def build_query_fragment(self, field, filter_type, value):
        """
        Convert one filter condition into a Whoosh query-string fragment.

        :param field: Haystack field name; ``'content'`` produces a fragment
            without a field prefix
        :param filter_type: lookup type such as ``'exact'``, ``'contains'``,
            ``'in'``, ``'range'``, ``'gt'`` ...
        :param value: raw Python value or a Haystack input-type object
        :return: text of the form ``<index_fieldname>:<fragment>`` (or just
            the fragment for ``'content'``)
        :raises KeyError: for a ``filter_type`` that is neither handled
            explicitly nor present in the template table below
        """
        from haystack import connections
        query_frag = ''
        is_datetime = False

        if not hasattr(value, 'input_type_name'):
            # Plain values: wrap them in a Haystack input type first.
            # Objects exposing ``values_list`` (e.g. querysets) become lists.
            if hasattr(value, 'values_list'):
                value = list(value)
            if hasattr(value, 'strftime'):
                is_datetime = True
            if isinstance(value, six.string_types) and value != ' ':
                value = Clean(value)
            else:
                value = PythonData(value)

        prepared_value = value.prepare(self)

        # Containers are converted element by element further down.
        if not isinstance(prepared_value, (set, list, tuple)):
            prepared_value = self.backend._from_python(prepared_value)

        # 'content' is the reserved name for the full-text document field,
        # so it gets no explicit field prefix.
        if field == 'content':
            index_fieldname = ''
        else:
            index_fieldname = u'%s:' % connections[self._using].get_unified_index(
            ).get_index_fieldname(field)

        # Query templates per lookup type.
        filter_types = {
            'content': '%s',
            'contains': '*%s*',
            'endswith': "*%s",
            'startswith': "%s*",
            'exact': '%s',
            'gt': "{%s to}",
            'gte': "[%s to]",
            'lt': "{to %s}",
            'lte': "[to %s]",
            'fuzzy': u'%s~',
        }

        if value.post_process is False:
            # Input type asked to be used verbatim.
            query_frag = prepared_value
        else:
            if filter_type in ['content', 'contains', 'startswith',
                               'endswith', 'fuzzy']:
                if value.input_type_name == 'exact':
                    query_frag = prepared_value
                else:
                    # Apply the template to every space-separated term and
                    # AND the terms together.
                    terms = []
                    if isinstance(prepared_value, six.string_types):
                        possible_values = prepared_value.split(' ')
                    else:
                        if is_datetime is True:
                            prepared_value = self._convert_datetime(prepared_value)
                        possible_values = [prepared_value]
                    for possible_value in possible_values:
                        terms.append(
                            filter_types[filter_type] % self.backend._from_python(possible_value))
                    if len(terms) == 1:
                        query_frag = terms[0]
                    else:
                        query_frag = u"(%s)" % " AND ".join(terms)
            elif filter_type == 'in':
                # OR together every option; strings are quoted, datetimes
                # are formatted with _convert_datetime.
                in_options = []
                for possible_value in prepared_value:
                    is_datetime = False
                    if hasattr(possible_value, 'strftime'):
                        is_datetime = True
                    pv = self.backend._from_python(possible_value)
                    if is_datetime is True:
                        pv = self._convert_datetime(pv)
                    if isinstance(pv, six.string_types) and not is_datetime:
                        in_options.append('"%s"' % pv)
                    else:
                        in_options.append('%s' % pv)
                query_frag = "(%s)" % " OR ".join(in_options)
            elif filter_type == 'range':
                # Inclusive range between the first two prepared values.
                start = self.backend._from_python(prepared_value[0])
                end = self.backend._from_python(prepared_value[1])
                if hasattr(prepared_value[0], 'strftime'):
                    start = self._convert_datetime(start)
                if hasattr(prepared_value[1], 'strftime'):
                    end = self._convert_datetime(end)
                query_frag = u"[%s to %s]" % (start, end)
            elif filter_type == 'exact':
                if value.input_type_name == 'exact':
                    query_frag = prepared_value
                else:
                    prepared_value = Exact(prepared_value).prepare(self)
                    query_frag = filter_types[filter_type] % prepared_value
            else:
                # Remaining lookups (gt/gte/lt/lte ...) use their template directly.
                if is_datetime is True:
                    prepared_value = self._convert_datetime(prepared_value)
                query_frag = filter_types[filter_type] % prepared_value

        # Wrap non-Raw fragments in parentheses unless the fragment already
        # starts with '(' or ends with ')'.
        if len(query_frag) and not isinstance(value, Raw):
            if not query_frag.startswith('(') and not query_frag.endswith(')'):
                query_frag = "(%s)" % query_frag

        return u"%s%s" % (index_fieldname, query_frag)


class WhooshEngine(BaseEngine):
    """
    Engine entry point loaded by Haystack; wires together the Whoosh backend
    and query classes defined in this module.
    """
    backend = WhooshSearchBackend
    query = WhooshSearchQuery