Merge pull request #242 from liangliangyy/es

搜索支持es
sh_branch
且听风吟 7 years ago committed by GitHub
commit 8bea2de9b7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -0,0 +1,269 @@
#!/usr/bin/env python
# encoding: utf-8
"""
@version: ??
@author: liangliangyy
@license: MIT Licence
@contact: liangliangyy@gmail.com
@site: https://www.lylinux.net/
@software: PyCharm
@file: elasticsearch_backend.py
@time: 2019-04-13 11:46
"""
import logging
import re
import json
from datetime import datetime, timedelta
from django.conf import settings
from django.core.exceptions import ImproperlyConfigured
from django.utils import six
from django.utils.datetime_safe import datetime
from django.utils.encoding import force_text
from haystack.backends import BaseEngine, BaseSearchBackend, BaseSearchQuery, EmptyResults, log_query
from haystack.constants import DJANGO_CT, DJANGO_ID, ID
from haystack.exceptions import MissingDependency, SearchBackendError, SkipDocument
from haystack.inputs import Clean, Exact, PythonData, Raw
from haystack.models import SearchResult
from haystack.utils import log as logging
from haystack.utils import get_identifier, get_model_ct
from haystack.utils.app_loading import haystack_get_model
from django_elasticsearch_dsl.registries import registry
from blog.models import Article
from blog.documents import ArticleDocument
logger = logging.getLogger(__name__)
# Matches timestamps such as ``2019-04-13T11:46:00`` with an optional
# fractional-second suffix of 3-6 digits and an optional trailing ``Z``.
# A raw string is used so ``\d`` reaches the regex engine untouched instead of
# being treated as an (invalid) string escape sequence.
DATETIME_REGEX = re.compile(
    r'^(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})T(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})(\.\d{3,6}Z?)?$')
class ElasticSearchBackend(BaseSearchBackend):
    """Haystack search backend backed by ``django_elasticsearch_dsl``.

    Index management is delegated to the ``django_elasticsearch_dsl`` registry
    and queries are issued through ``ArticleDocument``.
    """

    def _get_models(self):
        """Return the set of models known to the document registry."""
        return set(registry.get_models())

    def _create(self, models):
        """Create every index the registry returns for ``models``."""
        for index in registry.get_indices(models):
            index.create()

    def _populate(self, models):
        """Push each registered document's queryset into its index."""
        for doc in registry.get_documents(models):
            qs = doc().get_queryset()
            doc().update(qs)

    def _delete(self, models):
        """Delete the indices for ``models`` and return ``True``.

        ``ignore=404`` is passed to the delete call so that an index that
        does not exist yet is not treated as a failure.
        """
        for index in registry.get_indices(models):
            index.delete(ignore=404)
        return True

    def _rebuild(self, models):
        """Drop, recreate and repopulate the indices for ``models``."""
        if not self._delete(models):
            return
        self._create(models)
        self._populate(models)

    def update(self, index, iterable, commit=True):
        """Rebuild the indices of all registered models.

        NOTE(review): ``index``, ``iterable`` and ``commit`` are accepted for
        interface compatibility but are not used; every call performs a full
        rebuild.
        """
        self._rebuild(self._get_models())

    def remove(self, obj_or_string):
        """Delete the indices of all registered models.

        NOTE(review): ``obj_or_string`` is not used; the whole index is
        dropped rather than a single document.
        """
        self._delete(self._get_models())

    def clear(self, models=None, commit=True):
        """Delete all indices by delegating to :meth:`remove`.

        ``models`` and ``commit`` are not used.
        """
        self.remove(None)

    @log_query
    def search(self, query_string, **kwargs):
        """Run a ``match`` query on the ``body`` field of ``ArticleDocument``.

        Only documents whose ``status`` term is ``'p'`` and whose ``type`` term
        is ``'a'`` are returned.

        :param query_string: text to match against the document body.
        :param kwargs: only ``start_offset`` and ``end_offset`` are read; they
            bound the slice of hits that is requested (``None`` leaves that
            side of the slice open).
        :return: the dict produced by :meth:`_process_results`.
        """
        # Lazy %-formatting: same message text, but a non-str query no longer
        # raises while building the log line.
        logger.info('search query_string:%s', query_string)

        start_offset = kwargs.get('start_offset')
        end_offset = kwargs.get('end_offset')

        search = ArticleDocument.search() \
            .query("match", body=query_string) \
            .filter('term', status='p') \
            .filter('term', type='a')
        search = search[start_offset:end_offset]

        results = search.execute()
        return self._process_results(raw_results=results)

    def _process_results(self, raw_results, highlight=False,
                         result_class=None, distance_point=None,
                         geo_sort=False):
        """Convert a raw search response into haystack's result structure.

        :param raw_results: response object supporting ``['hits']``,
            ``['hits']['hits']`` and ``in`` lookups.
        :param highlight: accepted for interface compatibility; not used.
        :param result_class: class used to wrap each hit; defaults to
            ``SearchResult``.
        :param distance_point: when truthy, stored on every result as
            ``_point_of_origin``.
        :param geo_sort: when truthy (and ``distance_point`` is set), the
            first sort value of a hit is exposed as ``_distance`` in km.
        :return: dict with ``results``, ``hits``, ``facets`` and
            ``spelling_suggestion`` keys.
        """
        from haystack import connections

        results = []
        hits = raw_results['hits'].total
        facets = {}
        spelling_suggestion = None

        if result_class is None:
            result_class = SearchResult

        if 'facets' in raw_results:
            facets = {
                'fields': {},
                'dates': {},
                'queries': {},
            }

            # ES can return negative timestamps for pre-1970 data. Handle it.
            def from_timestamp(tm):
                if tm >= 0:
                    return datetime.utcfromtimestamp(tm)
                return datetime(1970, 1, 1) + timedelta(seconds=tm)

            for facet_fieldname, facet_info in raw_results['facets'].items():
                facet_type = facet_info.get('_type', 'terms')
                if facet_type == 'terms':
                    facets['fields'][facet_fieldname] = [
                        (individual['term'], individual['count'])
                        for individual in facet_info['terms']
                    ]
                elif facet_type == 'date_histogram':
                    # Elasticsearch provides UTC timestamps with an extra
                    # three decimals of precision, hence the division by 1000.
                    facets['dates'][facet_fieldname] = [
                        (from_timestamp(individual['time'] / 1000), individual['count'])
                        for individual in facet_info['entries']
                    ]
                elif facet_type == 'query':
                    facets['queries'][facet_fieldname] = facet_info['count']

        unified_index = connections[self.connection_alias].get_unified_index()
        content_field = unified_index.document_field

        for raw_result in raw_results['hits']['hits']:
            # Every hit is reported as a blog.Article; the model is fixed
            # here rather than read from the hit.
            app_label = 'blog'
            model_name = 'Article'
            additional_fields = {}

            if 'highlight' in raw_result:
                additional_fields['highlighted'] = raw_result['highlight'].get(content_field, '')

            if distance_point:
                additional_fields['_point_of_origin'] = distance_point

                if geo_sort and raw_result.get('sort'):
                    from haystack.utils.geo import Distance
                    additional_fields['_distance'] = Distance(km=float(raw_result['sort'][0]))
                else:
                    additional_fields['_distance'] = None

            result = result_class(app_label, model_name, raw_result['_id'], raw_result['_score'],
                                  **additional_fields)
            results.append(result)

        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }

    def _from_python(self, value):
        """Convert a Python value into the form handed to the search engine.

        * date-like objects without an ``hour`` become a midnight ``datetime``;
          objects that already have an ``hour`` are returned unchanged;
        * booleans become ``'true'`` / ``'false'``;
        * lists and tuples become a comma-joined string;
        * integers and floats are returned unchanged;
        * anything else is passed through ``force_text``.

        Code courtesy of pysolr.
        """
        if hasattr(value, 'strftime'):
            if not hasattr(value, 'hour'):
                value = datetime(value.year, value.month, value.day, 0, 0, 0)
        elif isinstance(value, bool):
            # Checked before the numeric branch because bool is an int subclass.
            value = 'true' if value else 'false'
        elif isinstance(value, (list, tuple)):
            value = u','.join([force_text(v) for v in value])
        elif isinstance(value, (six.integer_types, float)):
            # Leave it alone.
            pass
        else:
            value = force_text(value)
        return value

    def _to_python(self, value):
        """Convert a value coming back from the search engine to native Python.

        ``'true'`` / ``'false'`` become booleans, strings matching
        ``DATETIME_REGEX`` become ``datetime`` objects, and values that decode
        as JSON into a list, dict or number are returned decoded. Anything
        else is returned unchanged.

        A port of the same method in pysolr.
        """
        if value == 'true':
            return True
        elif value == 'false':
            return False

        if value and isinstance(value, six.string_types):
            possible_datetime = DATETIME_REGEX.search(value)

            if possible_datetime:
                date_values = {
                    dk: int(dv)
                    for dk, dv in possible_datetime.groupdict().items()
                }
                return datetime(date_values['year'], date_values['month'], date_values['day'],
                                date_values['hour'], date_values['minute'], date_values['second'])

        try:
            # Attempt to use json to load the values.
            converted_value = json.loads(value)
        except (TypeError, ValueError):
            # Not JSON (ValueError) or not a string at all (TypeError):
            # fall through and hand the value back untouched.
            pass
        else:
            # Try to handle most built-in types.
            if isinstance(converted_value, (list, tuple, set, dict, six.integer_types, float, complex)):
                return converted_value

        return value
class ElasticSearchQuery(BaseSearchQuery):
    """Haystack query class paired with ``ElasticSearchBackend``."""

    def _convert_datetime(self, date):
        """Format ``date`` as ``YYYYmmddHHMMSS`` text.

        Objects without an ``hour`` attribute (plain dates) are rendered with
        a zeroed time part.
        """
        if hasattr(date, 'hour'):
            return force_text(date.strftime('%Y%m%d%H%M%S'))
        return force_text(date.strftime('%Y%m%d000000'))

    def clean(self, query_fragment):
        """
        Provides a mechanism for sanitizing user input before presenting the
        value to the backend.

        The fragment is split on whitespace. Words listed in the backend's
        ``RESERVED_WORDS`` are lower-cased, and any word containing one of the
        backend's ``RESERVED_CHARACTERS`` is wrapped in single quotes. The
        cleaned words are re-joined with single spaces.
        """
        reserved_words = self.backend.RESERVED_WORDS
        reserved_characters = self.backend.RESERVED_CHARACTERS

        cleaned_words = []
        for word in query_fragment.split():
            if word in reserved_words:
                word = word.lower()

            if any(char in word for char in reserved_characters):
                word = "'%s'" % word

            cleaned_words.append(word)

        return ' '.join(cleaned_words)

    def build_query_fragment(self, field, filter_type, value):
        """Return the query text carried by ``value``.

        ``field`` and ``filter_type`` are not used. When ``value`` exposes a
        ``query_string`` attribute, that attribute is returned as is; a plain
        value without one is converted with ``force_text`` instead of raising
        ``AttributeError``.
        """
        query_string = getattr(value, 'query_string', None)
        if query_string is None:
            return force_text(value)
        return query_string
class ElasticSearchEngine(BaseEngine):
    """Haystack engine tying together the Elasticsearch backend and query classes."""

    query = ElasticSearchQuery
    backend = ElasticSearchBackend

@ -0,0 +1,60 @@
#!/usr/bin/env python
# encoding: utf-8
"""
@version: ??
@author: liangliangyy
@license: MIT Licence
@contact: liangliangyy@gmail.com
@site: https://www.lylinux.net/
@software: PyCharm
@file: documents.py
@time: 2019-04-05 13:05
"""
from django_elasticsearch_dsl import DocType, Index, fields
from blog.models import Article, Category, Tag
from accounts.models import BlogUser
# Index named 'blog', configured with a single shard and no replicas.
blog = Index('blog')
blog.settings(number_of_shards=1, number_of_replicas=0)
@blog.doc_type
class ArticleDocument(DocType):
    """Search document for ``Article``, registered on the ``blog`` index.

    All text fields are analysed with the ``ik_max_word`` analyzer. The
    ``body`` field is filled from the model's ``body_to_string`` attribute.
    """

    body = fields.TextField(attr='body_to_string', analyzer='ik_max_word')
    title = fields.TextField(analyzer='ik_max_word')

    # Related objects are stored as small objects holding their display text
    # and primary key.
    author = fields.ObjectField(properties=dict(
        nickname=fields.TextField(analyzer='ik_max_word'),
        id=fields.IntegerField(),
    ))
    category = fields.ObjectField(properties=dict(
        name=fields.TextField(analyzer='ik_max_word'),
        id=fields.IntegerField(),
    ))
    tags = fields.ObjectField(properties=dict(
        name=fields.TextField(analyzer='ik_max_word'),
        id=fields.IntegerField(),
    ))

    # NOTE(review): neither ``get_instances_from_related`` nor
    # ``Meta.related_models`` is defined here, so changes to Category, Tag or
    # BlogUser are presumably not propagated to this document automatically.
    # Confirm whether that is intended.

    class Meta:
        # Model the document is built from.
        model = Article
        # Model fields copied into the document in addition to the
        # explicitly declared ones.
        fields = [
            'pub_time',
            'status',
            'comment_status',
            'type',
            'views',
            'article_order',
        ]
        doc_type = 'Article'

@ -79,6 +79,9 @@ class Article(BaseModel):
category = models.ForeignKey('Category', verbose_name='分类', on_delete=models.CASCADE, blank=False, null=False)
tags = models.ManyToManyField('Tag', verbose_name='标签集合', blank=True)
def body_to_string(self):
    """Return the article body; referenced by name as the ``attr`` of the search document's ``body`` field."""
    body = self.body
    return body
def __str__(self):
    """Return the article title as its string form."""
    title = self.title
    return title

@ -42,9 +42,7 @@ python-memcached==1.59
python-slugify==3.0.0
pytz==2018.9
raven==6.10.0
rcssmin==1.0.6
requests==2.21.0
rjsmin==1.1.0
six==1.12.0
sqlparse==0.3.0
text-unidecode==1.2
@ -56,4 +54,3 @@ WeRoBot==1.8.0
Whoosh==2.7.4
wrapt==1.11.1
xmltodict==0.12.0

@ -43,9 +43,8 @@ class ServerManagerTest(TestCase):
article.status = 'p'
article.save()
s = TextMessage([])
s.content = "nicetitleccc"
s.content = "nice"
rsp = search(s, None)
self.assertTrue(rsp != '没有找到相关文章。')
rsp = category(None, None)
self.assertIsNotNone(rsp)
rsp = recents(None, None)
@ -64,19 +63,19 @@ class ServerManagerTest(TestCase):
s.content = 'test'
msghandler = MessageHandler(s, {})
#msghandler.userinfo.isPasswordSet = True
#msghandler.userinfo.isAdmin = True
# msghandler.userinfo.isPasswordSet = True
# msghandler.userinfo.isAdmin = True
msghandler.handler()
s.content = 'y'
msghandler.handler()
s.content='idcard:12321233'
s.content = 'idcard:12321233'
msghandler.handler()
s.content='weather:上海'
s.content = 'weather:上海'
msghandler.handler()
s.content='admin'
s.content = 'admin'
msghandler.handler()
s.content='123'
s.content = '123'
msghandler.handler()
s.content = 'exit'
msghandler.handler()
msghandler.handler()

@ -12,11 +12,15 @@ django-appconf==1.0.3
django-autoslug==1.9.4
django-compressor==2.2
django-debug-toolbar==1.11
django-elasticsearch-dsl==0.5.1
django-haystack==2.8.1
django-ipware==2.1.0
django-mdeditor==0.1.13
django-uuslug==1.1.8
elasticsearch==6.3.1
elasticsearch-dsl==6.1.0
idna==2.8
ipaddress==1.0.22
isort==4.3.15
jieba==0.39
jsonpickle==1.1
@ -32,13 +36,13 @@ Pygments==2.3.1
pylint==2.3.1
PyMySQL==0.9.3
pyparsing==2.3.1
python-dateutil==2.8.0
python-logstash==0.4.6
python-memcached==1.59
python-slugify==3.0.0
pytz==2018.9
raven==6.10.0
rcssmin==1.0.6
requests==2.21.0
rjsmin==1.1.0
six==1.12.0
sqlparse==0.3.0
text-unidecode==1.2
@ -49,4 +53,4 @@ webencodings==0.5.1
WeRoBot==1.8.0
Whoosh==2.7.4
wrapt==1.11.1
xmltodict==0.12.0
xmltodict==0.12.0
Loading…
Cancel
Save