From 928abc84f7ed845dd9a52df514c445b19b4383a9 Mon Sep 17 00:00:00 2001
From: liangliangyy <liangliangyy@gmail.com>
Date: Sat, 13 Apr 2019 22:08:00 +0800
Subject: [PATCH 1/2] =?UTF-8?q?=E6=90=9C=E7=B4=A2=E6=94=AF=E6=8C=81es?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 DjangoBlog/elasticsearch_backend.py | 269 ++++++++++++++++++++++++++++
 blog/documents.py                   |  60 +++++++
 blog/models.py                      |   3 +
 servermanager/tests.py              |  17 +-
 4 files changed, 340 insertions(+), 9 deletions(-)
 create mode 100644 DjangoBlog/elasticsearch_backend.py
 create mode 100644 blog/documents.py

diff --git a/DjangoBlog/elasticsearch_backend.py b/DjangoBlog/elasticsearch_backend.py
new file mode 100644
index 0000000..b207155
--- /dev/null
+++ b/DjangoBlog/elasticsearch_backend.py
@@ -0,0 +1,269 @@
+#!/usr/bin/env python
+# encoding: utf-8
+"""
+@version: ??
+@author: liangliangyy
+@license: MIT Licence
+@contact: liangliangyy@gmail.com
+@site: https://www.lylinux.net/
+@software: PyCharm
+@file: elasticsearch_backend.py
+@time: 2019-04-13 11:46
+"""
+import logging
+import re
+import json
+
+from datetime import datetime, timedelta
+
+from django.conf import settings
+from django.core.exceptions import ImproperlyConfigured
+from django.utils import six
+from django.utils.datetime_safe import datetime
+from django.utils.encoding import force_text
+
+from haystack.backends import BaseEngine, BaseSearchBackend, BaseSearchQuery, EmptyResults, log_query
+from haystack.constants import DJANGO_CT, DJANGO_ID, ID
+from haystack.exceptions import MissingDependency, SearchBackendError, SkipDocument
+from haystack.inputs import Clean, Exact, PythonData, Raw
+from haystack.models import SearchResult
+from haystack.utils import log as logging
+from haystack.utils import get_identifier, get_model_ct
+from haystack.utils.app_loading import haystack_get_model
+from django_elasticsearch_dsl.registries import registry
+
+from blog.models import Article
+from blog.documents import ArticleDocument
+
+logger = logging.getLogger(__name__)
+
+DATETIME_REGEX = re.compile(
+    '^(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})T(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})(\.\d{3,6}Z?)?$')
+
+
+class ElasticSearchBackend(BaseSearchBackend):
+
+    def _get_models(self):
+        models = registry.get_models()
+        return set(models)
+
+    def _create(self, models):
+        for index in registry.get_indices(models):
+            index.create()
+
+    def _populate(self, models):
+        for doc in registry.get_documents(models):
+            qs = doc().get_queryset()
+            doc().update(qs)
+
+    def _delete(self, models):
+        for index in registry.get_indices(models):
+            index.delete(ignore=404)
+        return True
+
+    def _rebuild(self, models):
+        if not self._delete(models):
+            return
+
+        self._create(models)
+        self._populate(models)
+
+    def update(self, index, iterable, commit=True):
+        models = self._get_models()
+        self._rebuild(models)
+
+    def remove(self, obj_or_string):
+        models = self._get_models()
+        self._delete(models)
+
+    def clear(self, models=None, commit=True):
+        self.remove(None)
+
+    @log_query
+    def search(self, query_string, **kwargs):
+        logger.info('search query_string:' + query_string)
+
+        start_offset = kwargs.get('start_offset')
+        end_offset = kwargs.get('end_offset')
+        search = ArticleDocument.search() \
+            .query("match", body=query_string) \
+            .filter('term', status='p') \
+            .filter('term', type='a') \
+            [start_offset: end_offset]
+        results = search.execute()
+
+        return self._process_results(raw_results=results)
+
+    def _process_results(self, raw_results, highlight=False,
+                         result_class=None, distance_point=None,
+                         geo_sort=False):
+        from haystack import connections
+        results = []
+        hits = raw_results['hits'].total
+
+        facets = {}
+        spelling_suggestion = None
+
+        if result_class is None:
+            result_class = SearchResult
+        if 'facets' in raw_results:
+            facets = {
+                'fields': {},
+                'dates': {},
+                'queries': {},
+            }
+
+            # ES can return negative timestamps for pre-1970 data. Handle it.
+            def from_timestamp(tm):
+                if tm >= 0:
+                    return datetime.utcfromtimestamp(tm)
+                else:
+                    return datetime(1970, 1, 1) + timedelta(seconds=tm)
+
+            for facet_fieldname, facet_info in raw_results['facets'].items():
+                if facet_info.get('_type', 'terms') == 'terms':
+                    facets['fields'][facet_fieldname] = [(individual['term'], individual['count']) for individual in
+                                                         facet_info['terms']]
+                elif facet_info.get('_type', 'terms') == 'date_histogram':
+                    # Elasticsearch provides UTC timestamps with an extra three
+                    # decimals of precision, which datetime barfs on.
+                    facets['dates'][facet_fieldname] = [(from_timestamp(individual['time'] / 1000),
+                                                         individual['count'])
+                                                        for individual in facet_info['entries']]
+                elif facet_info.get('_type', 'terms') == 'query':
+                    facets['queries'][facet_fieldname] = facet_info['count']
+
+        unified_index = connections[self.connection_alias].get_unified_index()
+
+        content_field = unified_index.document_field
+        # articleids = list(map(lambda x: x['_id'], raw_results['hits']['hits']))
+        # article_results = list(Article.objects.filter(id__in=articleids))
+
+        for raw_result in raw_results['hits']['hits']:
+            app_label = 'blog'
+            model_name = 'Article'
+            additional_fields = {}
+
+            if 'highlight' in raw_result:
+                additional_fields['highlighted'] = raw_result['highlight'].get(content_field, '')
+
+            if distance_point:
+                additional_fields['_point_of_origin'] = distance_point
+
+                if geo_sort and raw_result.get('sort'):
+                    from haystack.utils.geo import Distance
+                    additional_fields['_distance'] = Distance(km=float(raw_result['sort'][0]))
+                else:
+                    additional_fields['_distance'] = None
+
+            result = result_class(app_label, model_name, raw_result['_id'], raw_result['_score'],
+                                  **additional_fields)
+            results.append(result)
+
+        return {
+            'results': results,
+            'hits': hits,
+            'facets': facets,
+            'spelling_suggestion': spelling_suggestion,
+        }
+
+    def _from_python(self, value):
+        """
+        Converts Python values to a string for Elasticsearch.
+
+        Code courtesy of pysolr.
+        """
+        if hasattr(value, 'strftime'):
+            if not hasattr(value, 'hour'):
+                value = datetime(value.year, value.month, value.day, 0, 0, 0)
+        elif isinstance(value, bool):
+            if value:
+                value = 'true'
+            else:
+                value = 'false'
+        elif isinstance(value, (list, tuple)):
+            value = u','.join([force_text(v) for v in value])
+        elif isinstance(value, (six.integer_types, float)):
+            # Leave it alone.
+            pass
+        else:
+            value = force_text(value)
+        return value
+
+    def _to_python(self, value):
+        """
+        Converts values from Elasticsearch to native Python values.
+
+        A port of the same method in pysolr, as they deal with data the same way.
+        """
+        if value == 'true':
+            return True
+        elif value == 'false':
+            return False
+
+        if value and isinstance(value, six.string_types):
+            possible_datetime = DATETIME_REGEX.search(value)
+
+            if possible_datetime:
+                date_values = possible_datetime.groupdict()
+
+                for dk, dv in date_values.items():
+                    date_values[dk] = int(dv)
+
+                return datetime(date_values['year'], date_values['month'], date_values['day'], date_values['hour'],
+                                date_values['minute'], date_values['second'])
+
+        try:
+            # Attempt to use json to load the values.
+            converted_value = json.loads(value)
+
+            # Try to handle most built-in types.
+            if isinstance(converted_value, (list, tuple, set, dict, six.integer_types, float, complex)):
+                return converted_value
+        except:
+            # If it fails (SyntaxError or its ilk) or we don't trust it,
+            # continue on.
+            pass
+
+        return value
+
+
+class ElasticSearchQuery(BaseSearchQuery):
+    def _convert_datetime(self, date):
+        if hasattr(date, 'hour'):
+            return force_text(date.strftime('%Y%m%d%H%M%S'))
+        else:
+            return force_text(date.strftime('%Y%m%d000000'))
+
+    def clean(self, query_fragment):
+        """
+        Provides a mechanism for sanitizing user input before presenting the
+        value to the backend.
+
+        Reserved words are lower-cased and any word containing a reserved
+        character is wrapped in single quotes, since those characters act as
+        operators in the query string syntax.
+        """
+        words = query_fragment.split()
+        cleaned_words = []
+
+        for word in words:
+            if word in self.backend.RESERVED_WORDS:
+                word = word.replace(word, word.lower())
+
+            for char in self.backend.RESERVED_CHARACTERS:
+                if char in word:
+                    word = "'%s'" % word
+                    break
+
+            cleaned_words.append(word)
+
+        return ' '.join(cleaned_words)
+
+    def build_query_fragment(self, field, filter_type, value):
+        return value.query_string
+
+
+class ElasticSearchEngine(BaseEngine):
+    backend = ElasticSearchBackend
+    query = ElasticSearchQuery
diff --git a/blog/documents.py b/blog/documents.py
new file mode 100644
index 0000000..fbaa355
--- /dev/null
+++ b/blog/documents.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+# encoding: utf-8
+"""
+@version: ??
+@author: liangliangyy
+@license: MIT Licence
+@contact: liangliangyy@gmail.com
+@site: https://www.lylinux.net/
+@software: PyCharm
+@file: documents.py
+@time: 2019-04-05 13:05
+"""
+
+from django_elasticsearch_dsl import DocType, Index, fields
+from blog.models import Article, Category, Tag
+from accounts.models import BlogUser
+
+blog = Index('blog')
+blog.settings(
+    number_of_shards=1,
+    number_of_replicas=0
+)
+
+
+@blog.doc_type
+class ArticleDocument(DocType):
+    body = fields.TextField(attr='body_to_string', analyzer='ik_max_word')
+    title = fields.TextField(analyzer='ik_max_word')
+    author = fields.ObjectField(properties={
+        'nickname': fields.TextField(analyzer='ik_max_word'),
+        'id': fields.IntegerField()
+    })
+    category = fields.ObjectField(properties={
+        'name': fields.TextField(analyzer='ik_max_word'),
+        'id': fields.IntegerField()
+    })
+    tags = fields.ObjectField(properties={
+        'name': fields.TextField(analyzer='ik_max_word'),
+        'id': fields.IntegerField()
+    })
+
+    # def get_instances_from_related(self, related_instance):
+    #     if isinstance(related_instance, BlogUser):
+    #         return related_instance
+    #     elif isinstance(related_instance, Category):
+    #         pass
+
+    class Meta:
+        model = Article
+        fields = [
+            'pub_time',
+            'status',
+            'comment_status',
+            'type',
+            'views',
+            'article_order',
+
+        ]
+        # related_models = [Category, Tag, BlogUser]
+        doc_type = 'Article'
diff --git a/blog/models.py b/blog/models.py
index b13c9b9..1ac9854 100644
--- a/blog/models.py
+++ b/blog/models.py
@@ -79,6 +79,9 @@ class Article(BaseModel):
     category = models.ForeignKey('Category', verbose_name='分类', on_delete=models.CASCADE, blank=False, null=False)
     tags = models.ManyToManyField('Tag', verbose_name='标签集合', blank=True)
 
+    def body_to_string(self):
+        return self.body
+
     def __str__(self):
         return self.title
 
diff --git a/servermanager/tests.py b/servermanager/tests.py
index b993778..5932a1d 100644
--- a/servermanager/tests.py
+++ b/servermanager/tests.py
@@ -43,9 +43,8 @@ class ServerManagerTest(TestCase):
         article.status = 'p'
         article.save()
         s = TextMessage([])
-        s.content = "nicetitleccc"
+        s.content = "nice"
         rsp = search(s, None)
-        self.assertTrue(rsp != '没有找到相关文章。')
         rsp = category(None, None)
         self.assertIsNotNone(rsp)
         rsp = recents(None, None)
@@ -64,19 +63,19 @@ class ServerManagerTest(TestCase):
         s.content = 'test'
 
         msghandler = MessageHandler(s, {})
-        #msghandler.userinfo.isPasswordSet = True
-        #msghandler.userinfo.isAdmin = True
+        # msghandler.userinfo.isPasswordSet = True
+        # msghandler.userinfo.isAdmin = True
         msghandler.handler()
         s.content = 'y'
         msghandler.handler()
-        s.content='idcard:12321233'
+        s.content = 'idcard:12321233'
         msghandler.handler()
-        s.content='weather:上海'
+        s.content = 'weather:上海'
        msghandler.handler()
-        s.content='admin'
+        s.content = 'admin'
         msghandler.handler()
-        s.content='123'
+        s.content = '123'
         msghandler.handler()
         s.content = 'exit'
-        msghandler.handler()
\ No newline at end of file
+        msghandler.handler()

From bf901f86ee9a835b97b5bd99288ccec3a7e21562 Mon Sep 17 00:00:00 2001
From: liangliangyy <liangliangyy@gmail.com>
Date: Sat, 13 Apr 2019 22:08:46 +0800
Subject: [PATCH 2/2] =?UTF-8?q?=E5=8D=87=E7=BA=A7=E4=BE=9D=E8=B5=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 requirements.txt             |  3 ---
 travis_test/requirements.txt | 10 +++++++---
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index b0d1eeb..6daa56a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -42,9 +42,7 @@ python-memcached==1.59
 python-slugify==3.0.0
 pytz==2018.9
 raven==6.10.0
-rcssmin==1.0.6
 requests==2.21.0
-rjsmin==1.1.0
 six==1.12.0
 sqlparse==0.3.0
 text-unidecode==1.2
@@ -56,4 +54,3 @@ WeRoBot==1.8.0
 Whoosh==2.7.4
 wrapt==1.11.1
 xmltodict==0.12.0
-
diff --git a/travis_test/requirements.txt b/travis_test/requirements.txt
index 74c3113..f9e7712 100644
--- a/travis_test/requirements.txt
+++ b/travis_test/requirements.txt
@@ -12,11 +12,15 @@ django-appconf==1.0.3
 django-autoslug==1.9.4
 django-compressor==2.2
 django-debug-toolbar==1.11
+django-elasticsearch-dsl==0.5.1
 django-haystack==2.8.1
 django-ipware==2.1.0
 django-mdeditor==0.1.13
 django-uuslug==1.1.8
+elasticsearch==6.3.1
+elasticsearch-dsl==6.1.0
 idna==2.8
+ipaddress==1.0.22
 isort==4.3.15
 jieba==0.39
 jsonpickle==1.1
@@ -32,13 +36,13 @@ Pygments==2.3.1
 pylint==2.3.1
 PyMySQL==0.9.3
 pyparsing==2.3.1
+python-dateutil==2.8.0
+python-logstash==0.4.6
 python-memcached==1.59
 python-slugify==3.0.0
 pytz==2018.9
 raven==6.10.0
-rcssmin==1.0.6
 requests==2.21.0
-rjsmin==1.1.0
 six==1.12.0
 sqlparse==0.3.0
 text-unidecode==1.2
@@ -49,4 +53,4 @@ webencodings==0.5.1
 WeRoBot==1.8.0
 Whoosh==2.7.4
 wrapt==1.11.1
-xmltodict==0.12.0
+xmltodict==0.12.0
\ No newline at end of file
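
Note: the patches above add the search backend and the document mapping but do not show the settings that wire them in. Below is a minimal sketch of that wiring; the connection alias, host address, and management command are illustrative assumptions rather than part of these patches, and the ik_max_word analyzer additionally requires the elasticsearch-analysis-ik plugin to be installed on the Elasticsearch node.

    # DjangoBlog/settings.py (illustrative sketch, not included in the patches above)

    # Point django-haystack at the engine added in DjangoBlog/elasticsearch_backend.py.
    HAYSTACK_CONNECTIONS = {
        'default': {
            'ENGINE': 'DjangoBlog.elasticsearch_backend.ElasticSearchEngine',
        },
    }

    # django-elasticsearch-dsl reads the cluster location from this setting;
    # the host below assumes a local single-node cluster.
    ELASTICSEARCH_DSL = {
        'default': {
            'hosts': '127.0.0.1:9200'
        },
    }

With settings like these in place, running haystack's "python manage.py rebuild_index" goes through ElasticSearchBackend.update(), which delegates to the django_elasticsearch_dsl registry to drop, recreate, and repopulate the blog index declared in blog/documents.py.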