diff --git a/ArticleSpider/ArticleSpider/__pycache__/settings.cpython-39.pyc b/ArticleSpider/ArticleSpider/__pycache__/settings.cpython-39.pyc
index 22bc2d0..b08b487 100644
Binary files a/ArticleSpider/ArticleSpider/__pycache__/settings.cpython-39.pyc and b/ArticleSpider/ArticleSpider/__pycache__/settings.cpython-39.pyc differ
diff --git a/ArticleSpider/ArticleSpider/settings.py b/ArticleSpider/ArticleSpider/settings.py
index 48bdb61..abdc2b1 100644
--- a/ArticleSpider/ArticleSpider/settings.py
+++ b/ArticleSpider/ArticleSpider/settings.py
@@ -1,115 +1,115 @@
-# Scrapy settings for ArticleSpider project
-#
-# For simplicity, this file contains only settings considered important or
-# commonly used. You can find more settings consulting the documentation:
-#
-#     https://docs.scrapy.org/en/latest/topics/settings.html
-#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
-#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-import os
-import sys
-
-import scrapy.downloadermiddlewares.useragent
-
-import ArticleSpider.pipelines
-
-BOT_NAME = "ArticleSpider"
-
-SPIDER_MODULES = ["ArticleSpider.spiders"]
-NEWSPIDER_MODULE = "ArticleSpider.spiders"
-
-# Crawl responsibly by identifying yourself (and your website) on the user-agent
-USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
-
-# Obey robots.txt rules
-ROBOTSTXT_OBEY = False
-
-# Configure maximum concurrent requests performed by Scrapy (default: 16)
-# CONCURRENT_REQUESTS = 32
-
-# Configure a delay for requests for the same website (default: 0)
-# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
-# See also autothrottle settings and docs
-# DOWNLOAD_DELAY = 3
-# The download delay setting will honor only one of:
-# CONCURRENT_REQUESTS_PER_DOMAIN = 16
-# CONCURRENT_REQUESTS_PER_IP = 16
-
-# Disable cookies (enabled by default)
-COOKIES_ENABLED = True
-COOKIES_DEBUG = True
-
-# Disable Telnet Console (enabled by default)
-# TELNETCONSOLE_ENABLED = False
-
-# Override the default request headers:
-# DEFAULT_REQUEST_HEADERS = {
-#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-#    "Accept-Language": "en",
-# }
-
-# Enable or disable spider middlewares
-# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-# SPIDER_MIDDLEWARES = {
-#    "ArticleSpider.middlewares.ArticlespiderSpiderMiddleware": 543,
-# }
-
-# Enable or disable downloader middlewares
-# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
-DOWNLOADER_MIDDLEWARES = {
-    # "ArticleSpider.middlewares.ArticlespiderDownloaderMiddleware": 543,
-    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
-}
-
-# Enable or disable extensions
-# See https://docs.scrapy.org/en/latest/topics/extensions.html
-# EXTENSIONS = {
-#    "scrapy.extensions.telnet.TelnetConsole": None,
-# }
-
-# Configure item pipelines
-# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-ITEM_PIPELINES = {
-    # 'scrapy.pipelines.images.ImagesPipeline': 1,
-    # 'ArticleSpider.pipelines.JsonWithEncodingPipeline': 2,
-    # 'ArticleSpider.pipelines.JsonExporterPipeline': 3,
-    # 'ArticleSpider.pipelines.MysqlPipeline': 4,
-    # 'ArticleSpider.pipelines.MysqlTwistedPipline': 5,
-    'ArticleSpider.pipelines.ElasticsearchPipeline': 6,
-    'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
-}
-
-# Enable and configure the AutoThrottle extension (disabled by default)
-# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
-# AUTOTHROTTLE_ENABLED = True
-# The initial download delay
-# AUTOTHROTTLE_START_DELAY = 5
-# The maximum download delay to be set in case of high latencies
-# AUTOTHROTTLE_MAX_DELAY = 60
-# The average number of requests Scrapy should be sending in parallel to
-# each remote server
-# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
-# Enable showing throttling stats for every response received:
-# AUTOTHROTTLE_DEBUG = False
-
-# Enable and configure HTTP caching (disabled by default)
-# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-# HTTPCACHE_ENABLED = True
-# HTTPCACHE_EXPIRATION_SECS = 0
-# HTTPCACHE_DIR = "httpcache"
-# HTTPCACHE_IGNORE_HTTP_CODES = []
-# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
-
-# Set settings whose default value is deprecated to a future-proof value
-REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
-TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
-FEED_EXPORT_ENCODING = "utf-8"
-
-# IMAGES_URLS_FIELD = 'front_image_url'
-project_dir = os.path.abspath(os.path.dirname(__file__))
-IMAGES_STORE = os.path.join(project_dir, 'images')
-
-MYSQL_HOST = '127.0.0.1'
-MYSQL_DBNAME = 'article_spider'
-MYSQL_USER = 'root'
+# Scrapy settings for ArticleSpider project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://docs.scrapy.org/en/latest/topics/settings.html
+#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+import os
+import sys
+
+import scrapy.downloadermiddlewares.useragent
+
+import ArticleSpider.pipelines
+
+BOT_NAME = "ArticleSpider"
+
+SPIDER_MODULES = ["ArticleSpider.spiders"]
+NEWSPIDER_MODULE = "ArticleSpider.spiders"
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = False
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+# CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+# DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+# CONCURRENT_REQUESTS_PER_DOMAIN = 16
+# CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+COOKIES_ENABLED = True
+COOKIES_DEBUG = True
+
+# Disable Telnet Console (enabled by default)
+# TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+# DEFAULT_REQUEST_HEADERS = {
+#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+#    "Accept-Language": "en",
+# }
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+# SPIDER_MIDDLEWARES = {
+#    "ArticleSpider.middlewares.ArticlespiderSpiderMiddleware": 543,
+# }
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+DOWNLOADER_MIDDLEWARES = {
+    # "ArticleSpider.middlewares.ArticlespiderDownloaderMiddleware": 543,
+    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
+}
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+# EXTENSIONS = {
+#    "scrapy.extensions.telnet.TelnetConsole": None,
+# }
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+    # 'scrapy.pipelines.images.ImagesPipeline': 1,
+    # 'ArticleSpider.pipelines.JsonWithEncodingPipeline': 2,
+    # 'ArticleSpider.pipelines.JsonExporterPipeline': 3,
+    # 'ArticleSpider.pipelines.MysqlPipeline': 4,
+    # 'ArticleSpider.pipelines.MysqlTwistedPipline': 5,
+    'ArticleSpider.pipelines.ElasticsearchPipeline': 6,
+    'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
+}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+# AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+# AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+# AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+# AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
+# HTTPCACHE_DIR = "httpcache"
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+
+# Set settings whose default value is deprecated to a future-proof value
+REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
+TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
+FEED_EXPORT_ENCODING = "utf-8"
+
+# IMAGES_URLS_FIELD = 'front_image_url'
+project_dir = os.path.abspath(os.path.dirname(__file__))
+IMAGES_STORE = os.path.join(project_dir, 'images')
+
+MYSQL_HOST = '127.0.0.1'
+MYSQL_DBNAME = 'article_spider'
+MYSQL_USER = 'root'
 MYSQL_PASSWORD = 'qweasdzxc227'
\ No newline at end of file
diff --git a/ArticleSpider/ArticleSpider/spiders/__pycache__/jobbole.cpython-39.pyc b/ArticleSpider/ArticleSpider/spiders/__pycache__/jobbole.cpython-39.pyc
index b0ada79..639c1c6 100644
Binary files a/ArticleSpider/ArticleSpider/spiders/__pycache__/jobbole.cpython-39.pyc and b/ArticleSpider/ArticleSpider/spiders/__pycache__/jobbole.cpython-39.pyc differ
diff --git a/ArticleSpider/ArticleSpider/spiders/jobbole.py b/ArticleSpider/ArticleSpider/spiders/jobbole.py
index 3bb86bc..fa0fcc5 100644
--- a/ArticleSpider/ArticleSpider/spiders/jobbole.py
+++ b/ArticleSpider/ArticleSpider/spiders/jobbole.py
@@ -1,128 +1,87 @@
-import json
-import re
-import os
-import requests
-import scrapy
-import pickle
-import datetime
-from scrapy.http import Request
-from urllib import parse
-from scrapy.loader import ItemLoader
-from ArticleSpider.items import ArticleItemLoader
-from ArticleSpider.items import JobBoleArticleItem
-from ArticleSpider.utils import common
-from ArticleSpider.utils.common import get_md5
-from scrapy import signals
-import time
-from selenium import webdriver
-from scrapy.loader import ItemLoader
-
-
-class JobboleSpider(scrapy.Spider):
-    name = "jobbole"
-    allowed_domains = ["news.cnblogs.com"]
-    start_urls = ["http://news.cnblogs.com/"]
-
-    def start_requests(self):
-        cookies = []
-        if os.path.exists(r'C:\Users\10955\ArticleSpider\cookies\jobbole.cookie'):
-            cookies = pickle.load(open(r'C:\Users\10955\ArticleSpider\cookies\jobbole.cookie', 'rb'))
-        if not cookies:
-            driver = webdriver.Chrome()
-            driver.implicitly_wait(10)
-            # open the login page
-            driver.get('https://account.cnblogs.com/signin')
-            # hide navigator.webdriver so the CAPTCHA click is not rejected
-            driver.execute_script("Object.defineProperties(navigator,{webdriver:{get:()=>undefined}})")
-            # enter the account name
-            driver.find_element_by_id('mat-input-0').send_keys('包包1')
-            # enter the password
-            driver.find_element_by_id('mat-input-1').send_keys('qweasdzxc227')
-            # click the login button
-            driver.find_element_by_css_selector('.mat-button-wrapper').click()
-            # click the CAPTCHA
-            driver.find_element_by_xpath('//*[@id="Shape3"]').click()
-            time.sleep(5)
-            cookies = driver.get_cookies()
-            pickle.dump(cookies, open(r'C:\Users\10955\ArticleSpider\cookies\jobbole.cookie', 'wb'))
-        cookie_dict = {}
-        for cookie in cookies:
-            cookie_dict[cookie['name']] = cookie['value']
-        for url in self.start_urls:
-            yield scrapy.Request(url, dont_filter=True, cookies=cookie_dict)
-        # cookie_dict = {cookie['name']: cookie['value'] for cookie in cookies}
-        # print(cookies)
-        # print(cookie_dict)
-        # yield scrapy.Request(url='https://account.cnblogs.com/signin', callback=self.parse, cookies=cookie_dict)
-
-    def parse(self, response):
-        # 1. Extract the article URLs from the news list page, hand them to Scrapy for download, then call the matching parse callback
-        # extract the article link; extract_first() returns the first value
-        post_nodes = response.css('#news_list .news_block')[:1]
-        for post_node in post_nodes:
-            image_url = "https:" + post_node.css('.entry_summary a img::attr(src)').extract_first("")
-            post_url = post_node.css('h2 a::attr(href)').extract_first("")
-            yield Request(url=parse.urljoin(response.url, post_url), meta={'front_image_url': image_url}, callback=self.parse_detail, dont_filter=True)
-        # 2. Extract the next-page URL and hand it to Scrapy; once downloaded, parse follows it up
-        # next_url = response.css('div.pager a:last-child::attr(href)').extract_first("")
-        # yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
-
-    def parse_detail(self, response):
-        match_re = re.match(".*?(\d+)", response.url)
-        if match_re:
-            post_id = match_re.group(1)
-            # article_item = JobBoleArticleItem()
-            # title = response.css('#news_title a::text').extract_first("")
-            # create_date = response.css('#news_info .time::text').extract_first("")
-            # match_re = re.match('.*?(\d+.*)', create_date)
-            # if match_re:
-            #     create_date = match_re.group(1)
-            # # create_date = response.xpath('//*[@id="news_info"]//*[@class="time"]/text()').extract_first("")
-            #
-            # content = response.css('#news_content').extract()[0]
-            # tag_list = response.css('.news_tags a::text').extract()
-            # tags = ','.join(tag_list)
-            # article_item['title'] = title
-            # article_item['create_date'] = create_date
-            # article_item['content'] = content
-            # article_item['tags'] = tags
-            # article_item['url'] = response.url
-            # if response.meta.get('front_image_url', ""):
-            #     article_item['front_image_url'] = [response.meta.get('front_image_url', "")]
-            # else:
-            #     article_item['front_image_url'] = []
-            item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
-            item_loader.add_css('title', '#news_title a::text')
-            item_loader.add_css('content', '#news_content')
-            item_loader.add_css('tags', '.news_tags a::text')
-            item_loader.add_css('create_date', '#news_info .time::text')
-            item_loader.add_value('url', response.url)
-            item_loader.add_value('front_image_url', response.meta.get('front_image_url', ''))
-            # article_item = item_loader.load_item()
-            # if response.meta.get('front_image_url', ""):
-            #     article_item['front_image_url'] = [response.meta.get('front_image_url', "")]
-            # else:
-            #     article_item['front_image_url'] = []
-            yield Request(url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
-                          meta={'article_item': item_loader, 'url':response.url}, callback=self.parse_nums)
-            # praise_nums = j_data['DiggCount']
-            # fav_nums = j_data['TotalView']
-            # comment_nums = j_data['CommentCount']
-            # pass
-
-    def parse_nums(self, response):
-        j_data = json.loads(response.text)
-        item_loader = response.meta.get('article_item', "")
-        # praise_nums = j_data['DiggCount']
-        # fav_nums = j_data['TotalView']
-        # comment_nums = j_data['CommentCount']
-        item_loader.add_value('praise_nums', j_data['DiggCount'])
-        item_loader.add_value('fav_nums', j_data['TotalView'])
-        item_loader.add_value('comment_nums', j_data['CommentCount'])
-        item_loader.add_value('url_object_id', common.get_md5(response.meta.get('url', '')))
-        # article_item['praise_nums'] = praise_nums
-        # article_item['fav_nums'] = fav_nums
-        # article_item['comment_nums'] = comment_nums
-        # article_item['url_object_id'] = common.get_md5(article_item['url'])
-        article_item = item_loader.load_item()
-        yield article_item
+import json
+import re
+import os
+import requests
+import scrapy
+import pickle
+import datetime
+from scrapy.http import Request
+from urllib import parse
+from scrapy.loader import ItemLoader
+from ArticleSpider.items import ArticleItemLoader
+from ArticleSpider.items import JobBoleArticleItem
+from ArticleSpider.utils import common
+from ArticleSpider.utils.common import get_md5
+from scrapy import signals
+import time
+from selenium import webdriver
+from scrapy.loader import ItemLoader
+
+
+class JobboleSpider(scrapy.Spider):
+    name = "jobbole"
+    allowed_domains = ["news.cnblogs.com"]
+    start_urls = ["http://news.cnblogs.com/"]
+
+    def start_requests(self):
+        cookies = []
+        if os.path.exists(r'C:\Users\10955\ArticleSpider\cookies\jobbole.cookie'):
+            cookies = pickle.load(open(r'C:\Users\10955\ArticleSpider\cookies\jobbole.cookie', 'rb'))
+        if not cookies:
+            driver = webdriver.Chrome()
+            driver.implicitly_wait(10)
+            # open the login page
+            driver.get('https://account.cnblogs.com/signin')
+            # hide navigator.webdriver so the CAPTCHA click is not rejected
+            driver.execute_script("Object.defineProperties(navigator,{webdriver:{get:()=>undefined}})")
+            # enter the account name
+            driver.find_element_by_id('mat-input-0').send_keys('包包1')
+            # enter the password
+            driver.find_element_by_id('mat-input-1').send_keys('qweasdzxc227')
+            # click the login button
+            driver.find_element_by_css_selector('.mat-button-wrapper').click()
+            # click the CAPTCHA
+            driver.find_element_by_xpath('//*[@id="Shape3"]').click()
+            time.sleep(5)
+            cookies = driver.get_cookies()
+            pickle.dump(cookies, open(r'C:\Users\10955\ArticleSpider\cookies\jobbole.cookie', 'wb'))
+        cookie_dict = {}
+        for cookie in cookies:
+            cookie_dict[cookie['name']] = cookie['value']
+        for url in self.start_urls:
+            yield scrapy.Request(url, dont_filter=True, cookies=cookie_dict)
+    def parse(self, response):
+        # 1. Extract the article URLs from the news list page, hand them to Scrapy for download, then call the matching parse callback
+        # extract the article link; extract_first() returns the first value
+        post_nodes = response.css('#news_list .news_block')[:100]
+        for post_node in post_nodes:
+            image_url = "https:" + post_node.css('.entry_summary a img::attr(src)').extract_first("")
+            post_url = post_node.css('h2 a::attr(href)').extract_first("")
+            yield Request(url=parse.urljoin(response.url, post_url), meta={'front_image_url': image_url}, callback=self.parse_detail, dont_filter=True)
+        # 2. Extract the next-page URL and hand it to Scrapy; once downloaded, parse follows it up
+        next_url = response.css('div.pager a:last-child::attr(href)').extract_first("")
+        yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
+
+    def parse_detail(self, response):
+        match_re = re.match(".*?(\d+)", response.url)
+        if match_re:
+            post_id = match_re.group(1)
+            item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
+            item_loader.add_css('title', '#news_title a::text')
+            item_loader.add_css('content', '#news_content')
+            item_loader.add_css('tags', '.news_tags a::text')
+            item_loader.add_css('create_date', '#news_info .time::text')
+            item_loader.add_value('url', response.url)
+            item_loader.add_value('front_image_url', response.meta.get('front_image_url', ''))
+            yield Request(url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
+                          meta={'article_item': item_loader, 'url':response.url}, callback=self.parse_nums)
+
+    def parse_nums(self, response):
+        j_data = json.loads(response.text)
+        item_loader = response.meta.get('article_item', "")
+        item_loader.add_value('praise_nums', j_data['DiggCount'])
+        item_loader.add_value('fav_nums', j_data['TotalView'])
+        item_loader.add_value('comment_nums', j_data['CommentCount'])
+        item_loader.add_value('url_object_id', common.get_md5(response.meta.get('url', '')))
+        article_item = item_loader.load_item()
+        yield article_item
diff --git a/LcvSearch/LcvSearch/__pycache__/urls.cpython-39.pyc b/LcvSearch/LcvSearch/__pycache__/urls.cpython-39.pyc
index fd4412f..8fa5b59 100644
Binary files a/LcvSearch/LcvSearch/__pycache__/urls.cpython-39.pyc and b/LcvSearch/LcvSearch/__pycache__/urls.cpython-39.pyc differ
diff --git a/LcvSearch/LcvSearch/urls.py b/LcvSearch/LcvSearch/urls.py
index 30e2b8a..3a20af3 100644
--- a/LcvSearch/LcvSearch/urls.py
+++ b/LcvSearch/LcvSearch/urls.py
@@ -17,8 +17,11 @@ Including another URLconf
 from django.contrib import admin
 from django.urls import path
 from django.views.generic import TemplateView
+from search.views import SearchSuggest,SearchView
 
 urlpatterns = [
     path('admin/', admin.site.urls),
     path('', TemplateView.as_view(template_name='index.html'), name='index'),
+    path('suggest/', SearchSuggest.as_view(), name='suggest'),
+    path('search/', SearchView.as_view(), name='search'),
 ]
diff --git a/LcvSearch/search/__pycache__/models.cpython-39.pyc b/LcvSearch/search/__pycache__/models.cpython-39.pyc
index 4a1703a..e631c0b 100644
Binary files a/LcvSearch/search/__pycache__/models.cpython-39.pyc and b/LcvSearch/search/__pycache__/models.cpython-39.pyc differ
diff --git a/LcvSearch/search/__pycache__/views.cpython-39.pyc b/LcvSearch/search/__pycache__/views.cpython-39.pyc
new file mode 100644
index 0000000..4bfade8
Binary files /dev/null and b/LcvSearch/search/__pycache__/views.cpython-39.pyc differ
diff --git a/LcvSearch/search/models.py b/LcvSearch/search/models.py
index fd18c6e..df9c9c7 100644
--- a/LcvSearch/search/models.py
+++ b/LcvSearch/search/models.py
@@ -1,3 +1,47 @@
 from django.db import models
 
 # Create your models here.
+# -*- coding: utf-8 -*-
+__author__ = 'bobby'
+
+from datetime import datetime
+from elasticsearch_dsl import DocType, Date, Nested, Boolean, \
+    analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer
+
+from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer
+
+from elasticsearch_dsl.connections import connections
+
+connections.create_connection(hosts=["localhost"])
+
+
+class CustomAnalyzer(_CustomAnalyzer):
+    def get_analysis_definition(self):
+        return {}
+
+
+ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"])
+
+
+class ArticleType(DocType):
+    # Jobbole article type
+    suggest = Completion(analyzer=ik_analyzer)
+    title = Text(analyzer="ik_max_word")
+    create_date = Date()
+    url = Keyword()
+    url_object_id = Keyword()
+    front_image_url = Keyword()
+    front_image_path = Keyword()
+    praise_nums = Integer()
+    comment_nums = Integer()
+    fav_nums = Integer()
+    tags = Text(analyzer="ik_max_word")
+    content = Text(analyzer="ik_max_word")
+
+    class Meta:
+        index = "jobbole"
+        doc_type = "article"
+
+
+if __name__ == "__main__":
+    ArticleType.init()
diff --git a/LcvSearch/search/views.py b/LcvSearch/search/views.py
index c60c790..8c31c11 100644
--- a/LcvSearch/search/views.py
+++ b/LcvSearch/search/views.py
@@ -1,3 +1,88 @@
 from django.shortcuts import render
+from django.views.generic.base import View
+from search.models import ArticleType
+from django.http import HttpResponse
+import json
+from elasticsearch import Elasticsearch
+from datetime import datetime
+
+client = Elasticsearch(hosts=['127.0.0.1'])
+
 
 # Create your views here.
+class SearchSuggest(View):
+    # search suggestion module
+    def get(self, request):
+        key_words = request.GET.get('s', '')
+        re_datas = []
+        if key_words:
+            s = ArticleType.search()
+            s = s.suggest('my_suggest', key_words, completion={
+                "field": "suggest", "fuzzy": {
+                    "fuzziness": 2
+                },
+                "size": 10
+            })
+            suggestions = s.execute_suggest()
+            for match in suggestions.my_suggest[0].options:
+                source = match._source
+                re_datas.append(source["title"])
+        return HttpResponse(json.dumps(re_datas), content_type="application/json")
+
+
+class SearchView(View):
+    def get(self, request):
+        key_words = request.GET.get("q", '')
+        page = request.GET.get('p', '1')
+        try:
+            page = int(page)
+        except:
+            page = 1
+        start_time = datetime.now()
+        response = client.search(
+            index="jobbole",
+            body={
+                "query": {
+                    "multi_match": {
+                        "query": key_words,
+                        "fields": ["tags", "title", "content"]
+                    }
+                },
+                "from": (page - 1) * 10,
+                "size": 10,
+                "highlight": {
+                    "pre_tags": [''],
+                    "post_tags": [''],
+                    "fields": {
+                        "title": {},
+                        "content": {},
+                    }
+                }
+            }
+        )
+        end_time = datetime.now()
+        last_seconds = (end_time - start_time).total_seconds()
+        total_nums = response['hits']['total']
+        if (page % 10) > 0:
+            page_nums = int(total_nums / 10) + 1
+        else:
+            page_nums = int(total_nums / 10)
+        # build the result list, pulling the value of each field
+        hit_list = []
+        for hit in response['hits']['hits']:
+            hit_dict = {}
+            if 'title' in hit['highlight']:
+                hit_dict['title'] = "".join(hit['highlight']['title'])
+            else:
+                hit_dict['title'] = hit['_source']['title']
+            if 'content' in hit['highlight']:
+                hit_dict['content'] = "".join(hit['highlight']['content'])[:500]
+            else:
+                hit_dict['content'] = hit['_source']['content'][:500]
+            hit_dict["create_date"] = hit['_source']['create_date']
+            hit_dict["url"] = hit['_source']['url']
+            hit_dict["score"] = hit['_score']
+            hit_list.append(hit_dict)
+        return render(request, 'result.html',
+                      {'page': page, 'total_nums': total_nums, 'all_hits': hit_list, 'key_words': key_words,
+                       'page_nums': page_nums, 'total_nums': total_nums,'last_seconds':last_seconds})
diff --git a/LcvSearch/templates/index.html b/LcvSearch/templates/index.html
index 42d3a1c..8b08c57 100644
--- a/LcvSearch/templates/index.html
+++ b/LcvSearch/templates/index.html
@@ -66,8 +66,8 @@