parent d737bee569
commit 31d6d15c69
Binary file not shown.
@@ -1,115 +1,115 @@
# Scrapy settings for ArticleSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

import os
import sys

import scrapy.downloadermiddlewares.useragent

import ArticleSpider.pipelines

BOT_NAME = "ArticleSpider"

SPIDER_MODULES = ["ArticleSpider.spiders"]
NEWSPIDER_MODULE = "ArticleSpider.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = True
COOKIES_DEBUG = True

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# "ArticleSpider.middlewares.ArticlespiderSpiderMiddleware": 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # "ArticleSpider.middlewares.ArticlespiderDownloaderMiddleware": 543,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
}
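# UserAgentMiddleware fills the User-Agent header of every request from the
# USER_AGENT setting above; order 2 makes its process_request run before the
# other downloader middlewares (lower numbers run first on outgoing requests).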

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'scrapy.pipelines.images.ImagesPipeline': 1,
    # 'ArticleSpider.pipelines.JsonWithEncodingPipeline': 2,
    # 'ArticleSpider.pipelines.JsonExporterPipeline': 3,
    # 'ArticleSpider.pipelines.MysqlPipeline': 4,
    # 'ArticleSpider.pipelines.MysqlTwistedPipline': 5,
    'ArticleSpider.pipelines.ElasticsearchPipeline': 6,
    'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
}
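# Pipelines run in ascending order of their value, so each item passes through
# ElasticsearchPipeline (6) before ArticlespiderPipeline (300).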

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

# IMAGES_URLS_FIELD = 'front_image_url'
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, 'images')

MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'article_spider'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'qweasdzxc227'
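# A minimal sketch, assuming the commented-out MysqlTwistedPipline in
# ITEM_PIPELINES above consumes the MYSQL_* settings through Scrapy's
# from_settings hook and Twisted's adbapi pool (the real implementation lives
# in ArticleSpider/pipelines.py and may differ):
#
#     import pymysql
#     from twisted.enterprise import adbapi
#
#     class MysqlTwistedPipline(object):
#         def __init__(self, dbpool):
#             self.dbpool = dbpool
#
#         @classmethod
#         def from_settings(cls, settings):
#             dbpool = adbapi.ConnectionPool(
#                 "pymysql",
#                 host=settings["MYSQL_HOST"],
#                 db=settings["MYSQL_DBNAME"],
#                 user=settings["MYSQL_USER"],
#                 passwd=settings["MYSQL_PASSWORD"],
#                 charset="utf8",
#                 cursorclass=pymysql.cursors.DictCursor,
#             )
#             return cls(dbpool)
#
#         def process_item(self, item, spider):
#             # do_insert would build and run the INSERT for this item
#             self.dbpool.runInteraction(self.do_insert, item)
#             return item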
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,3 +1,47 @@
from django.db import models

# Create your models here.
# -*- coding: utf-8 -*-
__author__ = 'bobby'

from datetime import datetime
from elasticsearch_dsl import DocType, Date, Nested, Boolean, \
    analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer

from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer

from elasticsearch_dsl.connections import connections

connections.create_connection(hosts=["localhost"])


class CustomAnalyzer(_CustomAnalyzer):
    def get_analysis_definition(self):
        return {}


ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"])
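# Returning an empty analysis definition keeps elasticsearch_dsl from trying to
# (re)define "ik_max_word" in the index settings when ArticleType.init() runs;
# the analyzer itself is expected to be provided by the Elasticsearch IK plugin
# on the server.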


class ArticleType(DocType):
    # Jobbole article document type
    suggest = Completion(analyzer=ik_analyzer)
    title = Text(analyzer="ik_max_word")
    create_date = Date()
    url = Keyword()
    url_object_id = Keyword()
    front_image_url = Keyword()
    front_image_path = Keyword()
    praise_nums = Integer()
    comment_nums = Integer()
    fav_nums = Integer()
    tags = Text(analyzer="ik_max_word")
    content = Text(analyzer="ik_max_word")

    class Meta:
        index = "jobbole"
        doc_type = "article"


if __name__ == "__main__":
    ArticleType.init()
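# Running this module directly creates the "jobbole" index and its mapping via
# ArticleType.init(). A minimal usage sketch (an assumption about how the
# ElasticsearchPipeline enabled in settings.py might store a scraped item; the
# field names mirror the document definition above):
#
#     article = ArticleType()
#     article.title = item["title"]
#     article.content = item["content"]
#     article.meta.id = item["url_object_id"]
#     article.save()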
@@ -1,3 +1,88 @@
from django.shortcuts import render
from django.views.generic.base import View
from search.models import ArticleType
from django.http import HttpResponse
import json
from elasticsearch import Elasticsearch
from datetime import datetime

client = Elasticsearch(hosts=['127.0.0.1'])


# Create your views here.
class SearchSuggest(View):
    # Search suggestion endpoint
    def get(self, request):
        key_words = request.GET.get('s', '')
        re_datas = []
        if key_words:
            s = ArticleType.search()
            s = s.suggest('my_suggest', key_words, completion={
                "field": "suggest", "fuzzy": {
                    "fuzziness": 2
                },
                "size": 10
            })
            suggestions = s.execute_suggest()
            for match in suggestions.my_suggest[0].options:
                source = match._source
                re_datas.append(source["title"])
        return HttpResponse(json.dumps(re_datas), content_type="application/json")
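# Note: execute_suggest() is the elasticsearch_dsl 5.x API; in newer releases the
# suggester is typically run with s.execute() and the options are read from
# response.suggest.my_suggest[0].options instead.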


class SearchView(View):
    def get(self, request):
        key_words = request.GET.get("q", '')
        page = request.GET.get('p', '1')
        try:
            page = int(page)
        except (TypeError, ValueError):
            page = 1
        start_time = datetime.now()
        response = client.search(
            index="jobbole",
            body={
                "query": {
                    "multi_match": {
                        "query": key_words,
                        "fields": ["tags", "title", "content"]
                    }
                },
                "from": (page - 1) * 10,
                "size": 10,
                "highlight": {
                    "pre_tags": ['<span class="keyWord">'],
                    "post_tags": ['</span>'],
                    "fields": {
                        "title": {},
                        "content": {},
                    }
                }
            }
        )
        end_time = datetime.now()
        last_seconds = (end_time - start_time).total_seconds()
        total_nums = response['hits']['total']
        if (total_nums % 10) > 0:
            page_nums = int(total_nums / 10) + 1
        else:
            page_nums = int(total_nums / 10)
        # Build the display fields for every hit, preferring highlighted fragments
        hit_list = []
        for hit in response['hits']['hits']:
            hit_dict = {}
            highlight = hit.get('highlight', {})
            if 'title' in highlight:
                hit_dict['title'] = "".join(highlight['title'])
            else:
                hit_dict['title'] = hit['_source']['title']
            if 'content' in highlight:
                hit_dict['content'] = "".join(highlight['content'])[:500]
            else:
                hit_dict['content'] = hit['_source']['content'][:500]
            hit_dict["create_date"] = hit['_source']['create_date']
            hit_dict["url"] = hit['_source']['url']
            hit_dict["score"] = hit['_score']
            hit_list.append(hit_dict)
        return render(request, 'result.html',
                      {'page': page, 'total_nums': total_nums, 'all_hits': hit_list,
                       'key_words': key_words, 'page_nums': page_nums,
                       'last_seconds': last_seconds})
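# Compatibility note: with Elasticsearch 7+ clients, response['hits']['total'] is
# a dict such as {"value": 42, "relation": "eq"}, so total_nums would have to read
# response['hits']['total']['value']; the code above assumes a pre-7 cluster where
# the total is a plain integer.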