parent d737bee569
commit 31d6d15c69
Binary file not shown.
@@ -1,115 +1,115 @@
# Scrapy settings for ArticleSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import os
import sys

import scrapy.downloadermiddlewares.useragent

import ArticleSpider.pipelines

BOT_NAME = "ArticleSpider"

SPIDER_MODULES = ["ArticleSpider.spiders"]
NEWSPIDER_MODULE = "ArticleSpider.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Cookies are enabled (the Scrapy default); cookie debugging is turned on
COOKIES_ENABLED = True
COOKIES_DEBUG = True

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#     "Accept-Language": "en",
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     "ArticleSpider.middlewares.ArticlespiderSpiderMiddleware": 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # "ArticleSpider.middlewares.ArticlespiderDownloaderMiddleware": 543,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     "scrapy.extensions.telnet.TelnetConsole": None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'scrapy.pipelines.images.ImagesPipeline': 1,
    # 'ArticleSpider.pipelines.JsonWithEncodingPipeline': 2,
    # 'ArticleSpider.pipelines.JsonExporterPipeline': 3,
    # 'ArticleSpider.pipelines.MysqlPipeline': 4,
    # 'ArticleSpider.pipelines.MysqlTwistedPipline': 5,
    'ArticleSpider.pipelines.ElasticsearchPipeline': 6,
    'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

# Store downloaded cover images under <project>/images
# IMAGES_URLS_FIELD = 'front_image_url'
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, 'images')

# MySQL connection settings used by the MySQL pipelines
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'article_spider'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'qweasdzxc227'
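
The pipeline enabled above, ArticleSpider.pipelines.ElasticsearchPipeline, lives in the project's pipelines.py, which is not part of this diff. A minimal sketch of what such a pipeline typically looks like follows; the save_to_es() helper on the item is an assumption for illustration, not code from this commit.

# Hypothetical sketch only -- the project's real pipeline is not shown in this diff.
class ElasticsearchPipeline(object):
    # Push each scraped item into Elasticsearch via the elasticsearch_dsl
    # document type defined in the search app (see ArticleType below).
    def process_item(self, item, spider):
        item.save_to_es()  # assumed helper that fills an ArticleType doc and calls .save()
        return item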
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,3 +1,47 @@
from django.db import models

# Create your models here.
# -*- coding: utf-8 -*-
__author__ = 'bobby'

from datetime import datetime
from elasticsearch_dsl import DocType, Date, Nested, Boolean, \
    analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer

from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer

from elasticsearch_dsl.connections import connections

connections.create_connection(hosts=["localhost"])


class CustomAnalyzer(_CustomAnalyzer):
    # Return an empty analysis definition so the mapping can reference the
    # ik_max_word analyzer already installed on the Elasticsearch server
    # instead of trying to redefine it when the index is created.
    def get_analysis_definition(self):
        return {}


ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"])


class ArticleType(DocType):
    # Article document type for Jobbole (伯乐在线) posts
    suggest = Completion(analyzer=ik_analyzer)
    title = Text(analyzer="ik_max_word")
    create_date = Date()
    url = Keyword()
    url_object_id = Keyword()
    front_image_url = Keyword()
    front_image_path = Keyword()
    praise_nums = Integer()
    comment_nums = Integer()
    fav_nums = Integer()
    tags = Text(analyzer="ik_max_word")
    content = Text(analyzer="ik_max_word")

    class Meta:
        index = "jobbole"
        doc_type = "article"


if __name__ == "__main__":
    ArticleType.init()
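
For reference, a hedged usage sketch of this document type follows: the field values, the id scheme, and the way the completion suggest field is populated are illustrative assumptions, not code from this commit.

# Hypothetical usage sketch -- values and the id scheme are made up.
from datetime import datetime

from search.models import ArticleType

article = ArticleType(meta={"id": "0bbf9afeb166395298da3d8012a03025"})  # e.g. md5 of the URL
article.title = "Scrapy 爬虫入门"
article.tags = "scrapy,elasticsearch"
article.content = "..."
article.url = "http://blog.jobbole.com/110287/"
article.create_date = datetime.now()
# A completion field takes a list of suggestion inputs plus an optional weight.
article.suggest = [{"input": ["Scrapy", "爬虫"], "weight": 10}]
article.save()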
@@ -1,3 +1,88 @@
from django.shortcuts import render
from django.views.generic.base import View
from search.models import ArticleType
from django.http import HttpResponse
import json
from elasticsearch import Elasticsearch
from datetime import datetime

client = Elasticsearch(hosts=['127.0.0.1'])


# Create your views here.
class SearchSuggest(View):
    # Search-suggestion (autocomplete) endpoint
    def get(self, request):
        key_words = request.GET.get('s', '')
        re_datas = []
        if key_words:
            s = ArticleType.search()
            s = s.suggest('my_suggest', key_words, completion={
                "field": "suggest", "fuzzy": {
                    "fuzziness": 2
                },
                "size": 10
            })
            suggestions = s.execute_suggest()
            for match in suggestions.my_suggest[0].options:
                source = match._source
                re_datas.append(source["title"])
        return HttpResponse(json.dumps(re_datas), content_type="application/json")


class SearchView(View):
    def get(self, request):
        key_words = request.GET.get("q", '')
        page = request.GET.get('p', '1')
        try:
            page = int(page)
        except ValueError:
            page = 1
        start_time = datetime.now()
        response = client.search(
            index="jobbole",
            body={
                "query": {
                    "multi_match": {
                        "query": key_words,
                        "fields": ["tags", "title", "content"]
                    }
                },
                "from": (page - 1) * 10,
                "size": 10,
                "highlight": {
                    "pre_tags": ['<span class="keyWord">'],
                    "post_tags": ['</span>'],
                    "fields": {
                        "title": {},
                        "content": {},
                    }
                }
            }
        )
        end_time = datetime.now()
        last_seconds = (end_time - start_time).total_seconds()
        total_nums = response['hits']['total']
        # Round the page count up when the last page is only partially full.
        if (total_nums % 10) > 0:
            page_nums = int(total_nums / 10) + 1
        else:
            page_nums = int(total_nums / 10)
        # Build the result list, pulling each field (highlighted when available).
        hit_list = []
        for hit in response['hits']['hits']:
            hit_dict = {}
            highlight = hit.get('highlight', {})
            if 'title' in highlight:
                hit_dict['title'] = "".join(highlight['title'])
            else:
                hit_dict['title'] = hit['_source']['title']
            if 'content' in highlight:
                hit_dict['content'] = "".join(highlight['content'])[:500]
            else:
                hit_dict['content'] = hit['_source']['content'][:500]
            hit_dict["create_date"] = hit['_source']['create_date']
            hit_dict["url"] = hit['_source']['url']
            hit_dict["score"] = hit['_score']
            hit_list.append(hit_dict)
        return render(request, 'result.html',
                      {'page': page, 'total_nums': total_nums, 'all_hits': hit_list,
                       'key_words': key_words, 'page_nums': page_nums,
                       'last_seconds': last_seconds})
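
These two views still need URL routes before the front end can call them; the routing is not included in this diff. A minimal sketch, assuming current Django path() routing and these route names, might look like:

# Hypothetical urls.py wiring -- not part of this commit.
from django.urls import path

from search.views import SearchSuggest, SearchView

urlpatterns = [
    path("suggest/", SearchSuggest.as_view(), name="suggest"),  # called with ?s=<prefix>
    path("search/", SearchView.as_view(), name="search"),       # called with ?q=<keywords>&p=<page>
]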