2000300826-scrapy

master
Yearneal 2 years ago
commit fbc6918058

@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (spider)" project-jdk-type="Python SDK" />
</project>

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/spider.iml" filepath="$PROJECT_DIR$/.idea/spider.iml" />
    </modules>
  </component>
</project>

@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <excludeFolder url="file://$MODULE_DIR$/venv" />
    </content>
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>

@@ -0,0 +1,18 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class ArticleItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    author = scrapy.Field()
    source = scrapy.Field()
    keyword = scrapy.Field()
    time = scrapy.Field()
    content = scrapy.Field()
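
A scrapy.Item behaves like a dictionary restricted to its declared fields. As a minimal illustrative sketch (not part of the project files), the ArticleItem above could be filled and read like this:

# Illustrative sketch only, not a project file: scrapy.Item instances only
# accept the fields declared on the class and raise KeyError otherwise.
from homework.items import ArticleItem

item = ArticleItem()
item['title'] = 'Example title'        # declared field: allowed
item['keyword'] = 'scrapy,xpath'
print(item.get('author', ''))          # dict-style access with a default
# item['summary'] = '...'              # would raise KeyError: undeclared field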

@@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class HomeworkSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class HomeworkDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
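
These template middleware classes only take effect once they are registered in the settings; in this project both middleware settings are left commented out, so Scrapy's defaults apply. A sketch of what enabling them would look like, mirroring the commented blocks further down in homework/settings.py:

# Sketch only: adding this to homework/settings.py would activate the classes
# above. Lower order values sit closer to the engine; higher values sit closer
# to the downloader (downloader middlewares) or the spider (spider middlewares).
SPIDER_MIDDLEWARES = {
    "homework.middlewares.HomeworkSpiderMiddleware": 543,
}
DOWNLOADER_MIDDLEWARES = {
    "homework.middlewares.HomeworkDownloaderMiddleware": 543,
}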

@@ -0,0 +1,41 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import openpyxl


class HomeworkPipeline:
    # Initialization method
    def __init__(self):
        # Workbook object
        self.wb = openpyxl.Workbook()
        # Create a new worksheet
        # wb.create_sheet()
        # Get the worksheet that is active by default
        self.ws = self.wb.active
        # Rename the sheet
        self.ws.title = '采集文章数据'
        # self.ws.row_dimensions[1].height = 100
        self.ws.column_dimensions['A'].width = 80
        self.ws.append(('标题', '作者', '来源', '关键字', '时间', '正文'))

    def close_spider(self, spider):
        self.wb.save('采集结果.xlsx')

    # Dedicated to handling items:
    # called once for every item received
    def process_item(self, item, spider):
        title = item.get('title', '')
        author = item.get('author', '')
        source = item.get('source', '')
        keyword = item.get('keyword', '')
        time = item.get('time', '')
        content = item.get('content', '')
        self.ws.append((title, author, source, keyword, time, content))
        return item
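
The ItemAdapter import comes from the generated template and is unused here, since process_item relies on the Item's own dict-style .get(). If the pipeline ever had to handle plain dicts or dataclass items as well, a hedged variant of process_item could wrap the item first:

# Sketch only: ItemAdapter exposes the same .get() interface for dicts,
# scrapy.Item subclasses, dataclasses and attrs items.
def process_item(self, item, spider):
    adapter = ItemAdapter(item)
    row = tuple(adapter.get(field, '') for field in
                ('title', 'author', 'source', 'keyword', 'time', 'content'))
    self.ws.append(row)
    return item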

@@ -0,0 +1,101 @@
# Scrapy settings for homework project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "homework"
SPIDER_MODULES = ["homework.spiders"]
NEWSPIDER_MODULE = "homework.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# Be sure to spoof the User-Agent
# USER_AGENT = "homework (+http://www.yourdomain.com)"
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"

# Obey robots.txt rules
# Obeying the robots.txt protocol can optionally be turned off
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = False

# Only output log messages of ERROR level
LOG_LEVEL = 'ERROR'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "homework.middlewares.HomeworkSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "homework.middlewares.HomeworkDownloaderMiddleware": 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Pipeline configuration
ITEM_PIPELINES = {
    # Pipelines with smaller numbers run first, larger numbers run later
    "homework.pipelines.HomeworkPipeline": 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
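
Besides running scrapy crawl zhangqiaokeyan from the project root, the settings above can also be loaded programmatically; a minimal sketch, assuming the script is executed next to scrapy.cfg:

# Sketch only: run the spider from a plain Python script with these settings.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # loads homework.settings
process.crawl("zhangqiaokeyan")                   # spider name defined in spiders/
process.start()                                   # blocks until the crawl finishes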

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

@@ -0,0 +1,82 @@
import scrapy
from homework.items import ArticleItem
from scrapy import Request


class ZhangqiaokeyanSpider(scrapy.Spider):
    # Name of the spider: the unique identifier of this spider source file
    name = "zhangqiaokeyan"
    # Allowed domains restrict which URLs from the start_urls list may actually be requested
    # allowed_domains = ["www.zhangqiaokeyan.com"]
    # List of start URLs: Scrapy automatically sends requests for the URLs stored in this list
    # start_urls = ["https://www.zhangqiaokeyan.com/article_item/list-1_1/"]

    def start_requests(self):
        for page_num in range(10):
            # url = "https://www.zhangqiaokeyan.com/article_item/list-1_" + str(page_num)
            # By default the parse() callback below handles the response
            yield Request(url=f"https://www.zhangqiaokeyan.com/article/list-1_{page_num}")

    def parse(self, response):
        # print(response)
        # Parse the data with XPath
        item_list = response.xpath('//div[@class="yc_right"]/ul/li')
        for index, item in enumerate(item_list):
            divs = item.xpath('./div')
            # Work around a bug: some list entries contain only one inner div
            if len(divs) == 1:
                div = 'div[1]'
            else:
                div = 'div[2]'
            # Create the item object
            article_item = ArticleItem()
            # xpath() returns a list whose elements are Selector objects.
            # extract() pulls out the string stored in a Selector's data attribute.
            # [0] would work but can fail on empty results, so use extract_first(),
            # which returns None when there is no data.
            article_item['title'] = item.xpath('./' + div + '/a/text()').extract_first()
            # Calling extract() on the list extracts the data string of every Selector in it
            keyword = item.xpath('./' + div + '/h2/span/em/text()').extract()
            article_item['keyword'] = ','.join(keyword)
            article_item['time'] = item.xpath('./' + div + '/h2/span/time/text()').extract_first()
            # Get the detail page URL for further crawling
            detail_url = item.xpath('./' + div + '/a/@href').extract_first()
            yield Request(url='https://www.zhangqiaokeyan.com' + detail_url, callback=self.parse_detail, cb_kwargs={'item': article_item})
            # yield article_item  # generate the item and hand it over to the pipeline
    def parse_detail(self, response, **kwargs):
        base = '//div[@class="yc_atice"]/div[1]'
        # Get the data
        article_item = kwargs['item']
        line = response.xpath(base + '/p/span[1]/span[1]/text()').extract_first()
        if line is not None and line[0] == '作':
            # Line starts with "作者:" (author), possibly followed by "来源:" (source)
            info_list = list(line.split())
            article_item['author'] = info_list[0][3:]
            # The source may be missing
            if len(info_list) == 2:
                article_item['source'] = info_list[1][3:]
        elif line is not None and line[0] == '来':
            # Line starts with "来源:" (source only)
            article_item['source'] = line[3:]
        content = response.xpath(base + '/p//text()').extract()
        content_filter = [element if '\r\n\t' not in element else element.strip().replace('\r\n\t', '') for element in content]
        content_filter = [element for element in content_filter if element != '']
        content_filter = [element for element in content_filter if '\r\n' not in element]
        content_filter = ['\n' if element == ' ' or element == '\xa0' else element for element in content_filter]
        if content_filter and ('作者:' in content_filter[0] or '来源:' in content_filter[0]):
            content_filter[0] += '\n'
        article_item['content'] = ''.join(content_filter)
        print("题目:" + article_item.get('title', ''))
        print("作者:" + article_item.get('author', ''))
        print("来源:" + article_item.get('source', ''))
        print("关键字:" + article_item.get('keyword', ''))
        print("时间:" + article_item.get('time', ''))
        print()
        # print(content)
        # print(content_filter)
        # print(article_item.get('content', ''))
        yield article_item
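
The spider uses extract()/extract_first(); in current Scrapy these have the aliases getall()/get(), and get() accepts a default value, which avoids explicit None handling like the checks in parse_detail. A small self-contained sketch:

# Sketch only: get()/getall() are the modern aliases of extract_first()/extract().
from scrapy.selector import Selector

sel = Selector(text='<li><div><a href="/x">Title</a></div></li>')
print(sel.xpath('//a/text()').get())            # 'Title'   (== extract_first())
print(sel.xpath('//a/text()').getall())         # ['Title'] (== extract())
print(sel.xpath('//b/text()').get(default=''))  # ''        (no match, instead of None)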

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = homework.settings

[deploy]
#url = http://localhost:6800/
project = homework