2000300826-scrapy

master
Yearneal 2 years ago
commit fbc6918058

@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (spider)" project-jdk-type="Python SDK" />
</project>

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/spider.iml" filepath="$PROJECT_DIR$/.idea/spider.iml" />
    </modules>
  </component>
</project>

@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <excludeFolder url="file://$MODULE_DIR$/venv" />
    </content>
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>

@@ -0,0 +1,18 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class ArticleItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    author = scrapy.Field()
    source = scrapy.Field()
    keyword = scrapy.Field()
    time = scrapy.Field()
    content = scrapy.Field()
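
A scrapy.Item behaves like a dictionary restricted to its declared fields. As a minimal illustrative sketch (not part of the project files), the ArticleItem above could be filled and read like this:

# Illustrative sketch only, not a project file: scrapy.Item instances only
# accept the fields declared on the class and raise KeyError otherwise.
from homework.items import ArticleItem

item = ArticleItem()
item['title'] = 'Example title'        # declared field: allowed
item['keyword'] = 'scrapy,xpath'
print(item.get('author', ''))          # dict-style access with a default
# item['summary'] = '...'              # would raise KeyError: undeclared field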

@@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class HomeworkSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class HomeworkDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
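
These template middleware classes only take effect once they are registered in the settings; in this project both middleware settings are left commented out, so Scrapy's defaults apply. A sketch of what enabling them would look like, mirroring the commented blocks further down in homework/settings.py:

# Sketch only: adding this to homework/settings.py would activate the classes
# above. Lower order values sit closer to the engine; higher values sit closer
# to the downloader (downloader middlewares) or the spider (spider middlewares).
SPIDER_MIDDLEWARES = {
    "homework.middlewares.HomeworkSpiderMiddleware": 543,
}
DOWNLOADER_MIDDLEWARES = {
    "homework.middlewares.HomeworkDownloaderMiddleware": 543,
}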

@@ -0,0 +1,41 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import openpyxl


class HomeworkPipeline:
    # Initialization method
    def __init__(self):
        # Workbook object
        self.wb = openpyxl.Workbook()
        # Create a new worksheet
        # wb.create_sheet()
        # Get the worksheet that is active by default
        self.ws = self.wb.active
        # Rename the sheet
        self.ws.title = '采集文章数据'
        # self.ws.row_dimensions[1].height = 100
        self.ws.column_dimensions['A'].width = 80
        self.ws.append(('标题', '作者', '来源', '关键字', '时间', '正文'))

    def close_spider(self, spider):
        self.wb.save('采集结果.xlsx')

    # Dedicated to handling items:
    # called once for every item received
    def process_item(self, item, spider):
        title = item.get('title', '')
        author = item.get('author', '')
        source = item.get('source', '')
        keyword = item.get('keyword', '')
        time = item.get('time', '')
        content = item.get('content', '')
        self.ws.append((title, author, source, keyword, time, content))
        return item
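
The ItemAdapter import comes from the generated template and is unused here, since process_item relies on the Item's own dict-style .get(). If the pipeline ever had to handle plain dicts or dataclass items as well, a hedged variant of process_item could wrap the item first:

# Sketch only: ItemAdapter exposes the same .get() interface for dicts,
# scrapy.Item subclasses, dataclasses and attrs items.
def process_item(self, item, spider):
    adapter = ItemAdapter(item)
    row = tuple(adapter.get(field, '') for field in
                ('title', 'author', 'source', 'keyword', 'time', 'content'))
    self.ws.append(row)
    return item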

@@ -0,0 +1,101 @@
# Scrapy settings for homework project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "homework"
SPIDER_MODULES = ["homework.spiders"]
NEWSPIDER_MODULE = "homework.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# Be sure to spoof the User-Agent
# USER_AGENT = "homework (+http://www.yourdomain.com)"
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"

# Obey robots.txt rules
# Obeying the robots.txt protocol can optionally be turned off
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = False

# Only output log messages of ERROR level
LOG_LEVEL = 'ERROR'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "homework.middlewares.HomeworkSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "homework.middlewares.HomeworkDownloaderMiddleware": 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Pipeline configuration
ITEM_PIPELINES = {
    # Pipelines with smaller numbers run first, larger numbers run later
    "homework.pipelines.HomeworkPipeline": 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
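
Besides running scrapy crawl zhangqiaokeyan from the project root, the settings above can also be loaded programmatically; a minimal sketch, assuming the script is executed next to scrapy.cfg:

# Sketch only: run the spider from a plain Python script with these settings.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # loads homework.settings
process.crawl("zhangqiaokeyan")                   # spider name defined in spiders/
process.start()                                   # blocks until the crawl finishes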

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

@@ -0,0 +1,82 @@
import scrapy
from homework.items import ArticleItem
from scrapy import Request


class ZhangqiaokeyanSpider(scrapy.Spider):
    # Name of the spider: the unique identifier of this spider source file
    name = "zhangqiaokeyan"
    # Allowed domains restrict which URLs from the start_urls list may actually be requested
    # allowed_domains = ["www.zhangqiaokeyan.com"]
    # List of start URLs: Scrapy automatically sends requests for the URLs stored in this list
    # start_urls = ["https://www.zhangqiaokeyan.com/article_item/list-1_1/"]

    def start_requests(self):
        for page_num in range(10):
            # url = "https://www.zhangqiaokeyan.com/article_item/list-1_" + str(page_num)
            # By default the parse() callback below handles the response
            yield Request(url=f"https://www.zhangqiaokeyan.com/article/list-1_{page_num}")

    def parse(self, response):
        # print(response)
        # Parse the data with XPath
        item_list = response.xpath('//div[@class="yc_right"]/ul/li')
        for index, item in enumerate(item_list):
            divs = item.xpath('./div')
            # Work around a bug: some list entries contain only one inner div
            if len(divs) == 1:
                div = 'div[1]'
            else:
                div = 'div[2]'
            # Create the item object
            article_item = ArticleItem()
            # xpath() returns a list whose elements are Selector objects.
            # extract() pulls out the string stored in a Selector's data attribute.
            # [0] would work but can fail on empty results, so use extract_first(),
            # which returns None when there is no data.
            article_item['title'] = item.xpath('./' + div + '/a/text()').extract_first()
            # Calling extract() on the list extracts the data string of every Selector in it
            keyword = item.xpath('./' + div + '/h2/span/em/text()').extract()
            article_item['keyword'] = ','.join(keyword)
            article_item['time'] = item.xpath('./' + div + '/h2/span/time/text()').extract_first()
            # Get the detail page URL for further crawling
            detail_url = item.xpath('./' + div + '/a/@href').extract_first()
            yield Request(url='https://www.zhangqiaokeyan.com' + detail_url, callback=self.parse_detail, cb_kwargs={'item': article_item})
            # yield article_item  # generate the item and hand it over to the pipeline
    def parse_detail(self, response, **kwargs):
        base = '//div[@class="yc_atice"]/div[1]'
        # Get the data
        article_item = kwargs['item']
        line = response.xpath(base + '/p/span[1]/span[1]/text()').extract_first()
        if line is not None and line[0] == '作':
            # Line starts with "作者:" (author), possibly followed by "来源:" (source)
            info_list = list(line.split())
            article_item['author'] = info_list[0][3:]
            # The source may be missing
            if len(info_list) == 2:
                article_item['source'] = info_list[1][3:]
        elif line is not None and line[0] == '来':
            # Line starts with "来源:" (source only)
            article_item['source'] = line[3:]
        content = response.xpath(base + '/p//text()').extract()
        content_filter = [element if '\r\n\t' not in element else element.strip().replace('\r\n\t', '') for element in content]
        content_filter = [element for element in content_filter if element != '']
        content_filter = [element for element in content_filter if '\r\n' not in element]
        content_filter = ['\n' if element == ' ' or element == '\xa0' else element for element in content_filter]
        if content_filter and ('作者:' in content_filter[0] or '来源:' in content_filter[0]):
            content_filter[0] += '\n'
        article_item['content'] = ''.join(content_filter)
        print("题目:" + article_item.get('title', ''))
        print("作者:" + article_item.get('author', ''))
        print("来源:" + article_item.get('source', ''))
        print("关键字:" + article_item.get('keyword', ''))
        print("时间:" + article_item.get('time', ''))
        print()
        # print(content)
        # print(content_filter)
        # print(article_item.get('content', ''))
        yield article_item
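
The spider uses extract()/extract_first(); in current Scrapy these have the aliases getall()/get(), and get() accepts a default value, which avoids explicit None handling like the checks in parse_detail. A small self-contained sketch:

# Sketch only: get()/getall() are the modern aliases of extract_first()/extract().
from scrapy.selector import Selector

sel = Selector(text='<li><div><a href="/x">Title</a></div></li>')
print(sel.xpath('//a/text()').get())            # 'Title'   (== extract_first())
print(sel.xpath('//a/text()').getall())         # ['Title'] (== extract())
print(sel.xpath('//b/text()').get(default=''))  # ''        (no match, instead of None)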

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = homework.settings

[deploy]
#url = http://localhost:6800/
project = homework