Project commit

master
Goshic 2 years ago
parent 2d6a6a1f85
commit 2deaaaf78c

@ -0,0 +1,208 @@
import matplotlib.pyplot as plt
from matplotlib import font_manager
import pymysql
class show(object):
info = {}
# note: these are class-level lists, shared by every instance of show
TScore = [] # composite score
name = [] # anime title
bfl = [] # play count
pls = [] # comment count
scs = [] # favourite count
def save(self): # pull the data out of the database and store it on the object
db = pymysql.connect(host='47.106.183.36', port=3306,
user='fuchuang', password='fuchuang',
database='fuchuang', charset='utf8mb4') # database connection
use = 'use fuchuang;'
show_tables = 'show tables;'
sql = 'select * from bangumi;'
cursor = db.cursor()
try:
cursor.execute(show_tables)
cursor.execute(sql) # run the query against the bangumi table
desc = cursor.description
data = cursor.fetchall() # fetch every row
for row in data: # keep only the entries whose rating is above 9.5
if (row[3] != '暂无评分'):
if (float(row[3]) > 9.5):
# print(row)
self.name.append(row[1]) # anime title
value = row[2]
if '万' in value:
value = float(value.replace('万', ''))
elif '亿' in value:
value = float(value.replace('亿', '')) * 10000 # convert 亿 into units of 万
self.bfl.append(value) # play count (万)
value = row[4]
if '万' in value:
value = float(value.replace('万', ''))
else:
value = float(value)
self.TScore.append(float(row[3]) * value) # composite score = rating * favourite count
self.scs.append(value) # favourite count
value = row[5]
if '万' in value:
value = float(value.replace('万', ''))
else:
value = float(value)
self.pls.append(value) # comment count
except Exception as e:
print(e)
db.rollback()
finally:
cursor.close()
db.close()
print(self.name)
print(self.TScore)
print(self.bfl)
print(self.pls)
print(self.scs)
#info = {'动漫名': self.name, '播放量(万)': self.bfl, '评论数(万)': self.pls, '收藏数(万)': self.scs, '综合评分': self.TScore}
#dm_file = pandas.DataFrame(info)
#dm_file.to_excel('Dongman.xlsx', sheet_name="动漫数据分析")
# return all of the lists
return self.name, self.bfl, self.pls, self.scs, self.TScore
def view(self): # visualise the data
# make Chinese characters display correctly on the axes
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
my_font = font_manager.FontProperties(fname='STHeiti-Medium.ttc')
dm_name = self.name # bangumi titles
dm_play = self.bfl # play counts
dm_review = self.pls # comment counts
dm_favorite = self.scs # favourite counts
dm_com_score = self.TScore # composite scores
y_score = [9.6, 9.7, 9.8, 9.9, 10.0]
# dm_all = self.TScore * self.scs
fig, ax1 = plt.subplots()
plt.bar(dm_name, dm_com_score, color='red')
plt.title('综合评分和播放量数据分析', fontproperties=my_font)
ax1.tick_params(labelsize=6)
plt.xlabel('番剧名')
plt.ylabel('综合评分')
plt.xticks(rotation=90, color='green')
ax2 = ax1.twinx()
ax2.plot(dm_play, color='cyan')
plt.ylabel('播放量')
plt.plot(1, label='评分', color='red', linewidth=5.0)
plt.plot(1, label='播放量', color="cyan", linewidth=1.0, linestyle="-")
plt.legend()
plt.savefig(r'E:1.png', dpi=1000, bbox_inches='tight')
# ********************************************************************** comment count vs favourite count
# ******** comment-count bar chart
fig, ax3 = plt.subplots()
plt.bar(dm_name, dm_review, color='green')
plt.title('番剧收藏数与评论数分析')
plt.ylabel('评论数(万)')
ax3.tick_params(labelsize=6)
plt.xticks(rotation=90, color='green')
# ******* favourite-count line chart
ax4 = ax3.twinx() # twin axis, required for the combined chart
ax4.plot(dm_favorite, color='yellow')
plt.ylabel('收藏数(万)')
plt.plot(1, label='评论数', color="green", linewidth=5.0) # dummy plots so both series show up in the legend
plt.plot(1, label='收藏数', color="yellow", linewidth=1.0, linestyle="-")
plt.legend()
plt.savefig(r'E:2.png', dpi=1000, bbox_inches='tight')
# ********************************************************************** composite score vs favourite count
# ******* composite-score bar chart
fig, ax5 = plt.subplots()
plt.bar(dm_name, dm_com_score, color='red')
plt.title('综合评分和收藏数量数据分析')
plt.ylabel('综合评分')
ax5.tick_params(labelsize=6)
plt.xticks(rotation=90, color='green')
# ******* favourite-count line chart
ax6 = ax5.twinx() # twin axis, required for the combined chart
ax6.plot(dm_favorite, color='yellow')
plt.ylabel('收藏数(万)')
plt.plot(1, label='综合评分', color="red", linewidth=5.0) # dummy plots so both series show up in the legend
plt.plot(1, label='收藏数', color="yellow", linewidth=1.0, linestyle="-")
plt.legend()
plt.savefig(r'E:3.png', dpi=1000, bbox_inches='tight')
# ********************************************************************** play count vs favourite count (the comment-count version is disabled below)
# ******* play-count bar chart
fig, ax7 = plt.subplots()
plt.bar(dm_name, dm_play, color='cyan')
plt.title('播放量和收藏数 数据分析')
plt.ylabel('播放量(万)')
ax7.tick_params(labelsize=6)
plt.xticks(rotation=90, color='green')
# ******* favourite-count line chart
ax8 = ax7.twinx() # twin axis, required for the combined chart
ax8.plot(dm_favorite, color='yellow')
plt.ylabel('收藏数(万)')
plt.plot(1, label='播放量', color="cyan", linewidth=5.0) # dummy plots so both series show up in the legend
plt.plot(1, label='收藏数', color="yellow", linewidth=1.0, linestyle="-")
plt.legend()
plt.savefig(r'E:4.png', dpi=1000, bbox_inches='tight')
# ******* comment-count line chart (disabled)
# ax8 = ax7.twinx() # twin axis for the combined chart
# ax8.plot(dm_review, color='green')
# plt.ylabel('评论数(万)')
# plt.plot(1, label='播放量', color="cyan", linewidth=5.0)
# plt.plot(1, label='评论数', color="green", linewidth=1.0, linestyle="-")
# plt.legend()
# plt.savefig(r'E:4.png', dpi=1000, bbox_inches='tight')
# the comment-count data does not display correctly yet
plt.show()
def print(self):
print(len(self.name))
print(len(self.bfl))
print(len(self.pls))
print(len(self.scs))
print(len(self.TScore))
def sort(self, i, j): # swap entries i and j across every list
self.name[i], self.name[j] = self.name[j], self.name[i]
self.bfl[i], self.bfl[j] = self.bfl[j], self.bfl[i]
self.pls[i], self.pls[j] = self.pls[j], self.pls[i]
self.scs[i], self.scs[j] = self.scs[j], self.scs[i]
self.TScore[i], self.TScore[j] = self.TScore[j], self.TScore[i]
def main():
a = show() # create the object
a.save() # load the data from the database
a.print() # print the length of each list as a sanity check
a.view() # visualise the data
if __name__ == '__main__':
main()
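# The '万' / '亿' handling inside save() above can be factored into one small helper. A minimal
# sketch; to_wan is a hypothetical name that is not part of the original code. It normalises
# strings such as '345.6万' or '1.2亿' to a float expressed in units of 万 (10,000):
def to_wan(text):
    """Convert a bilibili count string to a float in units of 10,000 (万)."""
    if '亿' in text:
        return float(text.replace('亿', '')) * 10000  # 1 亿 = 10000 万
    if '万' in text:
        return float(text.replace('万', ''))
    return float(text)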

@ -0,0 +1,37 @@
import pymysql
import xlwt as xlwt
def Toexcel(path, sql, title): # export the result set of sql to an .xls workbook at path; title is a comma-separated header string
conn = pymysql.connect(host='47.106.183.36', port=3306,
user='fuchuang', password='fuchuang',
database='fuchuang', charset='utf8mb4')
curs = conn.cursor()
curs.execute(sql)
rows = curs.fetchall()
w = xlwt.Workbook(encoding='utf-8')
style = xlwt.XFStyle() # initialise a cell style
font = xlwt.Font() # create a font for the style
font.name = "微软雅黑" # on Python 2 this would need to be written as u"微软雅黑"
style.font = font # attach the font to the style
ws = w.add_sheet("视频信息", cell_overwrite_ok=True)
# use the fields in title as the Excel column headers
title = title.split(",")
for i in range(len(title)):
ws.write(0, i, title[i], style)
# write out the rows returned by the database query
for i in range(len(rows)):
row = rows[i]
for j in range(len(row)):
if row[j]:
item = row[j]
ws.write(i + 1, j, item, style)
# all rows written, save the .xls file
w.save(path)
conn.close()
sql_1 = '''select * from video'''
Toexcel('视频信息.xls', sql_1, "id,标题,播放量,弹幕数,发布者")
sql_2 = '''select * from bangumi'''
Toexcel('番剧信息.xls', sql_2, "id,番名,播放量,评分,弹幕数")

@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = spider.settings
[deploy]
#url = http://localhost:6800/
project = spider

@ -0,0 +1,20 @@
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from time import sleep
import json
s = Service("../chromedriver.exe")
bro = webdriver.Chrome(service=s)
# open bilibili
bro.get('https://www.bilibili.com')
bro.delete_all_cookies() # clear any existing cookies first
# 60 seconds to log in manually
sleep(60)
dictcookies = bro.get_cookies() # read the browser cookies after logging in
jsoncookies = json.dumps(dictcookies) # serialise the dict to JSON so it can be saved
# generate the cookies.txt file
with open('cookies.txt', 'w') as f: # write the cookies to a text file
f.write(jsoncookies)
print('cookies is ok')
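# Note: cookies.txt is read back later by spider/utils.add_cookies(), which the Selenium
# downloader middleware in middlewares.py calls before crawling.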

@ -0,0 +1 @@
[{"domain": ".bilibili.com", "httpOnly": false, "name": "innersign", "path": "/", "secure": false, "value": "0"}, {"domain": ".bilibili.com", "expiry": 1700969682, "httpOnly": false, "name": "i-wanna-go-back", "path": "/", "secure": false, "value": "-1"}, {"domain": ".bilibili.com", "expiry": 1684985681, "httpOnly": false, "name": "bili_jct", "path": "/", "secure": false, "value": "6c88a668c7442fa148fc9d06d6e40849"}, {"domain": ".bilibili.com", "expiry": 1700969680, "httpOnly": false, "name": "sid", "path": "/", "secure": false, "value": "qauykkrb"}, {"domain": ".bilibili.com", "expiry": 1684985681, "httpOnly": true, "name": "SESSDATA", "path": "/", "secure": true, "value": "8e595145%2C1684985681%2C785d7%2Ab1"}, {"domain": ".bilibili.com", "expiry": 1764041679, "httpOnly": false, "name": "buvid_fp", "path": "/", "secure": false, "value": "3fdd662d6b3f6d9fd8b2a5f1834013d4"}, {"domain": ".bilibili.com", "expiry": 1684985681, "httpOnly": false, "name": "DedeUserID", "path": "/", "secure": false, "value": "178665301"}, {"domain": ".bilibili.com", "expiry": 1764041638, "httpOnly": false, "name": "b_nut", "path": "/", "secure": false, "value": "1669433638"}, {"domain": ".bilibili.com", "expiry": 1700969682, "httpOnly": false, "name": "b_ut", "path": "/", "secure": false, "value": "5"}, {"domain": ".bilibili.com", "httpOnly": false, "name": "b_lsid", "path": "/", "secure": false, "value": "B3E3109EB_184B2000F25"}, {"domain": ".bilibili.com", "expiry": 1684985681, "httpOnly": false, "name": "DedeUserID__ckMd5", "path": "/", "secure": false, "value": "3f4304303449401f"}, {"domain": ".bilibili.com", "expiry": 1700969654, "httpOnly": false, "name": "_uuid", "path": "/", "secure": false, "value": "F0B6051B-46F1-CEFA-D47B-30415304BFA054324infoc"}, {"domain": ".bilibili.com", "expiry": 1764041638, "httpOnly": false, "name": "buvid3", "path": "/", "secure": false, "value": "69BFBC33-CEEC-E4A5-9E0A-7D9DB491BA0738742infoc"}, {"domain": ".bilibili.com", "expiry": 1764041638, "httpOnly": false, "name": "buvid4", "path": "/", "secure": false, "value": "7FF54BEA-5D27-6E79-874D-2092619183B623196-022112611-0kiN2FN18wB5k/cGcFypkA%3D%3D"}]

@ -0,0 +1,21 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class VideoItem(scrapy.Item):
title = scrapy.Field() # video title
view_counts = scrapy.Field() # play count
barrage = scrapy.Field() # danmaku (bullet comment) count
up = scrapy.Field() # uploader
class BiliItem(scrapy.Item):
title = scrapy.Field() # bangumi title
view_counts = scrapy.Field() # play count
evaluate = scrapy.Field() # rating
attention = scrapy.Field() # follower (series-tracking) count
barrage = scrapy.Field() # danmaku (bullet comment) count

@ -0,0 +1,123 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals, Request
from scrapy.http import HtmlResponse
from spider.utils import create_chrome_driver, add_cookies
# useful for handling different item types with a single interface
class SpiderSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn't have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class SpiderDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def __init__(self): # simulate a logged-in user when the middleware is created
self.browser = create_chrome_driver(headless=True)
self.browser.get('https://www.bilibili.com/v/popular/rank/all')
add_cookies(self.browser, "../cookies.txt")
def __del__(self): # runs when the middleware is destroyed
self.browser.close()
def process_request(self, request: Request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
# request.cookies = COOKIES_DICT[randint(0, len(COOKIES_DICT))]
# request.meta = {'proxy': 'http://127.0.0.1:7080'} # route through a proxy to avoid IP bans
self.browser.get(request.url)
return HtmlResponse(url=request.url, body=self.browser.page_source, # return the Selenium-rendered page so dynamic content is available
request=request, encoding='utf-8')
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class ProxyMiddleware(object):
def process_request(self, request, spider):
proxy = "http://172.20.10.2:7080"
request.meta["proxy"] = proxy
print(f"ProxyMiddleware --> {proxy}")

@ -0,0 +1,87 @@
import pymysql
from spider.items import VideoItem, BiliItem
# import openpyxl
# class ExcelPipeline:
# def __init__(self):
# self.wb = openpyxl.Workbook()
# self.ws = self.wb.active
# self.ws.title = 'Goods'
# self.ws.append(('标题', '价格', '销量', '图片', '店铺', '位置'))
#
# def close_spider(self, spider):
# self.wb.save('商品数据.xlsx')
#
# def process_item(self, item, spider):
# title = item.get('title', '') # default to an empty string when the field is missing
# price = item.get('price', 0)
# deal_count = item.get('deal_count', 0)
# picture = item.get('picture', '')
# location = item.get('location', '')
# shop = item.get('shop', '')
# self.ws.append((title, price, deal_count, picture, shop, location))
# return item
class MysqlPipeline:
def __init__(self):
self.conn = pymysql.connect(host='47.106.183.36', port=3306,
user='fuchuang', password='fuchuang',
database='fuchuang', charset='utf8mb4')
self.cursor = self.conn.cursor()
self.data_bangumi = []
self.data_video = []
def close_spider(self, spider):
if len(self.data_bangumi) > 0:
self._write_to_mysql_bangumi()
self.data_bangumi.clear()
if len(self.data_video) > 0:
self._write_to_mysql_video()
self.data_video.clear()
self.conn.commit()
self.conn.close()
def process_item(self, item, spider):
if isinstance(item, VideoItem):
title = item.get('title', '') # default to an empty string when the field is missing
view_counts = item.get('view_counts', '0')
barrage = item.get('barrage', '0')
up = item.get('up', '')
self.data_video.append((title, view_counts, barrage, up))
if isinstance(item, BiliItem):
title = item.get('title', '') # default to an empty string when the field is missing
view_counts = item.get('view_counts', '0')
evaluate = item.get('evaluate', '0')
attention = item.get('attention', '0')
barrage = item.get('barrage', '0')
self.data_bangumi.append((title, view_counts, evaluate, attention, barrage))
if len(self.data_bangumi) >= 20: # flush to MySQL in batches of 20
self._write_to_mysql_bangumi()
self.data_bangumi.clear()
if len(self.data_video) >= 20:
self._write_to_mysql_video()
self.data_video.clear()
return item
def _write_to_mysql_bangumi(self):
for item in self.data_bangumi:
self.cursor.execute(
'insert into bangumi (title, view_counts, evaluate, attention, barrage) values (%s, %s, %s, %s, %s)',
item
)
self.conn.commit()
def _write_to_mysql_video(self):
for item in self.data_video:
self.cursor.execute(
'insert into video (title, view_counts, barrage, up) values (%s, %s, %s, %s)',
item
)
self.conn.commit()
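# MysqlPipeline assumes that the `video` and `bangumi` tables already exist; the real schema is
# not part of this commit. The sketch below is a hypothetical helper whose column names come
# from the INSERT statements above and whose column types are assumptions:
def ensure_tables(conn):
    """Create the video/bangumi tables if they are missing (assumed DDL)."""
    with conn.cursor() as cursor:
        cursor.execute(
            'create table if not exists video ('
            'id int primary key auto_increment, '
            'title varchar(255), view_counts varchar(64), '
            'barrage varchar(64), up varchar(128)'
            ') default charset=utf8mb4'
        )
        cursor.execute(
            'create table if not exists bangumi ('
            'id int primary key auto_increment, '
            'title varchar(255), view_counts varchar(64), '
            'evaluate varchar(32), attention varchar(64), barrage varchar(64)'
            ') default charset=utf8mb4'
        )
    conn.commit()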

@ -0,0 +1,101 @@
BOT_NAME = 'spider'
SPIDER_MODULES = ['spider.spiders']
NEWSPIDER_MODULE = 'spider.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'spider (+http://www.yourdomain.com)'
# scrapy-redis
# de-duplication via the scrapy_redis filter
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# use the scrapy_redis scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# persist the queue so the crawl can be paused and resumed
SCHEDULER_PERSIST = True
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3 # delay between requests (seconds)
RANDOMIZE_DOWNLOAD_DELAY = True # randomise the delay
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# }
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'spider.middlewares.SpiderSpiderMiddleware': 543,
# }
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
'spider.middlewares.SpiderDownloaderMiddleware': 543,
# 'spider.middlewares.ProxyMiddleware': 600,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
# 'spider.pipelines.ExcelPipeline': 300,
'spider.pipelines.MysqlPipeline': 300,
# the scrapy_redis pipeline (also pushes items into Redis)
'scrapy_redis.pipelines.RedisPipeline': 400,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
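# Note: no Redis connection is configured here, so scrapy_redis falls back to its default of
# localhost:6379. A minimal sketch, assuming a local Redis instance (REDIS_HOST / REDIS_PORT
# work as well):
# REDIS_URL = 'redis://127.0.0.1:6379'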

@ -0,0 +1,54 @@
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy_redis.spiders import RedisCrawlSpider
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy import Request
from spider.items import VideoItem, BiliItem
class BiliSpider(RedisCrawlSpider):
name = 'Bili'
redis_key = 'Bili'
rules = [
Rule(LinkExtractor(allow=r"https://www.bilibili.com/bangumi/.*?"), callback='parse_Item', follow=True),
Rule(LinkExtractor(allow=r"https://www.bilibili.com/video/BV.*?"), callback='parse_Videoitem', follow=True),
Rule(LinkExtractor(allow=r"https://www.bilibili.com/bangumi/media/md.*?"), callback='parse_BiliItem',
follow=True),
]
def parse_Videoitem(self, response, **kwargs):
Video_item = VideoItem()
Video_item['title'] = response.xpath('//*[@id="viewbox_report"]/h1/@title').extract()[0]
Video_item['view_counts'] = str(
response.xpath('//*[@id="viewbox_report"]/div/div/span[1]/@title').extract()[0]).replace("总播放数", "")
Video_item['barrage'] = str(
response.xpath('//*[@id="viewbox_report"]/div/div/span[2]/@title').extract()[0]).replace(
"历史累计弹幕数", "")
Video_item['up'] = str(response.xpath('//*[@id="v_upinfo"]/div[2]/div[1]/a[1]/text()').extract()[0]).replace(
"\\n",
"").strip()
yield Video_item
def parse_BiliItem(self, response, **kwargs):
bangumi_item = BiliItem()
bangumi_item['title'] = response.xpath(
'//*[@id="app"]/div[1]/div[2]/div/div[2]/div[1]/span[1]/text()').extract()[0]
bangumi_item['view_counts'] = response.xpath(
'//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[1]/em/text()').extract()[0]
bangumi_item['attention'] = response.xpath(
'//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[2]/em/text()').extract()[0]
bangumi_item['barrage'] = response.xpath(
'//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[3]/em/text()').extract()[0]
bangumi_item['evaluate'] = response.xpath(
'//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[2]/div/div[1]/text()').extract()[0]
yield bangumi_item
def parse_Item(self, response, **kwargs):
url = 'https:' + response.xpath('//*[@id="media_module"]/div/a/@href').extract()[0]
yield Request(url=url, callback=self.parse_BiliItem)
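# Because BiliSpider is a RedisCrawlSpider, it idles until a start URL is pushed onto the
# 'Bili' list in Redis. A minimal seeding sketch with redis-py; the host and port are
# assumptions matching the scrapy_redis default of localhost:6379.
if __name__ == '__main__':
    import redis  # assumption: redis-py is available, as scrapy_redis depends on it
    seed = redis.Redis(host='127.0.0.1', port=6379)
    # the spider pops this URL and then follows the Rules defined above
    seed.lpush('Bili', 'https://www.bilibili.com/v/popular/rank/all')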

@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

@ -0,0 +1,24 @@
import json
from selenium import webdriver
def create_chrome_driver(*, headless=False):
options = webdriver.ChromeOptions()
if headless:
options.add_argument('--headless') # run without showing a browser window
options.add_experimental_option('excludeSwitches', ['enable-automation']) # hide the automation banner so the site is less likely to detect Selenium
options.add_experimental_option('useAutomationExtension', False)
browser = webdriver.Chrome(options=options, executable_path=r'chromedriver.exe')
browser.execute_cdp_cmd(
'Page.addScriptToEvaluateOnNewDocument',
{'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'} # mask the webdriver flag
)
return browser
def add_cookies(browser, cookie_file): # load the cookies saved by the login helper into the browser
with open(cookie_file, 'r') as file:
cookies_list = json.loads(file.read())
for cookie_dict in cookies_list:
browser.add_cookie(cookie_dict)