parent 2d6a6a1f85
commit 2deaaaf78c
@ -0,0 +1,2 @@
# python
Binary file not shown.
@ -0,0 +1,208 @@
import matplotlib.pyplot as plt
from matplotlib import font_manager
import pymysql


class show(object):
    info = {}
    TScore = []  # composite score
    name = []  # anime title
    bfl = []  # play count
    pls = []  # comment count
    scs = []  # favourite count
    def save(self):  # pull the rows out of the database and store them on this object
        db = pymysql.connect(host='47.106.183.36', port=3306,
                             user='fuchuang', password='fuchuang',
                             database='fuchuang', charset='utf8mb4')  # database connection
        use = 'use fuchuang;'
        show_tables = 'show tables;'
        query = 'select * from bangumi;'
        cursor = db.cursor()
        try:
            cursor.execute(show_tables)
            cursor.execute(query)  # select everything from the bangumi table
            desc = cursor.description
            data = cursor.fetchall()  # fetch all rows
            for row in data:  # keep only entries whose score is above the threshold
                if row[3] != '暂无评分':
                    if float(row[3]) > 9.5:
                        self.name.append(row[1])  # anime title

                        value = row[2]
                        if '万' in value:
                            value = float(value.replace('万', ''))
                        elif '亿' in value:
                            value = float(value.replace('亿', '')) * 10000
                        self.bfl.append(value)  # play count (in units of 10,000)

                        value = row[4]
                        if '万' in value:
                            value = float(value.replace('万', ''))
                        else:
                            value = float(value)
                        self.TScore.append(float(row[3]) * value)  # composite score
                        self.scs.append(value)  # favourite count (in units of 10,000)

                        value = row[5]
                        if '万' in value:
                            value = float(value.replace('万', ''))
                        else:
                            value = float(value)
                        self.pls.append(value)  # comment count (in units of 10,000)

        except Exception as e:
            print(e)
            db.rollback()
        finally:
            cursor.close()
            db.close()
        print(self.name)
        print(self.TScore)
        print(self.bfl)
        print(self.pls)
        print(self.scs)
        # info = {'动漫名': self.name, '播放量(万)': self.bfl, '评论数(万)': self.pls, '收藏数(万)': self.scs, '综合评分': self.TScore}
        # dm_file = pandas.DataFrame(info)
        # dm_file.to_excel('Dongman.xlsx', sheet_name="动漫数据分析")
        # return all of the lists
        return self.name, self.bfl, self.pls, self.scs, self.TScore
    def view(self):  # data visualisation
        # make Chinese labels render correctly on the axes
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False

        my_font = font_manager.FontProperties(fname='STHeiti-Medium.ttc')
        dm_name = self.name  # series titles
        dm_play = self.bfl  # play counts
        dm_review = self.pls  # comment counts
        dm_favorite = self.scs  # favourite counts
        dm_com_score = self.TScore  # composite scores
        y_score = [9.6, 9.7, 9.8, 9.9, 10.0]

        # dm_all = self.TScore * self.scs

        # ********************************** composite score vs. play count
        fig, ax1 = plt.subplots()
        plt.bar(dm_name, dm_com_score, color='red')
        plt.title('综合评分和播放量数据分析', fontproperties=my_font)
        ax1.tick_params(labelsize=6)
        plt.xlabel('番剧名')
        plt.ylabel('综合评分')
        plt.xticks(rotation=90, color='green')

        ax2 = ax1.twinx()
        ax2.plot(dm_play, color='cyan')
        plt.ylabel('播放量')

        plt.plot(1, label='评分', color='red', linewidth=5.0)
        plt.plot(1, label='播放量', color="cyan", linewidth=1.0, linestyle="-")
        plt.legend()

        plt.savefig(r'E:1.png', dpi=1000, bbox_inches='tight')

        # ********************************** comment count vs. favourite count
        # ***** comment count bar chart
        fig, ax3 = plt.subplots()
        plt.bar(dm_name, dm_review, color='green')
        plt.title('番剧收藏数与评论数分析')
        plt.ylabel('评论数(万)')
        ax3.tick_params(labelsize=6)
        plt.xticks(rotation=90, color='green')

        # ***** favourite count line chart
        ax4 = ax3.twinx()  # required for a combined chart
        ax4.plot(dm_favorite, color='yellow')  # line style for the favourites series
        plt.ylabel('收藏数(万)')

        plt.plot(1, label='评论数', color="green", linewidth=5.0)
        plt.plot(1, label='收藏数', color="yellow", linewidth=1.0, linestyle="-")
        plt.legend()
        plt.savefig(r'E:2.png', dpi=1000, bbox_inches='tight')
        # ********************************** composite score vs. favourite count
        # ***** composite score bar chart
        fig, ax5 = plt.subplots()
        plt.bar(dm_name, dm_com_score, color='red')
        plt.title('综合评分和收藏数量数据分析')
        plt.ylabel('综合评分')
        ax5.tick_params(labelsize=6)
        plt.xticks(rotation=90, color='green')

        # ***** favourite count line chart
        ax6 = ax5.twinx()  # required for a combined chart
        ax6.plot(dm_favorite, color='yellow')  # line style for the favourites series
        plt.ylabel('收藏数(万)')
        plt.plot(1, label='综合评分', color="red", linewidth=5.0)
        plt.plot(1, label='收藏数', color="yellow", linewidth=1.0, linestyle="-")
        plt.legend()

        plt.savefig(r'E:3.png', dpi=1000, bbox_inches='tight')

        # ********************************** play count vs. favourite count
        # ***** play count bar chart
        fig, ax7 = plt.subplots()
        plt.bar(dm_name, dm_play, color='cyan')
        plt.title('播放量和收藏数 数据分析')
        plt.ylabel('播放量(万)')
        ax7.tick_params(labelsize=6)
        plt.xticks(rotation=90, color='green')

        # ***** favourite count line chart
        ax8 = ax7.twinx()  # required for a combined chart
        ax8.plot(dm_favorite, color='yellow')  # line style for the favourites series
        plt.ylabel('收藏数(万)')

        plt.plot(1, label='播放量', color="cyan", linewidth=5.0)
        plt.plot(1, label='收藏数', color="yellow", linewidth=1.0, linestyle="-")
        plt.legend()
        plt.savefig(r'E:4.png', dpi=1000, bbox_inches='tight')
        # ***** comment count line chart (currently disabled)
        # ax8 = ax7.twinx()  # required for a combined chart
        # ax8.plot(dm_review, color='green')  # line style for the comments series
        # plt.ylabel('评论数(万)')
        # plt.plot(1, label='播放量', color="cyan", linewidth=5.0)
        # plt.plot(1, label='评论数', color="green", linewidth=1.0, linestyle="-")
        # plt.legend()
        # plt.savefig(r'E:4.png', dpi=1000, bbox_inches='tight')
        # the comment-count series still does not display correctly
        plt.show()
    def print(self):  # report how many records each list holds
        print(len(self.name))
        print(len(self.bfl))
        print(len(self.pls))
        print(len(self.scs))
        print(len(self.TScore))

    def sort(self, i, j):  # swap the records at positions i and j across every list
        self.name[i], self.name[j] = self.name[j], self.name[i]
        self.bfl[i], self.bfl[j] = self.bfl[j], self.bfl[i]
        self.pls[i], self.pls[j] = self.pls[j], self.pls[i]
        self.scs[i], self.scs[j] = self.scs[j], self.scs[i]
        self.TScore[i], self.TScore[j] = self.TScore[j], self.TScore[i]


def main():
    a = show()  # create the object
    a.save()  # pull the data from the database
    a.print()  # report the size of each list
    a.view()  # visualise


if __name__ == '__main__':
    main()
Binary file not shown.
@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = spider.settings

[deploy]
#url = http://localhost:6800/
project = spider
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,20 @@
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from time import sleep
import json

s = Service("../chromedriver.exe")
bro = webdriver.Chrome(service=s)
# open bilibili
bro.get('https://www.bilibili.com')
bro.delete_all_cookies()  # clear any existing cookies first
# 60 seconds to log in manually
sleep(60)
dictcookies = bro.get_cookies()  # read the cookies from the browser after logging in
jsoncookies = json.dumps(dictcookies)  # serialise the cookie dicts to JSON so they can be saved

# write the cookies.txt file
with open('cookies.txt', 'w') as f:
    f.write(jsoncookies)
print('cookies is ok')
@ -0,0 +1 @@
[{"domain": ".bilibili.com", "httpOnly": false, "name": "innersign", "path": "/", "secure": false, "value": "0"}, {"domain": ".bilibili.com", "expiry": 1700969682, "httpOnly": false, "name": "i-wanna-go-back", "path": "/", "secure": false, "value": "-1"}, {"domain": ".bilibili.com", "expiry": 1684985681, "httpOnly": false, "name": "bili_jct", "path": "/", "secure": false, "value": "6c88a668c7442fa148fc9d06d6e40849"}, {"domain": ".bilibili.com", "expiry": 1700969680, "httpOnly": false, "name": "sid", "path": "/", "secure": false, "value": "qauykkrb"}, {"domain": ".bilibili.com", "expiry": 1684985681, "httpOnly": true, "name": "SESSDATA", "path": "/", "secure": true, "value": "8e595145%2C1684985681%2C785d7%2Ab1"}, {"domain": ".bilibili.com", "expiry": 1764041679, "httpOnly": false, "name": "buvid_fp", "path": "/", "secure": false, "value": "3fdd662d6b3f6d9fd8b2a5f1834013d4"}, {"domain": ".bilibili.com", "expiry": 1684985681, "httpOnly": false, "name": "DedeUserID", "path": "/", "secure": false, "value": "178665301"}, {"domain": ".bilibili.com", "expiry": 1764041638, "httpOnly": false, "name": "b_nut", "path": "/", "secure": false, "value": "1669433638"}, {"domain": ".bilibili.com", "expiry": 1700969682, "httpOnly": false, "name": "b_ut", "path": "/", "secure": false, "value": "5"}, {"domain": ".bilibili.com", "httpOnly": false, "name": "b_lsid", "path": "/", "secure": false, "value": "B3E3109EB_184B2000F25"}, {"domain": ".bilibili.com", "expiry": 1684985681, "httpOnly": false, "name": "DedeUserID__ckMd5", "path": "/", "secure": false, "value": "3f4304303449401f"}, {"domain": ".bilibili.com", "expiry": 1700969654, "httpOnly": false, "name": "_uuid", "path": "/", "secure": false, "value": "F0B6051B-46F1-CEFA-D47B-30415304BFA054324infoc"}, {"domain": ".bilibili.com", "expiry": 1764041638, "httpOnly": false, "name": "buvid3", "path": "/", "secure": false, "value": "69BFBC33-CEEC-E4A5-9E0A-7D9DB491BA0738742infoc"}, {"domain": ".bilibili.com", "expiry": 1764041638, "httpOnly": false, "name": "buvid4", "path": "/", "secure": false, "value": "7FF54BEA-5D27-6E79-874D-2092619183B623196-022112611-0kiN2FN18wB5k/cGcFypkA%3D%3D"}]
@ -0,0 +1,21 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class VideoItem(scrapy.Item):
    title = scrapy.Field()  # video title
    view_counts = scrapy.Field()  # play count
    barrage = scrapy.Field()  # danmaku (bullet comment) count
    up = scrapy.Field()  # uploader


class BiliItem(scrapy.Item):
    title = scrapy.Field()  # series title
    view_counts = scrapy.Field()  # play count
    evaluate = scrapy.Field()  # score
    attention = scrapy.Field()  # follower count
    barrage = scrapy.Field()  # danmaku count
@ -0,0 +1,87 @@
import pymysql

from spider.items import VideoItem, BiliItem


# import openpyxl


# class ExcelPipeline:
#     def __init__(self):
#         self.wb = openpyxl.Workbook()
#         self.ws = self.wb.active
#         self.ws.title = 'Goods'
#         self.ws.append(('标题', '价格', '销量', '图片', '店铺', '位置'))
#
#     def close_spider(self, spider):
#         self.wb.save('商品数据.xlsx')
#
#     def process_item(self, item, spider):
#         title = item.get('title', '')  # fall back to an empty value when the field is missing
#         price = item.get('price', 0)
#         deal_count = item.get('deal_count', 0)
#         picture = item.get('picture', '')
#         location = item.get('location', '')
#         shop = item.get('shop', '')
#         self.ws.append((title, price, deal_count, picture, shop, location))
#         return item


class MysqlPipeline:
    def __init__(self):
        self.conn = pymysql.connect(host='47.106.183.36', port=3306,
                                    user='fuchuang', password='fuchuang',
                                    database='fuchuang', charset='utf8mb4')
        self.cursor = self.conn.cursor()
        self.data_bangumi = []
        self.data_video = []

    def close_spider(self, spider):
        if len(self.data_bangumi) > 0:
            self._write_to_mysql_bangumi()
            self.data_bangumi.clear()
        if len(self.data_video) > 0:
            self._write_to_mysql_video()
            self.data_video.clear()
        self.conn.commit()
        self.conn.close()

    def process_item(self, item, spider):
        if isinstance(item, VideoItem):
            title = item.get('title', '')  # fall back to an empty value when the field is missing
            view_counts = item.get('view_counts', '0')
            barrage = item.get('barrage', '0')
            up = item.get('up', '')
            self.data_video.append((title, view_counts, barrage, up))
        if isinstance(item, BiliItem):
            title = item.get('title', '')
            view_counts = item.get('view_counts', '0')
            evaluate = item.get('evaluate', '0')
            attention = item.get('attention', '0')
            barrage = item.get('barrage', '0')
            self.data_bangumi.append((title, view_counts, evaluate, attention, barrage))
        if len(self.data_bangumi) >= 20:  # flush to MySQL in batches of 20
            self._write_to_mysql_bangumi()
            self.data_bangumi.clear()
        if len(self.data_video) >= 20:
            self._write_to_mysql_video()
            self.data_video.clear()
        return item

    def _write_to_mysql_bangumi(self):
        for item in self.data_bangumi:
            self.cursor.execute(
                'insert into bangumi (title, view_counts, evaluate, attention, barrage) values (%s, %s, %s, %s, %s)',
                item
            )
        self.conn.commit()

    def _write_to_mysql_video(self):
        for item in self.data_video:
            self.cursor.execute(
                'insert into video (title, view_counts, barrage, up) values (%s, %s, %s, %s)',
                item
            )
        self.conn.commit()
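MysqlPipeline assumes the bangumi and video tables already exist in the fuchuang database; the DDL is not part of this commit. A minimal sketch of compatible tables, with column types that are assumptions for illustration only (the pipeline stores every field as a string):

# hypothetical DDL sketch; the real schema is not included in this commit,
# so the column types below are assumptions
import pymysql

conn = pymysql.connect(host='47.106.183.36', port=3306, user='fuchuang',
                       password='fuchuang', database='fuchuang', charset='utf8mb4')
with conn.cursor() as cursor:
    cursor.execute(
        'create table if not exists bangumi ('
        ' title varchar(255), view_counts varchar(64), evaluate varchar(64),'
        ' attention varchar(64), barrage varchar(64))'
    )
    cursor.execute(
        'create table if not exists video ('
        ' title varchar(255), view_counts varchar(64), barrage varchar(64), up varchar(255))'
    )
conn.commit()
conn.close()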
@ -0,0 +1,101 @@
BOT_NAME = 'spider'

SPIDER_MODULES = ['spider.spiders']
NEWSPIDER_MODULE = 'spider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'spider (+http://www.yourdomain.com)'

# scrapy-redis
# redis-backed duplicate filter
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# use the scrapy_redis scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# keep the request queue in redis so the crawl can be paused and resumed
SCHEDULER_PERSIST = True

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs

DOWNLOAD_DELAY = 3  # delay between requests, in seconds
RANDOMIZE_DOWNLOAD_DELAY = True  # randomise the delay

# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'spider.middlewares.SpiderSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'spider.middlewares.SpiderDownloaderMiddleware': 543,
    # 'spider.middlewares.ProxyMiddleware': 600,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'spider.pipelines.ExcelPipeline': 300,
    'spider.pipelines.MysqlPipeline': 300,

    # the scrapy_redis pipeline also stores scraped items in redis
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
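Because the spider in the next file is a scrapy_redis RedisCrawlSpider, it does not start from a hard-coded URL: it waits until a start URL is pushed onto its redis_key. A minimal seeding sketch, assuming redis-py and a Redis instance on localhost:6379 (the settings above do not set REDIS_HOST/REDIS_PORT, so scrapy_redis falls back to its defaults):

# hypothetical seeding snippet; host and port are assumptions based on scrapy_redis defaults
import redis

r = redis.Redis(host='localhost', port=6379)
r.lpush('Bili', 'https://www.bilibili.com')  # 'Bili' matches the spider's redis_key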
@ -0,0 +1,54 @@
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy_redis.spiders import RedisCrawlSpider
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy import Request

from spider.items import VideoItem, BiliItem


class BiliSpider(RedisCrawlSpider):
    name = 'Bili'
    redis_key = 'Bili'

    rules = [
        Rule(LinkExtractor(allow=r"https://www.bilibili.com/bangumi/.*?"), callback='parse_Item', follow=True),
        Rule(LinkExtractor(allow=r"https://www.bilibili.com/video/BV.*?"), callback='parse_Videoitem', follow=True),
        Rule(LinkExtractor(allow=r"https://www.bilibili.com/bangumi/media/md.*?"), callback='parse_BiliItem',
             follow=True),
    ]

    def parse_Videoitem(self, response, **kwargs):
        # ordinary video pages: title, play count, danmaku count and uploader
        Video_item = VideoItem()
        Video_item['title'] = response.xpath('//*[@id="viewbox_report"]/h1/@title').extract()[0]
        Video_item['view_counts'] = str(
            response.xpath('//*[@id="viewbox_report"]/div/div/span[1]/@title').extract()[0]).replace("总播放数", "")
        Video_item['barrage'] = str(
            response.xpath('//*[@id="viewbox_report"]/div/div/span[2]/@title').extract()[0]).replace(
            "历史累计弹幕数", "")
        Video_item['up'] = str(response.xpath('//*[@id="v_upinfo"]/div[2]/div[1]/a[1]/text()').extract()[0]).replace(
            "\n",
            "").strip()
        yield Video_item

    def parse_BiliItem(self, response, **kwargs):
        # bangumi media pages: title, play count, follower count, danmaku count and score
        bangumi_item = BiliItem()
        bangumi_item['title'] = response.xpath(
            '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[1]/span[1]/text()').extract()[0]
        bangumi_item['view_counts'] = response.xpath(
            '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[1]/em/text()').extract()[0]
        bangumi_item['attention'] = response.xpath(
            '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[2]/em/text()').extract()[0]
        bangumi_item['barrage'] = response.xpath(
            '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[3]/em/text()').extract()[0]
        bangumi_item['evaluate'] = response.xpath(
            '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[2]/div/div[1]/text()').extract()[0]
        yield bangumi_item

    def parse_Item(self, response, **kwargs):
        # bangumi play pages link to the media detail page; follow that link
        url = 'https:' + response.xpath('//*[@id="media_module"]/div/a/@href').extract()[0]
        yield Request(url=url, callback=self.parse_BiliItem)
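The spider module imports CrawlerProcess and get_project_settings but never uses them. A minimal sketch of how the crawl could be launched programmatically with those imports, assuming it is run from the Scrapy project root so spider.settings resolves (running `scrapy crawl Bili` from the command line is equivalent):

# minimal runner sketch, assuming it is executed from the Scrapy project root
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('Bili')  # the spider's name attribute
process.start()  # blocks until the crawl is stopped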
@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,24 @@
import json

from selenium import webdriver


def create_chrome_driver(*, headless=False):
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument('--headless')  # do not show a browser window
    options.add_experimental_option('excludeSwitches', ['enable-automation'])  # hide the automation banner
    options.add_experimental_option('useAutomationExtension', False)
    browser = webdriver.Chrome(options=options, executable_path=r'chromedriver.exe')
    browser.execute_cdp_cmd(
        'Page.addScriptToEvaluateOnNewDocument',
        {'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'}  # mask the webdriver flag
    )
    return browser


def add_cookies(browser, cookie_file):
    with open(cookie_file, 'r') as file:
        cookies_list = json.loads(file.read())
    for cookie_dict in cookies_list:
        browser.add_cookie(cookie_dict)
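Selenium only accepts a cookie for a domain that is currently open, so the cookies.txt produced by the login script has to be injected after navigating to bilibili.com. A minimal usage sketch for the two helpers above; the module name `utils` is an assumption, since the file name is not shown in this diff:

# usage sketch; the module name 'utils' is a hypothetical stand-in for the file above
from utils import create_chrome_driver, add_cookies

browser = create_chrome_driver(headless=True)
browser.get('https://www.bilibili.com')  # the domain must be loaded before add_cookie will work
add_cookies(browser, 'cookies.txt')      # inject the cookies saved by the login script
browser.refresh()                        # reload so the logged-in session takes effect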