parent 2d6a6a1f85
commit 2deaaaf78c
@@ -0,0 +1,2 @@
# python
Binary file not shown.
@@ -0,0 +1,208 @@
import matplotlib.pyplot as plt
from matplotlib import font_manager
import pymysql


class show(object):
    info = {}
    TScore = []  # composite score
    name = []  # anime title
    bfl = []  # play count
    pls = []  # comment count
    scs = []  # favourite count

    def save(self):  # load the data from the database into the object
        db = pymysql.connect(host='47.106.183.36', port=3306,
                             user='fuchuang', password='fuchuang',
                             database='fuchuang', charset='utf8mb4')  # database connection
        use = 'use fuchuang;'
        show = 'show tables;'
        query = 'select * from bangumi;'
        cursor = db.cursor()
        try:
            cursor.execute(show)
            cursor.execute(query)  # select the table
            desc = cursor.description
            data = cursor.fetchall()  # fetch every row
            vals = []
            for row in data:  # keep only entries whose score is above 9.5
                if row[3] != '暂无评分':
                    if float(row[3]) > 9.5:
                        # print(row)
                        self.name.append(row[1])  # anime title

                        vals = [row[2]]
                        if '万' in vals[0]:
                            vals[0] = float(vals[0].replace('万', ''))
                        elif '亿' in vals[0]:
                            vals[0] = float(vals[0].replace('亿', '')) * 10000
                        self.bfl.append(vals[0])  # play count

                        vals = [row[4]]
                        if '万' in vals[0]:
                            vals[0] = float(vals[0].replace('万', ''))
                        else:
                            vals[0] = float(vals[0])
                        self.TScore.append(float(row[3]) * vals[0])  # composite score
                        self.scs.append(vals[0])  # favourite count

                        vals = [row[5]]
                        if '万' in vals[0]:
                            vals[0] = float(vals[0].replace('万', ''))
                        else:
                            vals[0] = float(vals[0])
                        self.pls.append(vals[0])  # comment count

        except Exception as e:
            print(e)
            db.rollback()
        finally:
            cursor.close()
            db.close()
        print(self.name)
        print(self.TScore)
        print(self.bfl)
        print(self.pls)
        print(self.scs)
        # info = {'动漫名': self.name, '播放量(万)': self.bfl, '评论数(万)': self.pls, '收藏数(万)': self.scs, '综合评分': self.TScore}
        # dm_file = pandas.DataFrame(info)
        # dm_file.to_excel('Dongman.xlsx', sheet_name="动漫数据分析")
        # return all of the lists
        return self.name, self.bfl, self.pls, self.scs, self.TScore

    def view(self):  # data visualisation
        # so that Chinese characters render on the axes
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False

        my_font = font_manager.FontProperties(fname='STHeiti-Medium.ttc')
        dm_name = self.name  # bangumi title
        dm_play = self.bfl  # bangumi play count
        dm_review = self.pls  # bangumi comment count
        dm_favorite = self.scs  # bangumi favourite count
        dm_com_score = self.TScore  # bangumi composite score
        y_score = [9.6, 9.7, 9.8, 9.9, 10.0]

        # dm_all = self.TScore * self.scs

        # ********************* composite score vs. play count
        fig, ax1 = plt.subplots()
        plt.bar(dm_name, dm_com_score, color='red')
        plt.title('综合评分和播放量数据分析', fontproperties=my_font)
        ax1.tick_params(labelsize=6)
        plt.xlabel('番剧名')
        plt.ylabel('综合评分')
        plt.xticks(rotation=90, color='green')

        ax2 = ax1.twinx()
        ax2.plot(dm_play, color='cyan')
        plt.ylabel('播放量')

        plt.plot(1, label='评分', color='red', linewidth=5.0)
        plt.plot(1, label='播放量', color="cyan", linewidth=1.0, linestyle="-")
        plt.legend()

        plt.savefig(r'E:1.png', dpi=1000, bbox_inches='tight')

        # ********************* comment count vs. favourite count
        # ******** comment-count bar chart
        fig, ax3 = plt.subplots()
        plt.bar(dm_name, dm_play, color='green')
        plt.title('番剧收藏数与评论数分析')
        plt.ylabel('评论数(万)')
        ax3.tick_params(labelsize=6)
        plt.xticks(rotation=90, color='green')

        # ******* favourite-count line chart
        ax4 = ax3.twinx()  # required for the combined (twin-axis) chart
        ax4.plot(dm_favorite, color='yellow')  # line colour; width and marker style could be set here
        plt.ylabel('收藏数(万)')

        plt.plot(1, label='评论数', color="green", linewidth=5.0)
        plt.plot(1, label='收藏数', color="yellow", linewidth=1.0, linestyle="-")
        plt.legend()
        plt.savefig(r'E:2.png', dpi=1000, bbox_inches='tight')

        # ********************* composite score vs. favourite count
        # ******* composite-score bar chart
        fig, ax5 = plt.subplots()
        plt.bar(dm_name, dm_com_score, color='red')
        plt.title('综合评分和收藏数量数据分析')
        plt.ylabel('综合评分')
        ax5.tick_params(labelsize=6)
        plt.xticks(rotation=90, color='green')

        # ******* favourite-count line chart
        ax6 = ax5.twinx()  # required for the combined (twin-axis) chart
        ax6.plot(dm_favorite, color='yellow')  # line colour; width and marker style could be set here
        plt.ylabel('收藏数(万)')
        plt.plot(1, label='综合评分', color="red", linewidth=5.0)
        plt.plot(1, label='收藏数', color="yellow", linewidth=1.0, linestyle="-")
        plt.legend()

        plt.savefig(r'E:3.png', dpi=1000, bbox_inches='tight')

        # ********************* play count vs. comment count
        # ******* play-count bar chart
        fig, ax7 = plt.subplots()
        plt.bar(dm_name, dm_play, color='cyan')
        plt.title('播放量和收藏数 数据分析')
        plt.ylabel('播放量(万)')
        ax7.tick_params(labelsize=6)
        plt.xticks(rotation=90, color='green')

        # ******* favourite-count line chart
        ax8 = ax7.twinx()  # required for the combined (twin-axis) chart
        ax8.plot(dm_favorite, color='yellow')  # line colour; width and marker style could be set here
        plt.ylabel('收藏数(万)')

        plt.plot(1, label='评论数', color="green", linewidth=5.0)
        plt.plot(1, label='收藏数', color="yellow", linewidth=1.0, linestyle="-")
        plt.legend()
        plt.savefig(r'E:4.png', dpi=1000, bbox_inches='tight')

        # ******* comment-count line chart
        # ax8 = ax7.twinx()  # required for the combined (twin-axis) chart
        # ax8.plot(dm_review, color='green')  # line colour; width and marker style could be set here
        # plt.ylabel('评论数(万)')
        # plt.plot(1, label='播放量', color="cyan", linewidth=5.0)
        # plt.plot(1, label='评论数', color="green", linewidth=1.0, linestyle="-")
        # plt.legend()
        # plt.savefig(r'E:4.png', dpi=1000, bbox_inches='tight')
        # the comment-count series still displays incorrectly
        plt.show()

    def print(self):
        print(len(self.name))
        print(len(self.bfl))
        print(len(self.pls))
        print(len(self.scs))
        print(len(self.TScore))

    def sort(self, i, j):
        temp = self.name[i]
        self.name[i] = self.name[j]
        self.name[j] = temp

        temp = self.bfl[i]
        self.bfl[i] = self.bfl[j]
        self.bfl[j] = temp

        temp = self.pls[i]
        self.pls[i] = self.pls[j]
        self.pls[j] = temp

        temp = self.scs[i]
        self.scs[i] = self.scs[j]
        self.scs[j] = temp

        temp = self.TScore[i]
        self.TScore[i] = self.TScore[j]
        self.TScore[j] = temp


def main():
    a = show()  # create the object
    a.save()  # load the data from the database
    a.print()  # print the length of each list
    a.view()  # visualise


if __name__ == '__main__':
    main()
Binary file not shown.
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = spider.settings

[deploy]
#url = http://localhost:6800/
project = spider
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,20 @@
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from time import sleep
import json

s = Service("../chromedriver.exe")
bro = webdriver.Chrome(service=s)
# open Bilibili
bro.get('https://www.bilibili.com')
bro.delete_all_cookies()  # clear existing cookies first
# 60 seconds to log in manually
sleep(60)
dictcookies = bro.get_cookies()  # read the browser cookies after logging in
jsoncookies = json.dumps(dictcookies)  # convert the cookie dicts to JSON so they can be saved

# write the cookies.txt file
with open('cookies.txt', 'w') as f:  # save to a text file
    f.write(jsoncookies)
print('cookies is ok')
@@ -0,0 +1 @@
[{"domain": ".bilibili.com", "httpOnly": false, "name": "innersign", "path": "/", "secure": false, "value": "0"}, {"domain": ".bilibili.com", "expiry": 1700969682, "httpOnly": false, "name": "i-wanna-go-back", "path": "/", "secure": false, "value": "-1"}, {"domain": ".bilibili.com", "expiry": 1684985681, "httpOnly": false, "name": "bili_jct", "path": "/", "secure": false, "value": "6c88a668c7442fa148fc9d06d6e40849"}, {"domain": ".bilibili.com", "expiry": 1700969680, "httpOnly": false, "name": "sid", "path": "/", "secure": false, "value": "qauykkrb"}, {"domain": ".bilibili.com", "expiry": 1684985681, "httpOnly": true, "name": "SESSDATA", "path": "/", "secure": true, "value": "8e595145%2C1684985681%2C785d7%2Ab1"}, {"domain": ".bilibili.com", "expiry": 1764041679, "httpOnly": false, "name": "buvid_fp", "path": "/", "secure": false, "value": "3fdd662d6b3f6d9fd8b2a5f1834013d4"}, {"domain": ".bilibili.com", "expiry": 1684985681, "httpOnly": false, "name": "DedeUserID", "path": "/", "secure": false, "value": "178665301"}, {"domain": ".bilibili.com", "expiry": 1764041638, "httpOnly": false, "name": "b_nut", "path": "/", "secure": false, "value": "1669433638"}, {"domain": ".bilibili.com", "expiry": 1700969682, "httpOnly": false, "name": "b_ut", "path": "/", "secure": false, "value": "5"}, {"domain": ".bilibili.com", "httpOnly": false, "name": "b_lsid", "path": "/", "secure": false, "value": "B3E3109EB_184B2000F25"}, {"domain": ".bilibili.com", "expiry": 1684985681, "httpOnly": false, "name": "DedeUserID__ckMd5", "path": "/", "secure": false, "value": "3f4304303449401f"}, {"domain": ".bilibili.com", "expiry": 1700969654, "httpOnly": false, "name": "_uuid", "path": "/", "secure": false, "value": "F0B6051B-46F1-CEFA-D47B-30415304BFA054324infoc"}, {"domain": ".bilibili.com", "expiry": 1764041638, "httpOnly": false, "name": "buvid3", "path": "/", "secure": false, "value": "69BFBC33-CEEC-E4A5-9E0A-7D9DB491BA0738742infoc"}, {"domain": ".bilibili.com", "expiry": 1764041638, "httpOnly": false, "name": "buvid4", "path": "/", "secure": false, "value": "7FF54BEA-5D27-6E79-874D-2092619183B623196-022112611-0kiN2FN18wB5k/cGcFypkA%3D%3D"}]
@@ -0,0 +1,21 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class VideoItem(scrapy.Item):
    title = scrapy.Field()  # video title
    view_counts = scrapy.Field()  # play count
    barrage = scrapy.Field()  # danmaku (bullet comment) count
    up = scrapy.Field()  # uploader


class BiliItem(scrapy.Item):
    title = scrapy.Field()  # video title
    view_counts = scrapy.Field()  # play count
    evaluate = scrapy.Field()  # score
    attention = scrapy.Field()  # series-follower count
    barrage = scrapy.Field()  # danmaku (bullet comment) count
@@ -0,0 +1,87 @@
import pymysql
from spider.items import VideoItem, BiliItem

# import openpyxl


# class ExcelPipeline:
#     def __int__(self):
#         self.wb = openpyxl.Workbook()
#         self.ws = self.wb.active
#         self.ws.title = 'Goods'
#         self.ws.append(('标题', '价格', '销量', '图片', '店铺', '位置'))
#
#     def close_spider(self, spider):
#         self.wb.save('商品数据.xlsx')
#
#     def process_item(self, item, spider):
#         title = item.get('title', '')  # fall back to an empty value when the field is missing
#         price = item.get('price', 0)
#         deal_count = item.get('deal_count', 0)
#         picture = item.get('picture', '')
#         location = item.get('location', '')
#         shop = item.get('shop', '')
#         self.ws.append((title, price, deal_count, picture, shop, location))
#         return item


class MysqlPipeline:
    def __init__(self):
        self.conn = pymysql.connect(host='47.106.183.36', port=3306,
                                    user='fuchuang', password='fuchuang',
                                    database='fuchuang', charset='utf8mb4')
        self.cursor = self.conn.cursor()
        self.data_bangumi = []
        self.data_video = []

    def close_spider(self, spider):
        if len(self.data_bangumi) > 0:
            self._write_to_mysql_bangumi()
            self.data_bangumi.clear()
        if len(self.data_video) > 0:
            self._write_to_mysql_video()
            self.data_video.clear()
        self.conn.commit()
        self.conn.close()

    def process_item(self, item, spider):
        if isinstance(item, VideoItem):
            title = item.get('title', '')  # fall back to an empty value when the field is missing
            view_counts = item.get('view_counts', '0')
            barrage = item.get('barrage', '0')
            up = item.get('up', '')
            self.data_video.append((title, view_counts, barrage, up))
        if isinstance(item, BiliItem):
            title = item.get('title', '')  # fall back to an empty value when the field is missing
            view_counts = item.get('view_counts', '0')
            evaluate = item.get('evaluate', '0')
            attention = item.get('attention', '0')
            barrage = item.get('barrage', '0')
            self.data_bangumi.append((title, view_counts, evaluate, attention, barrage))
        # flush to MySQL in batches of 20
        if len(self.data_bangumi) >= 20:
            self._write_to_mysql_bangumi()
            self.data_bangumi.clear()
        if len(self.data_video) >= 20:
            self._write_to_mysql_video()
            self.data_video.clear()
        return item

    def _write_to_mysql_bangumi(self):
        for item in self.data_bangumi:
            self.cursor.execute(
                'insert into bangumi (title, view_counts, evaluate, attention, barrage) values (%s, %s, %s, %s, %s)',
                item
            )
        self.conn.commit()

    def _write_to_mysql_video(self):
        for item in self.data_video:
            self.cursor.execute(
                'insert into video (title, view_counts, barrage, up) values (%s, %s, %s, %s)',
                item
            )
        self.conn.commit()
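The pipeline assumes the bangumi and video tables already exist. The diff does not include the DDL, so the following is only a sketch inferred from the two insert statements; the column types and lengths are assumptions:

import pymysql

conn = pymysql.connect(host='47.106.183.36', port=3306, user='fuchuang',
                       password='fuchuang', database='fuchuang', charset='utf8mb4')
with conn.cursor() as cur:
    # columns mirror the insert statements in MysqlPipeline; types are guesses
    cur.execute('create table if not exists bangumi ('
                'id int auto_increment primary key, title varchar(255), '
                'view_counts varchar(64), evaluate varchar(64), '
                'attention varchar(64), barrage varchar(64))')
    cur.execute('create table if not exists video ('
                'id int auto_increment primary key, title varchar(255), '
                'view_counts varchar(64), barrage varchar(64), up varchar(255))')
conn.commit()
conn.close()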
@@ -0,0 +1,101 @@
BOT_NAME = 'spider'

SPIDER_MODULES = ['spider.spiders']
NEWSPIDER_MODULE = 'spider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'spider (+http://www.yourdomain.com)'

# scrapy-redis
# request de-duplication
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# use the scrapy_redis scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# allow the crawl to be paused and resumed
SCHEDULER_PERSIST = True

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs

DOWNLOAD_DELAY = 3  # delay between requests (seconds)
RANDOMIZE_DOWNLOAD_DELAY = True  # randomise the download delay

# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'spider.middlewares.SpiderSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'spider.middlewares.SpiderDownloaderMiddleware': 543,
    # 'spider.middlewares.ProxyMiddleware': 600,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'spider.pipelines.ExcelPipeline': 300,
    'spider.pipelines.MysqlPipeline': 300,
    # the scrapy_redis pipeline
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
@@ -0,0 +1,54 @@
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy_redis.spiders import RedisCrawlSpider
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy import Request

from spider.items import VideoItem, BiliItem


class BiliSpider(RedisCrawlSpider):
    name = 'Bili'
    redis_key = 'Bili'

    rules = [
        Rule(LinkExtractor(allow=r"https://www.bilibili.com/bangumi/.*?"), callback='parse_Item', follow=True),
        Rule(LinkExtractor(allow=r"https://www.bilibili.com/video/BV.*?"), callback='parse_Videoitem', follow=True),
        Rule(LinkExtractor(allow=r"https://www.bilibili.com/bangumi/media/md.*?"), callback='parse_BiliItem',
             follow=True),
    ]

    def parse_Videoitem(self, response, **kwargs):
        Video_item = VideoItem()
        Video_item['title'] = response.xpath('//*[@id="viewbox_report"]/h1/@title').extract()[0]
        Video_item['view_counts'] = str(
            response.xpath('//*[@id="viewbox_report"]/div/div/span[1]/@title').extract()[0]).replace("总播放数", "")
        Video_item['barrage'] = str(
            response.xpath('//*[@id="viewbox_report"]/div/div/span[2]/@title').extract()[0]).replace(
            "历史累计弹幕数", "")
        Video_item['up'] = str(response.xpath('//*[@id="v_upinfo"]/div[2]/div[1]/a[1]/text()').extract()[0]).replace(
            "\\n",
            "").strip()
        yield Video_item

    def parse_BiliItem(self, response, **kwargs):
        bangumi_item = BiliItem()
        bangumi_item['title'] = response.xpath(
            '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[1]/span[1]/text()').extract()[0]
        bangumi_item['view_counts'] = response.xpath(
            '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[1]/em/text()').extract()[0]
        bangumi_item['attention'] = response.xpath(
            '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[2]/em/text()').extract()[0]
        bangumi_item['barrage'] = response.xpath(
            '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[3]/em/text()').extract()[0]
        bangumi_item['evaluate'] = response.xpath(
            '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[2]/div/div[1]/text()').extract()[0]
        yield bangumi_item

    def parse_Item(self, response, **kwargs):
        url = 'https:' + response.xpath('//*[@id="media_module"]/div/a/@href').extract()[0]
        yield Request(url=url, callback=self.parse_BiliItem)
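Because BiliSpider is a scrapy_redis RedisCrawlSpider, it idles until a seed URL is pushed onto the Redis key named by redis_key ('Bili'). A minimal seeding sketch; the Redis host/port and the start URL are assumptions, since the diff does not show a REDIS_URL setting:

import redis

# connect to the Redis instance the scrapy_redis scheduler uses (host/port assumed)
r = redis.Redis(host='localhost', port=6379)
# the spider pops its start URLs from the key named by BiliSpider.redis_key
r.lpush('Bili', 'https://www.bilibili.com')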
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,24 @@
import json

from selenium import webdriver


def create_chrome_driver(*, headless=False):
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument('--headless')  # run without a visible browser window
    options.add_experimental_option('excludeSwitches', ['enable-automation'])  # hide the hint that the browser is driven by Selenium
    options.add_experimental_option('useAutomationExtension', False)
    browser = webdriver.Chrome(options=options, executable_path=r'chromedriver.exe')
    browser.execute_cdp_cmd(
        'Page.addScriptToEvaluateOnNewDocument',
        {'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'}  # mask the navigator.webdriver flag
    )
    return browser


def add_cookies(browser, cookie_file):
    with open(cookie_file, 'r') as file:
        cookies_list = json.loads(file.read())
    for cookie_dict in cookies_list:
        browser.add_cookie(cookie_dict)
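A short usage sketch of these helpers together with the cookies.txt produced by the login script above. The module name 'utils' and the target URL are assumptions; note that Selenium only accepts a cookie after the browser is already on the matching domain:

from utils import create_chrome_driver, add_cookies  # module name assumed

browser = create_chrome_driver(headless=True)
browser.get('https://www.bilibili.com')  # visit the domain first so .bilibili.com cookies are accepted
add_cookies(browser, 'cookies.txt')
browser.get('https://www.bilibili.com')  # reload, now with the logged-in session
print(browser.title)
browser.quit()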