import csv
import json
import os
import re

import pandas as pd
import requests
from fake_useragent import UserAgent
from lxml import etree
from pymysql import connect  # only connect() is used; avoid the wildcard import
from sqlalchemy import create_engine

import 优化ip代理池 as Optimize  # local helper module: an optimized IP proxy pool
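# Assumption (inferred from how it is called below): Optimize.startup() returns a
# requests-compatible proxies mapping, e.g. {'http': 'http://host:port'}, for a
# proxy that has already passed a liveness check.
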
engine = create_engine('mysql+pymysql://root:1234@localhost:3306/dbmovie')


class spider(object):

    def __init__(self):
        self.name = '日本'  # target region folder name ('Japan')
        # Read the Excel file that holds the collected movie page URLs
        file_path = f'豆瓣电影网站/{self.name}/{self.name}电影网站(整合后).xlsx'
        self.df = pd.read_excel(file_path, engine='openpyxl')
        # The URLs live in the '电影网站' (movie site) column; convert to a list for iteration
        self.urls = self.df['电影网站'].tolist()

        # Request headers are generated per request by request_headers()

        # Static fallback proxy list (the proxy-pool module is preferred at runtime)
        self.proxies = [
            {'http': 'http://180.121.130.208:8089'},
            {'http': 'http://114.106.147.14:8089'},
            {'http': 'http://117.86.9.250:8089'},
            {'http': 'http://114.231.45.235:8089'},
            {'http': 'http://114.132.202.80:8080'},
            # HTTPS proxies can be added the same way:
            # {'https': 'https://example.com:port'},
        ]
        # A verified random proxy could also be cached once here:
        # self.success_proxies = Optimize.startup()
    def request_headers(self):
        # Generate a random browser-like User-Agent
        user_agent = UserAgent().random
        headers = {
            "User-Agent": user_agent,
            # A logged-in Douban cookie can be added here if rate limits bite:
            # "Cookie": 'll="118313"; bid=QVNFc5M31Ds; _pk_id.100001.4cf6=b91d85f3dfe7a18f.1708781347.; _vwo_uuid_v2=D41D2EAD2A7C867B2EF7CAA05192E9D9B|0d7f976cadeba2dd51716ba2b90223b7; viewed="1866298_20396037"; __utmz=223695111.1711160671.9.2.utmcsr=search.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/movie/subject_search; push_noty_num=0; push_doumail_num=0; __utmz=30149280.1711182158.14.5.utmcsr=help.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=30149280.27921; douban-fav-remind=1; dbcl2="279216488:y4XIrGbz4fQ"; ck=rkAR; frodotk_db="2e17d8cc08294f6a8a478a64187bee3e"; ap_v=0,6.0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1712647865%2C%22https%3A%2F%2Fsearch.douban.com%2Fmovie%2Fsubject_search%3Fsearch_text%3D%E5%8A%A8%E6%BC%AB%26cat%3D1002%26start%3D60%22%5D; _pk_ses.100001.4cf6=1; __utma=30149280.1472987265.1708781347.1712582234.1712647866.102; __utmb=30149280.0.10.1712647866; __utmc=30149280; __utma=223695111.718378847.1708781347.1712582234.1712647866.100; __utmb=223695111.0.10.1712647866; __utmc=223695111'
        }
        return headers
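
    # Note: fake_useragent may download its User-Agent database on first use and
    # cache it locally, so UserAgent() can be slow (or fail offline); constructing
    # it once in __init__ would avoid paying that cost on every request.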
    def init(self):
        # Create the temp CSV with its header row on first run
        if not os.path.exists('./tempData.csv'):
            with open('./tempData.csv', 'w', newline='') as writer_f:
                writer = csv.writer(writer_f)
                writer.writerow(
                    ['directors', 'rate', 'title', 'casts', 'cover', 'detailLink', 'year', 'types', 'country', 'lang',
                     'time', 'movieTime', 'comment_len', 'starts', 'summary', 'comments', 'imgList', 'movieUrl']
                )
                # Column meanings: director, rating, title, cast, cover image,
                # detail-page link, year, genres, country, language, release date,
                # runtime, short-comment count, star-rating distribution, synopsis,
                # short comments (user, rating, time, content), image list, trailer link

        # Create the crawl-progress checkpoint file on first run
        if not os.path.exists('./spiderPage.txt'):
            with open('./spiderPage.txt', 'w', encoding="utf-8") as f:
                f.write('0')
        try:
            conn = connect(host='localhost', user='root', password='1234', database='dbmovie', port=3306,
                           charset='utf8mb4')
            sql = '''
                create table if not exists movie (
                    id int primary key auto_increment,
                    directors VARCHAR(255),
                    rate VARCHAR(255),
                    title VARCHAR(255),
                    casts text,
                    cover VARCHAR(255),
                    detailLink VARCHAR(255),
                    year VARCHAR(255),
                    types VARCHAR(255),
                    country VARCHAR(255),
                    lang VARCHAR(255),
                    time VARCHAR(255),
                    movieTime VARCHAR(255),
                    comment_len VARCHAR(255),
                    starts VARCHAR(255),
                    summary text,
                    comments text,
                    imgList text,
                    movieUrl VARCHAR(255)
                )
            '''
            cursor = conn.cursor()
            cursor.execute(sql)
            conn.commit()
        except Exception as e:
            # Table creation is best-effort; report the reason instead of failing silently
            print(f"Skipping table creation: {e}")
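
    # get_page/set_page implement a simple resume checkpoint: the index of the last
    # successfully crawled URL is persisted to spiderPage.txt, so a crash or restart
    # continues from the next site instead of re-crawling everything from scratch.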
    def get_page(self):
        try:
            with open('./spiderPage.txt', 'r') as r_f:
                return int(r_f.read().strip())
        except FileNotFoundError:
            return 0  # no checkpoint file yet: start from index 0

    def set_page(self, newPage):
        with open('./spiderPage.txt', 'w') as a_f:
            a_f.write(str(newPage))
    def spiderMain(self):

        def xpathelement(xpath, value=None):
            # Helper (currently unused): with no `value`, return the first matched
            # node's text; if `value` is a string, treat it as an attribute name
            # and return that attribute's value. Relies on `tree` being set below.
            text_xpath = tree.xpath(xpath)
            if not value:
                return text_xpath[0].text
            if isinstance(value, str):
                return tree.xpath(xpath + '/@{}'.format(value))[0]
            return text_xpath

        max_retries = 99999  # effectively: keep retrying until everything is crawled
        retries = 0
        while retries < max_retries:
            # Resume from the checkpoint: last finished index + 1
            start_page = self.get_page() + 1
            if start_page > len(self.urls):
                print("All sites have been crawled!")
                return

            print(f'Starting the crawl from site #{start_page}')

            try:
                # Continue from where the last run stopped
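                # On any exception, the except-block at the bottom bumps `retries` and
                # the while-loop re-reads the checkpoint, so only the site that failed
                # is attempted again; finished sites are never re-crawled.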
                for index, url in enumerate(self.urls[start_page - 1:], start=start_page):
                    headers = self.request_headers()
                    # Fetch a verified random proxy from the pool
                    success = Optimize.startup()
                    # proxies = random.choice(self.proxies)  # static-list alternative

                    resultData = []
                    # url = 'https://movie.douban.com/subject/36868913/'  # single-page debug

                    print(url)
                    # time.sleep(1)  # optional politeness delay between requests
                    # Fetch the page; the timeout stops a dead proxy from hanging forever
                    response = requests.get(url=url, headers=headers, proxies=success, timeout=15)
                    context = response.text
                    # Parse the returned HTML with lxml
                    tree = etree.HTML(context)
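
                    # Note: the append order below must stay in sync with the CSV header
                    # written in init(); each site contributes exactly one row.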
                    if response.status_code == 200:
                        # Director(s)
                        directors = tree.xpath('//*[@id="info"]//a[@rel="v:directedBy"]')
                        if directors:
                            directors_list = []
                            for i in directors:
                                directors_list.append(i.text)
                            resultData.append(','.join(directors_list))
                        else:
                            resultData.append(0)

                        # Rating
                        rate = tree.xpath('//*[@id="interest_sectl"]/div/div[2]/strong')
                        if rate and rate[0].text:
                            rate_txt = rate[0].text.strip()
                            resultData.append(rate_txt)
                        else:
                            resultData.append(0)

                        # Title
                        title = tree.xpath('//*[@id="content"]/h1//span[@property="v:itemreviewed"]')
                        title_txt = title[0].text
                        print(title_txt)
                        resultData.append(title_txt)

                        # Cast: every <a rel="v:starring"> under the info block
                        actors_links = tree.xpath('//*[@id="info"]//a[@rel="v:starring"]')
                        if actors_links:
                            actors = []
                            for link in actors_links:
                                text = link.text
                                # Skip links with missing or blank text
                                if text is not None:
                                    trimmed_text = text.strip()
                                    if trimmed_text:
                                        actors.append(trimmed_text)
                            resultData.append(','.join(actors))
                        else:
                            resultData.append(0)

                        # Cover image
                        cover_img_src = tree.xpath('//*[@id="mainpic"]/a/img/@src')
                        if cover_img_src:
                            resultData.append(cover_img_src[0])
                        else:
                            resultData.append(0)

                        # Detail-page link
                        detailLink = url
                        resultData.append(detailLink)

                        # Year: the heading span looks like "(2024)"; keep the digits only
                        year = tree.xpath('//*[@id="content"]/h1/span[2]')
                        if year:
                            year_match = re.search(r'\d+', year[0].text)
                            resultData.append(year_match.group() if year_match else 0)
                        else:
                            resultData.append(0)

                        # Genres
                        types = tree.xpath('//*[@id="info"]/span[@property="v:genre"]')
                        if types:
                            types_list = []
                            for i in types:
                                types_list.append(i.text)
                            resultData.append(','.join(types_list))
                        else:
                            resultData.append(0)

                        # Country: plain text nodes of the info block; after dropping
                        # blanks and '/' separators, texts[0] is the country field and
                        # texts[1] the language field
                        textInfo = tree.xpath('//*[@id="info"]/text()')
                        texts = []
                        if textInfo:
                            for i in textInfo:
                                if i.strip() and not i.strip() == '/':
                                    texts.append(i)
                            if len(texts) > 0 and texts[0]:
                                country_info = texts[0].split('/')
                                cleaned_countries = [i.strip() for i in country_info]
                                resultData.append(','.join(cleaned_countries))
                            else:
                                resultData.append(0)
                        else:
                            resultData.append(0)

                        # Language
                        if textInfo and len(texts) > 1 and texts[1]:
                            lang_info = texts[1].split(sep='/')
                            cleaned_lang = [i.strip() for i in lang_info]
                            resultData.append(','.join(cleaned_lang))
                        else:
                            resultData.append(0)

                        # Release date
                        time_movie = tree.xpath('//*[@id="info"]/span[@property = "v:initialReleaseDate"]')
                        if time_movie:
                            time_list = []
                            for i in time_movie:
                                time_list.append(i.text)
                            # Drop the parenthesised region note, e.g. "2024-07-05(中国大陆)"
                            time_str = re.sub(r'\(.*?\)', '', time_list[0])
                            # Match YYYY-MM-DD / YYYY-M-D style dates
                            match = re.search(r'\b\d{4}-(?:\d{2}|\d)-(?:\d{2}|\d)\b', time_str)
                            if match:
                                resultData.append(match.group())
                            else:
                                # Fall back to a bare 4-digit year
                                match = re.search(r'\d{4}', time_str)
                                if match:
                                    resultData.append(match.group())
                                else:
                                    resultData.append(0)
                        else:
                            resultData.append(0)

                        # Runtime (minutes, from the @content attribute)
                        movieTime = tree.xpath('//*[@id="info"]/span[@property="v:runtime"]/@content')
                        if movieTime:
                            resultData.append(movieTime[0])
                        else:
                            resultData.append(0)

                        # Short-comment count: pull the digits out of the heading link
                        comment_len = tree.xpath('//*[@id="comments-section"]/div[1]/h2/span/a')
                        if comment_len:
                            len_match = re.search(r'\d+', comment_len[0].text)
                            resultData.append(len_match.group() if len_match else 0)
                        else:
                            resultData.append(0)

                        # Star-rating distribution (percentage per star level)
                        starts = tree.xpath('//*[@id="interest_sectl"]/div/div[3]/div[@class="item"]')
                        if starts:
                            starts_list = []
                            for i in starts:
                                # The second span of each item holds the percentage text
                                span_tag = i.xpath('.//span[2]')[0].text
                                starts_list.append(span_tag)
                            resultData.append(','.join(starts_list))
                        else:
                            resultData.append(0)

                        # Synopsis
                        summary = tree.xpath('//*[@id="link-report-intra"]/span/text()')
                        if summary:
                            summary_str = ''
                            for i in summary:
                                summary_str += i.strip()
                            resultData.append(summary_str)
                        else:
                            resultData.append(0)

                        # Short comments (user, rating, time, content)
                        comments = tree.xpath('//*[@id="hot-comments"]/div')
                        if comments:
                            comments_list = []
                            for i in comments:
                                # Commenter
                                user = i.xpath('.//h3/span[2]/a')[0].text
                                # Rating: four spans in the header means a rating span is present
                                start_classes = i.xpath('.//h3/span[2]/span')
                                start = 0  # default when no rating can be extracted
                                if len(start_classes) == 4:
                                    class_attributes = i.xpath('.//h3/span[2]/span[2]/@class')
                                    # The class looks like "allstar40 rating"; skip the
                                    # "comment-time-tip" span that appears instead when unrated
                                    if class_attributes and class_attributes[0] != 'comment-time-tip':
                                        match = re.search(r'\d+', class_attributes[0])
                                        if match:
                                            start = match.group()
                                    time_pl = i.xpath('.//h3/span[2]/span[3]/@title')[0]
                                else:
                                    # No rating span: the time span moves up one position
                                    time_pl = i.xpath('.//h3/span[2]/span[2]/@title')[0]
                                # Comment body
                                content = i.xpath('.//div/p/span')[0].text
                                comments_list.append({
                                    'user': user,
                                    'start': start,
                                    'time': time_pl,
                                    'content': content
                                })
                            resultData.append(json.dumps(comments_list))
                        else:
                            resultData.append(0)

                        # Image list: collect related pictures, with the trailer's
                        # preview image (if any) placed first
                        img_urls = []
                        # <a class="related-pic-video"> carries its preview image in an
                        # inline style attribute: background-image:url(...)
                        a_tags = tree.xpath('//*[@id="related-pic"]/ul//a[@class="related-pic-video"]')
                        found_video_image = False
                        for a_tag in a_tags:
                            style_attr = a_tag.get('style')
                            if style_attr:
                                # Cut the URL out of "...url(<link>)..."
                                start_index = style_attr.find('url(') + 4
                                end_index = style_attr.find(')', start_index)
                                if start_index != -1 and end_index != -1:
                                    img_url = style_attr[start_index:end_index]
                                    img_urls.insert(0, img_url)  # trailer image goes first
                                    found_video_image = True
                                    break

                        # No trailer preview found: keep a '0' placeholder so the
                        # comma-joined field stays well-formed
                        if not found_video_image:
                            img_urls.append('0')

                        # Regular related pictures
                        imgList = tree.xpath('//*[@id="related-pic"]/ul//a/img/@src')
                        if imgList:
                            img_urls.extend(imgList)
                        else:
                            img_urls.append('0')

                        # Join into one comma-separated field
                        resultData.append(','.join(img_urls))

                        # Trailer link
                        result = tree.xpath('//*[@id="related-pic"]/ul//a[@class="related-pic-video"]/@href')
                        if result:
                            movieUrl = result[0]
                            resultData.append(movieUrl)
                            # Optionally, the trailer page could be fetched to resolve the
                            # real video source (disabled):
                            # response_sp = requests.get(url=movieUrl, headers=headers, proxies=success)
                            # tree_sp = etree.HTML(response_sp.text)
                            # movie_sources = tree_sp.xpath('//video/source/@src')
                            # if movie_sources:
                            #     resultData.append(movie_sources[0])
                        else:
                            resultData.append(0)  # no trailer found

                    print(resultData)

                    if self.save_to_csv(resultData):
                        # Row saved: advance the checkpoint to this site's index
                        self.set_page(index)
                        print(f'Successfully crawled site #{index}')
                        print()

            except Exception as e:
                print(f"Error while crawling: {e}")
                retries += 1
                if retries < max_retries:
                    print(f"Retrying ({retries}/{max_retries})...")
                    print()
                    # time.sleep(1.333)  # optional back-off to avoid hammering the site
                # else:
                #     print("Maximum retries reached; exiting")
                #     raise

    def save_to_csv(self, rowData):
        with open('./tempData.csv', 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(rowData)
        return True

    def save_to_sql(self, df):
        df.to_sql('movie', con=engine, index=False, if_exists='append')
        print('Data import complete')
        # df.to_sql('movie', con=engine, index=False, if_exists='append', method='multi')
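
    # Note (pandas behaviour): if_exists='append' inserts into the existing `movie`
    # table and creates it when missing; nothing is deduplicated against rows already
    # in the database, so clear_csv() below only dedupes within one CSV batch.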
    def clear_csv(self):
        df = pd.read_csv('./2024.csv')
        df.dropna(inplace=True)  # drop rows with missing values
        df.drop_duplicates(inplace=True)  # drop duplicate rows in place
        self.save_to_sql(df)

    def delete_csv_file(self):
        # tempData.csv is assumed to live in the current working directory
        file_path = 'tempData.csv'
        if os.path.exists(file_path):
            try:
                os.remove(file_path)
                print(f"File {file_path} deleted successfully.")
            except OSError as e:
                print(f"Error deleting file {file_path}: {e.strerror}")
        else:
            print(f"File {file_path} does not exist; nothing to delete.")


if __name__ == '__main__':
    spiderObj = spider()
    spiderObj.init()
    spiderObj.spiderMain()
    # spiderObj.clear_csv()