
import csv
import json
import os
import re
import pandas as pd
from pymysql import connect
import requests
from fake_useragent import UserAgent
from lxml import etree
from sqlalchemy import create_engine
import 优化ip代理池 as Optimize

engine = create_engine('mysql+pymysql://root:1234@localhost:3306/dbmovie')
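
# Assumptions behind the setup above: the DSN expects a local MySQL server with a
# `dbmovie` database and root/1234 credentials, and 优化ip代理池 (imported as Optimize)
# is a local helper module whose startup() is expected to return a tested proxy
# mapping that requests can use directly, e.g. {'http': 'http://host:port'}.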

class spider(object):
    def __init__(self):
        self.name = '日本'
        # Read the Excel file that holds the movie page URLs
        file_path = f'豆瓣电影网站/{self.name}/{self.name}电影网站(整合后).xlsx'
        self.df = pd.read_excel(file_path, engine='openpyxl')
        # The URLs live in the '电影网站' column
        self.urls = self.df['电影网站'].tolist()  # convert to a list for easy iteration
        # Request headers
        # self.headers()
        self.proxies = [
            {'http': 'http://180.121.130.208:8089'},
            {'http': 'http://114.106.147.14:8089'},
            {'http': 'http://117.86.9.250:8089'},
            {'http': 'http://114.231.45.235:8089'},
            {'http': 'http://114.132.202.80:8080'},
            # HTTPS proxies can be added the same way:
            # {'https': 'https://example.com:port'},
        ]
        # Random proxy that passed the connectivity test
        # self.success_proxies = Optimize.startup()

    def request_headers(self):
        # Generate a random browser-like User-Agent
        UserAgents = UserAgent().random
        # Build the request headers
        headers = {
            "User-Agent": f"{UserAgents}",
# "Cookie": 'll="118313"; bid=QVNFc5M31Ds; _pk_id.100001.4cf6=b91d85f3dfe7a18f.1708781347.; _vwo_uuid_v2=D41D2EAD2A7C867B2EF7CAA05192E9D9B|0d7f976cadeba2dd51716ba2b90223b7; viewed="1866298_20396037"; __utmz=223695111.1711160671.9.2.utmcsr=search.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/movie/subject_search; push_noty_num=0; push_doumail_num=0; __utmz=30149280.1711182158.14.5.utmcsr=help.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=30149280.27921; douban-fav-remind=1; dbcl2="279216488:y4XIrGbz4fQ"; ck=rkAR; frodotk_db="2e17d8cc08294f6a8a478a64187bee3e"; ap_v=0,6.0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1712647865%2C%22https%3A%2F%2Fsearch.douban.com%2Fmovie%2Fsubject_search%3Fsearch_text%3D%E5%8A%A8%E6%BC%AB%26cat%3D1002%26start%3D60%22%5D; _pk_ses.100001.4cf6=1; __utma=30149280.1472987265.1708781347.1712582234.1712647866.102; __utmb=30149280.0.10.1712647866; __utmc=30149280; __utma=223695111.718378847.1708781347.1712582234.1712647866.100; __utmb=223695111.0.10.1712647866; __utmc=223695111'
        }
        return headers

    def init(self):
        if not os.path.exists('./tempData.csv'):
            with open('./tempData.csv', 'w', newline='') as writer_f:
                writer = csv.writer(writer_f)
                writer.writerow(
['directors', 'rate', 'title', 'casts', 'cover', 'detailLink', 'year', 'types', 'country', 'lang',
'time', 'movieTime', 'comment_len', 'starts', 'summary', 'comments', 'imgList', 'movieUrl']
)
                # writer.writerow(
                #     ['director', 'rating', 'title', 'cast', 'cover', 'detail link',
                #      'year', 'genres', 'country', 'language', 'release date', 'runtime',
                #      'short-comment count', 'star distribution', 'synopsis',
                #      'short comments (user, rating, time, content)', 'image list',
                #      'trailer link']
                # )
        if not os.path.exists('./spiderPage.txt'):
            with open('./spiderPage.txt', 'w', encoding="utf-8") as f:
                f.write('0\r')
        try:
            conn = connect(host='localhost', user='root', password='1234', database='dbmovie', port=3306,
                           charset='utf8mb4')
            sql = '''
create table movie (
id int primary key auto_increment,
directors VARCHAR(255),
rate VARCHAR(255),
title VARCHAR(255),
casts text,
cover VARCHAR(255),
detailLink VARCHAR(255),
year VARCHAR(255),
types VARCHAR(255),
country VARCHAR(255),
lang VARCHAR(255),
time VARCHAR(255),
movieTime VARCHAR(255),
comment_len VARCHAR(255),
starts VARCHAR(255),
summary text,
comments text,
imgList text,
movieUrl VARCHAR(255)
)
'''
            cursor = conn.cursor()
            cursor.execute(sql)
            conn.commit()
        except Exception:
            # Most likely the table already exists; ignore and carry on
            pass

    def get_page(self):
        try:
            with open('./spiderPage.txt', 'r') as r_f:
                return int(r_f.read().strip())
        except FileNotFoundError:
            return 0  # no progress file yet: start from index 0

    def set_page(self, newPage):
        with open('./spiderPage.txt', 'w') as a_f:
            a_f.write(str(newPage))
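
    # spiderMain drives the whole crawl: it resumes from the index saved in
    # spiderPage.txt, fetches each remaining detail page with a fresh User-Agent
    # and a proxy from Optimize.startup(), extracts the fields listed in the CSV
    # header via XPath, and appends one row per film to tempData.csv before
    # advancing the saved index.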
    def spiderMain(self):
        # Small XPath convenience helper (defined here but never called below;
        # it relies on `tree` from the enclosing scope at call time)
        def xpathelement(xpath, value=None):
            text_xpath = tree.xpath(xpath)
            # No attribute name supplied
            if not value:
                # Return the text of the first matched element
                text = text_xpath[0].text
                # print(text)
                return text
            else:
                if isinstance(value, str):  # value is an attribute name
                    # Append /@attr and return that attribute of the first match
                    xpath = xpath + '/@{}'.format(value)
                    # print(xpath)
                    text_xpath = tree.xpath(xpath)[0]
                    return text_xpath
                else:
                    return text_xpath

        max_retries = 99999  # maximum number of retries
        retries = 0
        while retries < max_retries:
            start_page = self.get_page() + 1  # resume one past the last crawled index
            if start_page > len(self.urls):
                print("All sites have been crawled!")
                return
            print(f'Resuming the crawl from site #{start_page}')
            try:
                # Continue from where the previous run stopped
                for index, url in enumerate(self.urls[start_page - 1:], start=start_page):
                    headers = self.request_headers()
                    success = Optimize.startup()
                    # proxies = random.choice(self.proxies)  # pick a random proxy from the list
                    resultData = []
                    # url = 'https://movie.douban.com/subject/36868913/'
                    print(url)
                    # Optional 1-second delay after each request
                    # time.sleep(1)
                    # Send the request
                    response = requests.get(url=url, headers=headers, proxies=success)
                    # Raw HTML returned by the site
                    context = response.text
                    # Parse the HTML with lxml's etree
                    tree = etree.HTML(context)
                    if response.status_code == 200:
                        # Handle the successful response
                        # Director(s)
                        directors = tree.xpath('//*[@id="info"]//a[@rel="v:directedBy"]')
                        if directors:
                            directors_list = []
                            for i in directors:
                                directors_list.append(i.text)
                            resultData.append(','.join(directors_list))
                        else:
                            resultData.append(0)
                        # Rating
                        rate = tree.xpath('//*[@id="interest_sectl"]/div/div[2]/strong')
                        if rate and rate[0].text:
                            rate_txt = rate[0].text.strip()  # strip any surrounding whitespace
                            resultData.append(rate_txt)
                        else:
                            resultData.append(0)
                        # Title  //*[@id="content"]/h1/span[1]
                        title = tree.xpath('//*[@id="content"]/h1//span[@property="v:itemreviewed"]')
                        title_txt = title[0].text
                        print(title_txt)
                        resultData.append(title_txt)
                        # Cast
                        # Parent span holding the target spans: //*[@id="info"]/span[2]/span[2]
                        actors_links = tree.xpath('//*[@id="info"]//a[@rel="v:starring"]')
                        # print(actors_links)
                        if actors_links:
                            # Collect the actors' names
                            actors = []
                            # Walk every link element
                            for link in actors_links:
                                # Text content of the link
                                text = link.text
                                # print(f"Link element: {link}")
                                # print(f"Text content: {text}")
                                # Skip links whose text is None
                                if text is not None:
                                    # Trim surrounding whitespace
                                    trimmed_text = text.strip()
                                    # Keep non-empty names only
                                    if trimmed_text:
                                        actors.append(trimmed_text)
                                else:
                                    # text is None: move on to the next link
                                    continue
                            resultData.append(','.join(actors))
                        else:
                            resultData.append(0)
                        # Cover image
                        # cover_img_src = tree.xpath('//*[@id="mainpic"]/a/img/@src')[0]
                        cover_img_src = tree.xpath('//*[@id="mainpic"]/a/img/@src')
                        if cover_img_src:
                            resultData.append(cover_img_src[0])
                        else:
                            resultData.append(0)
                        # Detail page link
                        detailLink = url
                        resultData.append(detailLink)
                        # Year
                        # year = tree.xpath('//*[@id="content"]/h1/span[2]/text()')
                        # print(year)
                        year = tree.xpath('//*[@id="content"]/h1/span[2]')
                        if year:
                            # Pull the digits out of the parenthesised year
                            year_without_parentheses = re.search(r'\d+', year[0].text).group()
                            # year_without_parentheses = re.sub(r'\(|\)', '', year)
                            resultData.append(year_without_parentheses)
                        else:
                            resultData.append(0)
                        # Genres
                        # types = tree.xpath('//span[contains(@property,"v:genre")]')
                        # types = tree.xpath('//*[@id="info"]/span[contains(@property,"v:genre")]')
                        types = tree.xpath('//*[@id="info"]/span[@property="v:genre"]')
                        if types:
                            types_list = []
                            for i in types:
                                types_list.append(i.text)
                            resultData.append(','.join(types_list))
                        else:
                            resultData.append(0)
                        # Country
                        textInfo = tree.xpath('//*[@id="info"]/text()')
                        texts = []
                        if textInfo:
                            for i in textInfo:
                                if i.strip() and not i.strip() == '/':
                                    texts.append(i)
                            # print(texts)  # inspect the cleaned text nodes
                            # texts[0] is expected to hold the country info
                            if len(texts) > 0:
                                if texts[0]:
                                    country_info = texts[0].split('/')  # split the country list
                                    cleaned_countries = [i.strip() for i in country_info]  # strip each name
                                    resultData.append(','.join(cleaned_countries))  # join and store
                                else:
                                    resultData.append(0)
                            else:
                                resultData.append(0)
                        else:
                            resultData.append(0)
                        # Language
                        if textInfo:
                            if len(texts) > 1:  # make sure texts has at least two entries
                                if texts[1]:
                                    lang_info = texts[1].split(sep='/')  # split the language list
                                    cleaned_lang = [i.strip() for i in lang_info]  # strip each language
                                    resultData.append(','.join(cleaned_lang))  # join and store
                                else:
                                    resultData.append(0)
                            else:
                                resultData.append(0)
                        else:
                            resultData.append(0)
                        # Release date
                        time_movie = tree.xpath('//*[@id="info"]/span[@property = "v:initialReleaseDate"]')
                        if time_movie:
                            time_list = []
                            for i in time_movie:
                                time_list.append(i.text)
                            # time_one = time_list[0][:10]
                            # time_one = re.search(r'\d+', time_list[0]).group()
                            # Strip anything inside parentheses
                            time_str = re.sub(r'\(.*?\)', '', time_list[0])
                            # Match dates in YYYY-MM-DD, YYYY-M-DD or YYYY-M-D form
                            match = re.search(r'\b\d{4}-(?:\d{2}|\d)-(?:\d{2}|\d)\b', time_str)
                            if match:
                                resultData.append(match.group())
                            else:
                                # No full date: fall back to a bare 4-digit year
                                match = re.search(r'\d{4}', time_str)
                                if match:
                                    resultData.append(match.group())
                                else:
                                    # Nothing matched: store 0
                                    resultData.append(0)
                        else:
                            resultData.append(0)
                        # Runtime
                        movieTime = tree.xpath('//*[@id="info"]/span[@property="v:runtime"]/@content')
                        if movieTime:
                            resultData.append(movieTime[0])
                        else:
                            resultData.append(0)
                        # Number of short comments
                        comment_len = tree.xpath('//*[@id="comments-section"]/div[1]/h2/span/a')
                        if comment_len:
                            comment_len_txt = re.search(r'\d+', comment_len[0].text).group()
                            resultData.append(comment_len_txt)
                        else:
                            resultData.append(0)
                        # Star-rating distribution  //*[@id="interest_sectl"]/div/div[3]/div[1]/span[2]
                        starts = tree.xpath('//*[@id="interest_sectl"]/div/div[3]/div[@class="item"]')
                        # starts = tree.xpath('//*[@id="interest_sectl"]//div[@class="ratings-on-weight"]/div[@class="item"]')
                        # starts = tree.xpath('//div[@id="interest_sectl"]//div[@class="ratings-on-weight"]/div[@class="item"]')
                        if starts:
                            starts_list = []
                            for i in starts:
                                # span_html = etree.tostring(i, pretty_print=True, encoding='unicode')
                                # print(span_html)  # inspect the raw HTML of each item
                                # The percentage sits in the span whose class starts with "rating_per"
                                # span_tag = i.xpath('.//span[@class="rating_per"]/text()')
                                # span_tag = i.xpath('.//span[@class="rating_per"]')[0].text
                                span_tag = i.xpath('.//span[2]')[0].text
                                starts_list.append(span_tag)
                            resultData.append(','.join(starts_list))
                        else:
                            resultData.append(0)
                        # Synopsis
                        summary = tree.xpath('//*[@id="link-report-intra"]/span/text()')
                        if summary:
                            summary_str = ''
                            for i in summary:
                                summary_str += i.strip()
                                # print(i.strip())
                            resultData.append(summary_str)
                        else:
                            resultData.append(0)
                        # Short comments (user, rating, time, content)
                        comments = tree.xpath('//*[@id="hot-comments"]/div')
                        if comments:
                            comments_list = []
                            for i in comments:
                                # Commenter
                                user = i.xpath('.//h3/span[2]/a')[0].text
                                # Rating
                                start_classes = i.xpath('.//h3/span[2]/span')
                                if len(start_classes) == 4:
                                    # class attribute of the rating span
                                    class_attributes = i.xpath('.//h3/span[2]/span[2]/@class')
                                    # Default rating when the class value is missing or is "comment-time-tip"
                                    start = 0
                                    # Check there is a class value and it is not "comment-time-tip"
                                    if class_attributes and class_attributes[0] != 'comment-time-tip':
                                        # Pull the numeric rating out of the class value
                                        match = re.search(r'\d+', class_attributes[0])
                                        if match:
                                            start = match.group()
                                        else:
                                            # No digits found: keep the default rating
                                            start = 0
                                    # Comment time
                                    time_pl = i.xpath('.//h3/span[2]/span[3]/@title')[0]
                                else:
                                    start = 0
                                    # Comment time
                                    time_pl = i.xpath('.//h3/span[2]/span[2]/@title')[0]
                                # Comment text
                                content = i.xpath('.//div/p/span')[0].text
                                comments_list.append({
                                    'user': user,
                                    'start': start,
                                    'time': time_pl,
                                    'content': content
                                })
                            resultData.append(json.dumps(comments_list))
                            # resultData.append(comments_list)
                        else:
                            resultData.append(0)
                        # # Image list
                        # imgList = tree.xpath('//*[@id="related-pic"]/ul//a/img/@src')
                        # if imgList:
                        #     resultData.append(','.join(imgList))
                        # else:
                        #     resultData.append(0)
                        # Image links: the trailer thumbnail first, then the stills
                        img_urls = []
                        # <a> tags carrying the related-pic-video class
                        a_tags = tree.xpath('//*[@id="related-pic"]/ul//a[@class="related-pic-video"]')
                        # Extract the first trailer thumbnail
                        found_video_image = False
                        for a_tag in a_tags:
                            style_attr = a_tag.get('style')
                            if style_attr:
                                start_index = style_attr.find('url(') + 4
                                end_index = style_attr.find(')', start_index)
                                if start_index != -1 and end_index != -1:
                                    img_url = style_attr[start_index:end_index]
                                    img_urls.insert(0, img_url)  # put the trailer thumbnail first
                                    found_video_image = True
                                    break  # stop after the first hit
                        # No trailer thumbnail found: add a placeholder
                        if not found_video_image:
                            img_urls.append('0')  # use the string '0' because the list is joined with ','
                        # Still images
                        imgList = tree.xpath('//*[@id="related-pic"]/ul//a/img/@src')
                        if imgList:
                            img_urls.extend(imgList)  # append the remaining image links
                        else:
                            img_urls.append('0')
                        # Join img_urls into a comma-separated string and store it
                        resultData.append(','.join(img_urls))
                        # Trailer link
                        # movieUrl = tree.xpath('//*[@id="related-pic"]/ul//a[@class="related-pic-video"]/@href')[0]
                        # print(movieUrl, type(movieUrl))
                        result = tree.xpath('//*[@id="related-pic"]/ul//a[@class="related-pic-video"]/@href')
                        if result:
                            movieUrl = result[0]
                            resultData.append(movieUrl)
                            # for i in range(1, 99):
                            #     # Fetch the actual video source
                            #     # Request the trailer page
                            #     response_sp = requests.get(url=movieUrl, headers=self.headers, proxies=success)
                            #     # Check whether the request succeeded
                            #     # Raw HTML returned by the trailer page
                            #     context_sp = response_sp.text
                            #     # Parse it with lxml's etree
                            #     tree_sp = etree.HTML(context_sp)
                            #     # Look for the video source link and check whether it was found
                            #     movie_sources = tree_sp.xpath('.//video/source/@src')
                            #     if movie_sources:
                            #         movie = movie_sources[0]  # first video source in the list
                            #         resultData.append(movie)
                            #         break
                            #
                            #     else:
                            #         print("Video source link not found yet, retrying...")
                        else:
                            resultData.append(0)
                            # print("No related trailer link found")
                        print(resultData)
                        if self.save_to_csv(resultData):
                            # Row saved: advance the stored page index
                            self.set_page(index)
                            print(f'Successfully crawled site #{index}')
                            print()
            except Exception as e:
                # Report the error and retry from the last saved index
                print(f"Error while crawling: {e}")
                retries += 1  # one more retry used
                if retries < max_retries:
                    print(f"Retrying ({retries}/{max_retries})...")
                    print()
                    # time.sleep(1.333)  # pause before retrying to avoid hammering the site
                # else:
                #     print("Maximum number of retries reached, exiting")
                #     raise  # re-raise or exit here if desired

    def save_to_csv(self, rowData):
        with open('./tempData.csv', 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(rowData)
        return True

    def save_to_sql(self, df):
        df.to_sql('movie', con=engine, index=False, if_exists='append')
        print('Data import finished~')
        # df.to_sql('movie', con=engine, index=False, if_exists='append', method='multi')
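        # Note: if_exists='append' keeps any existing `movie` table and adds the new
        # rows to it; the commented method='multi' variant batches the INSERT
        # statements, which can speed up large imports.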

    def clear_csv(self):
        df = pd.read_csv('./2024.csv')
        df.dropna(inplace=True)  # drop rows with missing values
        df.drop_duplicates(inplace=True)  # drop duplicate rows in place
        self.save_to_sql(df)

    def delete_csv_file(self):
        # Path of the temporary file; tempData.csv is assumed to live in the working directory
        file_path = 'tempData.csv'
        # Only delete the file if it exists
        if os.path.exists(file_path):
            try:
                os.remove(file_path)
                print(f"File {file_path} was deleted successfully.")
            except OSError as e:
                print(f"Error while deleting {file_path}: {e.strerror}")
        else:
            print(f"File {file_path} does not exist, nothing to delete.")

if __name__ == '__main__':
    spiderObj = spider()
    spiderObj.init()
    spiderObj.spiderMain()
    # spiderObj.clear_csv()
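    # clear_csv() reads ./2024.csv, so the rows gathered in tempData.csv are assumed
    # to have been consolidated into that file first; spiderObj.delete_csv_file() can
    # then remove the temporary tempData.csv once the import into `movie` is done.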