import csv
import json
import os
import re

import pandas as pd
import requests
from fake_useragent import UserAgent
from lxml import etree
from pymysql import connect
from sqlalchemy import create_engine

import 优化ip代理池 as Optimize

engine = create_engine('mysql+pymysql://root:1234@localhost:3306/dbmovie')


class spider(object):
    def __init__(self):
        self.name = '日本'
        # Read the Excel file to get the URLs to crawl
        file_path = f'豆瓣电影网站/{self.name}/{self.name}电影网站(整合后).xlsx'
        self.df = pd.read_excel(file_path, engine='openpyxl')
        # The URLs live in the '电影网站' column; convert to a list for easy iteration
        self.urls = self.df['电影网站'].tolist()
        # Request headers (see request_headers)
        # self.headers()
        self.proxies = [
            {'http': 'http://180.121.130.208:8089'},
            {'http': 'http://114.106.147.14:8089'},
            {'http': 'http://117.86.9.250:8089'},
            {'http': 'http://114.231.45.235:8089'},
            {'http': 'http://114.132.202.80:8080'},
            # HTTPS proxies can be added the same way:
            # {'https': 'https://example.com:port'},
        ]
        # Return a random proxy that passed the health check
        # self.success_proxies = Optimize.startup()

    def request_headers(self):
        # Generate a random browser-like User-Agent
        UserAgents = UserAgent().random
        # Build the request headers
        headers = {
            "User-Agent": f"{UserAgents}",
            # "Cookie": 'll="118313"; bid=QVNFc5M31Ds; _pk_id.100001.4cf6=b91d85f3dfe7a18f.1708781347.; _vwo_uuid_v2=D41D2EAD2A7C867B2EF7CAA05192E9D9B|0d7f976cadeba2dd51716ba2b90223b7; viewed="1866298_20396037"; __utmz=223695111.1711160671.9.2.utmcsr=search.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/movie/subject_search; push_noty_num=0; push_doumail_num=0; __utmz=30149280.1711182158.14.5.utmcsr=help.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=30149280.27921; douban-fav-remind=1; dbcl2="279216488:y4XIrGbz4fQ"; ck=rkAR; frodotk_db="2e17d8cc08294f6a8a478a64187bee3e"; ap_v=0,6.0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1712647865%2C%22https%3A%2F%2Fsearch.douban.com%2Fmovie%2Fsubject_search%3Fsearch_text%3D%E5%8A%A8%E6%BC%AB%26cat%3D1002%26start%3D60%22%5D; _pk_ses.100001.4cf6=1; __utma=30149280.1472987265.1708781347.1712582234.1712647866.102; __utmb=30149280.0.10.1712647866; __utmc=30149280; __utma=223695111.718378847.1708781347.1712582234.1712647866.100; __utmb=223695111.0.10.1712647866; __utmc=223695111'
        }
        return headers

    def init(self):
        # Create the temporary CSV with a header row if it does not exist yet
        if not os.path.exists('./tempData.csv'):
            with open('./tempData.csv', 'w', newline='') as writer_f:
                writer = csv.writer(writer_f)
                writer.writerow(
                    ['directors', 'rate', 'title', 'casts', 'cover', 'detailLink', 'year', 'types',
                     'country', 'lang', 'time', 'movieTime', 'comment_len', 'starts', 'summary',
                     'comments', 'imgList', 'movieUrl']
                )
                # Alternative Chinese header row, kept for reference:
                # writer.writerow(
                #     [
                #         '电影导演', '电影评分', '电影名字', '电影演员', '电影封面', '电影详情链接', '电影年份',
                #         '电影类型', '电影国家', '电影语言', '电影上映时间', '电影片长', '短评个数', '电影星级',
                #         '电影信息介绍', '电影短评(短评用户,短评评分,评论时间,评论内容)', '图片列表', '预告片链接'
                #     ]
                # )
        # Track crawl progress in a small text file
        if not os.path.exists('./spiderPage.txt'):
            with open('./spiderPage.txt', 'w', encoding="utf-8") as f:
                f.write('0\r')
        # Create the target table; ignore the error if it already exists
        try:
            conn = connect(host='localhost', user='root', password='1234', database='dbmovie',
                           port=3306, charset='utf8mb4')
            sql = '''
                create table movie (
                    id int primary key auto_increment,
                    directors VARCHAR(255),
                    rate VARCHAR(255),
                    title VARCHAR(255),
                    casts text,
                    cover VARCHAR(255),
                    detailLink VARCHAR(255),
                    year VARCHAR(255),
                    types VARCHAR(255),
                    country VARCHAR(255),
                    lang VARCHAR(255),
                    time VARCHAR(255),
                    movieTime VARCHAR(255),
                    comment_len VARCHAR(255),
                    starts VARCHAR(255),
                    summary text,
                    comments text,
                    imgList text,
                    movieUrl VARCHAR(255)
                )
            '''
            cursor = conn.cursor()
            cursor.execute(sql)
            conn.commit()
        except Exception:
            pass

    def get_page(self):
        try:
            with open('./spiderPage.txt', 'r') as r_f:
                return int(r_f.read().strip())
        except FileNotFoundError:
            return 0  # No progress file yet: start from index 0
    def set_page(self, newPage):
        with open('./spiderPage.txt', 'w') as a_f:
            a_f.write(str(newPage))

    def spiderMain(self):
        def xpathelement(xpath, value=None):
            # Helper (currently unused): return the text of the first match,
            # or the value of the attribute named by `value`
            text_xpath = tree.xpath(xpath)
            if not value:
                # No attribute requested: return the node text
                text = text_xpath[0].text
                # print(text)
                return text
            elif isinstance(value, str):
                # Treat `value` as an attribute name and fetch that attribute
                xpath = xpath + '/@{}'.format(value)
                # print(xpath)
                text_xpath = tree.xpath(xpath)[0]
                return text_xpath
            else:
                return text_xpath

        max_retries = 99999  # maximum number of retries
        retries = 0
        while retries < max_retries:
            start_page = self.get_page() + 1  # resume from the page after the last one crawled
            if start_page > len(self.urls):
                print("所有网站已爬取完毕!")
                return
            print(f'正在从第{start_page}个网站开始爬取')
            try:
                # Continue crawling from where we left off
                for index, url in enumerate(self.urls[start_page - 1:], start=start_page):
                    headers = self.request_headers()
                    success = Optimize.startup()  # proxy that passed the health check
                    # proxies = random.choice(self.proxies)  # or pick a random proxy from the static list
                    resultData = []
                    # url = 'https://movie.douban.com/subject/36868913/'
                    print(url)
                    # Optionally sleep between requests
                    # time.sleep(1)
                    # Send the request
                    response = requests.get(url=url, headers=headers, proxies=success)
                    # Raw HTML of the page
                    context = response.text
                    # Parse the HTML with lxml
                    tree = etree.HTML(context)
                    if response.status_code == 200:
                        # Successful response: extract the fields
                        # Directors
                        directors = tree.xpath('//*[@id="info"]//a[@rel="v:directedBy"]')
                        if directors:
                            directors_list = []
                            for i in directors:
                                directors_list.append(i.text)
                            resultData.append(','.join(directors_list))
                        else:
                            resultData.append(0)
                        # Rating
                        rate = tree.xpath('//*[@id="interest_sectl"]/div/div[2]/strong')
                        if rate and rate[0].text:
                            rate_txt = rate[0].text.strip()  # strip possible whitespace
                            resultData.append(rate_txt)
                        else:
                            resultData.append(0)
                        # Title  //*[@id="content"]/h1/span[1]
                        title = tree.xpath('//*[@id="content"]/h1//span[@property="v:itemreviewed"]')
                        title_txt = title[0].text
                        print(title_txt)
                        resultData.append(title_txt)
                        # Cast: actor links inside #info  //*[@id="info"]/span[2]/span[2]
                        actors_links = tree.xpath('//*[@id="info"]//a[@rel="v:starring"]')
                        # print(actors_links)
                        if actors_links:
                            actors = []
                            for link in actors_links:
                                text = link.text
                                # print(f"Link element: {link}")
                                # print(f"Text content: {text}")
                                if text is not None:
                                    # Trim whitespace and skip empty names
                                    trimmed_text = text.strip()
                                    if trimmed_text:
                                        actors.append(trimmed_text)
                                else:
                                    # Skip links without text
                                    continue
                            resultData.append(','.join(actors))
                        else:
                            resultData.append(0)
                        # Cover image
                        # cover_img_src = tree.xpath('//*[@id="mainpic"]/a/img/@src')[0]
                        cover_img_src = tree.xpath('//*[@id="mainpic"]/a/img/@src')
                        if cover_img_src:
                            resultData.append(cover_img_src[0])
                        else:
                            resultData.append(0)
                        # Detail page link
                        detailLink = url
                        resultData.append(detailLink)
                        # Year
                        # year = tree.xpath('//*[@id="content"]/h1/span[2]/text()')
                        # print(year)
                        year = tree.xpath('//*[@id="content"]/h1/span[2]')
                        if year:
                            # Keep only the digits, dropping the surrounding parentheses
                            year_without_parentheses = re.search(r'\d+', year[0].text).group()
                            # year_without_parentheses = re.sub(r'\(|\)', '', year)
                            resultData.append(year_without_parentheses)
                        else:
                            resultData.append(0)
                        # Genres
                        # types = tree.xpath('//span[contains(@property,"v:genre")]')
                        # types = tree.xpath('//*[@id="info"]/span[contains(@property,"v:genre")]')
                        types = tree.xpath('//*[@id="info"]/span[@property="v:genre"]')
                        if types:
                            types_list = []
                            for i in types:
                                types_list.append(i.text)
                            resultData.append(','.join(types_list))
                        else:
                            resultData.append(0)
                        # Country and language come from the plain text nodes of #info
                        textInfo = tree.xpath('//*[@id="info"]/text()')
                        texts = []
                        if textInfo:
                            for i in textInfo:
                                if i.strip() and i.strip() != '/':
                                    texts.append(i)
                            # print(texts)  # inspect the cleaned text nodes
                            # texts[0] is expected to hold the country information
                            if len(texts) > 0:  # make sure texts is not empty
                                if texts[0]:
                                    country_info = texts[0].split('/')  # split the country string
                                    cleaned_countries = [i.strip() for i in country_info]  # trim each country name
                                    resultData.append(','.join(cleaned_countries))
                                else:
                                    resultData.append(0)
                            else:
                                resultData.append(0)
                        else:
                            resultData.append(0)
                        # Language
                        if textInfo:
                            if len(texts) > 1:  # make sure texts has at least two entries
                                if texts[1]:
                                    lang_info = texts[1].split(sep='/')  # split the language string
                                    cleaned_lang = [i.strip() for i in lang_info]  # trim each language name
                                    resultData.append(','.join(cleaned_lang))
                                else:
                                    resultData.append(0)
                            else:
                                resultData.append(0)
                        else:
                            resultData.append(0)
                        # Release date
                        time_movie = tree.xpath('//*[@id="info"]/span[@property = "v:initialReleaseDate"]')
                        if time_movie:
                            time_list = []
                            for i in time_movie:
                                time_list.append(i.text)
                            # time_one = time_list[0][:10]
                            # time_one = re.search(r'\d+', time_list[0]).group()
                            # Drop anything inside parentheses, e.g. the region suffix
                            time_str = re.sub(r'\(.*?\)', '', time_list[0])
                            # Match YYYY-MM-DD, YYYY-M-DD and YYYY-M-D style dates
                            match = re.search(r'\b\d{4}-(?:\d{2}|\d)-(?:\d{2}|\d)\b', time_str)
                            if match:
                                resultData.append(match.group())
                            else:
                                # No full date: fall back to a bare 4-digit year
                                match = re.search(r'\d{4}', time_str)
                                if match:
                                    resultData.append(match.group())
                                else:
                                    # Nothing usable found
                                    resultData.append(0)
                        else:
                            resultData.append(0)
                        # Runtime
                        movieTime = tree.xpath('//*[@id="info"]/span[@property="v:runtime"]/@content')
                        if movieTime:
                            resultData.append(movieTime[0])
                        else:
                            resultData.append(0)
                        # Number of short comments
                        comment_len = tree.xpath('//*[@id="comments-section"]/div[1]/h2/span/a')
                        if comment_len:
                            comment_len_txt = re.search(r'\d+', comment_len[0].text).group()
                            resultData.append(comment_len_txt)
                        else:
                            resultData.append(0)
                        # Star rating distribution  //*[@id="interest_sectl"]/div/div[3]/div[1]/span[2]
                        starts = tree.xpath('//*[@id="interest_sectl"]/div/div[3]/div[@class="item"]')
                        # starts = tree.xpath('//*[@id="interest_sectl"]//div[@class="ratings-on-weight"]/div[@class="item"]')
                        # starts = tree.xpath('//div[@id="interest_sectl"]//div[@class="ratings-on-weight"]/div[@class="item"]')
                        if starts:
                            starts_list = []
                            for i in starts:
                                # span_html = etree.tostring(i, pretty_print=True, encoding='unicode')
                                # print(span_html)  # inspect the raw item HTML
                                # The percentage lives in a span whose class starts with "rating_per"
                                # span_tag = i.xpath('.//span[@class="rating_per"]/text()')
                                # span_tag = i.xpath('.//span[@class="rating_per"]')[0].text
                                span_tag = i.xpath('.//span[2]')[0].text
                                starts_list.append(span_tag)
                            resultData.append(','.join(starts_list))
                        else:
                            resultData.append(0)
                        # Summary
                        summary = tree.xpath('//*[@id="link-report-intra"]/span/text()')
                        if summary:
                            summary_str = ''
                            for i in summary:
                                summary_str += i.strip()
                                # print(i.strip())
                            resultData.append(summary_str)
                        else:
                            resultData.append(0)
                        # Hot comments (user, rating, time, content)
                        comments = tree.xpath('//*[@id="hot-comments"]/div')
                        if comments:
                            comments_list = []
                            for i in comments:
                                # Commenter
                                user = i.xpath('.//h3/span[2]/a')[0].text
                                # Rating
                                start_classes = i.xpath('.//h3/span[2]/span')
                                if len(start_classes) == 4:
                                    # Class attribute of the second span, e.g. "allstar40 rating"
                                    class_attributes = i.xpath('.//h3/span[2]/span[2]/@class')
                                    # Make sure there is a class value and it is not "comment-time-tip"
                                    if class_attributes and class_attributes[0] != 'comment-time-tip':
                                        # Pull the numeric rating out of the class value
                                        match = re.search(r'\d+', class_attributes[0])
                                        if match:
                                            start = match.group()
                                        else:
                                            start = 0  # no digits found: fall back to 0
                                        time_pl = i.xpath('.//h3/span[2]/span[3]/@title')[0]
                                    else:
                                        # Fallback so start and time_pl are always defined
                                        start = 0
                                        time_pl = i.xpath('.//h3/span[2]/span[2]/@title')[0]
                                else:
                                    start = 0
                                    # Comment time
                                    time_pl = i.xpath('.//h3/span[2]/span[2]/@title')[0]
                                # Comment content
                                content = i.xpath('.//div/p/span')[0].text
                                comments_list.append({
                                    'user': user,
                                    'start': start,
                                    'time': time_pl,
                                    'content': content
                                })
                            resultData.append(json.dumps(comments_list))
                            # resultData.append(comments_list)
                        else:
                            resultData.append(0)
                        # # Image list (older version)
                        # imgList = tree.xpath('//*[@id="related-pic"]/ul//a/img/@src')
                        # if imgList:
                        #     resultData.append(','.join(imgList))
                        # else:
                        #     resultData.append(0)
                        # Image links, with the trailer thumbnail placed first
                        img_urls = []
                        # Anchors carrying the related-pic-video class
                        a_tags = tree.xpath('//*[@id="related-pic"]/ul//a[@class="related-pic-video"]')
                        # Extract the first video thumbnail from the inline style attribute
                        found_video_image = False
                        for a_tag in a_tags:
                            style_attr = a_tag.get('style')
                            if style_attr:
                                start_index = style_attr.find('url(')
                                if start_index != -1:
                                    start_index += 4
                                    end_index = style_attr.find(')', start_index)
                                    if end_index != -1:
                                        img_url = style_attr[start_index:end_index]
                                        img_urls.insert(0, img_url)  # put the video thumbnail first
                                        found_video_image = True
                                        break  # stop after the first hit
                        # No video thumbnail found: use the string '0' as a placeholder,
                        # since the list is later joined with ','
                        if not found_video_image:
                            img_urls.append('0')
                        # Regular still images
                        imgList = tree.xpath('//*[@id="related-pic"]/ul//a/img/@src')
                        if imgList:
                            img_urls.extend(imgList)
                        else:
                            img_urls.append('0')
                        # Join everything into one comma-separated string
                        resultData.append(','.join(img_urls))
                        # Trailer link
                        # movieUrl = tree.xpath('//*[@id="related-pic"]/ul//a[@class="related-pic-video"]/@href')[0]
                        # print(movieUrl, type(movieUrl))
                        result = tree.xpath('//*[@id="related-pic"]/ul//a[@class="related-pic-video"]/@href')
                        if result:
                            movieUrl = result[0]
                            resultData.append(movieUrl)
                            # for i in range(1, 99):
                            #     # Fetch the trailer page and look for the <video> source
                            #     response_sp = requests.get(url=movieUrl, headers=headers, proxies=success)
                            #     context_sp = response_sp.text
                            #     tree_sp = etree.HTML(context_sp)
                            #     movie_sources = tree_sp.xpath('.//video/source/@src')
                            #     if movie_sources:
                            #         movie = movie_sources[0]  # first video source
                            #         resultData.append(movie)
                            #         break
                            #     else:
                            #         print("有视频源链接,尝试获取中...")
                        else:
                            resultData.append(0)
                            # print("未找到相关视频预告片链接")
                        print(resultData)
                        if self.save_to_csv(resultData):
                            # Row written successfully: record progress
                            self.set_page(index)
                            print(f'成功爬取第{index}个网站')
                            print()
            except Exception as e:
                # Log the error and retry from the last saved index
                print(f"爬取时遇到错误: {e}")
                retries += 1
                if retries < max_retries:
                    print(f"正在重试({retries}/{max_retries})...")
                    print()
                    # time.sleep(1.333)  # back off a little between retries
                # else:
                #     print("已达到最大重试次数,程序退出")
                #     raise

    def save_to_csv(self, rowData):
        with open('./tempData.csv', 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(rowData)
        return True

    def save_to_sql(self, df):
        df.to_sql('movie', con=engine, index=False, if_exists='append')
        print('导入数据完成~')
        # df.to_sql('movie', con=engine, index=False, if_exists='append', method='multi')

    def clear_csv(self):
        df = pd.read_csv('./2024.csv')
        df.dropna(inplace=True)  # drop rows with missing values
        df.drop_duplicates(inplace=True)  # drop duplicate rows in place
        self.save_to_sql(df)

    def delete_csv_file(self):
        # tempData.csv is assumed to live in the current working directory
        file_path = 'tempData.csv'
        # Delete the file if it exists
        if os.path.exists(file_path):
            try:
                os.remove(file_path)
                print(f"文件 {file_path} 已成功删除。")
            except OSError as e:
                print(f"删除文件 {file_path} 时发生错误: {e.strerror}")
        else:
            print(f"文件 {file_path} 不存在,无法删除。")


if __name__ == '__main__':
    spiderObj = spider()
    spiderObj.init()
    spiderObj.spiderMain()
    # spiderObj.clear_csv()