You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

188 lines
7.1 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
import os
import 删除重复项 as remove
from selenium.webdriver.edge.options import Options
# Create the Edge browser instance
driver = webdriver.Edge()
# URL of the Douban login page; its `redir` query parameter points at movie.douban.com/explore
login_url = "https://accounts.douban.com/passport/login?redir=https%3A%2F%2Fmovie.douban.com%2Fexplore"
# Open the login page
driver.get(login_url)
def wait_and_interact_with_element(driver, xpath, value=None, timeout=45):
    """Wait until the element at *xpath* is clickable, then click it or type into it.

    Args:
        driver: Selenium WebDriver used to locate the element.
        xpath: XPath expression identifying the target element.
        value: Text to send to the element. When ``None`` (the default) the
            element is clicked instead of typed into.
        timeout: Maximum number of seconds to wait for the element to become
            clickable. Defaults to 45.

    Returns:
        The element's stripped text (captured before the click) when the
        element was clicked; ``None`` when *value* was sent to it.

    Raises:
        Whatever ``WebDriverWait.until`` raises when the element does not
        become clickable within *timeout* seconds.
    """
    wait = WebDriverWait(driver, timeout)
    element = wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))
    # Compare against None explicitly so that a falsy-but-provided value
    # (e.g. an empty string) is still typed rather than triggering a click.
    if value is None:
        # Read the text first: the click may change or remove the element.
        text = element.text.strip()
        element.click()
        return text
    element.send_keys(value)
    return None
# NOTE(review): the account name and password are embedded in source as
# fallback defaults. Set DOUBAN_USERNAME / DOUBAN_PASSWORD in the environment
# to override them, and remove the literals once the environment is configured.
# `or` (rather than a .get default) also treats an empty variable as unset.
douban_username = os.environ.get('DOUBAN_USERNAME') or '18877228660'
douban_password = os.environ.get('DOUBAN_PASSWORD') or 'qq1453641651'
# Switch to the account/password login tab
wait_and_interact_with_element(driver, '//*[@id="account"]/div[2]/div[2]/div/div[1]/ul[1]/li[2]')
# Locate the username field and type the account name
wait_and_interact_with_element(driver, '//*[@id="username"]', douban_username)
# Locate the password field and type the password
wait_and_interact_with_element(driver, '//*[@id="password"]', douban_password)
# Locate and click the login (submit) button
wait_and_interact_with_element(driver, '//*[@id="account"]/div[2]/div[2]/div/div[2]/div[1]/div[4]/a')
# Fixed pause after submitting, giving the login time to finish before scraping starts
time.sleep(10)
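# Assume the login has succeeded by this point
# # Close the current headed browser instance
# driver.quit()
#
# # Start a new headless Edge browser instance
# # Create an options object for the Edge browser
# options = Options()
# # Add the argument that enables headless mode
# options.add_argument("--headless")
# # Add an argument that disables the GPU, reducing resource usage and avoiding graphics-related issues
# # Especially useful on servers or in environments without a graphical interface
# options.add_argument("--disable-gpu") # Disable the GPU to improve compatibility and reduce resource usage
# # Launch the Edge browser instance with the configured options
# driver = webdriver.Edge(options=options)
#
# # Continue your operations with the new headless browser instance...
# # For example:
# driver.get("https://movie.douban.com/explore")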
# Outer loop: one pass per region entry (index j in the region dropdown).
# NOTE(review): the original note here read "4,24" - presumably the full
# region index range, with this run starting at 7; confirm before changing.
for j in range(7, 24):
    time.sleep(1)
    # Inner loop: one pass per decade entry (index i in the decade dropdown).
    # NOTE(review): the original note here read "2,16" - presumably the full
    # decade index range; confirm.
    for i in range(2, 16):
        # Open the region dropdown
        wait_and_interact_with_element(driver,
                                       '/html/body/div[3]/div[1]/div/div[1]/div/div/div[1]/div/div[1]/div[2]/div/div[1]')
        time.sleep(1)
        # Click the j-th region entry; its label is reused for file names
        region = wait_and_interact_with_element(driver,
                                                f'//*[@id="app"]/div/div[1]/div/div[1]/div[2]/div/div[2]/div/div/ul/li[{j}]/span')
        time.sleep(1)
        # Open the decade dropdown
        wait_and_interact_with_element(driver,
                                       '/html/body/div[3]/div[1]/div/div[1]/div/div/div[1]/div/div[1]/div[3]/div/div')
        time.sleep(1)
        # Click the i-th decade entry; its label is reused for file names
        year = wait_and_interact_with_element(driver,
                                              f'/html/body/div[3]/div[1]/div/div[1]/div/div/div[1]/div/div[1]/div[3]/div/div[2]/div/div/ul/li[{i}]/span')
        time.sleep(1)
        print(f"正在加载{region}-{year}的电影...")

        # Keep clicking "load more" until no further items appear.
        n = 0
        # Start from the number of items already on the page; starting from 0
        # would let the first "new content" wait succeed before anything loaded.
        previous_li_count = len(driver.find_elements(By.CSS_SELECTOR, '.explore-list li'))
        while True:
            try:
                # Wait for the "load more" button to become clickable and click it
                wait_and_interact_with_element(driver, '/html/body/div[3]/div[1]/div/div[1]/div/div/div[2]/div/button')
                # Wait until the list holds more items than before the click
                WebDriverWait(driver, 30).until(
                    lambda d: len(d.find_elements(By.CSS_SELECTOR, '.explore-list li')) > previous_li_count
                )
                current_li_count = len(driver.find_elements(By.CSS_SELECTOR, '.explore-list li'))
                # Defensive guard: stop if the list did not actually grow
                if current_li_count <= previous_li_count:
                    break
                previous_li_count = current_li_count
                n += 1
                print(n)
                time.sleep(1)
            except Exception as e:
                # Best effort: any failure here (typically a wait timeout once the
                # button is gone) is treated as "reached the bottom of the list".
                print(f"{e}已加载到底部或等待超时")
                break

        # Parse the fully expanded page
        html = driver.page_source
        soup = BeautifulSoup(html, 'lxml')
        # Links of all movies that have a rating
        all_rows_data = []
        # Every list item under .explore-list
        li_tags = soup.select('.explore-list li')
        index = 0
        for li in li_tags:
            # Rating element inside the current list item
            pf_tag = li.find('span', class_='drc-rating-num')
            if not pf_tag:
                print("当前li元素内没有.drc-rating-num标签")
                continue
            rating_text = pf_tag.text.strip()
            # Skip movies whose rating text is the "no rating yet" placeholder
            if rating_text == "暂无评分":
                continue
            # Skip items without an anchor instead of failing on li.a being None
            if li.a is None:
                continue
            index += 1
            link_href = li.a.get('href')
            all_rows_data.append(link_href)
        print(f"{index}次处理li元素有评分")

        # Save the collected links for this region/decade as an Excel file
        df = pd.DataFrame(all_rows_data, columns=['电影网站'])
        file_path = f'豆瓣电影网站/{region}/{region}-{year}电影网站.xlsx'
        directory = os.path.dirname(file_path)
        # Create the target directory if needed (no error when it already exists)
        os.makedirs(directory, exist_ok=True)
        df.to_excel(file_path, index=False)
        print(f'数据已保存为Excel文件路径为{file_path}')
        time.sleep(1)
    print(f'{region}电影网站获取全部完成文件路径为:豆瓣电影网站/{region}')
    # De-duplicate the links collected for this region
    remove.deduplication(f'豆瓣电影网站/{region}', region)
# Keep the browser instance open after scraping finishes
try:
    time.sleep(99999999)
finally:
    # Close the browser instance even when the wait above is interrupted
    # (e.g. Ctrl+C); otherwise the browser/driver process would be left running.
    driver.quit()