|
|
from selenium import webdriver
|
|
|
from bs4 import BeautifulSoup
|
|
|
from selenium.webdriver.common.by import By
|
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
|
import time
|
|
|
import pandas as pd
|
|
|
import os
|
|
|
import 删除重复项 as remove
|
|
|
|
|
|
from selenium.webdriver.edge.options import Options
|
|
|
|
|
|
# Douban login page; its `redir` query parameter points at the movie explore page.
login_url = "https://accounts.douban.com/passport/login?redir=https%3A%2F%2Fmovie.douban.com%2Fexplore"

# Create the Edge browser instance and open the login page in it.
driver = webdriver.Edge()
driver.get(login_url)
|
|
|
|
|
|
|
|
|
def wait_and_interact_with_element(driver, xpath, value=None, timeout=45):
    """Wait until the element at *xpath* is clickable, then click it or type into it.

    Args:
        driver: Selenium WebDriver used to locate the element.
        xpath: XPath expression identifying the target element.
        value: Text to send to the element. When ``None`` (the default) the
            element is clicked instead.
        timeout: Maximum number of seconds to wait for the element to become
            clickable. Defaults to 45, the value that used to be hard-coded.

    Returns:
        The element's stripped text (read before the click) when the element
        is clicked; ``None`` when text is sent to it.

    Raises:
        selenium.common.exceptions.TimeoutException: If the element does not
            become clickable within *timeout* seconds.
    """
    element = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.XPATH, xpath))
    )

    # Compare against None explicitly so that a falsy-but-provided value
    # (e.g. an empty string) is still typed rather than triggering a click.
    if value is None:
        # Read the text first: clicking may change or remove the element.
        text = element.text.strip()
        element.click()
        return text

    element.send_keys(value)
    return None
|
|
|
|
|
|
|
|
|
# Switch to the account/password login tab.
wait_and_interact_with_element(driver, '//*[@id="account"]/div[2]/div[2]/div/div[1]/ul[1]/li[2]')

# Credentials can be supplied through the DOUBAN_USERNAME / DOUBAN_PASSWORD
# environment variables so they do not have to live in the source file.
# NOTE(review): the fallback literals below are real-looking credentials
# committed to source; rotate them and remove the defaults once every caller
# sets the environment variables.
# `or` (rather than a plain default) also covers variables set to an empty
# string, which the helper would otherwise treat as "click, do not type".
douban_username = os.environ.get('DOUBAN_USERNAME') or '18877228660'
douban_password = os.environ.get('DOUBAN_PASSWORD') or 'qq1453641651'

# Type the user name into the username field.
wait_and_interact_with_element(driver, '//*[@id="username"]', douban_username)

# Type the password into the password field.
wait_and_interact_with_element(driver, '//*[@id="password"]', douban_password)

# Click the login button to submit the form.
wait_and_interact_with_element(driver, '//*[@id="account"]/div[2]/div[2]/div/div[2]/div[1]/div[4]/a')

# Fixed pause after submitting; presumably gives the login redirect (and any
# manual verification step) time to finish — TODO confirm.
time.sleep(10)
|
|
|
|
|
|
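# Assume the login has succeeded at this point.

# # Close the current headed browser instance
# driver.quit()
#
# # Start a new headless Edge browser instance
# # Create an options object for the Edge browser
# options = Options()
# # Add the argument that enables headless mode
# options.add_argument("--headless")
# # Add the argument that disables the GPU, reducing resource usage and avoiding graphics-related problems
# # (especially useful on servers or other environments without a graphical interface)
# options.add_argument("--disable-gpu")  # disable the GPU for better compatibility and lower resource usage
# # Launch the Edge browser instance with the configured options
# driver = webdriver.Edge(options=options)
#
# # Continue your work with the new headless browser instance...
# # For example:
# driver.get("https://movie.douban.com/explore")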
|
|
|
|
|
|
def _load_all_results(driver):
    """Click the "load more" button repeatedly until no further items load.

    The loop ends as soon as clicking the button or waiting for new list
    items raises (for example a wait timeout), or when the number of
    ``.explore-list li`` elements stops growing.
    """
    clicks = 0
    # NOTE(review): starting at 0 means the first wait below succeeds as soon
    # as the list contains any item at all, not necessarily a newly loaded one.
    previous_li_count = 0
    while True:
        try:
            # Wait for the "load more" button to become clickable and click it.
            wait_and_interact_with_element(driver, '/html/body/div[3]/div[1]/div/div[1]/div/div/div[2]/div/button')

            # Wait until the list holds more items than after the previous click.
            WebDriverWait(driver, 30).until(
                lambda d: len(d.find_elements(By.CSS_SELECTOR, '.explore-list li')) > previous_li_count
            )

            current_li_count = len(driver.find_elements(By.CSS_SELECTOR, '.explore-list li'))
            if current_li_count <= previous_li_count:
                break  # nothing new was loaded
            previous_li_count = current_li_count

            clicks += 1
            print(clicks)
            time.sleep(1)
        except Exception as e:
            # Best effort: any failure here is treated as "reached the bottom
            # or timed out", and scraping continues with what has been loaded.
            print(f"{e}已加载到底部或等待超时")
            break


def _collect_rated_links(html):
    """Return the first-link href of every rated item in the explore list.

    Items whose rating text is "暂无评分" (no rating yet) are skipped, as are
    items without a ``span.drc-rating-num`` tag (a message is printed for
    those) and items that contain no ``<a>`` tag at all.
    """
    soup = BeautifulSoup(html, 'lxml')
    links = []
    for li in soup.select('.explore-list li'):
        pf_tag = li.find('span', class_='drc-rating-num')
        if pf_tag is None:
            print("当前li元素内没有.drc-rating-num标签")
            continue
        if pf_tag.text.strip() == "暂无评分":
            continue
        anchor = li.a
        if anchor is None:
            # Previously this raised AttributeError and aborted the whole run.
            continue
        links.append(anchor.get('href'))
    return links


def _save_links(links, region, year):
    """Write *links* to 豆瓣电影网站/<region>/<region>-<year>电影网站.xlsx."""
    df = pd.DataFrame(links, columns=['电影网站'])
    file_path = f'豆瓣电影网站/{region}/{region}-{year}电影网站.xlsx'
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    df.to_excel(file_path, index=False)
    print(f'数据已保存为Excel文件路径为:{file_path}')


# j indexes the entries of the region dropdown.
# Original note "4,24": presumably the full range of region indexes — TODO confirm.
for j in range(7, 24):
    time.sleep(1)

    # i indexes the entries of the year dropdown.
    # Original note "2,16": presumably the full range of year indexes — TODO confirm.
    for i in range(2, 16):
        # Open the region dropdown.
        wait_and_interact_with_element(driver,
                                       '/html/body/div[3]/div[1]/div/div[1]/div/div/div[1]/div/div[1]/div[2]/div/div[1]')
        time.sleep(1)

        # Select the j-th region; the helper returns the option's label text.
        region = wait_and_interact_with_element(driver,
                                                f'//*[@id="app"]/div/div[1]/div/div[1]/div[2]/div/div[2]/div/div/ul/li[{j}]/span')
        time.sleep(1)

        # Open the year dropdown.
        wait_and_interact_with_element(driver,
                                       '/html/body/div[3]/div[1]/div/div[1]/div/div/div[1]/div/div[1]/div[3]/div/div')
        time.sleep(1)

        # Select the i-th year option; the helper returns the option's label text.
        year = wait_and_interact_with_element(driver,
                                              f'/html/body/div[3]/div[1]/div/div[1]/div/div/div[1]/div/div[1]/div[3]/div/div[2]/div/div/ul/li[{i}]/span')
        time.sleep(1)

        print(f"正在加载{region}-{year}的电影...")

        # Expand the result list as far as it will go, then scrape the page.
        _load_all_results(driver)
        links = _collect_rated_links(driver.page_source)
        print(f"{len(links)}次处理li元素,有评分")

        _save_links(links, region, year)
        time.sleep(1)

    print(f'{region}电影网站获取全部完成文件路径为:豆瓣电影网站/{region}')

    # Deduplicate the links collected for this region; presumably merges the
    # per-year files in the region directory — verify against 删除重复项.
    remove.deduplication(f'豆瓣电影网站/{region}', region)

# Keep the browser instance open (effectively indefinitely) once scraping ends.
time.sleep(99999999)

# Close the browser instance.
driver.quit()
|