# lin/豆瓣电影爬虫/豆瓣登陆.py

from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
import os
import 删除重复项 as remove

from selenium.webdriver.edge.options import Options

# Start an Edge browser session; no options are passed, so Selenium's
# defaults apply. Every later step in this script drives this instance.
driver = webdriver.Edge()

# Douban account login page. The `redir` query parameter carries the
# URL-encoded address of the movie "explore" page.
login_url = (
    "https://accounts.douban.com/passport/login"
    "?redir=https%3A%2F%2Fmovie.douban.com%2Fexplore"
)

# Navigate the browser to the login page.
driver.get(login_url)


def wait_and_interact_with_element(driver, xpath, value=None, timeout=45):
    """Wait for an element to become clickable, then click it or type into it.

    Args:
        driver: Selenium WebDriver used to locate the element.
        xpath: XPath expression identifying the target element.
        value: Text to type into the element. When omitted (or otherwise
            falsy, e.g. ``""``), the element is clicked instead.
        timeout: Maximum number of seconds to wait for the element to become
            clickable. Defaults to 45, the value that used to be hard-coded.

    Returns:
        The element's stripped text (read before the click) when the element
        was clicked; ``None`` when ``value`` was typed into it.

    Raises:
        selenium.common.exceptions.TimeoutException: Raised by
            ``WebDriverWait.until`` if the element does not become clickable
            within ``timeout`` seconds.
    """
    element = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.XPATH, xpath))
    )

    if value:
        # A value was supplied: treat the element as an input and type into it.
        element.send_keys(value)
        return None

    # No value: capture the element's text first, then click it. The text is
    # read before the click, matching the original order of operations.
    text = element.text.strip()
    element.click()
    return text


# Credentials are taken from the environment (DOUBAN_USERNAME /
# DOUBAN_PASSWORD) with an interactive prompt as fallback, instead of being
# hard-coded in the source file.
# NOTE(review): the account password that used to be committed on these lines
# should be treated as compromised and rotated.
import getpass

douban_username = os.environ.get('DOUBAN_USERNAME') or input('请输入豆瓣账号: ')
douban_password = os.environ.get('DOUBAN_PASSWORD') or getpass.getpass('请输入豆瓣密码: ')

# wait_and_interact_with_element() clicks instead of typing when the value is
# empty, so an empty credential would silently break the login flow. Stop
# early and close the browser that was already opened.
if not douban_username or not douban_password:
    driver.quit()
    raise SystemExit('未提供豆瓣账号或密码，无法登录')

# Switch the login form to the account/password tab.
wait_and_interact_with_element(driver, '//*[@id="account"]/div[2]/div[2]/div/div[1]/ul[1]/li[2]')
# Type the user name into the username field.
wait_and_interact_with_element(driver, '//*[@id="username"]', douban_username)
# Type the password into the password field.
wait_and_interact_with_element(driver, '//*[@id="password"]', douban_password)
# Click the login button to submit the form.
wait_and_interact_with_element(driver, '//*[@id="account"]/div[2]/div[2]/div/div[2]/div[1]/div[4]/a')
# Fixed pause after submitting; the script does not verify the login result
# and simply assumes it has completed once this delay elapses.
time.sleep(10)

# 假设此时已经登录成功

# # 关闭当前有头模式的浏览器实例
# driver.quit()
#
# # 启动一个新的无头Edge浏览器实例
# # 创建一个Edge浏览器的配置选项对象
# options = Options()
# # 添加参数以启动无头模式
# options.add_argument("--headless")
# # 添加参数以禁用GPU，减少资源消耗并避免图形相关问题
# # 在服务器或无图形界面的环境中特别有用
# options.add_argument("--disable-gpu")  # 禁用GPU以提高兼容性和减少资源使用
# # 使用配置好的选项启动Edge浏览器实例
# driver = webdriver.Edge(options=options)
#
# # 使用新的无头浏览器实例继续你的操作...
# # 例如：
# driver.get("https://movie.douban.com/explore")

def _extract_rated_links(html):
    """Return the links of all rated entries found in the explore list.

    Parses ``html`` with BeautifulSoup and walks every ``.explore-list li``
    item. An item is kept only when it contains a ``span.drc-rating-num``
    whose text is not "暂无评分" and it has an anchor with an ``href``.

    Args:
        html: Page source of the explore page as a string.

    Returns:
        A list of ``href`` strings, in document order.
    """
    soup = BeautifulSoup(html, 'lxml')
    links = []
    for li in soup.select('.explore-list li'):
        rating_tag = li.find('span', class_='drc-rating-num')
        if rating_tag is None:
            print("当前li元素内没有.drc-rating-num标签")
            continue
        # Entries whose rating text says "no rating yet" are skipped.
        if rating_tag.text.strip() == "暂无评分":
            continue
        # Guard against list items without an anchor (or without an href);
        # dereferencing li.a directly would raise AttributeError on those.
        anchor = li.a
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue
        links.append(href)
    return links


# NOTE(review): the original notes "4,24" and "2,16" next to these loops
# presumably record the full index ranges of the region and era option lists;
# confirm whether starting the region loop at 7 is intentional (e.g. resuming
# a partially completed run).
for j in range(7, 24):

    time.sleep(1)
    for i in range(2, 16):

        # Open the region dropdown.
        wait_and_interact_with_element(driver,
                                       '/html/body/div[3]/div[1]/div/div[1]/div/div/div[1]/div/div[1]/div[2]/div/div[1]')

        time.sleep(1)
        # Click the j-th region option; its label text is returned.
        region = wait_and_interact_with_element(driver,
                                                f'//*[@id="app"]/div/div[1]/div/div[1]/div[2]/div/div[2]/div/div/ul/li[{j}]/span')

        time.sleep(1)

        # Open the era (years) dropdown.
        wait_and_interact_with_element(driver,
                                       '/html/body/div[3]/div[1]/div/div[1]/div/div/div[1]/div/div[1]/div[3]/div/div')
        time.sleep(1)

        # Click the i-th era option; its label text is returned.
        year = wait_and_interact_with_element(driver,
                                              f'/html/body/div[3]/div[1]/div/div[1]/div/div/div[1]/div/div[1]/div[3]/div/div[2]/div/div/ul/li[{i}]/span')
        time.sleep(1)

        print(f"正在加载{region}-{year}的电影...")

        # Keep clicking the "load more" button until it can no longer be
        # clicked or no additional list items appear.
        n = 0
        previous_li_count = 0
        while True:
            try:
                # Wait for the "load more" button to be clickable and click it.
                wait_and_interact_with_element(driver, '/html/body/div[3]/div[1]/div/div[1]/div/div/div[2]/div/button')

                # Wait until the list holds more items than before the click.
                WebDriverWait(driver, 30).until(
                    lambda d: len(d.find_elements(By.CSS_SELECTOR, '.explore-list li')) > previous_li_count
                )

                # Re-count the items; stop if the list did not actually grow.
                current_li_count = len(driver.find_elements(By.CSS_SELECTOR, '.explore-list li'))
                if current_li_count <= previous_li_count:
                    break
                previous_li_count = current_li_count

                n += 1
                print(n)
                time.sleep(1)

            except Exception as e:
                # Deliberate best-effort: any failure here (typically a wait
                # timeout once the button is gone) is reported and treated as
                # the end of the list.
                print(f"{e}已加载到底部或等待超时")
                break

        # Parse the fully expanded page and collect the links of rated movies.
        all_rows_data = _extract_rated_links(driver.page_source)
        print(f"{len(all_rows_data)}次处理li元素，有评分")

        # One single-column sheet per (region, era) combination.
        df = pd.DataFrame(all_rows_data, columns=['电影网站'])

        file_path = f'豆瓣电影网站/{region}/{region}-{year}电影网站.xlsx'
        # exist_ok=True replaces the separate existence check, which was racy.
        os.makedirs(os.path.dirname(file_path), exist_ok=True)

        df.to_excel(file_path, index=False)
        print(f'数据已保存为Excel文件路径为：{file_path}')
        time.sleep(1)

    print(f'{region}电影网站获取全部完成文件路径为：豆瓣电影网站/{region}')
    # De-duplicate the links collected for this region across all era files.
    remove.deduplication(f'豆瓣电影网站/{region}', region)

# Keep the browser window open after scraping has finished by sleeping for a
# very long time. The try/finally guarantees the driver is shut down even when
# the sleep is interrupted (e.g. Ctrl+C), which previously skipped quit().
try:
    time.sleep(99999999)
finally:
    # Close the browser and end the WebDriver session.
    driver.quit()