You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

150 lines
6.2 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from datetime import datetime
import random
def get_movie_data(page):
    """Fetch and parse one page of the Douban Top-250 movie list.

    Args:
        page: Offset for the ``start`` query parameter (0, 25, 50, ...).

    Returns:
        A list of dicts with keys such as '中文标题', '英文标题', '年份',
        '评分', '评价人数', '导演', '主演'. Returns an empty list when the
        request fails or a CAPTCHA page is detected.
    """
    # Rotate User-Agents to reduce the chance of anti-bot detection.
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'
    ]
    url = f"https://movie.douban.com/top250?start={page}"
    headers = {
        'User-Agent': random.choice(user_agents),
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Connection': 'keep-alive',
        'Referer': 'https://movie.douban.com/top250',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1'
    }
    try:
        print(f"\n正在请求页面: {url}")
        response = requests.get(url, headers=headers, timeout=10)
        print(f"响应状态码: {response.status_code}")
        response.raise_for_status()
        # A CAPTCHA page means we are being rate-limited: back off and
        # give up on this page so the caller can retry later.
        if '验证码' in response.text:
            print(f"检测到验证码等待10秒后重试...")
            time.sleep(10)
            return []
        soup = BeautifulSoup(response.text, 'html.parser')
        print(f"页面标题: {soup.title.string if soup.title else '无标题'}")
        movies = []
        # Dump the start of the page to help debug markup changes.
        print("\n页面结构:")
        print(soup.prettify()[:1000])  # Print first 1000 characters
        for item in soup.find_all('div', class_='item'):
            print("\n找到一个电影项:")
            movie_data = {}
            # Chinese title.
            title_elem = item.find('span', class_='title')
            if title_elem:
                print(f"标题: {title_elem.text}")
                movie_data['中文标题'] = title_elem.text.strip()
            # Alternate / foreign title, if present.
            eng_title_elem = item.find('span', class_='other')
            if eng_title_elem:
                print(f"英文标题: {eng_title_elem.text}")
                movie_data['英文标题'] = eng_title_elem.text.strip().replace('/', '').strip()
            # Year lives in the first <p> of div.bd; guard the lookup so a
            # missing div.bd cannot raise AttributeError and abort the page.
            bd_div = item.find('div', class_='bd')
            info_elem = bd_div.find('p') if bd_div else None
            if info_elem:
                year_span = info_elem.find('span', class_='year')
                if year_span:
                    print(f"年份: {year_span.text}")
                    movie_data['年份'] = year_span.text.strip('()')
            # Rating: skip just this field on a malformed value instead of
            # letting ValueError abort the whole page parse.
            rating_elem = item.find('span', class_='rating_num')
            if rating_elem:
                print(f"评分: {rating_elem.text}")
                try:
                    movie_data['评分'] = float(rating_elem.text)
                except ValueError:
                    print(f"警告: 无法转换评分: {rating_elem.text}")
            # Vote count: the 4th <span> inside div.star holds "N人评价".
            star_div = item.find('div', class_='star')
            if star_div:
                votes_elem = star_div.find_all('span')
                if votes_elem and len(votes_elem) >= 4:
                    votes_text = votes_elem[3].text.strip()
                    if '人评价' in votes_text:
                        print(f"评价人数: {votes_text}")
                        try:
                            movie_data['评价人数'] = int(votes_text.replace('人评价', '').strip())
                        except ValueError:
                            print(f"警告: 无法转换评价人数: {votes_text}")
                            movie_data['评价人数'] = None
            # Director / cast come from the same <p> as the year. Check for
            # the full '导演: ' / '主演: ' markers (colon + space) so the
            # splits below cannot IndexError when the text contains the
            # word '导演' without the exact separator.
            if info_elem:
                info_text = info_elem.text.strip()
                if '导演: ' in info_text and '主演: ' in info_text:
                    director = info_text.split('导演: ')[1].split('主演: ')[0].strip()
                    actors = info_text.split('主演: ')[1].strip()
                    print(f"导演: {director}")
                    print(f"主演: {actors}")
                    movie_data['导演'] = director
                    movie_data['主演'] = actors
            if movie_data:  # Only add if we got some data
                print("\n收集到的电影数据:")
                print(movie_data)
                movies.append(movie_data)
        print(f"\n本页收集到的电影数量: {len(movies)}")
        return movies
    except Exception as e:
        # Boundary for this page: log and return an empty result so the
        # caller can continue with the remaining pages.
        print(f"获取电影数据时出错: {str(e)}")
        return []
def main():
    """Scrape all pages of the Douban Top 250 and save the result to CSV."""
    all_movies = []
    try:
        # Each page carries 25 movies, so the offsets are 0, 25, ..., 225.
        for start in range(0, 250, 25):
            print(f"\n正在抓取第 {start//25 + 1} 页...")
            page_movies = get_movie_data(start)
            all_movies.extend(page_movies)
            # Short randomized pause between pages.
            time.sleep(random.uniform(1, 2))
            print(f"已收集电影数量: {len(all_movies)}")
            # An empty page usually means anti-bot throttling — back off longer.
            if not page_movies:
                print("等待更长时间以避免检测...")
                time.sleep(random.uniform(5, 8))
        if not all_movies:
            print("\n没有收集到任何电影数据。请检查错误信息。")
        else:
            # Persist everything in one timestamped, Excel-friendly CSV.
            df = pd.DataFrame(all_movies)
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            df.to_csv(f'豆瓣电影_{timestamp}.csv', index=False, encoding='utf-8-sig')
            print(f"\n成功保存 {len(all_movies)} 部电影到 CSV 文件!")
    except Exception as e:
        print(f"主程序出错: {str(e)}")


if __name__ == "__main__":
    main()