|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
import pandas as pd
|
|
|
|
|
import time
|
|
|
|
|
from datetime import datetime
|
|
|
|
|
import random
|
|
|
|
|
|
|
|
|
|
def get_movie_data(page):
    """Fetch and parse one page of the Douban Top 250 movie list.

    Args:
        page: Zero-based result offset (0, 25, 50, ...), passed straight
            to the site's ``start`` query parameter.

    Returns:
        list[dict]: One dict per movie with Chinese-keyed fields
        (title, English title, year, rating, vote count, director,
        actors). Returns an empty list on any request/parse failure or
        when a captcha page is detected.
    """
    # Rotate through several desktop User-Agents to reduce the chance
    # of the scraper being detected and blocked.
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'
    ]

    url = f"https://movie.douban.com/top250?start={page}"
    # Browser-like headers (language, referer, cache directives) so the
    # request blends in with normal traffic.
    headers = {
        'User-Agent': random.choice(user_agents),
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Connection': 'keep-alive',
        'Referer': 'https://movie.douban.com/top250',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1'
    }

    try:
        print(f"\n正在请求页面: {url}")
        response = requests.get(url, headers=headers, timeout=10)
        print(f"响应状态码: {response.status_code}")
        response.raise_for_status()

        # A captcha page means we were rate-limited; back off and give
        # up on this page so the caller can decide to retry.
        if '验证码' in response.text:
            print("检测到验证码,等待10秒后重试...")
            time.sleep(10)
            return []

        soup = BeautifulSoup(response.text, 'html.parser')
        print(f"页面标题: {soup.title.string if soup.title else '无标题'}")

        movies = []

        # Dump the start of the document to help debug selector changes.
        print("\n页面结构:")
        print(soup.prettify()[:1000])  # Print first 1000 characters

        for item in soup.find_all('div', class_='item'):
            print("\n找到一个电影项:")
            movie_data = _parse_movie_item(item)
            if movie_data:  # Only add if we got some data
                print("\n收集到的电影数据:")
                print(movie_data)
                movies.append(movie_data)

        print(f"\n本页收集到的电影数量: {len(movies)}")
        return movies

    except Exception as e:
        # Broad boundary catch: any network or parse failure yields an
        # empty page instead of aborting the whole crawl.
        print(f"获取电影数据时出错: {str(e)}")
        return []


def _parse_movie_item(item):
    """Extract one movie's fields from its ``div.item`` BeautifulSoup tag.

    Returns a dict containing whichever fields could be parsed; missing
    elements are simply skipped, so the dict may be partially filled or
    empty.
    """
    movie_data = {}

    # The first <p> inside div.bd carries both the year span and the
    # director/actors text — look it up once and reuse it below.
    info_elem = item.find('div', class_='bd').find('p')

    # Chinese title.
    title_elem = item.find('span', class_='title')
    if title_elem:
        print(f"标题: {title_elem.text}")
        movie_data['中文标题'] = title_elem.text.strip()

    # English / alternate title, if available; strip the '/' separators.
    eng_title_elem = item.find('span', class_='other')
    if eng_title_elem:
        print(f"英文标题: {eng_title_elem.text}")
        movie_data['英文标题'] = eng_title_elem.text.strip().replace('/', '').strip()

    # Release year, rendered like "(1994)" — strip the parentheses.
    if info_elem:
        year_span = info_elem.find('span', class_='year')
        if year_span:
            print(f"年份: {year_span.text}")
            movie_data['年份'] = year_span.text.strip('()')

    # Numeric rating; guard the float() so one malformed item does not
    # abort parsing of the whole page (mirrors the vote-count guard).
    rating_elem = item.find('span', class_='rating_num')
    if rating_elem:
        print(f"评分: {rating_elem.text}")
        try:
            movie_data['评分'] = float(rating_elem.text)
        except ValueError:
            print(f"警告: 无法转换评分: {rating_elem.text}")
            movie_data['评分'] = None

    # Vote count: the 4th <span> in div.star, e.g. "123456人评价".
    star_div = item.find('div', class_='star')
    if star_div:
        votes_elem = star_div.find_all('span')
        if votes_elem and len(votes_elem) >= 4:
            votes_text = votes_elem[3].text.strip()
            if '人评价' in votes_text:
                print(f"评价人数: {votes_text}")
                try:
                    movie_data['评价人数'] = int(votes_text.replace('人评价', '').strip())
                except ValueError:
                    print(f"警告: 无法转换评价人数: {votes_text}")
                    movie_data['评价人数'] = None

    # Director and main actors, split out of the free-text info line.
    if info_elem:
        info_text = info_elem.text.strip()
        if '导演' in info_text and '主演' in info_text:
            director = info_text.split('导演: ')[1].split('主演: ')[0].strip()
            actors = info_text.split('主演: ')[1].strip()
            print(f"导演: {director}")
            print(f"主演: {actors}")
            movie_data['导演'] = director
            movie_data['主演'] = actors

    return movie_data
|
|
|
|
|
|
|
|
|
|
def main():
    """Crawl all ten pages of the Douban Top 250 and save them to CSV."""
    all_movies = []

    try:
        # Each page holds 25 movies, so offsets run 0, 25, ..., 225.
        for page_no, offset in enumerate(range(0, 250, 25), start=1):
            print(f"\n正在抓取第 {page_no} 页...")
            page_movies = get_movie_data(offset)
            all_movies.extend(page_movies)
            # Short polite delay between page requests (1-2 seconds).
            time.sleep(random.uniform(1, 2))

            # Report progress after every page.
            print(f"已收集电影数量: {len(all_movies)}")

            # An empty page usually means we were throttled — back off
            # for longer (5-8 seconds) before the next attempt.
            if not page_movies:
                print("等待更长时间以避免检测...")
                time.sleep(random.uniform(5, 8))

        if not all_movies:
            print("\n没有收集到任何电影数据。请检查错误信息。")
        else:
            # Persist everything to a timestamped, Excel-friendly CSV.
            frame = pd.DataFrame(all_movies)
            stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            frame.to_csv(f'豆瓣电影_{stamp}.csv', index=False, encoding='utf-8-sig')
            print(f"\n成功保存 {len(all_movies)} 部电影到 CSV 文件!")

    except Exception as e:
        # Top-level boundary: report the failure instead of crashing.
        print(f"主程序出错: {str(e)}")
|
|
|
|
|
|
|
|
|
|
# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|