diff --git a/douban_scraper.py b/douban_scraper.py
new file mode 100644
index 0000000..fd95c8a
--- /dev/null
+++ b/douban_scraper.py
@@ -0,0 +1,153 @@
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+import time
+from datetime import datetime
+import random
+
+def get_movie_data(page):
+    # Rotate through several User-Agents so requests look less uniform
+    user_agents = [
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'
+    ]
+
+    url = f"https://movie.douban.com/top250?start={page}"
+    headers = {
+        'User-Agent': random.choice(user_agents),
+        'Accept-Language': 'zh-CN,zh;q=0.9',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+        'Connection': 'keep-alive',
+        'Referer': 'https://movie.douban.com/top250',
+        'Cache-Control': 'max-age=0',
+        'Upgrade-Insecure-Requests': '1'
+    }
+
+    try:
+        print(f"\nRequesting page: {url}")
+        response = requests.get(url, headers=headers, timeout=10)
+        print(f"Status code: {response.status_code}")
+        response.raise_for_status()
+
+        # Douban serves a captcha page when it suspects a bot; the marker
+        # string stays in Chinese because it must match the page content
+        if '验证码' in response.text:
+            print("Captcha detected; pausing 10 seconds and skipping this page...")
+            time.sleep(10)
+            return []
+
+        soup = BeautifulSoup(response.text, 'html.parser')
+        print(f"Page title: {soup.title.string if soup.title else 'no title'}")
+
+        movies = []
+
+        # Dump the start of the HTML to help debug selector problems
+        print("\nPage structure:")
+        print(soup.prettify()[:1000])  # first 1000 characters only
+
+        for item in soup.find_all('div', class_='item'):
+            print("\nFound a movie entry:")
+            movie_data = {}
+
+            # Chinese title
+            title_elem = item.find('span', class_='title')
+            if title_elem:
+                print(f"Title: {title_elem.text}")
+                movie_data['chinese_title'] = title_elem.text.strip()
+
+            # Alternate (usually English) title, if available
+            eng_title_elem = item.find('span', class_='other')
+            if eng_title_elem:
+                print(f"Alternate title: {eng_title_elem.text}")
+                movie_data['english_title'] = eng_title_elem.text.strip().replace('/', '').strip()
+
+            # The <p> inside div.bd carries the year, director, and cast
+            info_elem = item.find('div', class_='bd').find('p')
+
+            # Year
+            if info_elem:
+                year_span = info_elem.find('span', class_='year')
+                if year_span:
+                    print(f"Year: {year_span.text}")
+                    movie_data['year'] = year_span.text.strip('()')
+
+            # Rating
+            rating_elem = item.find('span', class_='rating_num')
+            if rating_elem:
+                print(f"Rating: {rating_elem.text}")
+                movie_data['rating'] = float(rating_elem.text)
+
+            # Vote count: the fourth <span> in div.star holds text like
+            # '1234567人评价' ("1234567 ratings")
+            star_div = item.find('div', class_='star')
+            if star_div:
+                votes_elem = star_div.find_all('span')
+                if len(votes_elem) >= 4:
+                    votes_text = votes_elem[3].text.strip()
+                    if '人评价' in votes_text:
+                        print(f"Vote count: {votes_text}")
+                        try:
+                            movie_data['votes'] = int(votes_text.replace('人评价', '').strip())
+                        except ValueError:
+                            print(f"Warning: could not parse vote count: {votes_text}")
+                            movie_data['votes'] = None
+
+            # Director and actors; '导演: ' (director) and '主演: ' (starring)
+            # are the literal labels used in the page text
+            if info_elem:
+                info_text = info_elem.text.strip()
+                if '导演' in info_text and '主演' in info_text:
+                    director = info_text.split('导演: ')[1].split('主演: ')[0].strip()
+                    actors = info_text.split('主演: ')[1].strip()
+                    print(f"Director: {director}")
+                    print(f"Actors: {actors}")
+                    movie_data['director'] = director
+                    movie_data['actors'] = actors
+
+            if movie_data:  # only keep entries where we extracted something
+                print("\nCollected movie data:")
+                print(movie_data)
+                movies.append(movie_data)
+
+        print(f"\nMovies collected on this page: {len(movies)}")
+        return movies
+
+    except Exception as e:
+        print(f"Error while fetching movie data: {e}")
+        return []
+
+def main():
+    all_movies = []
+
+    try:
+        # The Top 250 list spans 10 pages of 25 movies each
+        for page in range(0, 250, 25):
+            print(f"\nScraping page {page//25 + 1}...")
+            movies = get_movie_data(page)
+            all_movies.extend(movies)
+            time.sleep(random.uniform(1, 2))  # random 1-2 second delay between pages
+
+            print(f"Movies collected so far: {len(all_movies)}")
+
+            # Back off longer after an empty page (likely rate limiting)
+            if not movies:
+                print("Waiting longer to avoid detection...")
+                time.sleep(random.uniform(5, 8))
+
+        if all_movies:
+            # Save the results to a timestamped CSV; utf-8-sig keeps the
+            # Chinese text readable when the file is opened in Excel
+            df = pd.DataFrame(all_movies)
+            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+            df.to_csv(f'douban_movies_{timestamp}.csv', index=False, encoding='utf-8-sig')
+            print(f"\nSaved {len(all_movies)} movies to CSV!")
+        else:
+            print("\nNo movie data collected. Check the errors above.")
+
+    except Exception as e:
+        print(f"Error in main: {e}")
+
+if __name__ == "__main__":
+    main()
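
For a quick sanity check of a scrape run, a helper along these lines could load the newest CSV the script wrote and summarize it. This is a minimal sketch, not part of the change above: the hypothetical file name check_output.py, the douban_movies_*.csv glob pattern (matching the output naming in main()), and the column names (chinese_title, year, rating, votes — the keys emitted by get_movie_data()) are all assumptions tied to this version of the script.

    # check_output.py - inspect the most recent scraper output (sketch)
    import glob
    import pandas as pd

    # The timestamp format %Y%m%d_%H%M%S sorts lexicographically,
    # so the last file in sorted order is the newest run
    files = sorted(glob.glob('douban_movies_*.csv'))
    if not files:
        raise SystemExit('No scraper output found; run douban_scraper.py first.')

    df = pd.read_csv(files[-1])
    print(f'{len(df)} movies loaded from {files[-1]}')
    print(df[['chinese_title', 'year', 'rating', 'votes']].head(10))
    print(f"Mean rating: {df['rating'].mean():.2f}")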