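"""Scrape the Douban Movie Top 250 (https://movie.douban.com/top250).

For each movie the script collects its ranking, title, detail-page link,
rating, one-line profile, number of reviews, poster URL, genre, country,
year, director, runtime and lead actor, then saves everything to a CSV file.
"""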
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random


def requests_get(url):
    """Fetch a URL with a random browser User-Agent; return the HTML text or None."""
    # Rotating the User-Agent makes successive requests look like they come
    # from different browsers, which helps avoid simple anti-scraping checks.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    ]
    user_agent = random.choice(user_agent_list)
    headers = {
        "User-Agent": user_agent,
    }
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None
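
# Example usage (illustrative): requests_get('https://movie.douban.com/top250')
# returns the Top 250 page's HTML as a string, or None if the request failed.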


def get_movie_info(url):
    # The three BeautifulSoup lookups used below:
    #   find:     returns the first element that matches; it can match on an
    #             attribute value (for example the title attribute)
    #   find_all: returns a list of every matching element; matching on a CSS
    #             class needs the keyword class_ (trailing underscore, because
    #             class is a Python keyword)
    #   select:   looks elements up with a CSS selector
    # With these basics we can analyse the tag paths in the page source and
    # pull out the values we need.
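    # A minimal illustration on made-up markup (the tag names and classes here
    # are hypothetical, not taken from Douban's pages):
    #   demo = BeautifulSoup('<div class="item"><span class="title">A</span></div>', 'html.parser')
    #   demo.find('span', class_='title')       # first match: <span class="title">A</span>
    #   demo.find_all('span')                   # list of all <span> tags
    #   demo.select('div.item > span.title')    # list holding that same element, via a CSS selector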
    html = requests_get(url)
    if html is None:
        # requests_get() returns None on failure, which BeautifulSoup would
        # reject, so bail out with an empty list instead.
        return []
    soup = BeautifulSoup(html, 'html.parser')
    item_elements = soup.find_all("div", class_="item")
    # print(item_elements)
    movie_list = []
    for item in item_elements:
        movie = {}
        # Movie ranking (the ranking <em> tag carries an empty class attribute)
        movie['ranking'] = item.find('em', class_='').get_text(strip=True)
        # Movie title
        movie['title'] = item.find('span', class_='title').get_text(strip=True)
        # Link to the movie's detail page
        movie['link'] = item.find('a', href=True)['href']
        # Movie rating
        movie['rating'] = item.find('span', class_='rating_num').get_text(strip=True)
        # One-line profile; some movies have none, so guard with a try/except
        try:
            movie['profile'] = item.find('span', class_='inq').get_text(strip=True)
        except AttributeError:
            movie['profile'] = ''
        # Number of reviews: the second-to-last <span> holds a "N人评价" text
        span_list_len = len(item.find_all('span'))
        if span_list_len >= 6:
            num_reviews = item.find_all('span')[span_list_len - 2].get_text(strip=True).replace('人评价', '')
            movie['num_reviews'] = num_reviews
        # Poster image URL
        poster = item.find('img', class_='')['src']
        movie['poster'] = poster
        # Genre, country of first release and release year all live in one
        # "/"-separated line inside the item's first <p> tag; select() takes
        # a CSS selector string.
        movie_infos = item.select('p')[0].text.strip()
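        # The second line of movie_infos normally looks like (illustrative
        # values, not taken from the live page):
        #   "1994 / 美国 / 犯罪 剧情"
        # so after split('/') the year is at index 0, the country at index 1
        # and the genre at index 2.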
        # print(movie_infos)
        if movie['title'] == '大闹天宫':
            # Special case: this entry's info line carries two extra
            # "/"-separated fields (additional release years), so genre and
            # country sit two positions further to the right.
            movie['type'] = movie_infos.split('\n')[1].split('/')[4].strip()
            movie['country'] = movie_infos.split('\n')[1].split('/')[3].split(' ')[0].strip()
        else:
            movie['type'] = movie_infos.split('\n')[1].split('/')[2].strip()
            movie['country'] = movie_infos.split('\n')[1].split('/')[1].split(' ')[0].strip()
        # print(movie['type'])
        # print(movie['country'])
        # strip('(中国大陆)') removes any of those characters from both ends,
        # cleaning a "(中国大陆)" suffix off the year.
        movie['year'] = movie_infos.split('\n')[1].split('/')[0].strip(' ').strip('(中国大陆)')
        # print(movie['year'])
        # Director and release date.
        # Note: we want a full year-month-day date, and on the Top-250 page
        # some director names are long enough to truncate the lead actor's
        # full name, so we request each movie's detail page as well.
        detail_html = requests_get(movie['link'])
        if detail_html is None:
            # Keep what we scraped from the list page if the detail page fails.
            movie_list.append(movie)
            continue
        movie_soup = BeautifulSoup(detail_html, 'html.parser')
        movie['director'] = movie_soup.find("a", rel="v:directedBy").get_text()
        # movie['year'] = movie_soup.find("span", property="v:initialReleaseDate").get_text().split('(')[0]
        # Runtime
        movie['time'] = movie_soup.find("span", property="v:runtime").get_text(strip=True).replace('片长', '')
        # Lead actor; the field is empty for some movies, hence the exception handling
        try:
            movie['actor'] = movie_soup.find("a", rel="v:starring").get_text(strip=True)
        except AttributeError:
            movie['actor'] = ''
        # print(movie)
        movie_list.append(movie)
    return movie_list


if __name__ == '__main__':
    # print(requests_get('https://movie.douban.com/top250'))  # debug: dump the raw HTML
    base_url = 'https://movie.douban.com/top250'
    movie_data = []
    for page in range(10):
        start = page * 25
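        # Douban paginates 25 movies per page, so page 0 -> start=0,
        # page 1 -> start=25, ..., page 9 -> start=225.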
        # url = 'https://movie.douban.com/top250?start=50'
        url = f'{base_url}?start={start}'
        print(f"Scraping page {page + 1}: {url}")
        movies = get_movie_info(url)
        movie_data.extend(movies)
    print('Scraping finished')
    df = pd.DataFrame(movie_data)
    # Save the data as a CSV file
    csv_file = 'flaskProject/data/export.csv'
    df.to_csv(csv_file, index=False, encoding='utf-8')
    print('Save complete')
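
# The resulting CSV holds one row per movie with the columns: ranking, title,
# link, rating, profile, num_reviews, poster, type, country, year, director,
# time and actor.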