You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
86 lines
3.5 KiB
86 lines
3.5 KiB
import requests
|
|
from bs4 import BeautifulSoup
|
|
from time import sleep
|
|
import pandas as pd
|
|
|
|
def requests_get(url, timeout=10):
    """Fetch ``url`` with a browser-like User-Agent and return the body text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole crawl.

    Returns:
        The response body as text, or ``None`` when the request fails
        (connection error, timeout, or a 4xx/5xx status). Failures are
        printed instead of raised, so callers must check for ``None``.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions so they take the same
        # failure path as network errors.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"请求失败: {e}")
        return None
    return response.text
|
|
|
|
def _parse_release_info(info_line):
    """Split a slash-separated "year / country / genres" line into fields.

    Country and genres are taken from the END of the line, so an entry that
    lists several release years before the country parses the same way as a
    plain three-field entry. This replaces a special case keyed on one
    specific film title.

    Args:
        info_line: The raw text line holding the release information.

    Returns:
        A ``(year, country, genres)`` tuple of strings. All three are '无'
        when the line has fewer than three fields.
    """
    # strip() with no argument removes every kind of whitespace, including
    # non-breaking spaces that strip(' ') would leave glued to the value.
    parts = [part.strip() for part in info_line.split('/')]
    if len(parts) < 3:
        return '无', '无', '无'

    # Keep only the leading year of something like "1961(中国大陆)". Cutting
    # at the bracket is explicit, whereas str.strip(chars) treats its
    # argument as a set of characters rather than a suffix.
    year = parts[0].split('(')[0].strip()

    # Only the first listed country is kept.
    country_tokens = parts[-2].split()
    country = country_tokens[0] if country_tokens else '无'

    return year, country, parts[-1]


def _fetch_detail_fields(link):
    """Download a movie's detail page and return director, runtime and actor.

    Every field falls back to '无' when the page cannot be fetched or the
    corresponding tag is missing, so one bad detail page does not abort the
    whole crawl.
    """
    fields = {'director': '无', 'time': '无', 'actor': '无'}

    detail_html = requests_get(link)
    if detail_html is None:  # requests_get has already reported the failure
        return fields

    detail_soup = BeautifulSoup(detail_html, 'lxml')

    director_tag = detail_soup.find("a", rel="v:directedBy")
    if director_tag is not None:
        fields['director'] = director_tag.get_text()

    runtime_tag = detail_soup.find("span", property="v:runtime")
    if runtime_tag is not None:
        fields['time'] = runtime_tag.get_text(strip=True).replace('片长', '')

    # Only the first starring entry is recorded.
    actor_tag = detail_soup.find("a", rel="v:starring")
    if actor_tag is not None:
        fields['actor'] = actor_tag.get_text(strip=True)

    return fields


def get_movie_info(html):
    """Parse one ranking list page and return one dict per movie.

    Each ``div.item`` element yields a dict with the keys ``ranking``,
    ``title``, ``link``, ``rating``, ``profile``, ``num_reviews`` (only when
    the item has at least six ``<span>`` elements), ``poster``, ``type``,
    ``country``, ``year``, ``director``, ``time`` and ``actor``.

    Note that this issues one extra HTTP request per movie (through
    ``requests_get``) to read the detail page.

    Args:
        html: Markup of the list page.

    Returns:
        A list of movie dicts in page order.
    """
    soup = BeautifulSoup(html, 'lxml')
    movie_list = []

    for item in soup.find_all("div", class_="item"):
        movie = {}

        # Rank, title, detail-page link and rating.
        movie['ranking'] = item.find('em', class_='').get_text(strip=True)
        movie['title'] = item.find('span', class_='title').get_text(strip=True)
        movie['link'] = item.find('a', href=True)['href']
        movie['rating'] = item.find('span', class_='rating_num').get_text(strip=True)

        # One-line tagline; not every item has one.
        quote_tag = item.find('span', class_='inq')
        movie['profile'] = quote_tag.get_text(strip=True) if quote_tag is not None else '无'

        # Number of reviews: read from the second-to-last <span>, and only
        # when the item has at least six spans.
        spans = item.find_all('span')
        if len(spans) >= 6:
            movie['num_reviews'] = spans[-2].get_text(strip=True).replace('人评价', '')

        # Poster image URL.
        movie['poster'] = item.find('img', class_='')['src']

        # Genres, country and year come from the second text line of the
        # first <p> under the item.
        info_lines = item.select('p')[0].text.strip().split('\n')
        info_line = info_lines[1] if len(info_lines) > 1 else ''
        year, country, genres = _parse_release_info(info_line)
        movie['type'] = genres
        movie['country'] = country
        movie['year'] = year

        # Director, runtime and first actor live on the detail page.
        movie.update(_fetch_detail_fields(movie['link']))

        movie_list.append(movie)

    return movie_list
|
|
|
|
if __name__ == '__main__':
    # Crawl the ranking 25 movies at a time, collect every movie's fields
    # and export the result to export.csv.
    page_size = 25
    total = 250
    movie_data = []

    for start in range(0, total, page_size):
        # Go through requests_get so list pages get the same headers, status
        # check and error handling as the detail pages.
        html = requests_get(f'https://movie.douban.com/top250?start={start}')
        if html is not None:
            movie_data.extend(get_movie_info(html))
            # Report how many movies have been covered so far, not the
            # offset the page started at.
            print(f'爬取进度 {start + page_size}/{total}')
        # Pause between list pages, whether or not the page succeeded.
        sleep(3)

    print('爬取完成')
    print(movie_data)

    df = pd.DataFrame(movie_data)  # one row per movie, one column per key
    df.to_csv('export.csv', index=False, encoding='utf-8')
    print('存储完成')