|
|
import requests
|
|
|
from bs4 import BeautifulSoup
|
|
|
import pandas as pd
|
|
|
import random
|
|
|
|
|
|
def requests_get(url, timeout=10):
    """Fetch *url* with a randomly chosen browser User-Agent header.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        The response body as text, or None when the request fails, times
        out, or the server answers with an HTTP error status. The failure
        is printed rather than raised.
    """
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    ]
    # A different User-Agent is picked for every call.
    headers = {"User-Agent": random.choice(user_agent_list)}

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Turn 4xx/5xx answers into an exception so they take the same
        # failure path as connection errors and timeouts.
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"请求失败: {e}")
        return None
|
|
|
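# Three BeautifulSoup lookup methods:
#   find:
#     - Returns the first element that matches the condition.
#     - A tag object can be located by the value of its title attribute.
#     - To match on class, the keyword needs a trailing underscore (class_).
#   find_all:
#     - Returns a list containing every matching tag.
#   select
# With these basics, analyse the tag paths in the page source to extract
# the values.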
|
|
|
|
|
|
def _text_or_placeholder(tag, strip=True):
    """Return the text of *tag*, or '无' when the lookup found nothing (None)."""
    if tag is None:
        return '无'
    return tag.get_text(strip=strip)


def get_movie_info(url):
    """Scrape every film entry on one list page, plus each film's detail page.

    Each ``<div class="item">`` on the list page yields one dict. The film's
    detail page (the entry's first link) is then fetched to read the
    director, runtime and first listed actor.

    Args:
        url: Address of the list page to scrape.

    Returns:
        A list of dicts, one per entry, with the keys ``ranking``, ``title``,
        ``link``, ``rating``, ``profile``, ``num_reviews`` (only when the
        entry contains at least six ``<span>`` tags), ``poster``, ``type``,
        ``country``, ``year``, ``director``, ``time`` and ``actor``. Optional
        values that are absent are stored as '无'. An empty list is returned
        when the list page itself could not be fetched.
    """
    page_html = requests_get(url)
    if page_html is None:
        # requests_get reports failures by returning None, which
        # BeautifulSoup cannot parse; there is nothing to extract.
        return []

    soup = BeautifulSoup(page_html, 'html.parser')
    movie_list = []

    for item in soup.find_all("div", class_="item"):
        movie = {}
        # Ranking number
        movie['ranking'] = item.find('em', class_='').get_text(strip=True)
        # Film title (first title span)
        movie['title'] = item.find('span', class_='title').get_text(strip=True)
        # Link to the film's detail page
        movie['link'] = item.find('a', href=True)['href']
        # Rating score
        movie['rating'] = item.find('span', class_='rating_num').get_text(strip=True)
        # One-line blurb; not every entry has one
        movie['profile'] = _text_or_placeholder(item.find('span', class_='inq'))

        # The review count is read from the second-to-last <span>, and only
        # when the entry has at least six spans.
        spans = item.find_all('span')
        if len(spans) >= 6:
            movie['num_reviews'] = spans[-2].get_text(strip=True).replace('人评价', '')

        movie['poster'] = item.find('img', class_='')['src']

        # The first <p> holds the info text; its second line is a
        # '/'-separated list: year(s) / country / genres.
        info_text = item.select('p')[0].text.strip()
        fields = info_text.split('\n')[1].split('/')

        # Genre is the last field and country the one before it. Indexing
        # from the end covers both the ordinary three-field line and lines
        # that list several release years (previously special-cased by title
        # for '大闹天宫').
        movie['type'] = fields[-1].strip()
        movie['country'] = fields[-2].split(' ')[0].strip()
        # Keep only what precedes a '(region)' suffix. str.strip(chars)
        # treats its argument as a character set, and strip(' ') leaves
        # non-breaking spaces in place, so neither reliably cleaned the year.
        movie['year'] = fields[0].partition('(')[0].strip()

        # An empty document makes every lookup below return None, so a failed
        # detail request degrades to placeholders instead of aborting.
        detail_html = requests_get(movie['link'])
        detail_soup = BeautifulSoup(detail_html or '', 'html.parser')

        movie['director'] = _text_or_placeholder(
            detail_soup.find("a", rel="v:directedBy"), strip=False
        )
        movie['time'] = _text_or_placeholder(
            detail_soup.find("span", property="v:runtime")
        ).replace('片长', '')
        movie['actor'] = _text_or_placeholder(detail_soup.find("a", rel="v:starring"))

        movie_list.append(movie)

    return movie_list
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    import os

    # Fetch the first list page and print its raw HTML (or None on failure).
    # NOTE(review): looks like a leftover connectivity check; it costs one
    # extra request and floods stdout. Confirm it is still wanted.
    print(requests_get('https://movie.douban.com/top250'))

    base_url = 'https://movie.douban.com/top250'
    movie_data = []

    # Ten pages; the `start` query parameter advances by 25 per page,
    # e.g. https://movie.douban.com/top250?start=50 for the third page.
    for page in range(10):
        start = page * 25
        url = f'{base_url}?start={start}'
        print(f"开始爬取第 {page + 1} 页: {url}")
        movie_data.extend(get_movie_info(url))

    print('爬取完成')

    df = pd.DataFrame(movie_data)

    # Save the data as a CSV file.
    csv_file = 'flaskProject/data/export.csv'
    # to_csv does not create missing directories; create the target folder
    # first so a finished scrape is not lost to an OSError at the last step.
    os.makedirs(os.path.dirname(csv_file), exist_ok=True)
    df.to_csv(csv_file, index=False, encoding='utf-8')
    print('存储完成')
|