import random

import requests
import pandas as pd
from bs4 import BeautifulSoup
def requests_get(url):
    # Pick a random User-Agent per request so the traffic looks less uniform.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    ]
    headers = {
        "User-Agent": random.choice(user_agent_list)
    }
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None
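
# Usage sketch (illustrative; the entry point at the bottom of the file does
# the same thing). Note that requests_get() returns None on failure, so the
# result should be checked before parsing:
#   html = requests_get('https://movie.douban.com/top250')
#   if html is not None:
#       soup = BeautifulSoup(html, 'html.parser')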
# BeautifulSoup offers three lookup methods used below:
# - find: returns the first element matching the criteria; a tag can be
#   located by an attribute value (e.g. title). Matching on class requires
#   the class_ keyword (note the trailing underscore).
# - find_all: returns a list of every matching tag.
# - select: looks tags up with CSS selectors.
# With these basics, we can analyze the tag paths in the page source and extract the values.
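
# A minimal, self-contained sketch of the three lookup styles above; the
# inline HTML snippet is hypothetical and exists only for illustration.
# Call _bs4_lookup_demo() manually to see the output.
def _bs4_lookup_demo():
    demo = BeautifulSoup(
        '<div class="item"><span class="title">A</span>'
        '<span class="title">B</span></div>',
        'html.parser',
    )
    print(demo.find('span', class_='title'))      # first match only: <span class="title">A</span>
    print(demo.find_all('span', class_='title'))  # every match, as a list
    print(demo.select('div.item > span.title'))   # the same lookup via a CSS selector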
def get_movie_info(url):
    html = requests_get(url)
    if html is None:
        # requests_get() already reported the error; skip this page.
        return []
    soup = BeautifulSoup(html, 'html.parser')
    item_elements = soup.find_all("div", class_="item")
    movie_list = []
    for item in item_elements:
        movie = {}
        # Movie ranking
        movie['ranking'] = item.find('em', class_='').get_text(strip=True)
        # Movie title
        movie['title'] = item.find('span', class_='title').get_text(strip=True)
        # Link to the movie's detail page
        movie['link'] = item.find('a', href=True)['href']
        # Rating
        movie['rating'] = item.find('span', class_='rating_num').get_text(strip=True)
        # One-line synopsis; a few entries have none
        try:
            movie['profile'] = item.find('span', class_='inq').get_text(strip=True)
        except AttributeError:
            movie['profile'] = ''
        # Review count sits in the second-to-last <span>, e.g. "12345人评价"
        span_list = item.find_all('span')
        if len(span_list) >= 6:
            movie['num_reviews'] = span_list[-2].get_text(strip=True).replace('人评价', '')
        # Poster image URL
        movie['poster'] = item.find('img', class_='')['src']
        # The second line of the item's <p> block reads "year / country / genre"
        movie_infos = item.select('p')[0].text.strip()
        info_line = movie_infos.split('\n')[1]
        if movie['title'] == '大闹天宫':
            # This entry lists several release years, shifting the fields by two
            movie['type'] = info_line.split('/')[4].strip()
            movie['country'] = info_line.split('/')[3].split(' ')[0].strip()
        else:
            movie['type'] = info_line.split('/')[2].strip()
            movie['country'] = info_line.split('/')[1].split(' ')[0].strip()
        movie['year'] = info_line.split('/')[0].strip().replace('(中国大陆)', '')
        # Fetch the detail page for the director, runtime and lead actor
        movie_soup = BeautifulSoup(requests_get(movie['link']), 'html.parser')
        movie['director'] = movie_soup.find("a", rel="v:directedBy").get_text()
        movie['time'] = movie_soup.find("span", property="v:runtime").get_text(strip=True).replace('片长', '')
        try:
            movie['actor'] = movie_soup.find("a", rel="v:starring").get_text(strip=True)
        except AttributeError:
            movie['actor'] = ''
        movie_list.append(movie)
    return movie_list
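
# Each record returned by get_movie_info() is a dict with the keys ranking,
# title, link, rating, profile, poster, type, country, year, director, time,
# actor, and usually num_reviews. A quick sanity check (illustrative only;
# assumes the requests succeed):
#   movies = get_movie_info('https://movie.douban.com/top250?start=0')
#   print(movies[0]['title'], movies[0]['rating'])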
if __name__ == '__main__':
    base_url = 'https://movie.douban.com/top250'
    movie_data = []
    # The Top 250 list spans 10 pages of 25 movies each (?start=0, 25, ..., 225)
    for page in range(10):
        start = page * 25
        url = f'{base_url}?start={start}'
        print(f"Scraping page {page + 1}: {url}")
        movies = get_movie_info(url)
        movie_data.extend(movies)
    print('Scraping finished')
    df = pd.DataFrame(movie_data)
    # Save the data as a CSV file
    csv_file = 'flaskProject/data/export.csv'
    df.to_csv(csv_file, index=False, encoding='utf-8')
    print('CSV saved')