diff --git a/test1.py b/test1.py
deleted file mode 100644
index 9dcb9af..0000000
--- a/test1.py
+++ /dev/null
@@ -1,126 +0,0 @@
-import requests
-from bs4 import BeautifulSoup
-import pandas as pd
-import random
-
-
-def requests_get(url):
-    # Rotate through a pool of desktop User-Agent strings so the requests
-    # look less uniform to the server.
-    user_agent_list = [
-        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
-        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
-        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
-        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
-        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
-        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
-        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
-        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
-    ]
-    headers = {"User-Agent": random.choice(user_agent_list)}
-    try:
-        response = requests.get(url, headers=headers)
-        response.raise_for_status()
-        return response.text
-    except requests.exceptions.RequestException as e:
-        print(f"Request failed: {e}")
-        return None
-
-
-def get_movie_info(url):
-    # BeautifulSoup lookup methods used below:
-    #   find     - returns the first matching element; matches on tag name
-    #              and attributes (class is written class_, since "class" is
-    #              a Python keyword)
-    #   find_all - returns a list of every matching element
-    #   select   - CSS-selector lookup, also returns a list
-    # With these, the tag paths seen in the page source can be walked to
-    # extract each value.
-    html = requests_get(url)
-    if html is None:
-        return []
-    soup = BeautifulSoup(html, 'html.parser')
-    item_elements = soup.find_all("div", class_="item")
-
-    movie_list = []
-    for item in item_elements:
-        movie = {}
-        # Ranking
-        movie['ranking'] = item.find('em', class_='').get_text(strip=True)
-        # Title
-        movie['title'] = item.find('span', class_='title').get_text(strip=True)
-        # Detail-page link
-        movie['link'] = item.find('a', href=True)['href']
-        # Rating
-        movie['rating'] = item.find('span', class_='rating_num').get_text(strip=True)
-        # Short blurb; some movies have none, so catch the missing tag
-        try:
-            movie['profile'] = item.find('span', class_='inq').get_text(strip=True)
-        except AttributeError:
-            movie['profile'] = '无'  # placeholder meaning "none"
-        # Number of reviews; "人评价" is the "people rated" suffix on the page
-        span_list = item.find_all('span')
-        if len(span_list) >= 6:
-            movie['num_reviews'] = span_list[-2].get_text(strip=True).replace('人评价', '')
-        # Poster image URL
-        movie['poster'] = item.find('img', class_='')['src']
-
-        # Year, country of first release, and genre share one <p> block whose
-        # second line reads "year / country / genre".
-        movie_infos = item.find('p', class_='').get_text().strip()
-        info_line = movie_infos.split('\n')[1]
-        if movie['title'] == '大闹天宫':
-            # This entry lists extra fields, shifting the slash-separated parts.
-            movie['type'] = info_line.split('/')[4].strip()
-            movie['country'] = info_line.split('/')[3].split(' ')[0].strip()
-        else:
-            movie['type'] = info_line.split('/')[2].strip()
-            movie['country'] = info_line.split('/')[1].split(' ')[0].strip()
-        # strip() drops the "(中国大陆)" suffix that some years carry
-        movie['year'] = info_line.split('/')[0].strip(' ').strip('(中国大陆)')
-
-        # The director and runtime come from the detail page: the Top 250
-        # list truncates long director names and hides the full cast, so the
-        # detail link is fetched for each movie.
-        detail_html = requests_get(movie['link'])
-        if detail_html is None:
-            movie_list.append(movie)
-            continue
-        movie_soup = BeautifulSoup(detail_html, 'html.parser')
-        movie['director'] = movie_soup.find("a", rel="v:directedBy").get_text()
-        # movie['year'] = movie_soup.find("span", property="v:initialReleaseDate").get_text().split('(')[0]
-
-        # Runtime; "片长" is the "runtime" label on the page
-        movie['time'] = movie_soup.find("span", property="v:runtime").get_text(strip=True).replace('片长', '')
-
-        # Lead actor; some entries have none, so catch the missing tag
-        try:
-            movie['actor'] = movie_soup.find("a", rel="v:starring").get_text(strip=True)
-        except AttributeError:
-            movie['actor'] = '无'  # placeholder meaning "none"
-        movie_list.append(movie)
-
-    return movie_list
-
-
-if __name__ == '__main__':
-    base_url = 'https://movie.douban.com/top250'
-    movie_data = []
-
-    # The Top 250 list is paginated 25 movies per page via ?start=N.
-    for page in range(10):
-        start = page * 25
-        url = f'{base_url}?start={start}'
-        print(f"Scraping page {page + 1}: {url}")
-        movie_data.extend(get_movie_info(url))
-
-    print('Scraping finished')
-    df = pd.DataFrame(movie_data)
-
-    # Save the results as a CSV file
-    csv_file = 'flaskProject/data/export.csv'
-    df.to_csv(csv_file, index=False, encoding='utf-8')
-    print('Saved')
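
Note: everything the deleted scraper does rests on the find / find_all / select distinction described in its header comment. Below is a minimal, self-contained sketch of that pattern; the HTML snippet and tag names are made up for illustration and are not Douban's actual markup.

from bs4 import BeautifulSoup

# Illustrative markup only, loosely shaped like one list item
html = """
<div class="item">
  <em class="">1</em>
  <span class="title">Example Movie</span>
  <span class="rating_num">9.7</span>
</div>
"""

soup = BeautifulSoup(html, "html.parser")

# find: the first matching tag; class_ takes a trailing underscore
# because "class" is a Python keyword
item = soup.find("div", class_="item")

# find_all: a list of every matching tag
spans = item.find_all("span")

# select: CSS-selector lookup, also returns a list
title = item.select("span.title")[0]

print(item.find("em").get_text(strip=True))     # 1
print([s.get_text(strip=True) for s in spans])  # ['Example Movie', '9.7']
print(title.get_text(strip=True))               # Example Movie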