From 5f7d6b8aa740943fae05d6f572f808e4f3769b2c Mon Sep 17 00:00:00 2001
From: pn2yfabhg <1249144594@qq.com>
Date: Sat, 8 Jun 2024 17:06:53 +0800
Subject: [PATCH] ADD file via upload

---
 test.py | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 110 insertions(+)
 create mode 100644 test.py

diff --git a/test.py b/test.py
new file mode 100644
index 0000000..9780a0b
--- /dev/null
+++ b/test.py
@@ -0,0 +1,110 @@
+import os
+import random
+
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+
+
+def requests_get(url):
+    # Pick a random desktop User-Agent per request so consecutive requests
+    # do not all present the same client
+    user_agent_list = [
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
+        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
+        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
+        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
+        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
+        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
+    ]
+    headers = {"User-Agent": random.choice(user_agent_list)}
+    try:
+        response = requests.get(url, headers=headers)
+        response.raise_for_status()
+        return response.text
+    except requests.exceptions.RequestException as e:
+        print(f"Request failed: {e}")
+        return None
+
+
+# BeautifulSoup offers three lookup methods, all used below:
+# - find: returns the first matching tag; you can locate a tag by an
+#   attribute value such as title. To match on class, use the class_
+#   keyword (trailing underscore, since class is a Python keyword).
+# - find_all: returns a list of every matching tag.
+# - select: takes a CSS selector.
+# With these, values can be pulled out of the page by analysing the tag
+# paths in the page source.
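+#
+# A minimal sketch of the three styles on made-up markup (this snippet is
+# illustrative only, not Douban's actual HTML):
+#
+#   demo = BeautifulSoup('<div class="item"><span class="title">A</span>'
+#                        '<span class="title">B</span></div>', 'html.parser')
+#   demo.find('span', class_='title')      # first match only: the "A" span
+#   demo.find_all('span', class_='title')  # list of both spans
+#   demo.select('div.item > span.title')   # same list, via a CSS selector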
+
+
+def get_movie_info(url):
+    html = requests_get(url)
+    if html is None:
+        return []
+    soup = BeautifulSoup(html, 'html.parser')
+    item_elements = soup.find_all("div", class_="item")
+
+    movie_list = []
+    for item in item_elements:
+        movie = {}
+        # Movie ranking (the <em> tag carries an empty class attribute)
+        movie['ranking'] = item.find('em', class_='').get_text(strip=True)
+        # Movie title
+        movie['title'] = item.find('span', class_='title').get_text(strip=True)
+        # Link to the movie's detail page
+        movie['link'] = item.find('a', href=True)['href']
+        # Rating
+        movie['rating'] = item.find('span', class_='rating_num').get_text(strip=True)
+        # One-line tagline; not every movie has one
+        try:
+            movie['profile'] = item.find('span', class_='inq').get_text(strip=True)
+        except AttributeError:
+            movie['profile'] = 'N/A'
+        span_list = item.find_all('span')
+        if len(span_list) >= 6:
+            # The review count reads like '123456人评价' ('people rated');
+            # strip the Chinese suffix to keep just the number
+            movie['num_reviews'] = span_list[-2].get_text(strip=True).replace('人评价', '')
+        movie['poster'] = item.find('img')['src']
+        # The first <p> holds director/cast on its first line and
+        # 'year / country / genre' on its second
+        movie_infos = item.select('p')[0].get_text().strip()
+        info_line = movie_infos.split('\n')[1]
+
+        if movie['title'] == '大闹天宫':
+            # '大闹天宫' lists several release years, which pushes the
+            # country and genre fields further along the line
+            movie['type'] = info_line.split('/')[4].strip()
+            movie['country'] = info_line.split('/')[3].split(' ')[0].strip()
+        else:
+            movie['type'] = info_line.split('/')[2].strip()
+            movie['country'] = info_line.split('/')[1].split(' ')[0].strip()
+        # Drop the '(中国大陆)' (mainland China) suffix some years carry
+        movie['year'] = info_line.split('/')[0].strip().replace('(中国大陆)', '')
+
+        # Fetch the detail page for director, runtime and lead actor
+        detail_html = requests_get(movie['link'])
+        if detail_html is None:
+            movie_list.append(movie)
+            continue
+        movie_soup = BeautifulSoup(detail_html, 'html.parser')
+        movie['director'] = movie_soup.find("a", rel="v:directedBy").get_text(strip=True)
+        # The runtime text carries a '片长' ('runtime') label; remove it
+        movie['time'] = movie_soup.find("span", property="v:runtime").get_text(strip=True).replace('片长', '')
+        try:
+            movie['actor'] = movie_soup.find("a", rel="v:starring").get_text(strip=True)
+        except AttributeError:
+            movie['actor'] = 'N/A'
+        movie_list.append(movie)
+
+    return movie_list
+
+
+if __name__ == '__main__':
+    base_url = 'https://movie.douban.com/top250'
+    movie_data = []
+
+    # The Top 250 list is paginated 25 movies at a time, so page n starts
+    # at offset n * 25 (?start=0, ?start=25, ..., ?start=225)
+    for page in range(10):
+        start = page * 25
+        url = f'{base_url}?start={start}'
+        print(f"Scraping page {page + 1}: {url}")
+        movies = get_movie_info(url)
+        movie_data.extend(movies)
+
+    print('Scraping finished')
+    df = pd.DataFrame(movie_data)
+
+    # Save the data as a CSV file, creating the target directory if needed
+    csv_file = 'flaskProject/data/export.csv'
+    os.makedirs(os.path.dirname(csv_file), exist_ok=True)
+    df.to_csv(csv_file, index=False, encoding='utf-8')
+    print('Saved')
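+
+# A quick sanity check once the export exists (a hypothetical follow-up,
+# assuming the script above has already run):
+#
+#   df = pd.read_csv('flaskProject/data/export.csv')
+#   print(df[['ranking', 'title', 'rating']].head())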