"""Crawl the Douban Top-250 movie list and save it to export.csv.

For each of the ten list pages the scraper extracts the basic fields
from every result item, then follows each movie's detail link to pick
up the director, runtime and lead actor, and finally writes all rows
to ``export.csv`` via pandas.
"""
import requests
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd

# Douban rejects requests without a browser-like User-Agent, so every
# request (list pages and detail pages) shares this header.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36"
    )
}


def requests_get(url):
    """GET *url* and return the response body text, or None on failure.

    A timeout keeps a stalled connection from hanging the whole crawl;
    raise_for_status turns 4xx/5xx responses into exceptions so they
    are reported the same way as network errors.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()  # raise on HTTP error status codes
        return response.text
    except requests.exceptions.RequestException as e:  # network or HTTP error
        print(f"请求失败: {e}")
        return None


def get_movie_info(html):
    """Parse one Top-250 list page and return a list of movie dicts.

    Each dict holds: ranking, title, link, rating, profile, num_reviews,
    poster, type, country, year, director, time (runtime) and actor.
    Follows every movie's detail link for director/runtime/actor, so a
    single call issues up to 25 additional HTTP requests.
    """
    soup = BeautifulSoup(html, 'lxml')
    movie_list = []

    for item in soup.find_all("div", class_="item"):
        movie = {}
        # Ranking / title / detail link / rating.
        # strip=True removes surrounding whitespace from the text nodes.
        movie['ranking'] = item.find('em', class_='').get_text(strip=True)
        movie['title'] = item.find('span', class_='title').get_text(strip=True)
        movie['link'] = item.find('a', href=True)['href']
        movie['rating'] = item.find('span', class_='rating_num').get_text(strip=True)

        # One-line quote ("inq") — some movies have none; default to '无'.
        try:
            movie['profile'] = item.find('span', class_='inq').get_text(strip=True)
        except AttributeError:
            movie['profile'] = '无'

        # Review count sits in the second-to-last <span> when the item
        # has enough spans.  Default to '无' so every row has the same
        # columns (the original left the key missing entirely).
        spans = item.find_all('span')
        if len(spans) >= 6:
            movie['num_reviews'] = spans[-2].get_text(strip=True).replace('人评价', '')
        else:
            movie['num_reviews'] = '无'

        # Poster image URL.
        movie['poster'] = item.find('img', class_='')['src']

        # Type / country / year come from the second line of the info
        # paragraph, e.g. "1994 / 美国 / 犯罪 剧情".
        # BUG FIX: the original called item.select('p', class_="") —
        # select() accepts only a CSS selector, so use find() instead.
        movie_infos = item.find('p', class_='').get_text().strip()
        parts = movie_infos.split('\n')[1].split('/')
        if movie['title'] == '大闹天宫':
            # This movie's info line carries extra "/"-separated release
            # years, shifting country/type two slots to the right.
            movie['type'] = parts[4].strip()
            movie['country'] = parts[3].split(' ')[0].strip()
        else:
            movie['type'] = parts[2].strip()
            movie['country'] = parts[1].split(' ')[0].strip()
        # BUG FIX: the original used .strip('(中国大陆)'), but str.strip's
        # argument is a *character set* stripped from both ends, not a
        # suffix.  Cut everything from the first '(' instead, which
        # removes any parenthesized region annotation.
        movie['year'] = parts[0].strip().split('(')[0].strip()

        # Detail page: director, runtime, lead actor.
        # BUG FIX: requests_get may return None; skip parsing instead of
        # crashing inside BeautifulSoup(None, 'lxml').
        detail_html = requests_get(movie['link'])
        if detail_html:
            detail = BeautifulSoup(detail_html, 'lxml')
            movie['director'] = detail.find("a", rel="v:directedBy").get_text()
            movie['time'] = detail.find(
                "span", property="v:runtime"
            ).get_text(strip=True).replace('片长', '')
            try:
                movie['actor'] = detail.find("a", rel="v:starring").get_text(strip=True)
            except AttributeError:
                movie['actor'] = '无'
        else:
            movie['director'] = movie['time'] = movie['actor'] = '无'

        movie_list.append(movie)

    return movie_list


def main():
    """Crawl all ten list pages and export the collected rows as CSV."""
    movie_data = []
    for start in range(0, 250, 25):
        # BUG FIX: reuse requests_get (shared headers, timeout, error
        # handling) instead of a duplicated, unguarded requests.get call.
        html = requests_get(f'https://movie.douban.com/top250?start={start}')
        if html:
            movie_data.extend(get_movie_info(html))
        print(f'爬取进度 {start}/250')
        sleep(3)  # be polite: pause between list-page requests
    print('爬取完成')
    print(movie_data)
    df = pd.DataFrame(movie_data)  # convert rows to a DataFrame
    df.to_csv('export.csv', index=False, encoding='utf-8')
    print('存储完成')


if __name__ == '__main__':
    main()