"""Scrape Ctrip restaurant listings per cuisine and export them to Excel.

For every cuisine the listing page is fetched, each restaurant's detail page
is visited, and one row per restaurant is accumulated in the module-level
``dic`` list, which ``main`` finally writes to ``携程餐厅数据.xlsx``.
"""

import logging

import requests
from bs4 import BeautifulSoup
import pandas as pd

logger = logging.getLogger(__name__)

# Site root; restaurant links on the listing page are relative to it.
BASE_URL = 'https://you.ctrip.com'

# Seconds to wait for a response. Without a timeout ``requests`` can block
# forever on a stalled connection.
REQUEST_TIMEOUT = 10

# Spoofed browser request headers.
# NOTE(review): the cookie is a captured browser session committed to the
# repository; consider loading it from the environment instead.
headers = {
    'cookie': 'ASP.NET_SessionSvc=MTAuMTEzLjkyLjkzfDkwOTB8b3V5YW5nfGRlZmF1bHR8MTYzODQzNDE5NTI2NQ; _bfa=1.1727420783628.2v4j7k.1.1727420783628.1727420783628.1.1.154019; _ubtstatus=%7B%22vid%22%3A%221727420783628.2v4j7k%22%2C%22sid%22%3A1%2C%22pvid%22%3A1%2C%22pid%22%3A154019%7D; Hm_lvt_e4211314613fcf074540918eb10eeecb=1727420784; Hm_lpvt_e4211314613fcf074540918eb10eeecb=1727420784; HMACCOUNT=34C5D5D87EFFB691; GUID=09031042416180792327; MKT_CKID=1727420785262.vgfup.ox1u; _jzqco=%7C%7C%7C%7C1727420785652%7C1.608322706.1727420785265.1727420785265.1727420785265.1727420785265.1727420785265.0.0.0.1.1; _RF1=182.88.188.57; _RSG=SnajKuQaCp5rinW_EZPRm8; _RDG=28338c247be07326791c678684a664d512; _RGUID=5ccc91ae-14b6-4c5c-8c1b-6633bf831b35; _bfaStatusPVSend=1; _bfaStatus=success; _bfi=p1%3D154019%26p2%3D0%26v1%3D1%26v2%3D0',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
}

# Accumulates one dict per scraped restaurant; filled by ``work``.
dic = []


def _fetch_soup(url, headers):
    """Download *url* and return its HTML parsed with the lxml parser.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            4xx/5xx response status.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')


def _parse_tab_label(text):
    """Split a tab label such as ``'很好(12)'`` into ``('很好', '12')``.

    Returns ``None`` when the label contains no ``(``, so the caller can skip
    it. Slicing with the ``-1`` that ``str.find`` returns for a missing
    parenthesis would silently cut the label in the wrong place.
    """
    label, paren, remainder = text.partition('(')
    if not paren:
        return None
    count, _, _ = remainder.partition(')')
    return label, count


def _scrape_restaurant(item, headers, cuisine):
    """Build the result row for one listing entry.

    Reads the average price, rating and link from the listing entry, then
    fetches the restaurant's detail page for its name, total comment count
    and per-category review counts.

    Raises:
        AttributeError, IndexError, TypeError: when an expected element is
            missing from the listing entry or the detail page.
        requests.RequestException: when the detail page cannot be fetched.
    """
    # Look the detail box up once instead of once per extracted field.
    detail_box = item.find_next('div', class_='rdetailbox')
    info = detail_box.find('dl')

    # Average spend per person, without the currency sign.
    avg_price = info.find_all('dd')[1].find_next('span').text.replace('¥', '')
    # Restaurant rating, without the trailing "分" unit.
    rating = detail_box.find('ul').find_all('li')[0].find_next('a').text
    rating = rating.replace('\xa0分', '')
    # Absolute URL of the restaurant's detail page.
    detail_url = BASE_URL + info.find('dt').find('a').get('href')

    detail_soup = _fetch_soup(detail_url, headers)
    tabs = detail_soup.find('ul', class_='tablist').find_all('li')
    name = detail_soup.find('div', class_='f_left').find('h1').text
    total_comments = (
        detail_soup.find('dfn', id='hrefyyDp')
        .find('span', class_='f_orange')
        .text
    )

    # Review count per tab; tabs without a "(count)" part are skipped.
    detailed_scoring = {}
    for tab in tabs:
        parsed = _parse_tab_label(tab.find_next('a').text)
        if parsed is None:
            continue
        label, count = parsed
        detailed_scoring[label] = count

    return {
        '餐厅名称': name,
        '人均消费': avg_price,
        '评分': rating,
        '餐厅链接': detail_url,
        '评论总数': total_comments,
        **detailed_scoring,
        '菜系': cuisine
    }


def work(url, headers, cuisine):
    """Scrape one listing page and append a row per restaurant to ``dic``.

    Args:
        url: URL of the restaurant listing page.
        headers: HTTP headers sent with every request.
        cuisine: Cuisine label stored under the ``'菜系'`` key of each row.

    A restaurant whose markup is incomplete or whose detail page cannot be
    fetched is logged and skipped, so one bad entry no longer aborts the run.

    Raises:
        requests.RequestException: when the listing page itself cannot be
            fetched.
    """
    soup = _fetch_soup(url, headers)

    container = soup.find('div', class_='list_wide_mod2')
    if container is None:
        logger.warning('No restaurant list found on %s', url)
        return

    for item in container.find_all('div', class_='list_mod2'):
        try:
            record = _scrape_restaurant(item, headers, cuisine)
        except (AttributeError, IndexError, TypeError,
                requests.RequestException) as exc:
            logger.warning('Skipping a restaurant on %s: %r', url, exc)
            continue
        dic.append(record)


def main(pages=1):
    """Scrape every configured cuisine and save the rows to Excel.

    Args:
        pages: Number of listing pages to fetch per cuisine, starting at
            page 1. The default of 1 matches the previous fixed behaviour.
    """
    # Cuisine name -> numeric code used in the listing URL.
    cuisine_dic = {'火锅': 4903, '广西菜': 4918, '西餐': 4900, '烧烤': 4931,
                   '川菜': 4896, '粤菜': 4898, '湘菜': 4915, '鲁菜': 4920,
                   '闽菜': 4916, '徽菜': 4911}
    # Each listing page holds 15 entries.
    for cuisine, code in cuisine_dic.items():
        for page in range(1, pages + 1):
            url = f'https://you.ctrip.com/restaurantlist/China110000/list-c{code}-p{page}.html'
            try:
                work(url=url, headers=headers, cuisine=cuisine)
            except requests.RequestException as exc:
                # Keep going so rows already collected are still exported.
                logger.error('Failed to fetch listing page %s: %r', url, exc)
    # Print the collected rows.
    print(dic)
    # Save the rows to an Excel workbook.
    df = pd.DataFrame(dic)
    df.to_excel('携程餐厅数据.xlsx', index=False)


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main()