# xiecheng/携程餐厅.py

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Spoofed request headers: a browser user-agent plus a cookie string, sent with
# every request made by work().
# NOTE(review): the cookie is a hard-coded captured session value; presumably it
# expires and has to be refreshed by hand — confirm before relying on it.
headers = {
    'cookie': 'ASP.NET_SessionSvc=MTAuMTEzLjkyLjkzfDkwOTB8b3V5YW5nfGRlZmF1bHR8MTYzODQzNDE5NTI2NQ; _bfa=1.1727420783628.2v4j7k.1.1727420783628.1727420783628.1.1.154019; _ubtstatus=%7B%22vid%22%3A%221727420783628.2v4j7k%22%2C%22sid%22%3A1%2C%22pvid%22%3A1%2C%22pid%22%3A154019%7D; Hm_lvt_e4211314613fcf074540918eb10eeecb=1727420784; Hm_lpvt_e4211314613fcf074540918eb10eeecb=1727420784; HMACCOUNT=34C5D5D87EFFB691; GUID=09031042416180792327; MKT_CKID=1727420785262.vgfup.ox1u; _jzqco=%7C%7C%7C%7C1727420785652%7C1.608322706.1727420785265.1727420785265.1727420785265.1727420785265.1727420785265.0.0.0.1.1; _RF1=182.88.188.57; _RSG=SnajKuQaCp5rinW_EZPRm8; _RDG=28338c247be07326791c678684a664d512; _RGUID=5ccc91ae-14b6-4c5c-8c1b-6633bf831b35; _bfaStatusPVSend=1; _bfaStatus=success; _bfi=p1%3D154019%26p2%3D0%26v1%3D1%26v2%3D0',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
}

# Module-level accumulator: work() appends one dict per restaurant, and main()
# prints the list and converts it into a DataFrame.
dic = []


def _split_label_count(text):
    """Split a tab label of the form ``label(count)`` into ``(label, count)``.

    Returns ``None`` when ``text`` does not contain a ``(`` followed by a
    ``)``, so the caller can skip the tab instead of recording a mangled
    key/value pair built from ``-1`` slice indices.
    """
    open_at = text.find('(')
    if open_at == -1:
        return None
    close_at = text.find(')', open_at + 1)
    if close_at == -1:
        return None
    return text[:open_at], text[open_at + 1:close_at]


def _parse_list_entry(div):
    """Return ``(renjun, pingfen, xiechen_url)`` for one list-page entry.

    Raises ``AttributeError``, ``IndexError`` or ``TypeError`` when the
    expected markup (detail box, <dl>/<dd>/<ul>/<li> children, link href)
    is missing.
    """
    # find_next() searches forward in the document starting at ``div``; the
    # lookup is done once and reused instead of being repeated per field.
    box = div.find_next('div', class_='rdetailbox')
    dl = box.find('dl')
    # Average spend per person: the <span> after the second <dd>, without the
    # currency sign.
    renjun = dl.find_all('dd')[1].find_next('span').text.replace('￥', '')
    # Rating: text of the <a> after the first <li>, without the trailing unit.
    pingfen = box.find('ul').find_all('li')[0].find_next('a').text.replace('\xa0分', '')
    # Relative link to the restaurant's detail page; a missing href makes the
    # concatenation below raise TypeError.
    lianjie = dl.find('dt').find('a').get('href')
    return renjun, pingfen, 'https://you.ctrip.com' + lianjie


def _parse_detail_page(xiechen_soup):
    """Return ``(name, total_comments, detailed_scoring)`` from a detail page.

    ``detailed_scoring`` maps each tab label to the count found between its
    parentheses. Raises ``AttributeError`` when an expected element is missing.
    """
    evaluation_number = xiechen_soup.find('ul', class_='tablist').find_all('li')
    name = xiechen_soup.find('div', class_='f_left').find('h1').text
    total_comments = xiechen_soup.find('dfn', id='hrefyyDp').find('span', class_='f_orange').text
    detailed_scoring = {}
    for num in evaluation_number:
        parsed = _split_label_count(num.find_next('a').text)
        if parsed is None:
            continue
        label, count = parsed
        detailed_scoring[label] = count
    return name, total_comments, detailed_scoring


def work(url, headers, cuisine, timeout=10):
    """Scrape one restaurant list page and append a record per restaurant to ``dic``.

    For every entry on the list page the restaurant's detail page is fetched
    as well. Entries whose markup cannot be parsed, or whose detail page
    cannot be fetched, are reported with ``print`` and skipped so that a
    single bad entry does not abort the whole run.

    Args:
        url: URL of the list page.
        headers: HTTP headers sent with every request.
        cuisine: Value stored under the '菜系' key of each record.
        timeout: Seconds passed to ``requests.get`` as ``timeout``; without
            it a stalled connection would block indefinitely.

    Raises:
        requests.RequestException: If the list page itself cannot be fetched
            or answers with an HTTP error status.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an HTTP error rather than parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    container = soup.find('div', class_='list_wide_mod2')
    if container is None:
        print(f'No restaurant list found at {url}; skipping.')
        return

    for div in container.find_all('div', class_='list_mod2'):
        try:
            renjun, pingfen, xiechen_url = _parse_list_entry(div)
        except (AttributeError, IndexError, TypeError) as exc:
            print(f'Skipping a list entry on {url}: {exc!r}')
            continue

        try:
            xiechen_response = requests.get(xiechen_url, headers=headers, timeout=timeout)
            xiechen_response.raise_for_status()
        except requests.RequestException as exc:
            print(f'Skipping {xiechen_url}: {exc!r}')
            continue

        xiechen_soup = BeautifulSoup(xiechen_response.text, 'lxml')
        try:
            name, total_comments, detailed_scoring = _parse_detail_page(xiechen_soup)
        except (AttributeError, IndexError) as exc:
            print(f'Skipping {xiechen_url}: {exc!r}')
            continue

        dic.append({
            '餐厅名称': name,
            '人均消费': renjun,
            '评分': pingfen,
            '餐厅链接': xiechen_url,
            '评论总数': total_comments,
            **detailed_scoring,
            '菜系': cuisine,
        })


def main():
    """Scrape the configured list pages of every cuisine and export the result.

    Calls ``work`` once per (cuisine, page) pair, prints the accumulated
    records in ``dic`` and writes them to an Excel workbook.
    """
    # Cuisine name -> numeric id that is placed after "list-c" in the list URL.
    cuisine_ids = {
        '火锅': 4903,
        '广西菜': 4918,
        '西餐': 4900,
        '烧烤': 4931,
        '川菜': 4896,
        '粤菜': 4898,
        '湘菜': 4915,
        '鲁菜': 4920,
        '闽菜': 4916,
        '徽菜': 4911,
    }
    # Page numbers to fetch: range(first_page, stop_page) covers page 1 only.
    # Original author's note: one list page holds 15 entries.
    first_page, stop_page = 1, 2

    for cuisine_name, cuisine_id in cuisine_ids.items():
        for page in range(first_page, stop_page):
            list_url = f'https://you.ctrip.com/restaurantlist/China110000/list-c{cuisine_id}-p{page}.html'
            work(url=list_url, headers=headers, cuisine=cuisine_name)

    # Show the collected records, then save them to Excel without the index column.
    print(dic)
    pd.DataFrame(dic).to_excel('携程餐厅数据.xlsx', index=False)


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
ADD file via upload 2 months ago			`import requests`
			`from bs4 import BeautifulSoup`
			`import pandas as pd`

			`# 伪造请求头`
			`headers = {`
			'cookie': 'ASP.NET_SessionSvc=MTAuMTEzLjkyLjkzfDkwOTB8b3V5YW5nfGRlZmF1bHR8MTYzODQzNDE5NTI2NQ; _bfa=1.1727420783628.2v4j7k.1.1727420783628.1727420783628.1.1.154019; _ubtstatus=%7B%22vid%22%3A%221727420783628.2v4j7k%22%2C%22sid%22%3A1%2C%22pvid%22%3A1%2C%22pid%22%3A154019%7D; Hm_lvt_e4211314613fcf074540918eb10eeecb=1727420784; Hm_lpvt_e4211314613fcf074540918eb10eeecb=1727420784; HMACCOUNT=34C5D5D87EFFB691; GUID=09031042416180792327; MKT_CKID=1727420785262.vgfup.ox1u; _jzqco=%7C%7C%7C%7C1727420785652%7C1.608322706.1727420785265.1727420785265.1727420785265.1727420785265.1727420785265.0.0.0.1.1; _RF1=182.88.188.57; _RSG=SnajKuQaCp5rinW_EZPRm8; _RDG=28338c247be07326791c678684a664d512; _RGUID=5ccc91ae-14b6-4c5c-8c1b-6633bf831b35; _bfaStatusPVSend=1; _bfaStatus=success; _bfi=p1%3D154019%26p2%3D0%26v1%3D1%26v2%3D0',
			`'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'`
			`}`

			`dic = []`


			`def work(url, headers, cuisine):`
			`# 发送请求`
			`response = requests.get(url, headers=headers)`
			`# 使用BeautifulSoup解析HTML`
			`soup = BeautifulSoup(response.text, 'lxml')`

			`# 获取所有标签`
			`li_elements = soup.find('div', class_='list_wide_mod2').find_all('div', class_='list_mod2')`
			`for div in li_elements:`
			`# 人均消费`
			`renjun = div.find_next('div', class_='rdetailbox').find('dl').find_all('dd')[1].find_next('span').text`
			`renjun = renjun.replace('￥', '')`
			`# 餐厅评分`
			`pingfen = div.find_next('div', class_='rdetailbox').find('ul').find_all('li')[0].find_next('a').text`
			`pingfen = pingfen.replace('\xa0分', '')`
			`# 餐厅链接`
			`lianjie = div.find_next('div', class_='rdetailbox').find('dl').find('dt').find('a').get('href')`
			`xiechen_url = 'https://you.ctrip.com' + lianjie`
			`# 发送请求`
			`xiechen_response = requests.get(xiechen_url, headers=headers)`
			`# 使用BeautifulSoup解析HTML`
			`xiechen_soup = BeautifulSoup(xiechen_response.text, 'lxml')`
			`evaluation_number = xiechen_soup.find('ul', class_='tablist').find_all('li')`
			`# 餐厅名称`
			`name = xiechen_soup.find('div', class_='f_left').find('h1').text`
			`# 评论数量`
			`total_comments = xiechen_soup.find('dfn', id='hrefyyDp').find('span', class_='f_orange').text`
			`# 每种评价数量`
			`detailed_scoring = {}`
			`for num in evaluation_number:`
			`index1 = num.find_next('a').text.find('(')`
			`index2 = num.find_next('a').text.find(')')`
			`detailed_scoring = {**detailed_scoring,`
			`num.find_next('a').text[0:index1]: num.find_next('a').text[index1 + 1:index2]}`
			`data = {`
			`'餐厅名称': name,`
			`'人均消费': renjun,`
			`'评分': pingfen,`
			`'餐厅链接': xiechen_url,`
			`'评论总数': total_comments,`
			`**detailed_scoring,`
			`'菜系': cuisine`
			`}`
			`dic.append(data)`


			`def main():`
			`# 菜系字典`
			`# "火锅","广西菜","西餐","烧烤","川菜","粤菜","湘菜","鲁菜","闽菜","徽菜"`
			`cuisine_dic = {'火锅': 4903, '广西菜': 4918,'西餐':4900,'烧烤':4931,'川菜':4896,'粤菜':4898,'湘菜':4915,'鲁菜':4920,'闽菜':4916,'徽菜':4911}`
			`# 一页15条数据`
			`for key in cuisine_dic:`
			`for i in range(1, 2):`
			`# 请求地址`
			`url = f'https://you.ctrip.com/restaurantlist/China110000/list-c{cuisine_dic[key]}-p{i}.html'`
			`# 调用函数`
			`work(url=url, headers=headers, cuisine=key)`
			`# 打印结果`
			`print(dic)`
			`# 将结果保存到excel`
			`df = pd.DataFrame(dic)`
			`df.to_excel('携程餐厅数据.xlsx', index=False)`


			`if __name__ == '__main__':`
			`main()`