# www.ctrip.com/xiexheng.py

# Import the selenium package
from selenium import webdriver
# Import selenium's Chrome options class
from selenium.webdriver.chrome.options import Options

# The server runs on Linux without a graphical interface, so Chrome is
# configured to run in headless mode.
chrome_options = Options()
for chrome_flag in (
    '--no-sandbox',           # disable the sandbox (needed when run with top privileges on Linux, per the original note)
    '--headless',             # headless mode: do not open a browser window
    'window-size=1920,1080',  # browser window size
):
    chrome_options.add_argument(chrome_flag)
# Obtain the Chrome driver and start the Chrome browser.
driver = webdriver.Chrome(options=chrome_options, executable_path=r'chromedriver')
# Open Baidu, then print the URL the browser is currently on.
driver.get('https://www.baidu.com')
print(driver.current_url)
# `By` is imported here because the file-level import of it only appears
# further down, after this statement would already have run.
from selenium.webdriver.common.by import By

# Open Ctrip's flight list for Changsha -> Nanjing on 2022-05-21.
driver.get("https://flights.ctrip.com/online/list/oneway-CSX-NKG?_=1&depdate=2022-05-21")
# Locate every element that holds a flight number.
# find_elements(By.CLASS_NAME, ...) replaces find_elements_by_class_name,
# which newer Selenium releases no longer provide.
# NOTE(review): no explicit wait precedes this lookup, so the list may be
# empty if the flight rows are rendered after the initial page load.
flights = driver.find_elements(By.CLASS_NAME, "plane-No")
flights
# Import the names needed for explicit waits
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Explicit wait: check every 0.5 seconds, for at most 10 seconds, whether the
# element with id "page_id" has been loaded into the page.
page_marker = (By.ID, "page_id")
explicit_wait = WebDriverWait(driver, timeout=10, poll_frequency=0.5)
explicit_wait.until(EC.presence_of_element_located(page_marker))
# Open Ctrip's flight list for Changsha -> Nanjing on 2022-05-21.
driver.get("https://flights.ctrip.com/online/list/oneway-CSX-NKG?_=1&depdate=2022-05-21")
# Wait (up to 10 s, polling every 0.5 s) until flight information is present.
WebDriverWait(driver, 10, 0.5).until(
    EC.presence_of_element_located((By.CLASS_NAME, "plane-No"))
)
# Locate every element that holds a flight number.
# find_elements(By.CLASS_NAME, ...) replaces find_elements_by_class_name,
# which newer Selenium releases no longer provide.
flights = driver.find_elements(By.CLASS_NAME, "plane-No")
for flight in flights:
    print(flight.text)
# Open Ctrip's flight list for Changsha -> Nanjing on 2022-01-12.
driver.get("https://flights.ctrip.com/online/list/oneway-CSX-NKG?_=1&depdate=2022-01-12")
# Wait (up to 10 s, polling every 0.5 s) until flight information is present.
WebDriverWait(driver, 10, 0.5).until(
    EC.presence_of_element_located((By.CLASS_NAME, "plane-No"))
)

# Ask the page for its total scrollable height.
new_height = driver.execute_script("return document.body.scrollHeight")
# Scroll down in 350-pixel steps rather than in a single jump.
# NOTE(review): presumably this is what makes the page render the remaining
# flight rows - confirm against the live page.
for offset in range(0, new_height, 350):
    driver.execute_script(f'window.scrollTo(0, {offset})')

# Locate every element that holds a flight number and print its text.
# find_elements(By.CLASS_NAME, ...) replaces find_elements_by_class_name,
# which newer Selenium releases no longer provide.
flights = driver.find_elements(By.CLASS_NAME, "plane-No")
for flight in flights:
    print(flight.text)
# Arrival times. find_elements(By.XPATH, ...) replaces find_elements_by_xpath,
# which newer Selenium releases no longer provide.
arrTime = driver.find_elements(By.XPATH, '//div[@class="arrive-box"]/div[@class="time"]')
for cell in arrTime:
    # Some flights arrive on a different day than they depart, which puts a
    # line break into the text; drop the newlines to keep one line per flight.
    print(cell.text.replace("\n", ""))
# Prices.
prices = driver.find_elements(By.XPATH, '//span[@class="price"]')
for cell in prices:
    # Strip the currency symbol from the price text.
    print(cell.text.replace("¥", ""))
# Quit the browser every time data collection is finished; otherwise the driver keeps running and consumes a lot of memory
driver.quit()
# Import the selenium package
from selenium import webdriver
# Import selenium's Chrome options class
from selenium.webdriver.chrome.options import Options
# Import the names needed for explicit waits
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# 导入datetime模块处理时间
import datetime


# Configure Chrome.
chrome_options = Options()
for chrome_flag in (
    '--no-sandbox',           # disable the sandbox (needed when run with top privileges on Linux, per the original note)
    '--headless',             # headless mode: do not open a browser window
    'window-size=1920,1080',  # browser window size
):
    chrome_options.add_argument(chrome_flag)
# Obtain the Chrome driver and start the Chrome browser.
driver = webdriver.Chrome(options=chrome_options, executable_path=r'chromedriver')
# Base class of the Selenium errors a missing or unclickable element raises;
# imported here so this block brings its own dependency into scope.
from selenium.common.exceptions import WebDriverException

# First departure date to query, parsed into a datetime so that it can be
# advanced one day at a time.
date = datetime.datetime.strptime("2022-01-01", "%Y-%m-%d")
# Collected rows: one (flight number, date, departure time, arrival time,
# price) tuple per flight.
message = []
try:
    # Query ten consecutive departure dates.
    for _ in range(10):
        dep_date = date.strftime("%Y-%m-%d")
        url = "https://flights.ctrip.com/online/list/oneway-CSX-NKG?_=1&depdate=" + dep_date
        # Open the Ctrip flight list for this date.
        driver.get(url)
        # Wait (up to 10 s, polling every 0.5 s) until flight information is present.
        WebDriverWait(driver, 10, 0.5).until(
            EC.presence_of_element_located((By.CLASS_NAME, "plane-No"))
        )
        try:
            # The first page view shows an epidemic-notice pop-up that has to
            # be closed once; on later pages the close icon is absent.
            driver.find_element(By.CLASS_NAME, "close-icon").click()
        except WebDriverException:
            # Best effort: no pop-up (or it cannot be clicked) - carry on.
            pass
        # Ask the page for its total scrollable height.
        new_height = driver.execute_script("return document.body.scrollHeight")
        # Scroll down in 100-pixel steps rather than in a single jump.
        for i in range(0, new_height, 100):
            driver.execute_script('window.scrollTo(0, %s)' % i)
        # Flight numbers: keep only the first six characters of the text.
        flights = [x.text[:6] for x in driver.find_elements(By.CLASS_NAME, "plane-No")]
        # Departure date, repeated so the list is as long as the flight list.
        flightDate = [dep_date] * len(flights)
        # Departure times.
        depTime = [
            x.text
            for x in driver.find_elements(By.XPATH, '//div[@class="depart-box"]/div[@class="time"]')
        ]
        # Arrival times, with the line breaks removed.
        arrTime = [
            x.text.replace("\n", "")
            for x in driver.find_elements(By.XPATH, '//div[@class="arrive-box"]/div[@class="time"]')
        ]
        # Lowest prices, with the currency symbol removed.
        prices = [
            x.text.replace("¥", "")
            for x in driver.find_elements(By.XPATH, '//span[@class="price"]')
        ]
        # Combine the columns into row tuples.
        # NOTE(review): zip stops at the shortest list, so rows are dropped or
        # misaligned if the page yields unequal numbers of cells - confirm.
        message.extend(zip(flights, flightDate, depTime, arrTime, prices))
        # Move on to the next day.
        date += datetime.timedelta(days=1)
except Exception as exc:
    # Keep whatever rows were gathered before the failure, but report the
    # failure instead of hiding it.
    print(f"Scraping stopped early: {exc!r}")
finally:
    # Always shut the browser down, whether or not the crawl succeeded.
    driver.quit()
message
# 导入python中的内置模块csv
import csv
# Write the collected rows to content.csv (no header row).
# newline="" lets the csv module control line endings itself, which avoids
# blank lines between rows on platforms that translate "\n"; the explicit
# UTF-8 encoding keeps non-ASCII text readable regardless of the locale.
with open("content.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerows(message)
!cat content.csv
import pandas as pd

# Load the scraped rows, supplying the column names through `names`.
column_names = ["flightNum", "date", "depTime", "arrTime", "price"]
df = pd.read_csv("content.csv", names=column_names)
df.head()
df.info()
# Price of flight MU2754 per date.
# The rows are filtered directly instead of going through groupby/apply and
# then reading index.levels: a MultiIndex keeps every defined level value
# after a selection, so levels[0] could list dates on which MU2754 has no
# row, leaving the date and price arrays with different lengths.
df1 = (
    df.loc[df["flightNum"] == "MU2754"]
    .set_index("date")["price"]
    .sort_index()
)
df1
# Prices, aligned one-to-one with the dates below.
prices = df1.values
prices
# Dates.
date = df1.index.values
date
# 在jupyter中直接展示图像
%matplotlib inline
import matplotlib.pyplot as plt
# Use the SimHei font so that Chinese text is displayed.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Figure size.
plt.rcParams['figure.figsize'] = (10, 5)
# x axis: dates.
x = date
# y axis: prices.
y = prices
# Draw the line chart.
plt.plot(x, y, marker="o")
# The labels were swapped originally: x carries the dates ("日期") and y
# carries the prices ("价格").
plt.xlabel("日期", fontsize=14)
plt.ylabel("价格", fontsize=14)
plt.show()
# Number of flights per day: group by date and count the flight numbers.
df1 = df.groupby("date")["flightNum"].count()
df1
# x axis: the dates; y axis: the flight counts.
x, y = df1.index, df1.values
# Draw the bar chart.
plt.bar(x, y)
# x-axis label.
plt.xlabel("日期", fontsize=14)
# y-axis label.
plt.ylabel("航班数量", fontsize=14)
plt.show()