"""Scrape Ctrip one-way flight listings (CSX -> NKG) and chart the results.

This is the Python payload of the patch that created ``xiexheng.py``,
rewritten as a plain script.  The original was a notebook export: it
contained IPython magics (``!cat``, ``%matplotlib inline``) that are not
valid Python, and used Selenium locator helpers that Selenium 4 removed.

Pipeline, in the same order as the original cells:

1. ``run_exploration`` - open a few pages and print what is found.
2. ``scrape_flights``  - collect ten days of listings starting 2022-01-01.
3. ``save_rows`` / ``show_file`` - write ``content.csv`` and print it.
4. ``load_flights`` + plotting  - price trend of one flight, flights per day.

Selenium and matplotlib are imported inside the functions that need them so
the pure data helpers can be imported without a browser or GUI stack.
"""

import csv
import datetime
import logging
from pathlib import Path

import pandas as pd

logger = logging.getLogger(__name__)

# Listing page for Changsha (CSX) -> Nanjing (NKG); the date is appended.
LIST_URL = "https://flights.ctrip.com/online/list/oneway-CSX-NKG?_=1&depdate="
CSV_PATH = "content.csv"
CSV_COLUMNS = ["flightNum", "date", "depTime", "arrTime", "price"]
START_DATE = datetime.datetime.strptime("2022-01-01", "%Y-%m-%d")

FLIGHT_NO_CLASS = "plane-No"
DEPART_TIME_XPATH = '//div[@class="depart-box"]/div[@class="time"]'
ARRIVE_TIME_XPATH = '//div[@class="arrive-box"]/div[@class="time"]'
PRICE_XPATH = '//span[@class="price"]'


# --------------------------------------------------------------------------
# Pure helpers (no browser needed)
# --------------------------------------------------------------------------

def build_url(day):
    """Return the listing URL for ``day`` (a ``date`` or ``datetime``)."""
    return LIST_URL + day.strftime("%Y-%m-%d")


def clean_flight_no(text):
    """Keep only the first six characters of the flight-number element text."""
    return text[:6]


def clean_arrival_time(text):
    """Join the lines of an arrival-time cell into one string.

    The arrival cell spans several lines when the flight lands on a
    different day than it departs.
    """
    return "".join(text.split("\n"))


def clean_price(text):
    """Strip the currency sign from a price string."""
    return text.replace("¥", "")


def scroll_offsets(height, step):
    """Return the vertical offsets used to scroll a page of ``height`` px."""
    return range(0, height, step)


# --------------------------------------------------------------------------
# Browser helpers
# --------------------------------------------------------------------------

def create_driver():
    """Start a headless Chrome session and return its WebDriver.

    ``chromedriver`` is looked up the default way (on ``PATH``), which is
    what the original ``executable_path='chromedriver'`` amounted to; that
    keyword no longer exists in current Selenium releases.
    """
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    chrome_options = Options()
    # Original note: required when running with top privileges on Linux.
    chrome_options.add_argument('--no-sandbox')
    # The target server has no display, so never open a browser window.
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('window-size=1920,1080')
    return webdriver.Chrome(options=chrome_options)


def wait_for_flights(driver, timeout=10, poll=0.5):
    """Block until a flight-number element is present.

    Polls every ``poll`` seconds for at most ``timeout`` seconds; Selenium
    raises ``TimeoutException`` if nothing shows up in time.
    """
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait

    WebDriverWait(driver, timeout, poll).until(
        EC.presence_of_element_located((By.CLASS_NAME, FLIGHT_NO_CLASS))
    )


def scroll_page(driver, step):
    """Scroll from the top to the page's scroll height in ``step`` px jumps."""
    height = driver.execute_script("return document.body.scrollHeight")
    for offset in scroll_offsets(height, step):
        driver.execute_script('window.scrollTo(0, %s)' % offset)


def element_texts(driver, by, value):
    """Return the ``.text`` of every element matching the locator."""
    return [element.text for element in driver.find_elements(by, value)]


# --------------------------------------------------------------------------
# Pipeline stages
# --------------------------------------------------------------------------

def run_exploration():
    """Replay the exploratory steps and print what each one finds.

    The driver is always shut down, even if a step raises; a driver that is
    left running keeps consuming memory.
    """
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait

    driver = create_driver()
    try:
        driver.get('https://www.baidu.com')
        print(driver.current_url)

        # Flights for 2022-05-21, queried immediately without any wait.
        driver.get(LIST_URL + "2022-05-21")
        early = driver.find_elements(By.CLASS_NAME, FLIGHT_NO_CLASS)
        logger.info("found %d flight elements before waiting", len(early))

        # Explicit wait: up to 10 s, polling every 0.5 s, for id "page_id".
        WebDriverWait(driver, 10, 0.5).until(
            EC.presence_of_element_located((By.ID, "page_id"))
        )

        # Same page again, this time waiting for the flight list to load.
        driver.get(LIST_URL + "2022-05-21")
        wait_for_flights(driver)
        for text in element_texts(driver, By.CLASS_NAME, FLIGHT_NO_CLASS):
            print(text)

        # Flights for 2022-01-12, scrolling through the page before reading.
        driver.get(LIST_URL + "2022-01-12")
        wait_for_flights(driver)
        scroll_page(driver, 350)
        for text in element_texts(driver, By.CLASS_NAME, FLIGHT_NO_CLASS):
            print(text)
        for text in element_texts(driver, By.XPATH, ARRIVE_TIME_XPATH):
            print(clean_arrival_time(text))
        for text in element_texts(driver, By.XPATH, PRICE_XPATH):
            print(clean_price(text))
    finally:
        driver.quit()


def scrape_flights(start=START_DATE, days=10):
    """Collect flight rows for ``days`` consecutive days beginning at ``start``.

    Returns a list of ``(flightNum, date, depTime, arrTime, price)`` tuples.
    If a page fails, the error is logged and the rows gathered so far are
    returned (the original swallowed the error silently).  The driver is
    closed in every case.
    """
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By

    rows = []
    driver = create_driver()
    try:
        for offset in range(days):
            day = start + datetime.timedelta(days=offset)
            driver.get(build_url(day))
            wait_for_flights(driver)
            try:
                # A notice overlay appears on the first visit and has to be
                # dismissed once; on later pages it is simply absent.
                driver.find_element(By.CLASS_NAME, "close-icon").click()
            except WebDriverException:
                pass
            scroll_page(driver, 100)

            flights = [
                clean_flight_no(text)
                for text in element_texts(driver, By.CLASS_NAME, FLIGHT_NO_CLASS)
            ]
            # One date entry per flight so every column has the same length.
            flight_dates = [day.strftime("%Y-%m-%d")] * len(flights)
            dep_times = element_texts(driver, By.XPATH, DEPART_TIME_XPATH)
            arr_times = [
                clean_arrival_time(text)
                for text in element_texts(driver, By.XPATH, ARRIVE_TIME_XPATH)
            ]
            prices = [
                clean_price(text)
                for text in element_texts(driver, By.XPATH, PRICE_XPATH)
            ]
            # NOTE(review): zip truncates to the shortest list; this assumes
            # the page yields one time/price element per flight - confirm.
            rows.extend(zip(flights, flight_dates, dep_times, arr_times, prices))
    except Exception:
        logger.exception(
            "scrape aborted; keeping the %d rows collected so far", len(rows)
        )
    finally:
        driver.quit()
    return rows


def save_rows(rows, path=CSV_PATH):
    """Write ``rows`` to ``path`` as CSV without a header row.

    ``newline=""`` is what the ``csv`` module requires to avoid blank lines
    between rows on Windows; UTF-8 keeps any non-ASCII text portable.
    """
    with open(path, "w", newline="", encoding="utf-8") as handle:
        csv.writer(handle).writerows(rows)


def show_file(path=CSV_PATH):
    """Print the file's contents (stand-in for the notebook's ``!cat``)."""
    print(Path(path).read_text(encoding="utf-8"), end="")


def load_flights(path=CSV_PATH):
    """Read the header-less CSV into a DataFrame with ``CSV_COLUMNS``."""
    return pd.read_csv(path, names=CSV_COLUMNS)


def flight_price_series(df, flight_no="MU2754"):
    """Return ``(dates, prices)`` arrays for one flight, ordered by date.

    Raises ``KeyError`` when the flight number is not in ``df``, as the
    original label lookup did.  Dates and prices come from the same rows, so
    the two arrays always have equal length.
    """
    selected = df[df["flightNum"] == flight_no]
    if selected.empty:
        raise KeyError(flight_no)
    selected = selected.sort_values("date", kind="stable")
    return selected["date"].values, selected["price"].values


def flights_per_day(df):
    """Return a Series mapping each date to its number of flights."""
    return df.groupby("date")["flightNum"].count()


def plot_price_trend(dates, prices):
    """Draw a line chart of price (y axis) against date (x axis)."""
    import matplotlib.pyplot as plt

    plt.plot(dates, prices, marker="o")
    # The original had these two labels swapped.
    plt.xlabel("日期", fontsize=14)
    plt.ylabel("价格", fontsize=14)
    plt.show()


def plot_flights_per_day(counts):
    """Draw a bar chart of flight count (y axis) per date (x axis)."""
    import matplotlib.pyplot as plt

    plt.bar(counts.index, counts.values)
    plt.xlabel("日期", fontsize=14)
    plt.ylabel("航班数量", fontsize=14)
    plt.show()


def main():
    """Run the whole pipeline: explore, scrape, save, load and plot."""
    import matplotlib.pyplot as plt

    run_exploration()

    rows = scrape_flights()
    save_rows(rows)
    show_file()

    df = load_flights()
    print(df.head())
    df.info()

    # Use the SimHei font so the Chinese axis labels render.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['figure.figsize'] = (10, 5)

    dates, prices = flight_price_series(df, "MU2754")
    plot_price_trend(dates, prices)
    plot_flights_per_day(flights_per_day(df))


if __name__ == "__main__":
    main()