|
|
|
@ -0,0 +1,196 @@
|
|
|
|
|
# 导入selenium模块
|
|
|
|
|
from selenium import webdriver
|
|
|
|
|
# 导入selenium的设置模块
|
|
|
|
|
from selenium.webdriver.chrome.options import Options
|
|
|
|
|
|
|
|
|
|
# The host is a Linux server with no graphical display, so Chrome is driven
# in headless mode.
#
# Start-up switches, applied in this order:
#   --no-sandbox            original note: required when running with the
#                           highest privileges on Linux
#   --headless              do not open a browser window
#   window-size=1920,1080   size of the browser window
chrome_options = Options()
for _switch in ('--no-sandbox', '--headless', 'window-size=1920,1080'):
    chrome_options.add_argument(_switch)

# Start Chrome through the chromedriver executable using the options above.
driver = webdriver.Chrome(options=chrome_options, executable_path=r'chromedriver')
|
|
|
|
|
# Smoke test: load the Baidu home page and print the URL the browser reports.
baidu_home = 'https://www.baidu.com'
driver.get(baidu_home)
print(driver.current_url)
|
|
|
|
|
# Open the Ctrip flight list for Changsha -> Nanjing (CSX-NKG) on 2022-05-21.
driver.get("https://flights.ctrip.com/online/list/oneway-CSX-NKG?_=1&depdate=2022-05-21")

# Imported here because this cell runs before the file's own By import.
from selenium.webdriver.common.by import By

# Locate every element that carries a flight number.
# find_elements(By.CLASS_NAME, ...) replaces find_elements_by_class_name,
# which Selenium 4 deprecated and later removed; this form works on both
# Selenium 3 and Selenium 4.
# NOTE: the lookup runs immediately after get() with no explicit wait.
flights = driver.find_elements(By.CLASS_NAME, "plane-No")

# Display the result (notebook cell output).
flights
|
|
|
|
|
# 导入显式等待需要的包
|
|
|
|
|
from selenium.webdriver.common.by import By
|
|
|
|
|
from selenium.webdriver.support.wait import WebDriverWait
|
|
|
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
|
|
|
|
|
|
|
|
# Explicit wait: poll every 0.5 s, for at most 10 s, until an element with
# id "page_id" is present in the DOM.
page_ready = EC.presence_of_element_located((By.ID, "page_id"))
WebDriverWait(driver, timeout=10, poll_frequency=0.5).until(page_ready)
|
|
|
|
|
# Open the Ctrip flight list for Changsha -> Nanjing (CSX-NKG) on 2022-05-21.
driver.get("https://flights.ctrip.com/online/list/oneway-CSX-NKG?_=1&depdate=2022-05-21")

# Wait (up to 10 s, polling every 0.5 s) until at least one flight-number
# element is present, i.e. the flight list has been rendered.
WebDriverWait(driver, 10, 0.5).until(
    EC.presence_of_element_located((By.CLASS_NAME, "plane-No"))
)

# Locate every element that carries a flight number.
# find_elements(By.CLASS_NAME, ...) is used instead of the deprecated (and,
# in newer Selenium 4 releases, removed) find_elements_by_class_name.
flights = driver.find_elements(By.CLASS_NAME, "plane-No")

# Print the text of each flight-number element.
for flight in flights:
    print(flight.text)
|
|
|
|
|
# Open the Ctrip flight list for Changsha -> Nanjing (CSX-NKG) on 2022-01-12.
driver.get("https://flights.ctrip.com/online/list/oneway-CSX-NKG?_=1&depdate=2022-01-12")

# Block until the first flight-number element is present
# (10 s timeout, checked every 0.5 s).
flight_no_present = EC.presence_of_element_located((By.CLASS_NAME, "plane-No"))
WebDriverWait(driver, timeout=10, poll_frequency=0.5).until(flight_no_present)
|
|
|
|
|
|
|
|
|
|
# JavaScript that returns the total scrollable height of the page body.
js = "return document.body.scrollHeight"
# Run it in the browser to obtain the height.
new_height = driver.execute_script(js)

# Scroll from the top towards the bottom in 350-pixel steps.
# Presumably this makes the page render rows that are loaded lazily as the
# viewport moves - TODO confirm against the site.
scroll_step = 350
for i in range(0, new_height, scroll_step):
    scroll_command = 'window.scrollTo(0, %s)' % i
    driver.execute_script(scroll_command)
|
|
|
|
|
|
|
|
|
|
# Locate every element that carries a flight number.
# find_elements(By.CLASS_NAME, ...) is used instead of the deprecated (and,
# in newer Selenium 4 releases, removed) find_elements_by_class_name.
flights = driver.find_elements(By.CLASS_NAME, "plane-No")

# Print the text of each element in turn.
for flight in flights:
    print(flight.text)
|
|
|
|
|
# Arrival-time elements: the "time" div inside each "arrive-box" div.
# find_elements(By.XPATH, ...) replaces the deprecated find_elements_by_xpath.
arrTime = driver.find_elements(By.XPATH, '//div[@class="arrive-box"]/div[@class="time"]')

for arrival in arrTime:
    # Some flights arrive on a different day than they depart, so the element
    # text can span several lines; collapse it onto a single line.
    print(arrival.text.replace("\n", ""))
|
|
|
|
|
# Price elements: every <span class="price"> on the page.
# find_elements(By.XPATH, ...) replaces the deprecated find_elements_by_xpath.
prices = driver.find_elements(By.XPATH, '//span[@class="price"]')

for price in prices:
    # Strip the currency sign so only the number remains.
    print(price.text.replace("¥", ""))
|
|
|
|
|
# 每次获取完数据后需要关闭浏览器,如果不关闭浏览器,驱动会一直运行,消耗很多内存
|
|
|
|
|
# Close the browser once the data has been collected; otherwise the driver
# keeps running and keeps consuming memory.
driver.quit()
|
|
|
|
|
# 导入selenium模块
|
|
|
|
|
from selenium import webdriver
|
|
|
|
|
# 导入selenium的设置模块
|
|
|
|
|
from selenium.webdriver.chrome.options import Options
|
|
|
|
|
# 导入显式等待需要的包
|
|
|
|
|
from selenium.webdriver.common.by import By
|
|
|
|
|
from selenium.webdriver.support.wait import WebDriverWait
|
|
|
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
|
|
|
# 导入datetime模块处理时间
|
|
|
|
|
import datetime
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Configure Chrome before launching it.
chrome_options = Options()
chrome_switches = (
    '--no-sandbox',            # original note: needed when run with the highest privileges on Linux
    '--headless',              # headless mode: no browser window is opened
    'window-size=1920,1080',   # browser window size
)
for chrome_switch in chrome_switches:
    chrome_options.add_argument(chrome_switch)

# Start Chrome through the chromedriver executable with these options.
driver = webdriver.Chrome(options=chrome_options, executable_path=r'chromedriver')
|
|
|
|
|
# Scrape ten consecutive days of Changsha -> Nanjing (CSX-NKG) flights from
# Ctrip, starting on 2022-01-01, and collect one tuple per flight row in
# `message`: (flight number, departure date, departure time, arrival time,
# price).

# Selenium's base exception; imported here so the block is self-contained.
from selenium.common.exceptions import WebDriverException

# First departure date, parsed into a datetime so days can be added to it.
date = datetime.datetime.strptime("2022-01-01", "%Y-%m-%d")
# Accumulates the scraped rows across all days.
message = []

try:
    for _ in range(10):
        day = date.strftime("%Y-%m-%d")
        url = "https://flights.ctrip.com/online/list/oneway-CSX-NKG?_=1&depdate=" + day
        # Open the flight list for this day.
        driver.get(url)
        # Wait (up to 10 s, polling every 0.5 s) for the first flight number.
        WebDriverWait(driver, 10, 0.5).until(
            EC.presence_of_element_located((By.CLASS_NAME, "plane-No"))
        )
        try:
            # The first page load shows an epidemic notice that has to be
            # closed once; on later loads the close icon may not exist.
            driver.find_element(By.CLASS_NAME, "close-icon").click()
        except WebDriverException:
            # Best effort: a missing or unclickable notice is not an error.
            pass

        # Read the page height, then scroll down in 100-pixel steps.
        js = "return document.body.scrollHeight"
        new_height = driver.execute_script(js)
        for i in range(0, new_height, 100):
            driver.execute_script('window.scrollTo(0, %s)' % i)

        # Flight numbers: keep only the first six characters of each text.
        flights = [el.text[:6] for el in driver.find_elements(By.CLASS_NAME, "plane-No")]
        # Departure date repeated once per flight so all columns line up.
        flightDate = [day] * len(flights)
        # Departure times.
        depTime = [
            el.text
            for el in driver.find_elements(By.XPATH, '//div[@class="depart-box"]/div[@class="time"]')
        ]
        # Arrival times; multi-line text (next-day arrivals) is joined into one line.
        arrTime = [
            el.text.replace("\n", "")
            for el in driver.find_elements(By.XPATH, '//div[@class="arrive-box"]/div[@class="time"]')
        ]
        # Lowest prices, with the currency sign stripped.
        prices = [
            el.text.replace("¥", "")
            for el in driver.find_elements(By.XPATH, '//span[@class="price"]')
        ]

        # Combine the columns row by row. zip() stops at the shortest list, so
        # unequal column lengths silently drop the surplus entries.
        message.extend(zip(flights, flightDate, depTime, arrTime, prices))

        # Move on to the next day.
        date = date + datetime.timedelta(days=1)
except Exception as exc:
    # Keep whatever was collected so far, but report the failure instead of
    # hiding it. KeyboardInterrupt and SystemExit are not caught here.
    print("Scraping stopped early at %s: %r" % (date.strftime("%Y-%m-%d"), exc))
finally:
    # Always shut the browser down, whether or not scraping succeeded.
    driver.quit()

# Display the collected rows (notebook cell output).
message
|
|
|
|
|
# 导入python中的内置模块csv
|
|
|
|
|
import csv
|
|
|
|
|
# Write every scraped row to content.csv, one row per flight.
# newline="" hands line-ending control to the csv module, as its docs
# require; without it, platforms that translate "\n" get a blank line after
# every row. The encoding is explicit so the file does not depend on the
# platform default and matches what pandas.read_csv expects by default.
with open("content.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerows(message)
|
|
|
|
|
# IPython shell escape: print the raw contents of content.csv.
!cat content.csv
|
|
|
|
|
import pandas as pd
|
|
|
|
|
|
|
|
|
|
# Load the scraped rows. The names are supplied here rather than read from
# the file.
column_names = ["flightNum", "date", "depTime", "arrTime", "price"]
df = pd.read_csv("content.csv", names=column_names)

# Peek at the first rows, then print the column/dtype summary.
df.head()
df.info()
|
|
|
|
|
# Price series of flight MU2754, indexed by departure date.
# The rows are selected directly instead of going through groupby/apply and
# then reading index.levels[0]: a MultiIndex keeps all of its defined levels
# even when they are unused, so levels[0] could list dates on which MU2754
# has no row and end up longer than the price array. Taking dates and prices
# from the same Series guarantees that they line up one-to-one.
mu2754_rows = df.loc[df["flightNum"] == "MU2754"].sort_values("date", kind="stable")
df1 = mu2754_rows.set_index("date")["price"]
df1

# Prices (y values for the plot).
prices = df1.values
prices

# Dates (x values for the plot), aligned element by element with `prices`.
date = df1.index.values
date
|
|
|
|
|
# 在jupyter中直接展示图像
|
|
|
|
|
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
|
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
# Line chart of the MU2754 price per departure date.

# Use the SimHei font so the Chinese axis labels render.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Figure size.
plt.rcParams['figure.figsize'] = (10, 5)

# x axis: departure dates; y axis: prices.
x = date
y = prices

# Draw the line with a circle marker at every data point.
plt.plot(x, y, marker="o")

# Axis labels. The original had them swapped: the x axis holds the dates
# ("日期") and the y axis holds the prices ("价格").
plt.xlabel("日期", fontsize=14)
plt.ylabel("价格", fontsize=14)

plt.show()
|
|
|
|
|
# Bar chart of the number of flights per departure date.

# Count the non-null flight numbers within each date group.
df1 = df.groupby("date")["flightNum"].count()
df1

# x axis: the dates (group keys); y axis: the flight count for each date.
x, y = df1.index, df1.values

# Draw the bars.
plt.bar(x, y)

# Label the x axis (date) and the y axis (number of flights).
plt.xlabel("日期", fontsize=14)
plt.ylabel("航班数量", fontsize=14)

plt.show()
|