You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

196 lines
7.5 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# Import the selenium webdriver package
from selenium import webdriver
# Import the Chrome options class from selenium
from selenium.webdriver.chrome.options import Options
# The server this runs on is Linux without a graphical desktop, so Chrome is
# driven in headless mode.
# Configure the Chrome browser.
chrome_options = Options()
# Disable the sandbox (the original note: needed when running with top
# privileges on Linux).
chrome_options.add_argument('--no-sandbox')
# Headless mode: do not open a browser window.
chrome_options.add_argument('--headless')
# Size of the (virtual) browser window.
chrome_options.add_argument('window-size=1920,1080')
# Start Chrome through chromedriver.  The `executable_path` keyword is not
# passed: Selenium 3 already defaults it to 'chromedriver', and Selenium
# 4.10+ rejects the keyword altogether, so omitting it works on both.
driver = webdriver.Chrome(options=chrome_options)
driver.get('https://www.baidu.com')  # open Baidu as a smoke test
print(driver.current_url)  # print the URL the browser ended up on
# `By` is only imported further down in this file, so bring it into scope here.
from selenium.webdriver.common.by import By

# Open the Ctrip listing for Changsha (CSX) -> Nanjing (NKG) on 2022-05-21.
driver.get("https://flights.ctrip.com/online/list/oneway-CSX-NKG?_=1&depdate=2022-05-21")
# Locate every element that carries a flight number.  Nothing waits for the
# page to finish rendering here, so the list only holds the elements that
# exist at the moment of the call.
# `find_elements(By.CLASS_NAME, ...)` replaces `find_elements_by_class_name`,
# which was removed in Selenium 4.3; the `By` form works on Selenium 3 as well.
flights = driver.find_elements(By.CLASS_NAME, "plane-No")
flights
# 导入显示等待需要的包
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Explicit wait: give up after 10 seconds, re-checking the condition every
# 0.5 seconds.
WebDriverWait(driver, timeout=10, poll_frequency=0.5).until(
    # Condition: an element with id "page_id" is present in the DOM.
    EC.presence_of_element_located((By.ID, "page_id"))
)
# Open the Ctrip listing for Changsha (CSX) -> Nanjing (NKG) on 2022-05-21.
driver.get("https://flights.ctrip.com/online/list/oneway-CSX-NKG?_=1&depdate=2022-05-21")
# Wait (up to 10 s, polling every 0.5 s) until a flight-number element exists.
WebDriverWait(driver, 10, 0.5).until(
    EC.presence_of_element_located((By.CLASS_NAME, "plane-No"))
)
# Locate every element that carries a flight number.
# `find_elements_by_class_name` was removed in Selenium 4.3; the `By`-based
# call below is available on Selenium 3 and 4 alike.
flights = driver.find_elements(By.CLASS_NAME, "plane-No")
for x in flights:
    print(x.text)
# The browser must be closed once the data has been read; otherwise the
# driver process keeps running and holding memory.  `finally` guarantees
# that even when the wait below times out or a lookup raises.
try:
    # Open the Ctrip listing for Changsha (CSX) -> Nanjing (NKG) on 2022-01-12.
    driver.get("https://flights.ctrip.com/online/list/oneway-CSX-NKG?_=1&depdate=2022-01-12")
    # Wait until a flight-number element exists.
    WebDriverWait(driver, 10, 0.5).until(
        EC.presence_of_element_located((By.CLASS_NAME, "plane-No"))
    )
    # JavaScript that returns the total scrollable height of the page.
    js = "return document.body.scrollHeight"
    new_height = driver.execute_script(js)
    # Scroll down in 350-pixel steps instead of jumping to the bottom
    # (presumably so that lazily rendered rows get loaded - confirm).
    for i in range(0, new_height, 350):
        driver.execute_script('window.scrollTo(0, %s)' % i)
    # Flight numbers.  The `find_elements_by_*` helpers were removed in
    # Selenium 4.3, so the `By`-based API is used throughout.
    flights = driver.find_elements(By.CLASS_NAME, "plane-No")
    for x in flights:
        print(x.text)
    # Arrival times.
    arrTime = driver.find_elements(By.XPATH, '//div[@class="arrive-box"]/div[@class="time"]')
    for x in arrTime:
        # Arrival may fall on a different day than departure, in which case
        # the text spans several lines; join them into one line.
        print(x.text.replace("\n", ""))
    # Prices.
    prices = driver.find_elements(By.XPATH, '//span[@class="price"]')
    for x in prices:
        # Strip the currency sign.
        print(x.text.replace("¥", ""))
finally:
    driver.quit()
# Import the selenium webdriver package
from selenium import webdriver
# Import the Chrome options class from selenium
from selenium.webdriver.chrome.options import Options
# 导入显示等待需要的包
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# 导入datetime模块处理时间
import datetime
# Configure the Chrome browser.
chrome_options = Options()
# Disable the sandbox (the original note: needed when running with top
# privileges on Linux).
chrome_options.add_argument('--no-sandbox')
# Headless mode: do not open a browser window.
chrome_options.add_argument('--headless')
# Size of the (virtual) browser window.
chrome_options.add_argument('window-size=1920,1080')
# Start Chrome through chromedriver.  `executable_path` is left out on
# purpose: 'chromedriver' is already the Selenium 3 default, and Selenium
# 4.10+ no longer accepts that keyword.
driver = webdriver.Chrome(options=chrome_options)
# Base class of the errors Selenium raises for browser-side failures; used
# below so that only those are ignored when dismissing the popup.
from selenium.common.exceptions import WebDriverException

# First departure date to scrape, as a datetime so it can be advanced by days.
date = datetime.datetime.strptime("2022-01-01", "%Y-%m-%d")
# Collected rows: (flight number, date, departure time, arrival time, price).
message = []
try:
    for x in range(10):
        dep_date = date.strftime("%Y-%m-%d")
        url = "https://flights.ctrip.com/online/list/oneway-CSX-NKG?_=1&depdate=" + dep_date
        # Open the Ctrip flight listing for this date.
        driver.get(url)
        # Wait until a flight-number element exists.
        WebDriverWait(driver, 10, 0.5).until(
            EC.presence_of_element_located((By.CLASS_NAME, "plane-No"))
        )
        try:
            # The first page load shows an epidemic notice that has to be
            # closed once; on later loads the element is simply absent.
            driver.find_element(By.CLASS_NAME, "close-icon").click()
        except WebDriverException:
            pass
        # JavaScript that returns the total scrollable height of the page.
        js = "return document.body.scrollHeight"
        new_height = driver.execute_script(js)
        # Scroll down in 100-pixel steps (presumably so that lazily rendered
        # rows get loaded - confirm).
        for i in range(0, new_height, 100):
            driver.execute_script('window.scrollTo(0, %s)' % i)
        # Flight numbers, truncated to their first six characters.
        flights = driver.find_elements(By.CLASS_NAME, "plane-No")
        flights = [el.text[:6] for el in flights]
        # Departure date repeated once per flight so all columns line up.
        flightDate = [dep_date] * len(flights)
        # Departure times.
        depTime = driver.find_elements(By.XPATH, '//div[@class="depart-box"]/div[@class="time"]')
        depTime = [el.text for el in depTime]
        # Arrival times, with multi-line text joined into one line.
        arrTime = driver.find_elements(By.XPATH, '//div[@class="arrive-box"]/div[@class="time"]')
        arrTime = ["".join(el.text.split("\n")) for el in arrTime]
        # Lowest price, without the currency sign.
        prices = driver.find_elements(By.XPATH, '//span[@class="price"]')
        prices = [el.text.replace("¥", "") for el in prices]
        # zip() pairs the columns row by row; it stops at the shortest list.
        mess = list(zip(flights, flightDate, depTime, arrTime, prices))
        message.extend(mess)
        # Move on to the next day.
        date = date + datetime.timedelta(days=1)
except Exception as exc:
    # Keep whatever has been collected so far, but report the failure
    # instead of swallowing it silently.
    print("Scraping stopped early at %s: %r" % (date.strftime("%Y-%m-%d"), exc))
finally:
    # Always shut the browser down, whether or not scraping succeeded.
    driver.quit()
message
# 导入python中的内置模块csv
import csv
# Write all collected rows to content.csv.
# newline="" is what the csv module requires for files it writes to (it
# emits its own line endings; without it extra blank rows can appear on
# platforms that translate "\n").  UTF-8 is set explicitly because the file
# is read back later with pandas, whose default encoding is UTF-8.
with open("content.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerows(message)
!cat content.csv
import pandas as pd
# Load the scraped rows; the file has no header row, so name the columns here.
df = pd.read_csv("content.csv", names=["flightNum", "date", "depTime", "arrTime", "price"])
df.head()
df.info()
# Price series of flight MU2754, indexed by departure date.
# Filtering the rows and indexing by date keeps every price paired with its
# own date.  Reading dates from `index.levels` of a grouped result is not
# safe: a MultiIndex keeps all defined level values, including dates on
# which this flight has no row, so dates and prices could differ in length.
df1 = (
    df.loc[df["flightNum"] == "MU2754"]
    .set_index("date")["price"]
    .sort_index(kind="mergesort")  # stable sort: keeps row order within a date
)
df1
# Prices (y values for the plot).
prices = df1.values
prices
# Dates (x values for the plot), one per price.
date = df1.index.values
date
# 在jupyter中直接展示图像
%matplotlib inline
import matplotlib.pyplot as plt
# Use the SimHei font so that Chinese labels render.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['figure.figsize'] = (10, 5)  # figure size
# x axis: departure dates.
x = date
# y axis: prices.
y = prices
# Line chart with a marker on every data point.
plt.plot(x, y, marker="o")
# Axis labels: x carries the dates ("日期") and y the prices ("价格").
# They were previously attached to the opposite axes.
plt.xlabel("日期", fontsize=14)
plt.ylabel("价格", fontsize=14)
plt.show()
# Number of flights per day: group the rows by date and count the
# flight-number entries in each group.
df1 = df.groupby("date")["flightNum"].count()
df1
# x axis: the dates; y axis: the flight counts.
x, y = df1.index, df1.values
# Bar chart.
plt.bar(x, y)
# x-axis label.
plt.xlabel("日期", fontsize=14)
# y-axis label.
plt.ylabel("航班数量", fontsize=14)
plt.show()