You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
Covid9/疫情数据爬取.py

142 lines
5.1 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
from lxml import etree
import csv
import pandas as pd
import matplotlib.pyplot as plt
# Matplotlib text setup: use the SimHei font for the sans-serif family and
# turn off the Unicode minus glyph so that the minus sign of negative axis
# values is displayed correctly.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# Request headers: send a desktop-browser user agent with the request.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
}
url = "https://www.bitpush.news/covid19/"

# Download the page. On any request failure (connection error, timeout,
# non-2xx status) the error is printed and `html` is left as an empty string,
# which the rest of the script treats as "nothing to process".
try:
    # requests has no default timeout; without one an unresponsive server
    # would block the script forever.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
except requests.RequestException as e:
    print(f"请求出错: {e}")
    html = ""
else:
    # Use the encoding detected from the body so the text decodes correctly.
    response.encoding = response.apparent_encoding
    html = response.text
def _first_text(nodes):
    """Return the first non-blank string in ``nodes`` (stripped), or None."""
    for node in nodes:
        text = node.strip()
        if text:
            return text
    return None


def _parse_count(nodes):
    """Parse the first non-blank string in ``nodes`` as an integer.

    Thousands separators (",") are removed before conversion. Returns None
    when there is no text or the text is not a whole number, so the caller can
    skip the row instead of crashing inside ``int()``.
    """
    text = _first_text(nodes)
    if text is None:
        return None
    try:
        return int(text.replace(",", ""))
    except ValueError:
        return None


def _parse_rows(doc):
    """Extract ``(name, confirmed, deaths)`` tuples from the parsed page.

    Cells are read row by row rather than as three independent column lists:
    a row with a missing or empty cell is then dropped on its own instead of
    shifting every later name against the wrong numbers.
    """
    rows = []
    for tr in doc.xpath('//div[@class="table_container"]//tbody/tr'):
        name = _first_text(tr.xpath('td/span/text()'))
        confirmed = _parse_count(tr.xpath('td[2]/text()'))
        deaths = _parse_count(tr.xpath('td[3]/text()'))
        if name is None or confirmed is None or deaths is None:
            continue
        rows.append((name, confirmed, deaths))
    return rows


def _save_top10(path, header, rows):
    """Write ``rows`` under ``header`` to the CSV file ``path``.

    Returns the first 10 data rows of that file, read back as a DataFrame.
    """
    with open(path, "w", newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)
    return pd.read_csv(path).head(10)


def _draw_top10(df, name_col, value_col, xlabel, ylabel, title, *, line=False):
    """Create a new figure plotting ``df[value_col]`` against ``df[name_col]``.

    Draws a bar chart by default, or a line chart with round markers when
    ``line`` is true, and writes each value slightly above its point. The
    figure is only built here; the caller decides when to show it.
    """
    names = df[name_col]
    values = df[value_col].astype(int)
    plt.figure(figsize=(12, 6))
    if line:
        plt.plot(names, values, marker='o')
    else:
        plt.bar(names, values)
    plt.xlabel(xlabel, fontsize=14)
    plt.ylabel(ylabel, fontsize=14)
    plt.title(title, fontsize=16)
    # Lift each label by 2% of the tallest value so it does not touch its mark.
    offset = 0.02 * values.max()
    for i, v in enumerate(values):
        plt.text(i, v + offset, str(v), ha='center', fontsize=12)


if html:
    # Parse the HTML and collect (name, confirmed, deaths) rows.
    doc = etree.HTML(html)
    message = _parse_rows(doc)
    if not message:
        print("未解析到任何数据")
    else:
        # NOTE(review): the rows from '加州' onward are treated as US states
        # further down, yet the two "country" rankings here are computed over
        # all rows. Presumably the states should be excluded - confirm.

        # Rank by deaths, save to CSV, and show the top 10 as a bar chart.
        message1 = sorted(message, key=lambda row: row[2], reverse=True)
        df1 = _save_top10("content1.csv", ["country", "person", "death"], message1)
        _draw_top10(df1, "country", "death", "国家", "死亡人数", "COVID-19死亡人数前10的国家")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

        # Rank by confirmed cases, save to CSV, and show the top 10 as a bar chart.
        message2 = sorted(message, key=lambda row: row[1], reverse=True)
        df2 = _save_top10("content2.csv", ["country", "person", "death"], message2)
        _draw_top10(df2, "country", "person", "国家", "确诊人数", "COVID-19确诊人数前10的国家")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

        # The state rows are everything from the '加州' row to the end.
        country = [row[0] for row in message]
        try:
            a = country.index('加州')
        except ValueError:
            # Without that marker row the state slice cannot be located;
            # report it instead of crashing after the first two charts.
            print("未找到'加州'，跳过美国各州图表")
        else:
            message0 = message[a:]

            # States ranked by deaths: save to CSV and build a line chart.
            message3 = sorted(message0, key=lambda row: row[2], reverse=True)
            df3 = _save_top10("content3.csv", ["zhou", "person1", "death1"], message3)
            _draw_top10(df3, "zhou", "death1", "", "死亡人数", "美国COVID9死亡人数前十的州", line=True)

            # States ranked by confirmed cases: save to CSV and build a line chart.
            message4 = sorted(message0, key=lambda row: row[1], reverse=True)
            df4 = _save_top10("content4.csv", ["zhou", "person1", "death1"], message4)
            _draw_top10(df4, "zhou", "person1", "", "确诊人数", "美国COVID9确诊人数前十的州", line=True)

            # No plt.show() follows the two state charts in the visible
            # source; show both figures here so they are actually displayed.
            plt.show()