You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
Jen_Jon/新冠疫情各国死亡人数爬虫.py

149 lines
5.0 KiB

# -*- coding: utf-8 -*-
"""
Created on Wed Jun 5 19:22:56 2024
@author: 44665
"""
import requests
from lxml import etree
import csv
import pandas as pd
import matplotlib.pyplot as plt
# Download the COVID-19 statistics page and parse it into an lxml element tree.
url = "https://www.bitpush.news/covid19/"
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
}
try:
    # Without a timeout an unresponsive server would block the script forever.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()  # turn 4xx/5xx responses into an exception
except requests.exceptions.RequestException as e:
    print("请求错误:", e)
    # Exit with a non-zero status so callers/shells can see the failure;
    # the bare exit() helper is meant for interactive sessions and reports success.
    raise SystemExit(1)
html = response.text
# Feed the raw bytes to a parser pinned to UTF-8 so the decoding does not
# depend on the charset requests guessed from the response headers.
# (Previously the parser was created but never handed to etree.HTML.)
parse = etree.HTMLParser(encoding='utf-8')
doc = etree.HTML(response.content, parser=parse)
# Pull the name, confirmed-case and death columns out of the statistics table.
country = doc.xpath('//div[@class="table_container"]//tbody/tr/td/span/text()')
person = doc.xpath('//div[@class="table_container"]//tbody/tr/td[2]/text()')
death = doc.xpath('//div[@class="table_container"]//tbody/tr/td[3]/text()')


def _to_int(text):
    """Convert a count such as "1,234,567" to an int by removing the commas."""
    return int(text.replace(",", ""))


# Everything from the '加州' entry to the end of the list is used as the
# state-level data set. If the page layout changes and the marker is missing,
# report it the same way as a request failure instead of dying with a traceback.
try:
    a = country.index('加州')
except ValueError:
    print("解析错误: 页面中未找到 '加州'")
    raise SystemExit(1)

# Convert the counts once, then slice the state portion from the converted lists.
person = [_to_int(x) for x in person]
death = [_to_int(x) for x in death]
state = country[a:]
state_person = person[a:]
state_death = death[a:]
# Rank every scraped row by death count (last field), highest first.
message = sorted(zip(country, person, death), key=lambda row: row[-1], reverse=True)
# Write the ranking to content.csv (no header row).
with open("content.csv", "w", newline='', encoding='utf-8') as f:
    w = csv.writer(f)
    w.writerows(message)
# Read the file back, supplying the column names explicitly.
df = pd.read_csv("content.csv", names=["country", "person", "death"])
# Discard the first (largest) row and keep the following ten.
# NOTE(review): presumably row 0 is an aggregate (e.g. a worldwide total)
# rather than a single country - confirm against the page.
df = df.drop(0)
df = df.head(10)
# Bar chart of the ten rows.
plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
plt.rcParams['figure.figsize'] = (10, 5)
x = df["country"].values
y = df["death"].values
colors = ['red', 'green', 'blue', 'cyan', 'grey', 'black', 'yellow', 'orange', 'pink', 'purple']
plt.bar(x, y, color=colors)
plt.xlabel("国家", fontsize=14)
plt.ylabel("死亡人数", fontsize=14)
plt.title("COVID-19 死亡人数排名前十国家", fontsize=16)
# Print each value above its bar.
for i, j in zip(x, y):
    plt.text(i, j, j, ha='center')
plt.xticks(rotation=45)  # rotate the x tick labels 45 degrees so they do not overlap
plt.show()
# Pie chart of the same ten rows.
plt.rcParams['figure.figsize'] = (15, 10)
colors1 = ['maroon', 'crimson', 'yellow', 'olive', 'cyan', 'lavender', 'purple', 'teal', 'pink', 'magenta']
# Pull out only the first (largest) slice; sized from the data so the call
# does not fail when fewer than ten rows are available.
explode = [0.1 if idx == 0 else 0 for idx in range(len(y))]
# Label slices with the names actually present in the data; a fixed list of
# country names would mislabel slices whenever the ranking changes.
labels = list(x)
plt.pie(y, explode=explode, labels=labels, colors=colors1, shadow=True, autopct='%.2f%%')
plt.legend()
plt.title('COVID-19 死亡人数排名前十国家')
plt.show()
# Build the row list again and rank it by confirmed cases (middle field), descending.
message = list(zip(country, person, death))
message.sort(key=lambda row: row[1], reverse=True)
# Write the new ranking to content.csv, replacing whatever the file held before.
with open("content.csv", "w", newline='', encoding='utf-8') as f:
    w = csv.writer(f)
    w.writerows(message)
# Load the file into a DataFrame, skip the first row and keep the next twenty.
df = pd.read_csv("content.csv", names=["country", "person", "death"])
df = df.drop(0).head(20)
# Two stacked line charts: confirmed cases on top, deaths below.
plt.rcParams['figure.figsize'] = (15, 12)
plt.rcParams['font.sans-serif'] = ['SimHei']
x, z, y = (df[column].values for column in ("country", "person", "death"))
# Upper panel: confirmed cases.
plt.subplot(2, 1, 1)
plt.plot(x, z, color='crimson', marker='*', linestyle='--', label='确诊人数')
plt.title("COVID-19 确诊人数排名前二十国家确诊人数与死亡人数对比", fontsize=16)
plt.xlabel("国家", fontsize=14)
plt.ylabel("确诊人数", fontsize=14)
plt.xticks(rotation=45)  # rotate the x tick labels 45 degrees so they do not overlap
plt.legend(loc='upper right')
# Lower panel: deaths.
plt.subplot(2, 1, 2)
plt.plot(x, y, color='teal', linestyle='-', marker='D', label='死亡人数')
plt.xlabel("国家", fontsize=14)
plt.ylabel("死亡人数", fontsize=14)
plt.xticks(rotation=45)  # rotate the x tick labels 45 degrees so they do not overlap
plt.legend(loc='upper right')
plt.show()
# Combine the state-level columns into rows and rank them by deaths, descending.
state_message = list(zip(state, state_person, state_death))
state_message.sort(key=lambda row: row[2], reverse=True)
# Write the ranking to state_content.csv (no header row).
with open("state_content.csv", "w", newline='', encoding='utf-8') as f:
    w = csv.writer(f)
    w.writerows(state_message)
# Load the file into a DataFrame with explicit column names.
df1 = pd.read_csv("state_content.csv", names=["state", "state_person", "state_death"])
# Skip the first row and keep the next ten.
# NOTE(review): row 0 is the entry with the most deaths - confirm that
# dropping it is intended and does not remove a real state from the chart.
df1 = df1.drop(0).head(10)
# Horizontal bar chart of the ten rows.
plt.rcParams['figure.figsize'] = (10, 5)
plt.rcParams['font.sans-serif'] = ['SimHei']
colors1 = ['maroon', 'crimson', 'yellow', 'olive', 'cyan', 'lavender', 'purple', 'teal', 'pink', 'magenta']
x = df1["state"].values
y = df1["state_death"].values
plt.barh(x, y, height=0.8, color=colors1)
plt.title('美国COVID-19 死亡人数排名前十的州')
plt.xlabel('死亡人数')
plt.ylabel('')
plt.show()