# -*- coding: utf-8 -*-
"""
Scrape COVID-19 confirmed/death counts and chart the rankings.

The script downloads the statistics page, extracts one (name, confirmed,
deaths) row per table line, splits the rows into countries and US states,
saves the sorted rankings as CSV files and shows four charts:

1. bar chart  - top 10 countries by deaths
2. pie chart  - the same top 10 countries by deaths
3. line charts - top 20 countries by confirmed cases (confirmed vs deaths)
4. horizontal bar chart - top 10 US states by deaths

Created on Wed Jun 5 19:22:56 2024

@author: 44665
"""

import csv
import sys
from operator import itemgetter

import matplotlib.pyplot as plt
import pandas as pd
import requests
from lxml import etree

URL = "https://www.bitpush.news/covid19/"

HEADERS = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
}

# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10

# XPath prefix shared by every column query.
TABLE_ROWS = '//div[@class="table_container"]//tbody/tr'

# Name of the first row that belongs to the US state table; every row from
# this one onwards is treated as a state, everything before it as a country.
FIRST_STATE = "加州"

# Positions inside a (name, confirmed, deaths) row and the matching
# DataFrame column names.
CONFIRMED_COL = 1
DEATH_COL = 2
COLUMNS = ["name", "person", "death"]

BAR_COLORS = ['red', 'green', 'blue', 'cyan', 'grey',
              'black', 'yellow', 'orange', 'pink', 'purple']
PIE_COLORS = ['maroon', 'crimson', 'yellow', 'olive', 'cyan',
              'lavender', 'purple', 'teal', 'pink', 'magenta']


def fetch_page(url: str) -> bytes:
    """Download *url* and return the raw response body.

    The body is returned as bytes so that the HTML parser, not requests'
    header-based guess, decides how to decode it.
    Prints the error and exits with status 1 when the request fails.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()  # treat HTTP 4xx/5xx as failures
    except requests.exceptions.RequestException as e:
        print("请求错误:", e)
        sys.exit(1)
    return response.content


def to_int(text: str) -> int:
    """Convert a number with thousands separators (e.g. "1,234") to int."""
    return int(text.replace(",", ""))


def parse_tables(page: bytes):
    """Extract the statistics rows from the page.

    Returns a ``(countries, states)`` pair; each element is a list of
    ``(name, confirmed, deaths)`` tuples in page order.
    Exits with status 1 when the page does not have the expected layout.
    """
    # Decode explicitly as UTF-8 (the original script created this parser
    # but never passed it to etree.HTML).
    doc = etree.HTML(page, parser=etree.HTMLParser(encoding="utf-8"))
    if doc is None:
        print("页面解析失败: 页面内容为空")
        sys.exit(1)

    names = doc.xpath(TABLE_ROWS + '/td/span/text()')
    confirmed = [to_int(t) for t in doc.xpath(TABLE_ROWS + '/td[2]/text()')]
    deaths = [to_int(t) for t in doc.xpath(TABLE_ROWS + '/td[3]/text()')]

    try:
        split_at = names.index(FIRST_STATE)
    except ValueError:
        print(f"页面解析失败: 未找到州数据起始行 '{FIRST_STATE}'")
        sys.exit(1)

    rows = list(zip(names, confirmed, deaths))
    # Keep the state rows out of the country list so that states cannot show
    # up in the "country" rankings.
    return rows[:split_at], rows[split_at:]


def save_ranking(rows, sort_col: int, csv_path: str) -> pd.DataFrame:
    """Sort *rows* descending by *sort_col*, write them to *csv_path*.

    The CSV has no header line. Returns the sorted rows as a DataFrame with
    the columns ``name``, ``person`` (confirmed) and ``death``.
    """
    ranked = sorted(rows, key=itemgetter(sort_col), reverse=True)
    with open(csv_path, "w", newline='', encoding='utf-8') as f:
        csv.writer(f).writerows(ranked)
    return pd.DataFrame(ranked, columns=COLUMNS)


def top_rows(frame: pd.DataFrame, count: int) -> pd.DataFrame:
    """Return *count* rows of a ranking, skipping the very first row.

    The original script always discarded the top-ranked row before taking
    the head. NOTE(review): presumably that row is an aggregate line of the
    source table (e.g. a world / US total) - confirm against the live page.
    """
    return frame.iloc[1:count + 1]


def plot_death_bar(top: pd.DataFrame) -> None:
    """Show a bar chart of deaths for the given country rows."""
    names = top["name"].values
    deaths = top["death"].values

    plt.figure(figsize=(10, 5))
    plt.bar(names, deaths, color=BAR_COLORS)
    plt.xlabel("国家", fontsize=14)
    plt.ylabel("死亡人数", fontsize=14)
    plt.title("COVID-19 死亡人数排名前十国家", fontsize=16)
    # Print the exact value on top of each bar.
    for name, count in zip(names, deaths):
        plt.text(name, count, count, ha='center')
    plt.xticks(rotation=45)  # rotate tick labels so they do not overlap
    plt.show()


def plot_death_pie(top: pd.DataFrame) -> None:
    """Show a pie chart of deaths for the given country rows."""
    names = top["name"].values
    deaths = top["death"].values
    # Pull out only the largest slice. Sized from the data because
    # matplotlib requires explode/labels to match the number of wedges.
    explode = [0.1 if i == 0 else 0 for i in range(len(deaths))]

    plt.figure(figsize=(15, 10))
    # Labels come from the scraped names rather than a hard-coded list, so
    # they always describe the slices actually drawn.
    plt.pie(deaths, explode=explode, labels=names, colors=PIE_COLORS,
            shadow=True, autopct='%.2f%%')
    plt.legend()
    plt.title('COVID-19 死亡人数排名前十国家')
    plt.show()


def plot_confirmed_vs_death(top: pd.DataFrame) -> None:
    """Show two stacked line charts: confirmed cases and deaths."""
    names = top["name"].values
    confirmed = top["person"].values
    deaths = top["death"].values

    plt.figure(figsize=(15, 12))

    plt.subplot(211)
    plt.plot(names, confirmed, color='crimson', marker='*', linestyle='--',
             label='确诊人数')
    plt.legend(loc='upper right')
    plt.xlabel("国家", fontsize=14)
    plt.ylabel("确诊人数", fontsize=14)
    plt.xticks(rotation=45)  # rotate tick labels so they do not overlap
    plt.title("COVID-19 确诊人数排名前二十国家确诊人数与死亡人数对比", fontsize=16)

    plt.subplot(212)
    plt.plot(names, deaths, color='teal', linestyle='-', marker='D',
             label='死亡人数')
    plt.xlabel("国家", fontsize=14)
    plt.ylabel("死亡人数", fontsize=14)
    plt.legend(loc='upper right')
    plt.xticks(rotation=45)  # rotate tick labels so they do not overlap
    plt.show()


def plot_state_deaths(top: pd.DataFrame) -> None:
    """Show a horizontal bar chart of deaths for the given US state rows."""
    plt.figure(figsize=(10, 5))
    plt.barh(top["name"].values, top["death"].values, height=0.8,
             color=PIE_COLORS)
    plt.xlabel('死亡人数')
    plt.ylabel('州')
    plt.title('美国COVID-19 死亡人数排名前十的州')
    plt.show()


def main() -> None:
    """Fetch the page, save the rankings and display all charts."""
    # SimHei is needed to render the Chinese labels.
    plt.rcParams['font.sans-serif'] = ['SimHei']

    countries, states = parse_tables(fetch_page(URL))

    by_death = top_rows(save_ranking(countries, DEATH_COL, "content.csv"), 10)
    plot_death_bar(by_death)
    plot_death_pie(by_death)

    # content.csv is rewritten here (as in the original script), so on disk
    # it ends up ordered by confirmed cases.
    by_confirmed = top_rows(
        save_ranking(countries, CONFIRMED_COL, "content.csv"), 20)
    plot_confirmed_vs_death(by_confirmed)

    states_by_death = top_rows(
        save_ranking(states, DEATH_COL, "state_content.csv"), 10)
    plot_state_deaths(states_by_death)


if __name__ == "__main__":
    main()