"""Scrape COVID-19 figures from bitpush.news, save rankings to CSV and plot them.

Running the script writes four CSV files and shows four charts:

* content1.csv / bar chart  - every scraped row ranked by deaths
* content2.csv / bar chart  - every scraped row ranked by confirmed cases
* content3.csv / line chart - rows from the first US state onward, by deaths
* content4.csv / line chart - rows from the first US state onward, by confirmed

The network, HTML and plotting libraries are imported inside the functions
that need them so the pure data helpers stay importable without them.
"""
import csv
from operator import itemgetter

import pandas as pd

URL = "https://www.bitpush.news/covid19/"
HEADERS = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
    ),
}
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests can block forever
TABLE_ROWS = '//div[@class="table_container"]//tbody/tr'
FIRST_STATE = "加州"  # the code treats this row and everything after it as US states
TOP_N = 10

# Column positions inside a (name, confirmed, deaths) row tuple.
NAME, CONFIRMED, DEATHS = 0, 1, 2


def fetch_html(url, timeout=REQUEST_TIMEOUT):
    """Return the page at *url* as text, or "" when the request fails."""
    import requests

    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"请求出错: {e}")
        return ""
    # Let requests guess the charset from the body so the text decodes properly.
    response.encoding = response.apparent_encoding
    return response.text


def extract_columns(html):
    """Return the (names, confirmed, deaths) text columns found in *html*."""
    from lxml import etree

    doc = etree.HTML(html)
    if doc is None:
        return [], [], []
    names = doc.xpath(TABLE_ROWS + '/td/span/text()')
    confirmed = doc.xpath(TABLE_ROWS + '/td[2]/text()')
    deaths = doc.xpath(TABLE_ROWS + '/td[3]/text()')
    return names, confirmed, deaths


def parse_count(text):
    """Convert a count such as "1,234" to int; return None if it is not numeric."""
    try:
        return int(text.replace(",", ""))
    except ValueError:
        return None


def build_rows(names, confirmed, deaths):
    """Zip the three columns into (name, confirmed, deaths) tuples with int counts.

    Rows whose counts cannot be parsed are skipped instead of aborting the run.
    ``zip`` stops at the shortest column, so surplus cells are ignored.
    """
    rows = []
    for name, confirmed_text, deaths_text in zip(names, confirmed, deaths):
        confirmed_count = parse_count(confirmed_text)
        death_count = parse_count(deaths_text)
        if confirmed_count is None or death_count is None:
            continue
        rows.append((name, confirmed_count, death_count))
    return rows


def rank_rows(rows, column):
    """Return *rows* sorted by the given column, largest first (ties keep order)."""
    return sorted(rows, key=itemgetter(column), reverse=True)


def rows_from(rows, first_name):
    """Return the tail of *rows* starting at the row named *first_name*.

    Returns an empty list when no row has that name.
    """
    for index, row in enumerate(rows):
        if row[NAME] == first_name:
            return rows[index:]
    return []


def save_csv(path, header, rows):
    """Write *header* followed by *rows* to *path* as UTF-8 CSV."""
    with open(path, "w", newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)


def top_frame(rows, header, n=TOP_N):
    """Return the first *n* rows as a DataFrame whose columns are *header*."""
    return pd.DataFrame(rows, columns=header).head(n)


def plot_ranking(frame, label_column, value_column, *, kind, xlabel, ylabel, title):
    """Show a bar (``kind="bar"``) or line chart of *value_column* with value labels."""
    if frame.empty:
        return  # nothing to draw, and max() of an empty series is undefined

    import matplotlib.pyplot as plt

    # SimHei renders the Chinese labels; the second setting keeps the minus
    # sign displayable once the sans-serif font has been replaced.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    labels = frame[label_column]
    values = frame[value_column].astype(int)

    plt.figure(figsize=(12, 6))
    if kind == "bar":
        plt.bar(labels, values)
    else:
        plt.plot(labels, values, marker='o')
    plt.xlabel(xlabel, fontsize=14)
    plt.ylabel(ylabel, fontsize=14)
    plt.title(title, fontsize=16)

    # Lift each label by 2% of the tallest plotted value so it clears the mark.
    offset = 0.02 * values.max()
    for position, value in enumerate(values):
        plt.text(position, value + offset, str(value), ha='center', fontsize=12)

    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


def publish_ranking(rows, sort_column, csv_path, header, **chart):
    """Rank *rows* by *sort_column*, save all of them to CSV and chart the top ones."""
    ranked = rank_rows(rows, sort_column)
    save_csv(csv_path, header, ranked)
    top = top_frame(ranked, header)
    plot_ranking(top, header[NAME], header[sort_column], **chart)


def main():
    """Fetch the page and produce the four CSV files and charts."""
    html = fetch_html(URL)
    if not html:
        return

    names, confirmed, deaths = extract_columns(html)
    if not len(names) == len(confirmed) == len(deaths):
        # Warn that the columns are misaligned and the surplus cells are dropped.
        print("警告: 各列数据长度不一致,多余的数据将被忽略")
    rows = build_rows(names, confirmed, deaths)

    # NOTE(review): these two rankings use every scraped row, including the
    # rows from FIRST_STATE onward that are treated as US states below -
    # confirm whether the state rows should be excluded here.
    world_header = ["country", "person", "death"]
    publish_ranking(
        rows, DEATHS, "content1.csv", world_header,
        kind="bar", xlabel="国家", ylabel="死亡人数",
        title="COVID-19死亡人数前10的国家",
    )
    publish_ranking(
        rows, CONFIRMED, "content2.csv", world_header,
        kind="bar", xlabel="国家", ylabel="确诊人数",
        title="COVID-19确诊人数前10的国家",
    )

    state_rows = rows_from(rows, FIRST_STATE)
    if not state_rows:
        # The marker row is missing, so the state rankings cannot be built.
        print(f"未找到 {FIRST_STATE},跳过美国各州的统计")
        return

    state_header = ["zhou", "person1", "death1"]
    publish_ranking(
        state_rows, DEATHS, "content3.csv", state_header,
        kind="line", xlabel="州", ylabel="死亡人数",
        title="美国COVID-19死亡人数前十的州",
    )
    publish_ranking(
        state_rows, CONFIRMED, "content4.csv", state_header,
        kind="line", xlabel="州", ylabel="确诊人数",
        title="美国COVID-19确诊人数前十的州",
    )


if __name__ == "__main__":
    main()