Update 疫情数据爬虫.py

main
hnu202209100109 1 year ago
parent e0b9e1a1da
commit fb254c4144

@ -1,60 +1,58 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 12 15:24:14 2023

@author: Toon

Scrape the COVID-19 table published at bitpush.news, save the region name,
confirmed count and death count of every row to ``content.csv``, then draw a
horizontal bar chart of the confirmed counts for the first 15 rows.
"""
import csv

import matplotlib.pyplot as plt
import pandas as pd
import requests
from lxml import etree

# URL to request
url = "https://www.bitpush.news/covid19/"
# Request headers (a desktop browser user-agent)
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
}

# The CSV is written and read back with the SAME explicit encoding, so the
# round trip no longer depends on the platform's default encoding.
CSV_PATH = "content.csv"
CSV_ENCODING = "gbk"

# Send the GET request with the requests module. The timeout keeps the script
# from hanging forever on a dead connection; raise_for_status() stops an HTTP
# error page from being parsed as if it were the data table.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()

# Raw response body. lxml is given the bytes together with a parser that
# forces utf-8, so the encoding override actually takes part in parsing.
html = response.content
parse = etree.HTMLParser(encoding='utf-8')
# Parse the response returned by requests
doc = etree.HTML(html, parser=parse)

# Every column below is selected relative to the same table rows.
ROW_XPATH = '//html/body/div/div/div[2]/div[2]/div/div/div/div//tbody/tr'

# Region names
state = doc.xpath(ROW_XPATH + '/td/span/text()')
# Confirmed counts; the page formats them with thousands separators, which
# are stripped so pandas can read the column as numbers.
person = [cell.replace(",", "") for cell in doc.xpath(ROW_XPATH + '/td[2]/text()')]
# Death counts, with the separators stripped the same way
death = [cell.replace(",", "") for cell in doc.xpath(ROW_XPATH + '/td[3]/text()')]

# NOTE: zip() stops at the shortest column, so rows are silently dropped if
# the three XPath queries return different numbers of cells.
message = list(zip(state, person, death))
if not message:
    # Fail here with a clear reason instead of a later pandas EmptyDataError.
    raise RuntimeError(f"no table rows matched at {url}")

# newline="" is what the csv module requires of the file object; without it
# every row ends in "\r\r\n" on Windows.
with open(CSV_PATH, "w", newline="", encoding=CSV_ENCODING) as f:
    csv.writer(f).writerows(message)

df = pd.read_csv(CSV_PATH, names=["country", "person", "death"], encoding=CSV_ENCODING)
print(df.head())
df.info()

# Keep the first 15 rows and reverse them: barh draws the first element at
# the bottom, so reversing puts the first table row at the top of the chart.
df1 = df.head(15)[::-1]

# Use a font that can render the Chinese labels
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['figure.figsize'] = (5, 5)  # figure size

# Draw the bar chart
x = df1["country"].values
y = df1["person"].values
plt.barh(x, y)
plt.ylabel("国家", fontsize=14)
plt.xlabel("确诊人数", fontsize=14)
# Write each count at the end of its bar. Separate loop names are used so
# the plotted arrays x and y are not rebound while iterating.
for count, country in zip(y, x):
    plt.text(count, country, count, ha='left', va='center', color='r')

plt.show()
Loading…
Cancel
Save