You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
Python/疫情数据爬取demo.py

46 lines
1.8 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
import csv
from lxml import etree
import pandas as pd
import matplotlib.pyplot as plt
url = "https://www.bitpush.news/covid19/"
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36",
}  # Browser-like User-Agent header, sent to avoid anti-crawler blocking.

# Common prefix of the three column queries below (the table rows).
_ROWS_XPATH = '/html/body/div/div/div[2]/div[2]/div/div/div/div/div[1]/table/tbody/tr'
CSV_PATH = "content.csv"
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests may wait forever
TOP_N = 15  # number of leading rows shown in the chart


def strip_thousands(values):
    """Return a list of *values* with every ',' removed from each string."""
    return [value.replace(",", "") for value in values]


def fetch_rows(page_url=url, request_headers=headers, timeout=REQUEST_TIMEOUT):
    """Download the page and return a list of (country, person, death) tuples.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection problems or timeout.
    """
    response = requests.get(url=page_url, headers=request_headers, timeout=timeout)
    # Fail loudly on an error page instead of scraping nothing from it.
    response.raise_for_status()

    # Feed the raw bytes to a parser that is told to decode them as UTF-8.
    # (The original created this parser but never handed it to etree.HTML.)
    parser = etree.HTMLParser(encoding='utf-8')
    doc = etree.HTML(response.content, parser=parser)

    country = doc.xpath(_ROWS_XPATH + '/td/span/text()')
    # The counts contain thousands separators, which are stripped here.
    person = strip_thousands(doc.xpath(_ROWS_XPATH + '/td[2]/text()'))
    death = strip_thousands(doc.xpath(_ROWS_XPATH + '/td[3]/text()'))

    # NOTE(review): zip() stops at the shortest list, so a row with a missing
    # cell would silently shift/truncate the data - confirm against the page.
    return list(zip(country, person, death))


def save_rows(rows, path=CSV_PATH):
    """Write *rows* to *path* as UTF-8 CSV, one row per line, no header."""
    # newline="" lets the csv module control line endings (no blank lines).
    with open(path, "w", encoding="utf-8", newline="") as f:
        csv.writer(f).writerows(rows)


def load_table(path=CSV_PATH):
    """Read the header-less CSV at *path* into a DataFrame with named columns."""
    return pd.read_csv(path, names=["country", "person", "death"])


def plot_top(df, top_n=TOP_N):
    """Show a bar chart of the "person" column for the first *top_n* rows of *df*."""
    top = df.head(top_n)

    # SimHei is selected so that the Chinese labels can be rendered.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['figure.figsize'] = (10, 5)  # figure size

    plt.bar(top["country"].values, top["person"].values)
    # NOTE(review): the x-axis label reads "city" while the column is named
    # "country"; the label text is kept unchanged - confirm which is intended.
    plt.xlabel("城市", fontsize=14)  # x-axis label
    plt.ylabel("确诊人数", fontsize=14)  # y-axis label
    plt.show()


def main():
    """Scrape the table, save it to content.csv, then plot the first rows."""
    rows = fetch_rows()
    if not rows:
        # An empty CSV would only surface later as an opaque pandas error.
        raise SystemExit("No rows were scraped from " + url + "; the page layout may have changed.")
    save_rows(rows)
    plot_top(load_table())


if __name__ == "__main__":
    main()