完成国内疫情信息的抓取

master
yangxudongll 5 years ago
parent a85227de52
commit 07309ee209

@ -70,7 +70,6 @@ class Spider():
# Fetch the domestic (China) epidemic figures.
def grapchina(self):
# Download the page; force UTF-8 so the embedded Chinese text decodes correctly.
data=requests.get(self.url,headers=self.headers)
data.encoding='utf-8'
html = data.content.decode('utf-8')
@ -80,7 +79,53 @@ class Spider():
# NOTE(review): the diff hunk above hides the line that builds `soup`
# (presumably soup = BeautifulSoup(html, ...)) — confirm against the full file.
# The <script id="getAreaStat"> tag embeds the per-province data as JSON text.
tag=soup.find('script',attrs={'id':'getAreaStat'})
yiqinginfo=str(tag)
# The findall/print pairs below are author debugging output left in place;
# the values actually used are re-extracted per record further down.
provinceNames=re.findall(r'"provinceName":"(.*?)",',yiqinginfo)
print(provinceNames)
print(len(provinceNames))
provinceShortName=re.findall(r'"provinceShortName":"(.*?)",',yiqinginfo)
print(provinceShortName)
currentConfirmedCounts=re.findall(r'"currentConfirmedCount":(.*?),"',yiqinginfo)
print(currentConfirmedCounts)
print(len(currentConfirmedCounts))
confirmedCounts=re.findall(r'"confirmedCount":(.*?),',yiqinginfo)
print(confirmedCounts)
suspectedCounts=re.findall(r'"suspectedCount":(.*?),',yiqinginfo)
print(suspectedCounts)
curedCounts=re.findall(r'"curedCount":(.*?),',yiqinginfo)
print(curedCounts)
deadCounts=re.findall(r'"deadCount":(.*?),',yiqinginfo)
print(deadCounts)
comments=re.findall(r'"comment":(.*?),',yiqinginfo)
print(comments)
locationIds=re.findall(r'"locationId":(.*?),',yiqinginfo)
print(locationIds)
statisticsDatas=re.findall(r'"statisticsData":"(.*?)",',yiqinginfo)
print(statisticsDatas)
cities=re.findall(r'"cities":(.*?)}',yiqinginfo)
print(cities)
print(tag)
# Per-province records: each one starts with {"provinceName"; split each
# record off at the start of the next record.
provinceinfo=re.findall(r'({"provinceName":.*?),{"provinceName"',yiqinginfo)
# Walk every province record and collect the results into a list.
provinces=[]
for pro in provinceinfo:
# Parse the record as JSON so its fields can be read as dict keys.
province=Provinceinfo()
j=json.loads(pro)
# Copy the province-level figures.
province.provinceName=j['provinceName']
province.provinceShortName=j['provinceShortName']
province.currentConfirmedCount=j['currentConfirmedCount']
province.confirmedCount=j['confirmedCount']
province.suspectedCount=j['suspectedCount']
province.deadCount=j['deadCount']
province.curedCount=j['curedCount']
cities=re.findall(r'({.*?})',str(j['cities']))
for city in cities:
# json.loads requires double quotes, but str() of a Python dict
# emits single quotes, so swap them before parsing.
city=str(city).replace('\'','\"')
# Parse the city record as a dict as well.
cj=json.loads(city)
cityinfo=Cityinfo()
cityinfo.cityName=cj['cityName']
cityinfo.currentConfirmedCount=cj['currentConfirmedCount']
cityinfo.confirmedCount=cj['confirmedCount']
cityinfo.curedCount=cj['curedCount']
cityinfo.suspectedCount=cj['suspectedCount']
cityinfo.deadCount=cj['deadCount']
province.cities.append(cityinfo)
provinces.append(province)
return provinces
# Fetch the foreign (outside-China) epidemic figures.
def grapforign(self):
    """Download the page and locate the foreign-country data script tag.

    Work in progress: the <script id="getListByCountryTypeService2true">
    tag is located and stringified, but its JSON payload is not yet
    parsed or returned (the method implicitly returns None).
    """
    # Bug fix: removed a leftover dead `pass` statement that preceded the body.
    data = requests.get(self.url, headers=self.headers)
    # Force UTF-8 so the embedded Chinese text decodes correctly.
    data.encoding = 'utf-8'
    html = data.content.decode('utf-8')
    # Build the soup object.
    soup = BeautifulSoup(html, 'html.parser')
    # The foreign-country figures live in this script tag's JSON payload.
    tag = soup.find('script', attrs={'id': 'getListByCountryTypeService2true'})
    yiqinginfo = str(tag)
#news类用来存储实时新闻的信息
class news():
@ -124,3 +138,49 @@ class news():
self.summay=summary
self.sourceUrl=sourceUrl
# Holds the epidemic figures for a single city.
class Cityinfo():
    def __init__(self):
        """Initialize every counter to zero and the city name to empty.

        Bug fix: this method was misspelled ``__int__``, so it was never
        invoked as the constructor and new instances started with no
        attributes at all (callers had to assign every field manually).
        """
        # City name
        self.cityName = ''
        # Currently confirmed cases
        self.currentConfirmedCount = 0
        # Cumulative confirmed cases
        self.confirmedCount = 0
        # Suspected cases
        self.suspectedCount = 0
        # Cured cases
        self.curedCount = 0
        # Deaths
        self.deadCount = 0

    def __str__(self):
        # Human-readable one-line summary; runtime strings are kept
        # byte-for-byte identical to the original output format.
        return self.cityName+" 当前确诊:"+str(self.currentConfirmedCount)+\
               " 累计确诊:"+str(self.confirmedCount)+"疑似:"+str(self.suspectedCount)+" 治愈:"+str(self.curedCount)+\
               " 死亡人数:"+str(self.deadCount)
# Holds the epidemic figures for a single province.
class Provinceinfo():
    def __init__(self, provinceName='', provinceShortName='', currentConfirmedCount=0,
                 confirmedCount=0, suspectedCount=0, curedCount=0, deadCount=0, cities=None):
        """Initialize the province record.

        Bug fix: the class previously defined ``__init__`` twice; the
        second (no-argument) definition silently replaced the first, so
        the field-initializing constructor was dead code. The two are
        merged into one constructor whose defaults reproduce the old
        no-argument behavior, so both call styles now work.
        """
        self.provinceName = provinceName
        self.provinceShortName = provinceShortName
        self.currentConfirmedCount = currentConfirmedCount
        self.confirmedCount = confirmedCount
        self.suspectedCount = suspectedCount
        self.curedCount = curedCount
        self.deadCount = deadCount
        # Per-city figures for this province; a fresh list is created per
        # instance (None sentinel) to avoid the shared mutable-default pitfall.
        self.cities = [] if cities is None else cities

@ -8,7 +8,11 @@ def main():
#网站首页
url="https://ncov.dxy.cn/ncovh5/view/pneumonia"
spider=Spider(url)
spider.grapchina()
provinces= spider.grapchina()
print(provinces[5].provinceName)
for i in range(len(provinces[4].cities)):
print(provinces[4].cities[i])
if __name__ == '__main__':

Loading…
Cancel
Save