Complete scraping of domestic (China) epidemic data

master
yangxudongll 5 years ago
parent a85227de52
commit 07309ee209

@@ -70,7 +70,6 @@ class Spider():
     # Fetch the domestic (China) epidemic data
     def grapchina(self):
         data=requests.get(self.url,headers=self.headers)
         data.encoding='utf-8'
         html = data.content.decode('utf-8')
@@ -80,38 +79,53 @@ class Spider():
         tag=soup.find('script',attrs={'id':'getAreaStat'})
         yiqinginfo=str(tag)
-        provinceNames=re.findall(r'"provinceName":"(.*?)",',yiqinginfo)
-        print(provinceNames)
-        print(len(provinceNames))
-        provinceShortName=re.findall(r'"provinceShortName":"(.*?)",',yiqinginfo)
-        print(provinceShortName)
-        currentConfirmedCounts=re.findall(r'"currentConfirmedCount":(.*?),"',yiqinginfo)
-        print(currentConfirmedCounts)
-        print(len(currentConfirmedCounts))
-        confirmedCounts=re.findall(r'"confirmedCount":(.*?),',yiqinginfo)
-        print(confirmedCounts)
-        suspectedCounts=re.findall(r'"suspectedCount":(.*?),',yiqinginfo)
-        print(suspectedCounts)
-        curedCounts=re.findall(r'"curedCount":(.*?),',yiqinginfo)
-        print(curedCounts)
-        deadCounts=re.findall(r'"deadCount":(.*?),',yiqinginfo)
-        print(deadCounts)
-        comments=re.findall(r'"comment":(.*?),',yiqinginfo)
-        print(comments)
-        locationIds=re.findall(r'"locationId":(.*?),',yiqinginfo)
-        print(locationIds)
-        statisticsDatas=re.findall(r'"statisticsData":"(.*?)",',yiqinginfo)
-        print(statisticsDatas)
-        cities=re.findall(r'"cities":(.*?)}',yiqinginfo)
-        print(cities)
-        print(tag)
+        # Per-province records
+        provinceinfo=re.findall(r'({"provinceName":.*?),{"provinceName"',yiqinginfo)
+        # Walk through every province's epidemic record and collect them in a list
+        provinces=[]
+        for pro in provinceinfo:
+            # After converting to a dict, the fields can be read directly
+            province=Provinceinfo()
+            j=json.loads(pro)
+            # Record the province-level figures
+            province.provinceName=j['provinceName']
+            province.provinceShortName=j['provinceShortName']
+            province.currentConfirmedCount=j['currentConfirmedCount']
+            province.confirmedCount=j['confirmedCount']
+            province.suspectedCount=j['suspectedCount']
+            province.deadCount=j['deadCount']
+            province.curedCount=j['curedCount']
+
+            cities=re.findall(r'({.*?})',str(j['cities']))
+            for city in cities:
+                # JSON requires double quotes, otherwise json.loads raises an error
+                city=str(city).replace('\'','\"')
+                # Once converted to a dict, each field is easy to read
+                cj=json.loads(city)
+                cityinfo=Cityinfo()
+                cityinfo.cityName=cj['cityName']
+                cityinfo.currentConfirmedCount=cj['currentConfirmedCount']
+                cityinfo.confirmedCount=cj['confirmedCount']
+                cityinfo.curedCount=cj['curedCount']
+                cityinfo.suspectedCount=cj['suspectedCount']
+                cityinfo.deadCount=cj['deadCount']
+                province.cities.append(cityinfo)
+            provinces.append(province)
+        return provinces
     # Fetch the foreign (international) epidemic data
     def grapforign(self):
-        pass
+        data=requests.get(self.url,headers=self.headers)
+        data.encoding='utf-8'
+        html = data.content.decode('utf-8')
+        # Build the soup object
+        soup=BeautifulSoup(html,'html.parser')
+        # Locate the data
+        tag=soup.find('script',attrs={'id':'getListByCountryTypeService2true'})
+        yiqinginfo=str(tag)
 
 # The news class stores the live news items
 class news():
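The grapchina hunk above splits the getAreaStat payload record by record with a regex and then feeds each fragment to json.loads; the cities list additionally needs the single-quote replacement because str(j['cities']) is a Python repr rather than JSON. A more robust variant is sketched below, under the assumption that the script tag's text has the shape try { window.getAreaStat = [...] }catch(e){} with the bracketed array being valid JSON; only the field names (provinceName, cities, cityName, ...) and the URL come from the commit, the rest is illustrative.

import json

import requests
from bs4 import BeautifulSoup

def fetch_area_stat(url="https://ncov.dxy.cn/ncovh5/view/pneumonia"):
    # Sketch: parse the whole getAreaStat array in one json.loads call.
    headers = {"User-Agent": "Mozilla/5.0"}  # placeholder; the Spider class keeps its own self.headers
    html = requests.get(url, headers=headers).content.decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    tag = soup.find("script", attrs={"id": "getAreaStat"})
    text = tag.get_text()
    # Assumed layout: "try { window.getAreaStat = [...] }catch(e){}", so the slice
    # between the first '[' and the last ']' is a valid JSON array.
    provinces = json.loads(text[text.find("["):text.rfind("]") + 1])
    for p in provinces:
        print(p["provinceName"], p["currentConfirmedCount"])
        for c in p.get("cities", []):
            print("  ", c["cityName"], c["confirmedCount"])
    return provinces

Parsing the array once also avoids the per-city quote replacement, because the nested cities entries arrive as real JSON objects rather than Python reprs.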
@@ -124,3 +138,49 @@ class news():
         self.summay=summary
         self.sourceUrl=sourceUrl
+
+# Stores each city's epidemic figures
+class Cityinfo():
+    def __init__(self):
+        # City name
+        self.cityName=''
+        # Currently confirmed
+        self.currentConfirmedCount=0
+        # Cumulative confirmed
+        self.confirmedCount=0
+        # Suspected
+        self.suspectedCount=0
+        # Cured
+        self.curedCount=0
+        # Deaths
+        self.deadCount=0
+    def __str__(self):
+        return self.cityName+" 当前确诊:"+str(self.currentConfirmedCount)+\
+               " 累计确诊:"+str(self.confirmedCount)+" 疑似:"+str(self.suspectedCount)+" 治愈:"+str(self.curedCount)+\
+               " 死亡人数:"+str(self.deadCount)
+
+# Stores each province's epidemic figures
+class Provinceinfo():
+    def __init__(self,provinceName,provinceShortName,currentConfirmedCount,confirmedCount,
+                 suspectedCount,curedCount,deadCount,cities):
+        self.provinceName=provinceName
+        self.provinceShortName=provinceShortName
+        self.currentConfirmedCount=currentConfirmedCount
+        self.confirmedCount=confirmedCount
+        self.suspectedCount=suspectedCount
+        self.curedCount=curedCount
+        self.deadCount=deadCount
+        # Epidemic figures for each city, stored in a list
+        self.cities=cities
+    def __init__(self):
+        self.provinceName = ''
+        self.provinceShortName = ''
+        self.currentConfirmedCount = 0
+        self.confirmedCount = 0
+        self.suspectedCount = 0
+        self.curedCount = 0
+        self.deadCount = 0
+        # Epidemic figures for each city, stored in a list
+        self.cities = []
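Cityinfo and Provinceinfo above are plain attribute holders. Note that Provinceinfo defines __init__ twice; Python keeps only the last definition, so the parameterised constructor is never reachable and grapchina relies on the no-argument form plus attribute assignment. The same structures can be written more compactly as dataclasses; the sketch below keeps exactly the field names from the commit and is otherwise illustrative.

from dataclasses import dataclass, field
from typing import List

@dataclass
class Cityinfo:
    cityName: str = ''
    currentConfirmedCount: int = 0
    confirmedCount: int = 0
    suspectedCount: int = 0
    curedCount: int = 0
    deadCount: int = 0

    def __str__(self):
        return (f"{self.cityName} 当前确诊:{self.currentConfirmedCount} "
                f"累计确诊:{self.confirmedCount} 疑似:{self.suspectedCount} "
                f"治愈:{self.curedCount} 死亡人数:{self.deadCount}")

@dataclass
class Provinceinfo:
    provinceName: str = ''
    provinceShortName: str = ''
    currentConfirmedCount: int = 0
    confirmedCount: int = 0
    suspectedCount: int = 0
    curedCount: int = 0
    deadCount: int = 0
    # Per-city figures; default_factory avoids sharing one list across instances
    cities: List[Cityinfo] = field(default_factory=list)

With defaults on every field, Provinceinfo() and Cityinfo() still work exactly as grapchina constructs them.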

@@ -8,7 +8,11 @@ def main():
     # Site home page
     url="https://ncov.dxy.cn/ncovh5/view/pneumonia"
     spider=Spider(url)
-    spider.grapchina()
+    provinces= spider.grapchina()
+    print(provinces[5].provinceName)
+    for i in range(len(provinces[4].cities)):
+        print(provinces[4].cities[i])
 
 if __name__ == '__main__':
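The indices 4 and 5 in main() are only spot checks. A minimal usage sketch, assuming Spider, Provinceinfo and Cityinfo are importable from the module this commit edits (the file name is not shown in the hunks), that walks everything grapchina returns:

def main():
    url = "https://ncov.dxy.cn/ncovh5/view/pneumonia"
    spider = Spider(url)
    provinces = spider.grapchina()
    for province in provinces:
        print(province.provinceName, "累计确诊:", province.confirmedCount)
        for city in province.cities:
            # Cityinfo.__str__ formats the per-city line
            print("   ", city)

if __name__ == '__main__':
    main()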
