数据采集1

master
yangxudongll 5 years ago
parent 9f8eea8d94
commit a8b3beb303

2
.idea/.gitignore vendored

@ -0,0 +1,2 @@
# Default ignored files
/workspace.xml

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6" project-jdk-type="Python SDK" />
</project>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/疫情数据采集分析及可视化.iml" filepath="$PROJECT_DIR$/.idea/疫情数据采集分析及可视化.iml" />
</modules>
</component>
</project>

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="Twisted Trial" />
</component>
</module>

@ -0,0 +1,126 @@
import json
import time
import requests
from bs4 import BeautifulSoup
import re
class Spider():
    """Scraper for the DXY COVID-19 page (https://ncov.dxy.cn).

    The page embeds its data as JSON inside <script> tags, so each method
    fetches the page, locates a script tag by id, and pulls fields out of
    the embedded JSON with regular expressions.
    """

    def __init__(self, url):
        # Target page URL.
        self.url = url
        # Pretend to be a desktop Chrome browser to avoid trivial bot blocks.
        self.headers = {"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}
        # Real-time news items, filled by grapNews() with `news` instances.
        self.news = []
        # Site data-update time as "%Y-%m-%d %H:%M:%S", set by grapNews().
        self.modifytime = ''

    def grapNews(self):
        """Fetch the real-time news timeline and the site's update time.

        Side effects: appends `news` objects to self.news and sets
        self.modifytime (also printed for debugging).
        """
        data = requests.get(self.url, headers=self.headers)
        # Decode the raw bytes as UTF-8 explicitly.
        data.encoding = 'utf-8'
        html = data.content.decode('utf-8')
        soup = BeautifulSoup(html, "html.parser")
        # The timeline JSON lives in <script id="getTimelineService1">.
        tag = soup.find('script', attrs={'id': 'getTimelineService1'})
        info = re.findall(r'\[(.*?)\]', str(tag))
        # Message ids.
        ids = re.findall(r'"id":(.*?),', info[0])
        # Publish time as a millisecond timestamp (needs conversion for display).
        pubDates = re.findall(r'"pubDate":(.*?),', info[0])
        # Publish time as a human-readable string.
        pubDateStrs = re.findall(r'"pubDateStr":"(.*?)","title"', info[0])
        # News title.
        titles = re.findall(r'"title":"(.*?)","summary"', info[0])
        # News body summary.
        summarys = re.findall(r'"summary":"(.*?)","infoSource"', info[0])
        # News source URL.
        sourceUrls = re.findall(r'"sourceUrl":"(.*?)","provinceId"', info[0])
        # Province id (extracted but currently unused downstream).
        provinceIds = re.findall(r'"provinceId":"(.*?)"},', info[0])
        # Store the parsed records.
        # BUG FIX: the original passed pubDateStrs[i] for BOTH the pubDate and
        # pubDateStr arguments, silently discarding the numeric timestamps in
        # pubDates. Pass pubDates[i] as the timestamp field.
        for i in range(len(ids)):
            self.news.append(news(ids[i], pubDates[i], pubDateStrs[i], titles[i], summarys[i], sourceUrls[i]))
        # The update time lives in <script id="getListByCountryTypeService2true">.
        tag = soup.find('script', attrs={'id': 'getListByCountryTypeService2true'})
        tagstr = str(tag)
        # Grab each {...} record containing "id" ... "showRank".
        result = re.findall(r'(\{"id".*?"showRank":.*?\})', tagstr)
        # Only the first record is needed for the modify time.
        firstInfo = result[0]
        jsObj = json.loads(firstInfo)
        # modifyTime is a millisecond epoch timestamp; convert to seconds,
        # then to local time, then format for display.
        updateTimestamp = jsObj['modifyTime']
        updateTimestamp = float(updateTimestamp) / 1000
        localt = time.localtime(updateTimestamp)
        timestr = time.strftime("%Y-%m-%d %H:%M:%S", localt)
        self.modifytime = timestr
        print(timestr)

    def grapchina(self):
        """Fetch per-province epidemic statistics for China and print them.

        NOTE(review): this method currently only prints the extracted lists;
        it does not store them on the instance.
        """
        data = requests.get(self.url, headers=self.headers)
        data.encoding = 'utf-8'
        html = data.content.decode('utf-8')
        # Build the soup object.
        soup = BeautifulSoup(html, 'html.parser')
        # Province data lives in <script id="getAreaStat">.
        tag = soup.find('script', attrs={'id': 'getAreaStat'})
        yiqinginfo = str(tag)
        provinceNames = re.findall(r'"provinceName":"(.*?)",', yiqinginfo)
        print(provinceNames)
        print(len(provinceNames))
        provinceShortName = re.findall(r'"provinceShortName":"(.*?)",', yiqinginfo)
        print(provinceShortName)
        currentConfirmedCounts = re.findall(r'"currentConfirmedCount":(.*?),"', yiqinginfo)
        print(currentConfirmedCounts)
        print(len(currentConfirmedCounts))
        confirmedCounts = re.findall(r'"confirmedCount":(.*?),', yiqinginfo)
        print(confirmedCounts)
        suspectedCounts = re.findall(r'"suspectedCount":(.*?),', yiqinginfo)
        print(suspectedCounts)
        curedCounts = re.findall(r'"curedCount":(.*?),', yiqinginfo)
        print(curedCounts)
        deadCounts = re.findall(r'"deadCount":(.*?),', yiqinginfo)
        print(deadCounts)
        comments = re.findall(r'"comment":(.*?),', yiqinginfo)
        print(comments)
        locationIds = re.findall(r'"locationId":(.*?),', yiqinginfo)
        print(locationIds)
        statisticsDatas = re.findall(r'"statisticsData":"(.*?)",', yiqinginfo)
        print(statisticsDatas)
        cities = re.findall(r'"cities":(.*?)}', yiqinginfo)
        print(cities)
        print(tag)

    def grapforign(self):
        """Fetch foreign (outside China) epidemic data. Not implemented yet."""
        pass
#news类用来存储实时新闻的信息
# Record class holding one real-time news item scraped by Spider.grapNews().
class news():
    def __init__(self, id, pubDate, pubDateStr, title, summary, sourceUrl):
        """Store the raw fields of one news item.

        All attributes are public by design and accessed directly.
        """
        # News id.
        self.id = id
        # Publish time as a (millisecond) timestamp string.
        self.pubDate = pubDate
        # Publish time as a human-readable string.
        self.pubDateStr = pubDateStr
        # News title.
        self.title = title
        # BUG FIX: the original stored the summary under the misspelled
        # attribute name `summay`. Store it under the correct name, and keep
        # the misspelled alias so any existing readers keep working.
        self.summary = summary
        self.summay = summary
        # Source article URL.
        self.sourceUrl = sourceUrl

@ -0,0 +1,15 @@
import requests
from bs4 import BeautifulSoup
import re
from 数据采集.GrapSpider import Spider
def main():
    """Entry point: scrape and print China's per-province epidemic data."""
    # Landing page of the DXY epidemic dashboard.
    homepage = "https://ncov.dxy.cn/ncovh5/view/pneumonia"
    crawler = Spider(homepage)
    crawler.grapchina()


if __name__ == '__main__':
    main()

File diff suppressed because one or more lines are too long

@ -1,3 +0,0 @@
从本地push到远程仓库时，git报认证错误，原因是本地认证的账号密码与educoder不匹配，需要重新认证。
命令行输入命令git config --system --unset credential.helper
会报permission denied错误，修改C:/Program Files/Git/mingw64/etc文件夹权限即可。
Loading…
Cancel
Save