"""Collect epidemic news and per-area statistics from a web page.

The page is expected to embed its data in ``<script>`` tags identified by
``id`` (``getTimelineService1``, ``getListByCountryTypeService2true`` and
``getAreaStat``).  ``Spider`` downloads the page and extracts the fields
with regular expressions; ``news`` is a plain record for one news item.
"""
import json
import re
import time

# Seconds to wait for the HTTP response before giving up.
DEFAULT_TIMEOUT = 10


class Spider():
    """Download the target page and extract news / statistics from it.

    Attributes:
        url: address of the page to scrape.
        timeout: seconds to wait for the HTTP response.
        headers: HTTP headers sent with every request.
        news: list of ``news`` records collected by ``grapNews``.
        modifytime: data update time as ``"%Y-%m-%d %H:%M:%S"`` (local
            time), set by ``grapNews``; empty string until then.
    """

    # (field name, pattern) pairs applied to the ``getAreaStat`` script text,
    # in the order in which ``grapchina`` prints them.
    # NOTE(review): the patterns match every occurrence of a key, so entries
    # nested under "cities" are presumably included as well and the lists may
    # differ in length -- confirm against real page data.
    _AREA_FIELDS = (
        ("provinceName", r'"provinceName":"(.*?)",'),
        ("provinceShortName", r'"provinceShortName":"(.*?)",'),
        ("currentConfirmedCount", r'"currentConfirmedCount":(.*?),"'),
        ("confirmedCount", r'"confirmedCount":(.*?),'),
        ("suspectedCount", r'"suspectedCount":(.*?),'),
        ("curedCount", r'"curedCount":(.*?),'),
        ("deadCount", r'"deadCount":(.*?),'),
        ("comment", r'"comment":(.*?),'),
        ("locationId", r'"locationId":(.*?),'),
        ("statisticsData", r'"statisticsData":"(.*?)",'),
        ("cities", r'"cities":(.*?)}'),
    )

    # Fields whose element count is printed right after the list itself.
    _COUNTED_FIELDS = ("provinceName", "currentConfirmedCount")

    def __init__(self, url, timeout=DEFAULT_TIMEOUT):
        """Remember the target ``url`` and initialise empty results.

        Args:
            url: address of the page to scrape.
            timeout: seconds to wait for the HTTP response.
        """
        # Target web site.
        self.url = url
        self.timeout = timeout
        self.headers = {"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}
        # Current news items.
        self.news = []
        # Time at which the site's data was last updated.
        self.modifytime = ''

    def _fetch_soup(self):
        """Download ``self.url`` and return it as a BeautifulSoup document."""
        # Imported lazily so the parsing helpers stay usable without the
        # third-party packages installed.
        import requests
        from bs4 import BeautifulSoup

        # Without a timeout a stalled connection would block forever.
        response = requests.get(self.url, headers=self.headers,
                                timeout=self.timeout)
        html = response.content.decode('utf-8')
        return BeautifulSoup(html, "html.parser")

    @staticmethod
    def _parse_news(script_text):
        """Return the ``news`` records found in the timeline script text.

        All values are kept as the raw strings captured by the patterns.
        Returns an empty list when the text contains no ``[...]`` array.
        """
        # Take everything between the first '[' and the LAST ']' so that a
        # ']' inside a title or summary does not truncate the array.
        start = script_text.find('[')
        end = script_text.rfind(']')
        if start == -1 or end <= start:
            return []
        payload = script_text[start + 1:end]

        # Message ids.
        ids = re.findall(r'"id":(.*?),', payload)
        # Publication time as a timestamp; convert before use.
        pub_dates = re.findall(r'"pubDate":(.*?),', payload)
        # Publication time as display text.
        pub_date_strs = re.findall(r'"pubDateStr":"(.*?)","title"', payload)
        # News titles.
        titles = re.findall(r'"title":"(.*?)","summary"', payload)
        # News bodies.
        summaries = re.findall(r'"summary":"(.*?)","infoSource"', payload)
        # News URLs.
        source_urls = re.findall(r'"sourceUrl":"(.*?)","provinceId"', payload)

        return [
            news(*fields)
            for fields in zip(ids, pub_dates, pub_date_strs, titles,
                              summaries, source_urls)
        ]

    @staticmethod
    def _parse_modify_time(script_text):
        """Return the update time of the first record, formatted in local time.

        Raises:
            ValueError: if no ``{"id" ... "showRank": ...}`` record is found.
        """
        match = re.search(r'(\{"id".*?"showRank":.*?\})', script_text)
        if match is None:
            raise ValueError("no record with a modifyTime found in the page")
        record = json.loads(match.group(1))
        # The value is divided by 1000 to get seconds for time.localtime().
        seconds = float(record['modifyTime']) / 1000
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(seconds))

    @staticmethod
    def _parse_area_stat(script_text):
        """Return ``{field name: list of captured strings}`` for the area data."""
        return {
            field: re.findall(pattern, script_text)
            for field, pattern in Spider._AREA_FIELDS
        }

    # Fetch the current news.
    def grapNews(self):
        """Append the page's news items to ``self.news`` and set ``self.modifytime``.

        The update time is also printed.

        Raises:
            ValueError: if the update-time record is missing from the page.
        """
        soup = self._fetch_soup()

        timeline = soup.find('script', attrs={'id': 'getTimelineService1'})
        self.news.extend(self._parse_news(str(timeline)))

        # Data update time.
        country = soup.find(
            'script', attrs={'id': 'getListByCountryTypeService2true'})
        self.modifytime = self._parse_modify_time(str(country))
        print(self.modifytime)

    # Fetch the domestic epidemic data.
    def grapchina(self):
        """Print and return the per-area statistics found in the page.

        Returns:
            dict mapping each field name to the list of captured strings.
        """
        soup = self._fetch_soup()
        tag = soup.find('script', attrs={'id': 'getAreaStat'})
        stats = self._parse_area_stat(str(tag))

        for field, _ in self._AREA_FIELDS:
            values = stats[field]
            print(values)
            if field in self._COUNTED_FIELDS:
                print(len(values))

        print(tag)
        return stats

    # Fetch the foreign epidemic data.
    def grapforign(self):
        """Not implemented yet."""
        pass


# The news class stores the information of one news item.
class news():
    """Plain record for one news item; all attributes are public."""

    def __init__(self, id, pubDate, pubDateStr, title, summary, sourceUrl):
        self.id = id
        self.pubDate = pubDate
        self.pubDateStr = pubDateStr
        self.title = title
        self.summary = summary
        # Misspelled name kept so existing readers of ``summay`` still work.
        self.summay = summary
        self.sourceUrl = sourceUrl


def main():
    """Scrape the domestic statistics from the site's home page."""
    # Home page of the site.
    url = "https://ncov.dxy.cn/ncovh5/view/pneumonia"
    spider = Spider(url)
    spider.grapchina()


if __name__ == '__main__':
    main()
+ + + + + + \ No newline at end of file diff --git a/日志文件/第一天/杨旭东.txt b/日志文件/第一天/杨旭东.txt deleted file mode 100644 index 1c81406..0000000 --- a/日志文件/第一天/杨旭东.txt +++ /dev/null @@ -1,3 +0,0 @@ -从本地push到远程仓库时,git报认证错误,原因是本地认证的账号密码与educoder不匹配,需要从新认证。 -命令行输入命令git config --system --unset credential.helper -会报permission denied错误,修改C:/Program Files/Git /mingw64/etc文件夹权限即可。 \ No newline at end of file