You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
p4fkfpicj/新闻采集.ipynb

137 lines
5.3 KiB

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id:74894\n",
"pubDate:2020-08-12 17:17:01\n",
"pubDateStr:9分钟前\n",
"title:以色列新增366例新冠肺炎确诊病例 累计86959例\n",
"summary:当地时间8月12日上午以色列卫生部公布的疫情通报显示11日23时至12日11时以色列新增366例新冠肺炎确诊病例累计达86959例无新增死亡病例累计死亡622人。截至目前现存确诊病例24761例有377人病情危重其中110人使用呼吸机已有61576人治愈康复。\n",
"infoSource:央视新闻app\n",
"\n",
"\n",
"id:74891\n",
"pubDate:2020-08-12 17:14:28\n",
"pubDateStr:11分钟前\n",
"title:巴林新增407例新冠肺炎确诊病例 累计44804例\n",
"summary:巴林卫生部当地时间12日上午发布通告确认境内新增407例新型冠状病毒肺炎确诊病例同时新增2例死亡病例。至此巴林境内累计已发现44804例新型冠状病毒肺炎确诊病例其中41504名患者已经治愈165名患者病重不治去世。3135名仍在医院接受治疗的患者中38人病情较重。\n",
"infoSource:央视新闻app\n",
"\n",
"\n",
"id:74884\n",
"pubDate:2020-08-12 16:46:46\n",
"pubDateStr:39分钟前\n",
"title:香港新增62例新冠肺炎确诊病例 连续第十日新增病例少于100例\n",
"summary:8月12日下午香港特区政府卫生署卫生防护中心举行发布会。据介绍截至12日零时香港新增62例新冠肺炎确诊病例。这是连续第十日新增确诊病例少于100例。香港累计报告新冠肺炎确诊病例4243例。  新增确诊者中有1例境外输入病例剩余61例均为本地感染病例当中有28例病例暂未查明感染源。12日香港新增四例死亡病例累计死亡63例。\n",
"infoSource:央视新闻app\n",
"\n",
"\n",
"id:74878\n",
"pubDate:2020-08-12 16:30:22\n",
"pubDateStr:55分钟前\n",
"title:菲律宾新增4444例新冠肺炎确诊病例 累计确诊143749例\n",
"summary:8月12日菲律宾卫生部发布消息称截至12日下午4点菲律宾新增4444例新冠肺炎确诊病例确诊病例累计达143749例。此外新增死亡病例93例累计死亡2404例。同时新增治愈病例636例累计治愈人数为68997人。\n",
"infoSource:央视新闻app\n",
"\n",
"\n",
"id:74869\n",
"pubDate:2020-08-12 15:52:13\n",
"pubDateStr:1小时前\n",
"title:捷克新增288例新冠肺炎确诊病例 累计18783例\n",
"summary:当地时间8月12日捷克卫生部公布的新冠肺炎疫情数据显示11日全天捷克新增288例新冠肺炎确诊病例累计确诊18783例治愈病例和死亡病例分别为13222例和391例。\n",
"infoSource:央视新闻app\n",
"\n",
"\n"
]
}
],
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import re\n",
"import json\n",
"import time\n",
"#爬取页面\n",
"class NEWS:\n",
"    \"\"\"Plain record holding one news entry.\n",
"\n",
"    Fields may be supplied as keyword arguments or assigned after construction;\n",
"    ``NEWS()`` with no arguments gives the same empty record as before.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, id=0, pubDate='', pubDateStr='', title='', summary='', infoSource=''):\n",
"        self.id = id  # formatted with %d in __str__, so it must be numeric\n",
"        self.pubDate = pubDate\n",
"        self.pubDateStr = pubDateStr\n",
"        self.title = title\n",
"        self.summary = summary\n",
"        self.infoSource = infoSource\n",
"\n",
"    def __str__(self):\n",
"        \"\"\"Return the six fields as ``name:value`` lines, each ending in a newline.\"\"\"\n",
"        return 'id:%d\\npubDate:%s\\npubDateStr:%s\\ntitle:%s\\nsummary:%s\\ninfoSource:%s\\n' % (\n",
"            self.id, self.pubDate, self.pubDateStr,\n",
"            self.title, self.summary, self.infoSource)\n",
" \n",
" \n",
"# Page that embeds the news timeline, and how long to wait for it.\n",
"TIMELINE_URL = 'http://ncov.dxy.cn/ncovh5/view/pneumonia'\n",
"REQUEST_TIMEOUT = 10  # seconds; requests.get waits indefinitely when no timeout is given\n",
"\n",
"# Download the page and stop on HTTP errors instead of parsing an error body.\n",
"response = requests.get(TIMELINE_URL, timeout=REQUEST_TIMEOUT)\n",
"response.raise_for_status()\n",
"# Decode the raw bytes explicitly as UTF-8.\n",
"html = response.content.decode('utf-8')\n",
"# Build the parse tree.\n",
"soup = BeautifulSoup(html, 'html.parser')\n",
"\n",
"# The timeline is read from the <script id=\"getTimelineService1\"> element.\n",
"tag = soup.find('script', attrs={'id': 'getTimelineService1'})\n",
"if tag is None or tag.string is None:\n",
"    raise RuntimeError('script#getTimelineService1 not found or empty; the page layout may have changed')\n",
"# Script body as a string.\n",
"tagstr = tag.string\n",
"\n",
"# Extract every JSON object that starts with \"id\" and contains \"provinceId\".\n",
"results = re.findall('(\\\\{\\\"id\\\".*?\\\"provinceId\\\".*?\\\\})', tagstr)\n",
"\n",
"all_New = []\n",
"for raw_item in results:\n",
"    obj = json.loads(raw_item)\n",
"    New = NEWS()\n",
"    New.id = obj['id']\n",
"    # pubDate is divided by 1000 to get seconds, then formatted in local time.\n",
"    New.pubDate = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(float(obj['pubDate']) / 1000))\n",
"    New.pubDateStr = obj['pubDateStr']\n",
"    New.title = obj['title']\n",
"    New.summary = obj['summary']\n",
"    New.infoSource = obj['infoSource']\n",
"    all_New.append(New)\n",
"\n",
"# Print each record followed by an extra blank line.\n",
"for item in all_New:\n",
"    print(item)\n",
"    print()\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}