You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
p4fkfpicj/新闻采集.ipynb

137 lines
5.3 KiB

This file contains invisible Unicode characters!

This file contains invisible Unicode characters that may be processed differently from what appears below. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to reveal hidden characters.

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id:74894\n",
"pubDate:2020-08-12 17:17:01\n",
"pubDateStr:9分钟前\n",
"title:以色列新增366例新冠肺炎确诊病例 累计86959例\n",
"summary:当地时间8月12日上午以色列卫生部公布的疫情通报显示11日23时至12日11时以色列新增366例新冠肺炎确诊病例累计达86959例无新增死亡病例累计死亡622人。截至目前现存确诊病例24761例有377人病情危重其中110人使用呼吸机已有61576人治愈康复。\n",
"infoSource:央视新闻app\n",
"\n",
"\n",
"id:74891\n",
"pubDate:2020-08-12 17:14:28\n",
"pubDateStr:11分钟前\n",
"title:巴林新增407例新冠肺炎确诊病例 累计44804例\n",
"summary:巴林卫生部当地时间12日上午发布通告确认境内新增407例新型冠状病毒肺炎确诊病例同时新增2例死亡病例。至此巴林境内累计已发现44804例新型冠状病毒肺炎确诊病例其中41504名患者已经治愈165名患者病重不治去世。3135名仍在医院接受治疗的患者中38人病情较重。\n",
"infoSource:央视新闻app\n",
"\n",
"\n",
"id:74884\n",
"pubDate:2020-08-12 16:46:46\n",
"pubDateStr:39分钟前\n",
"title:香港新增62例新冠肺炎确诊病例 连续第十日新增病例少于100例\n",
"summary:8月12日下午香港特区政府卫生署卫生防护中心举行发布会。据介绍截至12日零时香港新增62例新冠肺炎确诊病例。这是连续第十日新增确诊病例少于100例。香港累计报告新冠肺炎确诊病例4243例。  新增确诊者中有1例境外输入病例剩余61例均为本地感染病例当中有28例病例暂未查明感染源。12日香港新增四例死亡病例累计死亡63例。\n",
"infoSource:央视新闻app\n",
"\n",
"\n",
"id:74878\n",
"pubDate:2020-08-12 16:30:22\n",
"pubDateStr:55分钟前\n",
"title:菲律宾新增4444例新冠肺炎确诊病例 累计确诊143749例\n",
"summary:8月12日菲律宾卫生部发布消息称截至12日下午4点菲律宾新增4444例新冠肺炎确诊病例确诊病例累计达143749例。此外新增死亡病例93例累计死亡2404例。同时新增治愈病例636例累计治愈人数为68997人。\n",
"infoSource:央视新闻app\n",
"\n",
"\n",
"id:74869\n",
"pubDate:2020-08-12 15:52:13\n",
"pubDateStr:1小时前\n",
"title:捷克新增288例新冠肺炎确诊病例 累计18783例\n",
"summary:当地时间8月12日捷克卫生部公布的新冠肺炎疫情数据显示11日全天捷克新增288例新冠肺炎确诊病例累计确诊18783例治愈病例和死亡病例分别为13222例和391例。\n",
"infoSource:央视新闻app\n",
"\n",
"\n"
]
}
],
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import re\n",
"import json\n",
"import time\n",
"# Scrape the page\n",
"class NEWS:\n",
"    \"\"\"Plain record holding the fields of one scraped news entry.\n",
"\n",
"    All fields start as placeholders (0 for ``id``, empty strings for the\n",
"    rest) and are meant to be assigned after construction.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self):\n",
"        \"\"\"Create an entry with placeholder values for every field.\"\"\"\n",
"        # Numeric identifier; rendered with %d in __str__, so it must stay numeric.\n",
"        self.id = 0\n",
"        # Publication time as text, plus a second display string for the same time.\n",
"        self.pubDate = ''\n",
"        self.pubDateStr = ''\n",
"        # Headline, body summary, and the name of the originating outlet.\n",
"        self.title = ''\n",
"        self.summary = ''\n",
"        self.infoSource = ''\n",
"\n",
"    def __str__(self):\n",
"        \"\"\"Return the six fields as ``name:value`` lines, each ending in a newline.\"\"\"\n",
"        layout = (\n",
"            'id:%d\\n'\n",
"            'pubDate:%s\\n'\n",
"            'pubDateStr:%s\\n'\n",
"            'title:%s\\n'\n",
"            'summary:%s\\n'\n",
"            'infoSource:%s\\n'\n",
"        )\n",
"        values = (\n",
"            self.id,\n",
"            self.pubDate,\n",
"            self.pubDateStr,\n",
"            self.title,\n",
"            self.summary,\n",
"            self.infoSource,\n",
"        )\n",
"        return layout % values\n",
" \n",
" \n",
"# Page to scrape, and how many seconds to wait before giving up on the request.\n",
"TIMELINE_URL = 'http://ncov.dxy.cn/ncovh5/view/pneumonia'\n",
"REQUEST_TIMEOUT = 10\n",
"\n",
"# Fetch the page; without a timeout requests may block indefinitely, and\n",
"# without the status check an HTTP error page would be parsed as if it were data.\n",
"response = requests.get(TIMELINE_URL, timeout=REQUEST_TIMEOUT)\n",
"response.raise_for_status()\n",
"# Decode the raw bytes explicitly as UTF-8.\n",
"res = response.content.decode('utf-8')\n",
"# Build the parse tree.\n",
"soup = BeautifulSoup(res, 'html.parser')\n",
"# Locate the <script> element with id getTimelineService1.\n",
"tag = soup.find('script', attrs={'id': 'getTimelineService1'})\n",
"if tag is None or tag.string is None:\n",
"    # Fail with a clear message instead of an AttributeError/TypeError further down.\n",
"    raise RuntimeError('script#getTimelineService1 not found or empty in the response')\n",
"# Text content of that script element.\n",
"tagstr = tag.string\n",
"# Extract every object literal that starts with an id key and contains a\n",
"# provinceId key, up to the first closing brace after it. Raw string so the\n",
"# brace escapes reach the regex engine without invalid-escape warnings.\n",
"results = re.findall(r'(\\{\"id\".*?\"provinceId\".*?\\})', tagstr)\n",
"\n",
"# Convert each matched JSON object into a NEWS record.\n",
"all_New = []\n",
"for item in results:\n",
"    obj = json.loads(item)\n",
"    New = NEWS()\n",
"    New.id = obj['id']\n",
"    # pubDate is divided by 1000 before conversion (presumably epoch\n",
"    # milliseconds - confirm); it is formatted in the machine's local time zone.\n",
"    New.pubDate = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(float(obj['pubDate']) / 1000))\n",
"    New.pubDateStr = obj['pubDateStr']\n",
"    New.title = obj['title']\n",
"    New.summary = obj['summary']\n",
"    New.infoSource = obj['infoSource']\n",
"    all_New.append(New)\n",
"\n",
"# NEWS.__str__ already ends with a newline, so each entry is followed by two blank lines.\n",
"for item in all_New:\n",
"    print(item)\n",
"    print()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}