commit 1dd836aa3f2ca2e5b43f2dac10c6250c81e8dc9b Author: yangguangcai <1873892196@qq.com> Date: Thu Aug 13 01:31:07 2020 +0800 新闻采集完成 diff --git a/.ipynb_checkpoints/新闻采集-checkpoint.ipynb b/.ipynb_checkpoints/新闻采集-checkpoint.ipynb new file mode 100644 index 0000000..acccd6c --- /dev/null +++ b/.ipynb_checkpoints/新闻采集-checkpoint.ipynb @@ -0,0 +1,136 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id:74894\n", + "pubDate:2020-08-12 17:17:01\n", + "pubDateStr:9分钟前\n", + "title:以色列新增366例新冠肺炎确诊病例 累计86959例\n", + "summary:当地时间8月12日上午,以色列卫生部公布的疫情通报显示,11日23时至12日11时,以色列新增366例新冠肺炎确诊病例,累计达86959例,无新增死亡病例,累计死亡622人。截至目前,现存确诊病例24761例,有377人病情危重,其中110人使用呼吸机,已有61576人治愈康复。\n", + "infoSource:央视新闻app\n", + "\n", + "\n", + "id:74891\n", + "pubDate:2020-08-12 17:14:28\n", + "pubDateStr:11分钟前\n", + "title:巴林新增407例新冠肺炎确诊病例 累计44804例\n", + "summary:巴林卫生部当地时间12日上午发布通告,确认境内新增407例新型冠状病毒肺炎确诊病例,同时新增2例死亡病例。至此巴林境内累计已发现44804例新型冠状病毒肺炎确诊病例,其中41504名患者已经治愈,165名患者病重不治去世。3135名仍在医院接受治疗的患者中38人病情较重。\n", + "infoSource:央视新闻app\n", + "\n", + "\n", + "id:74884\n", + "pubDate:2020-08-12 16:46:46\n", + "pubDateStr:39分钟前\n", + "title:香港新增62例新冠肺炎确诊病例 连续第十日新增病例少于100例\n", + "summary:8月12日下午,香港特区政府卫生署卫生防护中心举行发布会。据介绍,截至12日零时,香港新增62例新冠肺炎确诊病例。这是连续第十日新增确诊病例少于100例。香港累计报告新冠肺炎确诊病例4243例。  新增确诊者中,有1例境外输入病例,剩余61例均为本地感染病例,当中有28例病例暂未查明感染源。12日,香港新增四例死亡病例,累计死亡63例。\n", + "infoSource:央视新闻app\n", + "\n", + "\n", + "id:74878\n", + "pubDate:2020-08-12 16:30:22\n", + "pubDateStr:55分钟前\n", + "title:菲律宾新增4444例新冠肺炎确诊病例 累计确诊143749例\n", + "summary:8月12日,菲律宾卫生部发布消息称,截至12日下午4点,菲律宾新增4444例新冠肺炎确诊病例,确诊病例累计达143749例。此外,新增死亡病例93例,累计死亡2404例。同时,新增治愈病例636例,累计治愈人数为68997人。\n", + "infoSource:央视新闻app\n", + "\n", + "\n", + "id:74869\n", + "pubDate:2020-08-12 15:52:13\n", + "pubDateStr:1小时前\n", + "title:捷克新增288例新冠肺炎确诊病例 累计18783例\n", + "summary:当地时间8月12日,捷克卫生部公布的新冠肺炎疫情数据显示,11日全天捷克新增288例新冠肺炎确诊病例,累计确诊18783例;治愈病例和死亡病例分别为13222例和391例。\n", + "infoSource:央视新闻app\n", + "\n", + "\n" + ] + } + ], + "source": [ + "import requests\n", + "from bs4 import BeautifulSoup\n", + "import re\n", + "import json\n", + "import time\n", + "#爬取页面\n", + "class NEWS:\n", + " def __init__(self):\n", + " self.id=0\n", + " self.pubDate=''\n", + " self.pubDateStr=''\n", + " self.title=''\n", + " self.summary=''\n", + " self.infoSource=''\n", + " \n", + " def __str__(self):\n", + " return 'id:%d\\npubDate:%s\\npubDateStr:%s\\ntitle:%s\\nsummary:%s\\ninfoSource:%s\\n'%(self.id,self.pubDate,self.pubDateStr,self.title,self.summary,self.infoSource)\n", + " \n", + " \n", + "res=requests.get('http://ncov.dxy.cn/ncovh5/view/pneumonia')\n", + "#重新解码\n", + "res=res.content.decode('utf-8')\n", + "#构建对象\n", + "soup=BeautifulSoup(res,'html.parser')\n", + "#打标签\n", + "tag = soup.find('script', attrs={'id':'getTimelineService1'})\n", + "#转换为字符串\n", + "tagstr = tag.string\n", + "#使用正则表达式提取信息\n", + "results = re.findall('(\\\\{\\\"id\\\".*?\\\"provinceId\\\".*?\\\\})', tagstr)\n", + "\n", + "all_New=[]\n", + "for item in results:\n", + " New=NEWS()\n", + " obj=json.loads(item)\n", + " New.id=obj['id']\n", + " New.pubDate=time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(float(obj['pubDate'])/1000))\n", + " New.pubDateStr=obj['pubDateStr']\n", + " New.title=obj['title']\n", + " New.summary=obj['summary']\n", + " New.infoSource=obj['infoSource']\n", + " all_New.append(New)\n", + "for item in all_New:\n", + " print(item)\n", + " print()\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/新闻采集.ipynb b/新闻采集.ipynb new file mode 100644 index 0000000..acccd6c --- /dev/null +++ b/新闻采集.ipynb @@ -0,0 +1,136 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id:74894\n", + "pubDate:2020-08-12 17:17:01\n", + "pubDateStr:9分钟前\n", + "title:以色列新增366例新冠肺炎确诊病例 累计86959例\n", + "summary:当地时间8月12日上午,以色列卫生部公布的疫情通报显示,11日23时至12日11时,以色列新增366例新冠肺炎确诊病例,累计达86959例,无新增死亡病例,累计死亡622人。截至目前,现存确诊病例24761例,有377人病情危重,其中110人使用呼吸机,已有61576人治愈康复。\n", + "infoSource:央视新闻app\n", + "\n", + "\n", + "id:74891\n", + "pubDate:2020-08-12 17:14:28\n", + "pubDateStr:11分钟前\n", + "title:巴林新增407例新冠肺炎确诊病例 累计44804例\n", + "summary:巴林卫生部当地时间12日上午发布通告,确认境内新增407例新型冠状病毒肺炎确诊病例,同时新增2例死亡病例。至此巴林境内累计已发现44804例新型冠状病毒肺炎确诊病例,其中41504名患者已经治愈,165名患者病重不治去世。3135名仍在医院接受治疗的患者中38人病情较重。\n", + "infoSource:央视新闻app\n", + "\n", + "\n", + "id:74884\n", + "pubDate:2020-08-12 16:46:46\n", + "pubDateStr:39分钟前\n", + "title:香港新增62例新冠肺炎确诊病例 连续第十日新增病例少于100例\n", + "summary:8月12日下午,香港特区政府卫生署卫生防护中心举行发布会。据介绍,截至12日零时,香港新增62例新冠肺炎确诊病例。这是连续第十日新增确诊病例少于100例。香港累计报告新冠肺炎确诊病例4243例。  新增确诊者中,有1例境外输入病例,剩余61例均为本地感染病例,当中有28例病例暂未查明感染源。12日,香港新增四例死亡病例,累计死亡63例。\n", + "infoSource:央视新闻app\n", + "\n", + "\n", + "id:74878\n", + "pubDate:2020-08-12 16:30:22\n", + "pubDateStr:55分钟前\n", + "title:菲律宾新增4444例新冠肺炎确诊病例 累计确诊143749例\n", + "summary:8月12日,菲律宾卫生部发布消息称,截至12日下午4点,菲律宾新增4444例新冠肺炎确诊病例,确诊病例累计达143749例。此外,新增死亡病例93例,累计死亡2404例。同时,新增治愈病例636例,累计治愈人数为68997人。\n", + "infoSource:央视新闻app\n", + "\n", + "\n", + "id:74869\n", + "pubDate:2020-08-12 15:52:13\n", + "pubDateStr:1小时前\n", + "title:捷克新增288例新冠肺炎确诊病例 累计18783例\n", + "summary:当地时间8月12日,捷克卫生部公布的新冠肺炎疫情数据显示,11日全天捷克新增288例新冠肺炎确诊病例,累计确诊18783例;治愈病例和死亡病例分别为13222例和391例。\n", + "infoSource:央视新闻app\n", + "\n", + "\n" + ] + } + ], + "source": [ + "import requests\n", + "from bs4 import BeautifulSoup\n", + "import re\n", + "import json\n", + "import time\n", + "#爬取页面\n", + "class NEWS:\n", + " def __init__(self):\n", + " self.id=0\n", + " self.pubDate=''\n", + " self.pubDateStr=''\n", + " self.title=''\n", + " self.summary=''\n", + " self.infoSource=''\n", + " \n", + " def __str__(self):\n", + " return 'id:%d\\npubDate:%s\\npubDateStr:%s\\ntitle:%s\\nsummary:%s\\ninfoSource:%s\\n'%(self.id,self.pubDate,self.pubDateStr,self.title,self.summary,self.infoSource)\n", + " \n", + " \n", + "res=requests.get('http://ncov.dxy.cn/ncovh5/view/pneumonia')\n", + "#重新解码\n", + "res=res.content.decode('utf-8')\n", + "#构建对象\n", + "soup=BeautifulSoup(res,'html.parser')\n", + "#打标签\n", + "tag = soup.find('script', attrs={'id':'getTimelineService1'})\n", + "#转换为字符串\n", + "tagstr = tag.string\n", + "#使用正则表达式提取信息\n", + "results = re.findall('(\\\\{\\\"id\\\".*?\\\"provinceId\\\".*?\\\\})', tagstr)\n", + "\n", + "all_New=[]\n", + "for item in results:\n", + " New=NEWS()\n", + " obj=json.loads(item)\n", + " New.id=obj['id']\n", + " New.pubDate=time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(float(obj['pubDate'])/1000))\n", + " New.pubDateStr=obj['pubDateStr']\n", + " New.title=obj['title']\n", + " New.summary=obj['summary']\n", + " New.infoSource=obj['infoSource']\n", + " all_New.append(New)\n", + "for item in all_New:\n", + " print(item)\n", + " print()\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}