"""Scrape province- and city-level COVID-19 statistics from ncov.dxy.cn
and (optionally) persist them into MySQL via PyMySQL.

The page embeds its data as JSON blobs inside <script> tags, so we fetch
the raw HTML, locate the relevant <script> tags by id, and pull the JSON
records out with regular expressions before parsing them.
"""
import json
import re
import time

import pymysql
import requests
from bs4 import BeautifulSoup

# Epidemic-statistics page; its data lives inside <script> tags.
DATA_URL = 'https://ncov.dxy.cn/ncovh5/view/pneumonia'

# One JSON object per province, including its nested "cities" array.
_PROVINCE_RE = re.compile(r'\{"provinceName".*?"cities".*?\]\}')
# Country-level record carrying the data-set's modifyTime timestamp.
_COUNTRY_RE = re.compile(r'\{"id".*?"showRank":true\}')

# Parameterized INSERT statements (values are bound by the driver,
# never interpolated into the SQL string).
PROVINCE_SQL = 'insert into province_daily_datas(provinceName,provinceShortName,currentConfirmedCount,confirmedCount,suspectedCount,curedCount,deadCount,pub_date) values(%s, %s, %s, %s, %s, %s, %s, %s)'
CITY_SQL = 'insert into city_daily_datas(cityName,currentConfirmedCount,confirmedCount,suspectedCount,curedCount,deadCount,province,pub_date) values(%s, %s, %s, %s, %s, %s, %s, %s)'


def _fetch_soup():
    """Download the page and return it parsed as a BeautifulSoup document.

    Raises requests.HTTPError on a non-2xx response and
    requests.Timeout if the server does not answer within 30 s.
    """
    response = requests.get(DATA_URL, timeout=30)
    response.raise_for_status()
    # Decode explicitly: the page is UTF-8 encoded.
    html = response.content.decode('utf-8')
    return BeautifulSoup(html, 'html.parser')


def _extract_pub_date(soup):
    """Return the data-set's publish time as 'YYYY-mm-dd HH:MM:SS'.

    The country-level record exposes 'modifyTime' in epoch milliseconds;
    it is converted to seconds and formatted in local time.
    """
    tag = soup.find('script', attrs={'id': "getListByCountryTypeService2true"})
    record = json.loads(_COUNTRY_RE.findall(str(tag))[0])
    ts = float(record['modifyTime']) / 1000  # milliseconds -> seconds
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(ts))


def _extract_province_blobs(soup):
    """Return the raw per-province JSON strings found in the page."""
    tag = soup.find('script', attrs={'id': "getAreaStat"})
    return _PROVINCE_RE.findall(str(tag))


def main():
    """Fetch the data and walk every province and its cities.

    The INSERT calls were disabled in the original script and are left
    disabled here; uncomment the execute/commit pairs to persist rows.
    """
    soup = _fetch_soup()
    pub_date = _extract_pub_date(soup)
    province_blobs = _extract_province_blobs(soup)

    # NOTE(review): credentials are hard-coded; move them to environment
    # variables or a config file before sharing this script.
    # Keyword arguments are required by PyMySQL 1.0+ (positional form
    # was removed).
    db = pymysql.connect(host='localhost', user='root',
                         password='MySQL888766', database='test')
    try:
        # `with` closes the cursor even if an insert raises.
        with db.cursor() as cursor:
            for count, blob in enumerate(province_blobs, start=1):
                print("爬取中+%d" % count)
                province = json.loads(blob)
                province_row = (
                    province['provinceName'],
                    province['provinceShortName'],
                    int(province['currentConfirmedCount']),
                    int(province['confirmedCount']),
                    int(province['suspectedCount']),
                    int(province['curedCount']),
                    int(province['deadCount']),
                    pub_date,
                )
                # cursor.execute(PROVINCE_SQL, province_row)
                # db.commit()

                # City rows use their own names so they no longer shadow
                # the province-level values (a latent bug in the original).
                for city in province['cities']:
                    city_row = (
                        city['cityName'],
                        int(city['currentConfirmedCount']),
                        int(city['confirmedCount']),
                        int(city['suspectedCount']),
                        int(city['curedCount']),
                        int(city['deadCount']),
                        province['provinceShortName'],
                        pub_date,
                    )
                    # cursor.execute(CITY_SQL, city_row)
                    # db.commit()
    finally:
        # Always release the connection, even on a mid-loop failure
        # (the original only closed it on the success path).
        db.close()
    print("爬取成功")


if __name__ == "__main__":
    main()