爬虫模块正式完成

master
SaiCateDoan 4 years ago
commit af11081773

@ -0,0 +1,8 @@
[codestyle]
indentation = True
edge_line = True
edge_line_columns = 79
[main]
version = 0.2.0

@ -0,0 +1,5 @@
[codestyle]
indentation = True
edge_line = True
edge_line_columns = 79

@ -0,0 +1,3 @@
[encoding]
text_encoding = utf-8

@ -0,0 +1,4 @@
[vcs]
use_version_control = False
version_control_system =

@ -0,0 +1,6 @@
[workspace]
restore_data_on_startup = True
save_data_on_exit = True
save_history = True
save_non_project_files = False

@ -0,0 +1,6 @@
[encoding]
text_encoding = utf-8
[main]
version = 0.2.0

@ -0,0 +1,7 @@
[vcs]
use_version_control = False
version_control_system =
[main]
version = 0.2.0

@ -0,0 +1,10 @@
[workspace]
restore_data_on_startup = True
save_data_on_exit = True
save_history = True
save_non_project_files = False
[main]
version = 0.2.0
recent_files = []

@ -0,0 +1,556 @@
import time
import datetime
import requests
from bs4 import BeautifulSoup
import re
import json
import datetime
import pymysql
def Inside(ds):
    """Scrape domestic province/city COVID-19 figures and store them in MySQL.

    The figures are read from the ``getAreaStat`` script tag of the DXY
    pneumonia page and written to the ``province_daily_datas`` and
    ``city_daily_datas`` tables, replacing the rows already stored for the
    current date.

    Args:
        ds: Database settings as a sequence ``(host, user, password,
            database)``.
    """
    # Seconds to wait for the remote page; without a timeout requests.get()
    # may block forever and stall the hourly loop.
    request_timeout = 30

    class Province:
        """One province-level record."""

        def __init__(self):
            self.provinceName = ''
            self.provinceShortName = ''
            self.currentConfirmedCount = 0  # currently confirmed cases
            self.confirmedCount = 0  # cumulative confirmed cases
            self.suspectedCount = 0  # suspected cases
            self.curedCount = 0  # cumulative cured
            self.deadCount = 0  # cumulative deaths
            # Timestamp of the crawl.
            self.pub_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            self.cities = []

        def __str__(self):
            return (
                'provinceName:%s provinceShortName:%s currentConfirmedCount:%d '
                'confirmedCount:%d suspectedCount:%d curedCount:%d deadCount :%d '
                % (self.provinceName, self.provinceShortName,
                   self.currentConfirmedCount, self.confirmedCount,
                   self.suspectedCount, self.curedCount, self.deadCount)
            )

        def get_info_tuple(self):
            """Return the fields in the column order of province_daily_datas."""
            return (self.provinceName, self.provinceShortName,
                    self.currentConfirmedCount, self.confirmedCount,
                    self.suspectedCount, self.curedCount, self.deadCount,
                    self.pub_time)

    class City:
        """One city-level record, tagged with the name of its province."""

        def __init__(self):
            self.cityName = ''
            self.currentConfirmedCount = 0
            self.confirmedCount = 0
            self.suspectedCount = 0
            self.curedCount = 0
            self.deadCount = 0
            self.locationId = 0
            self.province = ''
            # Timestamp of the crawl.
            self.pub_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        def __str__(self):
            return (
                'cityName:%s, currentConfirmedCount:%d, confirmedCount:%d, suspectedCount:%d,'
                'curedCount:%d, deadCount:%d, locationId:%d, pub_time:%s ,province:%s '
                % (self.cityName, self.currentConfirmedCount,
                   self.confirmedCount, self.suspectedCount, self.curedCount,
                   self.deadCount, self.locationId, self.pub_time,
                   self.province)
            )

        def get_info_tuple(self):
            """Return the fields in the column order of city_daily_datas."""
            return (self.cityName, self.currentConfirmedCount,
                    self.confirmedCount, self.suspectedCount, self.curedCount,
                    self.deadCount, self.locationId, self.province,
                    self.pub_time)

    class MyDB:
        """Thin wrapper around one pymysql connection and cursor."""

        def __init__(self, host, user, passwd, db):
            # Set first so close()/__del__ are safe even if connect() raises.
            self.conn = None
            # Keyword arguments: PyMySQL >= 1.0 no longer accepts these
            # parameters positionally.
            self.conn = pymysql.connect(host=host, user=user,
                                        password=passwd, database=db)
            self.cursor = self.conn.cursor()

        def get_province_list_tuple(self, all_province):
            """Convert Province objects into a list of row tuples."""
            return [item.get_info_tuple() for item in all_province]

        def get_city_list_tuple(self, all_city):
            """Convert City objects into a list of row tuples."""
            return [item.get_info_tuple() for item in all_city]

        def _replace_todays_rows(self, table, insert_sql, rows):
            """Swap today's rows of ``table`` for ``rows`` in one transaction.

            Returns True when the new rows were committed, False otherwise.
            """
            if not rows:
                # An empty scrape must not wipe the data already stored.
                print('no rows for %s, existing data left untouched' % table)
                return False
            today = datetime.datetime.now().strftime('%Y-%m-%d')
            # ``table`` is an internal constant; the date is bound as a
            # parameter instead of being formatted into the statement.
            delete_sql = 'delete from ' + table + ' where pub_time like %s'
            try:
                self.cursor.execute(delete_sql, (today + '%',))
                self.cursor.executemany(insert_sql, rows)
                self.conn.commit()
            except Exception as e:
                # Undo the delete so a failed insert does not lose today's rows.
                self.conn.rollback()
                print(e)
                return False
            return True

        # Save province data.
        def save_province_datas(self, all_province):
            """Replace today's province rows with ``all_province``."""
            sql = ('insert into province_daily_datas(provinceName,provinceShortName,'
                   'currentConfirmedCount,confirmedCount,suspectedCount,curedCount,'
                   'deadCount,pub_time) '
                   'values(%s,%s,%s,%s,%s,%s,%s,%s)')
            res = self.get_province_list_tuple(all_province)
            print("+++++++ save_province_datas, datas len:%d" % (len(res)))
            self._replace_todays_rows('province_daily_datas', sql, res)
            print("++++++++++++ save_province_datas is over")

        # Save city data.
        def save_city_datas(self, all_city):
            """Replace today's city rows with ``all_city``."""
            sql = ('insert into city_daily_datas(cityName,currentConfirmedCount,'
                   'confirmedCount,suspectedCount,curedCount,deadCount,locationId,'
                   'province,pub_time) '
                   'values(%s,%s,%s,%s,%s,%s,%s,%s,%s)')
            res = self.get_city_list_tuple(all_city)
            print("+++++++ save_city_daily_datas, datas len:%d" % (len(res)))
            self._replace_todays_rows('city_daily_datas', sql, res)
            print("++++++++++++ save_city_daily_datas is over")

        def close(self):
            """Close the connection; safe to call more than once."""
            conn, self.conn = self.conn, None
            if conn is not None:
                try:
                    conn.close()
                except Exception as e:
                    print(e)

        def __del__(self):
            self.close()

    class DataService:
        """Fetches the page, parses it and hands the records to MyDB."""

        def __init__(self, ds):
            self.url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia'
            self.results = []
            self.db = MyDB(host=ds[0], user=ds[1], passwd=ds[2], db=ds[3])

        # Fetch the web page.
        def fetch_html_page(self):
            """Download the page and return it decoded as UTF-8 text."""
            res = requests.get(self.url, timeout=request_timeout)
            res.raise_for_status()
            return res.content.decode('utf-8')

        # Parse the web page.
        def parse_html_page(self, html):
            """Collect the per-province JSON fragments into ``self.results``."""
            soup = BeautifulSoup(html, 'html.parser')
            tag = soup.find('script', attrs={'id': 'getAreaStat'})
            if tag is None or tag.string is None:
                # Page layout changed or the request was blocked.
                print('getAreaStat script tag not found')
                self.results = []
                return
            self.results = re.findall(
                r'\{"provinceName":.*?"cities":.*?\]\}', tag.string)

        # Extract every province together with its cities.
        def fetch_province_datas(self):
            """Build Province and City objects from the parsed fragments.

            Returns:
                A pair ``(all_province, all_city)`` of lists.
            """
            all_province = []
            all_city = []
            for item in self.results:
                obj = json.loads(item)
                province = Province()
                province.provinceName = obj["provinceName"]
                province.provinceShortName = obj["provinceShortName"]
                province.currentConfirmedCount = obj["currentConfirmedCount"]
                province.confirmedCount = obj["confirmedCount"]
                province.suspectedCount = obj["suspectedCount"]
                province.curedCount = obj["curedCount"]
                province.deadCount = obj["deadCount"]
                # Extract the city data of this province.
                for cityItem in obj["cities"]:
                    city = City()
                    city.province = province.provinceName
                    city.cityName = cityItem["cityName"]
                    city.currentConfirmedCount = cityItem["currentConfirmedCount"]
                    city.confirmedCount = cityItem["confirmedCount"]
                    city.suspectedCount = cityItem["suspectedCount"]
                    city.curedCount = cityItem["curedCount"]
                    city.deadCount = cityItem["deadCount"]
                    city.locationId = cityItem["locationId"]
                    all_city.append(city)
                    province.cities.append(city)
                all_province.append(province)
            return all_province, all_city

        # Business entry point.
        def process_data(self):
            """Run the whole fetch -> parse -> store pipeline once."""
            html = self.fetch_html_page()
            self.parse_html_page(html)
            all_province, all_city = self.fetch_province_datas()
            # Save province data.
            self.db.save_province_datas(all_province)
            # Save city data.
            self.db.save_city_datas(all_city)

    # Create the DataService object and run it once.
    service = DataService(ds)
    try:
        service.process_data()
    finally:
        # Release the connection even when scraping or saving fails.
        service.db.close()
def Outside(ds):
    """Scrape per-country COVID-19 figures and store them in MySQL.

    The figures are read from the ``getListByCountryTypeService2true`` script
    tag of the DXY pneumonia page and written to ``country_daily_datas``,
    replacing the rows already stored for the current date.

    Args:
        ds: Database settings as a sequence ``(host, user, password,
            database)``.
    """
    # Seconds to wait for the remote page; without a timeout requests.get()
    # may block forever and stall the hourly loop.
    request_timeout = 30

    class Country:
        """One country-level record."""

        def __init__(self):
            self.countryName = ''
            self.currentConfirmedCount = 0  # currently confirmed cases
            self.confirmedCount = 0  # cumulative confirmed cases
            self.confirmedCountRank = 0  # rank by cumulative confirmed cases
            self.curedCount = 0  # cumulative cured
            self.deadCount = 0  # cumulative deaths
            self.deadCountRank = 0  # rank by cumulative deaths
            self.deadRate = 0.0  # death rate
            self.deadRateRank = 0  # rank by death rate
            self.updatedTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        def get_info_tuple(self):
            """Return the fields in the column order of country_daily_datas."""
            return (self.countryName, self.currentConfirmedCount,
                    self.confirmedCount, self.confirmedCountRank,
                    self.curedCount, self.deadCount, self.deadCountRank,
                    self.deadRate, self.deadRateRank, self.updatedTime)

        def __str__(self):
            # deadRate is a float, so it is rendered with %s; the previous %d
            # silently truncated it to an integer.
            return (
                'countryName:%s,currentConfirmedCount:%d,confirmedCount:%d,'
                'confirmedCountRankt:%d,curedCount:%d,deadCount:%d,deadCountRank:%d,'
                'deadRate:%s,deadRateRank:%d,updatedTime:%s'
                % (self.countryName, self.currentConfirmedCount,
                   self.confirmedCount, self.confirmedCountRank,
                   self.curedCount, self.deadCount, self.deadCountRank,
                   self.deadRate, self.deadRateRank, self.updatedTime)
            )

    class MyDB:
        """Thin wrapper around one pymysql connection and cursor."""

        def __init__(self, host, user, passwd, db):
            # Set first so close()/__del__ are safe even if connect() raises.
            self.conn = None
            # Keyword arguments: PyMySQL >= 1.0 no longer accepts these
            # parameters positionally.
            self.conn = pymysql.connect(host=host, user=user,
                                        password=passwd, database=db)
            self.cursor = self.conn.cursor()

        def get_country_list_tuple(self, all_country):
            """Convert Country objects into a list of row tuples."""
            return [item.get_info_tuple() for item in all_country]

        # Save the data.
        def save_country_datas(self, all_country):
            """Replace today's country rows with ``all_country``."""
            res = self.get_country_list_tuple(all_country)
            print('+++ save_country_datas, data len: %d' % len(res))
            if not res:
                # An empty scrape must not wipe the data already stored.
                print('+++ save_country_datas is over.')
                return
            today = datetime.datetime.now().strftime('%Y-%m-%d')
            # The timestamp column of country_daily_datas is pub_time (the
            # insert below writes to it), so the delete filters on it too.
            delete_sql = 'delete from country_daily_datas where pub_time like %s'
            insert_sql = ('insert into country_daily_datas(countryName,currentConfirmedCount,'
                          'confirmedCount,confirmedCountRank,curedCount,deadCount,'
                          'deadCountRank,deadRate,deadRateRank,pub_time) '
                          'values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
            try:
                # execute(), not executemany(): this is a single statement.
                self.cursor.execute(delete_sql, (today + '%',))
                self.cursor.executemany(insert_sql, res)
                self.conn.commit()
            except Exception as e:
                # Undo the delete so a failed insert does not lose today's rows.
                self.conn.rollback()
                print(e)
            print('+++ save_country_datas is over.')

        def show_country_datas(self):
            """Return every row of country_daily_datas."""
            self.cursor.execute('select * from country_daily_datas')
            return self.cursor.fetchall()

        def close(self):
            """Close the connection; safe to call more than once."""
            conn, self.conn = self.conn, None
            if conn is not None:
                try:
                    conn.close()
                except Exception as e:
                    print(e)

        def __del__(self):
            self.close()

    def forign_data_search(ds):
        """Fetch the page, build Country objects and store them."""
        # Fetch and decode the page.
        res = requests.get('https://ncov.dxy.cn/ncovh5/view/pneumonia',
                           timeout=request_timeout)
        res.raise_for_status()
        soup = BeautifulSoup(res.content.decode('utf-8'), 'html.parser')
        # Locate the script tag that carries the per-country list.
        tag = soup.find('script', attrs={'id': 'getListByCountryTypeService2true'})
        if tag is None or tag.string is None:
            # Page layout changed or the request was blocked.
            print('getListByCountryTypeService2true script tag not found')
            return
        # Each match is the JSON text of one country.
        results = re.findall(r'\{"id".*?"showRank".*?\}', tag.string)

        all_country = []
        for item in results:
            obj = json.loads(item)
            country = Country()
            country.countryName = obj['provinceName']
            country.currentConfirmedCount = int(obj['currentConfirmedCount'])
            country.confirmedCount = int(obj['confirmedCount'])
            country.curedCount = int(obj['curedCount'])
            country.deadCount = int(obj['deadCount'])
            country.deadRate = float(obj['deadRate'])
            country.updatedTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            # Some entries carry no rank fields; if any is missing, all three
            # ranks fall back to 0.
            try:
                ranks = (int(obj['deadCountRank']),
                         int(obj['deadRateRank']),
                         int(obj['confirmedCountRank']))
            except KeyError:
                ranks = (0, 0, 0)
            (country.deadCountRank,
             country.deadRateRank,
             country.confirmedCountRank) = ranks
            all_country.append(country)

        db = MyDB(host=ds[0], user=ds[1], passwd=ds[2], db=ds[3])
        try:
            db.save_country_datas(all_country)
        finally:
            # Release the connection even when saving fails.
            db.close()

    forign_data_search(ds)
def OutsideSummary(dsin):
    """Scrape the overseas summary figures and store them in MySQL.

    The figures are read from the ``getStatisticsService`` script tag of the
    DXY pneumonia page. ``outsideSummary_realtime_datas`` is emptied and
    refilled with the single fresh record.

    Args:
        dsin: Database settings as a sequence ``(host, user, password,
            database)``.
    """
    # Seconds to wait for the remote page; without a timeout requests.get()
    # may block forever and stall the hourly loop.
    request_timeout = 30

    # NOTE: inside this function the name OutsideSummary refers to the record
    # class below, not to the enclosing function.
    class OutsideSummary:
        """The overseas summary record."""

        def __init__(self):
            self.currentConfirmedCount = 0
            self.confirmedCount = 0
            self.suspectedCount = 0
            self.curedCount = 0
            self.deadCount = 0
            self.suspectedIncr = 0
            self.currentConfirmedIncr = 0
            self.confirmedIncr = 0
            self.curedIncr = 0
            self.deadIncr = 0
            self.updatedTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        def get_info_tuple(self):
            """Return the fields in the column order of the target table."""
            return (self.currentConfirmedCount, self.confirmedCount,
                    self.suspectedCount, self.curedCount, self.deadCount,
                    self.suspectedIncr, self.currentConfirmedIncr,
                    self.confirmedIncr, self.curedIncr, self.deadIncr,
                    self.updatedTime)

        def __str__(self):
            return 'currentConfirmedCount:%s, confirmedCount:%s, suspectedCount:%s, curedCount:%s, deadCount:%s, suspectedIncr:%s, currentConfirmedIncr:%s, confirmedIncr:%s, curedIncr:%s, deadIncr:%s, updatedTime:%s' % (
                self.currentConfirmedCount, self.confirmedCount,
                self.suspectedCount, self.curedCount, self.deadCount,
                self.suspectedIncr, self.currentConfirmedIncr,
                self.confirmedIncr, self.curedIncr, self.deadIncr,
                self.updatedTime)

    # Database entity class.
    class MyDB:
        """Thin wrapper around one pymysql connection and cursor."""

        def __init__(self, host, user, passwd, db):
            # Set first so close()/__del__ are safe even if connect() raises.
            self.conn = None
            # Keyword arguments: PyMySQL >= 1.0 no longer accepts these
            # parameters positionally.
            self.conn = pymysql.connect(host=host, user=user,
                                        password=passwd, database=db)
            self.cursor = self.conn.cursor()

        def get_outsideSummary_list_tuple(self, outsideSummary):
            """Wrap the record's row tuple in a one-element list."""
            return [outsideSummary.get_info_tuple()]

        # Save the data.
        def save_outsideSummary_datas(self, outsideSummary):
            """Replace the table content with ``outsideSummary``."""
            sql = 'insert into outsideSummary_realtime_datas(currentConfirmedCount,confirmedCount,suspectedCount,curedCount,deadCount,suspectedIncr,currentConfirmedIncr,confirmedIncr,curedIncr,deadIncr,updatedTime) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
            res = self.get_outsideSummary_list_tuple(outsideSummary)
            print('+++ [MyDB] delete from outsideSummary_realtime_datas')
            print('+++ save_outsideSummary_datas, data len: %d' % len(res))
            try:
                # Delete and insert share one transaction, so a failed insert
                # does not leave the table empty.
                self.cursor.execute('delete from outsideSummary_realtime_datas')
                self.cursor.executemany(sql, res)
                self.conn.commit()
            except Exception as e:
                self.conn.rollback()
                print(e)
            print('+++ save_outsideSummary_datas is over.')

        def close(self):
            """Close the connection; safe to call more than once."""
            conn, self.conn = self.conn, None
            if conn is not None:
                try:
                    conn.close()
                except Exception as e:
                    print(e)

        def __del__(self):
            self.close()

    # Business logic class.
    class DataService:
        """Owns the database handle used to store the summary."""

        def __init__(self, ds):
            self.db = MyDB(host=ds[0], user=ds[1], passwd=ds[2], db=ds[3])

    def fetch_outside_summary(obj):
        """Build an OutsideSummary record from the parsed JSON dict."""
        outsideSummary = OutsideSummary()
        outsideSummary.currentConfirmedCount = int(obj['currentConfirmedCount'])
        outsideSummary.confirmedCount = int(obj['confirmedCount'])
        outsideSummary.suspectedCount = int(obj['suspectedCount'])
        outsideSummary.curedCount = int(obj['curedCount'])
        outsideSummary.deadCount = int(obj['deadCount'])
        outsideSummary.suspectedIncr = int(obj['suspectedIncr'])
        outsideSummary.currentConfirmedIncr = int(obj['currentConfirmedIncr'])
        outsideSummary.confirmedIncr = int(obj['confirmedIncr'])
        outsideSummary.curedIncr = int(obj['curedIncr'])
        outsideSummary.deadIncr = int(obj['deadIncr'])
        outsideSummary.updatedTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        return outsideSummary

    # Fetch and decode the page.
    res = requests.get('https://ncov.dxy.cn/ncovh5/view/pneumonia',
                       timeout=request_timeout)
    res.raise_for_status()
    soup = BeautifulSoup(res.content.decode('utf-8'), 'html.parser')
    # Locate the script tag that carries the summary statistics.
    tag = soup.find('script', attrs={'id': 'getStatisticsService'})
    if tag is None or tag.string is None:
        # Page layout changed or the request was blocked.
        print('getStatisticsService script tag not found')
        return
    result = re.findall(r'\{"currentConfirmedCount".*?"deadIncr".*?\}', tag.string)
    if not result:
        # Nothing to store; leave the existing record in place.
        print('no summary data found in getStatisticsService')
        return
    obj = json.loads(result[0])
    outsideSummary = fetch_outside_summary(obj)

    # Create the DataService object and store the record.
    ds = DataService(dsin)
    try:
        ds.db.save_outsideSummary_datas(outsideSummary)
    finally:
        # Release the connection even when saving fails.
        ds.db.close()
def InsideSummary(ds):
    """Scrape the domestic summary figures and store them in MySQL.

    The figures are read from the Tencent ``disease_h5`` endpoint and written
    to ``home_realtime_datas``, replacing any row whose ``updatedTime`` falls
    on the same date as the fetched record. The table content is printed
    afterwards.

    Args:
        ds: Database settings as a sequence ``(host, user, password,
            database)``.
    """
    # Seconds to wait for the remote endpoint; without a timeout
    # requests.get() may block forever and stall the hourly loop.
    request_timeout = 30

    # NOTE: inside this function the name InsideSummary refers to the record
    # class below, not to the enclosing function.
    class InsideSummary:
        """The domestic summary record."""

        def __init__(self):
            self.curConfirm = 0  # currently confirmed
            self.curConfirmRelative = 0  # change vs. yesterday
            self.asymptomatic = 0  # asymptomatic infections
            self.asymptomaticRelative = 0  # change vs. yesterday
            self.unconfirmed = 0  # currently suspected
            self.unconfirmedRelative = 0  # change vs. yesterday
            self.icu = 0  # currently severe
            self.icuRelative = 0  # change vs. yesterday
            self.confirmed = 0  # cumulative confirmed
            self.confirmedRelative = 0  # change vs. yesterday
            self.overseasInput = 0  # cumulative imported cases
            self.overseasInputRelative = 0  # change vs. yesterday
            self.cured = 0  # cumulative cured
            self.curedRelative = 0  # change vs. yesterday
            self.died = 0  # cumulative deaths
            self.diedRelative = 0  # change vs. yesterday
            self.updatedTime = 0  # publication time

        # Database interface.
        def get_inside_summary_tuple(self):
            """Return the fields in the column order of home_realtime_datas."""
            return (self.curConfirm, self.curConfirmRelative,
                    self.asymptomatic, self.asymptomaticRelative,
                    self.unconfirmed, self.unconfirmedRelative,
                    self.icu, self.icuRelative,
                    self.confirmed, self.confirmedRelative,
                    self.overseasInput, self.overseasInputRelative,
                    self.cured, self.curedRelative,
                    self.died, self.diedRelative,
                    self.updatedTime)

        # Output interface.
        def __str__(self):
            return '%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s' % (
                self.get_inside_summary_tuple())

    # Fetch the domestic figures.
    res = requests.get('https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5',
                       timeout=request_timeout)
    res.raise_for_status()
    payload = json.loads(res.content.decode('utf-8'))  # str -> dict
    # Normalise the data: strip backslashes from every string value.
    # Non-string values are left as they are.
    for key, value in payload.items():
        if isinstance(value, str):
            payload[key] = value.replace('\\', '')
    # The 'data' entry is itself a JSON document.
    data = json.loads(payload['data'])

    def fetch_inside_summary(data):
        """Build an InsideSummary record from the decoded ``data`` dict."""
        total = data['chinaTotal']
        added = data['chinaAdd']
        insideSummary = InsideSummary()
        insideSummary.curConfirm = int(total['nowConfirm'])
        insideSummary.curConfirmRelative = int(added['nowConfirm'])
        insideSummary.asymptomatic = int(total['noInfect'])
        insideSummary.asymptomaticRelative = int(added['noInfect'])
        insideSummary.unconfirmed = int(total['suspect'])
        insideSummary.unconfirmedRelative = int(added['suspect'])
        insideSummary.icu = int(total['nowSevere'])
        insideSummary.icuRelative = int(added['nowSevere'])
        insideSummary.confirmed = int(total['confirm'])
        insideSummary.updatedTime = data['lastUpdateTime']
        insideSummary.confirmedRelative = int(added['confirm'])
        insideSummary.overseasInput = int(total['importedCase'])
        insideSummary.overseasInputRelative = int(added['importedCase'])
        insideSummary.cured = int(total['heal'])
        insideSummary.curedRelative = int(added['heal'])
        insideSummary.died = int(total['dead'])
        insideSummary.diedRelative = int(added['dead'])
        return insideSummary

    # Store the domestic summary record.
    def insert(res, ds):
        """Replace the stored row for the record's date with ``res``.

        Args:
            res: Row tuple as returned by ``get_inside_summary_tuple``.
            ds: Database settings ``(host, user, password, database)``.
        """
        # Keyword arguments keep this working with PyMySQL >= 1.0.
        conn = pymysql.connect(host=ds[0], user=ds[1], password=ds[2],
                               database=ds[3])
        try:
            cursor = conn.cursor()
            # Date part of e.g. '2020-06-01 10:00:00'.
            date = data['lastUpdateTime'].split(' ', 1)[0]
            print(date)
            # The date comes from the remote service, so it is bound as a
            # parameter instead of being formatted into the statement.
            delete_sql = 'delete from home_realtime_datas where updatedTime like %s'
            insert_sql = 'insert into home_realtime_datas(curConfirm,curConfirmRelative,asymptomatic,asymptomaticRelative,unconfirmed,unconfirmedRelative,icu,icuRelative,confirmed,confirmedRelative,overseasInput,overseasInputRelative,cured,curedRelative,died,diedRelative,updatedTime) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
            try:
                # One transaction: a failed insert must not lose the old row.
                cursor.execute(delete_sql, (date + '%',))
                cursor.execute(insert_sql, res)
                conn.commit()
                # Reported only after the statements actually succeeded.
                print('The update was successful')
            except Exception as e:
                conn.rollback()
                print(e)
            print('+++ save_province_datas is over.')
            cursor.close()
        finally:
            conn.close()

    insert(fetch_inside_summary(data).get_inside_summary_tuple(), ds)

    # Read the table back and print it.
    conn = pymysql.connect(host=ds[0], user=ds[1], password=ds[2],
                           database=ds[3])
    try:
        cursor = conn.cursor()
        cursor.execute('select * from home_realtime_datas')
        results = cursor.fetchall()
        cursor.close()
    finally:
        # The connection used to be left open on every hourly run.
        conn.close()
    print(results)
# Database settings: (host, user, password, database).
# NOTE(review): the password is hard-coded in source control; consider
# loading it from the environment or an untracked config file.
ds=['localhost','root','20Z00t10x28_my','covid19']
# Run every scraper once per hour, forever.
while True:
    for task in (Inside, Outside, OutsideSummary, InsideSummary):
        # Isolate each scraper: a transient network or database error in one
        # of them must neither skip the others nor end the hourly loop.
        try:
            task(ds)
        except Exception as exc:
            print('%s failed: %s' % (task.__name__, exc))
    time.sleep(3600)

@ -0,0 +1,75 @@
-- Schema for the COVID-19 crawler.
-- utf8mb4 is declared explicitly because the tables store Chinese province,
-- city and country names; a server whose default character set is latin1
-- would otherwise reject or garble them. IF NOT EXISTS keeps the statement
-- re-runnable.
CREATE DATABASE IF NOT EXISTS covid19 DEFAULT CHARACTER SET utf8mb4;
USE covid19;
-- Per-province figures written by the domestic crawler.
CREATE TABLE province_daily_datas (
    provinceName          VARCHAR(30),
    provinceShortName     VARCHAR(10),
    currentConfirmedCount INT,          -- currently confirmed cases
    confirmedCount        INT,          -- cumulative confirmed cases
    suspectedCount        INT,          -- suspected cases
    curedCount            INT,          -- cumulative cured
    deadCount             INT,          -- cumulative deaths
    pub_time              VARCHAR(30)   -- crawl timestamp, stored as text
);
-- Per-city figures written by the domestic crawler.
CREATE TABLE city_daily_datas (
    cityName              VARCHAR(30),
    currentConfirmedCount INT,
    confirmedCount        INT,
    suspectedCount        INT,
    curedCount            INT,
    deadCount             INT,
    locationId            INT,
    province              VARCHAR(30),  -- name of the province the city belongs to
    pub_time              VARCHAR(30)   -- crawl timestamp, stored as text
);
-- Per-country figures written by the overseas crawler.
CREATE TABLE country_daily_datas (
    countryName           VARCHAR(30),
    currentConfirmedCount INT,          -- currently confirmed cases
    confirmedCount        INT,          -- cumulative confirmed cases
    confirmedCountRank    INT,          -- rank by cumulative confirmed cases
    curedCount            INT,          -- cumulative cured
    deadCount             INT,          -- cumulative deaths
    deadCountRank         INT,          -- rank by cumulative deaths
    deadRate              FLOAT,        -- death rate
    deadRateRank          INT,          -- rank by death rate
    pub_time              VARCHAR(30)   -- crawl timestamp, stored as text
);
-- Domestic real-time summary written by the domestic summary crawler.
CREATE TABLE home_realtime_datas (
    curConfirm            INT COMMENT '现有确诊',
    curConfirmRelative    INT COMMENT '较昨日新增确诊',
    asymptomatic          INT COMMENT '无症状感染',
    asymptomaticRelative  INT COMMENT '较昨日新增无症状感染',
    unconfirmed           INT COMMENT '现有疑似',
    unconfirmedRelative   INT COMMENT '较昨日疑似新增',
    icu                   INT COMMENT '现有重症',
    icuRelative           INT COMMENT '较昨日重症病例新增',
    confirmed             INT COMMENT '累计确诊',
    confirmedRelative     INT COMMENT '较昨日累计确诊新增',
    overseasInput         INT COMMENT '累计境外输入',
    overseasInputRelative INT COMMENT '较昨日累计境外输入新增',
    cured                 INT COMMENT '累计治愈',
    curedRelative         INT COMMENT '较昨日累计治愈新增',
    died                  INT COMMENT '累计死亡',
    diedRelative          INT COMMENT '较昨日累计死亡新增',
    updatedTime           VARCHAR(4000) COMMENT '发布时间'
) COMMENT '国内实时疫情概况 该表中只保留当前最新的一条数据记录';
-- Overseas real-time summary written by the overseas summary crawler.
-- This statement replaces a second, identical
-- "create table home_realtime_datas", which failed because the table already
-- existed. The crawler deletes from and inserts into
-- outsideSummary_realtime_datas, which the script never created; the columns
-- below follow the column list of the crawler's insert statement.
CREATE TABLE outsideSummary_realtime_datas (
    currentConfirmedCount INT,
    confirmedCount        INT,
    suspectedCount        INT,
    curedCount            INT,
    deadCount             INT,
    suspectedIncr         INT,
    currentConfirmedIncr  INT,
    confirmedIncr         INT,
    curedIncr             INT,
    deadIncr              INT,
    updatedTime           VARCHAR(30)   -- crawl timestamp, stored as text
);
Loading…
Cancel
Save