import pymysql
import requests
from bs4 import BeautifulSoup
import json


class ForeignCountry:
    def __init__(self):
        self.died = 0
        self.confirmed = 0
        self.crued = 0
        self.area = ''
        self.curConfirm = 0
        self.confirmedRelative = 0
        self.pub_date = ''

    def __str__(self):
        return ('area: %s, died: %s, confirmed: %s, crued: %s, '
                'curConfirm: %s, confirmedRelative: %s, pub_date: %s' % (
                    self.area, self.died, self.confirmed, self.crued,
                    self.curConfirm, self.confirmedRelative, self.pub_date))

    def get_fcountry_info_tuple(self):
        return (self.died, self.confirmed, self.crued, self.area,
                self.curConfirm, self.confirmedRelative, self.pub_date)


class MyDB:
    def __init__(self, host, user, passwd, db):
        # PyMySQL 1.0+ requires keyword arguments here.
        self.conn = pymysql.connect(host=host, user=user,
                                    password=passwd, database=db)
        self.cursor = self.conn.cursor()

    # Build the per-country argument list for executemany().
    def get_fcountry_args_list(self, all_foreign_countries):
        all_args_list = []
        for country in all_foreign_countries:
            info = country.get_fcountry_info_tuple()
            all_args_list.append(info)
        return all_args_list

    # Save the daily foreign (outside-China) epidemic data.
    def save_outside_daily_datas(self, all_foreign_countries):
        curdate = all_foreign_countries[0].pub_date
        # Delete any rows already stored for the current day, so the
        # script can be re-run without creating duplicates.
        sql = 'delete from foreign_daily_datas where pub_date like %s'
        try:
            self.cursor.execute(sql, (curdate[:10] + '%',))
            self.conn.commit()
        except Exception as e:
            print(e)

        sql = ('insert into foreign_daily_datas(died, confirmed, crued, area, '
               'curConfirm, confirmedRelative, pub_date) '
               'values(%s, %s, %s, %s, %s, %s, %s)')
        res = self.get_fcountry_args_list(all_foreign_countries)
        print('+++ foreign_daily_datas, data len: %d' % len(res))
        try:
            self.cursor.executemany(sql, res)
            self.conn.commit()
        except Exception as e:
            print(e)
        print('+++ foreign_daily_datas is over.')

    def __del__(self):
        if self.conn is not None:
            self.conn.close()


class DataService:
    def __init__(self):
        self.url = 'https://voice.baidu.com/act/newpneumonia/newpneumonia'
        self.db = MyDB('localhost', 'root', '1999z5g24x', 'text_data_increasing')

    # Fetch the page.
    def fetch_html_page(self):
        res = requests.get(self.url)
        return res.content.decode('utf-8')

    # Parse the page: the data is embedded as JSON inside a
    # <script id="captain-config"> tag.
    def parse_target_page(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        tag = soup.find('script', attrs={'id': 'captain-config'})
        tagDict = json.loads(tag.string)
        # Extract the data-update timestamp.
        self.pub_date = tagDict['component'][0]['mapLastUpdatedTime']
        # Extract the per-country data for countries outside China.
        self.outsideDatas = tagDict['component'][0]['caseOutsideList']

    # Normalize a string: map None or '' to '0'.
    def process_str(self, s):
        if s is None or s == '':
            return '0'
        return s

    # Return target[key], or '' if the key is missing.
    def getOrElse(self, target, key):
        if target.get(key) is not None:
            return target[key]
        return ''

    # Parse the foreign data: self.outsideDatas is a list with
    # one element per country.
    def parse_outside_daily_datas(self):
        all_foreign_countries = []
        for item in self.outsideDatas:
            country = ForeignCountry()
            country.died = self.getOrElse(item, 'died')
            country.confirmed = self.getOrElse(item, 'confirmed')
            country.crued = self.getOrElse(item, 'crued')
            country.area = self.getOrElse(item, 'area')
            country.curConfirm = self.getOrElse(item, 'curConfirm')
            country.confirmedRelative = self.getOrElse(item, 'confirmedRelative')
            country.pub_date = self.pub_date
            all_foreign_countries.append(country)
        return all_foreign_countries

    # Extract the parsed content into ForeignCountry objects.
    def fetch_page_datas(self):
        all_countries = self.parse_outside_daily_datas()
        # for item in all_countries:
        #     print(item)
        return all_countries

    # Main workflow: fetch, parse, save.
    def process_data(self):
        html = self.fetch_html_page()
        self.parse_target_page(html)
        all_countries = self.fetch_page_datas()
        # Persist the data.
        self.db.save_outside_daily_datas(all_countries)


# Create the DataService object and run it.
ds = DataService()
ds.process_data()
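

# A minimal sketch of the foreign_daily_datas table this script writes to.
# The real schema is not shown in the source; the column names come from the
# INSERT statement in MyDB.save_outside_daily_datas, while the types, lengths,
# and the helper name below are assumptions. Call it once against an existing
# MyDB connection before the first run if the table does not yet exist.
def create_foreign_daily_datas_table(db):
    # Types are guesses: the scraped JSON values arrive as strings or numbers,
    # and pub_date is a timestamp string such as '2020.04.01 10:00'.
    db.cursor.execute('''
        CREATE TABLE IF NOT EXISTS foreign_daily_datas (
            id INT AUTO_INCREMENT PRIMARY KEY,
            died INT,
            confirmed INT,
            crued INT,
            area VARCHAR(64),
            curConfirm INT,
            confirmedRelative INT,
            pub_date VARCHAR(32)
        )
    ''')
    db.conn.commit()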