diff --git a/2.ipynb b/丁香园爬虫.ipynb similarity index 88% rename from 2.ipynb rename to 丁香园爬虫.ipynb index 3a65efa..5d11c57 100644 --- a/2.ipynb +++ b/丁香园爬虫.ipynb @@ -1,49 +1,5 @@ { "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 79, - "metadata": {}, - "outputs": [], - "source": [ - "class Province:\n", - " def __init__(self):\n", - " self.provinceName = ''\n", - " self.provinceShortName = ''\n", - " self.currentConfirmedCount = 0 #现有确诊病例数\n", - " self.confirmedCount = 0 #累计确诊\n", - " self.suspectedCount = 0 #疑似病例\n", - " self.curedCount = 0 #累计治愈\n", - " self.deadCount = 0#累计死亡\n", - " self.cities = []\n", - " \n", - " def __str__(self):\n", - " return 'provinceName:%s provinceShortName:%s currentConfirmedCount:%d \\\n", - " confirmedCount:%d suspectedCount:%d curedCount:%d deadCount :%d '%(self.provinceName,self.provinceShortName,self.currentConfirmedCount,self.confirmedCount,self.suspectedCount,self.curedCount,self.deadCount)\n", - " \n", - " \n", - "class City:\n", - " def __init__(self):\n", - " self.cityName = ''\n", - " self.currentConfirmedCount = 0\n", - " self.confirmedCount = 0\n", - " self.suspectedCount = 0\n", - " self.curedCount = 0\n", - " self.deadCount = 0\n", - " self.locationId =0\n", - " \n", - " def __str__(self):\n", - " return 'cityName:%s, currentConfirmedCount:%d, confirmedCount:%d, suspectedCount:%d,\\\n", - " curedCount:%d, deadCount:%d, locationId:%d'%(self.cityName, self.currentConfirmedCount, self.confirmedCount, self.suspectedCount, self.curedCount, self.deadCount, self.locationId)" - ] - }, { "cell_type": "code", "execution_count": 80, @@ -571,57 +527,290 @@ ] } ], + "source": [ + "'''测试'''\n", + "\n", + "# import requests\n", + "# from bs4 import BeautifulSoup\n", + "# import re\n", + "# import json\n", + "\n", + "# url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia'\n", + "# res = requests.get(url)\n", + "# content = 
import datetime

try:
    import pymysql
except ImportError:
    # Defer the failure to MyDB(): Province, City and MyDB's pure helpers do not
    # need the driver, so this cell stays usable when PyMySQL is not installed.
    pymysql = None

# Layout of the pub_time string stored on every Province and City record.
_PUB_TIME_FORMAT = '%Y-%m-%d %H:%M:%S'


class Province:
    """Epidemic figures for one province, together with its City records."""

    def __init__(self):
        self.provinceName = ''
        self.provinceShortName = ''
        self.currentConfirmedCount = 0  # currently confirmed cases
        self.confirmedCount = 0         # cumulative confirmed cases
        self.suspectedCount = 0         # suspected cases
        self.curedCount = 0             # cumulative cured
        self.deadCount = 0              # cumulative deaths
        # Creation time of this record, formatted with _PUB_TIME_FORMAT.
        self.pub_time = datetime.datetime.now().strftime(_PUB_TIME_FORMAT)
        self.cities = []

    def __str__(self):
        # Adjacent literals instead of a backslash continuation inside the
        # string, so source indentation no longer leaks into the output.
        return (
            'provinceName:%s provinceShortName:%s currentConfirmedCount:%d '
            'confirmedCount:%d suspectedCount:%d curedCount:%d deadCount :%d '
        ) % (
            self.provinceName,
            self.provinceShortName,
            self.currentConfirmedCount,
            self.confirmedCount,
            self.suspectedCount,
            self.curedCount,
            self.deadCount,
        )

    def get_info_tuple(self):
        """Return the fields in the column order used by MyDB.save_province_datas."""
        return (
            self.provinceName,
            self.provinceShortName,
            self.currentConfirmedCount,
            self.confirmedCount,
            self.suspectedCount,
            self.curedCount,
            self.deadCount,
            self.pub_time,
        )


class City:
    """Epidemic figures for one city; `province` holds the owning province name."""

    def __init__(self):
        self.cityName = ''
        self.currentConfirmedCount = 0
        self.confirmedCount = 0
        self.suspectedCount = 0
        self.curedCount = 0
        self.deadCount = 0
        self.locationId = 0
        self.province = ''
        # Creation time of this record, formatted with _PUB_TIME_FORMAT.
        self.pub_time = datetime.datetime.now().strftime(_PUB_TIME_FORMAT)

    def __str__(self):
        # Same fix as Province.__str__: no indentation embedded in the text.
        return (
            'cityName:%s, currentConfirmedCount:%d, confirmedCount:%d, suspectedCount:%d, '
            'curedCount:%d, deadCount:%d, locationId:%d, pub_time:%s ,province:%s '
        ) % (
            self.cityName,
            self.currentConfirmedCount,
            self.confirmedCount,
            self.suspectedCount,
            self.curedCount,
            self.deadCount,
            self.locationId,
            self.pub_time,
            self.province,
        )

    def get_info_tuple(self):
        """Return the fields in the column order used by MyDB.save_city_datas."""
        return (
            self.cityName,
            self.currentConfirmedCount,
            self.confirmedCount,
            self.suspectedCount,
            self.curedCount,
            self.deadCount,
            self.locationId,
            self.province,
            self.pub_time,
        )


class MyDB:
    """Small wrapper around one PyMySQL connection used to store scraped rows."""

    def __init__(self, host, user, passwd, db):
        """Open the connection and a cursor.

        Raises ImportError when PyMySQL is not installed; errors raised by
        pymysql.connect() propagate unchanged.
        """
        # Assigned before connecting so close()/__del__ stay safe if connect() raises.
        self.conn = None
        self.cursor = None
        if pymysql is None:
            raise ImportError('pymysql is required to open a MySQL connection')
        # Keyword arguments: PyMySQL 1.0 made connect() keyword-only and
        # deprecated the `passwd`/`db` spellings in favour of these two.
        self.conn = pymysql.connect(host=host, user=user, password=passwd, database=db)
        self.cursor = self.conn.cursor()

    def get_province_list_tuple(self, all_province):
        """Return one get_info_tuple() row per Province in `all_province`."""
        return [item.get_info_tuple() for item in all_province]

    def get_city_list_tuple(self, all_city):
        """Return one get_info_tuple() row per City in `all_city`."""
        return [item.get_info_tuple() for item in all_city]

    def _insert_many(self, sql, rows, label):
        """Run `sql` for every row in one transaction, logging under `label`."""
        print("+++++++ %s, datas len:%d" % (label, len(rows)))
        try:
            self.cursor.executemany(sql, rows)
            self.conn.commit()
        except Exception as e:
            # Best effort, as before: report and continue, but undo the partial
            # insert so the connection is not left inside a failed transaction.
            print(e)
            try:
                self.conn.rollback()
            except Exception as rollback_error:
                print(rollback_error)
        print("++++++++++++ %s is over" % label)

    # Save province rows.
    def save_province_datas(self, all_province):
        """Insert every Province of `all_province` into province_daily_datas."""
        sql = (
            'insert into province_daily_datas(provinceName,provinceShortName,'
            'currentConfirmedCount,confirmedCount,suspectedCount,curedCount,deadCount,pub_time) '
            'values(%s,%s,%s,%s,%s,%s,%s,%s)'
        )
        rows = self.get_province_list_tuple(all_province)
        self._insert_many(sql, rows, 'save_province_datas')

    # Save city rows.
    def save_city_datas(self, all_city):
        """Insert every City of `all_city` into city_daily_datas."""
        sql = (
            'insert into city_daily_datas(cityName,currentConfirmedCount,confirmedCount,'
            'suspectedCount,curedCount,deadCount,locationId,province,pub_time) '
            'values(%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        )
        rows = self.get_city_list_tuple(all_city)
        self._insert_many(sql, rows, 'save_city_daily_datas')

    def close(self):
        """Close the cursor and the connection; calling it again is a no-op."""
        # getattr: the attributes may be missing on a half-constructed instance.
        cursor = getattr(self, 'cursor', None)
        conn = getattr(self, 'conn', None)
        self.cursor = None
        self.conn = None
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()

    def __del__(self):
        self.close()
import datetime
import getpass
import json
import os
import re

import requests
from bs4 import BeautifulSoup

# Seconds to wait for the remote page before failing instead of hanging the kernel.
REQUEST_TIMEOUT_SECONDS = 10

# One province object inside the getAreaStat script text: from its leading
# "provinceName" key up to the first "]}" that follows its "cities" key.
# Raw string so that \{ \] \} are regex escapes, not invalid string escapes.
PROVINCE_JSON_PATTERN = re.compile(r'\{"provinceName":.*?"cities":.*?\]\}')


class DataService:
    """Business logic: fetch the DXY epidemic page, parse it, store it in MySQL."""

    def __init__(self, db=None):
        """Create the service.

        db: optional ready-made MyDB-like object (needs save_province_datas and
            save_city_datas). When omitted, a MyDB connection is opened from
            the environment, see _connect_from_environment().
        """
        self.url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia'
        # Filled by parse_html_page(); defined here so fetch_province_datas()
        # yields empty lists instead of raising AttributeError when called first.
        self.results = []
        self.db = db if db is not None else self._connect_from_environment()

    @staticmethod
    def _connect_from_environment():
        """Open MyDB using DXY_DB_HOST / DXY_DB_USER / DXY_DB_NAME / DXY_DB_PASSWORD.

        The password is never stored in the notebook: it is read from
        DXY_DB_PASSWORD, or asked for interactively when that variable is unset
        (set the variable for unattended runs).
        """
        password = os.environ.get('DXY_DB_PASSWORD')
        if password is None:
            password = getpass.getpass('MySQL password: ')
        return MyDB(
            host=os.environ.get('DXY_DB_HOST', 'localhost'),
            user=os.environ.get('DXY_DB_USER', 'root'),
            passwd=password,
            db=os.environ.get('DXY_DB_NAME', 'zhengenhao'),
        )

    # Fetch the web page.
    def fetch_html_page(self):
        """Download self.url and return its body decoded as UTF-8.

        Raises requests.RequestException on timeout, connection failure or a
        non-success HTTP status.
        """
        response = requests.get(self.url, timeout=REQUEST_TIMEOUT_SECONDS)
        # Fail here rather than later while parsing an error page.
        response.raise_for_status()
        return response.content.decode('utf-8')

    # Parse the web page.
    def parse_html_page(self, html):
        """Extract the per-province JSON snippets from `html`.

        Stores the list of JSON strings in self.results and also returns it.
        Raises ValueError when the <script id="getAreaStat"> element is absent
        or has no text.
        """
        soup = BeautifulSoup(html, 'html.parser')
        tag = soup.find('script', attrs={'id': 'getAreaStat'})
        if tag is None or tag.string is None:
            raise ValueError('script#getAreaStat not found in the page; the page layout may have changed')
        self.results = PROVINCE_JSON_PATTERN.findall(tag.string)
        return self.results

    # Extract every province and its cities.
    def fetch_province_datas(self):
        """Build Province and City objects from self.results.

        Returns (all_province, all_city); each City is also appended to its
        Province.cities. Every record of one call shares the same pub_time.
        Raises KeyError when an expected field is missing from the JSON.
        """
        # One timestamp for the whole scrape, so all rows of a run can be
        # grouped by pub_time instead of straddling a second boundary.
        snapshot_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        all_province = []
        all_city = []

        for item in self.results:
            obj = json.loads(item)

            province = Province()
            province.pub_time = snapshot_time
            province.provinceName = obj["provinceName"]
            province.provinceShortName = obj["provinceShortName"]
            province.currentConfirmedCount = obj["currentConfirmedCount"]
            province.confirmedCount = obj["confirmedCount"]
            province.suspectedCount = obj["suspectedCount"]
            province.curedCount = obj["curedCount"]
            province.deadCount = obj["deadCount"]

            # City records of this province.
            for cityItem in obj["cities"]:
                city = City()
                city.pub_time = snapshot_time
                # Record which province the city belongs to.
                city.province = province.provinceName
                city.cityName = cityItem["cityName"]
                city.currentConfirmedCount = cityItem["currentConfirmedCount"]
                city.confirmedCount = cityItem["confirmedCount"]
                city.suspectedCount = cityItem["suspectedCount"]
                city.curedCount = cityItem["curedCount"]
                city.deadCount = cityItem["deadCount"]
                city.locationId = cityItem["locationId"]
                all_city.append(city)
                province.cities.append(city)

            all_province.append(province)

        return all_province, all_city

    # Business entry point.
    def process_data(self):
        """Fetch, parse and persist one snapshot of province and city data."""
        html = self.fetch_html_page()
        self.parse_html_page(html)
        all_province, all_city = self.fetch_province_datas()
        # Save province rows.
        self.db.save_province_datas(all_province)
        # Save city rows.
        self.db.save_city_datas(all_city)


# Create the DataService object and run one scrape. The guard is true inside a
# notebook kernel (where __name__ is "__main__"), so the cell behaves as before,
# while importing this code elsewhere no longer triggers network and DB access.
if __name__ == '__main__':
    ds = DataService()
    ds.process_data()
"provinceShortName", + "OrderNo": 3, + "DisplayName": "省份缩写", + "DataType": 1, + "Nullable": true + }, + { + "ID": 4, + "Name": "currentConfirmedCount", + "OrderNo": 4, + "DisplayName": "现有确诊病例数", + "DataType": 2, + "Nullable": true + }, + { + "ID": 5, + "Name": "confirmedCount", + "OrderNo": 5, + "DisplayName": "累计确诊", + "DataType": 2, + "Nullable": true + }, + { + "ID": 6, + "Name": "suspectedCount", + "OrderNo": 6, + "DisplayName": "疑似病例", + "DataType": 2, + "Nullable": true + }, + { + "ID": 7, + "Name": "curedCount ", + "OrderNo": 7, + "DisplayName": "累计治愈", + "DataType": 2, + "Nullable": true + }, + { + "ID": 8, + "Name": "deadCount", + "OrderNo": 8, + "DisplayName": "累计死亡", + "DataType": 2, + "Nullable": true + }, + { + "ID": 10, + "Name": "pub_time", + "OrderNo": 9, + "DisplayName": "时间", + "DataType": 1, + "Nullable": true + } + ] + } + }, + { + "ID": 2, + "Name": "city_daily_datas", + "Caption": "各国内城市每日疫情数据", + "CreateDate": "2020/8/13 14:35:11", + "OrderNo": 2, + "GraphDesc": "Left=460.00\r\nTop=10.00", + "MetaFields": { + "Count": 9, + "items": [ + { + "ID": 1, + "Name": "Id", + "OrderNo": 1, + "DisplayName": "编号", + "DataType": 2, + "KeyFieldType": 1, + "DefaultValue": "{auto_increment}", + "Nullable": true + }, + { + "ID": 2, + "Name": "cityName", + "OrderNo": 2, + "DisplayName": "城市名称", + "DataType": 1, + "Nullable": true + }, + { + "ID": 3, + "Name": "currentConfirmedCount", + "OrderNo": 3, + "DisplayName": "现有确诊", + "DataType": 2, + "Nullable": true + }, + { + "ID": 4, + "Name": "confirmedCount", + "OrderNo": 4, + "DisplayName": "累计确诊", + "DataType": 2, + "Nullable": true + }, + { + "ID": 22, + "Name": "suspectedCount", + "OrderNo": 5, + "DisplayName": "疑似病例", + "DataType": 2, + "Nullable": true + }, + { + "ID": 23, + "Name": "curedCount", + "OrderNo": 6, + "DisplayName": "治愈人数", + "DataType": 2, + "Nullable": true + }, + { + "ID": 24, + "Name": "deadCount", + "OrderNo": 7, + "DisplayName": "死亡人数", + "DataType": 2, + "Nullable": true + }, + { + 
"ID": 25, + "Name": "pub_date", + "OrderNo": 8, + "DisplayName": "时间", + "DataType": 1, + "Nullable": true + }, + { + "ID": 26, + "Name": "province", + "OrderNo": 9, + "DisplayName": "所属省份", + "DataType": 1, + "Nullable": true + } + ] + } + } + ] + } + }] +} \ No newline at end of file