国内爬虫代码及表格

dev_m
1985682903@qq.com 4 years ago
parent ed3e686203
commit 9549a887f4

@ -1,49 +1,5 @@
{ {
"cells": [ "cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
"class Province:\n",
" def __init__(self):\n",
" self.provinceName = ''\n",
" self.provinceShortName = ''\n",
" self.currentConfirmedCount = 0 #现有确诊病例数\n",
" self.confirmedCount = 0 #累计确诊\n",
" self.suspectedCount = 0 #疑似病例\n",
" self.curedCount = 0 #累计治愈\n",
" self.deadCount = 0#累计死亡\n",
" self.cities = []\n",
" \n",
" def __str__(self):\n",
" return 'provinceName:%s provinceShortName:%s currentConfirmedCount:%d \\\n",
" confirmedCount:%d suspectedCount:%d curedCount:%d deadCount :%d '%(self.provinceName,self.provinceShortName,self.currentConfirmedCount,self.confirmedCount,self.suspectedCount,self.curedCount,self.deadCount)\n",
" \n",
" \n",
"class City:\n",
" def __init__(self):\n",
" self.cityName = ''\n",
" self.currentConfirmedCount = 0\n",
" self.confirmedCount = 0\n",
" self.suspectedCount = 0\n",
" self.curedCount = 0\n",
" self.deadCount = 0\n",
" self.locationId =0\n",
" \n",
" def __str__(self):\n",
" return 'cityName:%s, currentConfirmedCount:%d, confirmedCount:%d, suspectedCount:%d,\\\n",
" curedCount:%d, deadCount:%d, locationId:%d'%(self.cityName, self.currentConfirmedCount, self.confirmedCount, self.suspectedCount, self.curedCount, self.deadCount, self.locationId)"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 80, "execution_count": 80,
@ -571,57 +527,290 @@
] ]
} }
], ],
"source": [
"'''测试'''\n",
"\n",
"# import requests\n",
"# from bs4 import BeautifulSoup\n",
"# import re\n",
"# import json\n",
"\n",
"# url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia'\n",
"# res = requests.get(url)\n",
"# content = res.content.decode('utf-8')\n",
"# # print(content)\n",
"\n",
"# soup = BeautifulSoup(content,'html.parser')\n",
"\n",
"# tag = soup.find('script',attrs = {'id':'getAreaStat'})\n",
"# tagstr = tag.string\n",
"\n",
"# results = re.findall('\\{\"provinceName\":.*?\"cities\":.*?\\]\\}',tagstr)\n",
"# # print(result)\n",
"# all_province = []\n",
"\n",
"# for item in results:\n",
"# province = Province()\n",
"# obj = json.loads(item)\n",
"# province.provinceName = obj[\"provinceName\"]\n",
"# province.provinceShortName = obj[\"provinceShortName\"]\n",
"# province.currentConfirmedCount = obj[\"currentConfirmedCount\"]\n",
"# province.confirmedCount = obj[\"confirmedCount\"]\n",
"# province.suspectedCount = obj[\"suspectedCount\"]\n",
"# province.curedCount = obj[\"curedCount\"]\n",
"# province.deadCount = obj[\"deadCount\"]\n",
" \n",
"# cities = obj[\"cities\"]\n",
"# for cityItem in cities:\n",
"# # print(cityItem)\n",
"# city = City()\n",
" \n",
"# city.cityName = cityItem[\"cityName\"]\n",
"# city.currentConfirmedCount = cityItem[\"currentConfirmedCount\"]\n",
"# city.confirmedCount = cityItem[\"confirmedCount\"]\n",
"# city.suspectedCount = cityItem[\"suspectedCount\"]\n",
"# city.curedCount = cityItem[\"curedCount\"]\n",
"# city.deadCount = cityItem[\"deadCount\"]\n",
"# city.locationId = cityItem[\"locationId\"]\n",
"# province.cities.append(city)\n",
"# all_province.append(province)\n",
"\n",
"# for item in all_province:\n",
"# print(item)\n",
"# for i in item.cities:\n",
"# print(i)\n",
"# print(\"++++++++++++++++++++++++++++++++++++++++++++++\"*4)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"import datetime\n",
"class Province:\n",
" def __init__(self):\n",
" self.provinceName = ''\n",
" self.provinceShortName = ''\n",
" self.currentConfirmedCount = 0 #现有确诊病例数\n",
" self.confirmedCount = 0 #累计确诊\n",
" self.suspectedCount = 0 #疑似病例\n",
" self.curedCount = 0 #累计治愈\n",
" self.deadCount = 0#累计死亡\n",
" self.pub_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')#时间\n",
" self.cities = []\n",
" \n",
" def __str__(self):\n",
" return 'provinceName:%s provinceShortName:%s currentConfirmedCount:%d \\\n",
" confirmedCount:%d suspectedCount:%d curedCount:%d deadCount :%d '%(self.provinceName,self.provinceShortName,self.currentConfirmedCount,self.confirmedCount,self.suspectedCount,self.curedCount,self.deadCount)\n",
" \n",
" \n",
" def get_info_tuple(self):\n",
" return ((self.provinceName,self.provinceShortName,self.currentConfirmedCount,self.confirmedCount, self.suspectedCount, self.curedCount,self.deadCount,self.pub_time))\n",
" \n",
"class City:\n",
" def __init__(self):\n",
" self.cityName = ''\n",
" self.currentConfirmedCount = 0\n",
" self.confirmedCount = 0\n",
" self.suspectedCount = 0\n",
" self.curedCount = 0\n",
" self.deadCount = 0\n",
" self.locationId =0\n",
" self.province = ''\n",
" self.pub_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')#时间\n",
" \n",
" def __str__(self):\n",
" return 'cityName:%s, currentConfirmedCount:%d, confirmedCount:%d, suspectedCount:%d,\\\n",
" curedCount:%d, deadCount:%d, locationId:%d, pub_time:%s ,province:%s '%(self.cityName, self.currentConfirmedCount, self.confirmedCount, self.suspectedCount, self.curedCount, self.deadCount, self.locationId,self.pub_time,self.province)\n",
" \n",
" def get_info_tuple(self):\n",
" return ((self.cityName, self.currentConfirmedCount, self.confirmedCount, self.suspectedCount, self.curedCount, self.deadCount, self.locationId,self.province,self.pub_time ))\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"import pymysql\n",
"class MyDB:\n",
" def __init__(self,host,user,passwd,db):\n",
" self.conn = pymysql.connect(host,user,passwd,db)\n",
" self.cursor = self.conn.cursor()\n",
" \n",
" def get_province_list_tuple(self,all_province):\n",
" info_tuple = []\n",
" for item in all_province:\n",
" info_tuple.append(item.get_info_tuple())\n",
" return info_tuple\n",
" \n",
" def get_city_list_tuple(self,all_city):\n",
" info_tuple = []\n",
" for item in all_city:\n",
" info_tuple.append(item.get_info_tuple())\n",
" return info_tuple\n",
" \n",
" #保存省份数据\n",
" def save_province_datas(self,all_province):\n",
" sql = 'insert into province_daily_datas(provinceName,provinceShortName,currentConfirmedCount,confirmedCount,suspectedCount,curedCount,deadCount,pub_time) \\\n",
" values(%s,%s,%s,%s,%s,%s,%s,%s)'\n",
" res = self.get_province_list_tuple(all_province)\n",
" \n",
" print(\"+++++++ save_province_datas, datas len:%d\"%(len(res)))\n",
" \n",
" try:\n",
" self.cursor.executemany(sql,res)\n",
" self.conn.commit()\n",
" except Exception as e:\n",
" print(e)\n",
" print(\"++++++++++++ save_province_datas is over\")\n",
" \n",
" \n",
" #保存城市数据\n",
" def save_city_datas(self,all_city):\n",
" sql = 'insert into city_daily_datas(cityName,currentConfirmedCount,confirmedCount,suspectedCount,curedCount,deadCount,locationId,province,pub_time) \\\n",
" values(%s,%s,%s,%s,%s,%s,%s,%s,%s)'\n",
" res = self.get_city_list_tuple(all_city)\n",
" \n",
" print(\"+++++++ save_city_daily_datas, datas len:%d\"%(len(res)))\n",
" \n",
" try:\n",
" self.cursor.executemany(sql,res)\n",
" self.conn.commit()\n",
" except Exception as e:\n",
" print(e)\n",
" print(\"++++++++++++ save_city_daily_datas is over\") \n",
" \n",
" def __del__(self):\n",
" if self.conn is not None:\n",
" self.conn.close()\n",
" \n",
" \n",
" \n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 业务逻辑"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"448\n",
"+++++++ save_province_datas, datas len:34\n",
"++++++++++++ save_province_datas is over\n",
"+++++++ save_city_daily_datas, datas len:448\n",
"++++++++++++ save_city_daily_datas is over\n"
]
}
],
"source": [ "source": [
"import requests\n", "import requests\n",
"from bs4 import BeautifulSoup\n", "from bs4 import BeautifulSoup\n",
"import re\n", "import re\n",
"import json\n", "import json\n",
"\n", "\n",
"url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia'\n",
"res = requests.get(url)\n",
"content = res.content.decode('utf-8')\n",
"# print(content)\n",
"\n",
"soup = BeautifulSoup(content,'html.parser')\n",
"\n", "\n",
"tag = soup.find('script',attrs = {'id':'getAreaStat'})\n", "class DataService:\n",
"tagstr = tag.string\n", " def __init__(self):\n",
" self.url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia'\n",
" self.db = MyDB(host = 'localhost',user = 'root',passwd = '213015',db = 'zhengenhao')\n",
" \n",
" \n",
" #抓取网页\n",
" def fetch_html_page(self):\n",
" res = requests.get(self.url)\n",
" res = res.content.decode('utf-8')\n",
" return res\n",
" \n",
" #解析网页\n",
" def parse_html_page(self,html):\n",
" soup = BeautifulSoup(html,'html.parser')\n",
"\n", "\n",
"results = re.findall('\\{\"provinceName\":.*?\"cities\":.*?\\]\\}',tagstr)\n", " tag = soup.find('script',attrs = {'id':'getAreaStat'})\n",
"# print(result)\n", " tagstr = tag.string\n",
"all_province = []\n",
"\n", "\n",
"for item in results:\n", " self.results = re.findall('\\{\"provinceName\":.*?\"cities\":.*?\\]\\}',tagstr)\n",
" province = Province()\n",
" obj = json.loads(item)\n",
" province.provinceName = obj[\"provinceName\"]\n",
" province.provinceShortName = obj[\"provinceShortName\"]\n",
" province.currentConfirmedCount = obj[\"currentConfirmedCount\"]\n",
" province.confirmedCount = obj[\"confirmedCount\"]\n",
" province.suspectedCount = obj[\"suspectedCount\"]\n",
" province.curedCount = obj[\"curedCount\"]\n",
" province.deadCount = obj[\"deadCount\"]\n",
" \n", " \n",
" cities = obj[\"cities\"]\n", " #提取各个省份及其城市数据\n",
" for cityItem in cities:\n", " def fetch_province_datas(self):\n",
"# print(cityItem)\n", " all_province = []\n",
" city = City()\n", " \n",
" all_city = []\n",
" \n",
" province_name = ''\n",
"\n",
" for item in self.results:\n",
" province = Province()\n",
" obj = json.loads(item)\n",
" province.provinceName = obj[\"provinceName\"]\n",
" #提取省份名放入city()\n",
" province_name = province.provinceName\n",
" \n",
" province.provinceShortName = obj[\"provinceShortName\"]\n",
" province.currentConfirmedCount = obj[\"currentConfirmedCount\"]\n",
" province.confirmedCount = obj[\"confirmedCount\"]\n",
" province.suspectedCount = obj[\"suspectedCount\"]\n",
" province.curedCount = obj[\"curedCount\"]\n",
" province.deadCount = obj[\"deadCount\"]\n",
" \n",
" #提取城市数据\n",
" cities = obj[\"cities\"]\n",
" for cityItem in cities:\n",
" # print(cityItem)\n",
" city = City()\n",
" \n",
" city.province = province_name\n",
" city.cityName = cityItem[\"cityName\"]\n",
" city.currentConfirmedCount = cityItem[\"currentConfirmedCount\"]\n",
" city.confirmedCount = cityItem[\"confirmedCount\"]\n",
" city.suspectedCount = cityItem[\"suspectedCount\"]\n",
" city.curedCount = cityItem[\"curedCount\"]\n",
" city.deadCount = cityItem[\"deadCount\"]\n",
" city.locationId = cityItem[\"locationId\"]\n",
" all_city.append(city)\n",
" province.cities.append(city)\n",
" all_province.append(province)\n",
" return all_province,all_city\n",
" \n", " \n",
" city.cityName = cityItem[\"cityName\"]\n", " #业务函数\n",
" city.currentConfirmedCount = cityItem[\"currentConfirmedCount\"]\n", " def process_data(self):\n",
" city.confirmedCount = cityItem[\"confirmedCount\"]\n", " html = self.fetch_html_page()\n",
" city.suspectedCount = cityItem[\"suspectedCount\"]\n", " self.parse_html_page(html)\n",
" city.curedCount = cityItem[\"curedCount\"]\n", " all_province,all_city = self.fetch_province_datas()\n",
" city.deadCount = cityItem[\"deadCount\"]\n", "# print(len(all_province))\n",
" city.locationId = cityItem[\"locationId\"]\n", "# for item in all_province:\n",
" province.cities.append(city)\n", "# # print(item.get_info_tuple())\n",
" all_province.append(province)\n", "# for i in item.cities:\n",
"# print(i.get_info_tuple())\n",
"# print(\"++++++++++++++++++++++++++++++++++++++++++++++\"*4)\n",
"\n",
"# for i in all_city:\n",
"# print(i.get_info_tuple())\n",
"# print(\"++++++++++++++++++++++++++++++++++++++++++++++\"*4)\n",
"# print(len(all_city))\n",
"# # 保存省份数据\n",
" self.db.save_province_datas(all_province)\n",
" # 保存城市数据\n",
" self.db.save_city_datas(all_city)\n",
"\n", "\n",
"for item in all_province:\n", "# 创建Dataservice对象\n",
" print(item)\n", "ds = DataService()\n",
" for i in item.cities:\n", "ds.process_data()\n"
" print(i)\n",
" print(\"++++++++++++++++++++++++++++++++++++++++++++++\"*4)"
] ]
}, },
{ {

@ -0,0 +1,191 @@
{
"RootName": "DataModels",
"CTVER": "43543234",
"TableCount": 2,
"Count": 1,
"items": [{
"ID": 1,
"Name": "模型1",
"CreateDate": "2020/8/12 18:51:18",
"OrderNo": 1,
"Tables": {
"Count": 2,
"items": [
{
"ID": 3,
"Name": "province_daily_datas",
"Caption": "各省份每日疫情数据",
"CreateDate": "2020/8/12 19:05:01",
"OrderNo": 1,
"GraphDesc": "Left=144.00\r\nTop=51.00",
"MetaFields": {
"Count": 9,
"items": [
{
"ID": 1,
"Name": "Id",
"OrderNo": 1,
"DisplayName": "编号",
"DataType": 2,
"KeyFieldType": 1,
"DefaultValue": "{auto_increment}",
"Nullable": true
},
{
"ID": 2,
"Name": "provinceName",
"OrderNo": 2,
"DisplayName": "省份名称",
"DataType": 1,
"Nullable": true
},
{
"ID": 3,
"Name": "provinceShortName",
"OrderNo": 3,
"DisplayName": "省份缩写",
"DataType": 1,
"Nullable": true
},
{
"ID": 4,
"Name": "currentConfirmedCount",
"OrderNo": 4,
"DisplayName": "现有确诊病例数",
"DataType": 2,
"Nullable": true
},
{
"ID": 5,
"Name": "confirmedCount",
"OrderNo": 5,
"DisplayName": "累计确诊",
"DataType": 2,
"Nullable": true
},
{
"ID": 6,
"Name": "suspectedCount",
"OrderNo": 6,
"DisplayName": "疑似病例",
"DataType": 2,
"Nullable": true
},
{
"ID": 7,
"Name": "curedCount ",
"OrderNo": 7,
"DisplayName": "累计治愈",
"DataType": 2,
"Nullable": true
},
{
"ID": 8,
"Name": "deadCount",
"OrderNo": 8,
"DisplayName": "累计死亡",
"DataType": 2,
"Nullable": true
},
{
"ID": 10,
"Name": "pub_time",
"OrderNo": 9,
"DisplayName": "时间",
"DataType": 1,
"Nullable": true
}
]
}
},
{
"ID": 2,
"Name": "city_daily_datas",
"Caption": "各国内城市每日疫情数据",
"CreateDate": "2020/8/13 14:35:11",
"OrderNo": 2,
"GraphDesc": "Left=460.00\r\nTop=10.00",
"MetaFields": {
"Count": 9,
"items": [
{
"ID": 1,
"Name": "Id",
"OrderNo": 1,
"DisplayName": "编号",
"DataType": 2,
"KeyFieldType": 1,
"DefaultValue": "{auto_increment}",
"Nullable": true
},
{
"ID": 2,
"Name": "cityName",
"OrderNo": 2,
"DisplayName": "城市名称",
"DataType": 1,
"Nullable": true
},
{
"ID": 3,
"Name": "currentConfirmedCount",
"OrderNo": 3,
"DisplayName": "现有确诊",
"DataType": 2,
"Nullable": true
},
{
"ID": 4,
"Name": "confirmedCount",
"OrderNo": 4,
"DisplayName": "累计确诊",
"DataType": 2,
"Nullable": true
},
{
"ID": 22,
"Name": "suspectedCount",
"OrderNo": 5,
"DisplayName": "疑似病例",
"DataType": 2,
"Nullable": true
},
{
"ID": 23,
"Name": "curedCount",
"OrderNo": 6,
"DisplayName": "治愈人数",
"DataType": 2,
"Nullable": true
},
{
"ID": 24,
"Name": "deadCount",
"OrderNo": 7,
"DisplayName": "死亡人数",
"DataType": 2,
"Nullable": true
},
{
"ID": 25,
"Name": "pub_date",
"OrderNo": 8,
"DisplayName": "时间",
"DataType": 1,
"Nullable": true
},
{
"ID": 26,
"Name": "province",
"OrderNo": 9,
"DisplayName": "所属省份",
"DataType": 1,
"Nullable": true
}
]
}
}
]
}
}]
}
Loading…
Cancel
Save