You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

127 lines
3.8 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
import pandas as pd
import datetime
'''
使用python爬虫技术,爬取长春和全国的天气信息数据
爬取网站http://tianqi.2345.com/wea_history/54161.htm
areaid 和各省会城市对应关系
area_id = [
("黑龙江", 50953), ("内蒙古", 53463),("吉林", 54161), ("辽宁", 54342),
("河北", 53698), ("天津", 54527), ("山西", 53772), ("陕西",57036 ),
("甘肃",52889 ), ("宁夏",53614 ), ("青海",52866 ), ("新疆", 51463),
("西藏", 55591), ("四川", 56294), ("重庆", 57516), ("山东", 54823),
("河南", 57083), ("江苏",58238 ), ("安徽", 58321), ("湖北", 57494),
("浙江", 58457), ("福建",58847 ), ("江西", 58606), ("湖南",57687 ),
("贵州",57816 ), ("广西", 59431), ("海南",59758 ), ("上海",58362 ),
("广东",59287), ("云南",56778), ("台湾",59554) ,
]
'''
# 提供年份和月份,爬取对应的的表格数据
url = "http://tianqi.2345.com/Pc/GetHistory"
headers = {
"User-Agent":
"""Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.41 Safari/537.36 Edg/101.0.1210.32"""
}
def craw_table(id,year,month):
params = {
"areaInfo[areaId]": id,
"areaInfo[areaType]": 2,
"date[year]": year,
"date[month]": month
}
resq = requests.get(url, headers=headers, params=params)
data = resq.json()["data"]
# data frame
df = pd.read_html(data)[0]
return df
# 输入城市id爬取该城市今日的天气数据
def getToday(id):
# 获取当前年份和月份
today = datetime.datetime.today()
year = today.year
month = today.month
# 获取当日长春天气数据
month_data =craw_table(id, year, month)
return month_data.tail(1)
# 输入城市id爬取该城市近七周的天气数据
def getWeek(id):
# 获取当前年份和月份
today = datetime.datetime.today()
year = today.year
month = today.month
# 获取当日长春天气数据
month_data =craw_table(id, year, month)
return month_data.tail(7)
# 爬取全国各个省会城市的今日的天气数据
def getChinaToday():
ids=[50953, 53463,54161,54342,53698,54527,53772,57036 ,52889,53614,52866,51463,
55591, 56294, 57516,54823,57083,58238, 58321, 57494, 58457,58847,58606,
57687,57816 ,59431,59758 ,58362 ,59287,56778,59554]
list=[]
for i in ids:
df=getToday(i)
list.append(df)
return pd.concat(list).reset_index(drop=True)
# 获取长春最近3年的天气数据用于预测
def getYears():
today = datetime.datetime.today()
df_list = []
for year in range(today.year-5, today.year):
for month in range(1, 13):
df = craw_table(54161,year, month)
df_list.append(df)
for month in range(1,today.month+1):
df = craw_table(54161, today.year, month)
df_list.append(df)
# 多年数据合并
return pd.concat(df_list).reset_index(drop=True)
# 传入一个时间范围,获取某个时间范围的天气数据
def getPredictDate(year0,month0,day0,year1,month1,day1):
id=54161
date_list=[]
if month0!=month1:
date0=craw_table(id,year0,month0)
date_ago=date0[day0-1:]
date1 = craw_table(id,year1, month1)
date_pre = date1[:day1]
date_list.append(date_ago)
date_list.append(date_pre)
date=pd.concat(date_list).reset_index(drop=True)
else:
date0 = craw_table(id, year0, month0)
date=date0[day0-1:day1]
return date
'''
def craw_year(year1, year2):
df_list= []
for year in range(year1, year2):
for month in range(1, 13):
df =craw_table(year, month)
df_list.append(df)
# 多年数据合并
return pd.concat(df_list).reset_index(drop=True)
#df =craw_table(2022,4)
#print(df)
#date =df["最高温"]
#print(date)
'''