# -*- coding: utf-8 -*-
"""Scrape the 2023 daily weather history for Guilin and save it as CSV.

For each month of 2023 the matching page on lishi.tianqi.com is downloaded,
the per-day rows are extracted, and everything is written to ``weather.csv``.
"""

import csv

import requests
from lxml import etree

# Keys of each per-day record, in the order the CSV columns are written.
_FIELDS = ("date_time", "high", "low", "weather")


def _first_text(node, path):
    """Return the first text node matched by *path* under *node*, or ''."""
    found = node.xpath(path)
    return found[0] if found else ""


def _strip_celsius(text):
    """Return *text* up to the first '℃'; unchanged when no unit is present."""
    # str.find() returns -1 when the unit is missing, which would make a
    # slice silently drop the last character; partition() has no such trap.
    return text.partition("℃")[0]


def getWeather(url, timeout=10):
    """Fetch one monthly history page and return its daily weather records.

    Args:
        url: Address of the monthly page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts, one per day, with the keys ``date_time``, ``high``,
        ``low`` and ``weather`` (all values are strings).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
    }

    # Without a timeout a stalled connection would block the script forever.
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were data.
    resp.raise_for_status()

    page = etree.HTML(resp.text)

    weather_info = []  # one dict per day of the month
    for li in page.xpath("//ul[@class='thrui']/li"):
        # First cell: the date is the part before the first space.
        date_time = _first_text(li, "./div[1]/text()").split(' ')[0]
        if not date_time:
            # Not a usable data row (missing or empty date cell); skip it.
            continue

        weather_info.append({
            'date_time': date_time,
            # Highest temperature, unit removed.
            'high': _strip_celsius(_first_text(li, "./div[2]/text()")),
            # Lowest temperature, unit removed.
            'low': _strip_celsius(_first_text(li, "./div[3]/text()")),
            # Weather description.
            'weather': _first_text(li, "./div[4]/text()"),
        })
    return weather_info


weathers = []

for month in range(1, 13):
    # Build the zero-padded year-month segment of the URL, e.g. "202301".
    weather_time = f'2023{month:02d}'
    url = f'https://lishi.tianqi.com/guilin/{weather_time}.html'
    # Scrape this month's records and keep them grouped by month.
    weathers.append(getWeather(url))
print(weathers)


# Write all records in one pass.
with open("weather.csv", "w", newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    # Header row: date, highest temperature, lowest temperature, weather.
    writer.writerow(["日期", "最高气温", "最低气温", "天气"])
    # Pick values by key so the column order never depends on dict ordering.
    writer.writerows(
        [day_weather[key] for key in _FIELDS]
        for month_weather in weathers
        for day_weather in month_weather
    )