wather2023/spider.py

# -*- coding: utf-8 -*-

import requests
from lxml import etree
import csv

def getWeather(url):
    """Download one monthly history page and return its daily weather records.

    Args:
        url: Address of the page to fetch. The page is expected to contain
            one ``<li>`` per day inside ``<ul class="thrui">``, whose first
            four ``<div>`` children hold the date, high temperature, low
            temperature and weather text.

    Returns:
        A list with one dict per ``<li>`` found, in page order. Each dict has
        the keys ``date_time``, ``high``, ``low`` and ``weather`` (inserted in
        that order) with string values. The list is empty when the page has
        no matching ``<li>`` elements.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        IndexError: If a row has no text in one of its first four ``<div>``
            cells.
    """
    weather_info = []  # one dict per day of the requested month

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
    }

    # Without a timeout requests waits forever on a stalled server.
    resp = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of parsing an error page into an
    # empty result.
    resp.raise_for_status()

    resp_html = etree.HTML(resp.text)

    resp_list = resp_html.xpath("//ul[@class='thrui']/li")
    for li in resp_list:
        day_weather_info = {}

        # Date: keep only the part before the first space.
        day_weather_info['date_time'] = li.xpath("./div[1]/text()")[0].split(' ')[0]

        # High temperature: keep the text before the unit sign. partition()
        # returns the whole text when the sign is missing, whereas slicing
        # with find() (-1 on a miss) would cut off the last character.
        high = li.xpath("./div[2]/text()")[0]
        day_weather_info['high'] = high.partition('℃')[0]

        # Low temperature: same treatment as the high temperature.
        low = li.xpath("./div[3]/text()")[0]
        day_weather_info['low'] = low.partition('℃')[0]

        # Weather description, taken verbatim.
        day_weather_info['weather'] = li.xpath("./div[4]/text()")[0]
        weather_info.append(day_weather_info)
    return weather_info


# Scraped records for the whole year: one inner list per month.
weathers = []

for month in range(1, 13):
    # Pages are addressed as YYYYMM, so single-digit months get a leading zero.
    weather_time = '2023' + str(month).zfill(2)
    url = f'https://lishi.tianqi.com/guilin/{weather_time}.html'
    # Scrape this month's page and store its records as one entry.
    weathers.append(getWeather(url))
print(weathers)


# Write the results to CSV: header row first, then one row per day.
with open("weather.csv", "w", newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    # Column names: date, high temperature, low temperature, weather.
    writer.writerow(["日期", "最高气温", "最低气温", "天气"])
    for month_weather in weathers:
        for day_weather_dict in month_weather:
            # Values are emitted in dict insertion order.
            writer.writerow(list(day_weather_dict.values()))