import scrapy

from PythonScrapyWeather.items import PythonscrapyweatherItem
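
# PythonscrapyweatherItem lives in the project's items.py. A minimal sketch of
# what that class presumably looks like, inferred from the fields populated
# below (not the project's actual file):
#
#     import scrapy
#
#     class PythonscrapyweatherItem(scrapy.Item):
#         province_Name = scrapy.Field()
#         city_Name = scrapy.Field()
#         date = scrapy.Field()
#         temperature = scrapy.Field()
#         weather_condition = scrapy.Field()
#         air_quality = scrapy.Field()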

"""
Multi-page crawling takes two forms:

1) Collect a list of sub-page URLs from one or more index pages; the parse()
   callback then crawls each sub-page in the list in turn.

2) Recursive crawling, which is comparatively simple: in Scrapy it is enough
   to define the start pages and the crawling rules for the spider to recurse
   automatically (see the sketch below).
"""


class WeathersSpider(scrapy.Spider):
    name = 'Weathers'
    allowed_domains = ['tianqi.com']
    start_urls = ['http://tianqi.com/']

    def parse(self, response):
        # Names and page links of each province/municipality on the home page;
        # allProvince_list is extracted here but not used further.
        allProvince_list = response.xpath('//div[@class="tqqgsf"]/p/a/text()').extract()
        allCity_list = response.xpath('//div[@class="tqqgsf"]/p/a/@href').extract()
        print("*************allCity_list*************", allCity_list)
        for city_href in allCity_list:
            # urljoin handles absolute and site-relative hrefs alike.
            city_url = response.urljoin(city_href)
            print("*************city_url*************", city_url)
            # Request each province/municipality page to collect its city URLs.
            yield scrapy.Request(city_url, callback=self.subpage_content)

    # Callback for a province/municipality page: read the province name and
    # follow the link to each city inside it.
    def subpage_content(self, response):
        print("response", response.status)
        try:
            # Instantiate the item.
            item = PythonscrapyweatherItem()
            # Select the province block via XPath.
            province_Data = response.xpath('//div[@class="left"]/div[5]')
            for province_name in province_Data:
                # A relative XPath reads the selected node itself, not always
                # the first match in the whole document.
                item["province_Name"] = province_name.xpath('./div/h2/text()').extract_first()
                print("*****************province_Name*******************", item["province_Name"])

            # Collect the link to each city inside the province.
            province_url = response.xpath('/html/body/div[7]/div[1]/div[5]/ul/li/a[1]/@href').extract()
            print(province_url)
            for city_url1 in province_url:
                # urljoin avoids the double slash that 'http://tianqi.com/' +
                # href produces when the href already starts with '/'.
                url_test = response.urljoin(city_url1)
                print(url_test)
                yield scrapy.Request(url_test, callback=self.subpage_content_1)
        except Exception as exc:
            # Report the status together with the actual error instead of
            # swallowing it.
            print(response.status, exc)
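
    # Note: Scrapy's default dupefilter drops repeated requests for the same
    # URL, so pages that link to a city more than once do not trigger
    # duplicate crawls.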

    # Callback for a city page: populate one item with the current weather
    # details.
    def subpage_content_1(self, response):
        print("response2", response.status)
        try:
            # Instantiate the item.
            item = PythonscrapyweatherItem()
            weather_Detail_Data = response.xpath('//div[@class="left"]')
            for weather_detail in weather_Detail_Data:
                # Populate the item's fields; extract_first() returns None
                # instead of raising IndexError when a node is missing.
                item["city_Name"] = weather_detail.xpath('dl/dd[@class="name"]/h1/text()').extract_first()
                item["date"] = weather_detail.xpath('dl/dd[@class="week"]/text()').extract_first()
                item["temperature"] = weather_detail.xpath('dl/dd[@class="weather"]/span/text()').extract_first()
                item["weather_condition"] = weather_detail.xpath('dl/dd[@class="weather"]/span/b/text()').extract_first()
                item["air_quality"] = weather_detail.xpath('dl/dd[@class="kongqi"]/h5/text()').extract_first()
                # yield rather than return, so the loop is not cut short after
                # the first matching block.
                yield item
        except Exception as exc:
            print(response.status, exc)
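
# Run from the project root with the standard Scrapy CLI (the output filename
# is illustrative):
#
#     scrapy crawl Weathers -o weather.json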