import scrapy

from PythonScrapyWeather.items import PythonscrapyweatherItem

"""
There are two common patterns for multi-page crawling:
1) Collect a list of sub-page URLs from one or more index pages, then crawl
   each sub-page in that list from the parse() callback.
2) Recursive crawling, which is simpler: in Scrapy you only need to define the
   start pages and the crawl rules (rules) to get fully automated recursion.
"""


class WeathersSpider(scrapy.Spider):
    name = 'Weathers'
    allowed_domains = ['tianqi.com']
    start_urls = ['http://tianqi.com/']

    def parse(self, response):
        # Names and links of every province / municipality on the index page
        # (the name list is kept for debugging; only the hrefs are followed).
        allProvince_list = response.xpath('//div[@class="tqqgsf"]/p/a/text()').extract()
        allCity_list = response.xpath('//div[@class="tqqgsf"]/p/a/@href').extract()
        print("*************allProvince_list*************", allProvince_list)
        print("*************allCity_list*************", allCity_list)
        for href in allCity_list:
            # urljoin handles both relative and absolute hrefs.
            city_url = response.urljoin(href)
            print("*************city_url*************", city_url)
            # Request each province / municipality page to get its city URLs.
            yield scrapy.Request(city_url, callback=self.subpage_content)

    # Parse a province page (response) and extract the URL of every city in it.
    def subpage_content(self, response):
        print("response", response.status)
        try:
            # Instantiate the item object.
            item = PythonscrapyweatherItem()
            # Locate the province block and pull out the province name.
            province_Data = response.xpath('//div[@class="left"]/div[5]')
            for province_name in province_Data:
                # Relative XPath so each iteration stays inside its own node.
                item["province_Name"] = province_name.xpath('./div/h2/text()').extract_first()
                print("*****************province_Name*******************", item["province_Name"])

            # Follow the link of each city inside the province.
            province_url = response.xpath('/html/body/div[7]/div[1]/div[5]/ul/li/a[1]/@href').extract()
            print(province_url)
            for city_url1 in province_url:
                url_test = response.urljoin(city_url1)
                print(url_test)
                yield scrapy.Request(url_test, callback=self.subpage_content_1)
        except Exception:
            print(response.status)

    # Parse a city page and extract its current weather details.
    def subpage_content_1(self, response):
        print("response2", response.status)
        try:
            # Instantiate the item object.
            item = PythonscrapyweatherItem()
            weather_Detail_Data = response.xpath('//div[@class="left"]')
            for weather_detail in weather_Detail_Data:
                # Fill the item fields from the weather block.
                item["city_Name"] = weather_detail.xpath('dl/dd[@class="name"]/h1/text()').extract_first()
                item["date"] = weather_detail.xpath('dl/dd[@class="week"]/text()').extract_first()
                item["temperature"] = weather_detail.xpath('dl/dd[@class="weather"]/span/text()').extract_first()
                item["weather_condition"] = weather_detail.xpath('dl/dd[@class="weather"]/span/b/text()').extract_first()
                item["air_quality"] = weather_detail.xpath('dl/dd[@class="kongqi"]/h5/text()').extract_first()
                # yield (not return) so every matching block produces an item.
                yield item
        except Exception:
            print(response.status)
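

# --- For reference: a minimal sketch of the Item class this spider assumes.
# The real PythonScrapyWeather/items.py may differ; the field names below are
# simply the keys assigned in the callbacks above. Kept as a comment so it
# does not shadow the imported class:
#
#     import scrapy
#
#     class PythonscrapyweatherItem(scrapy.Item):
#         province_Name = scrapy.Field()
#         city_Name = scrapy.Field()
#         date = scrapy.Field()
#         temperature = scrapy.Field()
#         weather_condition = scrapy.Field()
#         air_quality = scrapy.Field()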
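

# --- The module docstring mentions approach 2: recursive crawling driven by
# rules. Below is a minimal, hypothetical sketch of that pattern. The
# LinkExtractor `allow` pattern and the WeathersCrawl/parse_city names are
# assumptions for illustration; tianqi.com's real URL layout may differ.
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class WeathersCrawlSpider(CrawlSpider):
    name = 'WeathersCrawl'
    allowed_domains = ['tianqi.com']
    start_urls = ['http://tianqi.com/']

    # Each Rule tells the CrawlSpider which links to follow; follow=True keeps
    # the recursion going through intermediate (province) pages.
    rules = (
        Rule(LinkExtractor(allow=r'.*'), callback='parse_city', follow=True),
    )

    # Note: a CrawlSpider must not override parse(), so the callback gets its
    # own name here.
    def parse_city(self, response):
        # Reuses the same XPaths as subpage_content_1 above.
        for weather_detail in response.xpath('//div[@class="left"]'):
            item = PythonscrapyweatherItem()
            item["city_Name"] = weather_detail.xpath('dl/dd[@class="name"]/h1/text()').extract_first()
            item["date"] = weather_detail.xpath('dl/dd[@class="week"]/text()').extract_first()
            item["temperature"] = weather_detail.xpath('dl/dd[@class="weather"]/span/text()').extract_first()
            yield item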