"""
There are two common forms of multi-page crawling:
1. Collect a list of sub-page URLs from one or more index pages, then crawl
   each sub-page in the list in turn from a parse() callback.
2. Recursive crawling, which is relatively simple: in Scrapy you only need to
   define the start pages and the crawl rules (rules) to get automated
   recursive crawling (see the sketch after this docstring).
"""
import scrapy

from PythonScrapyWeather.items import PythonscrapyweatherItem
class WeathersSpider(scrapy.Spider):
    name = 'Weathers'
    allowed_domains = ['tianqi.com']
    start_urls = ['http://tianqi.com/']

    def parse(self, response):
        # Names and hrefs of all provinces/municipalities on the index page
        # (the name list is extracted but not used further).
        allProvince_list = response.xpath('//div[@class="tqqgsf"]/p/a/text()').extract()
        allCity_list = response.xpath('//div[@class="tqqgsf"]/p/a/@href').extract()
        print("*************allCity_list*************", allCity_list)
        for city_href in allCity_list:
            # The hrefs may be relative, so resolve them against the response URL.
            city_url = response.urljoin(city_href)
            print("*************city_url*************", city_url)
            # Then request each province/municipality URL to reach the URLs
            # of all cities in that province.
            yield scrapy.Request(city_url, callback=self.subpage_content)

    # Handles the response containing each province's city URLs.
    def subpage_content(self, response):
        print("response", response.status)
        try:
            # Instantiate the item object.
            item = PythonscrapyweatherItem()
            # Use XPath to locate the province block in the page's left column.
            province_Data = response.xpath('//div[@class="left"]/div[5]')
            for province_name in province_Data:
                # Relative XPath, so the query is scoped to the current node.
                item["province_Name"] = province_name.xpath('./div/h2/text()').extract_first()
                print("*****************province_Name*******************", item["province_Name"])
            # Collect the links to the cities/districts within the province.
            # This absolute XPath is brittle and tied to the current page layout.
            province_url = response.xpath('/html/body/div[7]/div[1]/div[5]/ul/li/a[1]/@href').extract()
            print(province_url)
            for city_href in province_url:
                city_url = response.urljoin(city_href)
                print(city_url)
                yield scrapy.Request(city_url, callback=self.subpage_content_1)
        except Exception:
            # Log the HTTP status if extraction fails, then move on.
            print(response.status)
    def subpage_content_1(self, response):
        print("response2", response.status)
        try:
            # Instantiate the item object.
            item = PythonscrapyweatherItem()
            weather_Detail_Data = response.xpath('//div[@class="left"]')
            for weather_detail in weather_Detail_Data:
                # Populate the item's fields from the city page.
                item["city_Name"] = weather_detail.xpath('dl/dd[@class="name"]/h1/text()').extract_first()
                item["date"] = weather_detail.xpath('dl/dd[@class="week"]/text()').extract_first()
                item["temperature"] = weather_detail.xpath('dl/dd[@class="weather"]/span/text()').extract_first()
                item["weather_condition"] = weather_detail.xpath('dl/dd[@class="weather"]/span/b/text()').extract_first()
                item["air_quality"] = weather_detail.xpath('dl/dd[@class="kongqi"]/h5/text()').extract_first()
                # Yield rather than return, so every matched block produces an item.
                yield item
        except Exception:
            # Log the HTTP status if extraction fails.
            print(response.status)
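
# For reference, a minimal sketch of the item this spider populates. The real
# definition lives in PythonScrapyWeather/items.py; the field names below are
# taken from the assignments above, everything else is an assumption.
#
# import scrapy
#
# class PythonscrapyweatherItem(scrapy.Item):
#     province_Name = scrapy.Field()
#     city_Name = scrapy.Field()
#     date = scrapy.Field()
#     temperature = scrapy.Field()
#     weather_condition = scrapy.Field()
#     air_quality = scrapy.Field()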