import scrapy

from PythonScrapyWeather.items import PythonscrapyweatherItem
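
# PythonscrapyweatherItem lives in the project's items.py. A minimal sketch of
# what that class presumably looks like, inferred from the fields populated
# below (not the project's actual file):
#
#     import scrapy
#
#     class PythonscrapyweatherItem(scrapy.Item):
#         province_Name = scrapy.Field()
#         city_Name = scrapy.Field()
#         date = scrapy.Field()
#         temperature = scrapy.Field()
#         weather_condition = scrapy.Field()
#         air_quality = scrapy.Field()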

"""
Multi-page crawling takes two forms:

1) Collect a list of sub-page URLs from one or more index pages; the parse()
   callback then crawls each sub-page in the list in turn.

2) Recursive crawling, which is comparatively simple: in Scrapy it is enough
   to define the start pages and the crawling rules for the spider to recurse
   automatically (see the sketch below).
"""


class WeathersSpider(scrapy.Spider):
    name = 'Weathers'
    allowed_domains = ['tianqi.com']
    start_urls = ['http://tianqi.com/']

    def parse(self, response):
        # Names and page links of each province/municipality on the home page;
        # allProvince_list is extracted here but not used further.
        allProvince_list = response.xpath('//div[@class="tqqgsf"]/p/a/text()').extract()
        allCity_list = response.xpath('//div[@class="tqqgsf"]/p/a/@href').extract()
        print("*************allCity_list*************", allCity_list)
        for city_href in allCity_list:
            # urljoin handles absolute and site-relative hrefs alike.
            city_url = response.urljoin(city_href)
            print("*************city_url*************", city_url)
            # Request each province/municipality page to collect its city URLs.
            yield scrapy.Request(city_url, callback=self.subpage_content)

    # Callback for a province/municipality page: read the province name and
    # follow the link to each city inside it.
    def subpage_content(self, response):
        print("response", response.status)
        try:
            # Instantiate the item.
            item = PythonscrapyweatherItem()
            # Select the province block via XPath.
            province_Data = response.xpath('//div[@class="left"]/div[5]')
            for province_name in province_Data:
                # A relative XPath reads the selected node itself, not always
                # the first match in the whole document.
                item["province_Name"] = province_name.xpath('./div/h2/text()').extract_first()
                print("*****************province_Name*******************", item["province_Name"])

            # Collect the link to each city inside the province.
            province_url = response.xpath('/html/body/div[7]/div[1]/div[5]/ul/li/a[1]/@href').extract()
            print(province_url)
            for city_url1 in province_url:
                # urljoin avoids the double slash that 'http://tianqi.com/' +
                # href produces when the href already starts with '/'.
                url_test = response.urljoin(city_url1)
                print(url_test)
                yield scrapy.Request(url_test, callback=self.subpage_content_1)
        except Exception as exc:
            # Report the status together with the actual error instead of
            # swallowing it.
            print(response.status, exc)
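
    # Note: Scrapy's default dupefilter drops repeated requests for the same
    # URL, so pages that link to a city more than once do not trigger
    # duplicate crawls.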

    # Callback for a city page: populate one item with the current weather
    # details.
    def subpage_content_1(self, response):
        print("response2", response.status)
        try:
            # Instantiate the item.
            item = PythonscrapyweatherItem()
            weather_Detail_Data = response.xpath('//div[@class="left"]')
            for weather_detail in weather_Detail_Data:
                # Populate the item's fields; extract_first() returns None
                # instead of raising IndexError when a node is missing.
                item["city_Name"] = weather_detail.xpath('dl/dd[@class="name"]/h1/text()').extract_first()
                item["date"] = weather_detail.xpath('dl/dd[@class="week"]/text()').extract_first()
                item["temperature"] = weather_detail.xpath('dl/dd[@class="weather"]/span/text()').extract_first()
                item["weather_condition"] = weather_detail.xpath('dl/dd[@class="weather"]/span/b/text()').extract_first()
                item["air_quality"] = weather_detail.xpath('dl/dd[@class="kongqi"]/h5/text()').extract_first()
                # yield rather than return, so the loop is not cut short after
                # the first matching block.
                yield item
        except Exception as exc:
            print(response.status, exc)
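
# Run from the project root with the standard Scrapy CLI (the output filename
# is illustrative):
#
#     scrapy crawl Weathers -o weather.json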