Compare commits

1 commit: master...scr

Author      SHA1        Message   Date
p8yivfznb   30d70c4c57  Add src   4 years ago

.idea/.gitignore (vendored)

@@ -1,3 +0,0 @@
# Default ignored files
/shelf/
/workspace.xml

@@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>

@@ -1,12 +0,0 @@
<component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
    <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
      <option name="ignoredErrors">
        <list>
          <option value="N806" />
        </list>
      </option>
    </inspection_tool>
  </profile>
</component>

@@ -1,6 +0,0 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>

@@ -1,4 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9" project-jdk-type="Python SDK" />
</project>

@@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/PythonScrapyWeather.iml" filepath="$PROJECT_DIR$/.idea/PythonScrapyWeather.iml" />
    </modules>
  </component>
</project>

@@ -1,17 +0,0 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class PythonscrapyweatherItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    province_Name = scrapy.Field()
    city_Name = scrapy.Field()
    date = scrapy.Field()
    temperature = scrapy.Field()
    weather_condition = scrapy.Field()
    air_quality = scrapy.Field()
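
For orientation only, here is a short, hypothetical illustration of how the item defined above can be populated. The field names come from the class; the values are invented and the snippet is not part of the repository.

    # Illustration only: populating PythonscrapyweatherItem (values are invented).
    from PythonScrapyWeather.items import PythonscrapyweatherItem

    item = PythonscrapyweatherItem()
    item["province_Name"] = "Guangdong"   # example value
    item["city_Name"] = "Guangzhou"       # example value
    item["date"] = "2021-06-01"           # example value
    print(dict(item))                     # scrapy.Item supports dict-style access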

@@ -1,103 +0,0 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class PythonscrapyweatherSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class PythonscrapyweatherDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

@@ -1,70 +0,0 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql


class PythonscrapyweatherPipeline(object):
    # Database connection: the settings are stored here; the live
    # connection is opened in open_spider().
    def __init__(self, settings):
        # self.connect = pymysql.connect(
        #     host='localhost',
        #     port=3306,
        #     db='datasave_sql',
        #     user='root',
        #     password='123456',
        #     charset="utf8",
        #     use_unicode=False)
        # # Use a cursor for inserts, deletes, queries and updates
        # self.cursor = self.connect.cursor()
        # self.cursor.execute("SELECT VERSION()")
        self.settings = settings
        print("Connected successfully")  # printed at init; the real connection happens in open_spider()

    def process_item(self, item, spider):
        print("Starting insert")
        # Insert into the database (a parameterized alternative is sketched after this file's diff)
        sql = '''INSERT INTO weathers(city_Name,date,temperature,weather_condition,air_quality)
                 VALUES("{}","{}","{}","{}","{}")'''
        try:
            self.cursor.execute(sql.format(
                # pymysql.converters.escape_string("1"),
                pymysql.converters.escape_string(item["city_Name"]),
                pymysql.converters.escape_string(item["date"]),
                pymysql.converters.escape_string(item["temperature"]),
                pymysql.converters.escape_string(item["weather_condition"]),
                pymysql.converters.escape_string(item["air_quality"])))
            self.connect.commit()
            print(self.cursor.rowcount, "record(s) inserted successfully.")
        except BaseException as e:
            print("Error here >>>>>>>>>>>>>", e, "<<<<<<<<<<<<< error here")
            self.connect.rollback()
        return item

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def open_spider(self, spider):
        # Connect to the database
        self.connect = pymysql.connect(
            host=self.settings.get('MYSQL_HOST'),
            port=self.settings.get('MYSQL_PORT'),
            db=self.settings.get('MYSQL_DBNAME'),
            user=self.settings.get('MYSQL_USER'),
            passwd=self.settings.get('MYSQL_PASSWD'),
            charset='utf8',
            use_unicode=True)
        # Use a cursor for inserts, deletes, queries and updates
        self.cursor = self.connect.cursor()
        self.connect.autocommit(True)

    # Close the database connection
    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()
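
The pipeline above interpolates values into the SQL string with str.format() plus escape_string(). As a hedged alternative (not what the project does), pymysql can bind the values itself through %s placeholders; the helper below is a hypothetical sketch that reuses the same table and column names.

    # Sketch only: parameterized insert with pymysql placeholders, assuming the
    # same `weathers` table and item fields used by the pipeline above.
    def insert_weather_row(cursor, item):
        sql = ("INSERT INTO weathers "
               "(city_Name, date, temperature, weather_condition, air_quality) "
               "VALUES (%s, %s, %s, %s, %s)")
        # pymysql escapes each bound value, so no manual escape_string() is needed
        cursor.execute(sql, (
            item["city_Name"],
            item["date"],
            item["temperature"],
            item["weather_condition"],
            item["air_quality"],
        ))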

@@ -1,110 +0,0 @@
# Scrapy settings for PythonScrapyWeather project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'PythonScrapyWeather'
SPIDER_MODULES = ['PythonScrapyWeather.spiders']
NEWSPIDER_MODULE = 'PythonScrapyWeather.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'PythonScrapyWeather (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'PythonScrapyWeather.middlewares.PythonscrapyweatherSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'PythonScrapyWeather.middlewares.PythonscrapyweatherDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'PythonScrapyWeather.pipelines.PythonscrapyweatherPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# Configure the downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'PythonScrapyWeather.middlewares.PythonscrapyweatherDownloaderMiddleware': 543,
}
# Set the default request headers
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
}
# Enable cookies
COOKIES_ENABLED = True
# Proxy pool (not configured)
# IP_PROXY
ITEM_PIPELINES = {
    'PythonScrapyWeather.pipelines.PythonscrapyweatherPipeline': 300,
}
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'datasave_sql'
MYSQL_USER = 'root'
MYSQL_PASSWD = '123456'
MYSQL_PORT = 3306

@@ -1,85 +0,0 @@
import scrapy
import requests
from PythonScrapyWeather.items import PythonscrapyweatherItem

"""
There are two ways to crawl multiple pages:
1. Collect the URLs of sub-pages from one or more index pages, then have parse() crawl each sub-page in turn.
2. Crawl recursively. This is relatively simple in Scrapy: define the start pages and the crawl rules (rules), and the recursive crawl runs automatically (see the CrawlSpider sketch after this file's diff).
"""


class WeathersSpider(scrapy.Spider):
    name = 'Weathers'
    allowed_domains = ['tianqi.com']
    start_urls = ['http://tianqi.com/']

    def parse(self, response):
        url = "https://www.tianqi.com"
        allProvince_list = response.xpath('//div[@class="tqqgsf"]/p/a/text()').extract()
        allCity_list = response.xpath('//div[@class="tqqgsf"]/p/a/@href').extract()
        print("*************allCity_list*************", allCity_list)
        for city_name in allCity_list:
            city_url = city_name
            print("*************city_url*************", city_url)
            # Then request, for each province / municipality URL, the page listing all of its cities
            yield scrapy.Request(city_url, callback=self.subpage_content)

    # Handle the response of each province page, which lists all of its cities
    def subpage_content(self, response):
        print("response", response.status)
        try:
            # Instantiate the item
            item = PythonscrapyweatherItem()
            # Use XPath to pull the required elements out of the HTML
            province_Data = response.xpath('//div[@class="left"]/div[5]')
            # print("*************province_Data*************", province_Data)
            for province_name in province_Data:
                item["province_Name"] = province_name.xpath('//div[@class="left"]/div[5]/div/h2/text()').extract()[0]
                province_Name = item["province_Name"]
                print("*****************province_Name*******************", province_Name)
            # Get the links to the cities/districts inside each province
            province_url = response.xpath('/html/body/div[7]/div[1]/div[5]/ul/li/a[1]/@href').extract()
            print(province_url)
            for city_url1 in province_url:
                url_test = 'http://tianqi.com/' + city_url1
                print(url_test)
                yield scrapy.Request(url_test, callback=self.subpage_content_1)
            # return item
            # requests.get('http://tianqi.com/' + city_url)
            # weather_Detail_Data = response.xpath('//div[@class="left"]')
            # for weather_detail in weather_Detail_Data:
            #     # Fill in the item fields
            #     item["city_Name"] = weather_detail.xpath('dl/dd[@class ="name"]/h1/text()').extract()[0]
            #     city_Name1 = item["city_Name"]
            #     print("*************************************************111111", city_Name1)
            #     item["date"] = weather_detail.xpath('dl/dd[@class="week"]/text()').extract()[0]
            #     item["temperature"] = weather_detail.xpath('dl/dd[@class="weather"]/span/text()').extract()[0]
            #     item["weather_condition"] = weather_detail.xpath('dl/dd[@class="weather"]/span/b/text()').extract()[0]
            #     item["air_quality"] = weather_detail.xpath('dl/dd[@class="kongqi"]/h5/text()').extract()[0]
            # return item
        except Exception:
            print(response.status)
            pass

    def subpage_content_1(self, response):
        print("response2", response.status)
        try:
            # Instantiate the item
            item = PythonscrapyweatherItem()
            weather_Detail_Data = response.xpath('//div[@class="left"]')
            for weather_detail in weather_Detail_Data:
                # Fill in the item fields
                item["city_Name"] = weather_detail.xpath('dl/dd[@class ="name"]/h1/text()').extract()[0]
                # city_Name1 = item["city_Name"]
                # print("*************************************************111111", city_Name1)
                item["date"] = weather_detail.xpath('dl/dd[@class="week"]/text()').extract()[0]
                item["temperature"] = weather_detail.xpath('dl/dd[@class="weather"]/span/text()').extract()[0]
                item["weather_condition"] = weather_detail.xpath('dl/dd[@class="weather"]/span/b/text()').extract()[0]
                item["air_quality"] = weather_detail.xpath('dl/dd[@class="kongqi"]/h5/text()').extract()[0]
            return item
        except Exception:
            print(response.status)
            pass
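
The module docstring above mentions a second, rule-based way of crawling that this spider does not implement. The sketch below shows that style with CrawlSpider and Rule; the class name, link extractor pattern, and callback are illustrative assumptions rather than project code.

    # Sketch only: the "recursive" style from the docstring, using CrawlSpider
    # rules instead of hand-written Request chains.
    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor


    class WeathersRuleSpider(CrawlSpider):   # hypothetical name
        name = 'WeathersRules'
        allowed_domains = ['tianqi.com']
        start_urls = ['https://www.tianqi.com/']

        # Follow every in-domain link and hand each response to parse_city;
        # a real spider would narrow this with allow= patterns.
        rules = (
            Rule(LinkExtractor(allow_domains=['tianqi.com']),
                 callback='parse_city', follow=True),
        )

        def parse_city(self, response):
            # Same field-extraction idea as subpage_content_1 above
            for block in response.xpath('//div[@class="left"]'):
                yield {
                    'city_Name': block.xpath('dl/dd[@class="name"]/h1/text()').get(),
                    'date': block.xpath('dl/dd[@class="week"]/text()').get(),
                }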

@@ -1,4 +0,0 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

@@ -1,11 +0,0 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = PythonScrapyWeather.settings
[deploy]
#url = http://localhost:6800/
project = PythonScrapyWeather