parent 525f8a437a
commit d77bb4b53f
@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="Python 3.9 (article_spider)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>
@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>
@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="Black">
    <option name="sdkName" value="Python 3.9 (article_spider)" />
  </component>
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (article_spider)" project-jdk-type="Python SDK" />
</project>
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/ArticleSpider.iml" filepath="$PROJECT_DIR$/.idea/ArticleSpider.iml" />
    </modules>
  </component>
</project>
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,64 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import re
import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst, Identity, Join
from scrapy.loader import ItemLoader
from ArticleSpider.models.es_types import ArticleType
from w3lib.html import remove_tags


class ArticlespiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


def date_convert(value):
    match_re = re.match(r'.*?(\d+.*)', value)  # raw string avoids an invalid escape sequence
    if match_re:
        return match_re.group(1)
    else:
        return '1970-07-01'


class ArticleItemLoader(ItemLoader):
    default_output_processor = TakeFirst()


class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()  # title
    create_date = scrapy.Field(
        input_processor=MapCompose(date_convert)
    )  # publish date
    url = scrapy.Field()  # article URL
    url_object_id = scrapy.Field()  # URL id (md5 of the URL)
    front_image_url = scrapy.Field(
        output_processor=Identity()
    )  # cover image URL(s)
    front_image_path = scrapy.Field()  # local path of the downloaded cover image
    praise_nums = scrapy.Field()  # praise count
    comment_nums = scrapy.Field()  # comment count
    fav_nums = scrapy.Field()  # favorite count
    tags = scrapy.Field(
        output_processor=Join(separator=',')
    )  # tags
    content = scrapy.Field()  # article body

    def save_to_es(self):
        article = ArticleType()
        article.title = self['title']
        article.create_date = self['create_date']
        article.content = remove_tags(self['content'])
        article.front_image_url = self['front_image_url']
        if 'front_image_path' in self:
            article.front_image_path = self['front_image_path']
        article.praise_nums = self['praise_nums']
        article.fav_nums = self['fav_nums']
        article.comment_nums = self['comment_nums']
        article.url = self['url']
        article.tags = self['tags']
        article.meta.id = self['url_object_id']
        article.save()
        return
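The loader defined above collapses every field to its first extracted value via `TakeFirst()`, except `front_image_url` (kept as a list by `Identity()`) and `tags` (joined with commas). The spider that drives it is not readable in this diff (its file diff is suppressed further down), so the following is only an illustrative sketch; the spider class, start URL, selectors, and the import path of `get_md5` are assumptions, not taken from this commit.

# Illustrative sketch only: spider name aside, the URLs, selectors, and utils
# import path are assumptions rather than content of this diff.
import scrapy

from ArticleSpider.items import ArticleItemLoader, JobBoleArticleItem
from ArticleSpider.utils.common import get_md5


class JobboleSpider(scrapy.Spider):
    name = "jobbole"                               # matches the crawl command in main.py
    start_urls = ["https://news.cnblogs.com/"]     # assumed start page

    def parse(self, response):
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".article-title::text")      # assumed selector
        item_loader.add_css("create_date", ".article-meta::text") # assumed selector
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_value("front_image_url", [])               # filled from a list page in practice
        yield item_loader.load_item()  # TakeFirst()/Identity()/Join() are applied here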
@@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
__author__ = 'bobby'

from datetime import datetime
from elasticsearch_dsl import DocType, Date, Nested, Boolean, \
    analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer

from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer

from elasticsearch_dsl.connections import connections

connections.create_connection(hosts=["localhost"])


class CustomAnalyzer(_CustomAnalyzer):
    def get_analysis_definition(self):
        # return an empty definition so the analyzer is referenced by name only
        # (the ik_max_word analyzer must already be installed on the ES node)
        return {}


ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"])


class ArticleType(DocType):
    # Jobbole article document type
    suggest = Completion(analyzer=ik_analyzer)
    title = Text(analyzer="ik_max_word")
    create_date = Date()
    url = Keyword()
    url_object_id = Keyword()
    front_image_url = Keyword()
    front_image_path = Keyword()
    praise_nums = Integer()
    comment_nums = Integer()
    fav_nums = Integer()
    tags = Text(analyzer="ik_max_word")
    content = Text(analyzer="ik_max_word")

    class Meta:
        index = "jobbole"
        doc_type = "article"


if __name__ == "__main__":
    ArticleType.init()
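Running this module directly calls `ArticleType.init()`, which creates the `jobbole` index and its mapping on the local Elasticsearch node; that has to happen once before the `ElasticsearchPipeline` below starts saving items. A rough usage sketch under the same (older) elasticsearch_dsl `DocType` API, assuming Elasticsearch runs locally with the ik analysis plugin installed; the document id and field values are placeholders:

# Sketch only: the id and values below are made up, not part of this diff.
from ArticleSpider.models.es_types import ArticleType

ArticleType.init()  # create the "jobbole" index and mapping

# index one document manually
article = ArticleType(meta={"id": "0123456789abcdef0123456789abcdef"})
article.title = "example title"
article.fav_nums = 0
article.save()

# full-text query against the ik-analyzed title field
for hit in ArticleType.search().query("match", title="example").execute():
    print(hit.title)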
@@ -0,0 +1,169 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline
from scrapy.http.request import Request
from ArticleSpider.models.es_types import ArticleType
import codecs
import json
from w3lib.html import remove_tags
from scrapy.exporters import JsonItemExporter
import MySQLdb
from twisted.enterprise import adbapi
from MySQLdb.cursors import DictCursor


class ArticlespiderPipeline(object):
    def process_item(self, item, spider):
        return item


class MysqlPipeline(object):
    # synchronous insert: one blocking execute/commit per item
    def __init__(self):
        self.conn = MySQLdb.connect('127.0.0.1', 'root', 'qweasdzxc227', 'article_spider', charset="utf8",
                                    use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            insert into jobbole_article(title, url, url_object_id, front_image_url, front_image_path, parise_nums, comment_nums, fav_nums, tags, content, create_date)
            values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE parise_nums=VALUES(parise_nums)
        """
        params = list()
        params.append(item.get("title", ""))
        params.append(item.get("url", ""))
        params.append(item.get("url_object_id", ""))
        front_image = ','.join(item.get("front_image_url", []))
        params.append(front_image)
        params.append(item.get("front_image_path", ""))
        params.append(item.get("praise_nums", 0))  # item field is praise_nums; the DB column keeps the original parise_nums spelling
        params.append(item.get("comment_nums", 0))
        params.append(item.get("fav_nums", 0))
        params.append(item.get("tags", ""))
        params.append(item.get("content", ""))
        params.append(item.get("create_date", "1970-07-01"))
        self.cursor.execute(insert_sql, tuple(params))
        self.conn.commit()
        return item


class MysqlTwistedPipline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)

        return cls(dbpool)

    def process_item(self, item, spider):
        # use Twisted to run the MySQL insert asynchronously
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)  # handle failures
        return item  # keep passing the item to later pipelines

    def handle_error(self, failure, item, spider):
        # handle errors raised by the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        # perform the actual insert
        insert_sql = """
            insert into jobbole_article(title, url, url_object_id, front_image_url, front_image_path, parise_nums, comment_nums, fav_nums, tags, content, create_date)
            values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE parise_nums=VALUES(parise_nums)
        """
        params = list()
        params.append(item.get("title", ""))
        params.append(item.get("url", ""))
        params.append(item.get("url_object_id", ""))
        front_image = ','.join(item.get("front_image_url", []))
        params.append(front_image)
        params.append(item.get("front_image_path", ""))
        params.append(item.get("praise_nums", 0))  # item field is praise_nums; the DB column keeps the original parise_nums spelling
        params.append(item.get("comment_nums", 0))
        params.append(item.get("fav_nums", 0))
        params.append(item.get("tags", ""))
        params.append(item.get("content", ""))
        params.append(item.get("create_date", "1970-07-01"))
        # build the SQL statement for this item and insert it into MySQL
        cursor.execute(insert_sql, tuple(params))


class JsonWithEncodingPipeline(object):
    # custom JSON export: one JSON object per line appended to article.json
    def __init__(self):
        self.file = codecs.open('article.json', 'a', encoding="utf-8")

    def process_item(self, item, spider):
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(lines)
        return item

    def close_spider(self, spider):  # renamed from spider_closed so Scrapy calls it automatically
        self.file.close()


class JsonExporterPipeline(object):
    # export items through Scrapy's JsonItemExporter (a single JSON array)
    def __init__(self):
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item


class ArticleImagePipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        try:
            if "front_image_url" in item:
                image_file_path = ''
                for ok, value in results:
                    image_file_path = value["path"]
                item["front_image_path"] = image_file_path
            return item
        except Exception as e:
            print(e)
            item['front_image_path'] = '图片不可用'  # "image not available"
            return item


class ElasticsearchPipeline(object):
    # write the item into Elasticsearch
    def process_item(self, item, spider):
        # article = ArticleType()
        # article.title = item['title']
        # article.create_date = item['create_date']
        # article.content = remove_tags(item['content'])
        # article.front_image_url = item['front_image_url']
        # if 'front_image_path' in item:
        #     article.front_image_path = item['front_image_path']
        # article.praise_nums = item['praise_nums']
        # article.fav_nums = item['fav_nums']
        # article.comment_nums = item['comment_nums']
        # article.url = item['url']
        # article.tags = item['tags']
        # article.meta.id = item['url_object_id']
        # article.save()
        # convert the item into an ES document and save it
        item.save_to_es()
        return item
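A note on the two JSON pipelines above: `JsonWithEncodingPipeline` appends one JSON object per line to article.json, while `JsonExporterPipeline` lets `JsonItemExporter` write articleexport.json as a single JSON array. A small sketch of reading each format back; the file names come from the code above, everything else is illustrative:

import json

# article.json: one JSON object per line (JSON Lines style)
with open("article.json", encoding="utf-8") as f:
    line_items = [json.loads(line) for line in f if line.strip()]

# articleexport.json: a single JSON array produced by JsonItemExporter
with open("articleexport.json", encoding="utf-8") as f:
    array_items = json.load(f)

print(len(line_items), len(array_items))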
@@ -0,0 +1,115 @@
# Scrapy settings for ArticleSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import os
import sys

import scrapy.downloadermiddlewares.useragent

import ArticleSpider.pipelines

BOT_NAME = "ArticleSpider"

SPIDER_MODULES = ["ArticleSpider.spiders"]
NEWSPIDER_MODULE = "ArticleSpider.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Cookies (enabled by default)
COOKIES_ENABLED = True
COOKIES_DEBUG = True

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#     "Accept-Language": "en",
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     "ArticleSpider.middlewares.ArticlespiderSpiderMiddleware": 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # "ArticleSpider.middlewares.ArticlespiderDownloaderMiddleware": 543,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     "scrapy.extensions.telnet.TelnetConsole": None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'scrapy.pipelines.images.ImagesPipeline': 1,
    # 'ArticleSpider.pipelines.JsonWithEncodingPipeline': 2,
    # 'ArticleSpider.pipelines.JsonExporterPipeline': 3,
    # 'ArticleSpider.pipelines.MysqlPipeline': 4,
    # 'ArticleSpider.pipelines.MysqlTwistedPipline': 5,
    'ArticleSpider.pipelines.ElasticsearchPipeline': 6,
    'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

# IMAGES_URLS_FIELD = 'front_image_url'
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, 'images')

MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'article_spider'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'qweasdzxc227'
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,14 @@
import hashlib


# generate an md5 hash of the given url
def get_md5(url):
    if isinstance(url, str):
        url = url.encode("utf-8")
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()


if __name__ == '__main__':
    print(get_md5("https://cnblogs.com"))
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Binary file not shown.
@@ -0,0 +1,12 @@
# -*- coding: utf-8 -*-
__author__ = 'bobby'

from scrapy.cmdline import execute

import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy", "crawl", "jobbole"])
# execute(["scrapy", "crawl", "zhihu"])
# execute(["scrapy", "crawl", "lagou"])
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = ArticleSpider.settings

[deploy]
#url = http://localhost:6800/
project = ArticleSpider