parent 0cf5632f60
commit 97316efeb8
@@ -1,44 +1,43 @@
 # -*- coding: utf-8 -*-
-__author__ = 'bobby'

 from datetime import datetime
 from elasticsearch_dsl import DocType, Date, Nested, Boolean, \
     analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer

 from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer

 from elasticsearch_dsl.connections import connections

 connections.create_connection(hosts=["localhost"])


 class CustomAnalyzer(_CustomAnalyzer):
     def get_analysis_definition(self):
         return {}


 ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"])


 class ArticleType(DocType):
     # Jobbole article document type
     suggest = Completion(analyzer=ik_analyzer)
     title = Text(analyzer="ik_max_word")
     create_date = Date()
     url = Keyword()
     url_object_id = Keyword()
     front_image_url = Keyword()
     front_image_path = Keyword()
     praise_nums = Integer()
     comment_nums = Integer()
     fav_nums = Integer()
     tags = Text(analyzer="ik_max_word")
     content = Text(analyzer="ik_max_word")

     class Meta:
         index = "jobbole"
         doc_type = "article"


 if __name__ == "__main__":
     ArticleType.init()
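The CustomAnalyzer subclass above returns an empty analysis definition so that elasticsearch-dsl does not try to define ik_max_word itself at index-creation time; that analyzer is expected to come from the Elasticsearch IK plugin. A minimal usage sketch of the resulting document type (field values are illustrative, and the elasticsearch-dsl 5.x API imported above is assumed):

    from ArticleSpider.models.es_types import ArticleType

    article = ArticleType()
    article.title = "a title"          # illustrative value
    article.content = "article body"   # illustrative value
    article.meta.id = "abc123"         # hypothetical document id
    article.save()                     # writes into the "jobbole" index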
@@ -1,169 +1,138 @@
 # Define your item pipelines here
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


 # useful for handling different item types with a single interface
 from itemadapter import ItemAdapter
 from scrapy.pipelines.images import ImagesPipeline
 from scrapy.http.request import Request
 from ArticleSpider.models.es_types import ArticleType
 import codecs
 import json
 from w3lib.html import remove_tags
 from scrapy.exporters import JsonItemExporter
 import MySQLdb
 from twisted.enterprise import adbapi
 from MySQLdb.cursors import DictCursor


 class ArticlespiderPipeline(object):
     def process_item(self, item, spider):
         return item


 class MysqlPipeline(object):
     def __init__(self):
         self.conn = MySQLdb.connect('127.0.0.1', 'root', 'qweasdzxc227', 'article_spider', charset="utf8",
                                     use_unicode=True)
         self.cursor = self.conn.cursor()

     def process_item(self, item, spider):
         insert_sql = """
             insert into jobbole_article(title, url, url_object_id, front_image_url, front_image_path, parise_nums, comment_nums, fav_nums, tags, content, create_date)
             values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE parise_nums=VALUES(parise_nums)
         """
         params = list()
         params.append(item.get("title", ""))
         params.append(item.get("url", ""))
         params.append(item.get("url_object_id", ""))
         front_image = ','.join(item.get("front_image_url", []))
         params.append(front_image)
         params.append(item.get("front_image_path", ""))
         params.append(item.get("parise_nums", 0))
         params.append(item.get("comment_nums", 0))
         params.append(item.get("fav_nums", 0))
         params.append(item.get("tags", ""))
         params.append(item.get("content", ""))
         params.append(item.get("create_date", "1970-07-01"))
         self.cursor.execute(insert_sql, tuple(params))
         self.conn.commit()
         return item


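Two things worth noting here: 'parise_nums' is kept as spelled because it matches the column name in the existing jobbole_article schema, and self.conn.commit() runs synchronously, blocking the crawl while each row is written; the Twisted-based pipeline below exists to move that insert off the crawl loop.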
 class MysqlTwistedPipline(object):
     def __init__(self, dbpool):
         self.dbpool = dbpool

     @classmethod
     def from_settings(cls, settings):
         dbparms = dict(
             host=settings["MYSQL_HOST"],
             db=settings["MYSQL_DBNAME"],
             user=settings["MYSQL_USER"],
             passwd=settings["MYSQL_PASSWORD"],
             charset='utf8',
             cursorclass=DictCursor,
             use_unicode=True,
         )
         dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
-
         return cls(dbpool)

     def process_item(self, item, spider):
         # use twisted to run the MySQL insert asynchronously
         query = self.dbpool.runInteraction(self.do_insert, item)
         query.addErrback(self.handle_error, item, spider)  # handle exceptions

     def handle_error(self, failure, item, spider):
         # handle exceptions raised by the asynchronous insert
         print(failure)

     def do_insert(self, cursor, item):
         # run the actual insert
         insert_sql = """
             insert into jobbole_article(title, url, url_object_id, front_image_url, front_image_path, parise_nums, comment_nums, fav_nums, tags, content, create_date)
             values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE parise_nums=VALUES(parise_nums)
         """
         params = list()
         params.append(item.get("title", ""))
         params.append(item.get("url", ""))
         params.append(item.get("url_object_id", ""))
         front_image = ','.join(item.get("front_image_url", []))
         params.append(front_image)
         params.append(item.get("front_image_path", ""))
         params.append(item.get("parise_nums", 0))
         params.append(item.get("comment_nums", 0))
         params.append(item.get("fav_nums", 0))
         params.append(item.get("tags", ""))
         params.append(item.get("content", ""))
         params.append(item.get("create_date", "1970-07-01"))
         # build the SQL statement for this item type and insert it into MySQL
         cursor.execute(insert_sql, tuple(params))


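MysqlTwistedPipline reads its connection parameters from the Scrapy settings, so settings.py is assumed to define entries like the following (values are illustrative placeholders, mirroring the hard-coded MysqlPipeline connection above):

    MYSQL_HOST = "127.0.0.1"
    MYSQL_DBNAME = "article_spider"
    MYSQL_USER = "root"
    MYSQL_PASSWORD = "********"  # placeholder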
 class JsonWithEncodingPipeline(object):
     # custom export to a JSON file
     def __init__(self):
         self.file = codecs.open('article.json', 'a', encoding="utf-8")

     def process_item(self, item, spider):
         lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
         self.file.write(lines)
         return item

     def spider_closed(self, spider):
         self.file.close()


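A caveat on JsonWithEncodingPipeline: Scrapy calls open_spider/close_spider on pipeline components automatically, but a method named spider_closed only runs if it is explicitly connected to the spider_closed signal, so article.json is never closed as written. A minimal fix sketch:

    def close_spider(self, spider):
        self.file.close()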
 class JsonExporterPipeline(object):
     def __init__(self):
         self.file = open('articleexport.json', 'wb')
         self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
         self.exporter.start_exporting()

     def close_spider(self, spider):
         self.exporter.finish_exporting()
         self.file.close()

     def process_item(self, item, spider):
         self.exporter.export_item(item)


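A second caveat: JsonExporterPipeline.process_item does not return the item, so any pipeline ordered after it receives None. Scrapy expects process_item to return the item (or raise DropItem), i.e.:

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item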
-class ArticleImagePipeline(ImagesPipeline):
-    def item_completed(self, results, item, info):
-        try:
-            if "front_image_url" in item:
-                image_file_path = ''
-                for ok, value in results:
-                    image_file_path = value["path"]
-                item["front_image_path"] = image_file_path
-            return item
-        except Exception as e:
-            print(e)
-            item['front_image_path'] = 'image not available'
-            return item
-
-
 class ElasticsearchPipeline(object):
     # write the data into ES
     def process_item(self, item, spider):
-        # article = ArticleType()
-        # article.title = item['title']
-        # article.create_date = item['create_date']
-        # article.content = remove_tags(item['content'])
-        # article.front_image_url = item['front_image_url']
-        # if 'front_image_path' in item:
-        #     article.front_image_path = item['front_image_path']
-        # article.praise_nums = item['praise_nums']
-        # article.fav_nums = item['fav_nums']
-        # article.comment_nums = item['comment_nums']
-        # article.url = item['url']
-        # article.tags = item['tags']
-        # article.meta.id = item['url_object_id']
-        # article.save()
-        # convert the item into ES data
         item.save_to_es()
         return item
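item.save_to_es() is defined on the item class, which is not part of this commit; a plausible sketch of it, reconstructed from the commented-out code removed above (the method name and field keys come from this file, everything else is an assumption):

    from w3lib.html import remove_tags
    from ArticleSpider.models.es_types import ArticleType

    def save_to_es(self):
        # hypothetical method on the article item, mirroring the removed inline code
        article = ArticleType()
        article.title = self["title"]
        article.create_date = self["create_date"]
        article.content = remove_tags(self["content"])
        article.front_image_url = self["front_image_url"]
        if "front_image_path" in self:
            article.front_image_path = self["front_image_path"]
        article.praise_nums = self["praise_nums"]
        article.fav_nums = self["fav_nums"]
        article.comment_nums = self["comment_nums"]
        article.url = self["url"]
        article.tags = self["tags"]
        article.meta.id = self["url_object_id"]
        article.save()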
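As the header comment in pipelines.py notes, these pipelines only run once registered in the ITEM_PIPELINES setting; a sketch of that wiring (the ArticleSpider.pipelines module path and the priority numbers are assumptions):

    ITEM_PIPELINES = {
        "ArticleSpider.pipelines.MysqlTwistedPipline": 2,
        "ArticleSpider.pipelines.ElasticsearchPipeline": 3,
    }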