main
qweasdzxc227 6 months ago
parent 0cf5632f60
commit 97316efeb8

@@ -1,5 +1,4 @@
# -*- coding: utf-8 -*-
__author__ = 'bobby'
from datetime import datetime
from elasticsearch_dsl import DocType, Date, Nested, Boolean, \

@@ -69,7 +69,6 @@ class MysqlTwistedPipline(object):
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
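For orientation, here is a minimal sketch of the full Twisted MySQL pipeline this hunk belongs to; the MYSQL_* setting names and the do_insert/handle_error helpers are assumptions reconstructed around the lines shown above, not part of this commit:

import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi

class MysqlTwistedPipline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        # Assumed setting names; adjust to the project's settings.py
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # Run the blocking insert on Twisted's thread pool so the
        # reactor is never blocked by MySQL I/O
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)
        return item

    def do_insert(self, cursor, item):
        # get_insert_sql() is an assumed item helper returning (sql, params)
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)

    def handle_error(self, failure, item, spider):
        print(failure)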
@@ -132,38 +131,8 @@ class JsonExporterPipeline(object):
        self.exporter.export_item(item)

class ArticleImagePipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        try:
            if "front_image_url" in item:
                image_file_path = ''
                for ok, value in results:
                    image_file_path = value["path"]
                item["front_image_path"] = image_file_path
            return item
        except Exception as e:
            print(e)
            item['front_image_path'] = 'image not available'  # originally: '图片不可用'
            return item
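A note on item_completed: results is a list of (success, file_info) tuples produced by Scrapy's ImagesPipeline, and file_info["path"] is the stored path relative to IMAGES_STORE. A hedged sketch of the settings wiring such a pipeline usually needs (the module path is an assumption):

# settings.py sketch; 'ArticleSpider.pipelines' is an assumed module path
import os

ITEM_PIPELINES = {
    'ArticleSpider.pipelines.ArticleImagePipeline': 1,
}
IMAGES_URLS_FIELD = "front_image_url"  # item field holding the image URLs
project_dir = os.path.dirname(os.path.abspath(__file__))
IMAGES_STORE = os.path.join(project_dir, 'images')  # where files land on disk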
class ElasticsearchPipeline(object):
    # Write the scraped data into Elasticsearch
    def process_item(self, item, spider):
        # article = ArticleType()
        # article.title = item['title']
        # article.create_date = item['create_date']
        # article.content = remove_tags(item['content'])
        # article.front_image_url = item['front_image_url']
        # if 'front_image_path' in item:
        #     article.front_image_path = item['front_image_path']
        # article.praise_nums = item['praise_nums']
        # article.fav_nums = item['fav_nums']
        # article.comment_nums = item['comment_nums']
        # article.url = item['url']
        # article.tags = item['tags']
        # article.meta.id = item['url_object_id']
        # article.save()
        # Convert the item into ES data and save it
        item.save_to_es()
        return item
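The commented-out block above was evidently moved into a save_to_es() method on the item itself; a minimal sketch of that method, reconstructed directly from the commented lines (remove_tags comes from w3lib.html; the item class name and the ArticleType import path are assumptions):

import scrapy
from w3lib.html import remove_tags
from models.es_types import ArticleType  # assumed module path

class JobBoleArticleItem(scrapy.Item):  # item class name is an assumption
    # ... Field definitions elided ...

    def save_to_es(self):
        # Mirrors the commented-out block in ElasticsearchPipeline
        article = ArticleType()
        article.title = self['title']
        article.create_date = self['create_date']
        article.content = remove_tags(self['content'])
        article.front_image_url = self['front_image_url']
        if 'front_image_path' in self:
            article.front_image_path = self['front_image_path']
        article.praise_nums = self['praise_nums']
        article.fav_nums = self['fav_nums']
        article.comment_nums = self['comment_nums']
        article.url = self['url']
        article.tags = self['tags']
        article.meta.id = self['url_object_id']
        article.save()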

@@ -50,6 +50,7 @@ class JobboleSpider(scrapy.Spider):
            cookie_dict[cookie['name']] = cookie['value']
        for url in self.start_urls:
            yield scrapy.Request(url, dont_filter=True, cookies=cookie_dict)
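For context, cookie_dict here is typically filled from a real browser session before the start requests are issued; a hedged sketch of one common approach — a selenium login whose cookies are copied over (the login URL and flow are assumptions, only the cookie-copying loop appears in this diff):

# Assumed login flow; not part of this commit
from selenium import webdriver

browser = webdriver.Chrome()
browser.get("https://account.cnblogs.com/signin")  # illustrative URL
# ... fill in credentials and submit the login form here ...
cookie_dict = {}
for cookie in browser.get_cookies():
    cookie_dict[cookie['name']] = cookie['value']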
    def parse(self, response):
        # 1. Extract the article URLs from the news list page, hand them to
        #    scrapy for download, then call the corresponding parse method
        # Extract the article link; extract_first() returns the first value
@@ -57,7 +58,8 @@ class JobboleSpider(scrapy.Spider):
        for post_node in post_nodes:
            image_url = "https:" + post_node.css('.entry_summary a img::attr(src)').extract_first("")
            post_url = post_node.css('h2 a::attr(href)').extract_first("")
-           yield Request(url=parse.urljoin(response.url, post_url), meta={'front_image_url': image_url}, callback=self.parse_detail, dont_filter=True)
+           yield Request(url=parse.urljoin(response.url, post_url), meta={'front_image_url': image_url},
+                         callback=self.parse_detail, dont_filter=True)
        # 2. Extract the next-page URL and hand it to scrapy for download;
        #    once downloaded, parse is called again to follow up
        next_url = response.css('div.pager a:last-child::attr(href)').extract_first("")
        yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
@@ -74,9 +76,10 @@ class JobboleSpider(scrapy.Spider):
        item_loader.add_value('url', response.url)
        item_loader.add_value('front_image_url', response.meta.get('front_image_url', ''))
        yield Request(url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
-                     meta={'article_item': item_loader, 'url':response.url}, callback=self.parse_nums)
+                     meta={'article_item': item_loader, 'url': response.url}, callback=self.parse_nums)

    def parse_nums(self, response):
        # Extract the praise, favorite, and comment counts
        j_data = json.loads(response.text)
        item_loader = response.meta.get('article_item', "")
        item_loader.add_value('praise_nums', j_data['DiggCount'])
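The hunk cuts off here; a hedged sketch of how parse_nums typically finishes (the TotalView and CommentCount JSON keys are assumptions — only DiggCount appears in this diff):

        # Assumed continuation; field names beyond DiggCount are guesses
        item_loader.add_value('fav_nums', j_data['TotalView'])
        item_loader.add_value('comment_nums', j_data['CommentCount'])
        article_item = item_loader.load_item()
        yield article_item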

@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
__author__ = 'bobby'
from scrapy.cmdline import execute
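This last hunk touches the project's main.py debug entry point; its typical full contents look like the following sketch (the spider name "jobbole" matches JobboleSpider above; the sys.path setup is assumed boilerplate):

# -*- coding: utf-8 -*-
import sys
import os
from scrapy.cmdline import execute

# Put the project root on sys.path so 'scrapy crawl' can resolve the project
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy", "crawl", "jobbole"])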
