main
qweasdzxc227 6 months ago
parent b71f4f6a8f
commit c0b1aa5823

@@ -66,6 +66,7 @@ class JobBoleArticleItem(scrapy.Item):
         output_processor=Join(separator=',')
     )  # tags
     content = scrapy.Field()  # content
+    where_from = scrapy.Field()  # source

     def save_to_es(self):
         article = ArticleType()
@@ -80,6 +81,7 @@ class JobBoleArticleItem(scrapy.Item):
         article.comment_nums = self['comment_nums']
         article.url = self['url']
         article.tags = self['tags']
+        article.where_from = self['where_from']
         article.meta.id = self['url_object_id']
         article.suggest = gen_suggests(ArticleType._doc_type.index, ((article.title, 10), (article.tags, 7)))
         article.save()
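Note: the other loader-populated fields on this item declare explicit processors, while `where_from` is left with Scrapy's defaults, so the loader will hand it over as a list unless the project's loader already sets a `default_output_processor`. A minimal sketch of how the field could be declared to yield a single clean string (the `TakeFirst`/`MapCompose` processors here are an assumption, not what this commit does):

    from scrapy.loader.processors import MapCompose, TakeFirst

    where_from = scrapy.Field(
        input_processor=MapCompose(str.strip),  # strip surrounding whitespace
        output_processor=TakeFirst()            # keep only the first matched text node
    )  # source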

@@ -33,6 +33,7 @@ class ArticleType(DocType):
     fav_nums = Integer()
     tags = Text(analyzer="ik_max_word")
     content = Text(analyzer="ik_max_word")
+    where_from = Text(analyzer="ik_max_word")

     class Meta:
         index = "jobbole"

@@ -32,8 +32,8 @@ class MysqlPipeline(object):
     def process_item(self, item, spider):
         insert_sql = """
-            insert into jobbole_article(title, url, url_object_id, front_image_url, front_image_path, parise_nums, comment_nums, fav_nums, tags, content, create_date)
-            values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE parise_nums=VALUES(parise_nums)
+            insert into jobbole_article(title, url, url_object_id, front_image_url, front_image_path, parise_nums, comment_nums, fav_nums, tags, content, create_date, where_from)
+            values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE parise_nums=VALUES(parise_nums)
         """
         params = list()
         params.append(item.get("title", ""))
@@ -48,6 +48,7 @@ class MysqlPipeline(object):
         params.append(item.get("tags", ""))
         params.append(item.get("content", ""))
         params.append(item.get("create_date", "1970-07-01"))
+        params.append(item.get("where_from", ""))
         self.cursor.execute(insert_sql, tuple(params))
         self.conn.commit()
         return item
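Note: both insert statements now reference a `where_from` column, so the `jobbole_article` table needs that column before either pipeline runs. A one-off migration sketch (the connection parameters and VARCHAR length are assumptions; match them to the project's settings):

    import pymysql

    conn = pymysql.connect(host="localhost", user="root", password="",
                           db="article_spider", charset="utf8")
    with conn.cursor() as cursor:
        cursor.execute(
            "ALTER TABLE jobbole_article "
            "ADD COLUMN where_from VARCHAR(255) NOT NULL DEFAULT ''"
        )
    conn.commit()
    conn.close()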
@@ -83,8 +84,8 @@ class MysqlTwistedPipline(object):
     def do_insert(self, cursor, item):
         # perform the actual insert
         insert_sql = """
-            insert into jobbole_article(title, url, url_object_id, front_image_url, front_image_path, parise_nums, comment_nums, fav_nums, tags, content, create_date)
-            values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE parise_nums=VALUES(parise_nums)
+            insert into jobbole_article(title, url, url_object_id, front_image_url, front_image_path, parise_nums, comment_nums, fav_nums, tags, content, create_date, where_from)
+            values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE parise_nums=VALUES(parise_nums)
         """
         params = list()
         params.append(item.get("title", ""))
@@ -99,6 +100,7 @@ class MysqlTwistedPipline(object):
         params.append(item.get("tags", ""))
         params.append(item.get("content", ""))
         params.append(item.get("create_date", "1970-07-01"))
+        params.append(item.get("where_from", ""))
         # build the SQL statement for each item type and insert it into MySQL
         cursor.execute(insert_sql, tuple(params))
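For context, `do_insert` is not called directly; in the usual Twisted-based variant of this pipeline it runs inside a connection-pool transaction so the insert never blocks the crawl. A sketch of that wiring (the settings keys and the driver choice are assumptions):

    from twisted.enterprise import adbapi

    class MysqlTwistedPipline(object):
        def __init__(self, dbpool):
            self.dbpool = dbpool

        @classmethod
        def from_settings(cls, settings):
            dbpool = adbapi.ConnectionPool(
                "pymysql",                        # assumed DB-API driver
                host=settings["MYSQL_HOST"],
                db=settings["MYSQL_DBNAME"],
                user=settings["MYSQL_USER"],
                passwd=settings["MYSQL_PASSWORD"],
                charset="utf8",
            )
            return cls(dbpool)

        def process_item(self, item, spider):
            # run do_insert in a pooled transaction; log failures instead of crashing
            query = self.dbpool.runInteraction(self.do_insert, item)
            query.addErrback(lambda failure: spider.logger.error(failure))
            return item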

@@ -74,8 +74,8 @@ ITEM_PIPELINES = {
     # 'ArticleSpider.pipelines.JsonWithEncodingPipeline': 2,
     # 'ArticleSpider.pipelines.JsonExporterPipeline': 3,
     # 'ArticleSpider.pipelines.MysqlPipeline': 4,
-    # 'ArticleSpider.pipelines.MysqlTwistedPipline': 5,
-    'ArticleSpider.pipelines.ElasticsearchPipeline': 6,
+    'ArticleSpider.pipelines.MysqlTwistedPipline': 5,
+    # 'ArticleSpider.pipelines.ElasticsearchPipeline': 6,
     'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
 }
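Net effect of this settings change: items are now persisted through the asynchronous MySQL pipeline instead of Elasticsearch (lower `ITEM_PIPELINES` values run first, so the MySQL insert happens before `ArticlespiderPipeline`). With `ElasticsearchPipeline` commented out, `save_to_es()` above is never invoked, so the new `article.where_from` assignment only takes effect if that pipeline is re-enabled.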

@@ -54,7 +54,7 @@ class JobboleSpider(scrapy.Spider):
     def parse(self, response):
         # 1. Get the news URLs from the list page, hand them to Scrapy for download, then call the corresponding parse method
         # Extract the article links; extract_first() takes the first value
-        post_nodes = response.css('#news_list .news_block')[:100]
+        post_nodes = response.css('#news_list .news_block')[:1]
         for post_node in post_nodes:
             image_url = "https:" + post_node.css('.entry_summary a img::attr(src)').extract_first("")
             post_url = post_node.css('h2 a::attr(href)').extract_first("")
@@ -73,6 +73,7 @@ class JobboleSpider(scrapy.Spider):
         item_loader.add_css('content', '#news_content')
         item_loader.add_css('tags', '.news_tags a::text')
         item_loader.add_css('create_date', '#news_info .time::text')
+        item_loader.add_xpath('where_from', '//*[@id="link_source2"]/text()')
         item_loader.add_value('url', response.url)
         item_loader.add_value('front_image_url', response.meta.get('front_image_url', ''))
         yield Request(url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
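Note: when `#link_source2` is absent from a news page, `add_xpath` simply adds nothing, and `save_to_es` would then raise `KeyError` on `self['where_from']` (the MySQL paths are safe thanks to `item.get("where_from", "")`). A defensive alternative, sketched with the same selector:

    # fall back to an empty string when the source element is missing
    where_from = response.xpath('//*[@id="link_source2"]/text()').extract_first("")
    item_loader.add_value('where_from', where_from)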
