commit d737bee569 (parent 618ebdb791) on branch main, by qweasdzxc227, 6 months ago

@@ -1,8 +1,8 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <module type="PYTHON_MODULE" version="4">
   <component name="NewModuleRootManager">
     <content url="file://$MODULE_DIR$" />
     <orderEntry type="jdk" jdkName="Python 3.9 (article_spider)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
 </module>

@@ -1,64 +1,86 @@
 # Define here the models for your scraped items
 #
 # See documentation in:
 #     https://docs.scrapy.org/en/latest/topics/items.html
 import re
 import scrapy
 from scrapy.loader.processors import MapCompose, TakeFirst, Identity, Join
 from scrapy.loader import ItemLoader
 from ArticleSpider.models.es_types import ArticleType
 from w3lib.html import remove_tags
+from elasticsearch_dsl.connections import connections
+
+es = connections.create_connection(ArticleType._doc_type.using)


 class ArticlespiderItem(scrapy.Item):
     # define the fields for your item here like:
     # name = scrapy.Field()
     pass


+def gen_suggests(index, info_tuple):
+    # Build the search-suggestion array from the given (text, weight) pairs
+    used_words = set()  # tokens already seen, for de-duplication
+    suggests = []
+    for text, weight in info_tuple:
+        if text:
+            # Call the ES analyze endpoint to tokenize the string
+            words = es.indices.analyze(index=index, analyzer='ik_max_word', params={'filter': ['lowercase']}, body=text)
+            analyzed_words = set([r['token'] for r in words['tokens'] if len(r['token']) > 1])
+            new_words = analyzed_words - used_words
+        else:
+            new_words = set()
+        if new_words:
+            suggests.append({'input': list(new_words), 'weight': weight})
+    return suggests
+
+
 def date_convert(value):
     match_re = re.match('.*?(\d+.*)', value)
     if match_re:
         return match_re.group(1)
     else:
         return '1970-07-01'


 class ArticleItemLoader(ItemLoader):
     default_output_processor = TakeFirst()


 class JobBoleArticleItem(scrapy.Item):
     title = scrapy.Field()  # title
     create_date = scrapy.Field(
         input_processor=MapCompose(date_convert)
     )  # publish date
     url = scrapy.Field()  # link
     url_object_id = scrapy.Field()  # link id
     front_image_url = scrapy.Field(
         output_processor=Identity()
     )  # cover image
     front_image_path = scrapy.Field()  # cover image path
     praise_nums = scrapy.Field()  # number of upvotes
     comment_nums = scrapy.Field()  # number of comments
     fav_nums = scrapy.Field()  # number of favorites
     tags = scrapy.Field(
         output_processor=Join(separator=',')
     )  # tags
     content = scrapy.Field()  # content

     def save_to_es(self):
         article = ArticleType()
         article.title = self['title']
         article.create_date = self['create_date']
         article.content = remove_tags(self['content'])
         article.front_image_url = self['front_image_url']
         if 'front_image_path' in self:
             article.front_image_path = self['front_image_path']
         article.praise_nums = self['praise_nums']
         article.fav_nums = self['fav_nums']
         article.comment_nums = self['comment_nums']
         article.url = self['url']
         article.tags = self['tags']
         article.meta.id = self['url_object_id']
+        article.suggest = gen_suggests(ArticleType._doc_type.index, ((article.title, 10), (article.tags, 7)))
         article.save()
         return
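items.py imports ArticleType from ArticleSpider.models.es_types, which is not touched by this commit. For the new article.suggest assignment to work, that mapping needs a Completion field analyzed with ik_max_word. A minimal sketch of what es_types.py presumably contains, using the elasticsearch-dsl 5.x API; the index and doc-type names and the CustomAnalyzer workaround are assumptions, not taken from this commit:

# ArticleSpider/models/es_types.py, hypothetical sketch
from elasticsearch_dsl import DocType, Date, Integer, Keyword, Text, Completion
from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer


class CustomAnalyzer(_CustomAnalyzer):
    # elasticsearch-dsl tries to inline the full analyzer definition into the
    # Completion mapping; returning an empty definition lets the mapping refer
    # to the server-side ik_max_word analyzer by name instead.
    def get_analysis_definition(self):
        return {}


ik_analyzer = CustomAnalyzer('ik_max_word', filter=['lowercase'])


class ArticleType(DocType):
    suggest = Completion(analyzer=ik_analyzer)  # filled by gen_suggests()
    title = Text(analyzer='ik_max_word')
    create_date = Date()
    url = Keyword()
    url_object_id = Keyword()
    front_image_url = Keyword()
    front_image_path = Keyword()
    praise_nums = Integer()
    comment_nums = Integer()
    fav_nums = Integer()
    tags = Text(analyzer='ik_max_word')
    content = Text(analyzer='ik_max_word')

    class Meta:
        index = 'jobbole'      # assumed; items.py reads it via ArticleType._doc_type.index
        doc_type = 'article'   # assumed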

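The {'input': [...], 'weight': n} dicts returned by gen_suggests are the standard payload of an Elasticsearch completion suggester, so title tokens (weight 10) outrank tag tokens (weight 7) in search-as-you-type results. Note that, as committed, used_words is never updated, so a token occurring in both title and tags is emitted twice with different weights. A sketch of how the field would be queried, using the ES 5.x suggest syntax with an illustrative index name and suggester name:

# Hypothetical prefix lookup against the suggest field written by save_to_es()
from elasticsearch import Elasticsearch

es = Elasticsearch()
res = es.search(index='jobbole', body={
    '_source': False,
    'suggest': {
        'article_suggest': {   # arbitrary suggester name
            'prefix': 'scra',  # what the user has typed so far
            'completion': {'field': 'suggest', 'size': 10, 'fuzzy': {'fuzziness': 2}},
        }
    },
})
for option in res['suggest']['article_suggest'][0]['options']:
    print(option['text'], option['_score'])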
@@ -1,12 +1,10 @@
 # -*- coding: utf-8 -*-
 __author__ = 'bobby'
 from scrapy.cmdline import execute

 import sys
 import os

 sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 execute(["scrapy", "crawl", "jobbole"])
-# execute(["scrapy", "crawl", "zhihu"])
-# execute(["scrapy", "crawl", "lagou"])
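scrapy.cmdline.execute hands the process over to a single crawl command and never returns, which is why the zhihu and lagou invocations existed only as commented-out alternatives and are dropped here. To run several spiders from one debug script, Scrapy's CrawlerProcess API is an option; a sketch, assuming zhihu and lagou spiders are still registered in the project:

# Hypothetical multi-spider runner; execute() can only ever run one command
import os
import sys

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

sys.path.append(os.path.dirname(os.path.abspath(__file__)))

process = CrawlerProcess(get_project_settings())
process.crawl('jobbole')  # spiders are looked up by their `name` attribute
# process.crawl('zhihu')
# process.crawl('lagou')
process.start()  # blocks until every queued crawl finishes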
