# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import re
import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst, Identity, Join
from scrapy.loader import ItemLoader
from ArticleSpider.models.es_types import ArticleType
from w3lib.html import remove_tags
from elasticsearch_dsl.connections import connections

# Elasticsearch client used by gen_suggests(). It is created once, at import
# time, passing the ``using`` value of ArticleType's doc type as the
# connection alias.
es = connections.create_connection(ArticleType._doc_type.using)


class ArticlespiderItem(scrapy.Item):
    """Placeholder item generated by the Scrapy project template."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


def gen_suggests(index, info_tuple):
    """Build the list of search-suggestion entries for one document.

    Args:
        index: Name of the Elasticsearch index passed to the analyze API.
        info_tuple: Iterable of ``(text, weight)`` pairs. Earlier pairs take
            precedence: a token produced by an earlier pair is not repeated
            in the entry of a later pair.

    Returns:
        A list of ``{'input': [token, ...], 'weight': weight}`` dicts, one per
        pair that contributed at least one new token. Pairs whose text is
        empty/falsy, or that yield no new token, are skipped. The order of
        tokens inside ``'input'`` is unspecified (it comes from a set).
    """
    used_words = set()  # tokens already emitted by an earlier pair
    suggests = []
    for text, weight in info_tuple:
        if not text:
            continue
        # Ask Elasticsearch's analyze API to tokenize the text.
        words = es.indices.analyze(index=index, analyzer='ik_max_word', params={'filter': ['lowercase']}, body=text)
        # Single-character tokens are dropped.
        analyzed_words = {r['token'] for r in words['tokens'] if len(r['token']) > 1}
        new_words = analyzed_words - used_words
        if new_words:
            # Record the tokens so later (lower-priority) pairs do not
            # repeat them; without this the de-duplication never happens.
            used_words |= new_words
            suggests.append({'input': list(new_words), 'weight': weight})
    return suggests


def date_convert(value):
    """Extract the date portion of a scraped string.

    Returns the part of ``value`` from its first digit to the end of that
    line, or the fallback ``'1970-07-01'`` when the pattern does not match
    (for example, when ``value`` contains no digit).
    """
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    match_re = re.match(r'.*?(\d+.*)', value)
    if match_re:
        return match_re.group(1)
    return '1970-07-01'


class ArticleItemLoader(ItemLoader):
    """Item loader whose fields default to the first extracted value."""
    default_output_processor = TakeFirst()


class JobBoleArticleItem(scrapy.Item):
    """Article item scraped by the jobbole spider, with Elasticsearch export."""
    title = scrapy.Field()  # title
    create_date = scrapy.Field(
        input_processor=MapCompose(date_convert)
    )  # publication date
    url = scrapy.Field()  # article URL
    url_object_id = scrapy.Field()  # id derived from the URL
    front_image_url = scrapy.Field(
        output_processor=Identity()
    )  # cover image URL(s); Identity() keeps the value as a list
    front_image_path = scrapy.Field()  # cover image path
    praise_nums = scrapy.Field()  # number of likes
    comment_nums = scrapy.Field()  # number of comments
    fav_nums = scrapy.Field()  # number of favorites
    tags = scrapy.Field(
        output_processor=Join(separator=',')
    )  # tags, joined into one comma-separated string
    content = scrapy.Field()  # article body (HTML)

    def save_to_es(self):
        """Copy this item into an ``ArticleType`` document and save it.

        ``content`` has its HTML tags removed, ``front_image_path`` is copied
        only when present, and the document id is ``url_object_id``.
        Suggestions are generated from the title (weight 10) and the tags
        (weight 7). Raises ``KeyError`` if any other field read here is
        missing from the item.
        """
        article = ArticleType()
        article.title = self['title']
        article.create_date = self['create_date']
        article.content = remove_tags(self['content'])
        article.front_image_url = self['front_image_url']
        if 'front_image_path' in self:
            article.front_image_path = self['front_image_path']
        article.praise_nums = self['praise_nums']
        article.fav_nums = self['fav_nums']
        article.comment_nums = self['comment_nums']
        article.url = self['url']
        article.tags = self['tags']
        article.meta.id = self['url_object_id']
        article.suggest = gen_suggests(ArticleType._doc_type.index, ((article.title, 10), (article.tags, 7)))
        article.save()
# -*- coding: utf-8 -*-
# Launcher script: runs the "jobbole" spider through Scrapy's command-line
# entry point, which makes it convenient to start or debug from an IDE.
__author__ = 'bobby'

import os
import sys

from scrapy.cmdline import execute

# Append this script's own directory to the import path before starting.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# Same argv as running `scrapy crawl jobbole` in a shell.
execute(["scrapy", "crawl", "jobbole"])