parent 618ebdb791
commit d737bee569
@@ -1,8 +1,8 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <module type="PYTHON_MODULE" version="4">
   <component name="NewModuleRootManager">
     <content url="file://$MODULE_DIR$" />
     <orderEntry type="jdk" jdkName="Python 3.9 (article_spider)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
 </module>
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,64 +1,86 @@
 # Define here the models for your scraped items
 #
 # See documentation in:
 # https://docs.scrapy.org/en/latest/topics/items.html
 import re
 import scrapy
 from scrapy.loader.processors import MapCompose, TakeFirst, Identity, Join
 from scrapy.loader import ItemLoader
 from ArticleSpider.models.es_types import ArticleType
 from w3lib.html import remove_tags
+from elasticsearch_dsl.connections import connections
+
+es = connections.create_connection(ArticleType._doc_type.using)
+
 
 
 class ArticlespiderItem(scrapy.Item):
     # define the fields for your item here like:
     # name = scrapy.Field()
     pass
 
 
+def gen_suggests(index, info_tuple):
+    # build the search-suggestion word list from the given strings
+    used_words = set()  # for de-duplication
+    suggests = []
+    for text, weight in info_tuple:
+        if text:
+            # call the es analyze endpoint to tokenize the string
+            words = es.indices.analyze(index=index, analyzer='ik_max_word', params={'filter': ['lowercase']}, body=text)
+            anylyzed_words = set([r['token'] for r in words['tokens'] if len(r['token']) > 1])
+            new_words = anylyzed_words - used_words
+        else:
+            new_words = set()
+        if new_words:
+            suggests.append({'input': list(new_words), 'weight': weight})
+    return suggests
+
+
 def date_convert(value):
     match_re = re.match('.*?(\d+.*)', value)
     if match_re:
         return match_re.group(1)
     else:
         return '1970-07-01'
 
 
 class ArticleItemLoader(ItemLoader):
     default_output_processor = TakeFirst()
 
 class JobBoleArticleItem(scrapy.Item):
     title = scrapy.Field()  # title
     create_date = scrapy.Field(
         input_processor=MapCompose(date_convert)
     )  # publish date
     url = scrapy.Field()  # link
     url_object_id = scrapy.Field()  # link id
     front_image_url = scrapy.Field(
         output_processor=Identity()
     )  # cover image
     front_image_path = scrapy.Field()  # cover image path
     praise_nums = scrapy.Field()  # upvote count
     comment_nums = scrapy.Field()  # comment count
     fav_nums = scrapy.Field()  # favorite count
     tags = scrapy.Field(
         output_processor=Join(separator=',')
     )  # tags
     content = scrapy.Field()  # content
 
     def save_to_es(self):
         article = ArticleType()
         article.title = self['title']
         article.create_date = self['create_date']
         article.content = remove_tags(self['content'])
         article.front_image_url = self['front_image_url']
         if 'front_image_path' in self:
             article.front_image_path = self['front_image_path']
         article.praise_nums = self['praise_nums']
         article.fav_nums = self['fav_nums']
         article.comment_nums = self['comment_nums']
         article.url = self['url']
         article.tags = self['tags']
         article.meta.id = self['url_object_id']
+        article.suggest = gen_suggests(ArticleType._doc_type.index, ((article.title, 10), (article.tags, 7)))
         article.save()
         return
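The core of this commit is gen_suggests: each (text, weight) pair is tokenized through the ik_max_word analyzer and the tokens are collected into weighted input groups for an Elasticsearch completion suggester, which save_to_es then stores on the document's suggest field. Two notes. First, as committed, used_words is never updated inside the loop, so the de-duplication set stays empty and tokens repeated across later tuples are not filtered; adding used_words.update(new_words) after new_words is computed would make the de-duplication effective. Second, the diff assumes ArticleType (defined in ArticleSpider/models/es_types.py, which is not part of this diff) maps suggest as a Completion field. A minimal sketch of such a mapping and of querying it, following elasticsearch_dsl 5.x conventions; the analyzer workaround, index name, and field set below are illustrative assumptions, not code from this commit:

# Illustrative sketch only (not part of this commit): a mapping under
# elasticsearch_dsl 5.x that would support the suggest array built above.
from elasticsearch_dsl import DocType, Completion, Date, Keyword, Text
from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer


class CustomAnalyzer(_CustomAnalyzer):
    # Workaround so a server-side analyzer (ik_max_word) can be referenced
    # by name without elasticsearch_dsl trying to define it in the mapping.
    def get_analysis_definition(self):
        return {}


ik_analyzer = CustomAnalyzer('ik_max_word', filter=['lowercase'])


class ArticleType(DocType):
    suggest = Completion(analyzer=ik_analyzer)  # populated by gen_suggests()
    title = Text(analyzer='ik_max_word')
    tags = Text(analyzer='ik_max_word')
    url = Keyword()
    create_date = Date()

    class Meta:
        index = 'jobbole'     # assumed index name
        doc_type = 'article'  # assumed doc type


# Querying the suggester (also a sketch): a completion request against the
# field populated by save_to_es(), e.g. for search-as-you-type on 'pyth'.
s = ArticleType.search()
s = s.suggest('title_suggest', 'pyth',
              completion={'field': 'suggest', 'fuzzy': {'fuzziness': 2}, 'size': 10})
suggestions = s.execute_suggest()
for option in suggestions.title_suggest[0].options:
    print(option.text)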
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,12 +1,10 @@
 # -*- coding: utf-8 -*-
 __author__ = 'bobby'
 
 from scrapy.cmdline import execute
 
 import sys
 import os
 
 sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 execute(["scrapy", "crawl", "jobbole"])
-# execute(["scrapy", "crawl", "zhihu"])
-# execute(["scrapy", "crawl", "lagou"])
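For reference, the same entry point can be written against Scrapy's crawler API instead of scrapy.cmdline.execute. A minimal sketch (not part of this commit), assuming it is run from the project root so the ArticleSpider settings resolve:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project's settings.py (requires scrapy.cfg to be discoverable).
process = CrawlerProcess(get_project_settings())
process.crawl('jobbole')  # spider name as registered in the project
process.start()           # blocks until the crawl finishes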