main
qweasdzxc227 6 months ago
parent 618ebdb791
commit d737bee569

@ -8,6 +8,10 @@ from scrapy.loader.processors import MapCompose, TakeFirst, Identity, Join
from scrapy.loader import ItemLoader from scrapy.loader import ItemLoader
from ArticleSpider.models.es_types import ArticleType from ArticleSpider.models.es_types import ArticleType
from w3lib.html import remove_tags from w3lib.html import remove_tags
from elasticsearch_dsl.connections import connections
# Module-level Elasticsearch connection, created from the value of
# ArticleType._doc_type.using; gen_suggests calls es.indices.analyze on it.
es = connections.create_connection(ArticleType._doc_type.using)
class ArticlespiderItem(scrapy.Item): class ArticlespiderItem(scrapy.Item):
# define the fields for your item here like: # define the fields for your item here like:
@ -15,6 +19,23 @@ class ArticlespiderItem(scrapy.Item):
pass pass
def gen_suggests(index, info_tuple):
    """Build the list of search-suggestion entries for a set of strings.

    Every non-empty text is sent to the Elasticsearch analyze API
    (``ik_max_word`` analyzer with the ``lowercase`` filter). Tokens longer
    than one character that were not already emitted for an earlier entry
    are grouped under that entry's weight.

    Args:
        index: Name of the index passed to the analyze call.
        info_tuple: Iterable of ``(text, weight)`` pairs. Entries are
            processed in order, so a token shared by several texts is kept
            only for the first text that produces it.

    Returns:
        A list of ``{'input': [token, ...], 'weight': weight}`` dicts, one
        per entry that contributed at least one new token.
    """
    used_words = set()  # tokens already assigned to an earlier entry (dedup)
    suggests = []
    for text, weight in info_tuple:
        if not text:
            # Empty / missing text contributes nothing.
            continue
        # Ask Elasticsearch to tokenize the string via the analyze API.
        words = es.indices.analyze(index=index, analyzer='ik_max_word', params={'filter': ['lowercase']}, body=text)
        # Single-character tokens are dropped.
        analyzed_words = {r['token'] for r in words['tokens'] if len(r['token']) > 1}
        new_words = analyzed_words - used_words
        if new_words:
            # Remember what has been emitted; without this update the
            # subtraction above is always against an empty set and the same
            # token ends up under several weights.
            used_words |= new_words
            suggests.append({'input': list(new_words), 'weight': weight})
    return suggests
def date_convert(value): def date_convert(value):
match_re = re.match('.*?(\d+.*)', value) match_re = re.match('.*?(\d+.*)', value)
if match_re: if match_re:
@ -60,5 +81,6 @@ class JobBoleArticleItem(scrapy.Item):
article.url = self['url'] article.url = self['url']
article.tags = self['tags'] article.tags = self['tags']
article.meta.id = self['url_object_id'] article.meta.id = self['url_object_id']
article.suggest = gen_suggests(ArticleType._doc_type.index, ((article.title, 10), (article.tags, 7)))
article.save() article.save()
return return

@ -8,5 +8,3 @@ import os
sys.path.append(os.path.dirname(os.path.abspath(__file__))) sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy", "crawl", "jobbole"]) execute(["scrapy", "crawl", "jobbole"])
# execute(["scrapy", "crawl", "zhihu"])
# execute(["scrapy", "crawl", "lagou"])
Loading…
Cancel
Save