|
|
# NOTE(review): stray diff hunk header from a bad merge was left here; neutralized as a comment.
|
|
|
|
from scrapy.loader import ItemLoader
|
|
|
|
from scrapy.loader import ItemLoader
|
|
|
|
from ArticleSpider.models.es_types import ArticleType
|
|
|
|
from ArticleSpider.models.es_types import ArticleType
|
|
|
|
from w3lib.html import remove_tags
|
|
|
|
from w3lib.html import remove_tags
|
|
|
|
|
|
|
|
from elasticsearch_dsl.connections import connections
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Shared Elasticsearch client used by gen_suggests() below.
# The first positional argument of create_connection is the connection alias;
# presumably ArticleType._doc_type.using carries the host/alias configured on
# the DocType — TODO confirm against models/es_types.py.
es = connections.create_connection(ArticleType._doc_type.using)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ArticlespiderItem(scrapy.Item):
    """Default item scaffold generated by `scrapy startproject`.

    Fix: the bad merge duplicated the class header, the field comment and the
    `pass` statement; this restores the single clean definition.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def gen_suggests(index, info_tuple):
    """Build a completion-suggester payload from (text, weight) pairs.

    Args:
        index: name of the Elasticsearch index whose analyzer settings are
            used for tokenization.
        info_tuple: iterable of ``(text, weight)`` pairs; each text is run
            through the ``ik_max_word`` analyzer and its tokens become
            suggestion inputs at the given weight.

    Returns:
        A list of ``{'input': [...], 'weight': w}`` dicts suitable for an
        elasticsearch-dsl Completion field.
    """
    used_words = set()  # tokens already emitted, so later pairs don't repeat them
    suggests = []
    for text, weight in info_tuple:
        if text:
            # Ask ES to analyze the string (ik_max_word analyzer + lowercase filter).
            words = es.indices.analyze(index=index, analyzer='ik_max_word', params={'filter': ['lowercase']}, body=text)
            # Keep only tokens longer than one character.
            analyzed_words = {r['token'] for r in words['tokens'] if len(r['token']) > 1}
            new_words = analyzed_words - used_words
            # BUG FIX: the original never grew used_words, so the dedupe set
            # stayed empty and lower-weight pairs repeated earlier tokens.
            used_words.update(new_words)
        else:
            new_words = set()
        if new_words:
            suggests.append({'input': list(new_words), 'weight': weight})
    return suggests
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def date_convert(value):
|
|
|
|
def date_convert(value):
|
|
|
|
match_re = re.match('.*?(\d+.*)', value)
|
|
|
|
match_re = re.match('.*?(\d+.*)', value)
|
|
|
|
if match_re:
|
|
|
|
if match_re:
|
|
|
# NOTE(review): stray diff hunk header from a bad merge was left here; neutralized as a comment.
|
|
|
|
article.url = self['url']
|
|
|
|
article.url = self['url']
|
|
|
|
article.tags = self['tags']
|
|
|
|
article.tags = self['tags']
|
|
|
|
article.meta.id = self['url_object_id']
|
|
|
|
article.meta.id = self['url_object_id']
|
|
|
|
|
|
|
|
article.suggest = gen_suggests(ArticleType._doc_type.index, ((article.title, 10), (article.tags, 7)))
|
|
|
|
article.save()
|
|
|
|
article.save()
|
|
|
|
return
|
|
|
|
return
|