parent 618ebdb791
commit d737bee569
@@ -1,8 +1,8 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <module type="PYTHON_MODULE" version="4">
   <component name="NewModuleRootManager">
     <content url="file://$MODULE_DIR$" />
     <orderEntry type="jdk" jdkName="Python 3.9 (article_spider)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
 </module>
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,64 +1,86 @@
 # Define here the models for your scraped items
 #
 # See documentation in:
 # https://docs.scrapy.org/en/latest/topics/items.html
 import re
 import scrapy
 from scrapy.loader.processors import MapCompose, TakeFirst, Identity, Join
 from scrapy.loader import ItemLoader
 from ArticleSpider.models.es_types import ArticleType
 from w3lib.html import remove_tags
+from elasticsearch_dsl.connections import connections
+
+es = connections.create_connection(ArticleType._doc_type.using)


 class ArticlespiderItem(scrapy.Item):
     # define the fields for your item here like:
     # name = scrapy.Field()
     pass


+def gen_suggests(index, info_tuple):
+    # build the search-suggestion list for the ES completion suggester from the given strings
+    used_words = set()  # tokens already suggested, to deduplicate across fields
+    suggests = []
+    for text, weight in info_tuple:
+        if text:
+            # call the ES analyze API to tokenize the string
+            words = es.indices.analyze(index=index, analyzer='ik_max_word', params={'filter': ['lowercase']}, body=text)
+            analyzed_words = set([r['token'] for r in words['tokens'] if len(r['token']) > 1])
+            new_words = analyzed_words - used_words
+            used_words |= analyzed_words
+        else:
+            new_words = set()
+        if new_words:
+            suggests.append({'input': list(new_words), 'weight': weight})
+    return suggests
+
+
 def date_convert(value):
     match_re = re.match(r'.*?(\d+.*)', value)
     if match_re:
         return match_re.group(1)
     else:
         return '1970-07-01'


 class ArticleItemLoader(ItemLoader):
     default_output_processor = TakeFirst()


 class JobBoleArticleItem(scrapy.Item):
     title = scrapy.Field()  # title
     create_date = scrapy.Field(
         input_processor=MapCompose(date_convert)
     )  # publish date
     url = scrapy.Field()  # URL
     url_object_id = scrapy.Field()  # URL id
     front_image_url = scrapy.Field(
         output_processor=Identity()
     )  # cover image
     front_image_path = scrapy.Field()  # cover image path
     praise_nums = scrapy.Field()  # upvote count
     comment_nums = scrapy.Field()  # comment count
     fav_nums = scrapy.Field()  # favorite count
     tags = scrapy.Field(
         output_processor=Join(separator=',')
     )  # tags
     content = scrapy.Field()  # content

     def save_to_es(self):
         article = ArticleType()
         article.title = self['title']
         article.create_date = self['create_date']
         article.content = remove_tags(self['content'])
         article.front_image_url = self['front_image_url']
         if 'front_image_path' in self:
             article.front_image_path = self['front_image_path']
         article.praise_nums = self['praise_nums']
         article.fav_nums = self['fav_nums']
         article.comment_nums = self['comment_nums']
         article.url = self['url']
         article.tags = self['tags']
         article.meta.id = self['url_object_id']
+        article.suggest = gen_suggests(ArticleType._doc_type.index, ((article.title, 10), (article.tags, 7)))
         article.save()
         return
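Note: items.py imports ArticleType from ArticleSpider.models.es_types, a module this commit does not touch, so the diff never shows it. For context, here is a minimal sketch of what that module likely looks like in an elasticsearch-dsl 5.x project. The field names follow save_to_es() above; the host, the index and doc-type names, and the CustomAnalyzer workaround (Completion fields reject a full analyzer definition when the mapping is generated) are assumptions, not code from this repository.

    # es_types.py (sketch, assumed contents)
    from elasticsearch_dsl import DocType, Date, Completion, Keyword, Text, Integer
    from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer
    from elasticsearch_dsl.connections import connections

    connections.create_connection(hosts=['localhost'])  # assumed host


    class CustomAnalyzer(_CustomAnalyzer):
        # Return an empty analysis definition so the mapping refers to the
        # server-side ik_max_word analyzer instead of trying to redefine it.
        def get_analysis_definition(self):
            return {}


    ik_analyzer = CustomAnalyzer('ik_max_word', filter=['lowercase'])


    class ArticleType(DocType):
        suggest = Completion(analyzer=ik_analyzer)
        title = Text(analyzer='ik_max_word')
        create_date = Date()
        url = Keyword()
        url_object_id = Keyword()
        front_image_url = Keyword()
        front_image_path = Keyword()
        praise_nums = Integer()
        comment_nums = Integer()
        fav_nums = Integer()
        tags = Text(analyzer='ik_max_word')
        content = Text(analyzer='ik_max_word')

        class Meta:
            index = 'jobbole'      # assumed index name
            doc_type = 'article'   # assumed doc type

With a mapping like this, ArticleType._doc_type.using and ArticleType._doc_type.index resolve to the connection alias and index name that items.py relies on.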
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,12 +1,10 @@
 # -*- coding: utf-8 -*-
 __author__ = 'bobby'

 from scrapy.cmdline import execute

 import sys
 import os

 sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 execute(["scrapy", "crawl", "jobbole"])
-# execute(["scrapy", "crawl", "zhihu"])
-# execute(["scrapy", "crawl", "lagou"])
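Usage note: once main.py has run the jobbole crawl and save_to_es() has indexed the articles, the suggest field filled by gen_suggests() can drive a search-as-you-type box. A minimal sketch of such a completion-suggester query, assuming the jobbole index name from the sketch above and a local Elasticsearch; none of this is code from the repository.

    # suggest_demo.py (sketch, assumed surroundings)
    from elasticsearch import Elasticsearch

    es = Elasticsearch(['localhost'])  # assumed host


    def fetch_suggestions(prefix, size=10):
        # completion-suggester query against the suggest field
        # that save_to_es() populates via gen_suggests()
        body = {
            '_source': False,
            'suggest': {
                'title_suggest': {
                    'text': prefix,
                    'completion': {
                        'field': 'suggest',
                        'fuzzy': {'fuzziness': 2},
                        'size': size,
                    },
                }
            },
        }
        response = es.search(index='jobbole', body=body)
        # each option carries the matched suggestion text, scored by the
        # weight gen_suggests() assigned (10 for title tokens, 7 for tags)
        return [opt['text'] for opt in response['suggest']['title_suggest'][0]['options']]


    print(fetch_suggestions('pyth'))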