main
qweasdzxc227 6 months ago
parent 618ebdb791
commit d737bee569

@ -1,8 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.9 (article_spider)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.9 (article_spider)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

@ -1,64 +1,86 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import re
import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst, Identity, Join
from scrapy.loader import ItemLoader
from ArticleSpider.models.es_types import ArticleType
from w3lib.html import remove_tags
class ArticlespiderItem(scrapy.Item):
    """Placeholder item generated by `scrapy startproject`; currently unused."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
def date_convert(value):
    """Normalize a scraped date string to its date portion.

    Skips any non-digit prefix (bullets, labels, whitespace) and returns
    everything from the first digit onward; falls back to a sentinel date
    when the string contains no digit at all.

    Args:
        value: Raw date text scraped from the page.

    Returns:
        The substring starting at the first digit, or '1970-07-01' when
        no digit is present.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    # (SyntaxWarning on modern Python).
    match_re = re.match(r'.*?(\d+.*)', value)
    if match_re:
        return match_re.group(1)
    # Sentinel default so downstream date handling never receives None.
    return '1970-07-01'
class ArticleItemLoader(ItemLoader):
    """ItemLoader whose fields default to keeping only the first extracted value."""
    # Collapse each field's list of extracted values to its first element,
    # unless a field overrides the output processor.
    default_output_processor = TakeFirst()
class JobBoleArticleItem(scrapy.Item):
    """Item holding one scraped JobBole article, with Elasticsearch persistence."""
    title = scrapy.Field()  # article title
    create_date = scrapy.Field(
        input_processor=MapCompose(date_convert)
    )  # publish date (normalized by date_convert)
    url = scrapy.Field()  # article URL
    url_object_id = scrapy.Field()  # id derived from the URL; used as the ES doc id
    front_image_url = scrapy.Field(
        output_processor=Identity()
    )  # cover-image URLs (Identity keeps the list intact)
    front_image_path = scrapy.Field()  # local path of the downloaded cover image
    praise_nums = scrapy.Field()  # up-vote count
    comment_nums = scrapy.Field()  # comment count
    fav_nums = scrapy.Field()  # favourite/bookmark count
    tags = scrapy.Field(
        output_processor=Join(separator=',')
    )  # tags joined into one comma-separated string
    content = scrapy.Field()  # article body (HTML)

    def save_to_es(self):
        """Copy this item's fields onto an ArticleType document and save it.

        All fields except front_image_path are assumed present; a missing
        one raises KeyError. Returns None.
        """
        article = ArticleType()
        article.title = self['title']
        article.create_date = self['create_date']
        article.content = remove_tags(self['content'])  # strip HTML before indexing
        article.front_image_url = self['front_image_url']
        if 'front_image_path' in self:
            article.front_image_path = self['front_image_path']
        article.praise_nums = self['praise_nums']
        article.fav_nums = self['fav_nums']
        article.comment_nums = self['comment_nums']
        article.url = self['url']
        article.tags = self['tags']
        # Using the URL-derived id means re-crawling updates the doc in place.
        article.meta.id = self['url_object_id']
        article.save()
        return
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import re
import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst, Identity, Join
from scrapy.loader import ItemLoader
from ArticleSpider.models.es_types import ArticleType
from w3lib.html import remove_tags
from elasticsearch_dsl.connections import connections
es = connections.create_connection(ArticleType._doc_type.using)
class ArticlespiderItem(scrapy.Item):
    """Placeholder item generated by `scrapy startproject`; currently unused."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
def gen_suggests(index, info_tuple):
    """Build an Elasticsearch completion-suggest payload from weighted texts.

    Each (text, weight) pair is tokenized via the index's ik_max_word
    analyzer; tokens longer than one character become suggestion inputs
    at the given weight. A token contributed by an earlier (higher-weight)
    text is not repeated for later texts.

    Args:
        index: Name of the ES index whose analyzer is used.
        info_tuple: Iterable of (text, weight) pairs, highest weight first.

    Returns:
        A list of {'input': [...], 'weight': w} dicts for the suggest field.
    """
    used_words = set()  # tokens already emitted, for de-duplication
    suggests = []
    for text, weight in info_tuple:
        if text:
            # Ask ES to analyze the string with the ik_max_word analyzer.
            words = es.indices.analyze(index=index, analyzer='ik_max_word', params={'filter': ['lowercase']}, body=text)
            analyzed_words = set([r['token'] for r in words['tokens'] if len(r['token']) > 1])
            new_words = analyzed_words - used_words
        else:
            new_words = set()
        if new_words:
            # BUG FIX: record the tokens so later, lower-weight texts do not
            # re-emit them; previously used_words was never updated, making
            # the de-duplication a no-op across pairs.
            used_words.update(new_words)
            suggests.append({'input': list(new_words), 'weight': weight})
    return suggests
def date_convert(value):
    """Normalize a scraped date string to its date portion.

    Skips any non-digit prefix (bullets, labels, whitespace) and returns
    everything from the first digit onward; falls back to a sentinel date
    when the string contains no digit at all.

    Args:
        value: Raw date text scraped from the page.

    Returns:
        The substring starting at the first digit, or '1970-07-01' when
        no digit is present.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    # (SyntaxWarning on modern Python).
    match_re = re.match(r'.*?(\d+.*)', value)
    if match_re:
        return match_re.group(1)
    # Sentinel default so downstream date handling never receives None.
    return '1970-07-01'
class ArticleItemLoader(ItemLoader):
    """ItemLoader whose fields default to keeping only the first extracted value."""
    # Collapse each field's list of extracted values to its first element,
    # unless a field overrides the output processor.
    default_output_processor = TakeFirst()
class JobBoleArticleItem(scrapy.Item):
    """Item holding one scraped JobBole article, with Elasticsearch persistence."""
    title = scrapy.Field()  # article title
    create_date = scrapy.Field(
        input_processor=MapCompose(date_convert)
    )  # publish date (normalized by date_convert)
    url = scrapy.Field()  # article URL
    url_object_id = scrapy.Field()  # id derived from the URL; used as the ES doc id
    front_image_url = scrapy.Field(
        output_processor=Identity()
    )  # cover-image URLs (Identity keeps the list intact)
    front_image_path = scrapy.Field()  # local path of the downloaded cover image
    praise_nums = scrapy.Field()  # up-vote count
    comment_nums = scrapy.Field()  # comment count
    fav_nums = scrapy.Field()  # favourite/bookmark count
    tags = scrapy.Field(
        output_processor=Join(separator=',')
    )  # tags joined into one comma-separated string
    content = scrapy.Field()  # article body (HTML)

    def save_to_es(self):
        """Copy this item's fields onto an ArticleType document and save it.

        Builds the completion-suggest payload from title and tags before
        saving. All fields except front_image_path are assumed present;
        a missing one raises KeyError. Returns None.
        """
        article = ArticleType()
        article.title = self['title']
        article.create_date = self['create_date']
        article.content = remove_tags(self['content'])  # strip HTML before indexing
        article.front_image_url = self['front_image_url']
        if 'front_image_path' in self:
            article.front_image_path = self['front_image_path']
        article.praise_nums = self['praise_nums']
        article.fav_nums = self['fav_nums']
        article.comment_nums = self['comment_nums']
        article.url = self['url']
        article.tags = self['tags']
        # Using the URL-derived id means re-crawling updates the doc in place.
        article.meta.id = self['url_object_id']
        # Title tokens get the higher suggest weight (10) than tags (7).
        article.suggest = gen_suggests(ArticleType._doc_type.index, ((article.title, 10), (article.tags, 7)))
        article.save()
        return

@ -1,12 +1,10 @@
# -*- coding: utf-8 -*-
__author__ = 'bobby'
from scrapy.cmdline import execute
import sys
import os

# Put the project directory on sys.path so the spider package is importable
# when this launcher is run from an IDE.
project_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(project_dir)

# Launch the jobbole spider; swap in one of the commented lines to run
# a different spider instead.
execute(["scrapy", "crawl", "jobbole"])
# execute(["scrapy", "crawl", "zhihu"])
# execute(["scrapy", "crawl", "lagou"])
# -*- coding: utf-8 -*-
__author__ = 'bobby'
from scrapy.cmdline import execute
import sys
import os

# Put the project directory on sys.path so the spider package is importable
# when this launcher is run from an IDE.
project_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(project_dir)

# Equivalent to running `scrapy crawl jobbole` from the project root.
execute(["scrapy", "crawl", "jobbole"])

Loading…
Cancel
Save