diff --git a/ArticleSpider/ArticleSpider/items.py b/ArticleSpider/ArticleSpider/items.py
index 5076fd8..adfce61 100644
--- a/ArticleSpider/ArticleSpider/items.py
+++ b/ArticleSpider/ArticleSpider/items.py
@@ -66,6 +66,7 @@ class JobBoleArticleItem(scrapy.Item):
         output_processor=Join(separator=',')
     )  # tags
     content = scrapy.Field()  # content
+    where_from = scrapy.Field()  # article source
 
     def save_to_es(self):
         article = ArticleType()
@@ -80,6 +81,7 @@ class JobBoleArticleItem(scrapy.Item):
         article.comment_nums = self['comment_nums']
         article.url = self['url']
         article.tags = self['tags']
+        article.where_from = self['where_from']
         article.meta.id = self['url_object_id']
         article.suggest = gen_suggests(ArticleType._doc_type.index, ((article.title, 10), (article.tags, 7)))
         article.save()
diff --git a/ArticleSpider/ArticleSpider/models/es_types.py b/ArticleSpider/ArticleSpider/models/es_types.py
index 5fd480d..1eaa740 100644
--- a/ArticleSpider/ArticleSpider/models/es_types.py
+++ b/ArticleSpider/ArticleSpider/models/es_types.py
@@ -33,6 +33,7 @@ class ArticleType(DocType):
     fav_nums = Integer()
     tags = Text(analyzer="ik_max_word")
     content = Text(analyzer="ik_max_word")
+    where_from = Text(analyzer="ik_max_word")
 
     class Meta:
         index = "jobbole"
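The es_types.py hunk above adds `where_from` to the Elasticsearch mapping. Adding a field to an existing index is an additive mapping change, but it has to be pushed to the cluster before documents carrying the field are saved. A minimal sketch of doing that with elasticsearch-dsl, assuming a node at localhost:9200 with the ik analysis plugin installed; the host and sample values are illustrative, not taken from this repo:

```python
from elasticsearch_dsl.connections import connections

from ArticleSpider.models.es_types import ArticleType

# Register the default connection used by DocType.save()/search().
connections.create_connection(hosts=["localhost"])

# init() creates the "jobbole" index if missing, or updates its mapping;
# existing fields are untouched and the new where_from Text field is added.
ArticleType.init()

# Index a sample document that exercises the new field.
article = ArticleType()
article.title = "Sample title"
article.where_from = "cnblogs"
article.meta.id = "sample-id"
article.save()

# Full-text query against the new field (analyzed by ik_max_word).
response = ArticleType.search().query("match", where_from="cnblogs").execute()
for hit in response:
    print(hit.title, hit.where_from)
```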
params.append(item.get("tags", "")) params.append(item.get("content", "")) params.append(item.get("create_date", "1970-07-01")) + params.append(item.get("where_from", "")) self.cursor.execute(insert_sql, tuple(params)) self.conn.commit() return item @@ -83,8 +84,8 @@ class MysqlTwistedPipline(object): def do_insert(self, cursor, item): # 执行具体的插入 insert_sql = """ - insert into jobbole_article(title, url ,url_object_id, front_image_url, front_image_path, parise_nums, comment_nums, fav_nums, tags, content, create_date) - values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE parise_nums=VALUES(parise_nums) + insert into jobbole_article(title, url ,url_object_id, front_image_url, front_image_path, parise_nums, comment_nums, fav_nums, tags, content, create_date, where_from) + values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE parise_nums=VALUES(parise_nums) """ params = list() params.append(item.get("title", "")) @@ -99,6 +100,7 @@ class MysqlTwistedPipline(object): params.append(item.get("tags", "")) params.append(item.get("content", "")) params.append(item.get("create_date", "1970-07-01")) + params.append(item.get("where_from", "")) # 根据不同的item 构建不同的sql语句并插入到mysql中 cursor.execute(insert_sql, tuple(params)) diff --git a/ArticleSpider/ArticleSpider/settings.py b/ArticleSpider/ArticleSpider/settings.py index abdc2b1..8585f5e 100644 --- a/ArticleSpider/ArticleSpider/settings.py +++ b/ArticleSpider/ArticleSpider/settings.py @@ -74,8 +74,8 @@ ITEM_PIPELINES = { # 'ArticleSpider.pipelines.JsonWithEncodingPipeline': 2, # 'ArticleSpider.pipelines.JsonExporterPipeline': 3, # 'ArticleSpider.pipelines.MysqlPipeline': 4, - # 'ArticleSpider.pipelines.MysqlTwistedPipline': 5, - 'ArticleSpider.pipelines.ElasticsearchPipeline': 6, + 'ArticleSpider.pipelines.MysqlTwistedPipline': 5, + # 'ArticleSpider.pipelines.ElasticsearchPipeline': 6, 'ArticleSpider.pipelines.ArticlespiderPipeline': 300, } diff --git a/ArticleSpider/ArticleSpider/spiders/__pycache__/jobbole.cpython-39.pyc b/ArticleSpider/ArticleSpider/spiders/__pycache__/jobbole.cpython-39.pyc index 639c1c6..514c62d 100644 Binary files a/ArticleSpider/ArticleSpider/spiders/__pycache__/jobbole.cpython-39.pyc and b/ArticleSpider/ArticleSpider/spiders/__pycache__/jobbole.cpython-39.pyc differ diff --git a/ArticleSpider/ArticleSpider/spiders/jobbole.py b/ArticleSpider/ArticleSpider/spiders/jobbole.py index 56c5adb..3912247 100644 --- a/ArticleSpider/ArticleSpider/spiders/jobbole.py +++ b/ArticleSpider/ArticleSpider/spiders/jobbole.py @@ -54,7 +54,7 @@ class JobboleSpider(scrapy.Spider): def parse(self, response): # 1.获取新闻列表页中的新闻url并交给scrapy进行下载后调用相应的解析方 # 提取文章链接,extract_first()提取第一个值 - post_nodes = response.css('#news_list .news_block')[:100] + post_nodes = response.css('#news_list .news_block')[:1] for post_node in post_nodes: image_url = "https:" + post_node.css('.entry_summary a img::attr(src)').extract_first("") post_url = post_node.css('h2 a::attr(href)').extract_first("") @@ -73,6 +73,7 @@ class JobboleSpider(scrapy.Spider): item_loader.add_css('content', '#news_content') item_loader.add_css('tags', '.news_tags a::text') item_loader.add_css('create_date', '#news_info .time::text') + item_loader.add_xpath('where_from', '//*[@id="link_source2"]/text()') item_loader.add_value('url', response.url) item_loader.add_value('front_image_url', response.meta.get('front_image_url', '')) yield Request(url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),