# -*- coding: utf-8 -*-
# Reconstructed from git patch 234622c4aa6bc77a631f141f6390cb051cec1b07
# ("ADD file via upload"), which creates items.py.
"""Scrapy items for crawled Zhihu questions and answers.

Each item exposes ``get_insert_sql()``, which returns a parameterized
``INSERT ... ON DUPLICATE KEY UPDATE`` statement together with the tuple of
values to bind to its ``%s`` placeholders.
"""
from datetime import datetime

import scrapy

from settings import SQL_DATETIME_FORMAT


class ZhihuQuestionItem(scrapy.Item):
    """Fields collected for a single Zhihu question."""

    question_id = scrapy.Field()
    topics = scrapy.Field()
    question_url = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    create_time = scrapy.Field()
    update_time = scrapy.Field()
    answer_nums = scrapy.Field()
    comment_nums = scrapy.Field()
    watch_user_nums = scrapy.Field()
    click_nums = scrapy.Field()
    crawl_time = scrapy.Field()
    crawl_update_time = scrapy.Field()

    def get_insert_sql(self):
        """Build the upsert statement for the ``question`` table.

        Only the ten columns named in the statement are written;
        ``create_time``, ``update_time`` and ``crawl_update_time`` are
        declared on the item but not used here.

        Returns:
            A ``(sql, params)`` tuple where ``params`` holds the item values
            in the same order as the column list.

        Raises:
            KeyError: if any of the ten inserted fields has not been set.
        """
        # Same order as the column list inside the statement below.
        columns = (
            'question_id', 'topics', 'question_url', 'title', 'content',
            'answer_nums', 'comment_nums', 'watch_user_nums', 'click_nums',
            'crawl_time',
        )
        # The statement text (including its spacing) is kept exactly as-is.
        sql = (
            "insert into question(question_id, topics, question_url, title, content, "
            "answer_nums, comment_nums, watch_user_nums, click_nums, crawl_time)"
            "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
            "ON DUPLICATE KEY UPDATE content=VALUES(content), "
            "answer_nums=VALUES(answer_nums),"
            "comment_nums=VALUES(comment_nums), "
            "watch_user_nums=VALUES(watch_user_nums),"
            "click_nums=VALUES(click_nums)"
        )
        params = tuple(self[column] for column in columns)
        return sql, params


class ZhihuAnswerItem(scrapy.Item):
    """Fields collected for a single answer to a Zhihu question."""

    answer_id = scrapy.Field()
    question_id = scrapy.Field()
    answer_url = scrapy.Field()
    author_id = scrapy.Field()
    content = scrapy.Field()
    praise_nums = scrapy.Field()
    comment_nums = scrapy.Field()
    create_time = scrapy.Field()
    update_time = scrapy.Field()
    crawl_time = scrapy.Field()
    crawl_update_time = scrapy.Field()

    def get_insert_sql(self):
        """Build the upsert statement for the ``answer`` table.

        ``create_time`` and ``update_time`` are passed through
        ``datetime.fromtimestamp`` (local time) and formatted with
        ``SQL_DATETIME_FORMAT`` before being bound; ``crawl_update_time`` is
        declared on the item but not used here.

        Returns:
            A ``(sql, params)`` tuple where ``params`` holds the values in
            the same order as the column list.

        Raises:
            KeyError: if any field read by this method has not been set.
        """
        def _as_sql_datetime(epoch):
            # Convert a timestamp into the string form configured in settings.
            return datetime.fromtimestamp(epoch).strftime(SQL_DATETIME_FORMAT)

        # The statement text (including its spacing) is kept exactly as-is.
        sql = (
            "insert into answer(answer_id, question_id, answer_url, author_id, content, "
            "praise_nums, comment_nums, create_time, update_time, crawl_time) "
            "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
            "ON DUPLICATE KEY UPDATE content=VALUES(content), "
            "praise_nums=VALUES(praise_nums), "
            "comment_nums=VALUES(comment_nums), "
            "update_time=VALUES(update_time)"
        )

        # Timestamps are converted before the remaining fields are read.
        created = _as_sql_datetime(self['create_time'])
        updated = _as_sql_datetime(self['update_time'])

        # Columns taken verbatim from the item, in statement order.
        passthrough = (
            'answer_id', 'question_id', 'answer_url', 'author_id',
            'content', 'praise_nums', 'comment_nums',
        )
        params = tuple(self[column] for column in passthrough) + (
            created, updated, self['crawl_time'],
        )
        return sql, params