parent
3f1d6ef4ef
commit
234622c4aa
@ -0,0 +1,77 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import scrapy
|
||||
from datetime import datetime
|
||||
from settings import SQL_DATETIME_FORMAT
|
||||
|
||||
|
||||
class ZhihuQuestionItem(scrapy.Item):
    """Scrapy item holding the fields scraped for a single Zhihu question."""

    # Identity and descriptive fields.
    question_id = scrapy.Field()
    topics = scrapy.Field()
    question_url = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    # Declared but not written by get_insert_sql below.
    create_time = scrapy.Field()
    update_time = scrapy.Field()
    # Counters.
    answer_nums = scrapy.Field()
    comment_nums = scrapy.Field()
    watch_user_nums = scrapy.Field()
    click_nums = scrapy.Field()
    # Crawl bookkeeping; crawl_update_time is not written by get_insert_sql.
    crawl_time = scrapy.Field()
    crawl_update_time = scrapy.Field()

    def get_insert_sql(self):
        """Build the upsert statement for the ``question`` table.

        Returns:
            A ``(sql, params)`` pair: the parameterized statement and a
            10-tuple of values in the same order as the column list.

        Raises:
            KeyError: if any of the referenced fields has not been set.
        """
        # On a duplicate key only content and the four counters are refreshed;
        # the remaining columns keep their stored values.
        statement = (
            "insert into question(question_id, topics, question_url, title, "
            "content, answer_nums, comment_nums, watch_user_nums, click_nums, "
            "crawl_time)VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
            "ON DUPLICATE KEY UPDATE content=VALUES(content), "
            "answer_nums=VALUES(answer_nums),comment_nums=VALUES(comment_nums), "
            "watch_user_nums=VALUES(watch_user_nums),click_nums=VALUES(click_nums)"
        )

        # Column order must match the placeholder order in the statement.
        columns = (
            'question_id', 'topics', 'question_url', 'title', 'content',
            'answer_nums', 'comment_nums', 'watch_user_nums', 'click_nums',
            'crawl_time',
        )
        values = tuple(self[column] for column in columns)
        return statement, values
|
||||
|
||||
|
||||
class ZhihuAnswerItem(scrapy.Item):
    """Scrapy item holding the fields scraped for a single Zhihu answer."""

    # Identity and relationship fields.
    answer_id = scrapy.Field()
    question_id = scrapy.Field()
    answer_url = scrapy.Field()
    author_id = scrapy.Field()
    content = scrapy.Field()
    # Counters.
    praise_nums = scrapy.Field()
    comment_nums = scrapy.Field()
    # Passed to datetime.fromtimestamp() in get_insert_sql, so these are
    # expected to be POSIX timestamps.
    create_time = scrapy.Field()
    update_time = scrapy.Field()
    # Crawl bookkeeping; crawl_update_time is not written by get_insert_sql.
    crawl_time = scrapy.Field()
    crawl_update_time = scrapy.Field()

    def get_insert_sql(self):
        """Build the upsert statement for the ``answer`` table.

        ``create_time`` and ``update_time`` are converted with
        ``datetime.fromtimestamp`` and rendered using ``SQL_DATETIME_FORMAT``
        before being bound; all other fields are passed through unchanged.

        Returns:
            A ``(sql, params)`` pair: the parameterized statement and a
            10-tuple of values in the same order as the column list.

        Raises:
            KeyError: if any of the referenced fields has not been set.
        """
        def to_sql_datetime(epoch):
            # Naive local-time conversion, as datetime.fromtimestamp does
            # when no tz is given.
            return datetime.fromtimestamp(epoch).strftime(SQL_DATETIME_FORMAT)

        # On a duplicate key only content, the two counters and update_time
        # are refreshed.
        statement = (
            "insert into answer(answer_id, question_id, answer_url, author_id, "
            "content, praise_nums, comment_nums, create_time, update_time, "
            "crawl_time) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
            "ON DUPLICATE KEY UPDATE content=VALUES(content), "
            "praise_nums=VALUES(praise_nums), comment_nums=VALUES(comment_nums), "
            "update_time=VALUES(update_time)"
        )

        # Timestamps are converted before any other field is read.
        created_at = to_sql_datetime(self['create_time'])
        updated_at = to_sql_datetime(self['update_time'])

        passthrough = (
            'answer_id', 'question_id', 'answer_url', 'author_id',
            'content', 'praise_nums', 'comment_nums',
        )
        values = tuple(self[column] for column in passthrough)
        values += (created_at, updated_at, self['crawl_time'])
        return statement, values
|
Loading…
Reference in new issue