#!/usr/bin/env python
# encoding: utf-8
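"""Scrapy spider for weibo.cn: reads pending uids from the scrapydapi_target MySQL
table, then crawls each user's profile information and tweets (with optional,
currently commented-out, follow/fan and comment crawling)."""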
import re
from lxml import etree
from scrapy import Spider
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.utils.project import get_project_settings
from bot.items import TweetsItem, InformationItem, RelationshipsItem, CommentItem
from bot.spiders.utils import time_fix
from datetime import datetime, timedelta
class WeiboSpider(Spider):
    name = "weibo_spider"
    base_url = "https://weibo.cn"

    def start_requests(self):
        import pymysql.cursors

        # Connect to the database
        connect = pymysql.connect(
            host='localhost',          # database host
            port=3306,                 # database port
            db='WeiboAnalysisSystem',  # database name
            user='root',               # database user
            passwd='1234',             # database password
            charset='utf8mb4',         # character set
            use_unicode=True)
        # Run queries through a cursor
        cursor = connect.cursor()
        cursor.execute("""SELECT uid FROM scrapydapi_target WHERE isScrapy=0""")
        start_uids = [row[0] for row in cursor.fetchall()]
        print('uid:::', start_uids)
        for uid in start_uids:
            # Mark the uid as scheduled; use a parameterized query instead of string formatting
            cursor.execute("""UPDATE scrapydapi_target SET isScrapy=1 WHERE uid=%s""", (uid,))
            connect.commit()
            yield Request(url="https://weibo.cn/%s/info" % uid, callback=self.parse_information)
        cursor.close()
        connect.close()
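    # A minimal sketch of the table read above, assuming only the two columns the
    # queries actually touch (the real schema may differ):
    #
    #   CREATE TABLE scrapydapi_target (
    #       uid      VARCHAR(20) NOT NULL,  -- Weibo user id to crawl
    #       isScrapy TINYINT DEFAULT 0      -- 0 = pending, 1 = already scheduled
    #   );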
    def parse_information(self, response):
        """Crawl a user's profile information."""
        information_item = InformationItem()
        information_item['crawl_time'] = datetime.now()
        selector = Selector(response)
        information_item['_id'] = re.findall(r'(\d+)/info', response.url)[0]
        image = selector.xpath('body/div[@class="c"]//img/@src').extract()
        # Join every text() node of the profile block so the fields can be pulled out with regexes
        text1 = ";".join(selector.xpath('body/div[@class="c"]//text()').extract())
        nick_name = re.findall('昵称;?[:]?(.*?);', text1)
        gender = re.findall('性别;?[:]?(.*?);', text1)
        place = re.findall('地区;?[:]?(.*?);', text1)
        briefIntroduction = re.findall('简介;?[:]?(.*?);', text1)
        birthday = re.findall('生日;?[:]?(.*?);', text1)
        sex_orientation = re.findall('性取向;?[:]?(.*?);', text1)
        sentiment = re.findall('感情状况;?[:]?(.*?);', text1)
        vip_level = re.findall('会员等级;?[:]?(.*?);', text1)
        authentication = re.findall('认证;?[:]?(.*?);', text1)
        labels = re.findall('标签;?[:]?(.*?)更多>>', text1)
        if image and image[0]:
            information_item["Image"] = image[0]
        if nick_name and nick_name[0]:
            information_item["nick_name"] = nick_name[0].replace(u"\xa0", "")
        if gender and gender[0]:
            information_item["gender"] = gender[0].replace(u"\xa0", "")
        if place and place[0]:
            place = place[0].replace(u"\xa0", "").split(" ")
            information_item["province"] = place[0]
            if len(place) > 1:
                information_item["city"] = place[1]
        if briefIntroduction and briefIntroduction[0]:
            information_item["brief_introduction"] = briefIntroduction[0].replace(u"\xa0", "")
        if birthday and birthday[0]:
            try:
                # datetime is imported from the datetime module, so call strptime on it directly
                birthday = datetime.strptime(birthday[0], "%Y-%m-%d")
                information_item['birthday'] = birthday - timedelta(hours=8)
            except Exception:
                information_item['constellation'] = birthday[0]  # may be a constellation rather than a date
        # Stored values are kept in Chinese: 同性恋 = homosexual, 异性恋 = heterosexual
        if sex_orientation and sex_orientation[0] and gender and gender[0]:
            if sex_orientation[0].replace(u"\xa0", "") == gender[0].replace(u"\xa0", ""):
                information_item["sex_orientation"] = "同性恋"
            else:
                information_item["sex_orientation"] = "异性恋"
        if sentiment and sentiment[0]:
            information_item["sentiment"] = sentiment[0].replace(u"\xa0", "")
        if vip_level and vip_level[0]:
            information_item["vip_level"] = vip_level[0].replace(u"\xa0", "")
        if authentication and authentication[0]:
            information_item["authentication"] = authentication[0].replace(u"\xa0", "")
        if labels and labels[0]:
            information_item["labels"] = labels[0].replace(u"\xa0", ",").replace(';', '').strip(',')
        request_meta = response.meta
        request_meta['item'] = information_item
        yield Request(self.base_url + '/u/{}'.format(information_item['_id']),
                      callback=self.parse_further_information,
                      meta=request_meta, dont_filter=True, priority=1)
    def parse_further_information(self, response):
        text = response.text
        information_item = response.meta['item']
        tweets_num = re.findall(r'微博\[(\d+)\]', text)
        if tweets_num:
            information_item['tweets_num'] = int(tweets_num[0])
        follows_num = re.findall(r'关注\[(\d+)\]', text)
        if follows_num:
            information_item['follows_num'] = int(follows_num[0])
        fans_num = re.findall(r'粉丝\[(\d+)\]', text)
        if fans_num:
            information_item['fans_num'] = int(fans_num[0])
        yield information_item
        # Crawl this user's tweets
        yield Request(url=self.base_url + '/{}/profile?page=1'.format(information_item['_id']),
                      callback=self.parse_tweet,
                      priority=1)
        # # Crawl the follow list
        # yield Request(url=self.base_url + '/{}/follow?page=1'.format(information_item['_id']),
        #               callback=self.parse_follow,
        #               dont_filter=True)
        # # Crawl the fans list
        # yield Request(url=self.base_url + '/{}/fans?page=1'.format(information_item['_id']),
        #               callback=self.parse_fans,
        #               dont_filter=True)
    def parse_tweet(self, response):
        if response.url.endswith('page=1'):
            # On page 1, schedule requests for all remaining pages at once
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = int(all_page.group(1))
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                    yield Request(page_url, self.parse_tweet, dont_filter=True, meta=response.meta)
        # Parse the tweets on the current page
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['crawl_time'] = datetime.now()
                tweet_repost_url = tweet_node.xpath('.//a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)', tweet_repost_url)
                tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(user_tweet_id.group(2),
                                                                           user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2), user_tweet_id.group(1))
                create_time_info = tweet_node.xpath('.//span[@class="ct"]/text()')[-1]
                if "来自" in create_time_info:
                    tweet_item['created_at'] = time_fix(create_time_info.split('来自')[0].strip())
                else:
                    tweet_item['created_at'] = time_fix(create_time_info.strip())
                like_num = tweet_node.xpath('.//a[contains(text(),"赞[")]/text()')[-1]
                tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
                repost_num = tweet_node.xpath('.//a[contains(text(),"转发[")]/text()')[-1]
                tweet_item['repost_num'] = int(re.search(r'\d+', repost_num).group())
                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[-1]
                tweet_item['comment_num'] = int(re.search(r'\d+', comment_num).group())
                tweet_content_node = tweet_node.xpath('.//span[@class="ctt"]')[0]
                # Check whether the tweet is truncated with a "全文" (full text) link
                all_content_link = tweet_content_node.xpath('.//a[text()="全文"]')
                if all_content_link:
                    all_content_url = self.base_url + all_content_link[0].xpath('./@href')[0]
                    yield Request(all_content_url, callback=self.parse_all_content, meta={'item': tweet_item},
                                  priority=1)
                else:
                    all_content = tweet_content_node.xpath('string(.)').replace('\u200b', '').strip()
                    tweet_item['content'] = all_content[1:]
                    yield tweet_item
                # Crawl this tweet's comments
                # comment_url = self.base_url + '/comment/' + tweet_item['weibo_url'].split('/')[-1] + '?page=1'
                # yield Request(url=comment_url, callback=self.parse_comment, meta={'weibo_url': tweet_item['weibo_url']})
            except Exception as e:
                self.logger.error(e)
    def parse_all_content(self, response):
        # The tweet was truncated with a "全文" link, so fetch the full text here
        tree_node = etree.HTML(response.body)
        tweet_item = response.meta['item']
        content_node = tree_node.xpath('//div[@id="M_"]//span[@class="ctt"]')[0]
        all_content = content_node.xpath('string(.)').replace('\u200b', '').strip()
        tweet_item['content'] = all_content[1:]
        yield tweet_item
    def parse_follow(self, response):
        """Crawl the follow list."""
        # On page 1, schedule requests for all remaining pages at once
        if response.url.endswith('page=1'):
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = int(all_page.group(1))
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                    yield Request(page_url, self.parse_follow, dont_filter=True, meta=response.meta)
        selector = Selector(response)
        urls = selector.xpath('//a[text()="关注他" or text()="关注她" or text()="取消关注"]/@href').extract()
        uids = re.findall(r'uid=(\d+)', ";".join(urls), re.S)
        ID = re.findall(r'(\d+)/follow', response.url)[0]
        for uid in uids:
            relationships_item = RelationshipsItem()
            relationships_item['crawl_time'] = datetime.now()
            relationships_item["fan_id"] = ID
            relationships_item["followed_id"] = uid
            relationships_item["_id"] = ID + '-' + uid
            yield relationships_item
    def parse_fans(self, response):
        """Crawl the fans list."""
        # On page 1, schedule requests for all remaining pages at once
        if response.url.endswith('page=1'):
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = int(all_page.group(1))
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                    yield Request(page_url, self.parse_fans, dont_filter=True, meta=response.meta)
        selector = Selector(response)
        urls = selector.xpath('//a[text()="关注他" or text()="关注她" or text()="移除"]/@href').extract()
        uids = re.findall(r'uid=(\d+)', ";".join(urls), re.S)
        ID = re.findall(r'(\d+)/fans', response.url)[0]
        for uid in uids:
            relationships_item = RelationshipsItem()
            relationships_item['crawl_time'] = datetime.now()
            relationships_item["fan_id"] = uid
            relationships_item["followed_id"] = ID
            relationships_item["_id"] = uid + '-' + ID
            yield relationships_item
    def parse_comment(self, response):
        # On page 1, schedule requests for all remaining pages at once
        if response.url.endswith('page=1'):
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = int(all_page.group(1))
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                    yield Request(page_url, self.parse_comment, dont_filter=True, meta=response.meta)
        selector = Selector(response)
        comment_nodes = selector.xpath('//div[@class="c" and contains(@id,"C_")]')
        for comment_node in comment_nodes:
            comment_user_url = comment_node.xpath('.//a[contains(@href,"/u/")]/@href').extract_first()
            if not comment_user_url:
                continue
            comment_item = CommentItem()
            comment_item['crawl_time'] = datetime.now()
            comment_item['weibo_url'] = response.meta['weibo_url']
            comment_item['comment_user_id'] = re.search(r'/u/(\d+)', comment_user_url).group(1)
            comment_item['content'] = comment_node.xpath('.//span[@class="ctt"]').xpath('string(.)').extract_first()
            comment_item['_id'] = comment_node.xpath('./@id').extract_first()
            created_at = comment_node.xpath('.//span[@class="ct"]/text()').extract_first()
            comment_item['created_at'] = time_fix(created_at.split('\xa0')[0])
            yield comment_item
if __name__ == "__main__":
    process = CrawlerProcess(get_project_settings())
    process.crawl('weibo_spider')
    process.start()
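# The __main__ block above starts the spider in-process via CrawlerProcess; it assumes
# the script is run from inside the Scrapy project (where scrapy.cfg lives) so that
# get_project_settings() can load the project settings. The equivalent command-line
# invocation would be:
#
#   scrapy crawl weibo_spider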