|
|
|
|
@ -1,10 +1,8 @@
|
|
|
|
|
import json
|
|
|
|
|
import random
|
|
|
|
|
import threading
|
|
|
|
|
import urllib.parse as up
|
|
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
from loguru import logger
|
|
|
|
|
|
|
|
|
|
from Weibo_Spider import settings
|
|
|
|
|
@ -15,20 +13,12 @@ class Crawler(threading.Thread):
|
|
|
|
|
def __init__(self, executable_path=settings.MSEDGEDRIVER_PATH) -> None:
|
|
|
|
|
super().__init__()
|
|
|
|
|
self.cookies = None
|
|
|
|
|
self.base_url = r'https://weibo.com/ajax/feed/hottimeline?'
|
|
|
|
|
self.headers = {0: {
|
|
|
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39'},
|
|
|
|
|
1: {
|
|
|
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0'},
|
|
|
|
|
2: {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'},
|
|
|
|
|
3: {
|
|
|
|
|
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}}
|
|
|
|
|
self.base_url = r'https://m.weibo.cn/api/container/getIndex?containerid=102803&openApp=0'
|
|
|
|
|
self.headers = {'User-Agent': 'Mozilla/5.0 (Linux; U; Android 4.0.2; en-us; Galaxy Nexus Build/ICL53F) '
|
|
|
|
|
'AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'}
|
|
|
|
|
self.excutable_path = executable_path
|
|
|
|
|
self.loginer = login.Loginer(str(settings.CONFIG_DIR)+r'config.json')
|
|
|
|
|
|
|
|
|
|
def get_rand_headers(self):
|
|
|
|
|
return self.headers[random.randint(0, 3)]
|
|
|
|
|
|
|
|
|
|
def get_cookies(self):
|
|
|
|
|
self.cookies = self.loginer.run()
|
|
|
|
|
|
|
|
|
|
@ -41,22 +31,12 @@ class Crawler(threading.Thread):
|
|
|
|
|
cookies[cookie['name']] = cookie['value']
|
|
|
|
|
self.cookies = cookies
|
|
|
|
|
|
|
|
|
|
def get_hotline_json(self, max_id=0, since_id=0, count=15):
|
|
|
|
|
params = {
|
|
|
|
|
'since_id': since_id, # 估测为热门微博列表要抓取的起始序号
|
|
|
|
|
'refresh': 0,
|
|
|
|
|
'group_id': 102803,
|
|
|
|
|
'containerid': 102803,
|
|
|
|
|
'extparam': 'discover|new_feed',
|
|
|
|
|
'max_id': max_id, # 估测默认热门列表的第一页
|
|
|
|
|
'count': count # 估测一次访问抓取的热门微博数, 一页最多100, 超过默认10
|
|
|
|
|
}
|
|
|
|
|
url = self.base_url + up.urlencode(params)
|
|
|
|
|
logger.debug(url)
|
|
|
|
|
|
|
|
|
|
def get_hotline_json(self):
|
|
|
|
|
try:
|
|
|
|
|
res = requests.get(url, headers=self.get_rand_headers())
|
|
|
|
|
res = requests.get(self.base_url, headers=self.headers)
|
|
|
|
|
if res.status_code == 200:
|
|
|
|
|
with open('mweibo.html', encoding='utf-8', mode='w') as f:
|
|
|
|
|
f.write(res.text)
|
|
|
|
|
res = res.json()
|
|
|
|
|
return res
|
|
|
|
|
except Exception as e:
|
|
|
|
|
@ -64,7 +44,7 @@ class Crawler(threading.Thread):
|
|
|
|
|
|
|
|
|
|
def parse_hotline_json(self, res):
|
|
|
|
|
collections = []
|
|
|
|
|
statuses = res['statuses'][:10]
|
|
|
|
|
statuses = res['data']['cards'][:10]
|
|
|
|
|
parsers = []
|
|
|
|
|
|
|
|
|
|
for status in statuses:
|
|
|
|
|
@ -97,24 +77,17 @@ class StatusParser(threading.Thread):
|
|
|
|
|
def __init__(self, cookies):
|
|
|
|
|
super().__init__()
|
|
|
|
|
self.cookies = cookies
|
|
|
|
|
self.base_comment_url = r'https://weibo.com/ajax/statuses/buildComments?'
|
|
|
|
|
self.base_comment_url = r'https://m.weibo.cn/comments/hotflow?'
|
|
|
|
|
self.base_longtext_url = r'https://weibo.com/ajax/statuses/longtext?'
|
|
|
|
|
self.headers = {0: {
|
|
|
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39'},
|
|
|
|
|
1: {
|
|
|
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0'},
|
|
|
|
|
2: {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'},
|
|
|
|
|
3: {
|
|
|
|
|
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}}
|
|
|
|
|
|
|
|
|
|
def get_rand_headers(self):
|
|
|
|
|
return self.headers[random.randint(0, 3)]
|
|
|
|
|
self.headers = {'User-Agent': 'Mozilla/5.0 (Linux; U; Android 4.0.2; en-us; Galaxy Nexus Build/ICL53F) '
|
|
|
|
|
'AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'}
|
|
|
|
|
|
|
|
|
|
def get_longtext(self, mblogid):
|
|
|
|
|
url = self.base_longtext_url + up.urlencode({'id': mblogid})
|
|
|
|
|
headers = {
|
|
|
|
|
"cookie": f"SUBP={self.cookies['SUBP']}; SUB={self.cookies['SUB']};",
|
|
|
|
|
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.47"}
|
|
|
|
|
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
|
|
|
|
|
"Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.47"}
|
|
|
|
|
|
|
|
|
|
# TODO: 为什么cookie不能加在get里面,必须要放到headers里面???
|
|
|
|
|
res = requests.get(url, headers=headers)
|
|
|
|
|
@ -128,20 +101,17 @@ class StatusParser(threading.Thread):
|
|
|
|
|
exit()
|
|
|
|
|
return res['data']['longTextContent']
|
|
|
|
|
|
|
|
|
|
def get_comment_json(self, _id, count, uid):
|
|
|
|
|
def get_comment_json(self, _id, mid):
|
|
|
|
|
params = {
|
|
|
|
|
'is_reload': 1,
|
|
|
|
|
'id': _id,
|
|
|
|
|
'is_show_bulletin': 2,
|
|
|
|
|
'is_mix': 0,
|
|
|
|
|
'count': count,
|
|
|
|
|
'uid': uid
|
|
|
|
|
'mid': mid,
|
|
|
|
|
'max_id_type': 0
|
|
|
|
|
}
|
|
|
|
|
url = self.base_comment_url + up.urlencode(params)
|
|
|
|
|
logger.debug(url)
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
res = requests.get(url, headers=self.get_rand_headers())
|
|
|
|
|
res = requests.get(url, headers=self.headers)
|
|
|
|
|
if res.status_code == 200:
|
|
|
|
|
res = res.json()
|
|
|
|
|
return res
|
|
|
|
|
@ -152,7 +122,7 @@ class StatusParser(threading.Thread):
|
|
|
|
|
@staticmethod
|
|
|
|
|
def parse_comment_json(res):
|
|
|
|
|
collections = []
|
|
|
|
|
data = res['data'][:10]
|
|
|
|
|
data = res['data']['data'][:10]
|
|
|
|
|
for comment in data:
|
|
|
|
|
collection = {}
|
|
|
|
|
|
|
|
|
|
@ -166,31 +136,32 @@ class StatusParser(threading.Thread):
|
|
|
|
|
_time = comment['created_at']
|
|
|
|
|
|
|
|
|
|
# 获取内容
|
|
|
|
|
soup = BeautifulSoup(comment['text'], 'lxml')
|
|
|
|
|
text = soup.text
|
|
|
|
|
text = text.replace('\n', '<br/>')
|
|
|
|
|
# soup = BeautifulSoup(comment['text'], 'lxml')
|
|
|
|
|
# text = soup.text
|
|
|
|
|
# text = text.replace('\n', '<br/>')
|
|
|
|
|
text = comment['text']
|
|
|
|
|
|
|
|
|
|
# 获取赞数
|
|
|
|
|
like_counts = comment['like_counts']
|
|
|
|
|
like_count = comment['like_count']
|
|
|
|
|
|
|
|
|
|
# 构造结果
|
|
|
|
|
collection['uid'] = uid
|
|
|
|
|
collection['time'] = _time
|
|
|
|
|
collection['user_name'] = name
|
|
|
|
|
collection['text'] = text
|
|
|
|
|
collection['like_counts'] = like_counts
|
|
|
|
|
collection['total_number'] = res['total_number']
|
|
|
|
|
collection['like_count'] = like_count
|
|
|
|
|
# collection['total_number'] = res['total_number']
|
|
|
|
|
|
|
|
|
|
# 保存结果
|
|
|
|
|
collections.append(collection)
|
|
|
|
|
while len(collections) < 10:
|
|
|
|
|
collection = {'uid': None, 'time': None, 'user_name': None, 'text': res['trendsText'], 'like_counts': None, 'total_number': None}
|
|
|
|
|
collections.append(collection)
|
|
|
|
|
# while len(collections) < 10: collection = {'uid': None, 'time': None, 'user_name': None, 'text': res[
|
|
|
|
|
# 'trendsText'], 'like_counts': None, 'total_number': None} collections.append(collection)
|
|
|
|
|
|
|
|
|
|
return collections
|
|
|
|
|
|
|
|
|
|
def get(self, status, collections):
|
|
|
|
|
logger.debug('parsing status...')
|
|
|
|
|
status = status['mblog']
|
|
|
|
|
logger.debug('parsing card...')
|
|
|
|
|
collection = {}
|
|
|
|
|
|
|
|
|
|
# 获取时间
|
|
|
|
|
@ -200,19 +171,20 @@ class StatusParser(threading.Thread):
|
|
|
|
|
name = status['user']['screen_name']
|
|
|
|
|
|
|
|
|
|
# 获取主题
|
|
|
|
|
topics = []
|
|
|
|
|
try:
|
|
|
|
|
for topic in status['topic_struct']:
|
|
|
|
|
topics.append(topic['topic_title'])
|
|
|
|
|
except:
|
|
|
|
|
pass
|
|
|
|
|
# topics = []
|
|
|
|
|
# try:
|
|
|
|
|
# for topic in status['topic_struct']:
|
|
|
|
|
# topics.append(topic['topic_title'])
|
|
|
|
|
# except:
|
|
|
|
|
# pass
|
|
|
|
|
|
|
|
|
|
# 获取内容
|
|
|
|
|
soup = BeautifulSoup(status['text'], 'lxml')
|
|
|
|
|
text = soup.text
|
|
|
|
|
if '展开' in text:
|
|
|
|
|
mblogid = status['mblogid']
|
|
|
|
|
text = self.get_longtext(mblogid)
|
|
|
|
|
text = status['text']
|
|
|
|
|
# soup = BeautifulSoup(status['text'], 'lxml')
|
|
|
|
|
# text = soup.text
|
|
|
|
|
# if '展开' in text:
|
|
|
|
|
# mblogid = status['mblogid']
|
|
|
|
|
# text = self.get_longtext(mblogid)
|
|
|
|
|
|
|
|
|
|
# 获取转发数、评论数、赞数
|
|
|
|
|
reposts_count = status['reposts_count']
|
|
|
|
|
@ -220,16 +192,16 @@ class StatusParser(threading.Thread):
|
|
|
|
|
attitudes_count = status['attitudes_count']
|
|
|
|
|
|
|
|
|
|
# 获取评论
|
|
|
|
|
comments_json = self.get_comment_json(status['id'], 15, status['user']['id'])
|
|
|
|
|
comments_json = self.get_comment_json(status['id'], status['mid'])
|
|
|
|
|
comments = self.parse_comment_json(comments_json)
|
|
|
|
|
if len(comments) > comments_count:
|
|
|
|
|
comments_count = comments[0]['total_number']
|
|
|
|
|
logger.warning('Internal Weibo Error!')
|
|
|
|
|
# if len(comments) > comments_count:
|
|
|
|
|
# comments_count = comments[0]['total_number']
|
|
|
|
|
# logger.warning('Internal Weibo Error!')
|
|
|
|
|
|
|
|
|
|
# 构造结果
|
|
|
|
|
collection['time'] = _time
|
|
|
|
|
collection['user_name'] = name
|
|
|
|
|
collection['topics'] = topics
|
|
|
|
|
# collection['topics'] = topics
|
|
|
|
|
collection['text'] = text
|
|
|
|
|
collection['reposts_count'] = reposts_count
|
|
|
|
|
collection['comments_count'] = comments_count
|
|
|
|
|
|