Basic functionality roughly complete; auto-refresh and page hand-off issues remain

master
wufayuan 4 years ago
parent cf762d91be
commit 0fae2c112f

@ -6,5 +6,9 @@
</content>
<orderEntry type="jdk" jdkName="Python 3.10 (DWSpider)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="bootstrap" level="application" />
<orderEntry type="library" name="bootstrap-table" level="application" />
<orderEntry type="library" name="jquery" level="application" />
<orderEntry type="library" name="bootstrap-table-zh-CN" level="application" />
</component>
</module>

File diff suppressed because it is too large

@ -19,13 +19,16 @@ class AC(threading.Thread):
self.base_url = r'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)'
self.excutable_path = _executable_path
self.loginer = login.Loginer(str(settings.CONFIG_DIR) + 'config.json')
self.headers = {0: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39'},
self.headers = {
0: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39'},
1: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0'},
2: {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'},
3: {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}}
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/69.0.3497.100 Safari/537.36'}}
def get_rand_headers(self):
return self.headers[random.randint(0, 3)]
@ -48,7 +51,8 @@ class AC(threading.Thread):
url = base_url + urlencode(params)
headers = {
"cookie": f"SUBP={self.cookies['SUBP']}; SUB={self.cookies['SUB']};",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.47"
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.47 "
}
res = requests.get(url, headers=headers)
if res.status_code == 200:
@ -82,7 +86,7 @@ class AC(threading.Thread):
def write2json(self):
word = self.word.replace('#', '')
with open(str(settings.BASE_DIR) + rf'\{self.loginer.username}_associative_crawl_result_{word}.json', 'w',
with open(str(settings.DATA_DIR) + f'{self.loginer.username}_associative_crawl_result_{word}.json', 'w',
encoding='utf-8') as f:
json.dump(self.collections, f, ensure_ascii=False, indent=4)
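The str(settings.DATA_DIR) + f'…' concatenation above only yields a valid path if DATA_DIR ends with a separator. A separator-safe sketch, assuming DATA_DIR is a plain string or Path (result_path is a hypothetical helper, not part of this commit):

    from pathlib import Path

    def result_path(data_dir, username: str, word: str) -> Path:
        # Joins correctly whether or not data_dir ends with a separator.
        return Path(data_dir) / f'{username}_associative_crawl_result_{word}.json'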
@ -92,12 +96,14 @@ class HashtagParser(threading.Thread):
super().__init__()
self.collections = collections
self.headers = {0: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39'},
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39'},
1: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0'},
2: {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'},
3: {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}}
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/69.0.3497.100 Safari/537.36'}}
def get_rand_headers(self):
return self.headers[random.randint(0, 3)]
@ -148,6 +154,7 @@ class HashtagParser(threading.Thread):
@staticmethod
def parse_comment_json(_res):
trends_text = _res['trendsText']
_res = _res['data'][:10]
collections = []
for res in _res:
@ -158,7 +165,7 @@ class HashtagParser(threading.Thread):
while len(collections) < 10:
collection = {'uid': None, 'commentor_name': None,
'time': None,
'text': _res['trendsText'], 'like_counts': None}
'text': trends_text, 'like_counts': None}
collections.append(collection)
return collections

@ -1,10 +1,8 @@
import json
import random
import threading
import urllib.parse as up
import requests
from bs4 import BeautifulSoup
from loguru import logger
from Weibo_Spider import settings
@ -15,20 +13,12 @@ class Crawler(threading.Thread):
def __init__(self, executable_path=settings.MSEDGEDRIVER_PATH) -> None:
super().__init__()
self.cookies = None
self.base_url = r'https://weibo.com/ajax/feed/hottimeline?'
self.headers = {0: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39'},
1: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0'},
2: {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'},
3: {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}}
self.base_url = r'https://m.weibo.cn/api/container/getIndex?containerid=102803&openApp=0'
self.headers = {'User-Agent': 'Mozilla/5.0 (Linux; U; Android 4.0.2; en-us; Galaxy Nexus Build/ICL53F) '
'AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'}
self.excutable_path = executable_path
self.loginer = login.Loginer(str(settings.CONFIG_DIR)+r'config.json')
def get_rand_headers(self):
return self.headers[random.randint(0, 3)]
def get_cookies(self):
self.cookies = self.loginer.run()
@ -41,22 +31,12 @@ class Crawler(threading.Thread):
cookies[cookie['name']] = cookie['value']
self.cookies = cookies
def get_hotline_json(self, max_id=0, since_id=0, count=15):
params = {
'since_id': since_id, # presumably the starting offset into the hot-timeline list
'refresh': 0,
'group_id': 102803,
'containerid': 102803,
'extparam': 'discover|new_feed',
'max_id': max_id, # presumably selects the first page of the hot list by default
'count': count # presumably the number of hot posts per request; at most 100 per page, defaults to 10 beyond that
}
url = self.base_url + up.urlencode(params)
logger.debug(url)
def get_hotline_json(self):
try:
res = requests.get(url, headers=self.get_rand_headers())
res = requests.get(self.base_url, headers=self.headers)
if res.status_code == 200:
with open('mweibo.html', encoding='utf-8', mode='w') as f:
f.write(res.text)
res = res.json()
return res
except Exception as e:
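The removed since_id/max_id/count parameters belonged to the old desktop hottimeline API; the mobile container endpoint pages with a cursor instead. A hedged sketch of fetching several pages, assuming the response exposes that cursor as data.cardlistInfo.since_id (an assumed field; the committed code only reads the first page):

    import requests

    BASE_URL = 'https://m.weibo.cn/api/container/getIndex?containerid=102803&openApp=0'
    HEADERS = {'User-Agent': 'Mozilla/5.0 (Linux; U; Android 4.0.2; en-us; Galaxy Nexus '
                             'Build/ICL53F) AppleWebKit/534.30 (KHTML, like Gecko) '
                             'Version/4.0 Mobile Safari/534.30'}

    def fetch_hotline_cards(pages=2):
        cards, since_id = [], None
        for _ in range(pages):
            url = BASE_URL if since_id is None else f'{BASE_URL}&since_id={since_id}'
            res = requests.get(url, headers=HEADERS, timeout=10)
            res.raise_for_status()
            data = res.json().get('data', {})
            cards.extend(data.get('cards', []))
            since_id = data.get('cardlistInfo', {}).get('since_id')  # assumed cursor field
            if since_id is None:
                break
        return cards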
@ -64,7 +44,7 @@ class Crawler(threading.Thread):
def parse_hotline_json(self, res):
collections = []
statuses = res['statuses'][:10]
statuses = res['data']['cards'][:10]
parsers = []
for status in statuses:
@ -97,24 +77,17 @@ class StatusParser(threading.Thread):
def __init__(self, cookies):
super().__init__()
self.cookies = cookies
self.base_comment_url = r'https://weibo.com/ajax/statuses/buildComments?'
self.base_comment_url = r'https://m.weibo.cn/comments/hotflow?'
self.base_longtext_url = r'https://weibo.com/ajax/statuses/longtext?'
self.headers = {0: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39'},
1: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0'},
2: {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'},
3: {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}}
def get_rand_headers(self):
return self.headers[random.randint(0, 3)]
self.headers = {'User-Agent': 'Mozilla/5.0 (Linux; U; Android 4.0.2; en-us; Galaxy Nexus Build/ICL53F) '
'AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'}
def get_longtext(self, mblogid):
url = self.base_longtext_url + up.urlencode({'id': mblogid})
headers = {
"cookie": f"SUBP={self.cookies['SUBP']}; SUB={self.cookies['SUB']};",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.47"}
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.47"}
# TODO: why does the cookie have to go into headers instead of being passed to get() directly?
res = requests.get(url, headers=headers)
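On the TODO above: requests does accept a cookies mapping via its cookies= keyword and builds an equivalent Cookie header from it. A sketch of that form (the mblogid and the SUBP/SUB values are placeholders); if Weibo rejects it, the cause is more likely some other header difference than how the cookie is attached:

    import urllib.parse as up
    import requests

    url = 'https://weibo.com/ajax/statuses/longtext?' + up.urlencode({'id': 'MBLOGID'})
    cookies = {'SUBP': 'SUBP_VALUE', 'SUB': 'SUB_VALUE'}  # placeholder values
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36'}
    # Equivalent to hand-writing "SUBP=...; SUB=...;" into a cookie header.
    res = requests.get(url, headers=headers, cookies=cookies, timeout=10)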
@ -128,20 +101,17 @@ class StatusParser(threading.Thread):
exit()
return res['data']['longTextContent']
def get_comment_json(self, _id, count, uid):
def get_comment_json(self, _id, mid):
params = {
'is_reload': 1,
'id': _id,
'is_show_bulletin': 2,
'is_mix': 0,
'count': count,
'uid': uid
'mid': mid,
'max_id_type': 0
}
url = self.base_comment_url + up.urlencode(params)
logger.debug(url)
try:
res = requests.get(url, headers=self.get_rand_headers())
res = requests.get(url, headers=self.headers)
if res.status_code == 200:
res = res.json()
return res
@ -152,7 +122,7 @@ class StatusParser(threading.Thread):
@staticmethod
def parse_comment_json(res):
collections = []
data = res['data'][:10]
data = res['data']['data'][:10]
for comment in data:
collection = {}
@ -166,31 +136,32 @@ class StatusParser(threading.Thread):
_time = comment['created_at']
# get the content
soup = BeautifulSoup(comment['text'], 'lxml')
text = soup.text
text = text.replace('\n', '<br/>')
# soup = BeautifulSoup(comment['text'], 'lxml')
# text = soup.text
# text = text.replace('\n', '<br/>')
text = comment['text']
# get the like count
like_counts = comment['like_counts']
like_count = comment['like_count']
# assemble the result
collection['uid'] = uid
collection['time'] = _time
collection['user_name'] = name
collection['text'] = text
collection['like_counts'] = like_counts
collection['total_number'] = res['total_number']
collection['like_count'] = like_count
# collection['total_number'] = res['total_number']
# save the result
collections.append(collection)
while len(collections) < 10:
collection = {'uid': None, 'time': None, 'user_name': None, 'text': res['trendsText'], 'like_counts': None, 'total_number': None}
collections.append(collection)
# while len(collections) < 10:
#     collection = {'uid': None, 'time': None, 'user_name': None,
#                   'text': res['trendsText'], 'like_counts': None, 'total_number': None}
#     collections.append(collection)
return collections
def get(self, status, collections):
logger.debug('parsing status...')
status = status['mblog']
logger.debug('parsing card...')
collection = {}
# get the time
@ -200,19 +171,20 @@ class StatusParser(threading.Thread):
name = status['user']['screen_name']
# get the topics
topics = []
try:
for topic in status['topic_struct']:
topics.append(topic['topic_title'])
except:
pass
# topics = []
# try:
# for topic in status['topic_struct']:
# topics.append(topic['topic_title'])
# except:
# pass
# get the content
soup = BeautifulSoup(status['text'], 'lxml')
text = soup.text
if '展开' in text:
mblogid = status['mblogid']
text = self.get_longtext(mblogid)
text = status['text']
# soup = BeautifulSoup(status['text'], 'lxml')
# text = soup.text
# if '展开' in text:
# mblogid = status['mblogid']
# text = self.get_longtext(mblogid)
# get repost, comment, and like counts
reposts_count = status['reposts_count']
@ -220,16 +192,16 @@ class StatusParser(threading.Thread):
attitudes_count = status['attitudes_count']
# get the comments
comments_json = self.get_comment_json(status['id'], 15, status['user']['id'])
comments_json = self.get_comment_json(status['id'], status['mid'])
comments = self.parse_comment_json(comments_json)
if len(comments) > comments_count:
comments_count = comments[0]['total_number']
logger.warning('Internal Weibo Error!')
# if len(comments) > comments_count:
# comments_count = comments[0]['total_number']
# logger.warning('Internal Weibo Error!')
# assemble the result
collection['time'] = _time
collection['user_name'] = name
collection['topics'] = topics
# collection['topics'] = topics
collection['text'] = text
collection['reposts_count'] = reposts_count
collection['comments_count'] = comments_count

@ -6,7 +6,7 @@ urlpatterns = [
path('hotline', views.index, name='hotline'),
path('hotline_comments', views.hotline_comments, name='hotline_comments'),
path('ac_result', views.AC_result, name='ac'),
path('ac_result', views.AC_result, name='ac_result'),
path('ac_comments', views.ac_comments, name='ac_comments'),
path('get_ac_result', views.get_ac_result, name='get_ac_result'),

@ -1,6 +1,4 @@
import json
import os
from time import sleep
from django.shortcuts import render
@ -14,7 +12,12 @@ from hotline_crawler.associative_crawler import AC
from hotline_crawler.crawler import Crawler
configFile = str(settings.BASE_DIR) + r'\\config.json'
configFile = settings.CONFIG_DIR + 'config.json'
with open(configFile, "r") as f:
config_dict = json.load(f)
username = config_dict["username"]
hotlineFile = settings.DATA_DIR + f'{username}_crawl_result.json'
def index(request):
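Reading config.json at import time pins username for the life of the server process. If the logged-in user can change without a restart, a per-request lookup is one option (current_username is a hypothetical helper, not part of this commit):

    def current_username() -> str:
        # Re-read the config on each call so a changed username takes
        # effect without restarting the Django server.
        with open(configFile, 'r') as f:
            return json.load(f)['username']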
@ -22,45 +25,37 @@ def index(request):
def hotline_comments(request):
index = int(request.GET['index'])
index = {'index': index}
# print(index)
return render(request=request, template_name='hotline_comments.html', context=index)
_index = int(request.GET['index'])
_index = {'index': _index}
return render(request=request, template_name='hotline_comments.html', context=_index)
def get_hotline_result(request):
with open(configFile, "r") as f:
config_dict = json.load(f)
username = config_dict["username"]
result = f'{username}_crawl_result.json'
logger.debug('crawling hotline...')
crawler = Crawler()
crawler.start()
crawler.join()
# crawler = Crawler()
# crawler.start()
# crawler.join()
logger.debug('crawling hotline end!')
with open(result, 'r', encoding='utf-8') as f:
res = f.read()
return HttpResponse(res)
with open(hotlineFile, 'r', encoding='utf-8') as fh:
return HttpResponse(json.dumps(json.load(fh)))
def get_hotline_comments(request):
index = int(request.GET['index'])
with open(configFile, "r") as f:
config_dict = json.load(f)
logger.debug('crawling hotline...')
# crawler = Crawler()
# crawler.start()
# crawler.join()
logger.debug('crawling hotline end!')
username = config_dict["username"]
result = f'{username}_crawl_result.json'
_index = int(request.GET['index'])
with open(result, 'r', encoding='utf-8') as f:
res = json.load(f)
with open(hotlineFile, 'r', encoding='utf-8') as fh:
res = json.load(fh)
comments = []
for hotline in res:
comments.append(hotline['comments'])
# print(comments)
return HttpResponse(json.dumps(comments[index]))
return HttpResponse(json.dumps(comments[_index]))
def AC_result(request):
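get_hotline_result above parses the result file with json.load only to re-serialize it with json.dumps. If the file is trusted to already hold valid JSON, the round trip can be skipped (a sketch, not the committed code):

    from django.http import HttpResponse

    def get_hotline_result(request):
        with open(hotlineFile, 'r', encoding='utf-8') as fh:
            return HttpResponse(fh.read(), content_type='application/json')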
@ -71,11 +66,10 @@ def AC_result(request):
def ac_comments(request):
index = int(request.GET['index'])
_index = int(request.GET['index'])
word = request.GET['word']
context = {'index': index, 'word': word}
context = {'index': _index, 'word': word}
logger.debug(context)
# print(index)
return render(request=request, template_name='ac_comments.html', context=context)
@ -83,35 +77,31 @@ def get_ac_result(request):
word = request.GET['word']
logger.debug(word)
logger.debug('associative searching...')
ac = AC('#'+word+'#')
ac.start()
ac.join()
# ac = AC('#'+word+'#')
# ac.start()
# ac.join()
logger.debug('associative searching end!')
with open(configFile, "r") as f:
config_dict = json.load(f)
username = config_dict["username"]
result = f'{username}_associative_crawl_result_{word}.json'
with open(result, 'r', encoding='utf-8') as f:
return HttpResponse(json.dumps(json.load(f)))
result = settings.DATA_DIR+f'{username}_associative_crawl_result_{word}.json'
with open(result, 'r', encoding='utf-8') as fh:
return HttpResponse(json.dumps(json.load(fh)))
def get_ac_comments(request):
word = request.GET['word']
index = int(request.GET['index'])
with open(configFile, "r") as f:
config_dict = json.load(f)
username = config_dict["username"]
result = f'{username}_associative_crawl_result_{word}.json'
with open(result, 'r', encoding='utf-8') as f:
res = json.load(f)
_index = int(request.GET['index'])
logger.debug('associative searching...')
# ac = AC('#'+word+'#')
# ac.start()
# ac.join()
logger.debug('associative searching end!')
result = settings.DATA_DIR+f'{username}_associative_crawl_result_{word}.json'
with open(result, 'r', encoding='utf-8') as fh:
res = json.load(fh)
comments = []
for ac in res:
try:
comments.append(ac['comments'])
except:
logger.debug(ac)
comments.append([])
# logger.debug(comments)
return HttpResponse(json.dumps(comments[index]))
comments.append(ac['comments'])
return HttpResponse(json.dumps(comments[_index]))

@ -10,7 +10,7 @@
<script src="https://cdn.bootcss.com/bootstrap-table/1.15.4/bootstrap-table.min.js"></script>
<script src="https://cdn.bootcss.com/bootstrap-table/1.15.4/locale/bootstrap-table-zh-CN.min.js"></script>
</head>
<body onload="JavaScript:AutoRefresh(5000);">
<body onload="AutoRefresh(5000);">
<!-- <a href="javascript:;" _id="remove"><span class="hidden-480">删除</span></a>-->
<table id="mytab" class="table table-hover"></table>
<script>
@ -24,13 +24,6 @@
pageSize: 10, // records per page
pageList: [5, 10],
showRefresh : true, // refresh button
queryParams: function(params) { // parameters sent to the server
var temp = {
name: $("#sname").val(),
viewReason: $("#viewReason").val(),
};
return temp;
},
columns: [{
checkbox: true
}, {
@ -66,11 +59,12 @@
// show td content in a tooltip on hover when the table overflows its width
function paramsMatter(value, row, index) {
var span = document.createElement("span");
let span = document.createElement("span");
span.setAttribute("title", value);
span.innerHTML = value;
return span.outerHTML;
}
// fix td width and hide content that overflows it
function formatTableUnit(value, row, index) {
return {
@ -82,27 +76,6 @@
}
}
}
// delete-button click handler
$("#remove").on("click", function() {
if(!confirm("是否确认删除?"))
return;
var rows = $("#mytab").bootstrapTable('getSelections'); // get the rows slated for deletion
if(rows.length == 0) { // rows only tells us whether anything is selected; the else branch does the real work
alert("请先选择要删除的记录!");
return;
} else {
var ids = new Array(); // declare an array
$(rows).each(function() { // iterate over the selected rows
ids.push(this.id); // id is one column of the selected row
});
// backend delete call
deleteMs(ids)
}
})
</script>
</body>

@ -40,10 +40,6 @@
title: '用户昵称',
field: 'user_name',
}, {
title: '主题',
field: 'topics',
formatter: topics
}, {
title: '微博正文',
field: 'text',
@ -69,21 +65,21 @@
}
// render topics as hyperlinks
function topics(value, row, index) {
<!-- value = value.split(',')-->
topics = ''
for (t = 0; t < value.length; t++)
topics += `<a href="/index/ac_result?word=${value[t]}">${value[t]}</a><br/>`
return topics
}
// function topics(value, row, index) {
// <!-- value = value.split(',')-->
// topics = ''
// for (t = 0; t < value.length; t++)
// topics += `<a href="/index/ac_result?word=${value[t]}">${value[t]}</a><br/>`
// return topics
// }
// render links as hyperlinks
function links(value, row, index) {
links = ''
for (l = 0; l < value.length; l++)
links += `<a href=${value[l]}>${value[l]}</a><br/>`
return links
}
// function links(value, row, index) {
// links = ''
// for (l = 0; l < value.length; l++)
// links += `<a href=${value[l]}>${value[l]}</a><br/>`
// return links
// }
// define the delete/update buttons
function option(value, row, index) {

@ -24,13 +24,6 @@
pageSize: 10, // records per page
pageList: [5, 10],
showRefresh : true, // refresh button
queryParams: function(params) { // parameters sent to the server
var temp = {
name: $("#sname").val(),
viewReason: $("#viewReason").val(),
};
return temp;
},
columns: [{
checkbox: true
}, {
@ -48,25 +41,18 @@
field: 'text',
}, {
title: '点赞数',
field: 'like_counts'
field: 'like_count'
}],
})
// define the delete/update buttons
function option(value, row, index) {
var htm = "";
htm += '<button id="dupdevice" deviceId="' + value +
'" onclick="updDevice(' + value + ')">编辑</button>'
return htm;
}
// show td content in a tooltip on hover when the table overflows its width
function paramsMatter(value, row, index) {
var span = document.createElement("span");
let span = document.createElement("span");
span.setAttribute("title", value);
span.innerHTML = value;
return span.outerHTML;
}
// fix td width and hide content that overflows it
function formatTableUnit(value, row, index) {
return {
@ -78,27 +64,6 @@
}
}
}
// delete-button click handler
$("#remove").on("click", function() {
if(!confirm("是否确认删除?"))
return;
var rows = $("#mytab").bootstrapTable('getSelections'); // get the rows slated for deletion
if(rows.length == 0) { // rows only tells us whether anything is selected; the else branch does the real work
alert("请先选择要删除的记录!");
return;
} else {
var ids = new Array(); // declare an array
$(rows).each(function() { // iterate over the selected rows
ids.push(this.id); // id is one column of the selected row
});
// backend delete call
deleteMs(ids)
}
})
</script>
</body>
