|
|
import requests
|
|
|
from bs4 import BeautifulSoup
|
|
|
import re
|
|
|
import urllib.request, urllib.error
|
|
|
import xlwt
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
|
# Captures the href of an anchor tag that carries target="_blank".
findLink = re.compile(r'<a href="(.*?)" target="_blank">')

# Captures (href, anchor text) of such an anchor; the anchor text is used as
# the entry title.
findTitle = re.compile(r'<a href="(.*?)" target="_blank">(.*?)</a>')

# Captures the heat text inside a <span>: optional word characters, exactly
# one whitespace character, then optional digits.
findHot = re.compile(r'<span>(\w*\s\d*)</span>')
|
|
|
|
|
|
|
|
|
def main():
    """Scrape the Weibo hot-search summary page and store the rows as .xls.

    The page is fetched and parsed by ``getData`` and the resulting records
    are handed to ``saveData`` together with a fixed output path.
    """
    source_url = "https://s.weibo.com/top/summary?Refer=top_hot&topnav=1&wvr=6"
    # Absolute location of the workbook that receives the scraped rows.
    output_path = "C:\\Users\\17491\\PycharmProjects\\pythonProject\\微博热搜\\微博热搜50.xls"
    saveData(getData(source_url), output_path)
|
|
|
|
|
|
|
|
|
def getData(baseurl):
    """Fetch the hot-search page and extract one record per listed entry.

    The page text returned by ``askURL`` is parsed with BeautifulSoup and
    every ``<td class="td-02">`` cell is scanned with the module-level
    ``findLink``, ``findTitle`` and ``findHot`` patterns.

    Args:
        baseurl: URL of the page to download.

    Returns:
        A list of ``[link, title, hot]`` lists in page order. ``hot`` is the
        text captured by ``findHot``, or "置顶" when the cell contains no
        matching ``<span>``. Cells in which the anchor patterns do not match
        are skipped, so the list is empty when ``askURL`` returned "" or
        nothing on the page matched.
    """
    html = askURL(baseurl)
    soup = BeautifulSoup(html, "html.parser")

    datalist = []
    for cell in soup.find_all('td', class_="td-02"):
        cell_html = str(cell)

        link_match = findLink.search(cell_html)
        if link_match is None:
            # Not an entry cell (no anchor that opens in a new tab).
            continue

        title_match = findTitle.search(cell_html)
        if title_match is None:
            # The opening tag matched but the closing </a> did not (e.g. the
            # anchor text spans several lines); skip instead of raising
            # IndexError.
            continue

        # Read the capture groups directly. Converting the findall tuple to
        # its repr and splitting on "," truncated titles containing a comma,
        # mangled quotes/parentheses and left a leading space on every title.
        link = link_match.group(1)
        title = title_match.group(2)

        hot_match = findHot.search(cell_html)
        # Entries without a heat <span> are recorded as pinned ("置顶").
        hot = hot_match.group(1) if hot_match is not None else "置顶"

        datalist.append([link, title, hot])

    return datalist
|
|
|
|
|
|
|
|
|
def askURL(url):
    """Return the content of *url* decoded as UTF-8, or "" on a URL error.

    The request is sent with fixed ``user-agent``, ``referer`` and ``cookie``
    headers.

    Args:
        url: URL string passed to ``urllib.request.Request``.

    Returns:
        The decoded response body. If ``urllib.error.URLError`` is raised,
        its ``code`` and ``reason`` attributes (whichever are present) are
        printed and the empty string is returned.
    """
    # Headers sent with the request to imitate a browser.
    # NOTE(review): the cookie looks like a logged-in session token that is
    # hard-coded in source -- consider loading it from configuration instead.
    head = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3884.400',
        'referer': 'https://s.weibo.com/top/summary?cate=realtimehot',
        'cookie': 'SINAGLOBAL=1473897670039.8142.1645516763370; wvr=6; UOR=,,login.sina.com.cn; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFK0a_T.XV5-y6wqA-zGS6X5JpX5KMhUgL.FoqXe0-XSKz4S052dJLoIEBLxK-L12BL1KMLxK.L1KnLB--LxKqLBK5L1h5LxK-LB-qL1Kzt; webim_unReadCount=%7B%22time%22%3A1649935854450%2C%22dm_pub_total%22%3A0%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A82%2C%22msgbox%22%3A0%7D; ALF=1681532725; SSOLoginState=1649996727; SCF=AiCrtXGpPAmTMd7-e5FOVan79OEFFteewmEuiyl5x_pUkXNzxK3BymBTfGLVyXRF_QqBp-UFqPMfy-1lz_1SGJg.; SUB=_2A25PXIPnDeRhGeBK6FcV9SzFzDyIHXVsK_IvrDV8PUNbmtAKLXPlkW9NR9Wow6KZc1BRRGciOf_sm6Lisj0pINf8; _s_tentry=login.sina.com.cn; Apache=486678193467.93097.1649996729900; ULV=1649996730087:13:11:9:486678193467.93097.1649996729900:1649933903694',
    }

    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even when read() or
        # decode() raises, so the connection is not left open.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError carries both attributes; a plain URLError only `reason`.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
|
|
|
|
|
|
|
|
|
def saveData(datalist, savepath):
    """Write at most the first 50 records of *datalist* to an .xls workbook.

    Each record becomes one row on the sheet "微博热搜Top50": column 0 holds
    the rank and columns 1-3 hold the record's items 0-2. The rank cell of
    the first row is the text "置顶"; the rows after it are numbered from 1.
    Progress is printed for every row.

    Args:
        datalist: List of records, each indexable at 0, 1 and 2
            (``getData`` returns ``[link, title, hot]`` lists).
        savepath: Destination path passed to ``Workbook.save``.
    """
    print("save.......")
    print(datalist)

    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('微博热搜Top50', cell_overwrite_ok=True)

    # Iterate over what is actually there instead of a fixed range(1, 51):
    # with fewer than 50 records (or none, after a failed download) the fixed
    # range raised IndexError before the workbook was saved.
    for row, data in enumerate(datalist[:50]):
        print("第%d条" % (row + 1))  # progress output
        print(data)

        # The first row is labelled as pinned; every later row's rank equals
        # its zero-based row index.
        sheet.write(row, 0, "置顶" if row == 0 else row)
        for col in range(3):
            sheet.write(row, col + 1, data[col])

    book.save(savepath)
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the scraper only when the file is executed as a script, then print
    # a completion message.
    main()
    print("爬取完毕!")
|
|
|
|