You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

103 lines
4.1 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
from bs4 import BeautifulSoup
import re
import urllib.request, urllib.error
import xlwt
import matplotlib.pyplot as plt
# Captures the href of an anchor tag that opens in a new tab.
findLink = re.compile(r'<a href="(.*?)" target="_blank">')
# Captures (href, anchor text); the anchor text is used as the title.
findTitle = re.compile(r'<a href="(.*?)" target="_blank">(.*?)</a>')
# Captures the heat value inside a <span>: optional word characters, one
# whitespace character, then optional digits.
findHot = re.compile(r'<span>(\w*\s\d*)</span>')
def main():
    """Scrape the Weibo hot-search summary page and store the rows as XLS."""
    target_url = "https://s.weibo.com/top/summary?Refer=top_hot&topnav=1&wvr=6"
    # Absolute location of the workbook that receives the scraped rows.
    output_file = "C:\\Users\\17491\\PycharmProjects\\pythonProject\\微博热搜\\微博热搜50.xls"
    saveData(getData(target_url), output_file)
def getData(baseurl):
    """Fetch the page at *baseurl* and extract one record per hot-search cell.

    Every ``<td class="td-02">`` cell whose HTML matches both ``findLink`` and
    ``findTitle`` yields a three-element list ``[link, title, hot]``. When the
    cell has no heat ``<span>`` matched by ``findHot``, the third element is
    the literal "置顶". Cells without a parseable link/title are skipped.

    Args:
        baseurl: URL of the page, downloaded through ``askURL``.

    Returns:
        A list of ``[link, title, hot]`` lists in document order; empty when
        no cell matches (``askURL`` returns "" on a URL error).
    """
    html = askURL(baseurl)
    soup = BeautifulSoup(html, "html.parser")
    datalist = []
    for cell in soup.find_all('td', class_="td-02"):
        cell_html = str(cell)
        link_match = findLink.search(cell_html)
        title_match = findTitle.search(cell_html)
        if link_match is None or title_match is None:
            # No usable anchor in this cell; skip it instead of raising.
            continue
        # Read the title straight from the match group. Round-tripping the
        # (link, title) tuple through str()/strip()/split(",") truncated
        # titles containing commas, dropped apostrophes, and left a leading
        # space on every title.
        title = title_match.group(2)
        hot_match = findHot.search(cell_html)
        # Cells without a heat value are labelled "置顶".
        hot = hot_match.group(1) if hot_match else "置顶"
        datalist.append([link_match.group(1), title, hot])
    return datalist
def askURL(url):
    """Download *url* with browser-like headers and return the body as text.

    Args:
        url: Address to request.

    Returns:
        The response body decoded as UTF-8, or "" when the request raises
        ``urllib.error.URLError`` (the error code and/or reason is printed).
    """
    head = {  # Browser-like headers sent with the request.
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3884.400',
        'referer': 'https://s.weibo.com/top/summary?cate=realtimehot',
        # NOTE(review): hard-coded session cookie; presumably it expires and
        # should come from configuration instead. Confirm before relying on it.
        'cookie': 'SINAGLOBAL=1473897670039.8142.1645516763370; wvr=6; UOR=,,login.sina.com.cn; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFK0a_T.XV5-y6wqA-zGS6X5JpX5KMhUgL.FoqXe0-XSKz4S052dJLoIEBLxK-L12BL1KMLxK.L1KnLB--LxKqLBK5L1h5LxK-LB-qL1Kzt; webim_unReadCount=%7B%22time%22%3A1649935854450%2C%22dm_pub_total%22%3A0%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A82%2C%22msgbox%22%3A0%7D; ALF=1681532725; SSOLoginState=1649996727; SCF=AiCrtXGpPAmTMd7-e5FOVan79OEFFteewmEuiyl5x_pUkXNzxK3BymBTfGLVyXRF_QqBp-UFqPMfy-1lz_1SGJg.; SUB=_2A25PXIPnDeRhGeBK6FcV9SzFzDyIHXVsK_IvrDV8PUNbmtAKLXPlkW9NR9Wow6KZc1BRRGciOf_sm6Lisj0pINf8; _s_tentry=login.sina.com.cn; Apache=486678193467.93097.1649996729900; ULV=1649996730087:13:11:9:486678193467.93097.1649996729900:1649933903694',
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even if read() or decode()
        # raises, so the underlying connection is not leaked.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError carries both a status code and a reason; a plain URLError
        # only has a reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
def saveData(datalist, savepath, max_rows=50):
    """Write the scraped records to an XLS workbook at *savepath*.

    Each record becomes one sheet row with four columns: rank, then the first
    three elements of the record. The first row's rank cell holds the literal
    "置顶"; every later row holds its zero-based row index, so the row after
    the pinned entry is ranked 1.

    Args:
        datalist: Sequence of records, each indexable with at least the
            elements ``[link, title, hot]``.
        savepath: Destination path handed to ``Workbook.save``.
        max_rows: Maximum number of rows to write. Defaults to 50, the count
            the loop was previously hard-coded to.
    """
    print("save.......")
    print(datalist)
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)  # new workbook
    sheet = book.add_sheet('微博热搜Top50', cell_overwrite_ok=True)  # its only worksheet
    # Iterate over the rows actually scraped (capped at max_rows) so a short
    # or empty list no longer raises IndexError before the file is saved.
    for row, data in enumerate(datalist[:max_rows]):
        print("第%d条" % (row + 1))  # progress output
        print(data)
        sheet.write(row, 0, "置顶" if row == 0 else row)
        for col, value in enumerate(data[:3], start=1):
            sheet.write(row, col, value)
    book.save(savepath)
if __name__ == "__main__":
    # Script entry point: run the scrape, then report that it finished.
    main()
    print("爬取完毕!")