You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

4.1 KiB

# Standard-library imports
import re
import urllib.error
import urllib.request

# Third-party imports
import matplotlib.pyplot as plt  # NOTE(review): unused in this file — confirm before removing
import requests  # NOTE(review): unused in this file (fetching is done via urllib)
import xlwt
from bs4 import BeautifulSoup

# Regex patterns for extracting fields from each hot-search <td class="td-02"> cell.
#
# NOTE(review): the HTML-tag portions of these patterns were stripped when this
# file passed through a web renderer; the tags below are a reconstruction from
# the surviving capture groups and from how getData() consumes the matches.
# TODO: verify against the live markup of s.weibo.com before relying on them.

# Extract the entry's link (href of the anchor tag).
findLink = re.compile(r'<a href="(.*?)"')

# Extract the title. Two groups (href, text): getData() parses the resulting
# tuple repr and takes element [1], so the original pattern must have captured
# both the href and the anchor text.
findTitle = re.compile(r'<a href="(.*?)" target="_blank">(.*?)</a>')

# Extract the hotness value, e.g. "剧综 3401535" (the capture group survived intact).
findHot = re.compile(r'<span>(\w*\s\d*)</span>')

def main():
    """Entry point: scrape the Weibo hot-search list and save it to an XLS file."""
    baseurl = "https://s.weibo.com/top/summary?Refer=top_hot&topnav=1&wvr=6"
    datalist = getData(baseurl)
    # Raw string is required: in a plain string "\U..." is an invalid unicode
    # escape in Python 3 and "\1" would be an octal escape.
    savepath = r"C:\Users\17491\PycharmProjects\pythonProject\微博热搜\微博热搜50.xls"
    saveData(datalist, savepath)

def getData(baseurl):
    """Fetch the hot-search page at *baseurl* and parse it into rows.

    Returns a list of 3-element lists ``[link, title, hot]``. The pinned
    entry (which carries no hotness value) gets the literal string "置顶"
    in the hot slot.
    """
    datalist = []
    html = askURL(baseurl)
    soup = BeautifulSoup(html, "html.parser")
    for item in soup.find_all('td', class_="td-02"):
        item = str(item)
        # Hoisted: the original ran findall(findLink, ...) twice per cell.
        links = re.findall(findLink, item)
        if not links:
            # Cells without an anchor are not hot-search entries — skip them.
            continue
        data = [links[0]]
        # findTitle captures two groups, so findall yields tuples; the original
        # round-trips through str() — "('href', 'text')" — strips the parens
        # and quotes, splits on the comma, and keeps element [1] (the text,
        # with its leading space). Behavior preserved.
        title = str(re.findall(findTitle, item)[0])
        title = title.strip('(').strip(')').replace("'", "").split(",")
        data.append(title[1])
        hots = re.findall(findHot, item)
        if not hots:
            data.append("置顶")  # pinned entry has no hotness value
        else:
            data.append(hots[0])
        datalist.append(data)
    return datalist

# Fetch the page content of a given URL.

def askURL(url):
    """Request *url* with browser-like headers and return the page as a str.

    On URLError the HTTP code and/or reason are printed and an empty string
    is returned, so callers always receive a str.
    """
    head = {
        # Mimic browser headers so the server treats the request as a normal visitor.
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3884.400',
        'referer': 'https://s.weibo.com/top/summary?cate=realtimehot',
        # NOTE(review): hard-coded session cookie — it expires; refresh it if
        # the scrape starts returning a login page.
        'cookie': 'SINAGLOBAL=1473897670039.8142.1645516763370; wvr=6; UOR=,,login.sina.com.cn; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFK0a_T.XV5-y6wqA-zGS6X5JpX5KMhUgL.FoqXe0-XSKz4S052dJLoIEBLxK-L12BL1KMLxK.L1KnLB--LxKqLBK5L1h5LxK-LB-qL1Kzt; webim_unReadCount=%7B%22time%22%3A1649935854450%2C%22dm_pub_total%22%3A0%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A82%2C%22msgbox%22%3A0%7D; ALF=1681532725; SSOLoginState=1649996727; SCF=AiCrtXGpPAmTMd7-e5FOVan79OEFFteewmEuiyl5x_pUkXNzxK3BymBTfGLVyXRF_QqBp-UFqPMfy-1lz_1SGJg.; SUB=_2A25PXIPnDeRhGeBK6FcV9SzFzDyIHXVsK_IvrDV8PUNbmtAKLXPlkW9NR9Wow6KZc1BRRGciOf_sm6Lisj0pINf8; _s_tentry=login.sina.com.cn; Apache=486678193467.93097.1649996729900; ULV=1649996730087:13:11:9:486678193467.93097.1649996729900:1649996729900'.replace('1649996729900:1649996729900', '1649996729900:1649933903694'),
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # Best-effort diagnostics; deliberately swallow the error and fall
        # through to return the (empty) html so the caller never crashes here.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html

def saveData(datalist, savepath):
    """Write the scraped rows to an .xls workbook at *savepath*.

    Row 0 is the pinned entry (labelled "置顶" in column 0); subsequent rows
    carry their rank number in column 0 followed by link, title and hotness.
    """
    print("save.......")
    print(datalist)
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)  # create the workbook
    sheet = book.add_sheet('微博热搜Top50', cell_overwrite_ok=True)  # create the worksheet
    # Iterate over however many entries were actually scraped instead of a
    # hard-coded 50, so a short list no longer raises IndexError.
    for i in range(1, len(datalist) + 1):
        print("第%d条" % (i))  # progress output, kept for debugging
        data = datalist[i - 1]
        print(data)
        if i == 1:
            # First row is the pinned entry: label it instead of numbering it.
            sheet.write(0, 0, "置顶")
            sheet.write(0, 1, data[0])
            sheet.write(0, 2, data[1])
            sheet.write(0, 3, data[2])
        else:
            sheet.write(i - 1, 0, i - 1)  # rank number
            for j in range(1, 4):
                sheet.write(i - 1, j, data[j - 1])  # link / title / hotness
    book.save(savepath)  # persist the workbook

# Markdown rendering stripped the dunder underscores; restored so the guard
# actually fires when the script is executed directly.
if __name__ == "__main__":
    main()
    print("爬取完毕!")