diff --git a/final.py b/final.py new file mode 100644 index 0000000..0e3d799 --- /dev/null +++ b/final.py @@ -0,0 +1,101 @@ +import re +import random +import requests +import datetime +import sys +import io +import json +import openpyxl +from bs4 import BeautifulSoup +import time +import pandas +import wordcloud +import numpy +import jieba +import pylint +information_list = []#用来保存爬取的弹幕 +#解决一下怎么爬出BV号,每个网站的BV号不同,根据不同的BV号我们可以对不同的视频进行操作 +bvnum_list = []#存BV号,有了BV号再循环导出弹幕 +cid_list = []#存cid号 +def findbv(): + temp_list =[] + for i in range(1,10): + url = f'https://search.bilibili.com/video?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=3&page={i}' + headers = { + "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0", + "Cookie" : "i-wanna-go-back=-1; rpdid=|(RYkm|Ykul0J'uY~YmYlYl~; DedeUserID=474817892; DedeUserID__ckMd5=2ad2b4cb2126ef63; buvid4=9AE18FFB-B9C4-8389-DF6A-72C0E6BCC34D43412-023022115-vdGT0WAv%2FuJ7%2FRzFm7Eg7A%3D%3D; buvid_fp_plain=undefined; LIVE_BUVID=AUTO4716769855903121; CURRENT_BLACKGAP=0; enable_web_push=DISABLE; buvid3=913910D1-D049-0F77-02E0-21B1093ACF0601725infoc; b_nut=1708668801; b_ut=5; _uuid=65739108E-2B75-9642-2E59-826F71099D7E203422infoc; header_theme_version=CLOSE; hit-dyn-v2=1; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; SESSDATA=709ab107%2C1729957738%2C3372c%2A42CjB0lzRiXKvwOi-Op4_kAeXHcFSeDW2GtWNhiW33Y4bLFvKzOUP5fUvm7CyuyD9Y1OISVkVDUkozNXlxUGIwLVZiNTNrTE9RMHhyeW1ycEJBZWZoa2VJYjd0WnN6UmNKTFVFWDZYWUI4NmpuYXNIT0k0T1ozNjFQVzBRbEdCaDliWGtFRjdFTkd3IIEC; bili_jct=717efb0472a4b30b96a2e6a470bdec42; sid=6a6r71lf; is-2022-channel=1; CURRENT_FNVAL=4048; fingerprint=254d2c4cf86e9c4fe76c3c00ffd0de23; CURRENT_QUALITY=80; buvid_fp=e440b4a9c5e0b9af9ad8950245b201f7; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY1Nzc5NjIsImlhdCI6MTcyNjMxODcwMiwicGx0IjotMX0.ruBBQK0YAJ7TyuLBgdfVijbpzBhfwGTglNFMMAmhv-c; bili_ticket_expires=1726577902; PVID=6; home_feed_column=5; browser_resolution=1707-837; b_lsid=610579158_191F9ED6DE3; bp_t_offset_474817892=977706466930589696" + } + result = requests.get(url=url,headers= headers) + result.encoding = "utf-8" + tempresult = result.text + tempnum = re.findall(r'bvid:"(.*?)"', tempresult) + temp_list.extend(tempnum) + return temp_list +#接下来抓取cid +def getcid(list): + temp_cid_list = [] + try: + for i in range(300): + temp_url = f'https://api.bilibili.com/x/player/pagelist?bvid={list[i]}&jsonp=jsonp' + temp_headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0", + 'Cookie': "i-wanna-go-back=-1; rpdid=|(RYkm|Ykul0J'uY~YmYlYl~; DedeUserID=474817892; DedeUserID__ckMd5=2ad2b4cb2126ef63; buvid4=9AE18FFB-B9C4-8389-DF6A-72C0E6BCC34D43412-023022115-vdGT0WAv%2FuJ7%2FRzFm7Eg7A%3D%3D; buvid_fp_plain=undefined; LIVE_BUVID=AUTO4716769855903121; CURRENT_BLACKGAP=0; enable_web_push=DISABLE; buvid3=913910D1-D049-0F77-02E0-21B1093ACF0601725infoc; b_nut=1708668801; b_ut=5; _uuid=65739108E-2B75-9642-2E59-826F71099D7E203422infoc; header_theme_version=CLOSE; hit-dyn-v2=1; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; SESSDATA=709ab107%2C1729957738%2C3372c%2A42CjB0lzRiXKvwOi-Op4_kAeXHcFSeDW2GtWNhiW33Y4bLFvKzOUP5fUvm7CyuyD9Y1OISVkVDUkozNXlxUGIwLVZiNTNrTE9RMHhyeW1ycEJBZWZoa2VJYjd0WnN6UmNKTFVFWDZYWUI4NmpuYXNIT0k0T1ozNjFQVzBRbEdCaDliWGtFRjdFTkd3IIEC; bili_jct=717efb0472a4b30b96a2e6a470bdec42; sid=6a6r71lf; is-2022-channel=1; CURRENT_FNVAL=4048; fingerprint=254d2c4cf86e9c4fe76c3c00ffd0de23; CURRENT_QUALITY=80; buvid_fp=e440b4a9c5e0b9af9ad8950245b201f7; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY1Nzc5NjIsImlhdCI6MTcyNjMxODcwMiwicGx0IjotMX0.ruBBQK0YAJ7TyuLBgdfVijbpzBhfwGTglNFMMAmhv-c; bili_ticket_expires=1726577902; PVID=6; home_feed_column=5; browser_resolution=1707-837; b_lsid=610579158_191F9ED6DE3; bp_t_offset_474817892=977706466930589696" + } + respone = requests.get(url = temp_url,headers = temp_headers) + respone.encoding = 'utf-8' + if respone.status_code == 200: + temp_cid_list.extend(re.findall(r'"cid":(.*?),',respone.text)) + except requests.RequestException as e: + print(f"An error occurred: {e}") + + return temp_cid_list + +def get_information(list): + sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8') + temp_information_list = [] + for i in range(300): + temp_url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={list[i]}' + temp_headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0", + 'Cookie': "i-wanna-go-back=-1; rpdid=|(RYkm|Ykul0J'uY~YmYlYl~; DedeUserID=474817892; DedeUserID__ckMd5=2ad2b4cb2126ef63; buvid4=9AE18FFB-B9C4-8389-DF6A-72C0E6BCC34D43412-023022115-vdGT0WAv%2FuJ7%2FRzFm7Eg7A%3D%3D; buvid_fp_plain=undefined; LIVE_BUVID=AUTO4716769855903121; CURRENT_BLACKGAP=0; enable_web_push=DISABLE; buvid3=913910D1-D049-0F77-02E0-21B1093ACF0601725infoc; b_nut=1708668801; b_ut=5; _uuid=65739108E-2B75-9642-2E59-826F71099D7E203422infoc; header_theme_version=CLOSE; hit-dyn-v2=1; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; SESSDATA=709ab107%2C1729957738%2C3372c%2A42CjB0lzRiXKvwOi-Op4_kAeXHcFSeDW2GtWNhiW33Y4bLFvKzOUP5fUvm7CyuyD9Y1OISVkVDUkozNXlxUGIwLVZiNTNrTE9RMHhyeW1ycEJBZWZoa2VJYjd0WnN6UmNKTFVFWDZYWUI4NmpuYXNIT0k0T1ozNjFQVzBRbEdCaDliWGtFRjdFTkd3IIEC; bili_jct=717efb0472a4b30b96a2e6a470bdec42; sid=6a6r71lf; is-2022-channel=1; CURRENT_FNVAL=4048; fingerprint=254d2c4cf86e9c4fe76c3c00ffd0de23; CURRENT_QUALITY=80; buvid_fp=e440b4a9c5e0b9af9ad8950245b201f7; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY1Nzc5NjIsImlhdCI6MTcyNjMxODcwMiwicGx0IjotMX0.ruBBQK0YAJ7TyuLBgdfVijbpzBhfwGTglNFMMAmhv-c; bili_ticket_expires=1726577902; PVID=6; home_feed_column=5; browser_resolution=1707-837; b_lsid=610579158_191F9ED6DE3; bp_t_offset_474817892=977706466930589696" + } + try: + response = requests.get(url=temp_url, headers=temp_headers) + response.encoding = 'utf-8' + temp_list = re.findall('(.*?)',response.text) + temp_information_list.extend(temp_list) + except requests.RequestException as e: + print(f"An error occurred: {e}") + time.sleep(0.5) + return temp_information_list + + + +bvnum_list = findbv() +cid_list = getcid(bvnum_list) +information_list = get_information(cid_list) +for i in range(len(information_list)): + with open("E:\python1\\requirement1.txt","a",encoding= "utf-8") as f: #/r要做出区分 + f.write(information_list[i]) + f.write("\n") +f = open("E:\python1\\requirement1.txt",encoding= "utf-8") +txt = f.read() +string = ' '.join(jieba.lcut(txt)) +wc = wordcloud.WordCloud( + width = 500, + height = 500, + background_color= 'white', + scale = 10, + font_path = 'msyh.ttc', +) +wc.generate(string) +wc.to_file("wordcloud.png") +df = pandas.DataFrame(information_list) +df.to_excel('spider1.xlsx',index = False) + + + + + + +