You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
pa4kjlu8f 44d46fd3a3
生成的词云图
2 months ago
README.md Update README.md 2 months ago
requirement.txt ADD file via upload 2 months ago
spider1.xlsx ADD file via upload 2 months ago
wordcloud.png 生成的词云图 2 months ago

README.md

import datetime
import io
import json
import os
import random
import re
import sys
import time

import jieba
import numpy
import openpyxl
import pandas
import pylint
import requests
import wordcloud
from bs4 import BeautifulSoup

# Holds the scraped danmaku (bullet comments).
information_list = []
# Every video has its own BV id; once the ids are known, each video can be
# processed individually.
bvnum_list = []  # BV ids; looped over later to export the danmaku
cid_list = []  # cid numbers

_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
)
# Seconds to wait for a response before giving up on a request.
_REQUEST_TIMEOUT = 10


def _build_headers():
    """Return the HTTP headers shared by the search and pagelist requests.

    The session cookie is read from the ``BILIBILI_COOKIE`` environment
    variable rather than being embedded in the source, so credentials are
    never committed. When the variable is unset, no Cookie header is sent.
    """
    headers = {"User-Agent": _USER_AGENT}
    cookie = os.environ.get("BILIBILI_COOKIE")
    if cookie:
        headers["Cookie"] = cookie
    return headers


def findbv():
    """Collect BV ids from search result pages 1-9.

    The search keyword is URL-encoded in the query string
    ("2024巴黎奥运会"). Every ``bvid:"..."`` occurrence in each page's text
    is collected, in page order.

    Returns:
        list[str]: The BV ids found (may be empty, may contain duplicates).
    """
    found = []
    headers = _build_headers()
    for i in range(1, 10):
        url = f'https://search.bilibili.com/video?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=3&page={i}'
        try:
            result = requests.get(
                url=url, headers=headers, timeout=_REQUEST_TIMEOUT
            )
        except requests.RequestException as e:
            # Same reporting style as getcid; a failed page is skipped so
            # the remaining pages are still scraped.
            print(f"An error occurred: {e}")
            continue
        result.encoding = "utf-8"
        # Non-greedy capture of the whole id; "(.?)" could capture at most
        # one character and therefore never matched a real BV id.
        found.extend(re.findall(r'bvid:"(.*?)"', result.text))
    return found


# Next: fetch the cid of every video.
def getcid(list):
    """Look up the cid numbers for up to the first 300 BV ids.

    Args:
        list: Sequence of BV id strings. (The name shadows the builtin; it
            is kept so existing callers keep working.)

    Returns:
        list[str]: Every cid found in the pagelist responses, as digit
        strings, in request order. Requests that fail or that do not
        return HTTP 200 contribute nothing.
    """
    temp_cid_list = []
    headers = _build_headers()
    # Slice instead of indexing range(300): inputs shorter than 300 items
    # no longer raise IndexError.
    for bvid in list[:300]:
        temp_url = f'https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp'
        try:
            respone = requests.get(
                url=temp_url, headers=headers, timeout=_REQUEST_TIMEOUT
            )
        except requests.RequestException as e:
            # Handled per request so one failure does not discard the
            # remaining videos.
            print(f"An error occurred: {e}")
            continue
        respone.encoding = 'utf-8'
        if respone.status_code == 200:
            temp_cid_list.extend(re.findall(r'"cid":(\d+)', respone.text))

    return temp_cid_list

def get_information(list):
    """Download the danmaku (bullet comments) for up to the first 300 cids.

    Args:
        list: Sequence of cid values. (The name shadows the builtin; it is
            kept so existing callers keep working.)

    Returns:
        list[str]: The danmaku texts extracted from every response, in
        request order. Requests that fail contribute nothing.
    """
    import os  # local import: keeps this function self-contained

    # Switch stdout to UTF-8 in place. Wrapping sys.stdout.buffer in a new
    # TextIOWrapper on every call leaves two wrappers sharing one buffer.
    try:
        sys.stdout.reconfigure(encoding='utf8')
    except (AttributeError, ValueError, OSError):
        # Best effort only: stdout may have been replaced by an object that
        # cannot be reconfigured.
        pass

    temp_headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
        ),
    }
    # The session cookie comes from the environment instead of the source
    # file so credentials are never committed.
    cookie = os.environ.get("BILIBILI_COOKIE")
    if cookie:
        temp_headers['Cookie'] = cookie

    temp_information_list = []
    # Slice instead of indexing range(300): inputs shorter than 300 items
    # no longer raise IndexError.
    for cid in list[:300]:
        temp_url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'
        try:
            response = requests.get(
                url=temp_url, headers=temp_headers, timeout=10
            )
            response.encoding = 'utf-8'
            # NOTE(review): assumes each danmaku arrives as
            # <d p="...">text</d> in the XML reply -- confirm against a live
            # response. The previous pattern '(.*?)' only produced empty
            # strings.
            temp_list = re.findall(r'<d p=".*?">(.*?)</d>', response.text)
            temp_information_list.extend(temp_list)
        except requests.RequestException as e:
            print(f"An error occurred: {e}")
        # Pause between requests to avoid hammering the API.
        time.sleep(0.5)
    return temp_information_list

# Pipeline: search results -> BV ids -> cids -> danmaku texts.
bvnum_list = findbv()
cid_list = getcid(bvnum_list)
information_list = get_information(cid_list)

# Raw string: in a normal literal the "\r" in this Windows path is read as a
# carriage return, which silently produced a different file name.
DANMAKU_PATH = r"E:\python1\requirement.txt"

# Append all danmaku, one per line, opening the file once rather than once
# per line.
with open(DANMAKU_PATH, "a", encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in information_list)

# Read back everything accumulated in the file (including earlier runs).
with open(DANMAKU_PATH, encoding="utf-8") as f:
    txt = f.read()

# Segment the text with jieba and join with spaces, the form
# WordCloud.generate is given below.
string = ' '.join(jieba.lcut(txt))
wc = wordcloud.WordCloud(
    width=500,
    height=500,
    background_color='white',
    scale=10,
    font_path='msyh.ttc',
)
wc.generate(string)
wc.to_file("wordcloud.png")

# Export this run's danmaku to a spreadsheet as well.
df = pandas.DataFrame(information_list)
df.to_excel('spider1.xlsx', index=False)