ADD file via upload

2 months ago · d1c266b777
parent 7e11ca6978
commit d1c266b777
1 changed files with 62 additions and 0 deletions
--- a/提取300个视频的弹幕并以文本形式存入.py
+++ b/提取300个视频的弹幕并以文本形式存入.py
@ -0,0 +1,62 @@
 import requests
 from bs4 import BeautifulSoup
 import re
 import jieba
 import wordcloud
 from collections import Counter
 import imageio
 from openpyxl import Workbook,load_workbook
 from PIL import Image
 import numpy as np
 from pyinstrument import Profiler
 # Start a pyinstrument profiler so the whole run is measured; the matching
 # stop()/print() calls are the last two statements of the script.
 t=Profiler()
 t.start()
 # HTTP request headers used for requests to bilibili.
 # The browser string must sit under the "User-Agent" key: a bare string
 # mixed with key/value pairs is not a valid dict literal (SyntaxError).
 headers = {
     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 SLBrowser/9.0.3.5211 SLBChan/105",
     # The Referer must be a well-formed URL (no spaces inside it).
     "Referer": "https://www.bilibili.com/",
     "Origin": "https://search.bilibili.com",
 }
 # Page number and result offset used to walk through the search results.
 # NOTE(review): bilibili search pages are assumed to be numbered from 1 and
 # the offset step of 36 is kept from the original code -- confirm both.
 page = 1
 o = 0
 # Collected BV ids of the videos found (may still contain duplicates).
 list_ = []
 # The URL previously had no "{}" placeholders, so str.format() silently
 # ignored its arguments and the same first page was fetched ten times.
 search_url = (
     'https://search.bilibili.com/all?vt=94833807&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=3'
     '&page={}&o={}'
 )
 # Fetch ten result pages and pull every BV id out of the returned HTML.
 while page <= 10:
     url = search_url.format(page, o)
     response = requests.get(url=url, headers=headers)
     text1 = response.text
     # A BV id is matched as "BV" followed by any ten characters.
     html = re.findall(r'(BV.{10})', text1)
     list_.extend(html)
     page = page + 1
     o = o + 36
 # Drop duplicate BV ids while keeping first-seen order. dict.fromkeys does
 # this in a single pass instead of calling list.count() for every element,
 # which was quadratic in the number of ids.
 ll = list(dict.fromkeys(list_))
 # Download the danmaku (bullet comments) of the first 300 videos and append
 # them, one per line, to 弹幕.txt. all_danmu collects every danmaku text.
 all_danmu = []
 # Open the output file once instead of re-opening it for every line.
 with open('弹幕.txt', 'a', newline='', encoding='utf-8-sig') as file:
     for j in ll[:300]:
         link = "https://www.bilibili.com/video/{}/".format(j)
         response = requests.get(url=link, headers=headers)
         response.encoding = 'utf-8'
         html1 = response.text
         # The cid found in the video page is used to build the danmaku URL.
         cid_match = re.search(r'"cid":(\d*),', html1)
         if cid_match is None or not cid_match.group(1):
             # Without a cid there is no danmaku URL; skip this video rather
             # than crash on None.group(1) and lose the rest of the run.
             print("cid not found, skipping {}".format(j))
             continue
         cid = cid_match.group(1)
         link = "https://comment.bilibili.com/{}.xml".format(cid)
         res = requests.get(link)
         res.encoding = 'utf-8'
         soup2 = BeautifulSoup(res.text, 'xml')
         # Each <d> element of the XML document holds one danmaku.
         all_barrage = soup2.find_all("d")
         # Iterate the danmaku just fetched. The loop used to run over
         # all_danmu, which was always empty, so nothing was ever written.
         for danmu in all_barrage:
             # get_text() returns "" for an empty tag, where .string would
             # be None and make file.write() raise TypeError.
             danmu_text = danmu.get_text()
             all_danmu.append(danmu_text)
             file.write(danmu_text)
             file.write("\n")
 # Stop the profiler started at the top of the script and print its report.
 t.stop()
 t.print()