diff --git a/crwal.py b/crwal.py
new file mode 100644
index 0000000..1f1deb4
--- /dev/null
+++ b/crwal.py
@@ -0,0 +1,158 @@
+import requests
+import time
+from bs4 import BeautifulSoup
+from collections import Counter
+import openpyxl
+import matplotlib.pyplot as plt
+from wordcloud import WordCloud
+import jieba
+import copy
+import re
+
+# Request headers: the Cookie carries the logged-in Bilibili session; a real
+# browser user-agent is kept below as a commented-out alternative.
+header = {
+    'Cookie': "buvid4=45060204-E173-B80C-04C7-11B8D43E872D75556-022101721-m4T90WVXeagV7lXz92ZWuw%3D%3D; DedeUserID=3461571130952367; DedeUserID__ckMd5=b3ea366f0ed87c94; CURRENT_FNVAL=4048; enable_web_push=DISABLE; _uuid=FFC241066-10489-E63A-8A4B-6C217510EACC571042infoc; buvid3=E0F932DB-25F3-EF2A-2629-B756E279809355623infoc; b_nut=1700799855; header_theme_version=CLOSE; rpdid=|(u))kkYu|JJ0J'u~|R~)~|Yu; PVID=1; FEED_LIVE_VERSION=V8; buvid_fp_plain=undefined; CURRENT_QUALITY=0; fingerprint=450dce408658a7f873061345351176c7; buvid_fp=77344ae6cddf7b1d32d0c69bd9a38f80; b_lsid=4D7FCA4C_191DC589C5D; SESSDATA=8840b176%2C1741530686%2C6d032%2A92CjC46834ghJkHd4YjzNOtLiSAgKyx6hzATr7eWbatDJxB8pR708adiIvgr2OqCWEFkASVmluRUljMDZPQWlEMUx2MlVhQkJoanRvR1kxNlZ2M2hjS0hPekFrWGlNZFc2ZWNiWmdrQ0tyaWlkODFyNUp2WDdqc3FSbjUzUEhjNlFBc0U0UGR4a3d3IIEC; bili_jct=80e99b3d5026f86143f072c7e055fed8; sid=gbw623md; bp_t_offset_3461571130952367=975560108564021248; home_feed_column=4; browser_resolution=895-822",
+    'user-agent': '12345678zxdcvfbnmdfcsdxaghdnhguiasgck'
+    # "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0
+    # Safari/537.36 Edg/128.0.0.0"
+}
+
+
+# Collect up to 300 unique video aids (with titles) from the search API:
+def get_video_aid_list(keyword):
+    aid_set = set()
+    aid_list = []
+    page = 1
+    while len(aid_set) < 300:
+        url = f"https://api.bilibili.com/x/web-interface/search/all/v2?keyword={keyword}&page={page}"
+        response = requests.get(url, headers=header)
+        data = response.json()
+
+        for item in data['data']['result']:
+            if item['result_type'] == 'video':
+                for video in item['data']:
+                    if video['aid'] not in aid_set:
+                        aid_list.append({
+                            'title': video['title'],
+                            'aid': video['aid']
+                        })
+                        aid_set.add(video['aid'])
+        page += 1
+        time.sleep(0.15)
+    return aid_list
+
+
+# Look up the cid for a given aid via the view API.
+def get_cid_by_aid(aid):
+    url = f"https://api.bilibili.com/x/web-interface/view?aid={aid}"
+    response = requests.get(url, headers=header)
+    # print(response.text)
+    data = response.json()
+    return data['data']['cid']
+
+
+# Download the danmaku XML for a cid and return the comment texts.
+def get_danmaku(cid):
+    url = f"https://comment.bilibili.com/{cid}.xml"
+    response = requests.get(url, headers=header)
+    # print(response.text)
+    soup = BeautifulSoup(response.content, "xml")
+    danmakus = soup.find_all('d')
+    return [d.text for d in danmakus]
+
+
+# Crawl the danmaku of every search result, save them to danmakus.xlsx and return the full list.
+def crawl_danmaku_data(keyword):
+    videos = get_video_aid_list(keyword)
+    all_danmakus = []
+    for video in videos:
+        aid = video['aid']
+        title = video['title']
+        # print(aid)
+        cid = get_cid_by_aid(aid)
+        print(f"Crawling danmaku: {title} (AID: {aid}, CID: {cid})")
+        try:
+            danmakus = get_danmaku(cid)
+            all_danmakus.extend(danmakus)
+            # time.sleep(1)
+        except Exception as e:
+            print(f"Error while fetching danmaku for video {title}: {e}")
+
+    wb = openpyxl.Workbook()
+    sheet = wb.active
+    sheet.title = "Danmakus"
+    sheet.append(["Danmaku content"])  # header row
+
+    for item in all_danmakus:
+        sheet.append([item])
+
+    wb.save('danmakus.xlsx')
+    # print(all_danmakus)
+    # all_danmakus = all_danmakus[:-1]
+
+    return all_danmakus
+
+
+# AI-related keywords (Chinese terms: artificial intelligence, cloud technology,
+# machine learning, deep learning, neural network, big data).
+ai_keywords = ["AI", "人工智能", "云技术", "机器学习", "深度学习", "神经网络", "大数据"]
+
+
+def contains_pattern(text):
+    text1 = text.lower()
+
+    # pattern = r'|'.join(re.escape(keyword) for keyword in ai_keywords)
+    pattern = r'(?
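
Note on the keyword filter: contains_pattern lowercases each danmaku and builds a single regular expression from ai_keywords so that every comment is scanned only once; the patch is cut off before the full pattern is visible. As a rough, hedged illustration only (the names contains_ai_keyword and _keyword_re below are hypothetical and not part of this patch), a case-insensitive alternation over the same keyword list could look like this:

    import re

    ai_keywords = ["AI", "人工智能", "云技术", "机器学习", "深度学习", "神经网络", "大数据"]

    # Sketch, not the pattern used in crwal.py: escape each keyword and join
    # them into one alternation, matched case-insensitively.
    _keyword_re = re.compile("|".join(re.escape(k) for k in ai_keywords), re.IGNORECASE)

    def contains_ai_keyword(text):
        # True if the danmaku mentions any AI-related keyword.
        return _keyword_re.search(text) is not None

    # Example usage against the crawler's output:
    # ai_danmakus = [d for d in all_danmakus if contains_ai_keyword(d)]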