From 0840d3cc402e02a584927b27d2ed333ddc991e76 Mon Sep 17 00:00:00 2001
From: pbsfi9vyo <2630024077@qq.com>
Date: Sat, 14 Sep 2024 20:21:44 +0800
Subject: [PATCH] ADD file via upload

---
 crab.py | 157 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 157 insertions(+)
 create mode 100644 crab.py

diff --git a/crab.py b/crab.py
new file mode 100644
index 0000000..01034d0
--- /dev/null
+++ b/crab.py
@@ -0,0 +1,157 @@
+import requests
+import re
+from lxml import etree
+import pandas as pd
+from collections import Counter
+from wordcloud import WordCloud, STOPWORDS
+from PIL import Image
+import numpy as np
+
+
+def fetch_videos(keyword, num_videos=300):
+    search_url = f"https://search.bilibili.com/all?keyword={keyword}"
+
+    headers = {
+        # session cookie copied from a logged-in browser; replace with your own
+        "cookie": "buvid_fp_plain=undefined; DedeUserID=478046906; DedeUserID__ckMd5=e069fb2f7c7e45d8; LIVE_BUVID=AUTO5416552130626367; buvid4=A2EA20E9-E779-847F-39B4-938CD4287F1714203-022061113-BEFmx%2F6H9VrRuwt93E6aCjRHU0GpnkXpk1pE1uK5mJmVSDNUChtrag%3D%3D; CURRENT_FNVAL=4048; header_theme_version=CLOSE; enable_web_push=DISABLE; home_feed_column=5; PVID=1; rpdid=|(RYk~~RkJY0J'u~|JYRkR|R; FEED_LIVE_VERSION=V_HEADER_LIVE_NEW_POP; buvid3=4799A87F-A9DC-AA07-27BF-82F3A5F664F239018infoc; b_nut=1720075339; _uuid=668710B22-5122-4109F-5104D-74B5666E10F1837667infoc; CURRENT_QUALITY=80; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY0MDMyNzgsImlhdCI6MTcyNjE0NDAxOCwicGx0IjotMX0.DQbK-ZNXUzxZXH3Z5cZzBsvahbrDIGLhTVNtZn_q2Fs; bili_ticket_expires=1726403218; SESSDATA=8b95e9ea%2C1741745313%2C0d10f%2A91CjCH0RwJFik2-i32G8hIuFSVDWydWuXvE1N-D4N9IoB01Z1nhfZSlOu-FUStWixjMPwSVm1XLXhhVVlsLVVILTA3cW1JVHJMUWFvUzYxMktjRHNtcUJHV29CWUpCNzFkM2NESU5vRzgzM3JYSDNJelF1QkFHaklNQmpITXA5YzdZTWhYUjRMcUVnIIEC; bili_jct=2b74ad2d258f25b4bc6e4d71843bdfb0; sid=4p5183l0; bp_t_offset_478046906=976512964238508032; fingerprint=271eb250ccf16737050c17694464d5fe; b_lsid=29DAAAB7_191EA6E45AE; browser_resolution=1528-151; buvid_fp=4799A87F-A9DC-AA07-27BF-82F3A5F664F239018infoc",
+        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
+        "referer": "https://www.bilibili.com/"
+    }
+    response = requests.get(url=search_url, headers=headers)
+    # response.encoding = 'utf-8'
+    # html_data = response.text
+    # print(html_data)  # check that the response was fetched successfully
+
+    # get the page content
+    html_content = response.content
+    # parse the HTML
+    tree = etree.HTML(html_content)
+
+    result = []
+    for index in range(1, 31):
+        div_text = tree.xpath(f'//*[@id="i_cecream"]/div/div[2]/div[2]/div/div/div/div[3]/div/div[{index}]/div/div[2]/a/@href')
+        if div_text:  # guard: skip cards with no matching link (e.g. ads)
+            result.append(div_text[0])
+        # print(div_text[0])  # video link
+    # xpaths of result cards 1-3 on page 1:
+    # //*[@id="i_cecream"]/div/div[2]/div[2]/div/div/div/div[3]/div/div[1]/div/div[2]/a
+    # //*[@id="i_cecream"]/div/div[2]/div[2]/div/div/div/div[3]/div/div[2]/div/div[2]/a
+    # //*[@id="i_cecream"]/div/div[2]/div[2]/div/div/div/div[3]/div/div[3]/div/div[2]/a
+
+    page_size = 30
+    pages = num_videos // page_size
+    for page in range(1, pages):  # fetch pages 2..pages, 30 links each
+        search_url = f"https://search.bilibili.com/all?keyword={keyword}&page={page+1}&o={page*page_size}"
+        response = requests.get(url=search_url, headers=headers)
+        html_content = response.content
+        tree = etree.HTML(html_content)
+        for index in range(1, 31):
+            div_text = tree.xpath(f'//*[@id="i_cecream"]/div/div[2]/div[2]/div/div/div[1]/div[{index}]/div/div[2]/a/@href')
+            if div_text:
+                result.append(div_text[0])
+    # page 2 and later use a different DOM structure than page 1, so the xpath had to be rewritten:
#//*[@id="i_cecream"]/div/div[2]/div[2]/div/div/div[1]/div[1]/div/div[2]/a + #//*[@id="i_cecream"]/div/div[2]/div[2]/div/div/div[1]/div[2]/div/div[2]/a + #print(div_text[0]) #检测是否成功 + return result + + +def fetch_bullet(url): #正则表达式 + + headers = { + "cookie" : "buvid_fp_plain=undefined; DedeUserID=478046906; DedeUserID__ckMd5=e069fb2f7c7e45d8; LIVE_BUVID=AUTO5416552130626367; buvid4=A2EA20E9-E779-847F-39B4-938CD4287F1714203-022061113-BEFmx%2F6H9VrRuwt93E6aCjRHU0GpnkXpk1pE1uK5mJmVSDNUChtrag%3D%3D; CURRENT_FNVAL=4048; header_theme_version=CLOSE; enable_web_push=DISABLE; home_feed_column=5; PVID=1; rpdid=|(RYk~~RkJY0J'u~|JYRkR|R; FEED_LIVE_VERSION=V_HEADER_LIVE_NEW_POP; buvid3=4799A87F-A9DC-AA07-27BF-82F3A5F664F239018infoc; b_nut=1720075339; _uuid=668710B22-5122-4109F-5104D-74B5666E10F1837667infoc; CURRENT_QUALITY=80; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY0MDMyNzgsImlhdCI6MTcyNjE0NDAxOCwicGx0IjotMX0.DQbK-ZNXUzxZXH3Z5cZzBsvahbrDIGLhTVNtZn_q2Fs; bili_ticket_expires=1726403218; SESSDATA=8b95e9ea%2C1741745313%2C0d10f%2A91CjCH0RwJFik2-i32G8hIuFSVDWydWuXvE1N-D4N9IoB01Z1nhfZSlOu-FUStWixjMPwSVm1XLXhhVVlsLVVILTA3cW1JVHJMUWFvUzYxMktjRHNtcUJHV29CWUpCNzFkM2NESU5vRzgzM3JYSDNJelF1QkFHaklNQmpITXA5YzdZTWhYUjRMcUVnIIEC; bili_jct=2b74ad2d258f25b4bc6e4d71843bdfb0; sid=4p5183l0; fingerprint=271eb250ccf16737050c17694464d5fe; browser_resolution=1528-786; buvid_fp=271eb250ccf16737050c17694464d5fe; b_lsid=89A9842E_191EBAF4B62; bsource=search_bing; bp_t_offset_478046906=976675180858310656", + "user-agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0" + } + + response = requests.get(url = url, headers = headers) + # 获取网页内容 + html_content = response.content + # 解析 HTML 内容 + tree = etree.HTML(html_content) + + list_url = tree.xpath("//*[@id=\"dtl\"]/div[5]/input/@value") + #得到['https://api.bilibili.com/x/v1/dm/list.so?oid=1633670800'] + + # ['https://api.bilibili.com/x/v1/dm/list.so?oid=1630123093'] + # ['https://api.bilibili.com/x/v1/dm/list.so?oid=1633866069'] + # [] #部分通过xpath无法得到链接 改用正则表达式 + + #使用得到的弹幕链接获取弹幕 + bullet_url = None + if list_url: + bullet_url = list_url[0] + response = requests.get(url= bullet_url, headers= headers) + response.encoding = 'utf-8' + html_data = response.text + + #正则表达式获取弹幕(返回列表形式) + content_list = re.findall('(.*?)', html_data) + #列表合成字符串 + content = '\n'.join(content_list) + #print(content) #成功获取弹幕 + + #将弹幕写入文件 + with open('bullet.txt', mode= 'a', encoding='utf-8') as f: + f.write(content) #检查后没问题进入下一步 + + +def keyword(): + with open('bullet.txt', mode='r', encoding='utf-8') as f: + text = f.read() + # 预处理文本:去除标点符号,转换为小写 + text = re.sub(r'[^\w\s]', '', text).lower() + + # 计算关键词频率 + words = ['科学', '深度学习','语音识别','数据挖掘','自动驾驶','机器人','图像识别','自然语言处理','算法','模型','语义理解','建模','算法推理','智能','芯片','智能算法','机器人技术','神经网络','特征提取','自适应学习','神经网络','异常检测','智能控制','数据清洗','多模态','强化学习', '语音合成','数据融合','模型', '实时处理','人工智能应用','深度生成', '自监督', '数据分类', '自动', '迁移学习', '智能预测', '虚拟现实', '机器视觉', '算法优化', '模式识别', '语义分割', '生成对抗网络','智能识别', '智能检索', '边缘计算', '模型推理', '数据标注','神经架构搜索','语言模型','智能推荐','自动化','模式匹配','智能助理','计算机视觉','ai'] + + #计算每个词的频率 + word_counts = Counter() + for word in words: + word_counts[word] = text.count(word) + # 提取前八个关键词及其频率 + top_eight = word_counts.most_common(8) + + # 将结果导入Excel + df = pd.DataFrame(top_eight, columns=['Keyword', 'Frequency']) + df.to_excel('keyword_frequencies.xlsx', index=False) + + # print(top_eight) #[('ai', 55), ('科学', 42), ('自动', 17), ('芯片', 6), ('机器人', 5), ('算法', 
+
+
+def main():
+    videos_url = fetch_videos("巴黎奥运会", 300)
+    for url in videos_url:
+        # links come back protocol-relative, e.g. //www.bilibili.com/video/BV1Kz421i71x/;
+        # url[6:] strips the leading '//www.', and prepending 'http://www.i' rewrites
+        # the host to www.ibilibili.com, a mirror page that exposes the danmaku XML
+        # url that fetch_bullet scrapes
+        fetch_bullet('http://www.i' + url[6:])
+    keyword()
+
+
+if __name__ == "__main__":
+    main()
+    # debug note: fetch_videos returns scheme-less links
+    # (//www.bilibili.com/video/BV1Kz421i71x/ - mind the missing http:), which
+    # is why main() rebuilds each url before fetching the bullet comments
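+
+
+# NOTE: hedged sketch, not part of the original script. Fetching 300 video
+# pages back-to-back risks tripping Bilibili's rate limiting; a short delay
+# plus a try/except keeps one bad page from aborting the whole crawl.
+# fetch_all and the delay value are illustrative, not from the original code.
+def fetch_all(urls, delay=1.0):
+    import time  # deferred import; the original file does not import time
+    for url in urls:
+        try:
+            fetch_bullet('http://www.i' + url[6:])  # same mirror rewrite as main()
+        except requests.RequestException as e:
+            print(f"skipping {url}: {e}")  # log and continue on network errors
+        time.sleep(delay)  # throttle between requests to stay polite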