diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..287a2f0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,162 @@ +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + diff --git a/image2black.py b/image2black.py deleted file mode 100644 index 6c232cd..0000000 --- a/image2black.py +++ /dev/null @@ -1,20 +0,0 @@ -from PIL import Image -import numpy as np - -# 打开图像文件 -img = Image.open("D:\\PYcharm\\pycharm projects\\Life\\blibliCrawler\\image\\CM24B[{2($140KKTP2PC898_tmb.jpg").convert('RGB') - -# 将图像转换为NumPy数组 -data = np.array(img) - -# 创建一个掩码,标记非白色像素 -non_white_mask = (data != [255, 255, 255]).any(axis=-1) - -# 将非白色像素设置为黑色 -data[non_white_mask] = [0, 0, 0] - -# 将NumPy数组转换回图像 -new_img = Image.fromarray(data) - -# 保存处理后的图像 -new_img.save('output.jpg') diff --git a/main.py b/main.py deleted file mode 100644 index e821602..0000000 --- a/main.py +++ /dev/null @@ -1,122 +0,0 @@ -import requests -from bs4 import BeautifulSoup -import pandas as pd -from wordcloud import WordCloud -import matplotlib.pyplot as plt -import time -def get_video_ids(keyword, max_results): - videos = [] - for page in range(1, max_results // 30 + 1): - url = f'https://api.bilibili.com/x/web-interface/search/all/v2?keyword={keyword}&page={page}' - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0', - 'Referer': 'https://www.bilibili.com/', - 'Origin': 'https://www.bilibili.com' , - 'Accept': 'application/json, text/plain, */*', - 'Accept-Encoding': 'gzip, deflate, br', - 'Accept-Language': 'en-US,en;q=0.9', - 'Cache-Control': 'no-cache', - 'Connection': 'keep-alive', - 'X-Requested-With': 'XMLHttpRequest', - 'DNT': '1', - 'Cookie': 'buvid3=1771013D-4B9A-7EDC-B731-F26861DA6BC572364infoc; b_nut=1724924272; _uuid=105A710892-CDD2-D428-F7CE-710974C815A9C68509infoc; buvid_fp=af977995f09c5e137184ce7245ff4b24; buvid4=63859769-66CC-9A8C-877A-2EC1DBADC52673285-024082909-9aqwhTn0maBTqGMv%2F5bUdg%3D%3D; enable_web_push=DISABLE; home_feed_column=5; CURRENT_FNVAL=4048; rpdid=|(u)luk)))YJ0J\'u~kRmkRJJu; header_theme_version=CLOSE; DedeUserID=296374397; DedeUserID__ckMd5=3846201b32bf5925; hit-dyn-v2=1; CURRENT_QUALITY=80; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU5OTY1NTMsImlhdCI6MTcyNTczNzI5MywicGx0IjotMX0.4i0IcahQaom8775sGxa60ssJOD6sYJc-Emr8Xdj4HUg; bili_ticket_expires=1725996493; SESSDATA=2f5df052%2C1741427356%2C1b6ed%2A92CjBmK7vjV-9ckxGj4QviTYeR5dgiOECi1hHuDSCiIQv073dV72Ce14Z7zYRW84pn4aASVkJwdzV0MHBKTjhoUWJ0QktZbFJ1UHk5bmh2SVp3ZkkwckNtSHhfcWpCS2pLbmZ2bXpCSFM5Q19NdU9keTRYTkpFbEFrTkhIT3hrMFVOM0xKOExhLWVnIIEC; bili_jct=e1932adcb4a2c3db0c92f1492968fa3d; sid=6i0bipdx; b_lsid=5E10A61046_191DB2DBF63; browser_resolution=1488-740; bp_t_offset_296374397=975480406855909376' - } - response = requests.get(url, headers=headers) - data = response.json() - if 'data' in data and 'result' in data['data']: - for item in data['data']['result']: - if item['result_type'] == 'video': - for video in item['data']: - videos.append(video['bvid']) - with open('videos_ids/巴黎奥运会.txt', 'a') as file: # Append mode - file.write('\n'.join(videos) + '\n') - else: - print("No data found for page:", page) - print("Response:", data) - time.sleep(1) # 减缓请求频率 - return videos[:max_results] - - -# 获取单个视频的弹幕的函数 -def get_danmaku(bvid): - # 构建CID URL并请求 - cid_url = f'https://api.bilibili.com/x/player/pagelist?bvid={bvid}' - cid_response = requests.get(cid_url) - - # 检查CID请求是否成功 - if cid_response.status_code != 200: - print(f"Failed to fetch CID: HTTP {cid_response.status_code}") - print("Response text:", cid_response.text) - return [] - - try: - cid_data = cid_response.json() - cid = cid_data['data'][0]['cid'] - except (KeyError, IndexError, ValueError) as e: - print(f"Error parsing CID data: {str(e)}") - print("Response content:", cid_response.text) - return [] - - # 使用获取到的CID构建弹幕URL并请求 - danmaku_url = f'https://comment.bilibili.com/{cid}.xml' - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0', - # 'Referer': f'https://www.bilibili.com/video/{bvid}', - # 'Origin': 'https://www.bilibili.com', - 'Cookie': '_uuid=7F7A6379-BA108-194D-92B2-3CAAE8E2BE2D02587infoc; fingerprint=b8971f556d2eeb006b40801c8e3efb84; buvid3=68EE987D-FCC0-A766-AED9-1BD27FBD32E603566infoc; b_nut=1725969003; buvid4=73420F38-E432-9A45-CFD8-4F517839FF4C03566-024091011-kei221M088sCtEsqrKkUZw%3D%3D; buvid_fp_plain=undefined; b_lsid=4463B3FC_191DC332B4B; buvid_fp=b8971f556d2eeb006b40801c8e3efb84; header_theme_version=CLOSE; enable_web_push=DISABLE; home_feed_column=5; browser_resolution=1536-678; CURRENT_FNVAL=4048; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjYyMzY1ODQsImlhdCI6MTcyNTk3NzMyNCwicGx0IjotMX0.fX6hQTkCSTUT8EbvFbuVCPxJzEZb7nGSXFpgkz6aD4U; bili_ticket_expires=1726236524; sid=g5u2ivp4' } - session = requests.Session() - session.headers.update(headers) - danmaku_response = session.get(danmaku_url, headers=headers) - danmaku_response.encoding = 'utf-8' - - # 解析弹幕数据 - try: - soup = BeautifulSoup(danmaku_response.content, 'lxml') - danmakus = [d.text for d in soup.find_all('d')] - except Exception as e: - print(f"Error parsing danmakus: {str(e)}") - return [] - - # 保存弹幕到文件 - with open(f'{bvid}_danmakus.txt', 'w', encoding='utf-8') as file: - file.write('\n'.join(danmakus)) - - # 返回弹幕数据 - return danmakus - - - -# 主函数 -def main(): - keyword = '2024巴黎奥运会' - video_ids = get_video_ids(keyword, 300) - all_danmakus = [] - if not video_ids: - print("No video ids retrieved; check the network and API responses.") - return - - for bvid in video_ids: - danmakus = get_danmaku(bvid) - all_danmakus.extend(danmakus) - - if not all_danmakus: - print("No danmakus retrieved; unable to generate word cloud.") - return - - # 数据分析 - danmaku_df = pd.DataFrame(all_danmakus, columns=['danmaku']) - top_danmakus = danmaku_df['danmaku'].value_counts().head(8) - print("Top 8 Danmakus:") - print(top_danmakus) - - # 生成词云 - wordcloud = WordCloud(font_path="C:\\Windows\\Fonts\\msyh.ttc", width=800, height=400).generate( - ' '.join(all_danmakus)) - plt.figure(figsize=(10, 5)) - plt.imshow(wordcloud, interpolation='bilinear') - plt.axis('off') - plt.show() - - -if __name__ == "__main__": - main() diff --git a/main2.py b/main2.py deleted file mode 100644 index 0d4419c..0000000 --- a/main2.py +++ /dev/null @@ -1,94 +0,0 @@ -import requests -from selenium import webdriver -from selenium.webdriver.chrome.options import Options -from selenium.webdriver.common.by import By -from bs4 import BeautifulSoup -import time -import pandas as pd -from wordcloud import WordCloud -import matplotlib.pyplot as plt -import logging - -# 设置日志配置 -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') - - -def get_video_ids_with_selenium(keyword, max_results): - options = Options() - options.add_argument("--headless") # 无头模式 - options.add_argument("--disable-gpu") - driver = webdriver.Chrome(options=options) - - videos = [] - url = f"https://search.bilibili.com/all?keyword={keyword}" - driver.get(url) - time.sleep(3) # 等待页面加载 - - try: - video_elements = driver.find_elements(By.CSS_SELECTOR, 'a.video-item') - for element in video_elements: - video_link = element.get_attribute('href') - video_id = video_link.split('/')[-1] - videos.append(video_id) - if len(videos) >= max_results: - break - except Exception as e: - logging.error(f"Error finding video elements: {str(e)}") - finally: - driver.quit() - - return videos - - -def get_danmaku(bvid): - try: - cid_url = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp" - cid_response = requests.get(cid_url) - cid = cid_response.json()['data'][0]['cid'] - - danmaku_url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}" - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36' - } - danmaku_response = requests.get(danmaku_url, headers=headers) - soup = BeautifulSoup(danmaku_response.content, 'lxml') - danmakus = [d.text for d in soup.find_all('d')] - except Exception as e: - logging.error(f"Failed to get danmakus for BVID {bvid}: {str(e)}") - return [] - - return danmakus - - -def main(): - keyword = '2024巴黎奥运会' - logging.info("Starting to fetch video IDs") - video_ids = get_video_ids_with_selenium(keyword, 300) - logging.info(f"Retrieved {len(video_ids)} video IDs") - - all_danmakus = [] - for index, bvid in enumerate(video_ids): - logging.info(f"Processing video {index + 1}/{len(video_ids)}: {bvid}") - danmakus = get_danmaku(bvid) - all_danmakus.extend(danmakus) - - if not all_danmakus: - logging.warning("No danmakus retrieved; unable to generate word cloud.") - return - - logging.info(f"Collected a total of {len(all_danmakus)} danmakus") - danmaku_df = pd.DataFrame(all_danmakus, columns=['danmaku']) - top_danmakus = danmaku_df['danmaku'].value_counts().head(8) - logging.info("Top 8 Danmakus:") - logging.info("\n" + str(top_danmakus)) - - wordcloud = WordCloud(font_path="C:\\Windows\\Fonts\\msyh.ttc", width=800, height=400).generate( - ' '.join(all_danmakus)) - plt.figure(figsize=(10, 5)) - plt.imshow(wordcloud, interpolation='bilinear') - plt.axis('off') - plt.show() - - -if __name__ == "__main__": - main() diff --git a/main3.py b/main3.py deleted file mode 100644 index fd5a88c..0000000 --- a/main3.py +++ /dev/null @@ -1,25 +0,0 @@ -from wordcloud import WordCloud -import matplotlib.pyplot as plt - -# 原始的词频字典 -# 定义字典 -danmaku_frequency = { - '5': 824, - '?': 535, - '0': 190, - '哈哈哈哈哈': 124, - '哈哈哈哈': 122, - '哈哈哈': 121, - '哈哈哈哈哈哈': 111, - '哈哈哈哈哈哈哈': 103 -} - -# 获取所有键 -keys = danmaku_frequency.keys() - -# 将键列表转换为字符串,每个键之间用空格分隔 -resulting_string = ' '.join(keys) - -# 打印结果 -print(resulting_string) - diff --git a/test.py b/test.py deleted file mode 100644 index 4ed5d24..0000000 --- a/test.py +++ /dev/null @@ -1,5 +0,0 @@ -import re - -text = "hello 12345 world! 哈哈" -matches = re.findall(r"\w+", text) -print(matches) # 查看输出,确认是否包括预期的数字和字母 diff --git a/top_danmakus_2024巴黎奥运会.xlsx b/top_danmakus_2024巴黎奥运会.xlsx deleted file mode 100644 index 288ab14..0000000 Binary files a/top_danmakus_2024巴黎奥运会.xlsx and /dev/null differ diff --git a/豆瓣电影Top250.xls b/豆瓣电影Top250.xls deleted file mode 100644 index 5d430bc..0000000 Binary files a/豆瓣电影Top250.xls and /dev/null differ