yjh 5 months ago
parent 7c68b73365
commit 78771220b3

.gitignore vendored (+162 lines)

@@ -0,0 +1,162 @@
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

@@ -1,20 +0,0 @@
from PIL import Image
import numpy as np
# Open the image file
img = Image.open("D:\\PYcharm\\pycharm projects\\Life\\blibliCrawler\\image\\CM24B[{2($140KKTP2PC898_tmb.jpg").convert('RGB')
# Convert the image to a NumPy array
data = np.array(img)
# Build a mask marking the non-white pixels
non_white_mask = (data != [255, 255, 255]).any(axis=-1)
# Set every non-white pixel to black
data[non_white_mask] = [0, 0, 0]
# Convert the NumPy array back to an image
new_img = Image.fromarray(data)
# Save the processed image
new_img.save('output.jpg')
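A note on the script above: every pixel that is not exactly (255, 255, 255) is blackened and the result is re-saved as JPEG, so compression noise around edges tends to survive. Below is a minimal sketch of a more tolerant variant; the threshold value 240 and the input/output file names are assumptions for illustration and do not appear in the original script.

from PIL import Image
import numpy as np

# Illustrative sketch, not part of the commit: threshold (240) and file names are assumptions.
img = Image.open("input.jpg").convert('RGB')
data = np.array(img)
near_white = (data >= 240).all(axis=-1)   # treat near-white pixels as background
data[~near_white] = [0, 0, 0]             # everything else becomes black
data[near_white] = [255, 255, 255]        # snap the background to pure white
Image.fromarray(data).save('output.png')  # PNG keeps the result strictly two-tone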

@@ -1,122 +0,0 @@
import requests
from bs4 import BeautifulSoup
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import time
def get_video_ids(keyword, max_results):
    videos = []
    for page in range(1, max_results // 30 + 1):
        url = f'https://api.bilibili.com/x/web-interface/search/all/v2?keyword={keyword}&page={page}'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
            'Referer': 'https://www.bilibili.com/',
            'Origin': 'https://www.bilibili.com',
            'Accept': 'application/json, text/plain, */*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'X-Requested-With': 'XMLHttpRequest',
            'DNT': '1',
'Cookie': 'buvid3=1771013D-4B9A-7EDC-B731-F26861DA6BC572364infoc; b_nut=1724924272; _uuid=105A710892-CDD2-D428-F7CE-710974C815A9C68509infoc; buvid_fp=af977995f09c5e137184ce7245ff4b24; buvid4=63859769-66CC-9A8C-877A-2EC1DBADC52673285-024082909-9aqwhTn0maBTqGMv%2F5bUdg%3D%3D; enable_web_push=DISABLE; home_feed_column=5; CURRENT_FNVAL=4048; rpdid=|(u)luk)))YJ0J\'u~kRmkRJJu; header_theme_version=CLOSE; DedeUserID=296374397; DedeUserID__ckMd5=3846201b32bf5925; hit-dyn-v2=1; CURRENT_QUALITY=80; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU5OTY1NTMsImlhdCI6MTcyNTczNzI5MywicGx0IjotMX0.4i0IcahQaom8775sGxa60ssJOD6sYJc-Emr8Xdj4HUg; bili_ticket_expires=1725996493; SESSDATA=2f5df052%2C1741427356%2C1b6ed%2A92CjBmK7vjV-9ckxGj4QviTYeR5dgiOECi1hHuDSCiIQv073dV72Ce14Z7zYRW84pn4aASVkJwdzV0MHBKTjhoUWJ0QktZbFJ1UHk5bmh2SVp3ZkkwckNtSHhfcWpCS2pLbmZ2bXpCSFM5Q19NdU9keTRYTkpFbEFrTkhIT3hrMFVOM0xKOExhLWVnIIEC; bili_jct=e1932adcb4a2c3db0c92f1492968fa3d; sid=6i0bipdx; b_lsid=5E10A61046_191DB2DBF63; browser_resolution=1488-740; bp_t_offset_296374397=975480406855909376'
        }
        response = requests.get(url, headers=headers)
        data = response.json()
        if 'data' in data and 'result' in data['data']:
            for item in data['data']['result']:
                if item['result_type'] == 'video':
                    for video in item['data']:
                        videos.append(video['bvid'])
            with open('videos_ids/巴黎奥运会.txt', 'a') as file:  # Append mode
                file.write('\n'.join(videos) + '\n')
        else:
            print("No data found for page:", page)
            print("Response:", data)
        time.sleep(1)  # Throttle the request rate
    return videos[:max_results]


# Fetch the danmaku (on-screen comments) of a single video
def get_danmaku(bvid):
    # Build the CID (page list) URL and request it
    cid_url = f'https://api.bilibili.com/x/player/pagelist?bvid={bvid}'
    cid_response = requests.get(cid_url)
    # Check whether the CID request succeeded
    if cid_response.status_code != 200:
        print(f"Failed to fetch CID: HTTP {cid_response.status_code}")
        print("Response text:", cid_response.text)
        return []
    try:
        cid_data = cid_response.json()
        cid = cid_data['data'][0]['cid']
    except (KeyError, IndexError, ValueError) as e:
        print(f"Error parsing CID data: {str(e)}")
        print("Response content:", cid_response.text)
        return []
    # Build the danmaku URL from the retrieved CID and request it
    danmaku_url = f'https://comment.bilibili.com/{cid}.xml'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
        # 'Referer': f'https://www.bilibili.com/video/{bvid}',
        # 'Origin': 'https://www.bilibili.com',
'Cookie': '_uuid=7F7A6379-BA108-194D-92B2-3CAAE8E2BE2D02587infoc; fingerprint=b8971f556d2eeb006b40801c8e3efb84; buvid3=68EE987D-FCC0-A766-AED9-1BD27FBD32E603566infoc; b_nut=1725969003; buvid4=73420F38-E432-9A45-CFD8-4F517839FF4C03566-024091011-kei221M088sCtEsqrKkUZw%3D%3D; buvid_fp_plain=undefined; b_lsid=4463B3FC_191DC332B4B; buvid_fp=b8971f556d2eeb006b40801c8e3efb84; header_theme_version=CLOSE; enable_web_push=DISABLE; home_feed_column=5; browser_resolution=1536-678; CURRENT_FNVAL=4048; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjYyMzY1ODQsImlhdCI6MTcyNTk3NzMyNCwicGx0IjotMX0.fX6hQTkCSTUT8EbvFbuVCPxJzEZb7nGSXFpgkz6aD4U; bili_ticket_expires=1726236524; sid=g5u2ivp4' }
    session = requests.Session()
    session.headers.update(headers)
    danmaku_response = session.get(danmaku_url, headers=headers)
    danmaku_response.encoding = 'utf-8'
    # Parse the danmaku XML
    try:
        soup = BeautifulSoup(danmaku_response.content, 'lxml')
        danmakus = [d.text for d in soup.find_all('d')]
    except Exception as e:
        print(f"Error parsing danmakus: {str(e)}")
        return []
    # Save the danmaku to a file
    with open(f'{bvid}_danmakus.txt', 'w', encoding='utf-8') as file:
        file.write('\n'.join(danmakus))
    # Return the danmaku list
    return danmakus


# Main entry point
def main():
    keyword = '2024巴黎奥运会'
    video_ids = get_video_ids(keyword, 300)
    all_danmakus = []
    if not video_ids:
        print("No video ids retrieved; check the network and API responses.")
        return
    for bvid in video_ids:
        danmakus = get_danmaku(bvid)
        all_danmakus.extend(danmakus)
    if not all_danmakus:
        print("No danmakus retrieved; unable to generate word cloud.")
        return
    # Simple analysis: rank the most frequent danmaku
    danmaku_df = pd.DataFrame(all_danmakus, columns=['danmaku'])
    top_danmakus = danmaku_df['danmaku'].value_counts().head(8)
    print("Top 8 Danmakus:")
    print(top_danmakus)
    # Generate the word cloud
    wordcloud = WordCloud(font_path="C:\\Windows\\Fonts\\msyh.ttc", width=800, height=400).generate(
        ' '.join(all_danmakus))
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()


if __name__ == "__main__":
    main()
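A side note on the deleted main() above: pandas is imported only for the value_counts ranking. A minimal standard-library alternative is sketched below; it is purely illustrative (not what the script did) and assumes the same all_danmakus list of strings collected by the crawler.

from collections import Counter

# Illustrative alternative to the pandas value_counts step above.
danmaku_counts = Counter(all_danmakus)
for text, count in danmaku_counts.most_common(8):
    print(f"{count:>6}  {text}")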

@@ -1,94 +0,0 @@
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def get_video_ids_with_selenium(keyword, max_results):
    options = Options()
    options.add_argument("--headless")  # Headless mode
    options.add_argument("--disable-gpu")
    driver = webdriver.Chrome(options=options)
    videos = []
    url = f"https://search.bilibili.com/all?keyword={keyword}"
    driver.get(url)
    time.sleep(3)  # Wait for the page to load
    try:
        video_elements = driver.find_elements(By.CSS_SELECTOR, 'a.video-item')
        for element in video_elements:
            video_link = element.get_attribute('href')
            video_id = video_link.split('/')[-1]
            videos.append(video_id)
            if len(videos) >= max_results:
                break
    except Exception as e:
        logging.error(f"Error finding video elements: {str(e)}")
    finally:
        driver.quit()
    return videos


def get_danmaku(bvid):
    try:
        cid_url = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp"
        cid_response = requests.get(cid_url)
        cid = cid_response.json()['data'][0]['cid']
        danmaku_url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
        }
        danmaku_response = requests.get(danmaku_url, headers=headers)
        soup = BeautifulSoup(danmaku_response.content, 'lxml')
        danmakus = [d.text for d in soup.find_all('d')]
    except Exception as e:
        logging.error(f"Failed to get danmakus for BVID {bvid}: {str(e)}")
        return []
    return danmakus


def main():
    keyword = '2024巴黎奥运会'
    logging.info("Starting to fetch video IDs")
    video_ids = get_video_ids_with_selenium(keyword, 300)
    logging.info(f"Retrieved {len(video_ids)} video IDs")
    all_danmakus = []
    for index, bvid in enumerate(video_ids):
        logging.info(f"Processing video {index + 1}/{len(video_ids)}: {bvid}")
        danmakus = get_danmaku(bvid)
        all_danmakus.extend(danmakus)
    if not all_danmakus:
        logging.warning("No danmakus retrieved; unable to generate word cloud.")
        return
    logging.info(f"Collected a total of {len(all_danmakus)} danmakus")
    danmaku_df = pd.DataFrame(all_danmakus, columns=['danmaku'])
    top_danmakus = danmaku_df['danmaku'].value_counts().head(8)
    logging.info("Top 8 Danmakus:")
    logging.info("\n" + str(top_danmakus))
    wordcloud = WordCloud(font_path="C:\\Windows\\Fonts\\msyh.ttc", width=800, height=400).generate(
        ' '.join(all_danmakus))
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()


if __name__ == "__main__":
    main()
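One design note on get_video_ids_with_selenium above: the fixed time.sleep(3) either wastes time or fires before slow pages finish loading. A sketch of an explicit wait is shown below, assuming the same 'a.video-item' selector; the helper name and the 10-second timeout are assumptions for illustration.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_search_results(driver, timeout=10):
    # Blocks until at least one result link is present, or raises TimeoutException.
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'a.video-item'))
    )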

@@ -1,25 +0,0 @@
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Original word-frequency dictionary
danmaku_frequency = {
    '5': 824,
    '': 535,
    '0': 190,
    '哈哈哈哈哈': 124,
    '哈哈哈哈': 122,
    '哈哈哈': 121,
    '哈哈哈哈哈哈': 111,
    '哈哈哈哈哈哈哈': 103
}
# Collect all keys
keys = danmaku_frequency.keys()
# Join the keys into a single space-separated string
resulting_string = ' '.join(keys)
# Print the result
print(resulting_string)
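The script above imports WordCloud and matplotlib but only prints the space-joined keys, which discards the counts. A short sketch of feeding the counts in directly follows; it reuses the danmaku_frequency dict defined above, drops the empty-string key, and borrows the font path used elsewhere in this commit (the filtering step is an assumption about intent).

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Sketch only: build the cloud from the frequencies themselves rather than joined keys.
filtered = {k: v for k, v in danmaku_frequency.items() if k.strip()}  # drop the empty key
wc = WordCloud(font_path="C:\\Windows\\Fonts\\msyh.ttc", width=800, height=400)
wc.generate_from_frequencies(filtered)
plt.figure(figsize=(10, 5))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()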

@@ -1,5 +0,0 @@
import re
text = "hello 12345 world! 哈哈"
matches = re.findall(r"\w+", text)
print(matches)  # Inspect the output to confirm it contains the expected digits and letters
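For reference on the snippet above: in Python 3, \w is Unicode-aware by default, so the match list also contains '哈哈'. If ASCII-only tokens were intended, the re.ASCII flag restricts \w, as in the illustrative variation below (not part of the original file).

import re

text = "hello 12345 world! 哈哈"
print(re.findall(r"\w+", text))            # ['hello', '12345', 'world', '哈哈']
print(re.findall(r"\w+", text, re.ASCII))  # ['hello', '12345', 'world']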

Binary file not shown.