yjh 5 months ago
parent 7c68b73365
commit 78771220b3

.gitignore vendored (+162 lines)

@@ -0,0 +1,162 @@
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

@@ -1,20 +0,0 @@
from PIL import Image
import numpy as np
# Open the image file
img = Image.open("D:\\PYcharm\\pycharm projects\\Life\\blibliCrawler\\image\\CM24B[{2($140KKTP2PC898_tmb.jpg").convert('RGB')
# Convert the image to a NumPy array
data = np.array(img)
# Build a mask marking the non-white pixels
non_white_mask = (data != [255, 255, 255]).any(axis=-1)
# Set every non-white pixel to black
data[non_white_mask] = [0, 0, 0]
# Convert the NumPy array back to an image
new_img = Image.fromarray(data)
# Save the processed image
new_img.save('output.jpg')
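A note on the script above: every pixel that is not exactly (255, 255, 255) is blackened and the result is re-saved as JPEG, so compression noise around edges tends to survive. Below is a minimal sketch of a more tolerant variant; the threshold value 240 and the input/output file names are assumptions for illustration and do not appear in the original script.

from PIL import Image
import numpy as np

# Illustrative sketch, not part of the commit: threshold (240) and file names are assumptions.
img = Image.open("input.jpg").convert('RGB')
data = np.array(img)
near_white = (data >= 240).all(axis=-1)   # treat near-white pixels as background
data[~near_white] = [0, 0, 0]             # everything else becomes black
data[near_white] = [255, 255, 255]        # snap the background to pure white
Image.fromarray(data).save('output.png')  # PNG keeps the result strictly two-tone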

@@ -1,122 +0,0 @@
import requests
from bs4 import BeautifulSoup
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import time
def get_video_ids(keyword, max_results):
    videos = []
    for page in range(1, max_results // 30 + 1):
        url = f'https://api.bilibili.com/x/web-interface/search/all/v2?keyword={keyword}&page={page}'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
            'Referer': 'https://www.bilibili.com/',
            'Origin': 'https://www.bilibili.com',
            'Accept': 'application/json, text/plain, */*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'X-Requested-With': 'XMLHttpRequest',
            'DNT': '1',
'Cookie': 'buvid3=1771013D-4B9A-7EDC-B731-F26861DA6BC572364infoc; b_nut=1724924272; _uuid=105A710892-CDD2-D428-F7CE-710974C815A9C68509infoc; buvid_fp=af977995f09c5e137184ce7245ff4b24; buvid4=63859769-66CC-9A8C-877A-2EC1DBADC52673285-024082909-9aqwhTn0maBTqGMv%2F5bUdg%3D%3D; enable_web_push=DISABLE; home_feed_column=5; CURRENT_FNVAL=4048; rpdid=|(u)luk)))YJ0J\'u~kRmkRJJu; header_theme_version=CLOSE; DedeUserID=296374397; DedeUserID__ckMd5=3846201b32bf5925; hit-dyn-v2=1; CURRENT_QUALITY=80; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU5OTY1NTMsImlhdCI6MTcyNTczNzI5MywicGx0IjotMX0.4i0IcahQaom8775sGxa60ssJOD6sYJc-Emr8Xdj4HUg; bili_ticket_expires=1725996493; SESSDATA=2f5df052%2C1741427356%2C1b6ed%2A92CjBmK7vjV-9ckxGj4QviTYeR5dgiOECi1hHuDSCiIQv073dV72Ce14Z7zYRW84pn4aASVkJwdzV0MHBKTjhoUWJ0QktZbFJ1UHk5bmh2SVp3ZkkwckNtSHhfcWpCS2pLbmZ2bXpCSFM5Q19NdU9keTRYTkpFbEFrTkhIT3hrMFVOM0xKOExhLWVnIIEC; bili_jct=e1932adcb4a2c3db0c92f1492968fa3d; sid=6i0bipdx; b_lsid=5E10A61046_191DB2DBF63; browser_resolution=1488-740; bp_t_offset_296374397=975480406855909376'
        }
        response = requests.get(url, headers=headers)
        data = response.json()
        if 'data' in data and 'result' in data['data']:
            for item in data['data']['result']:
                if item['result_type'] == 'video':
                    for video in item['data']:
                        videos.append(video['bvid'])
            with open('videos_ids/巴黎奥运会.txt', 'a') as file:  # Append mode
                file.write('\n'.join(videos) + '\n')
        else:
            print("No data found for page:", page)
            print("Response:", data)
        time.sleep(1)  # Throttle the request rate
    return videos[:max_results]


# Fetch the danmaku (on-screen comments) of a single video
def get_danmaku(bvid):
    # Build the CID (page list) URL and request it
    cid_url = f'https://api.bilibili.com/x/player/pagelist?bvid={bvid}'
    cid_response = requests.get(cid_url)
    # Check whether the CID request succeeded
    if cid_response.status_code != 200:
        print(f"Failed to fetch CID: HTTP {cid_response.status_code}")
        print("Response text:", cid_response.text)
        return []
    try:
        cid_data = cid_response.json()
        cid = cid_data['data'][0]['cid']
    except (KeyError, IndexError, ValueError) as e:
        print(f"Error parsing CID data: {str(e)}")
        print("Response content:", cid_response.text)
        return []
    # Build the danmaku URL from the retrieved CID and request it
    danmaku_url = f'https://comment.bilibili.com/{cid}.xml'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
        # 'Referer': f'https://www.bilibili.com/video/{bvid}',
        # 'Origin': 'https://www.bilibili.com',
'Cookie': '_uuid=7F7A6379-BA108-194D-92B2-3CAAE8E2BE2D02587infoc; fingerprint=b8971f556d2eeb006b40801c8e3efb84; buvid3=68EE987D-FCC0-A766-AED9-1BD27FBD32E603566infoc; b_nut=1725969003; buvid4=73420F38-E432-9A45-CFD8-4F517839FF4C03566-024091011-kei221M088sCtEsqrKkUZw%3D%3D; buvid_fp_plain=undefined; b_lsid=4463B3FC_191DC332B4B; buvid_fp=b8971f556d2eeb006b40801c8e3efb84; header_theme_version=CLOSE; enable_web_push=DISABLE; home_feed_column=5; browser_resolution=1536-678; CURRENT_FNVAL=4048; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjYyMzY1ODQsImlhdCI6MTcyNTk3NzMyNCwicGx0IjotMX0.fX6hQTkCSTUT8EbvFbuVCPxJzEZb7nGSXFpgkz6aD4U; bili_ticket_expires=1726236524; sid=g5u2ivp4' }
    session = requests.Session()
    session.headers.update(headers)
    danmaku_response = session.get(danmaku_url, headers=headers)
    danmaku_response.encoding = 'utf-8'
    # Parse the danmaku XML
    try:
        soup = BeautifulSoup(danmaku_response.content, 'lxml')
        danmakus = [d.text for d in soup.find_all('d')]
    except Exception as e:
        print(f"Error parsing danmakus: {str(e)}")
        return []
    # Save the danmaku to a file
    with open(f'{bvid}_danmakus.txt', 'w', encoding='utf-8') as file:
        file.write('\n'.join(danmakus))
    # Return the danmaku list
    return danmakus


# Main entry point
def main():
    keyword = '2024巴黎奥运会'
    video_ids = get_video_ids(keyword, 300)
    all_danmakus = []
    if not video_ids:
        print("No video ids retrieved; check the network and API responses.")
        return
    for bvid in video_ids:
        danmakus = get_danmaku(bvid)
        all_danmakus.extend(danmakus)
    if not all_danmakus:
        print("No danmakus retrieved; unable to generate word cloud.")
        return
    # Simple analysis: rank the most frequent danmaku
    danmaku_df = pd.DataFrame(all_danmakus, columns=['danmaku'])
    top_danmakus = danmaku_df['danmaku'].value_counts().head(8)
    print("Top 8 Danmakus:")
    print(top_danmakus)
    # Generate the word cloud
    wordcloud = WordCloud(font_path="C:\\Windows\\Fonts\\msyh.ttc", width=800, height=400).generate(
        ' '.join(all_danmakus))
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()


if __name__ == "__main__":
    main()
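A side note on the deleted main() above: pandas is imported only for the value_counts ranking. A minimal standard-library alternative is sketched below; it is purely illustrative (not what the script did) and assumes the same all_danmakus list of strings collected by the crawler.

from collections import Counter

# Illustrative alternative to the pandas value_counts step above.
danmaku_counts = Counter(all_danmakus)
for text, count in danmaku_counts.most_common(8):
    print(f"{count:>6}  {text}")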

@@ -1,94 +0,0 @@
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def get_video_ids_with_selenium(keyword, max_results):
    options = Options()
    options.add_argument("--headless")  # Headless mode
    options.add_argument("--disable-gpu")
    driver = webdriver.Chrome(options=options)
    videos = []
    url = f"https://search.bilibili.com/all?keyword={keyword}"
    driver.get(url)
    time.sleep(3)  # Wait for the page to load
    try:
        video_elements = driver.find_elements(By.CSS_SELECTOR, 'a.video-item')
        for element in video_elements:
            video_link = element.get_attribute('href')
            video_id = video_link.split('/')[-1]
            videos.append(video_id)
            if len(videos) >= max_results:
                break
    except Exception as e:
        logging.error(f"Error finding video elements: {str(e)}")
    finally:
        driver.quit()
    return videos


def get_danmaku(bvid):
    try:
        cid_url = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp"
        cid_response = requests.get(cid_url)
        cid = cid_response.json()['data'][0]['cid']
        danmaku_url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
        }
        danmaku_response = requests.get(danmaku_url, headers=headers)
        soup = BeautifulSoup(danmaku_response.content, 'lxml')
        danmakus = [d.text for d in soup.find_all('d')]
    except Exception as e:
        logging.error(f"Failed to get danmakus for BVID {bvid}: {str(e)}")
        return []
    return danmakus


def main():
    keyword = '2024巴黎奥运会'
    logging.info("Starting to fetch video IDs")
    video_ids = get_video_ids_with_selenium(keyword, 300)
    logging.info(f"Retrieved {len(video_ids)} video IDs")
    all_danmakus = []
    for index, bvid in enumerate(video_ids):
        logging.info(f"Processing video {index + 1}/{len(video_ids)}: {bvid}")
        danmakus = get_danmaku(bvid)
        all_danmakus.extend(danmakus)
    if not all_danmakus:
        logging.warning("No danmakus retrieved; unable to generate word cloud.")
        return
    logging.info(f"Collected a total of {len(all_danmakus)} danmakus")
    danmaku_df = pd.DataFrame(all_danmakus, columns=['danmaku'])
    top_danmakus = danmaku_df['danmaku'].value_counts().head(8)
    logging.info("Top 8 Danmakus:")
    logging.info("\n" + str(top_danmakus))
    wordcloud = WordCloud(font_path="C:\\Windows\\Fonts\\msyh.ttc", width=800, height=400).generate(
        ' '.join(all_danmakus))
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()


if __name__ == "__main__":
    main()
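One design note on get_video_ids_with_selenium above: the fixed time.sleep(3) either wastes time or fires before slow pages finish loading. A sketch of an explicit wait is shown below, assuming the same 'a.video-item' selector; the helper name and the 10-second timeout are assumptions for illustration.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_search_results(driver, timeout=10):
    # Blocks until at least one result link is present, or raises TimeoutException.
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'a.video-item'))
    )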

@@ -1,25 +0,0 @@
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Original word-frequency dictionary
danmaku_frequency = {
    '5': 824,
    '': 535,
    '0': 190,
    '哈哈哈哈哈': 124,
    '哈哈哈哈': 122,
    '哈哈哈': 121,
    '哈哈哈哈哈哈': 111,
    '哈哈哈哈哈哈哈': 103
}
# Collect all keys
keys = danmaku_frequency.keys()
# Join the keys into a single space-separated string
resulting_string = ' '.join(keys)
# Print the result
print(resulting_string)
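The script above imports WordCloud and matplotlib but only prints the space-joined keys, which discards the counts. A short sketch of feeding the counts in directly follows; it reuses the danmaku_frequency dict defined above, drops the empty-string key, and borrows the font path used elsewhere in this commit (the filtering step is an assumption about intent).

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Sketch only: build the cloud from the frequencies themselves rather than joined keys.
filtered = {k: v for k, v in danmaku_frequency.items() if k.strip()}  # drop the empty key
wc = WordCloud(font_path="C:\\Windows\\Fonts\\msyh.ttc", width=800, height=400)
wc.generate_from_frequencies(filtered)
plt.figure(figsize=(10, 5))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()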

@@ -1,5 +0,0 @@
import re
text = "hello 12345 world! 哈哈"
matches = re.findall(r"\w+", text)
print(matches)  # Inspect the output to confirm it contains the expected digits and letters
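For reference on the snippet above: in Python 3, \w is Unicode-aware by default, so the match list also contains '哈哈'. If ASCII-only tokens were intended, the re.ASCII flag restricts \w, as in the illustrative variation below (not part of the original file).

import re

text = "hello 12345 world! 哈哈"
print(re.findall(r"\w+", text))            # ['hello', '12345', 'world', '哈哈']
print(re.findall(r"\w+", text, re.ASCII))  # ['hello', '12345', 'world']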

Binary file not shown.