@@ -0,0 +1,21 @@
import pandas as pd


def top_8_ai_barrage(file_path):
    # Read the CSV file of collected barrages
    all_barrage = pd.read_csv(file_path, encoding='utf-8')

    # Keep barrages that mention AI-related keywords; the trailing space in
    # "AI " avoids matching those letters inside longer English tokens
    ai_barrage = all_barrage[all_barrage['barrage'].str.contains('AI |人工智能|科技|智能', case=False, na=False)]

    # Count how many times each distinct barrage appears
    counter = ai_barrage['barrage'].value_counts()

    # Take the 8 most frequent barrages
    top_8 = counter.head(8).reset_index()
    top_8.columns = ['弹幕', '出现次数']
    return top_8


if __name__ == '__main__':
    print(top_8_ai_barrage('barrage.csv'))
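The keyword pattern and the top-8 cutoff are hard-coded above. A minimal sketch of a parameterized variant (the helper name top_n_barrage is hypothetical, not part of this repo):

import pandas as pd

def top_n_barrage(file_path, pattern, n=8):
    # Same logic as top_8_ai_barrage, with the regex and cutoff as arguments
    all_barrage = pd.read_csv(file_path, encoding='utf-8')
    matched = all_barrage[all_barrage['barrage'].str.contains(pattern, case=False, na=False)]
    top_n = matched['barrage'].value_counts().head(n).reset_index()
    top_n.columns = ['弹幕', '出现次数']
    return top_n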
@@ -0,0 +1,44 @@
import pandas as pd
import jieba
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans


# Tokenize barrage text with jieba
def jieba_tokenizer(text):
    return jieba.lcut(text)


def cluster_analysis():
    df = pd.read_csv('barrage.csv')
    # NLTK's Chinese stopword list, extended with a few high-frequency tokens
    stopwords_list = list(stopwords.words('chinese'))
    stopwords_list.extend(['都', '不', '好', '5'])

    # Extract features with TF-IDF
    vectorizer = TfidfVectorizer(tokenizer=jieba_tokenizer, stop_words=stopwords_list)
    tfidf_matrix = vectorizer.fit_transform(df['barrage'])

    # Cluster with KMeans
    num_clusters = 10
    km = KMeans(n_clusters=num_clusters)
    km.fit(tfidf_matrix)

    # Attach the cluster labels to the original data
    df['cluster'] = km.labels_

    # Print the first few barrages of each cluster
    for i in range(num_clusters):
        print(f"cluster {i}:")
        print(df[df['cluster'] == i]['barrage'].head(10))
        print("\n")

    # Save the clustering result to a CSV file
    df.to_csv('barrage_clustered.csv', index=False)


if __name__ == '__main__':
    cluster_analysis()
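Raw example barrages can be hard to skim per cluster. A sketch that prints the highest-weighted TF-IDF terms of each centroid instead (print_top_terms is a hypothetical helper; it assumes scikit-learn >= 1.0 for get_feature_names_out):

def print_top_terms(km, vectorizer, n_terms=8):
    # cluster_centers_ rows are TF-IDF centroids; argsort descending per row
    terms = vectorizer.get_feature_names_out()
    order = km.cluster_centers_.argsort()[:, ::-1]
    for i, row in enumerate(order):
        print(f"cluster {i}: " + " ".join(terms[j] for j in row[:n_terms]))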
@@ -0,0 +1,61 @@
import os
import time
import requests
import re
import json
import pandas as pd


os.environ['NO_PROXY'] = 'bilibili.com'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'
COOKIE = "buvid3=5CD968B6-5E6F-AA6C-D8BB-422744C1DB0054109infoc; b_nut=1673696354; _uuid=1092729F9-10FE3-DB8C-64A2-F27261E3B43153165infoc; buvid4=82DFB562-AF5C-F20C-AC96-E71C089E97E355884-023011419-hBTxrxbr8pWyUVDiIX7ZVw%3D%3D; CURRENT_FNVAL=4048; rpdid=|(u)luk|llRR0J'uY~RkuJ|Ju; buvid_fp_plain=undefined; i-wanna-go-back=-1; nostalgia_conf=-1; b_ut=5; header_theme_version=CLOSE; LIVE_BUVID=AUTO6216768194613696; home_feed_column=4; CURRENT_PID=8e025d90-cb08-11ed-8e36-15f3cf3099af; CURRENT_QUALITY=80; browser_resolution=1392-786; FEED_LIVE_VERSION=V_SEO_FIRST_CARD; fingerprint=efbe80e589d57838b8ff20cb5df98e9d; buvid_fp=a1198a46c9f71d42436ace10a9ab7448; bili_jct=416c2ad96c39091affdb9092e9a593d9; DedeUserID=414937912; DedeUserID__ckMd5=a09ace8891a7091a; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE2OTQwODgxODUsImlhdCI6MTY5MzgyODk4NSwicGx0IjotMX0.pEBW75b3VX6p2lqx7jPuUpPvrAHz0QIoHtagLMp3_iU; bili_ticket_expires=1694088185; bp_video_offset_1480857975=837478778313637938; bp_video_offset_414937912=838225299482083363; sid=6udo3o89; PVID=3; b_lsid=B11E5210A_18A6ABF8D38"


def get_bvid(page, pos):
    # Search for "2024巴黎奥运会" and return the bvid of the pos-th video
    # on the given results page (the API pages are 1-based)
    url = 'https://api.bilibili.com/x/web-interface/search/all/v2?page=' + str(page + 1) + '&keyword=2024巴黎奥运会'
    headers = {
        'User-Agent': USER_AGENT,
        'cookie': COOKIE,
    }
    response = requests.get(url=url, headers=headers, verify=False).text
    json_dict = json.loads(response)
    # result[11] is assumed to be the "video" block of the mixed search results
    return json_dict["data"]["result"][11]["data"][pos]['bvid']


def get_cid(bvid):
    # Resolve a video's bvid to the cid of its first part
    url = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp"
    headers = {
        'User-Agent': USER_AGENT,
        'cookie': COOKIE,
    }
    response = requests.get(url=url, headers=headers, verify=False).text
    json_dict = json.loads(response)
    cid = json_dict['data'][0]['cid']
    print(cid)
    return cid


def get_barrage(cid):
    # Download the barrage XML for a cid and append its text to barrage.csv
    url = 'https://api.bilibili.com/x/v1/dm/list.so?oid=' + str(cid)
    headers = {
        "User-Agent": USER_AGENT
    }
    response = requests.get(url, headers=headers)
    html_doc = response.content.decode('utf-8')
    # Regular expression that captures the text of each <d>...</d> element
    res = re.compile('<d.*?>(.*?)</d>')
    # Extract the barrage text from the response
    barrage = re.findall(res, html_doc)
    df = pd.DataFrame(barrage, columns=['barrage'])
    if not os.path.isfile('barrage.csv'):
        df.to_csv('barrage.csv', mode='w', index=False, encoding='utf-8-sig')
    else:
        df.to_csv('barrage.csv', mode='a', index=False, header=False, encoding='utf-8-sig')


if __name__ == '__main__':
    for i in range(15):
        for j in range(20):
            get_barrage(get_cid(get_bvid(i, j)))
            time.sleep(1)
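get_bvid assumes the "video" block always sits at result[11] of the mixed search response, which breaks whenever the result ordering changes. A defensive lookup, assuming each block carries a result_type field as observed in this endpoint's responses (find_video_bvid is a hypothetical helper):

def find_video_bvid(json_dict, pos):
    # Scan the mixed result blocks instead of relying on a fixed index
    for block in json_dict["data"]["result"]:
        if block.get("result_type") == "video":
            return block["data"][pos]["bvid"]
    raise KeyError("no video block in search response")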
@@ -0,0 +1,68 @@
import os
from concurrent.futures import ThreadPoolExecutor

import requests
import re
import json
import pandas as pd


os.environ['NO_PROXY'] = 'bilibili.com'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'
COOKIE = "buvid3=5CD968B6-5E6F-AA6C-D8BB-422744C1DB0054109infoc; b_nut=1673696354; _uuid=1092729F9-10FE3-DB8C-64A2-F27261E3B43153165infoc; buvid4=82DFB562-AF5C-F20C-AC96-E71C089E97E355884-023011419-hBTxrxbr8pWyUVDiIX7ZVw%3D%3D; CURRENT_FNVAL=4048; rpdid=|(u)luk|llRR0J'uY~RkuJ|Ju; buvid_fp_plain=undefined; i-wanna-go-back=-1; nostalgia_conf=-1; b_ut=5; header_theme_version=CLOSE; LIVE_BUVID=AUTO6216768194613696; home_feed_column=4; CURRENT_PID=8e025d90-cb08-11ed-8e36-15f3cf3099af; CURRENT_QUALITY=80; browser_resolution=1392-786; FEED_LIVE_VERSION=V_SEO_FIRST_CARD; fingerprint=efbe80e589d57838b8ff20cb5df98e9d; buvid_fp=a1198a46c9f71d42436ace10a9ab7448; bili_jct=416c2ad96c39091affdb9092e9a593d9; DedeUserID=414937912; DedeUserID__ckMd5=a09ace8891a7091a; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE2OTQwODgxODUsImlhdCI6MTY5MzgyODk4NSwicGx0IjotMX0.pEBW75b3VX6p2lqx7jPuUpPvrAHz0QIoHtagLMp3_iU; bili_ticket_expires=1694088185; bp_video_offset_1480857975=837478778313637938; bp_video_offset_414937912=838225299482083363; sid=6udo3o89; PVID=3; b_lsid=B11E5210A_18A6ABF8D38"


def get_bvid(page, pos):
    # Search for "2024巴黎奥运会" and return the bvid of the pos-th video
    # on the given results page (the API pages are 1-based)
    url = 'https://api.bilibili.com/x/web-interface/search/all/v2?page=' + str(page + 1) + '&keyword=2024巴黎奥运会'
    headers = {
        'User-Agent': USER_AGENT,
        'cookie': COOKIE,
    }
    response = requests.get(url=url, headers=headers, verify=False).text
    json_dict = json.loads(response)
    # result[11] is assumed to be the "video" block of the mixed search results
    return json_dict["data"]["result"][11]["data"][pos]['bvid']


def get_cid(bvid):
    # Resolve a video's bvid to the cid of its first part
    url = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp"
    headers = {
        'User-Agent': USER_AGENT,
        'cookie': COOKIE,
    }
    response = requests.get(url=url, headers=headers, verify=False).text
    json_dict = json.loads(response)
    cid = json_dict['data'][0]['cid']
    print(cid)
    return cid


def get_barrage(cid):
    # Same extraction as the sequential crawler, but appending to 2.csv.
    # Note: worker threads append to this file without synchronization,
    # so rows from different videos may interleave.
    url = 'https://api.bilibili.com/x/v1/dm/list.so?oid=' + str(cid)
    headers = {
        "User-Agent": USER_AGENT
    }
    response = requests.get(url, headers=headers)
    html_doc = response.content.decode('utf-8')
    # Regular expression that captures the text of each <d>...</d> element
    res = re.compile('<d.*?>(.*?)</d>')
    # Extract the barrage text from the response
    barrage = re.findall(res, html_doc)
    df = pd.DataFrame(barrage, columns=['barrage'])
    if not os.path.isfile('2.csv'):
        df.to_csv('2.csv', mode='w', index=False, encoding='utf-8-sig')
    else:
        df.to_csv('2.csv', mode='a', index=False, header=False, encoding='utf-8-sig')


def process_page(page, pos):
    bvid = get_bvid(page, pos)
    cid = get_cid(bvid)
    get_barrage(cid)


if __name__ == '__main__':
    with ThreadPoolExecutor(max_workers=10) as executor:
        for i in range(15):
            for j in range(20):
                executor.submit(process_page, i, j)
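Because all ten workers append to 2.csv through get_barrage, the writes can interleave. A sketch that serializes just the file write with a lock (process_page_safe is a hypothetical wrapper around the functions defined above):

import threading

csv_lock = threading.Lock()

def process_page_safe(page, pos):
    bvid = get_bvid(page, pos)
    cid = get_cid(bvid)
    with csv_lock:
        # Only the CSV append needs to be exclusive
        get_barrage(cid)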
@@ -0,0 +1,20 @@
import pandas as pd
from snownlp import SnowNLP


def sentiment_analysis(text):
    # SnowNLP returns a positivity score in [0, 1]
    s = SnowNLP(text)
    return s.sentiments


def do_sentiment_analysis(filename):
    df = pd.read_csv(filename)
    # Run sentiment analysis on every barrage
    df['sentiment'] = df['barrage'].apply(sentiment_analysis)
    print(df.head())
    # Save the sentiment scores to a CSV file
    df.to_csv('barrage_sentiment.csv', index=False)


if __name__ == '__main__':
    do_sentiment_analysis('barrage.csv')
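SnowNLP's sentiments value is a positivity probability, so the saved scores can be bucketed directly. A follow-up sketch using the conventional 0.5 threshold (the threshold is an assumption, not tuned for this corpus):

import pandas as pd

df = pd.read_csv('barrage_sentiment.csv')
# Label each barrage and print the positive/negative share
df['label'] = df['sentiment'].apply(lambda s: 'positive' if s >= 0.5 else 'negative')
print(df['label'].value_counts(normalize=True))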
@@ -0,0 +1,36 @@
import jieba
import nltk
import numpy as np
import PIL.Image as image
from nltk.corpus import stopwords
from wordcloud import WordCloud


def get_wordcloud(file_name):
    # Make sure the NLTK stopword corpus is available
    nltk.download('stopwords')
    with open(file_name, encoding='utf-8') as f:
        txt = f.read()
    # Segment the Chinese text and join with spaces for WordCloud
    txt_list = jieba.lcut(txt)
    string = ' '.join(txt_list)
    # Use 2.png as the shape mask of the cloud
    mask_image = "2.png"
    mask = np.array(image.open(mask_image))
    # NLTK's Chinese stopword list, extended with a few high-frequency tokens
    stopwords_list = set(stopwords.words('chinese'))
    stopwords_list.update(['都', '不', '好', '哈哈哈', '说', '还', '很', '没'])
    w = WordCloud(
        mask=mask,
        width=mask.shape[1],
        height=mask.shape[0],
        background_color='white',
        font_path='C:/Windows/Fonts/STLITI.TTF',  # a font that can render Chinese
        stopwords=stopwords_list,
    )

    w.generate(string)
    # Save the word cloud image
    w.to_file('wordcloud.jpg')


if __name__ == '__main__':
    get_wordcloud('barrage.csv')
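get_wordcloud reads barrage.csv as raw text, so the CSV header and any quoting also flow into jieba. A sketch that segments only the barrage column instead (assumes the barrage.csv layout produced by the crawler above):

import jieba
import pandas as pd

df = pd.read_csv('barrage.csv')
# Join the column values, then segment the combined text
string = ' '.join(jieba.lcut(' '.join(df['barrage'].astype(str))))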