Crawler-related code

main
cxy 2 months ago
parent 1923632812
commit 5809c0a5f6

Binary file not shown (new image, 212 KiB).

@@ -0,0 +1,21 @@
import pandas as pd

def top_8_ai_barrage(file_path):
    # read the barrage CSV file
    all_barrage = pd.read_csv(file_path, encoding='utf-8')
    # keep barrages that mention an AI-related keyword ("AI", 人工智能, 科技, 智能)
    ai_barrage = all_barrage[all_barrage['barrage'].str.contains('AI|人工智能|科技|智能', case=False, na=False)]
    # count how often each distinct barrage appears
    counter = ai_barrage['barrage'].value_counts()
    # take the 8 most frequent ones
    top_8 = counter.head(8).reset_index()
    top_8.columns = ['弹幕', '出现次数']
    return top_8

if __name__ == '__main__':
    print(top_8_ai_barrage('barrage.csv'))
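A quick sanity check for top_8_ai_barrage, assuming it runs in the same module; the sample file name and rows below are hypothetical, made up to match the expected one-column schema:

import pandas as pd

# build a tiny file with the expected 'barrage' column (hypothetical sample data)
sample = pd.DataFrame({'barrage': ['AI解说真准', '人工智能加速训练', '人工智能加速训练', '加油']})
sample.to_csv('sample_barrage.csv', index=False, encoding='utf-8')

# 人工智能加速训练 should be counted twice, AI解说真准 once, and 加油 filtered out
print(top_8_ai_barrage('sample_barrage.csv'))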

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,44 @@
import pandas as pd
import jieba
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# tokenize a barrage with jieba so TF-IDF works on Chinese text
def jieba_tokenizer(text):
    return jieba.lcut(text)

def cluster_analysis():
    df = pd.read_csv('barrage.csv')
    stopwords_list = list(stopwords.words('chinese'))
    # extra tokens to ignore on top of the NLTK Chinese stopwords
    stopwords_list.extend(['', '', '', '5'])
    # extract TF-IDF features from the barrage text
    vectorizer = TfidfVectorizer(tokenizer=jieba_tokenizer, stop_words=stopwords_list)
    tfidf_matrix = vectorizer.fit_transform(df['barrage'])
    # cluster the barrages with KMeans
    num_clusters = 10
    km = KMeans(n_clusters=num_clusters)
    km.fit(tfidf_matrix)
    # attach the cluster labels to the original data
    df['cluster'] = km.labels_
    # print the first few barrages of each cluster
    for i in range(num_clusters):
        print(f"cluster {i}:")
        print(df[df['cluster'] == i]['barrage'].head(10))
        print("\n")
    # save the clustered result to a CSV file
    df.to_csv('barrage_clustered.csv', index=False)

if __name__ == '__main__':
    cluster_analysis()
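The num_clusters = 10 above is hard-coded. A hedged sketch for choosing k with sklearn's silhouette score; the candidate range 2 to 14 and the fixed seed are assumptions, not from the original code:

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def pick_num_clusters(tfidf_matrix, k_range=range(2, 15)):
    # try each candidate k and keep the one with the best silhouette score
    best_k, best_score = 2, -1.0
    for k in k_range:
        labels = KMeans(n_clusters=k, random_state=42).fit_predict(tfidf_matrix)
        score = silhouette_score(tfidf_matrix, labels)
        if score > best_score:
            best_k, best_score = k, score
    return best_k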

@@ -0,0 +1,61 @@
import os
import time
import requests
import re
import json
import pandas as pd
os.environ['NO_PROXY'] = 'bilibili.com'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'
COOKIE = "buvid3=5CD968B6-5E6F-AA6C-D8BB-422744C1DB0054109infoc; b_nut=1673696354; _uuid=1092729F9-10FE3-DB8C-64A2-F27261E3B43153165infoc; buvid4=82DFB562-AF5C-F20C-AC96-E71C089E97E355884-023011419-hBTxrxbr8pWyUVDiIX7ZVw%3D%3D; CURRENT_FNVAL=4048; rpdid=|(u)luk|llRR0J'uY~RkuJ|Ju; buvid_fp_plain=undefined; i-wanna-go-back=-1; nostalgia_conf=-1; b_ut=5; header_theme_version=CLOSE; LIVE_BUVID=AUTO6216768194613696; home_feed_column=4; CURRENT_PID=8e025d90-cb08-11ed-8e36-15f3cf3099af; CURRENT_QUALITY=80; browser_resolution=1392-786; FEED_LIVE_VERSION=V_SEO_FIRST_CARD; fingerprint=efbe80e589d57838b8ff20cb5df98e9d; buvid_fp=a1198a46c9f71d42436ace10a9ab7448; bili_jct=416c2ad96c39091affdb9092e9a593d9; DedeUserID=414937912; DedeUserID__ckMd5=a09ace8891a7091a; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE2OTQwODgxODUsImlhdCI6MTY5MzgyODk4NSwicGx0IjotMX0.pEBW75b3VX6p2lqx7jPuUpPvrAHz0QIoHtagLMp3_iU; bili_ticket_expires=1694088185; bp_video_offset_1480857975=837478778313637938; bp_video_offset_414937912=838225299482083363; sid=6udo3o89; PVID=3; b_lsid=B11E5210A_18A6ABF8D38"

def get_bvid(page, pos):
    url = 'https://api.bilibili.com/x/web-interface/search/all/v2?page=' + str(page + 1) + '&keyword=2024巴黎奥运会'
    headers = {
        'User-Agent': USER_AGENT,
        'cookie': COOKIE,
    }
    response = requests.get(url=url, headers=headers, verify=False).text
    print(response)
    json_dict = json.loads(response)
    # index 11 of "result" holds the video results in this search response; take the pos-th video's bvid
    return json_dict["data"]["result"][11]["data"][pos]['bvid']

def get_cid(bvid):
    # the pagelist endpoint maps a bvid to the cid of its first page
    url = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp"
    headers = {
        'User-Agent': USER_AGENT,
        'cookie': COOKIE,
    }
    response = requests.get(url=url, headers=headers, verify=False).text
    page_list = json.loads(response)
    cid = page_list['data'][0]['cid']
    print(cid)
    return cid

def get_barrage(cid):
    url = 'https://api.bilibili.com/x/v1/dm/list.so?oid=' + str(cid)
    headers = {
        "User-Agent": USER_AGENT
    }
    response = requests.get(url, headers=headers)
    html_doc = response.content.decode('utf-8')
    # regex pattern: each <d> element in the danmaku XML wraps one barrage
    res = re.compile(r'<d.*?>(.*?)</d>')
    barrage = re.findall(res, html_doc)
    df = pd.DataFrame(barrage, columns=['barrage'])
    # write the header only on first creation, then append
    if not os.path.isfile('barrage.csv'):
        df.to_csv('barrage.csv', mode='w', index=False, encoding='utf-8-sig')
    else:
        df.to_csv('barrage.csv', mode='a', index=False, header=False, encoding='utf-8-sig')

if __name__ == '__main__':
    # crawl 15 search pages with 20 videos each, pausing between requests
    for i in range(15):
        for j in range(20):
            get_barrage(get_cid(get_bvid(i, j)))
            time.sleep(1)
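get_barrage pulls the <d> elements out with a regex; since list.so returns well-formed XML, a parser is less brittle against attribute changes. A sketch with the standard library (parse_barrage_xml is an addition, assuming the endpoint keeps returning the XML format above); it could replace the re.findall call as parse_barrage_xml(html_doc):

import xml.etree.ElementTree as ET

def parse_barrage_xml(xml_text):
    # each <d> element's text is one barrage comment
    root = ET.fromstring(xml_text)
    return [d.text for d in root.iter('d') if d.text]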

@@ -0,0 +1,68 @@
import os
import time
from concurrent.futures import ThreadPoolExecutor
import requests
import re
import json
import pandas as pd
os.environ['NO_PROXY'] = 'bilibili.com'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'
COOKIE = "buvid3=5CD968B6-5E6F-AA6C-D8BB-422744C1DB0054109infoc; b_nut=1673696354; _uuid=1092729F9-10FE3-DB8C-64A2-F27261E3B43153165infoc; buvid4=82DFB562-AF5C-F20C-AC96-E71C089E97E355884-023011419-hBTxrxbr8pWyUVDiIX7ZVw%3D%3D; CURRENT_FNVAL=4048; rpdid=|(u)luk|llRR0J'uY~RkuJ|Ju; buvid_fp_plain=undefined; i-wanna-go-back=-1; nostalgia_conf=-1; b_ut=5; header_theme_version=CLOSE; LIVE_BUVID=AUTO6216768194613696; home_feed_column=4; CURRENT_PID=8e025d90-cb08-11ed-8e36-15f3cf3099af; CURRENT_QUALITY=80; browser_resolution=1392-786; FEED_LIVE_VERSION=V_SEO_FIRST_CARD; fingerprint=efbe80e589d57838b8ff20cb5df98e9d; buvid_fp=a1198a46c9f71d42436ace10a9ab7448; bili_jct=416c2ad96c39091affdb9092e9a593d9; DedeUserID=414937912; DedeUserID__ckMd5=a09ace8891a7091a; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE2OTQwODgxODUsImlhdCI6MTY5MzgyODk4NSwicGx0IjotMX0.pEBW75b3VX6p2lqx7jPuUpPvrAHz0QIoHtagLMp3_iU; bili_ticket_expires=1694088185; bp_video_offset_1480857975=837478778313637938; bp_video_offset_414937912=838225299482083363; sid=6udo3o89; PVID=3; b_lsid=B11E5210A_18A6ABF8D38"

def get_bvid(page, pos):
    url = 'https://api.bilibili.com/x/web-interface/search/all/v2?page=' + str(page + 1) + '&keyword=2024巴黎奥运会'
    headers = {
        'User-Agent': USER_AGENT,
        'cookie': COOKIE,
    }
    response = requests.get(url=url, headers=headers, verify=False).text
    print(response)
    json_dict = json.loads(response)
    # index 11 of "result" holds the video results in this search response; take the pos-th video's bvid
    return json_dict["data"]["result"][11]["data"][pos]['bvid']

def get_cid(bvid):
    # the pagelist endpoint maps a bvid to the cid of its first page
    url = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp"
    headers = {
        'User-Agent': USER_AGENT,
        'cookie': COOKIE,
    }
    response = requests.get(url=url, headers=headers, verify=False).text
    page_list = json.loads(response)
    cid = page_list['data'][0]['cid']
    print(cid)
    return cid

def get_barrage(cid):
    url = 'https://api.bilibili.com/x/v1/dm/list.so?oid=' + str(cid)
    headers = {
        "User-Agent": USER_AGENT
    }
    response = requests.get(url, headers=headers)
    html_doc = response.content.decode('utf-8')
    # regex pattern: each <d> element in the danmaku XML wraps one barrage
    res = re.compile(r'<d.*?>(.*?)</d>')
    barrage = re.findall(res, html_doc)
    df = pd.DataFrame(barrage, columns=['barrage'])
    # write the header only on first creation, then append
    if not os.path.isfile('2.csv'):
        df.to_csv('2.csv', mode='w', index=False, encoding='utf-8-sig')
    else:
        df.to_csv('2.csv', mode='a', index=False, header=False, encoding='utf-8-sig')

def process_page(page, pos):
    # fetch one video's barrages: search result -> bvid -> cid -> danmaku
    bvid = get_bvid(page, pos)
    cid = get_cid(bvid)
    get_barrage(cid)

if __name__ == '__main__':
    # note: the workers all append to 2.csv concurrently, so writes can interleave (see the lock sketch below)
    with ThreadPoolExecutor(max_workers=10) as executor:
        for i in range(15):
            for j in range(20):
                executor.submit(process_page, i, j)
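The workers above all append to 2.csv at once, so rows from different videos can interleave mid-write. A minimal fix is to serialize the writes with a module-level lock (csv_lock and save_rows are additions, not in the original):

import threading

csv_lock = threading.Lock()

def save_rows(df):
    # only one thread may touch the CSV at a time
    with csv_lock:
        write_header = not os.path.isfile('2.csv')
        df.to_csv('2.csv', mode='a', index=False, header=write_header, encoding='utf-8-sig')

get_barrage would then call save_rows(df) instead of writing the file itself.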

@@ -0,0 +1,20 @@
import pandas as pd
from snownlp import SnowNLP

# SnowNLP returns the probability in [0, 1] that the text is positive
def sentiment_analysis(text):
    s = SnowNLP(text)
    return s.sentiments

def do_sentiment_analysis(filename):
    df = pd.read_csv(filename)
    # score every barrage
    df['sentiment'] = df['barrage'].apply(sentiment_analysis)
    print(df.head())
    # save the scored result to a CSV file
    df.to_csv('barrage_sentiment.csv', index=False)

if __name__ == '__main__':
    do_sentiment_analysis('barrage.csv')
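SnowNLP's sentiments is the probability in [0, 1] that a text is positive. A follow-up sketch that buckets the scores into labels; the 0.4/0.6 thresholds and summarize_sentiment are assumptions, not from the original:

def summarize_sentiment(df):
    # bucket each score into negative / neutral / positive
    labels = pd.cut(df['sentiment'], bins=[0.0, 0.4, 0.6, 1.0],
                    labels=['negative', 'neutral', 'positive'], include_lowest=True)
    return labels.value_counts()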

@@ -0,0 +1,36 @@
import jieba
import nltk
import numpy as np
from PIL import Image
from nltk.corpus import stopwords
from wordcloud import WordCloud

def get_wordcloud(file_name):
    nltk.download('stopwords')
    with open(file_name, encoding='utf-8') as f:
        txt = f.read()
    # tokenize with jieba and space-join so WordCloud can split the tokens
    txt_list = jieba.lcut(txt)
    string = ' '.join(txt_list)
    # use 2.png as the shape mask for the cloud
    mask_image = "2.png"
    mask = np.array(Image.open(mask_image))
    stopwords_list = set(stopwords.words('chinese'))
    # extra tokens to ignore on top of the NLTK Chinese stopwords
    stopwords_target = ['哈哈哈', '']
    for i in stopwords_target:
        stopwords_list.add(i)
    w = WordCloud(
        mask=mask,
        width=mask.shape[1],
        height=mask.shape[0],
        background_color='white',
        font_path='C:/Windows/Fonts/STLITI.TTF',
        stopwords=stopwords_list,
    )
    w.generate(string)
    # save the word cloud image to a file
    w.to_file('wordcloud.jpg')

if __name__ == '__main__':
    get_wordcloud('barrage.csv')
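The space-join above exists because WordCloud's generate() splits text on whitespace. An alternative is to count the jieba tokens directly and use generate_from_frequencies; build_frequencies and its single-character filter are additions, not in the original:

from collections import Counter

def build_frequencies(txt, stopwords_list):
    # count jieba tokens, skipping stopwords and single characters
    tokens = [t for t in jieba.lcut(txt) if t not in stopwords_list and len(t) > 1]
    return Counter(tokens)

# usage inside get_wordcloud, replacing w.generate(string):
# w.generate_from_frequencies(build_frequencies(txt, stopwords_list))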

@@ -0,0 +1,31 @@
url,content,tag
https://www.nytimes.com/athletic/5646415/2024/07/25/2024-paris-olympics-ai-omega/,A whole biomechanical analysis,strong
https://www.nytimes.com/athletic/5646415/2024/07/25/2024-paris-olympics-ai-omega/,Safer diving,strong
https://www.nytimes.com/athletic/5646415/2024/07/25/2024-paris-olympics-ai-omega/,The ultimate (for now) photo finish,strong
https://www.nytimes.com/athletic/5646415/2024/07/25/2024-paris-olympics-ai-omega/,Smart bibs,strong
https://www.nytimes.com/athletic/5646415/2024/07/25/2024-paris-olympics-ai-omega/,"Running on the beach, quick moves on the tennis court",strong
https://www.nytimes.com/athletic/5646415/2024/07/25/2024-paris-olympics-ai-omega/,"AI comes to the Olympics: In Paris, new tech will change your view of the Games",h1
https://www.news.ufl.edu/2024/07/ai-olympics/,How AI will transform the Olympics,h1
https://www.news.ufl.edu/2024/07/ai-olympics/,Perfecting talent,h2
https://www.news.ufl.edu/2024/07/ai-olympics/,Judging upgrades,h2
https://www.news.ufl.edu/2024/07/ai-olympics/,Streamlining for host cities,h2
https://www.thepaper.cn/newsDetail_forward_28287864,技术一:AI技术成就8K超高清直播,p
https://www.thepaper.cn/newsDetail_forward_28287864,技术二:AI自动生成定制化精彩集锦,p
https://www.thepaper.cn/newsDetail_forward_28287864,技术三:3D全息视频技术让观众身临其境,p
https://www.thepaper.cn/newsDetail_forward_28287864,技术四:通过AI平台全方位分析运动数据,p
https://www.thepaper.cn/newsDetail_forward_28287864,技术五:用AI为视障人士搭建无障碍设施,p
https://www.thepaper.cn/newsDetail_forward_28287864,技术六:AI技术辅助现场管理,p
https://www.thepaper.cn/newsDetail_forward_28287864,技术七:用AI打造专属于运动员的GPT,p
https://www.thepaper.cn/newsDetail_forward_28287864,技术八:神经对象克隆技术助力数字收藏,p
https://www.wicongress.org.cn/2024/zh/article/6268,运动员的“贴身顾问”,strong
https://www.wicongress.org.cn/2024/zh/article/6268,成绩提升的“好帮手”,strong
https://www.wicongress.org.cn/2024/zh/article/6268,开辟观赛的“新视野”,strong
https://cn.technode.com/post/2024-08-06/paris-2024-esg-ai-digital-new-tech/,人工智能介入到观众的数字观赛体验,h2
https://cn.technode.com/post/2024-08-06/paris-2024-esg-ai-digital-new-tech/,AI帮助实现巴黎奥运会的交通能耗数字管理,h2
https://cn.technode.com/post/2024-08-06/paris-2024-esg-ai-digital-new-tech/,全新数字技术可以实时追踪运动员赛场表现,h2
https://cn.technode.com/post/2024-08-06/paris-2024-esg-ai-digital-new-tech/,可持续性和环境影响通过AI加以优化,h2
https://cn.technode.com/post/2024-08-06/paris-2024-esg-ai-digital-new-tech/,AI驱动阿里巴巴云服务提升转播体验,h2
https://cn.technode.com/post/2024-08-06/paris-2024-esg-ai-digital-new-tech/,英特尔配合AI PC带来革命性的 8K 直播,h2
https://cn.technode.com/post/2024-08-06/paris-2024-esg-ai-digital-new-tech/,丰田和合作伙伴GCK为 2024 年奥运会提供氢燃料客车,h2
https://cn.technode.com/post/2024-08-06/paris-2024-esg-ai-digital-new-tech/,中国九号公司助力Tier-Dott为巴黎提供电动自行车,h2
https://cn.technode.com/post/2024-08-06/paris-2024-esg-ai-digital-new-tech/,总结:,h2

@@ -0,0 +1,102 @@
import pandas as pd
import requests
from bs4 import BeautifulSoup

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'

def get_world_comments():
    # no proxy pool is configured: when reproducing, enable a proxy, or some overseas URLs will be unreachable
    urls = [
        "https://www.nytimes.com/athletic/5646415/2024/07/25/2024-paris-olympics-ai-omega/",
        "https://www.news.ufl.edu/2024/07/ai-olympics/",
        "https://www.thepaper.cn/newsDetail_forward_28287864",
        "https://www.wicongress.org.cn/2024/zh/article/6268",
        "https://cn.technode.com/post/2024-08-06/paris-2024-esg-ai-digital-new-tech/",
    ]
    headers = {
        'User-Agent': USER_AGENT,
    }
    data = []
    for url in urls:
        response = requests.get(url, headers=headers, verify=False)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            if url == "https://www.nytimes.com/athletic/5646415/2024/07/25/2024-paris-olympics-ai-omega/":
                # the first five <strong> tags are this article's section titles
                strong_tags = soup.find_all('strong')
                count = 0
                for tag in strong_tags:
                    print(tag.get_text())
                    data.append({
                        'url': url,
                        'content': tag.get_text(),
                        'tag': 'strong',
                    })
                    count += 1
                    if count == 5:
                        break
                h1_tags = soup.find_all('h1')
                for tag in h1_tags:
                    print(tag.get_text())
                    data.append({
                        'url': url,
                        'content': tag.get_text(),
                        'tag': 'h1',
                    })
            if url == "https://www.news.ufl.edu/2024/07/ai-olympics/":
                h1_tags = soup.find_all('h1')
                for tag in h1_tags:
                    print(tag.get_text())
                    data.append({
                        'url': url,
                        'content': tag.get_text(),
                        'tag': 'h1',
                    })
                h2_tags = soup.find_all('h2')
                for tag in h2_tags:
                    print(tag.get_text())
                    data.append({
                        'url': url,
                        'content': tag.get_text(),
                        'tag': 'h2',
                    })
            if url == "https://www.thepaper.cn/newsDetail_forward_28287864":
                # keep only the paragraphs that start with "技术" (the numbered technology items)
                p_tags = soup.find_all('p')
                for tag in p_tags:
                    content = tag.get_text()
                    if content.startswith("技术"):
                        print(content)
                        data.append({
                            'url': url,
                            'content': content,
                            'tag': 'p',
                        })
            if url == "https://www.wicongress.org.cn/2024/zh/article/6268":
                strong_tags = soup.find_all('strong')
                for tag in strong_tags:
                    print(tag.get_text())
                    data.append({
                        'url': url,
                        'content': tag.get_text(),
                        'tag': 'strong',
                    })
            if url == "https://cn.technode.com/post/2024-08-06/paris-2024-esg-ai-digital-new-tech/":
                h2_tags = soup.find_all('h2')
                for tag in h2_tags:
                    content = tag.get_text().strip()
                    if content:
                        print(content)
                        data.append({
                            'url': url,
                            'content': content,
                            'tag': 'h2',
                        })
        else:
            print(f"failed to fetch {url}")
    df = pd.DataFrame(data)
    df.to_csv('world_comment.csv', index=False, encoding='utf-8')

if __name__ == '__main__':
    get_world_comments()
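The five if branches repeat the same find_all-and-append logic; only the tag names differ per site. A hedged table-driven sketch (SITE_RULES and extract are additions; it drops the per-site filters such as the five-strong cap and the 技术 prefix check, so it is a simplification rather than a drop-in replacement):

SITE_RULES = {
    "https://www.news.ufl.edu/2024/07/ai-olympics/": ['h1', 'h2'],
    "https://www.wicongress.org.cn/2024/zh/article/6268": ['strong'],
    "https://cn.technode.com/post/2024-08-06/paris-2024-esg-ai-digital-new-tech/": ['h2'],
}

def extract(soup, url, selectors):
    # collect non-empty text for every requested tag type on the page
    rows = []
    for sel in selectors:
        for tag in soup.find_all(sel):
            text = tag.get_text().strip()
            if text:
                rows.append({'url': url, 'content': text, 'tag': sel})
    return rows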