parent
							
								
									58db263363
								
							
						
					
					
						commit
						1bc34bff47
					
				| @ -0,0 +1,99 @@ | ||||
| import requests | ||||
| import re | ||||
| from concurrent.futures import ThreadPoolExecutor, as_completed | ||||
| 
 | ||||
# Number of cids collected so far; get_cid updates it through `global count`
# and uses it for its progress output.
count = 0
| 
 | ||||
| 
 | ||||
def get_page_url(n):
    """Build the URLs of the first ``n`` Bilibili search-result pages.

    The first page is the bare search URL. Every later page appends a
    1-based ``page`` number and an ``o`` offset that grows by 36 per page.

    Returns:
        A list of ``n`` URL strings (empty when ``n`` is 0 or negative).
    """
    search_url = "https://search.bilibili.com/all?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5"
    return [
        search_url if index == 0 else f"{search_url}&page={index + 1}&o={index * 36}"
        for index in range(n)
    ]
| 
 | ||||
| 
 | ||||
# Request headers passed to every requests.get call in this file; the
# user-agent is a desktop browser (Chrome/Edge on Windows) string.
header = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"}
| 
 | ||||
| 
 | ||||
def fetch_page(url, timeout=10):
    """Download ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Address to request, sent with the module-level ``header``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text, or an empty string if the request raised.
    """
    try:
        # requests waits indefinitely when no timeout is given; a single
        # stalled connection would otherwise block a worker thread forever.
        response = requests.get(url=url, headers=header, timeout=timeout)
        response.encoding = 'utf-8'
        return response.text
    except Exception as e:
        # Deliberately best-effort: a failed page is reported and treated as
        # empty so the rest of the crawl can continue.
        print(f"请求失败: {e}")
        return ""
| 
 | ||||
| 
 | ||||
def get_cid(page_url_list, limit=300):
    """Collect video cids from Bilibili search-result pages.

    The pages are downloaded concurrently with ``fetch_page``. Each page is
    scanned for video ids (bvid); for every bvid not seen before, the player
    page-list API is queried and the first cid in its response is kept.

    Args:
        page_url_list: Search-result page URLs to scan.
        limit: Maximum number of cids to collect in this call.

    Returns:
        A list of cid strings, at most ``limit`` long, in the order they
        were obtained.
    """
    global count
    # Restart the module-level progress counter so the limit applies per call
    # instead of being exhausted by an earlier call.
    count = 0
    cid_list = []
    if limit <= 0:
        return cid_list

    bvid_pattern = re.compile('"//www.bilibili.com/video/(.*?)/"')
    cid_pattern = re.compile('{"cid":(.*?),')
    # The same video can be listed on several result pages; remember what was
    # already resolved so its cid is not collected twice.
    seen_bvids = set()

    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(fetch_page, url) for url in page_url_list]
        for future in as_completed(futures):
            try:
                page_html = future.result()
                # dict.fromkeys de-duplicates while keeping page order.
                for bvid in dict.fromkeys(bvid_pattern.findall(page_html)):
                    if bvid in seen_bvids:
                        continue
                    seen_bvids.add(bvid)
                    url = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp"
                    cids = cid_pattern.findall(fetch_page(url))
                    if cids:
                        cid_list.append(cids[0])
                        count = len(cid_list)
                        print(f"已获取到 {count} 个cid")
                    if len(cid_list) >= limit:
                        break
            except Exception as e:
                print(f"处理失败: {e}")
            if len(cid_list) >= limit:
                # Leaving the loop does not stop queued downloads; cancel the
                # ones that have not started so the executor can exit promptly.
                for pending in futures:
                    pending.cancel()
                break
    return cid_list
| 
 | ||||
| 
 | ||||
def get_danmu(cid_list, timeout=10):
    """Download the danmu (bullet comments) for every cid in ``cid_list``.

    Each cid's comment XML is fetched from ``comment.bilibili.com`` on a
    pool of five worker threads, and the text of every ``<d>`` element is
    extracted with a regular expression.

    Args:
        cid_list: Iterable of cids whose comments should be downloaded.
        timeout: Seconds to wait for each request before giving up.

    Returns:
        A flat list of comment strings, grouped by cid in the order the
        downloads completed.
    """
    danmu_pattern = re.compile('<d p=".*?">(.*?)</d>')

    def fetch_danmu(cid):
        """Return the comment texts for one cid, or [] if the request fails."""
        try:
            url = f"https://comment.bilibili.com/{cid}.xml"
            # Without a timeout a stalled connection would hold a worker
            # forever and the executor's `with` block would never exit.
            response = requests.get(url=url, headers=header, timeout=timeout)
            response.encoding = 'utf-8'
            return danmu_pattern.findall(response.text)
        except Exception as e:
            # Best-effort: one failed video must not abort the whole crawl.
            print(f"请求失败: {e}")
            return []

    danmu_list = []
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(fetch_danmu, cid) for cid in cid_list]
        for future in as_completed(futures):
            try:
                danmu_list.extend(future.result())
                print(f"已获取到 {len(danmu_list)} 条弹幕")
            except Exception as e:
                print(f"处理失败: {e}")
    return danmu_list
| 
 | ||||
| 
 | ||||
# Script entry point: crawl 10 search pages, download the danmu of the videos
# found, and write one comment per line to '弹幕.txt'. Guarded so that
# importing this module does not start network requests or overwrite the file.
if __name__ == "__main__":
    cid_list = get_cid(get_page_url(10))
    print("开始获取弹幕数据...")
    danmu_list = get_danmu(cid_list)
    print("弹幕数据爬取完成。")
    with open('弹幕.txt', 'w', encoding='utf-8') as f:
        # One batched write instead of one write call per comment.
        f.writelines(danmu + '\n' for danmu in danmu_list)

    print("弹幕已保存到 '弹幕.txt'")
					Loading…
					
					
				
		Reference in new issue