"""Collect the video page URLs of one Bilibili uploader and queue them in Redis.

The script asks for an uploader uid, walks every page of that uploader's
video listing with a Selenium-driven Chrome, extracts the video links with
BeautifulSoup, de-duplicates them and pushes them onto a Redis list.
"""
import time

from bs4 import BeautifulSoup
import redis
from selenium import webdriver

# Example uploader uids: 480959917, 1856528671
# Example listing URL:
# https://space.bilibili.com/480959917/video?tid=0&page=1&keyword=&order=pubdate

CHROMEDRIVER_PATH = 'chromedriver.exe'
LISTING_URL_TEMPLATE = (
    'https://space.bilibili.com/{}/video?tid=0&page={}&keyword=&order=pubdate'
)
VIDEO_LIST_ID = 'submit-video-list'
RENDER_WAIT_SECONDS = 2
REDIS_LIST_KEY = 'bili23'


def _new_browser():
    """Start a Chrome session using the chromedriver next to the script."""
    # NOTE(review): the driver path is passed positionally, exactly as the
    # original code did; confirm this matches the installed Selenium version.
    return webdriver.Chrome(CHROMEDRIVER_PATH)


def _listing_url(uid, page):
    """Return the URL of page number *page* of the uploader's video listing."""
    return LISTING_URL_TEMPLATE.format(uid, page)


def _load_video_list(browser, url):
    """Open *url* and return the video-list container tag, or None if absent."""
    browser.get(url)
    # Fixed pause so the page's scripts can fill in the listing before the
    # page source is read.
    time.sleep(RENDER_WAIT_SECONDS)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    return soup.find('div', id=VIDEO_LIST_ID)


def _extract_video_urls(video_list):
    """Return the absolute video URLs linked from the video-list container."""
    if video_list is None:
        return []
    anchors = video_list.find_all(
        'a', attrs={'target': '_blank', 'class': 'title'})
    # The scheme is prepended to every href, as the original code did
    # (presumably the hrefs are protocol-relative -- verify against the page).
    return [
        'https:' + anchor['href']
        for anchor in anchors
        if anchor.get('href') is not None
    ]


def _last_page_number(video_list):
    """Return the number of listing pages, falling back to 1 if unknown."""
    if video_list is None:
        return 1
    pager = video_list.find('ul', class_='be-pager')
    if pager is None:
        return 1
    items = pager.find_all('li')
    if len(items) < 2:
        return 1
    # The second-to-last pager entry is read as the last page number
    # (presumably the final entry is a "next page" control -- TODO confirm).
    link = items[-2].find('a')
    if link is None:
        return 1
    try:
        return int(link.text)
    except ValueError:
        return 1


def crawl_upinfo(url, browser=None):
    """Return the video URLs found on one page of an uploader's listing.

    Args:
        url: Address of the listing page to scrape.
        browser: Optional already-running Selenium driver to reuse. When
            omitted, a Chrome session is started for this call and shut
            down again before returning.

    Returns:
        A list of absolute video URLs; empty when the page has no video
        list container.
    """
    owns_browser = browser is None
    if owns_browser:
        browser = _new_browser()
    try:
        return _extract_video_urls(_load_video_list(browser, url))
    finally:
        # Only close sessions created here; a caller-supplied browser stays
        # under the caller's control.
        if owns_browser:
            browser.quit()


def main():
    """Prompt for a uid, crawl every listing page and push the URLs to Redis."""
    uid = input('请输入你想要看的博主的uid:').strip()

    browser = _new_browser()
    try:
        # The first page supplies both the page count and its own links, so
        # it is loaded only once.
        first_page = _load_video_list(browser, _listing_url(uid, 1))
        last_page = _last_page_number(first_page)
        video_urls = set(_extract_video_urls(first_page))
        for page in range(2, last_page + 1):
            video_urls.update(crawl_upinfo(_listing_url(uid, page), browser))
    finally:
        browser.quit()

    # LPUSH needs at least one value, so skip Redis when nothing was found.
    if video_urls:
        client = redis.Redis(host='127.0.0.1', port=6379)
        client.lpush(REDIS_LIST_KEY, *video_urls)


if __name__ == '__main__':
    main()