"""Scrape the chinaz "top 500 companies" listing and chart registered capital.

The script walks the paginated listing with a Selenium-driven Edge browser,
reads each company's name from the rendered page and its registered-capital
text from the raw HTML, converts the capital to an integer amount, and draws
a horizontal bar chart of the companies with the largest registered capital.
"""

import re

BASE_URL = "https://top.chinaz.com/gongsitop/index_500top.html"
PAGE_URL_TEMPLATE = "https://top.chinaz.com/gongsitop/index_500top_{}.html"
HEADERS = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"}
REQUEST_TIMEOUT = 10  # seconds; keeps a stalled request from hanging the run

# Conversion factors used when normalising a capital string.
WAN_PER_YI = 10000  # an amount quoted in "亿" is scaled by this factor
USD_TO_CNY = 6.71   # fixed rate applied to amounts quoted in "美元"
TOP_N = 20          # number of companies shown in the chart

SCROLL_STEP_JS = "window.scrollBy(0,10000)"
SCROLL_POSITION_JS = "return document.documentElement.scrollTop || window.pageYOffset || document.body.scrollTop;"

# NOTE(review): assumes the capital text ends at the next HTML tag and that
# the label may use either an ASCII or a full-width colon -- confirm against
# the live markup.
CAPITAL_RE = re.compile(r"注册资本[::](.*?)<")
NUMBER_RE = re.compile(r"\d+(?:\.\d+)?")


def page_url(page: int) -> str:
    """Return the listing URL for 1-based *page* (page 1 has no numeric suffix)."""
    if page <= 1:
        return BASE_URL
    return PAGE_URL_TEMPLATE.format(page)


def parse_capital(text: str) -> int:
    """Convert a registered-capital string to an integer amount.

    The first number in *text* is taken as the amount. It is multiplied by
    ``WAN_PER_YI`` when the text mentions "亿" and by ``USD_TO_CNY`` when it
    mentions "美元". Text without any number yields 0. The result is truncated
    to an int.
    """
    # Drop thousands separators so "1,000" is read as one number.
    cleaned = text.replace(",", "").replace(",", "")
    match = NUMBER_RE.search(cleaned)
    if match is None:
        return 0
    # float() instead of eval(): the text comes from a web page and must
    # never be executed as code.
    amount = float(match.group())
    if "亿" in text:
        amount *= WAN_PER_YI
    if "美元" in text:
        amount *= USD_TO_CNY
    return int(amount)


def top_companies(names: list, capitals: list, limit: int = TOP_N) -> list:
    """Return up to *limit* ``(name, capital)`` pairs, largest capital first.

    Pairs with equal capital keep their original relative order.
    """
    ranked = sorted(zip(names, capitals), key=lambda pair: pair[1], reverse=True)
    # Slicing (rather than indexing 0..limit-1) tolerates short result lists.
    return ranked[:limit]


def _scroll_to_bottom(driver) -> None:
    """Scroll the current page until the scroll position stops changing."""
    previous = None
    while True:
        driver.execute_script(SCROLL_STEP_JS)
        position = driver.execute_script(SCROLL_POSITION_JS)
        if position == previous:
            return
        previous = position


def scrape_companies() -> tuple:
    """Collect company names and capital amounts from every listing page.

    Pages are visited in order until one contains no ``div.CoListTxt``
    entries. Returns a ``(names, capitals)`` pair of equally long lists.
    """
    # Third-party imports stay local so the pure helpers in this module can
    # be imported without a browser driver or plotting stack installed.
    import requests
    from bs4 import BeautifulSoup
    from selenium import webdriver

    names = []
    capitals = []
    driver = webdriver.Edge()
    try:
        page = 1
        while True:
            url = page_url(page)
            driver.get(url)
            # Finish scrolling first, then parse once: parsing on every
            # scroll step would record the same page's companies repeatedly.
            _scroll_to_bottom(driver)
            soup = BeautifulSoup(driver.page_source, "html.parser")
            title_nodes = soup.find_all("div", class_="CoListTxt")
            if not title_nodes:
                break

            response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            capital_texts = CAPITAL_RE.findall(response.text)

            for index, title_node in enumerate(title_nodes):
                link = title_node.find("a")
                if link is None:
                    continue
                # Names and capitals are paired by position; a missing
                # capital entry counts as 0 instead of raising IndexError.
                capital_text = capital_texts[index] if index < len(capital_texts) else ""
                names.append(link.get_text())
                capitals.append(parse_capital(capital_text))
            page += 1
    finally:
        # Always release the browser, even if scraping fails midway.
        driver.quit()
    return names, capitals


def plot_top_companies(ranked: list) -> None:
    """Draw a horizontal bar chart for *ranked* ``(name, capital)`` pairs."""
    import matplotlib
    import matplotlib.pyplot as plt

    labels = [name for name, _ in ranked]
    # Values are divided by WAN_PER_YI, so the axis unit is 亿元, not 万元
    # (assuming parse_capital's base unit is 万元).
    values = [capital / WAN_PER_YI for _, capital in ranked]

    matplotlib.rc('font', family='SimHei', weight='bold')
    plt.rcParams['axes.unicode_minus'] = False
    plt.barh(labels, values, color='skyblue')
    plt.xlabel('注册资本/亿元')
    plt.ylabel('公司')
    # The title reflects how many companies are actually plotted.
    plt.title('TOP{}公司注册资本'.format(len(ranked)))
    plt.show()


def main() -> None:
    """Scrape the listing and show the registered-capital chart."""
    names, capitals = scrape_companies()
    plot_top_companies(top_companies(names, capitals, TOP_N))


if __name__ == "__main__":
    main()