# -*- coding: utf-8 -*-
"""Scrape company listings from top.chinaz.com and chart registered capital.

Created on Sat May 21 21:00:02 2022

@author: FengWei

The script downloads the listing pages under ``top.chinaz.com/gongsi/``,
extracts (company name, registered capital) pairs with regular expressions,
prints the first ``TOP_N`` pairs as a dict and draws them as a bar chart.
"""
# Provenance: recovered from patch f4a2ed1af497464a7f8a21e0dbdc4011dda626c7
# ("ADD file via upload", hnu202110040210, Sat, 21 May 2022 22:47:58 +0800),
# which added this script as "500爬.py".

import re

# Request headers: present the scraper as a desktop Chrome browser.
HEADERS = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
}

FIRST_PAGE_URL = "https://top.chinaz.com/gongsi/index_zhuce.html"
PAGE_URL_TEMPLATE = "https://top.chinaz.com/gongsi/index_zhuce_{}.html"

# The listing is spread over 17 pages in total.
PAGE_COUNT = 17

# Number of leading entries that are printed and charted.
TOP_N = 20

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the script forever.
REQUEST_TIMEOUT = 10

# NOTE(review): both patterns appear to have lost their surrounding HTML
# markup. As written, COMPANY_PATTERN matches every single character of the
# page. Restore the original tag context before relying on the output.
# Company name.
COMPANY_PATTERN = re.compile(r'(.+?)')
# Registered capital: the text preceding "亿注册资本" (the y-axis label below
# gives the unit as 亿元).
CAPITAL_PATTERN = re.compile(r'(.+?)亿注册资本')

# Hard-coded bar heights, one per charted entry; the scraped capital strings
# are only used as tick/bar labels.
# NOTE(review): presumably hand-picked display heights -- confirm they still
# correspond to the scraped values.
BAR_HEIGHTS = (10.3, 8.2, 7.6, 4.2, 3.8, 3.5, 3.2, 3.1, 3.0, 2.9,
               2.7, 2.5, 2.2, 2.2, 1.9, 1.8, 1.8, 1.5, 1.5, 1.3)


def build_page_url(page: int) -> str:
    """Return the URL of the zero-based listing page *page*.

    The first page has no numeric suffix; every later page uses its
    one-based number (page index 1 -> ``index_zhuce_2.html``).
    """
    if page == 0:
        return FIRST_PAGE_URL
    return PAGE_URL_TEMPLATE.format(page + 1)


def extract_pairs(html: str) -> list:
    """Return the (company, capital) tuples found in *html*.

    Names and capitals are matched independently and paired by position;
    ``zip`` stops at the shorter of the two match lists.
    """
    company = COMPANY_PATTERN.findall(html)
    money = CAPITAL_PATTERN.findall(html)
    return list(zip(company, money))


def select_top(message, limit: int = TOP_N) -> list:
    """Return at most the first *limit* entries of *message*.

    Slicing (instead of indexing 0..limit-1) keeps the script from raising
    IndexError when fewer than *limit* entries were scraped.
    """
    return list(message[:limit])


def fetch_page(url: str, timeout: float = REQUEST_TIMEOUT) -> str:
    """Download *url* with a GET request and return the response body text.

    Network errors (including timeouts) propagate to the caller.
    """
    # Imported here so the URL/parsing helpers remain importable on systems
    # where the third-party ``requests`` package is not installed.
    import requests

    response = requests.get(url, headers=HEADERS, timeout=timeout)
    return response.text


def scrape_all_pages(page_count: int = PAGE_COUNT) -> list:
    """Fetch *page_count* listing pages and return all extracted pairs."""
    message = []
    for page in range(page_count):
        html = fetch_page(build_page_url(page))
        # Merge this page's pairs into the overall result.
        message.extend(extract_pairs(html))
    return message


def plot_top_companies(entries, heights=BAR_HEIGHTS) -> None:
    """Draw a bar chart for *entries*, a sequence of (name, capital) pairs.

    Bars use the fixed *heights*; the scraped capital strings label the
    y ticks and the top of each bar. Only as many entries as there are
    heights are drawn. Labels come from the pair list rather than a dict, so
    duplicate company names cannot make the label count differ from the
    tick count.
    """
    # Imported lazily: matplotlib is only needed once there is data to draw.
    import matplotlib.pyplot as plt

    count = min(len(entries), len(heights))
    names = [name for name, _ in entries[:count]]
    capitals = [capital for _, capital in entries[:count]]
    positions = range(count)
    bar_heights = list(heights[:count])

    # SimHei provides the CJK glyphs; the second setting keeps the minus
    # sign rendering correctly with that font.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    plt.figure(figsize=(45, 15))
    plt.bar(positions, bar_heights, width=0.6)
    plt.xticks(positions, names)
    plt.yticks(bar_heights, capitals)
    for pos, height, label in zip(positions, bar_heights, capitals):
        plt.text(pos, height, label, size=20, ha='center')
    plt.xlabel("公司", fontsize=20)
    plt.ylabel("注册资本/亿元", fontsize=20)
    # Identical to the original title when all 20 entries are available.
    plt.title('注册资本前{}名的公司'.format(count))
    plt.tick_params(labelsize=10)
    plt.show()


def main() -> None:
    """Scrape the listing, print the leading entries and chart them."""
    top = select_top(scrape_all_pages())
    if not top:
        raise SystemExit("No (company, capital) pairs could be extracted.")
    # Same console output as before: a name -> capital mapping.
    print(dict(top))
    plot_top_companies(top)


if __name__ == "__main__":
    main()