# -*- coding: utf-8 -*-
"""
Scrape company names and registered capital from the top.chinaz.com company
listing, store the pairs in a CSV file and plot the 20 largest values.

Created on Wed Dec 25 20:49:21 2024

@author: 21470
"""
# Provenance: added as zhuceziben.py by commit
# 91d59f044db43bee3d70549bff958248fd062d6b
# (hnu202401010123 <916363713@qq.com>, Wed, 25 Dec 2024 21:09:38 +0800,
# "[PATCH] ADD file via upload").
import csv
import re
import warnings
from typing import List, Tuple

import pandas as pd

# The first listing page has no numeric suffix; later pages are numbered from 2.
FIRST_PAGE_URL = "https://top.chinaz.com/gongsi/index_zhuce.html"
PAGE_URL_TEMPLATE = "https://top.chinaz.com/gongsi/index_zhuce_{}.html"
# The listing is read as 16 pages in total.
PAGE_COUNT = 16
CSV_PATH = "zhucecontent.csv"
TOP_N = 20
# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

HEADERS = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
}

# NOTE(review): in the reviewed copy of this script the HTML tags inside both
# regex literals had been stripped (the company pattern read '(.+?)' and the
# capital pattern was followed by raw line breaks), so the tag anchors below
# are a reconstruction. Confirm them against the live page markup.
COMPANY_PATTERN = re.compile(r'<a.*?target="_blank">(.+?)</a></h3>')
CAPITAL_PATTERN = re.compile(r'注册资本:</span>(.*?)</p>')


def build_page_url(page: int) -> str:
    """Return the listing URL for the zero-based page index *page*."""
    if page == 0:
        return FIRST_PAGE_URL
    return PAGE_URL_TEMPLATE.format(page + 1)


def fetch_page(url: str, timeout: float = REQUEST_TIMEOUT) -> str:
    """Download *url* with the browser-like headers and return the body text.

    A non-success HTTP status produces a warning and an empty string, so an
    error page is never parsed as if it were listing data. Connection errors
    and timeouts propagate to the caller.
    """
    # Imported lazily so the parsing/CSV helpers stay importable (and
    # testable) on machines without the HTTP stack installed.
    import requests

    response = requests.get(url, headers=HEADERS, timeout=timeout)
    if not response.ok:
        warnings.warn(
            "skipping {}: HTTP status {}".format(url, response.status_code)
        )
        return ""
    return response.text


def parse_page(html: str) -> List[Tuple[str, str]]:
    """Extract ``(company, capital)`` pairs from one listing page.

    The two fields are matched independently and paired by position, so a
    warning is issued when the match counts differ (the pairs after the first
    missing field would otherwise be silently misaligned or dropped).
    """
    companies = COMPANY_PATTERN.findall(html)
    capitals = CAPITAL_PATTERN.findall(html)
    if len(companies) != len(capitals):
        warnings.warn(
            "matched {} company names but {} capital values; "
            "pairs may be misaligned".format(len(companies), len(capitals))
        )
    return list(zip(companies, capitals))


def scrape_all(page_count: int = PAGE_COUNT) -> List[Tuple[str, str]]:
    """Fetch *page_count* listing pages and return all parsed pairs in order."""
    rows = []
    for page in range(page_count):
        rows.extend(parse_page(fetch_page(build_page_url(page))))
    return rows


def save_rows(rows: List[Tuple[str, str]], path: str = CSV_PATH) -> None:
    """Write *rows* to *path* as a header-less CSV file.

    ``newline=""`` is what the csv module requires (otherwise blank rows
    appear on Windows), and the explicit UTF-8 encoding keeps the file
    readable by ``pd.read_csv`` regardless of the platform's locale encoding.
    """
    with open(path, "w", encoding="utf-8", newline="") as handle:
        csv.writer(handle).writerows(rows)


def load_capital_frame(path: str = CSV_PATH) -> pd.DataFrame:
    """Read the CSV at *path* and convert the capital column to numbers.

    Everything except digits and dots is removed from the capital text; an
    empty remainder becomes 0 and anything still unparsable becomes NaN.
    """
    frame = pd.read_csv(
        path,
        names=["company", "capital"],
        header=None,
        # Keep the raw text so the .str accessor works even when every value
        # happens to look numeric.
        dtype={"capital": str},
        encoding="utf-8",
    )
    # NOTE(review): unit text is discarded here, so if the scraped values mix
    # units the ranking compares raw digits only - confirm against the data.
    digits = frame["capital"].str.replace(r"[^\d.]", "", regex=True).replace("", "0")
    frame["capital"] = pd.to_numeric(digits, errors="coerce")
    return frame


def top_by_capital(frame: pd.DataFrame, count: int = TOP_N) -> pd.DataFrame:
    """Return the *count* rows of *frame* with the largest capital values."""
    return frame.sort_values(by="capital", ascending=False).head(count)


def plot_top(top_rows: pd.DataFrame) -> None:
    """Show a bar chart of capital per company for the rows in *top_rows*."""
    # Imported lazily for the same reason as requests in fetch_page.
    import matplotlib.pyplot as plt

    # Use the SimHei font so the Chinese labels render.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # Keep the minus sign displayable with that font.
    plt.rcParams['axes.unicode_minus'] = False

    # Company names on the x axis, registered capital on the y axis.
    plt.bar(top_rows['company'], top_rows['capital'])
    plt.xlabel('公司名称')
    plt.ylabel('注册资金')
    # The title text mentions 20 companies, matching the default TOP_N.
    plt.title('中国五百强中注册资金最多的20家公司')
    # Rotate and right-align the tick labels so long names do not overlap.
    plt.xticks(rotation=45, ha='right')
    # Tighten the layout so the rotated labels are not cut off.
    plt.tight_layout()
    plt.show()


def main() -> None:
    """Scrape every page, save and echo the CSV, then plot the top companies."""
    rows = scrape_all()
    if not rows:
        raise RuntimeError(
            "no (company, capital) pairs were extracted; "
            "check the regex patterns against the page markup"
        )
    save_rows(rows, CSV_PATH)

    # Portable replacement for the notebook-only `!cat zhucecontent.csv`.
    with open(CSV_PATH, encoding="utf-8") as handle:
        print(handle.read(), end="")

    plot_top(top_by_capital(load_capital_frame(CSV_PATH), TOP_N))


if __name__ == "__main__":
    main()