You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

86 lines
2.6 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# -*- coding: utf-8 -*-
"""
Created on Wed Dec 25 20:49:21 2024
@author: 21470
"""
#### Code cell: download the first ranking page
import requests

# First page of the listing that the rest of the script parses.
url = "https://top.chinaz.com/gongsi/index_zhuce.html"
# Identify as a desktop Chrome browser (presumably so the site serves its
# regular HTML page -- not verifiable from this script alone).
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
}
# requests applies no timeout by default; bound the wait so a stalled
# connection cannot hang the script forever.
response = requests.get(url, headers=headers, timeout=10)
# Stop on HTTP 4xx/5xx instead of silently parsing an error page.
response.raise_for_status()
html = response.text
# Notebook-style display of the downloaded markup.
html
import re

# Extract the fields from the first page with re.findall.
# Company name: text of an <a ... target="_blank"> link that is closed
# immediately before </h3>.
company = re.findall('<a.*?target="_blank">(.+?)</a></h3>', html)
# Registered capital: text between the "注册资本:</span>" label and the
# closing </p>. The previous pattern ('注册资本an>') was missing the
# ":</sp" part of the label and disagreed with the pattern used by the
# multi-page loop in this file, so it is aligned with that one here.
capital = re.findall('注册资本:</span>(.*?)</p>', html)
# Pair names with capitals positionally; zip stops at the shorter list.
pageOne = list(zip(company, capital))
# Notebook-style display of the parsed pairs.
pageOne
# Collected (company, capital) tuples from every page.
message = []
# The listing is scraped as 16 pages in total.
for page in range(16):
    # Build the URL: the first page has no numeric suffix, the following
    # pages are index_zhuce_2.html ... index_zhuce_16.html.
    if page == 0:
        url = "https://top.chinaz.com/gongsi/index_zhuce.html"
    else:
        url = f"https://top.chinaz.com/gongsi/index_zhuce_{page + 1}.html"
    # Issue the GET request. The timeout keeps one stalled page from
    # hanging the whole run (requests has no default timeout).
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on HTTP errors rather than extracting nothing from an
    # error page and producing a silently incomplete result.
    response.raise_for_status()
    html = response.text
    # Extract the fields with re.findall.
    # Company name
    company = re.findall('<a.*?target="_blank">(.+?)</a></h3>', html)
    # Registered capital
    capital = re.findall('注册资本:</span>(.*?)</p>', html)
    # Pair names with capitals positionally; zip stops at the shorter list.
    pageOne = list(zip(company, capital))
    # Append this page's rows to the overall result.
    message.extend(pageOne)
# Notebook-style display of everything collected.
message
# Use Python's built-in csv module.
import csv

# newline="" is what the csv module asks for on files it writes (otherwise
# extra carriage returns can appear on Windows). The explicit UTF-8 encoding
# keeps the Chinese text independent of the platform's locale encoding and
# matches the default encoding pandas.read_csv uses when the file is read back.
with open("zhucecontent.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerows(message)

# Print the written file. This replaces the IPython-only `!cat` shell escape,
# which is a syntax error in plain Python and needs a `cat` binary.
with open("zhucecontent.csv", encoding="utf-8") as f:
    print(f.read(), end="")
import matplotlib.pyplot as plt
import pandas as pd

# Load the scraped rows. The CSV has no header line, so the column names are
# supplied here. `capital` is read as text so the .str accessor below works
# even if pandas would otherwise infer a numeric column.
df = pd.read_csv(
    "zhucecontent.csv",
    names=["company", "capital"],
    header=None,
    dtype={"capital": str},
)
# Keep only digits and dots, turn values that end up empty into "0", then
# convert to numbers (anything still unparseable becomes NaN).
# NOTE(review): this strips every non-numeric character, so any unit or
# currency text is discarded and values are compared as bare numbers --
# confirm all rows use the same unit.
capital_digits = df['capital'].str.replace(r'[^\d.]', '', regex=True).replace('', '0')
df['capital'] = pd.to_numeric(capital_digits, errors='coerce')
# Sort by registered capital, largest first.
sorted_df = df.sort_values(by='capital', ascending=False)
# Take the 20 companies at the top of that ordering.
top_20 = sorted_df.head(20)
# Use the SimHei font so the Chinese labels render.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Keep the minus sign rendering correctly with that font.
plt.rcParams['axes.unicode_minus'] = False
# Bar chart: company name on the x axis, registered capital on the y axis.
plt.bar(top_20['company'], top_20['capital'])
# x-axis label
plt.xlabel('公司名称')
# y-axis label
plt.ylabel('注册资金')
# Chart title
plt.title('中国五百强中注册资金最多的20家公司')
# Rotate the tick labels 45 degrees and right-align them to avoid overlap.
plt.xticks(rotation=45, ha='right')
# Let matplotlib adjust the layout so labels are not cut off.
plt.tight_layout()
# Show the figure.
plt.show()