You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

106 lines
3.5 KiB

# -*- coding: utf-8 -*-
"""
Created on Sat May 24 23:23:07 2025
@author: 15733
"""
import requests
import re
import csv
import pandas as pd
import matplotlib.pyplot as plt
# 1. Scrape the data
# Patterns for the four fields extracted from each listing page. They are
# compiled once here so the per-page loop does not repeat the pattern lookup.
_COMPANY_RE = re.compile(r'<a.*?target="_blank">(.+?)</a></h3>')
_PERSON_RE = re.compile(r'法定代表人:</span>(.*?)</p>')
_SIGN_DATE_RE = re.compile(r'注册时间:</span>(.*?)</p>')
_CATEGORY_RE = re.compile(r'证券类别:</span>(.*?)</p>')


def _parse_company_page(html):
    """Return the (company, person, sign date, category) tuples found in *html*."""
    company = _COMPANY_RE.findall(html)
    person = _PERSON_RE.findall(html)
    sign_date = _SIGN_DATE_RE.findall(html)
    category = _CATEGORY_RE.findall(html)
    # zip() stops at the shortest list, so every row has all four fields.
    # NOTE(review): the fields are matched independently and paired purely by
    # position; if a listing on the page omits one field, the rows after it
    # would combine values from different companies -- verify against the
    # actual page markup.
    return list(zip(company, person, sign_date, category))


def scrape_company_data(pages: int = 16, timeout: float = 10) -> list:
    """Scrape the company listing pages on top.chinaz.com.

    A page that cannot be fetched (connection error, timeout, HTTP error
    status) is reported on stdout and skipped; the remaining pages are still
    processed.

    Args:
        pages: Number of listing pages to fetch, starting from the first one.
            Defaults to 16, the page count the script was written for.
        timeout: Seconds to wait for each HTTP request before giving up on
            that page. Without a timeout a stalled connection would block
            the whole crawl indefinitely.

    Returns:
        A list of (company name, legal representative, registration date,
        security category) string tuples, in page order.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
    }
    message = []
    for page in range(pages):
        # The first page has no numeric suffix; later pages are numbered from 2.
        if page == 0:
            url = "https://top.chinaz.com/gongsitop/index_500top.html"
        else:
            url = f"https://top.chinaz.com/gongsitop/index_500top_{page + 1}.html"
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()  # turn 4xx/5xx responses into exceptions
            html = response.text
        except requests.RequestException as e:
            print(f"第{page+1}页爬取失败: {e}")
            continue
        page_data = _parse_company_page(html)
        message.extend(page_data)
        print(f"第{page+1}页数据爬取完成,共{len(page_data)}条记录")
    return message
# 2. Save the data to CSV
def save_to_csv(data, filename="china_top500_companies.csv"):
    """Write *data* to *filename* as CSV, preceded by a fixed header row.

    Args:
        data: Iterable of rows; each row is an iterable of field values.
        filename: Path of the CSV file to create or overwrite.

    The file is encoded as UTF-8 with a BOM ("utf-8-sig"). A confirmation
    message naming the file is printed once writing has finished.
    """
    header = ["公司名", "法定代表人", "注册时间", "证券类别"]
    # newline="" leaves line-ending handling to the csv module.
    with open(filename, mode="w", encoding="utf-8-sig", newline="") as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(header)
        for record in data:
            csv_writer.writerow(record)
    print(f"数据已保存到 {filename}")
# 3. Analyse the security-category share and draw a pie chart
def analyze_and_plot(filename="china_top500_companies.csv"):
    """Plot the share of each security category found in a CSV file.

    Reads *filename* with pandas, counts the values in its "证券类别" column,
    draws the counts as a pie chart, saves the chart as "证券类别占比.png"
    (a path relative to the current working directory) and then displays it.

    Args:
        filename: Path of the CSV file to analyse.

    Returns:
        The pandas Series of per-category counts from value_counts().
    """
    counts = pd.read_csv(filename)["证券类别"].value_counts()

    # Select a font for the Chinese labels and title.
    plt.rcParams.update({
        'font.sans-serif': ['SimHei'],
        'axes.unicode_minus': False,
    })

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.pie(
        counts,
        labels=counts.index,
        autopct='%1.1f%%',
        startangle=90,
        textprops={'fontsize': 12},
    )
    ax.set_title("中国500强公司证券类别占比", fontsize=16)
    ax.axis('equal')  # keep the pie circular

    # Write the image to disk before opening the interactive window.
    fig.savefig("证券类别占比.png", dpi=300, bbox_inches='tight')
    print("证券类别占比分析图已保存为 证券类别占比.png")
    plt.show()
    return counts
# Main program
if __name__ == "__main__":
    # Steps 1-2: scrape the listings and write them straight to the CSV file.
    save_to_csv(scrape_company_data())
    # Step 3: analyse the saved CSV, draw the pie chart, and report the counts.
    distribution = analyze_and_plot()
    print("\n证券类别分布统计:", distribution, sep="\n")