You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
106 lines
3.5 KiB
106 lines
3.5 KiB
# -*- coding: utf-8 -*-
|
|
"""
|
|
Created on Sat May 24 23:23:07 2025
|
|
|
|
@author: 15733
|
|
"""
|
|
|
|
import requests
|
|
import re
|
|
import csv
|
|
import pandas as pd
|
|
import matplotlib.pyplot as plt
|
|
|
|
# 1. 爬取数据
|
|
# Patterns for the four fields extracted from each ranking page; compiled once
# so the page loop does not look them up again on every iteration.
_COMPANY_RE = re.compile(r'<a.*?target="_blank">(.+?)</a></h3>')
_PERSON_RE = re.compile(r'法定代表人:</span>(.*?)</p>')
_SIGN_DATE_RE = re.compile(r'注册时间:</span>(.*?)</p>')
_CATEGORY_RE = re.compile(r'证券类别:</span>(.*?)</p>')


def _ranking_page_url(page_index):
    """Return the URL of the ranking page at zero-based *page_index*."""
    if page_index == 0:
        # The first page has no numeric suffix.
        return "https://top.chinaz.com/gongsitop/index_500top.html"
    return f"https://top.chinaz.com/gongsitop/index_500top_{page_index + 1}.html"


def _parse_company_rows(html):
    """Return (company, person, sign date, category) tuples found in *html*."""
    # zip() stops at the shortest list, so a page where one field is missing
    # for some entries still yields equally sized rows.
    return list(zip(
        _COMPANY_RE.findall(html),
        _PERSON_RE.findall(html),
        _SIGN_DATE_RE.findall(html),
        _CATEGORY_RE.findall(html),
    ))


def scrape_company_data(page_count=16, timeout=10):
    """Download the ranking pages and extract one record per listed company.

    Args:
        page_count: Number of consecutive ranking pages to fetch, starting
            with the first one. Defaults to 16.
        timeout: Seconds to wait for each HTTP request before giving up on
            that page. Defaults to 10.

    Returns:
        A list of ``(company, legal representative, registration date,
        security category)`` string tuples, in page order. Pages that fail
        to download are reported on stdout and skipped.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
    }
    message = []

    for page in range(page_count):
        page_number = page + 1
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(
                _ranking_page_url(page), headers=headers, timeout=timeout
            )
            response.raise_for_status()
        except requests.RequestException as e:
            # Best effort: report the failed page and move on to the next.
            print(f"第{page_number}页爬取失败: {e}")
            continue

        # requests assumes ISO-8859-1 for text responses that declare no
        # charset; the Chinese field labels would then never match, so fall
        # back to the encoding detected from the body.
        if response.encoding is None or response.encoding.lower() == "iso-8859-1":
            response.encoding = response.apparent_encoding

        page_data = _parse_company_rows(response.text)
        message.extend(page_data)
        print(f"第{page_number}页数据爬取完成,共{len(page_data)}条记录")

    return message
|
|
|
|
# 2. 保存数据到CSV
|
|
def save_to_csv(data, filename="china_top500_companies.csv"):
    """Write the scraped records to a CSV file with a fixed header row.

    Args:
        data: Iterable of rows; each row is written as one CSV line after
            the header.
        filename: Path of the CSV file to create or overwrite.

    The file is written as UTF-8 with a BOM ("utf-8-sig"), and a
    confirmation message is printed once writing is done.
    """
    header_row = ["公司名", "法定代表人", "注册时间", "证券类别"]

    # newline="" lets the csv module control line endings itself.
    with open(filename, "w", newline="", encoding="utf-8-sig") as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(header_row)
        csv_writer.writerows(data)

    print(f"数据已保存到 {filename}")
|
|
|
|
# 3. 分析证券占比并绘制饼图
|
|
def analyze_and_plot(filename="china_top500_companies.csv"):
    """Chart how often each value of the "证券类别" column occurs in a CSV file.

    Args:
        filename: Path of the CSV file to read; it must contain a
            "证券类别" column.

    Returns:
        The pandas Series produced by ``value_counts()`` on that column.

    Side effects: updates two global matplotlib rcParams, saves the pie
    chart to "证券类别占比.png" in the working directory, prints a
    confirmation message and opens the figure with ``plt.show()``.
    """
    # Load the file and count the occurrences of every category in one go.
    category_counts = pd.read_csv(filename)["证券类别"].value_counts()

    # Select a font for the Chinese labels. NOTE(review): unicode_minus is
    # presumably disabled so minus signs still render with that font.
    plt.rcParams.update({
        'font.sans-serif': ['SimHei'],
        'axes.unicode_minus': False,
    })

    # Draw the pie chart on a fresh figure.
    plt.figure(figsize=(10, 8))
    pie_options = {
        'labels': category_counts.index,
        'autopct': '%1.1f%%',
        'startangle': 90,
        'textprops': {'fontsize': 12},
    }
    plt.pie(category_counts, **pie_options)
    plt.title("中国500强公司证券类别占比", fontsize=16)
    plt.axis('equal')  # equal aspect ratio keeps the pie circular

    # Write the image to disk before showing the figure.
    plt.savefig("证券类别占比.png", dpi=300, bbox_inches='tight')
    print("证券类别占比分析图已保存为 证券类别占比.png")
    plt.show()

    return category_counts
|
|
|
|
# 主程序
|
|
if __name__ == "__main__":
    # Step 1: scrape the ranking pages into a list of records.
    records = scrape_company_data()

    # Step 2: write the records to the default CSV file.
    save_to_csv(records)

    # Step 3: read the CSV back, draw the pie chart, then print the
    # per-category counts under a heading.
    distribution = analyze_and_plot()
    print("\n证券类别分布统计:", distribution, sep="\n")