parent
088a35dffb
commit
f6ebe55c01
@ -0,0 +1,106 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 24 23:23:07 2025
|
||||
|
||||
@author: 15733
|
||||
"""
|
||||
|
||||
import requests
|
||||
import re
|
||||
import csv
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
# 1. Scrape the data
def scrape_company_data(pages=16, timeout=10):
    """Scrape the company ranking list pages from top.chinaz.com.

    Each list page is downloaded with ``requests`` and four fields are
    extracted from the HTML with regular expressions: company name, legal
    representative, registration date and security category.

    Args:
        pages: Number of list pages to fetch. Defaults to 16, the page
            count the original script was written for.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot hang the scraper indefinitely.

    Returns:
        A list of ``(company, person, sign_date, category)`` string tuples
        in page order. Pages whose request fails are reported on stdout and
        skipped.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
    }

    # Compile once instead of on every page; order defines the tuple layout.
    patterns = (
        re.compile(r'<a.*?target="_blank">(.+?)</a></h3>'),  # company name
        re.compile(r'法定代表人:</span>(.*?)</p>'),  # legal representative
        re.compile(r'注册时间:</span>(.*?)</p>'),  # registration date
        re.compile(r'证券类别:</span>(.*?)</p>'),  # security category
    )

    message = []

    for page in range(pages):
        page_no = page + 1

        # The first page has no numeric suffix; later pages use _2, _3, ...
        if page == 0:
            url = "https://top.chinaz.com/gongsitop/index_500top.html"
        else:
            url = f"https://top.chinaz.com/gongsitop/index_500top_{page_no}.html"

        # Only the network call is guarded, so bugs in the parsing code
        # below are not mistaken for download failures.
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()  # turn HTTP 4xx/5xx into an exception
        except requests.RequestException as e:
            print(f"第{page_no}页爬取失败: {e}")
            continue

        html = response.text
        columns = [pattern.findall(html) for pattern in patterns]

        # zip() stops at the shortest column, so rows never contain gaps.
        page_data = list(zip(*columns))

        # Fields are matched independently, so differing counts mean some
        # records were dropped; say so instead of truncating silently.
        if len({len(column) for column in columns}) > 1:
            print(f"警告: 第{page_no}页各字段匹配数量不一致,仅保留前{len(page_data)}条记录")

        message.extend(page_data)
        print(f"第{page_no}页数据爬取完成,共{len(page_data)}条记录")

    return message
|
||||
|
||||
# 2. Save the data to CSV
def save_to_csv(data, filename="china_top500_companies.csv"):
    """Write the scraped rows to a CSV file and print where it was saved.

    The file is created (or overwritten) with the ``utf-8-sig`` encoding and
    starts with a fixed four-column header row, followed by one CSV record
    per item of ``data``.

    Args:
        data: Iterable of rows; each row is an iterable of field values.
        filename: Path of the CSV file to write.
    """
    header = ["公司名", "法定代表人", "注册时间", "证券类别"]

    # newline="" lets the csv module control line endings itself.
    with open(filename, mode="w", newline="", encoding="utf-8-sig") as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(header)
        for record in data:
            csv_writer.writerow(record)

    print(f"数据已保存到 {filename}")
|
||||
|
||||
# 3. Analyse the security-category shares and draw a pie chart
def analyze_and_plot(filename="china_top500_companies.csv"):
    """Count the security categories in the CSV and plot them as a pie chart.

    The chart is saved to ``证券类别占比.png`` in the current working
    directory and then shown with ``plt.show()``.

    Args:
        filename: Path of the CSV file to read; it must contain a
            ``证券类别`` column.

    Returns:
        The result of ``value_counts()`` on the ``证券类别`` column.
    """
    # Load the CSV and tally how often each security category occurs.
    category_counts = pd.read_csv(filename)["证券类别"].value_counts()

    # Select a Chinese font for the labels; these settings are global to
    # matplotlib and stay in effect after this function returns.
    plt.rcParams.update({
        'font.sans-serif': ['SimHei'],
        'axes.unicode_minus': False,
    })

    # Draw the pie chart on a fresh figure.
    pie_options = {
        "labels": category_counts.index,
        "autopct": '%1.1f%%',
        "startangle": 90,
        "textprops": {'fontsize': 12},
    }
    plt.figure(figsize=(10, 8))
    plt.pie(category_counts, **pie_options)
    plt.title("中国500强公司证券类别占比", fontsize=16)
    plt.axis('equal')  # equal aspect ratio keeps the pie circular

    # Save before showing, so the image is written from the drawn figure.
    plt.savefig("证券类别占比.png", dpi=300, bbox_inches='tight')
    print("证券类别占比分析图已保存为 证券类别占比.png")
    plt.show()

    return category_counts
|
||||
|
||||
# Main program
if __name__ == "__main__":
    # Steps 1 and 2: scrape the ranking pages and write the rows to CSV.
    rows = scrape_company_data()
    save_to_csv(rows)

    # Step 3: analyse the saved CSV, draw the pie chart, and print the
    # per-category counts under a heading.
    distribution = analyze_and_plot()
    print("\n证券类别分布统计:", distribution, sep="\n")
|
Loading…
Reference in new issue