# -*- coding: utf-8 -*-
"""
Scrape the "Top 500 companies" listing from top.chinaz.com, store the
records in a CSV file and plot the share of each securities category.

Created on Sat May 24 23:23:07 2025

@author: 15733
"""
import csv
import re

import matplotlib.pyplot as plt
import pandas as pd
import requests

# Browser-like user agent sent with every request.
_HEADERS = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
    )
}

_FIRST_PAGE_URL = "https://top.chinaz.com/gongsitop/index_500top.html"

# NOTE(review): these patterns are reproduced exactly as they appear in the
# stored copy of this file, where the three field patterns ended in two raw
# line breaks (written here as \n\n).  They look as if the HTML tags that
# originally delimited each field (e.g. the anchor around the company name
# and the closing tag after each value) were stripped when the file was
# extracted.  As written, the company pattern matches every single character
# of the page.  Verify all four against the live page markup before relying
# on the scraped output.
_COMPANY_RE = re.compile(r"(.+?)")
_PERSON_RE = re.compile(r"法定代表人:(.*?)\n\n")
_SIGN_DATE_RE = re.compile(r"注册时间:(.*?)\n\n")
_CATEGORY_RE = re.compile(r"证券类别:(.*?)\n\n")


def _page_url(page):
    """Return the listing URL for the zero-based page index *page*."""
    # The first page has no numeric suffix; later pages are numbered from 2.
    if page == 0:
        return _FIRST_PAGE_URL
    return f"https://top.chinaz.com/gongsitop/index_500top_{page + 1}.html"


def _parse_page(html):
    """Extract (company, person, sign date, category) tuples from *html*."""
    company = _COMPANY_RE.findall(html)
    person = _PERSON_RE.findall(html)
    sign_date = _SIGN_DATE_RE.findall(html)
    category = _CATEGORY_RE.findall(html)
    # zip() stops at the shortest list, so every record has all four fields
    # even when one pattern matched fewer times than the others.
    return list(zip(company, person, sign_date, category))


# 1. Scrape the data
def scrape_company_data(pages=16, timeout=10):
    """Download the listing pages and return the extracted records.

    Args:
        pages: Number of listing pages to fetch (the listing is assumed to
            span 16 pages, which is the default).
        timeout: Seconds to wait for each HTTP request before giving up, so
            a stalled connection cannot block the script forever.

    Returns:
        A list of ``(company, legal representative, registration date,
        securities category)`` string tuples, in page order.  Pages that
        fail to download are reported on stdout and skipped.
    """
    message = []
    for page in range(pages):
        url = _page_url(page)
        try:
            response = requests.get(url, headers=_HEADERS, timeout=timeout)
            response.raise_for_status()  # treat HTTP error codes as failures
        except requests.RequestException as e:
            print(f"第{page+1}页爬取失败: {e}")
            continue

        page_data = _parse_page(response.text)
        message.extend(page_data)
        print(f"第{page+1}页数据爬取完成,共{len(page_data)}条记录")
    return message


# 2. Save the data to CSV
def save_to_csv(data, filename="china_top500_companies.csv"):
    """Write *data* to *filename* as CSV with a header row.

    Args:
        data: Iterable of 4-item rows as returned by
            :func:`scrape_company_data`.
        filename: Destination path; the file is overwritten.
    """
    # utf-8-sig writes a BOM at the start of the file.
    with open(filename, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        writer.writerow(["公司名", "法定代表人", "注册时间", "证券类别"])  # header row
        writer.writerows(data)
    print(f"数据已保存到 {filename}")


# 3. Analyse the securities-category share and draw a pie chart
def analyze_and_plot(filename="china_top500_companies.csv"):
    """Plot the distribution of the securities-category column as a pie chart.

    The chart is saved to ``证券类别占比.png`` and then shown on screen.

    Args:
        filename: CSV file produced by :func:`save_to_csv`.

    Returns:
        A pandas Series mapping each securities category to its row count.
    """
    # Read with the same encoding save_to_csv() writes, so the BOM is
    # stripped explicitly rather than by parser default.
    df = pd.read_csv(filename, encoding="utf-8-sig")

    # Count how many companies fall into each securities category.
    category_counts = df["证券类别"].value_counts()

    # Select a font for the Chinese labels (requires SimHei to be installed)
    # and keep the minus sign rendering correctly with that font.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    # Draw the pie chart.
    plt.figure(figsize=(10, 8))
    plt.pie(
        category_counts,
        labels=category_counts.index,
        autopct='%1.1f%%',
        startangle=90,
        textprops={'fontsize': 12},
    )
    plt.title("中国500强公司证券类别占比", fontsize=16)
    plt.axis('equal')  # keep the pie circular

    # Save before show(): the figure may be cleared once the window closes.
    plt.savefig("证券类别占比.png", dpi=300, bbox_inches='tight')
    print("证券类别占比分析图已保存为 证券类别占比.png")

    plt.show()

    return category_counts


def main():
    """Run the scrape -> CSV -> chart pipeline and print the distribution."""
    # 1. Scrape the data
    company_data = scrape_company_data()

    # 2. Save to CSV
    save_to_csv(company_data)

    # 3. Analyse and draw the pie chart
    category_distribution = analyze_and_plot()
    print("\n证券类别分布统计:")
    print(category_distribution)


# Script entry point
if __name__ == "__main__":
    main()