# -*- coding: utf-8 -*-
"""
Scrape the chinaz "top 500 companies" listing, save it as CSV and chart the
share of each securities category.

Created on Sat May 24 23:23:07 2025

@author: 15733
"""

import csv
import re

import matplotlib.pyplot as plt
import pandas as pd
import requests

# Browser-like user agent sent with every listing request.
_HEADERS = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
}

# Header row of the CSV; analyze_and_plot() reads the last column by name.
_CSV_COLUMNS = ["公司名", "法定代表人", "注册时间", "证券类别"]

# NOTE(review): these patterns look like they lost their HTML tag delimiters
# before reaching review -- as written, the company pattern matches every
# single character and the other three require a literal blank line after
# the value. They are kept exactly as received; restore the real tag
# delimiters from the original script / page markup before relying on them.
_FIELD_PATTERNS = (
    re.compile('(.+?)'),                  # company name
    re.compile('法定代表人:(.*?)\n\n'),    # legal representative
    re.compile('注册时间:(.*?)\n\n'),      # registration date
    re.compile('证券类别:(.*?)\n\n'),      # securities category
)


def _page_url(page):
    """Return the listing URL for a 1-based page number."""
    # The first page has no numeric suffix; later pages use "_<n>".
    if page == 1:
        return "https://top.chinaz.com/gongsitop/index_500top.html"
    return f"https://top.chinaz.com/gongsitop/index_500top_{page}.html"


# 1. Scrape the data
def scrape_company_data(pages=16, timeout=10):
    """Download the listing pages and extract one record per company.

    Args:
        pages: Number of listing pages to fetch (the listing had 16 when the
            script was written).
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of ``(company, person, sign_date, category)`` tuples. Pages
        that fail to download are reported on stdout and skipped.
    """
    records = []

    # One session for all pages so the underlying connection is reused.
    with requests.Session() as session:
        for page in range(1, pages + 1):
            try:
                response = session.get(
                    _page_url(page), headers=_HEADERS, timeout=timeout
                )
                response.raise_for_status()  # treat HTTP 4xx/5xx as failures
            except requests.RequestException as e:
                print(f"第{page}页爬取失败: {e}")
                continue

            # Without a charset in the Content-Type header Requests decodes
            # text as ISO-8859-1, which would garble the Chinese labels the
            # patterns look for; fall back to the detected encoding instead.
            if (response.encoding or "").lower() == "iso-8859-1":
                response.encoding = response.apparent_encoding
            html = response.text

            columns = [pattern.findall(html) for pattern in _FIELD_PATTERNS]

            # zip() stops at the shortest list, so surplus matches are
            # dropped; say so, because a missing field on one company shifts
            # every later value in that column.
            if len({len(column) for column in columns}) > 1:
                print(f"第{page}页各字段匹配数量不一致, 多余的匹配已被丢弃")
            page_data = list(zip(*columns))

            records.extend(page_data)
            print(f"第{page}页数据爬取完成,共{len(page_data)}条记录")

    return records


# 2. Save the data to CSV
def save_to_csv(data, filename="china_top500_companies.csv"):
    """Write the scraped records to ``filename`` with a header row.

    Args:
        data: Iterable of 4-item rows matching the CSV header order.
        filename: Destination path; written as UTF-8 with BOM.
    """
    with open(filename, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        writer.writerow(_CSV_COLUMNS)  # header row
        writer.writerows(data)
    print(f"数据已保存到 {filename}")


# 3. Analyse the category share and draw a pie chart
def analyze_and_plot(filename="china_top500_companies.csv",
                     output_path="证券类别占比.png"):
    """Count securities categories in the CSV and save/show a pie chart.

    Args:
        filename: CSV produced by ``save_to_csv``.
        output_path: Where the chart image is written.

    Returns:
        A pandas Series mapping each category to its number of companies.
        When the CSV holds no categories the (empty) Series is returned and
        no chart is drawn.
    """
    # Same encoding the writer used, so the BOM never leaks into a header.
    df = pd.read_csv(filename, encoding="utf-8-sig")

    category_counts = df["证券类别"].value_counts()
    if category_counts.empty:
        print("没有可用的证券类别数据, 已跳过绘图")
        return category_counts

    # CJK-capable fonts in order of preference; SimHei stays first, the
    # others are fallbacks for systems where it is not installed.
    plt.rcParams['font.sans-serif'] = [
        'SimHei', 'Microsoft YaHei', 'PingFang SC', 'Noto Sans CJK SC'
    ]
    plt.rcParams['axes.unicode_minus'] = False

    fig = plt.figure(figsize=(10, 8))
    plt.pie(category_counts,
            labels=category_counts.index,
            autopct='%1.1f%%',
            startangle=90,
            textprops={'fontsize': 12})
    plt.title("中国500强公司证券类别占比", fontsize=16)
    plt.axis('equal')  # keep the pie circular

    # Save before show(): some backends clear the figure once it is shown.
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f"证券类别占比分析图已保存为 {output_path}")
    plt.show()
    plt.close(fig)  # release the figure so repeated calls do not pile up

    return category_counts


def main():
    """Run the scrape -> CSV -> chart pipeline."""
    # 1. Scrape the data
    company_data = scrape_company_data()
    if not company_data:
        # Keep any previous CSV intact instead of overwriting it with a
        # header-only file.
        print("未爬取到任何数据, 已跳过保存与分析")
        return

    # 2. Save to CSV
    save_to_csv(company_data)

    # 3. Analyse and draw the pie chart
    category_distribution = analyze_and_plot()
    print("\n证券类别分布统计:")
    print(category_distribution)


# Script entry point
if __name__ == "__main__":
    main()