"""Scrape the chinaz.com "China Top 500" company listing and chart it.

The script downloads every listing page, extracts the company name, legal
representative, registration date and security category of each entry,
saves the result as a CSV file and draws a pie chart of the security
categories.
"""

import time

import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Configure fonts that can render Chinese glyphs in matplotlib figures.
plt.rcParams["font.family"] = ["Microsoft YaHei", "SimHei", "sans-serif"]
plt.rcParams["axes.unicode_minus"] = False

BASE_URL = "https://top.chinaz.com/gongsitop/index_500top"
PAGE_COUNT = 16
OUTPUT_CSV = "china_top500_online.csv"
REQUEST_DELAY_SECONDS = 2
UNKNOWN = "未知"


def _value_after_label(text):
    """Return the value part of a ``label: value`` string.

    The text is split at the first colon, ASCII (``:``) or full-width
    (U+FF1A), so a value that itself contains a colon is kept whole.
    Text without any colon is returned unchanged apart from stripping.
    """
    hits = [i for i in (text.find(":"), text.find("\uff1a")) if i != -1]
    if not hits:
        return text.strip()
    return text[min(hits) + 1:].strip()


def _parse_company(li):
    """Build one company record (a dict) from a ``li.LCliTheOne`` element.

    Raises:
        ValueError: if the entry has no ``h3 a`` element to take the
            company name from.
    """
    name_link = li.select_one("h3 a")
    if name_link is None:
        raise ValueError("company name link 'h3 a' not found")

    info_blocks = li.select("div.ColInfo")
    person = reg_time = category = UNKNOWN

    # Basic info is read from the first block. Slicing (instead of indexing)
    # keeps the defaults when the block is absent rather than raising.
    for block in info_blocks[:1]:
        for p in block.select("p"):
            text = p.get_text()
            if "法定代表人" in text:
                person = _value_after_label(text)
            elif "注册时间" in text:
                reg_time = _value_after_label(text)

    # The security category is read from the second block.
    for block in info_blocks[1:2]:
        for p in block.select("p"):
            text = p.get_text()
            if "证券类别" in text:
                category = _value_after_label(text)

    return {
        "公司名": name_link.get_text(strip=True),
        "法定代表人": person,
        "注册时间": reg_time,
        "证券类别": category,
    }


def get_company_info_from_url(url: str) -> list:
    """Download one listing page and return its company records.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A list of dicts with the keys ``公司名``, ``法定代表人``, ``注册时间``
        and ``证券类别``. Fields not found on the page hold ``"未知"``. The
        list is empty when the request fails.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        print(f"请求中: {url}")
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"请求失败: {e}")
        return []

    # requests falls back to ISO-8859-1 for text responses that declare no
    # charset, which garbles Chinese text; use the detected encoding then.
    if not response.encoding or response.encoding.lower() == "iso-8859-1":
        response.encoding = response.apparent_encoding

    soup = BeautifulSoup(response.text, "html.parser")
    company_list = []
    for li in soup.select("li.LCliTheOne"):
        try:
            company_list.append(_parse_company(li))
        except Exception as e:
            # Deliberately best-effort: one malformed entry must not abort
            # the rest of the page.
            print(f"解析出错: {e}")
            continue
    return company_list


def main():
    """Scrape all listing pages, save them as CSV and plot category shares."""
    all_companies = []
    for page in range(1, PAGE_COUNT + 1):
        url = f"{BASE_URL}.html" if page == 1 else f"{BASE_URL}_{page}.html"
        companies = get_company_info_from_url(url)
        all_companies.extend(companies)
        print(f"第 {page} 页提取 {len(companies)} 条")
        if page < PAGE_COUNT:
            # Throttle requests to avoid being blocked; no need to wait
            # after the final page.
            time.sleep(REQUEST_DELAY_SECONDS)

    df = pd.DataFrame(all_companies)
    df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")
    print(f"\n✅ 总共提取 {len(df)} 条记录,保存为 '{OUTPUT_CSV}'")

    if df.empty:
        # An empty frame has no '证券类别' column, so the chart code below
        # would fail with KeyError.
        print("没有数据,跳过图表绘制")
        return

    # Optional analysis chart.
    category_stats = df["证券类别"].value_counts().reset_index()
    category_stats.columns = ["证券类别", "数量"]
    category_stats["占比(%)"] = (category_stats["数量"] / len(df) * 100).round(2)

    plt.figure(figsize=(10, 6))
    plt.pie(
        category_stats["数量"],
        labels=category_stats["证券类别"],
        autopct="%1.1f%%",
        startangle=90,
        textprops={"fontsize": 10},
    )
    plt.title("中国500强公司证券类别占比")
    plt.axis("equal")
    plt.legend(title="证券类别", bbox_to_anchor=(1, 1))
    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    main()