You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

95 lines
3.0 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import matplotlib.pyplot as plt
# Configure fonts for Chinese text: try the listed CJK-capable families in
# order, and render the minus sign as an ASCII hyphen instead of U+2212
# (commonly needed because CJK fonts may lack the Unicode minus glyph).
plt.rcParams.update({
    "font.family": ["Microsoft YaHei", "SimHei", "sans-serif"],
    "axes.unicode_minus": False,
})
def _text_after_colon(text):
    """Return the stripped value that follows the label/value colon in *text*.

    The full-width colon (U+FF1A) is preferred as the delimiter; the ASCII
    colon is used as a fallback. The text after the last occurrence of the
    delimiter is returned. If neither colon is present, the whole stripped
    text is returned.
    """
    for separator in ("\uff1a", ":"):
        if separator in text:
            return text.rsplit(separator, 1)[-1].strip()
    return text.strip()


def get_company_info_from_url(url):
    """Download one ranking page and extract the companies listed on it.

    Args:
        url: Address of the page to fetch.

    Returns:
        A list with one dict per ``li.LCliTheOne`` entry that could be
        parsed. Each dict has the keys '公司名', '法定代表人', '注册时间' and
        '证券类别'; a field that is not found on the page is set to "未知".
        An empty list is returned when the HTTP request fails.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        print(f"请求中: {url}")
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"请求失败: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    company_list = []
    for li in soup.select('li.LCliTheOne'):
        try:
            company_name = li.select_one('h3 a').get_text(strip=True)
            info_blocks = li.select('div.ColInfo')

            # Basic information; every field defaults to "unknown".
            person = reg_time = category = "未知"
            for p in info_blocks[0].select('p'):
                text = p.get_text()
                # str.split('') raises ValueError (empty separator), so the
                # value is taken from the text after the colon instead.
                if '法定代表人' in text:
                    person = _text_after_colon(text)
                elif '注册时间' in text:
                    reg_time = _text_after_colon(text)
            for p in info_blocks[1].select('p'):
                text = p.get_text()
                if '证券类别' in text:
                    category = _text_after_colon(text)
        except (AttributeError, IndexError) as e:
            # A missing <h3><a> (None has no get_text) or fewer than two
            # div.ColInfo blocks means the entry is malformed: report it and
            # move on to the next one.
            print(f"解析出错: {e}")
            continue

        company_list.append({
            '公司名': company_name,
            '法定代表人': person,
            '注册时间': reg_time,
            '证券类别': category,
        })
    return company_list
def main():
    """Scrape every ranking page, save the records as CSV and plot a pie chart.

    Pages 1 through 16 are fetched one at a time with a two-second pause
    after each request. All extracted records are written to
    'china_top500_online.csv' (encoded as utf-8-sig). When at least one
    record was collected, a pie chart of the share of each '证券类别' value
    is displayed; otherwise the chart is skipped.
    """
    base_url = "https://top.chinaz.com/gongsitop/index_500top"
    all_companies = []
    for page in range(1, 17):
        # Page 1 has no numeric suffix; later pages are "<base>_<page>.html".
        url = base_url + ".html" if page == 1 else f"{base_url}_{page}.html"
        companies = get_company_info_from_url(url)
        all_companies.extend(companies)
        print(f"{page} 页提取 {len(companies)}")
        time.sleep(2)  # pause between requests to avoid being blocked

    df = pd.DataFrame(all_companies)
    df.to_csv("china_top500_online.csv", index=False, encoding="utf-8-sig")
    print(f"\n✅ 总共提取 {len(df)} 条记录,保存为 'china_top500_online.csv'")

    # A frame built from an empty list has no columns, so indexing
    # df['证券类别'] below would raise KeyError. Skip the chart instead.
    if df.empty:
        print("未提取到任何记录,跳过图表绘制")
        return

    # Optional analysis chart: count of records per securities category.
    category_stats = df['证券类别'].value_counts().reset_index()
    category_stats.columns = ['证券类别', '数量']
    category_stats['占比(%)'] = (category_stats['数量'] / len(df) * 100).round(2)

    plt.figure(figsize=(10, 6))
    plt.pie(
        category_stats['数量'],
        labels=category_stats['证券类别'],
        autopct='%1.1f%%',
        startangle=90,
        textprops={'fontsize': 10},
    )
    plt.title("中国500强公司证券类别占比")
    plt.axis("equal")
    plt.legend(title="证券类别", bbox_to_anchor=(1, 1))
    plt.tight_layout()
    plt.show()
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()