|
|
|
@ -0,0 +1,94 @@
|
|
|
|
|
import requests
|
|
|
|
|
import pandas as pd
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
import time
|
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
|
|
|
|
|
# Configure a font that can render Chinese text in the charts.
plt.rcParams.update({
    # Fonts are tried in order; the first one installed is used.
    "font.family": ["Microsoft YaHei", "SimHei", "sans-serif"],
    # Draw negative signs with an ASCII hyphen instead of the Unicode minus
    # (presumably because the fonts above may lack that glyph).
    "axes.unicode_minus": False,
})
|
|
|
|
|
|
|
|
|
|
def get_company_info_from_url(url):
    """Download one ranking page and extract the companies listed on it.

    Args:
        url: Address of the ranking page to fetch.

    Returns:
        A list of dicts, one per company, with the keys ``公司名``,
        ``法定代表人``, ``注册时间`` and ``证券类别``. A field whose label is
        not found on the page keeps the placeholder ``"未知"``. The list is
        empty when the request fails; list items that cannot be parsed are
        reported on stdout and skipped.
    """
    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        print(f"请求中: {url}")
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except Exception as e:
        # Deliberately broad: this is the per-page boundary of a best-effort
        # crawl, so any failure is reported and the page is skipped.
        print(f"请求失败: {e}")
        return []

    # requests assumes ISO-8859-1 for text/* responses whose Content-Type
    # header carries no charset, which would garble the Chinese labels matched
    # below. Fall back to the encoding detected from the body in that case.
    if (response.encoding or "").lower() == "iso-8859-1":
        response.encoding = response.apparent_encoding

    soup = BeautifulSoup(response.text, 'html.parser')
    company_list = []

    for li in soup.select('li.LCliTheOne'):
        try:
            company_list.append(_parse_company_item(li))
        except (AttributeError, IndexError) as e:
            # AttributeError: the name link is missing (select_one gave None).
            # IndexError: fewer than two ``div.ColInfo`` blocks in the item.
            print(f"解析出错: {e}")

    return company_list


def _label_value(text):
    """Return the stripped text that follows the last ':' in *text*."""
    return text.split(':')[-1].strip()


def _parse_company_item(li):
    """Build the record dict for one ``li.LCliTheOne`` element.

    Raises AttributeError or IndexError when the expected markup is missing.
    """
    company_name = li.select_one('h3 a').get_text(strip=True)
    info_blocks = li.select('div.ColInfo')

    # Placeholders used when a label is not present in the item.
    person = reg_time = category = "未知"

    # The first info block is searched for the representative and the
    # registration date.
    for p in info_blocks[0].select('p'):
        text = p.get_text()
        if '法定代表人' in text:
            person = _label_value(text)
        elif '注册时间' in text:
            reg_time = _label_value(text)

    # The second info block is searched for the security category.
    for p in info_blocks[1].select('p'):
        text = p.get_text()
        if '证券类别' in text:
            category = _label_value(text)

    return {
        '公司名': company_name,
        '法定代表人': person,
        '注册时间': reg_time,
        '证券类别': category,
    }
|
|
|
|
|
|
|
|
|
|
def main():
    """Scrape all ranking pages, save the records to CSV and chart them.

    Pages 1 through 16 are fetched with a two-second pause after each one.
    The combined records are written to ``china_top500_online.csv`` and a pie
    chart of the ``证券类别`` distribution is shown. When no record was
    extracted the chart is skipped.
    """
    base_url = "https://top.chinaz.com/gongsitop/index_500top"
    all_companies = []

    for page in range(1, 17):
        # The first page has no numeric suffix; later ones end in "_<page>".
        url = base_url + ".html" if page == 1 else f"{base_url}_{page}.html"
        companies = get_company_info_from_url(url)
        all_companies.extend(companies)
        print(f"第 {page} 页提取 {len(companies)} 条")
        time.sleep(2)  # throttle requests to avoid being blocked

    df = pd.DataFrame(all_companies)
    df.to_csv("china_top500_online.csv", index=False, encoding="utf-8-sig")
    print(f"\n✅ 总共提取 {len(df)} 条记录,保存为 'china_top500_online.csv'")

    if df.empty:
        # With no records the frame has no columns, so the category lookup
        # in the analysis below would raise KeyError.
        print("未提取到任何记录,跳过图表分析")
        return

    _plot_category_share(df)


def _plot_category_share(df):
    """Show a pie chart of how the records split across ``证券类别``."""
    category_stats = df['证券类别'].value_counts().reset_index()
    category_stats.columns = ['证券类别', '数量']
    # Percentage share per category; the chart itself uses only the counts.
    category_stats['占比(%)'] = (
        category_stats['数量'] / len(df) * 100
    ).round(2)

    plt.figure(figsize=(10, 6))
    plt.pie(
        category_stats['数量'],
        labels=category_stats['证券类别'],
        autopct='%1.1f%%',
        startangle=90,
        textprops={'fontsize': 10},
    )
    plt.title("中国500强公司证券类别占比")
    plt.axis("equal")  # equal aspect ratio keeps the pie circular
    plt.legend(title="证券类别", bbox_to_anchor=(1, 1))
    plt.tight_layout()
    plt.show()
|
|
|
|
|
|
|
|
|
|
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|