ADD file via upload

main
hnu202410040525 3 months ago
parent 2fe1017fac
commit 30d3a8156f

@ -0,0 +1,94 @@
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import matplotlib.pyplot as plt
# Configure a Chinese-capable font fallback list for all figures and turn off
# the Unicode minus glyph on axes.
plt.rcParams.update({
    "font.family": ["Microsoft YaHei", "SimHei", "sans-serif"],
    "axes.unicode_minus": False,
})
def _value_after_label(text):
    """Return the stripped part of *text* that follows its first colon.

    Both the full-width colon (U+FF1A) and the ASCII colon are accepted as
    the label/value separator. When *text* contains neither, the whole
    stripped text is returned.
    """
    # Normalise the full-width colon to ASCII so a single partition covers both.
    _, separator, value = text.replace("\uff1a", ":").partition(":")
    return (value if separator else text).strip()


def get_company_info_from_url(url):
    """Download one listing page and extract the company records on it.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A list of dicts, one per successfully parsed ``li.LCliTheOne``
        element, with the keys '公司名', '法定代表人', '注册时间' and
        '证券类别'. Fields whose label is not found keep the placeholder
        "未知". An empty list is returned when the request fails.
    """
    headers = {
        "User-Agent": "Mozilla/5.0"
    }
    print(f"请求中: {url}")
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"请求失败: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    company_list = []
    for li in soup.select('li.LCliTheOne'):
        # Basic information; placeholders are kept when a label is missing.
        person = reg_time = category = "未知"
        try:
            company_name = li.select_one('h3 a').get_text(strip=True)
            info_blocks = li.select('div.ColInfo')
            # First info block: legal representative and registration date.
            for p in info_blocks[0].select('p'):
                text = p.get_text()
                if '法定代表人' in text:
                    person = _value_after_label(text)
                elif '注册时间' in text:
                    reg_time = _value_after_label(text)
            # Second info block: securities category.
            for p in info_blocks[1].select('p'):
                text = p.get_text()
                if '证券类别' in text:
                    category = _value_after_label(text)
        except (AttributeError, IndexError) as e:
            # Missing title link (None) or fewer than two info blocks:
            # report and skip this entry, keep processing the rest.
            print(f"解析出错: {e}")
            continue
        company_list.append({
            '公司名': company_name,
            '法定代表人': person,
            '注册时间': reg_time,
            '证券类别': category
        })
    return company_list
def _plot_category_share(df):
    """Show a pie chart of how the rows of *df* split over '证券类别'."""
    category_stats = df['证券类别'].value_counts().reset_index()
    category_stats.columns = ['证券类别', '数量']
    category_stats['占比(%)'] = round(category_stats['数量'] / len(df) * 100, 2)
    plt.figure(figsize=(10, 6))
    plt.pie(
        category_stats['数量'],
        labels=category_stats['证券类别'],
        autopct='%1.1f%%',
        startangle=90,
        textprops={'fontsize': 10}
    )
    plt.title("中国500强公司证券类别占比")
    plt.axis("equal")
    plt.legend(title="证券类别", bbox_to_anchor=(1, 1))
    plt.tight_layout()
    plt.show()


def main():
    """Scrape listing pages 1-16, save the records to CSV and chart them.

    The first page is ``<base_url>.html``; later pages are
    ``<base_url>_<page>.html``. All records are written to
    'china_top500_online.csv' (UTF-8 with BOM). A pie chart of the
    '证券类别' distribution is shown when at least one record was scraped.
    """
    base_url = "https://top.chinaz.com/gongsitop/index_500top"
    all_companies = []
    for page in range(1, 17):
        url = base_url + ".html" if page == 1 else f"{base_url}_{page}.html"
        companies = get_company_info_from_url(url)
        all_companies.extend(companies)
        print(f"{page} 页提取 {len(companies)}")
        time.sleep(2)  # pause between requests to avoid being blocked

    df = pd.DataFrame(all_companies)
    df.to_csv("china_top500_online.csv", index=False, encoding="utf-8-sig")
    print(f"\n✅ 总共提取 {len(df)} 条记录,保存为 'china_top500_online.csv'")

    if not all_companies:
        # With no rows the DataFrame has no '证券类别' column, so building
        # the chart would raise KeyError; there is nothing to plot anyway.
        print("没有可用数据,跳过图表绘制")
        return

    # Optional analysis chart.
    _plot_category_share(df)


if __name__ == "__main__":
    main()
Loading…
Cancel
Save