"""Crawl the chinaz.com "China Top 500" company ranking (crawler.py).

The script downloads each ranking page, extracts every company's name, legal
representative, registration time and security category, saves the rows to a
CSV file and shows a pie chart of the security-category distribution.
"""

import re
import time

import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Fonts that can render the Chinese chart labels, tried in order.
plt.rcParams["font.family"] = ["Microsoft YaHei", "SimHei", "sans-serif"]
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams["axes.unicode_minus"] = False

BASE_URL = "https://top.chinaz.com/gongsitop/index_500top"
PAGE_COUNT = 16
REQUEST_DELAY_SECONDS = 2
OUTPUT_CSV = "china_top500_online.csv"

# Placeholder stored when a field cannot be found on the page.
UNKNOWN = "未知"

# A label is separated from its value by an ASCII or a full-width colon.
_LABEL_SEPARATOR = re.compile("[:\uff1a]")


def _field_value(text):
    """Return the value part of a ``label:value`` line.

    Only the first separator is split on, so a value that itself contains a
    colon is kept whole. Text without any separator is returned stripped, and
    an empty value falls back to ``UNKNOWN``.
    """
    parts = _LABEL_SEPARATOR.split(text, maxsplit=1)
    return parts[-1].strip() or UNKNOWN


def _parse_company(item):
    """Build one company record from a ``li.LCliTheOne`` element.

    Returns:
        A dict with the keys ``公司名``, ``法定代表人``, ``注册时间`` and
        ``证券类别``, or ``None`` when the element has no company-name link.
        Fields that are not present on the page are set to ``UNKNOWN``.
    """
    name_link = item.select_one("h3 a")
    if name_link is None:
        print("解析出错: 未找到公司名称")
        return None

    person = reg_time = category = UNKNOWN
    info_blocks = item.select("div.ColInfo")

    # Slicing instead of indexing: a missing info block simply leaves the
    # defaults in place rather than discarding the whole company.
    for block in info_blocks[:1]:
        # Basic information: legal representative and registration time.
        for paragraph in block.select("p"):
            text = paragraph.get_text()
            if "法定代表人" in text:
                person = _field_value(text)
            elif "注册时间" in text:
                reg_time = _field_value(text)

    for block in info_blocks[1:2]:
        for paragraph in block.select("p"):
            text = paragraph.get_text()
            if "证券类别" in text:
                category = _field_value(text)

    return {
        "公司名": name_link.get_text(strip=True),
        "法定代表人": person,
        "注册时间": reg_time,
        "证券类别": category,
    }


def get_company_info_from_url(url):
    """Fetch one ranking page and return the companies listed on it.

    Args:
        url: Address of the ranking page to download.

    Returns:
        A list of company dicts (see ``_parse_company``). The list is empty
        when the request fails or the page contains no company entries.
    """
    headers = {"User-Agent": "Mozilla/5.0"}

    print(f"请求中: {url}")
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"请求失败: {e}")
        return []

    # Hand BeautifulSoup the raw bytes so it detects the page encoding itself;
    # response.text depends on the HTTP header charset and can garble Chinese.
    soup = BeautifulSoup(response.content, "html.parser")

    company_list = []
    for item in soup.select("li.LCliTheOne"):
        company = _parse_company(item)
        if company is not None:
            company_list.append(company)
    return company_list


def main():
    """Crawl all ranking pages, save the result as CSV and plot the categories."""
    all_companies = []

    for page in range(1, PAGE_COUNT + 1):
        # The first page has no numeric suffix in its file name.
        url = f"{BASE_URL}.html" if page == 1 else f"{BASE_URL}_{page}.html"
        companies = get_company_info_from_url(url)
        all_companies.extend(companies)
        print(f"第 {page} 页提取 {len(companies)} 条")
        if page < PAGE_COUNT:
            # Pause between requests to avoid being blocked for crawling too fast.
            time.sleep(REQUEST_DELAY_SECONDS)

    if not all_companies:
        # An empty frame has no '证券类别' column, so the analysis below would
        # fail; stop before overwriting any existing CSV with an empty file.
        print("未提取到任何公司数据,已跳过保存与绘图")
        return

    df = pd.DataFrame(all_companies)
    df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")
    print(f"\n✅ 总共提取 {len(df)} 条记录,保存为 '{OUTPUT_CSV}'")

    # Optional analysis: number and share of companies per security category.
    category_stats = df["证券类别"].value_counts().reset_index()
    category_stats.columns = ["证券类别", "数量"]
    # Kept in the table only; the pie chart computes its own labels via autopct.
    category_stats["占比(%)"] = (category_stats["数量"] / len(df) * 100).round(2)

    plt.figure(figsize=(10, 6))
    plt.pie(
        category_stats["数量"],
        labels=category_stats["证券类别"],
        autopct="%1.1f%%",
        startangle=90,
        textprops={"fontsize": 10},
    )
    plt.title("中国500强公司证券类别占比")
    plt.axis("equal")
    plt.legend(title="证券类别", bbox_to_anchor=(1, 1))
    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    main()