ADD file via upload

3 months ago · 30d3a8156f
parent 2fe1017fac
commit 30d3a8156f
1 changed files with 94 additions and 0 deletions
--- a/crawler.py
+++ b/crawler.py
@ -0,0 +1,94 @@
+import requests
+import pandas as pd
+from bs4 import BeautifulSoup
+import time
+import matplotlib.pyplot as plt
+
+# Configure fonts that contain Chinese glyphs so chart labels render,
+# and disable the Unicode minus sign in axis tick labels.
+plt.rcParams.update({
+    "font.family": ["Microsoft YaHei", "SimHei", "sans-serif"],
+    "axes.unicode_minus": False,
+})
+
+def _parse_company_item(li):
+    """Extract one company record from a ``li.LCliTheOne`` element.
+
+    Returns a dict with the keys '公司名', '法定代表人', '注册时间' and
+    '证券类别', or None when the element has no ``h3 a`` name link.
+    Any field that cannot be located keeps the placeholder "未知".
+    """
+    name_link = li.select_one('h3 a')
+    if name_link is None:
+        return None
+
+    person = reg_time = category = "未知"
+    info_blocks = li.select('div.ColInfo')
+
+    # First info block: legal representative and registration date.
+    if len(info_blocks) > 0:
+        for p in info_blocks[0].select('p'):
+            text = p.get_text()
+            if '法定代表人' in text:
+                person = text.split('：')[-1].strip()
+            elif '注册时间' in text:
+                reg_time = text.split('：')[-1].strip()
+
+    # Second info block: security category. It may be absent, in which
+    # case the record is still kept with the "未知" placeholder.
+    if len(info_blocks) > 1:
+        for p in info_blocks[1].select('p'):
+            text = p.get_text()
+            if '证券类别' in text:
+                category = text.split('：')[-1].strip()
+
+    return {
+        '公司名': name_link.get_text(strip=True),
+        '法定代表人': person,
+        '注册时间': reg_time,
+        '证券类别': category,
+    }
+
+
+def get_company_info_from_url(url):
+    """Download one listing page and return the companies found on it.
+
+    Args:
+        url: Address of the listing page to fetch.
+
+    Returns:
+        A list of dicts, one per ``li.LCliTheOne`` element, with the keys
+        '公司名', '法定代表人', '注册时间' and '证券类别'. Returns an empty
+        list when the HTTP request fails; entries that cannot be parsed
+        are reported on stdout and skipped.
+    """
+    headers = {
+        "User-Agent": "Mozilla/5.0"
+    }
+
+    try:
+        print(f"请求中: {url}")
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+    except requests.RequestException as e:
+        print(f"请求失败: {e}")
+        return []
+
+    # requests falls back to ISO-8859-1 when the Content-Type header has
+    # no charset, which garbles Chinese text; use the encoding detected
+    # from the body in that case.
+    if not response.encoding or response.encoding.lower() == 'iso-8859-1':
+        response.encoding = response.apparent_encoding
+
+    soup = BeautifulSoup(response.text, 'html.parser')
+    company_list = []
+
+    for li in soup.select('li.LCliTheOne'):
+        try:
+            company = _parse_company_item(li)
+        except (AttributeError, IndexError) as e:
+            # Best effort: one malformed entry must not abort the page.
+            print(f"解析出错: {e}")
+            continue
+
+        if company is None:
+            print("解析出错: 缺少公司名")
+            continue
+
+        company_list.append(company)
+
+    return company_list
+
+def main(total_pages=16, delay=2):
+    """Crawl the ranking pages, save the records to CSV and plot a pie chart.
+
+    Args:
+        total_pages: Number of listing pages to request, starting at page 1.
+        delay: Seconds to wait between two consecutive page requests.
+
+    Writes 'china_top500_online.csv' in the current directory and opens a
+    matplotlib window with the share of each security category. When no
+    record could be extracted, nothing is written or plotted.
+    """
+    base_url = "https://top.chinaz.com/gongsitop/index_500top"
+    all_companies = []
+
+    for page in range(1, total_pages + 1):
+        # The first page has no numeric suffix; later ones are "<base>_<n>.html".
+        url = base_url + ".html" if page == 1 else f"{base_url}_{page}.html"
+        companies = get_company_info_from_url(url)
+        all_companies.extend(companies)
+        print(f"第 {page} 页提取 {len(companies)} 条")
+        if page < total_pages:
+            time.sleep(delay)  # throttle requests to avoid being blocked
+
+    if not all_companies:
+        # An empty DataFrame has no '证券类别' column, so the statistics
+        # below would raise KeyError; stop here instead.
+        print("未提取到任何记录，跳过保存和绘图")
+        return
+
+    df = pd.DataFrame(all_companies)
+    df.to_csv("china_top500_online.csv", index=False, encoding="utf-8-sig")
+    print(f"\n✅ 总共提取 {len(df)} 条记录，保存为 'china_top500_online.csv'")
+
+    # Optional analysis: count and percentage of each security category.
+    category_stats = df['证券类别'].value_counts().reset_index()
+    category_stats.columns = ['证券类别', '数量']
+    category_stats['占比(%)'] = (category_stats['数量'] / len(df) * 100).round(2)
+
+    plt.figure(figsize=(10, 6))
+    plt.pie(
+        category_stats['数量'],
+        labels=category_stats['证券类别'],
+        autopct='%1.1f%%',
+        startangle=90,
+        textprops={'fontsize': 10}
+    )
+    plt.title("中国500强公司证券类别占比")
+    plt.axis("equal")  # equal aspect ratio keeps the pie circular
+    plt.legend(title="证券类别", bbox_to_anchor=(1, 1))
+    plt.tight_layout()
+    plt.show()
+
+if __name__ == "__main__":  # run the crawl only when executed as a script
+    main()