# ElizabethLiu/crawler.py

import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import matplotlib.pyplot as plt

# Configure fonts that can render Chinese text and keep the minus sign
# displayable when such a font is active.
plt.rcParams.update({
    "font.family": ["Microsoft YaHei", "SimHei", "sans-serif"],
    "axes.unicode_minus": False,
})

def _value_after_colon(text):
    """Return the stripped text following the last full-width colon."""
    return text.split('：')[-1].strip()


def get_company_info_from_url(url):
    """Download one ranking page and extract the companies listed on it.

    Args:
        url: Address of the page to fetch.

    Returns:
        A list of dicts, one per ``li.LCliTheOne`` element, with the keys
        '公司名', '法定代表人', '注册时间' and '证券类别'. A field that is
        not found in the entry keeps the placeholder "未知". Entries without
        a company-name link are skipped. An empty list is returned when the
        HTTP request fails.
    """
    headers = {
        "User-Agent": "Mozilla/5.0"
    }

    try:
        print(f"请求中: {url}")
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"请求失败: {e}")
        return []

    # Hand BeautifulSoup the raw bytes so it can detect the charset from the
    # document itself. response.text trusts the HTTP header and falls back to
    # ISO-8859-1 when no charset is declared, which garbles Chinese text.
    soup = BeautifulSoup(response.content, 'html.parser')
    company_list = []

    for li in soup.select('li.LCliTheOne'):
        try:
            company_name = li.select_one('h3 a').get_text(strip=True)
        except AttributeError as e:
            # select_one() returned None: the entry has no name link.
            print(f"解析出错: {e}")
            continue

        person = reg_time = category = "未知"
        info_blocks = li.select('div.ColInfo')

        # Slices instead of indexing: an entry with fewer than two info
        # blocks keeps its placeholders rather than being dropped entirely.
        for block in info_blocks[:1]:
            for p in block.select('p'):
                text = p.get_text()
                if '法定代表人' in text:
                    person = _value_after_colon(text)
                elif '注册时间' in text:
                    reg_time = _value_after_colon(text)

        for block in info_blocks[1:2]:
            for p in block.select('p'):
                text = p.get_text()
                if '证券类别' in text:
                    category = _value_after_colon(text)

        company_list.append({
            '公司名': company_name,
            '法定代表人': person,
            '注册时间': reg_time,
            '证券类别': category
        })

    return company_list

def main():
    """Crawl the ranking pages, save the result as CSV and plot a pie chart.

    Pages 1 through 16 are fetched with a two-second pause after each one.
    All extracted records are written to 'china_top500_online.csv'. When at
    least one record was collected, a pie chart of the '证券类别' value
    counts is displayed; otherwise the chart is skipped.
    """
    base_url = "https://top.chinaz.com/gongsitop/index_500top"
    # Fixed column set so the frame (and the CSV header) is well-formed even
    # when no record could be scraped.
    columns = ['公司名', '法定代表人', '注册时间', '证券类别']
    all_companies = []

    for page in range(1, 17):
        # The first page carries no numeric suffix.
        url = f"{base_url}.html" if page == 1 else f"{base_url}_{page}.html"
        companies = get_company_info_from_url(url)
        all_companies.extend(companies)
        print(f"第 {page} 页提取 {len(companies)} 条")
        time.sleep(2)  # pause between requests to avoid being blocked

    df = pd.DataFrame(all_companies, columns=columns)
    df.to_csv("china_top500_online.csv", index=False, encoding="utf-8-sig")
    print(f"\n✅ 总共提取 {len(df)} 条记录，保存为 'china_top500_online.csv'")

    # Nothing to analyse or plot when every request failed; without this
    # guard the statistics below would operate on an empty frame.
    if df.empty:
        return

    # Optional analysis: share of each securities category.
    category_stats = df['证券类别'].value_counts().reset_index()
    # Assign names positionally; the labels produced by reset_index() differ
    # between pandas versions.
    category_stats.columns = ['证券类别', '数量']
    category_stats['占比(%)'] = (category_stats['数量'] / len(df) * 100).round(2)

    plt.figure(figsize=(10, 6))
    plt.pie(
        category_stats['数量'],
        labels=category_stats['证券类别'],
        autopct='%1.1f%%',
        startangle=90,
        textprops={'fontsize': 10}
    )
    plt.title("中国500强公司证券类别占比")
    plt.axis("equal")
    plt.legend(title="证券类别", bbox_to_anchor=(1, 1))
    plt.tight_layout()
    plt.show()

# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()