# CCCC/2024头歌大作业.py

# -*- coding: utf-8 -*-
"""
Created on Sun May 26 12:41:54 2024

@author: Panda
"""

import requests
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

# Common URL prefix of the ranking pages; a page suffix and ".html" are
# appended to it when a page is requested.
url_head = "https://top.chinaz.com/gongsitop/index_500top"

# HTTP headers sent with every request.
Headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/104.0.5112.102"}

# Module-level list of lists that accumulates one row per scraped company.
tl = []

# Core crawler code.
def _field_value(field):
    """Return the part of *field* after its first full-width colon.

    If *field* contains no full-width colon the whole string is returned.
    """
    return field[field.find('：') + 1:]


def _parse_company_text(text):
    """Turn the text of one "CoListTxt" entry into an output row.

    Args:
        text: Text content of the entry's <div>.

    Returns:
        ``[company, person, capital, date, stock_type, stock_code]``, or
        ``None`` when the entry has fewer than the four tokens needed for
        the first four fields.
    """
    # Flatten the text to one line and split on single spaces, dropping the
    # empty strings produced by consecutive spaces.
    tokens = [tok for tok in text.strip('\n').replace('\n', ' ').split(' ') if tok]

    if len(tokens) >= 8:
        # Entries with 8+ tokens have their third field split in two
        # (presumably the capital value contained a space -- confirm against
        # the page). Rejoin by position; list.remove() would delete the first
        # element with an equal value, which may be a different token.
        tokens[2:4] = [tokens[2] + tokens[3]]

    if len(tokens) < 4:
        return None

    company = tokens[0]
    person = _field_value(tokens[1])
    capital = _field_value(tokens[2])
    date = _field_value(tokens[3])

    if len(tokens) >= 7:
        stock_type = _field_value(tokens[4])
        stock_code = _field_value(tokens[5])
    else:
        # No securities information on this entry.
        stock_type = '暂无'
        stock_code = None

    return [company, person, capital, date, stock_type, stock_code]


def crawl(index):
    """Fetch one ranking page and append its company rows to ``tl``.

    Args:
        index: Page suffix placed between ``url_head`` and ``".html"``
            (``main`` passes ``''`` for the first page and ``'_N'`` for the
            other pages).

    Returns:
        None. Every parsed entry is appended to the module-level list ``tl``.
        A page whose HTTP status is not 200 is skipped with a message, and
        entries with too few fields are ignored.

    Raises:
        requests.RequestException: If the request itself fails, including
            when the timeout expires.
    """
    url = url_head + index + ".html"
    # A timeout keeps an unresponsive server from blocking the run forever.
    response = requests.get(url, headers=Headers, timeout=10)
    if response.status_code != 200:
        # Without a page body there is nothing to parse for this page.
        print(f"Skipping {url}: HTTP status {response.status_code}")
        return

    # Parse the page HTML with the parser from the standard library.
    soup = BeautifulSoup(response.text, features='html.parser')

    for div in soup.find_all('div'):
        # Tag.get() returns None for a <div> without a class attribute,
        # whereas div["class"] would raise KeyError on such tags.
        if div.get("class") != ["CoListTxt"]:
            continue
        row = _parse_company_text(div.text)
        if row is not None:
            # Add the single scraped record to the shared list of rows.
            tl.append(row)


def main():
    """Crawl all ranking pages, save the rows to CSV and plot category shares.

    Calls ``crawl`` once per page, builds a DataFrame from the rows collected
    in ``tl``, writes it to ``result.csv``, prints the percentage of each
    value in the securities column and shows a pie chart of the first three
    entries of that result.
    """
    page_count = 16  # total number of pages on the site

    # The first page is requested without a suffix; page N uses "_N".
    suffixes = [''] + ['_' + str(page) for page in range(2, page_count + 1)]
    for suffix in suffixes:
        crawl(suffix)

    # Convert the list of rows into a DataFrame.
    column_names = ['公司', '法人', '注册资本', '注册时间', '证券', '股票代码']
    df = pd.DataFrame(tl, columns=column_names)

    # Save the table as a CSV file.
    df.to_csv('result.csv', index=False, encoding='utf_8_sig')

    # Percentage of rows per securities category.
    shares = df['证券'].value_counts(normalize=True) * 100
    print(shares)

    # Visualise the first three entries as a pie chart.
    top_shares = shares.head(3)
    plt.rcParams.update({
        'font.sans-serif': 'SimHei',
        'axes.unicode_minus': False,
    })
    top_shares.plot.pie(
        autopct='%1.1f%%',
        startangle=90,
        shadow=False,
        labels=top_shares.index,
    )
    plt.title('500强企业证券类别占比饼图')
    plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
    plt.show()

# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()