You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
crawler/2.500强公司信息爬取.py

39 lines
1.2 KiB

import requests
import pandas as pd
import matplotlib.pyplot as plt
import re
# Make SimHei the sans-serif font for all figures; presumably so that the
# Chinese labels and title used by the chart below render as real glyphs.
plt.rcParams['font.sans-serif'] = ["SimHei"]
def crawl_data(
    url="https://top.chinaz.com/gongsitop/index_500top.html",
    output_path="500qiang.csv",
    timeout=10,
    html=None,
):
    """Scrape the company ranking page and save the extracted fields as CSV.

    Args:
        url: Page to download. Ignored when ``html`` is supplied.
        output_path: Destination CSV file; overwritten if it already exists.
        timeout: Seconds to wait for the server before giving up.
        html: Already-downloaded page source. When given, no HTTP request is
            made, which allows re-parsing a saved copy of the page.

    Returns:
        The ``pandas.DataFrame`` that was written to ``output_path``, with
        the columns 公司名, 法定代表人, 注册时间 and 证券类别.

    Raises:
        requests.RequestException: The download failed, timed out, or the
            server answered with an HTTP error status.
        ValueError: The four fields were not matched the same number of
            times, so they cannot be lined up row by row.
    """
    if html is None:
        html = _fetch_page(url, timeout)

    # One pattern per output column; each captures the text of one field.
    patterns = {
        "公司名": r'<a.*?target="_blank">(.+?)</a></h3>',
        "法定代表人": r'法定代表人:</span>(.+?)</p>',
        "注册时间": r'注册时间:</span>(.+?)</p>',
        "证券类别": r'证券类别:</span>(.+?)</p>',
    }
    columns = {
        name: re.findall(pattern, html) for name, pattern in patterns.items()
    }

    # The columns are matched independently, so a company that lacks one
    # field would shift every later row. Refuse to build a misaligned table
    # and report the per-column counts instead.
    counts = {name: len(values) for name, values in columns.items()}
    if len(set(counts.values())) > 1:
        raise ValueError(
            "Scraped fields have different lengths and cannot be aligned: "
            "{}".format(counts)
        )

    df = pd.DataFrame(columns)
    df.to_csv(output_path, index=False)
    return df


def _fetch_page(url, timeout):
    """Download ``url`` and return its body decoded to text."""
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into an
    # empty CSV.
    response.raise_for_status()
    # When the Content-Type header carries no charset, requests decodes
    # text/* bodies as ISO-8859-1, which garbles Chinese text and makes the
    # field patterns miss. Fall back to the encoding detected from the body.
    if "charset" not in response.headers.get("Content-Type", "").lower():
        response.encoding = response.apparent_encoding
    return response.text
def analyze_data():
    """Draw a pie chart of the 证券类别 column stored in 500qiang.csv.

    Reads the CSV from the current working directory, counts how often each
    category value occurs, converts the counts to percentages of the total
    row count, and shows the resulting pie chart in a matplotlib window.
    """
    frame = pd.read_csv("500qiang.csv")
    category_counts = frame["证券类别"].value_counts()

    # Share of each category relative to every row in the file, in percent.
    percentages = category_counts / len(frame) * 100

    plt.pie(percentages, labels=category_counts.index, autopct='%1.1f%%')
    # Equal axis scaling keeps the pie circular instead of elliptical.
    plt.axis('equal')
    plt.title("500 强公司证券占比")
    plt.show()
# Run the whole pipeline when the file is executed: first scrape the page
# into 500qiang.csv, then read that file back and chart it.
crawl_data()
analyze_data()