"""Scrape the chinaz top-500 company listing and chart its securities mix.

Running the script fetches the listing page, extracts four fields with
regular expressions, stores them in a CSV file, then shows a pie chart of
how often each securities category occurs.
"""

import re

import matplotlib.pyplot as plt
import pandas as pd
import requests

# Use the SimHei font so the Chinese chart title and labels can be rendered.
plt.rcParams['font.sans-serif'] = ["SimHei"]

SOURCE_URL = "https://top.chinaz.com/gongsitop/index_500top.html"
CSV_PATH = "500qiang.csv"
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests can block forever

# CSV column name -> regex whose single capture group yields that column.
# NOTE(review): the company-name pattern has no literal anchors, so findall
# captures every single (non-newline) character of the page, and the other
# three patterns end in two literal newlines. This looks like HTML tags were
# stripped from the patterns at some point -- confirm against the live page
# markup and restore the intended anchors.
FIELD_PATTERNS = {
    "公司名": '(.+?)',
    "法定代表人": '法定代表人:(.+?)\n\n',
    "注册时间": '注册时间:(.+?)\n\n',
    "证券类别": '证券类别:(.+?)\n\n',
}


def crawl_data(url=SOURCE_URL, output_path=CSV_PATH):
    """Download the listing page and save the extracted fields as CSV.

    Args:
        url: Page to fetch. Defaults to the chinaz top-500 listing.
        output_path: Destination CSV file. Defaults to ``500qiang.csv``.

    Raises:
        requests.RequestException: The request failed, timed out, or the
            server answered with an HTTP error status.
        ValueError: The patterns matched a different number of times, so the
            columns cannot be lined up row by row.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly instead of scraping an error page.
    response.raise_for_status()
    html = response.text

    columns = {
        name: re.findall(pattern, html)
        for name, pattern in FIELD_PATTERNS.items()
    }

    # pandas rejects unequal-length columns with a generic message; report
    # the per-column match counts so a broken pattern is easy to spot.
    match_counts = {name: len(values) for name, values in columns.items()}
    if len(set(match_counts.values())) > 1:
        raise ValueError(
            f"extracted columns have different lengths: {match_counts}"
        )

    pd.DataFrame(columns).to_csv(output_path, index=False)


def analyze_data(csv_path=CSV_PATH):
    """Show a pie chart of the securities-category share in the saved CSV.

    Args:
        csv_path: CSV file written by :func:`crawl_data`. Defaults to
            ``500qiang.csv``.
    """
    df = pd.read_csv(csv_path)
    securities_counts = df["证券类别"].value_counts()

    # Percentage of all rows that fall into each category.
    sizes = securities_counts / len(df) * 100

    plt.pie(sizes, labels=securities_counts.index, autopct='%1.1f%%')
    plt.axis('equal')  # equal aspect ratio keeps the pie circular
    plt.title("500 强公司证券占比")
    plt.show()


def main():
    """Scrape the listing, then plot the result."""
    crawl_data()
    analyze_data()


if __name__ == "__main__":
    main()