# -*- coding: utf-8 -*-
"""Scrape company listing pages from top.chinaz.com and chart registered capital.

The script downloads the listing pages, extracts company names and their
registered-capital strings with regular expressions, prints both lists, and
draws a bar chart of the 20 largest registered capitals.

Created on Thu Jun 6 15:40:29 2024

@author: Asus
"""
import re

BASE_URL = "https://top.chinaz.com/gongsi/index_zhuce"
PAGE_COUNT = 10
TOP_N = 20
REQUEST_TIMEOUT = 10  # seconds; keeps a stalled connection from hanging the run

HEADERS = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
}

# NOTE(review): in the copy of this script under review the HTML tags that
# anchored these two patterns appear to have been stripped, leaving only the
# capture groups and line breaks.  Restore the original markup anchors before
# running against the live site.  The name pattern presumably had two capture
# groups, because the original code kept element [1] of every match.
NAME_PATTERN = r"\n\n(.+?)\n\n"
MONEY_PATTERN = r"\n(.+?)注册资本\n"

# Divisor that converts a value expressed in the given unit into 亿
# (hundred million), the unit named in the chart's y-axis label.
_UNIT_DIVISORS = {"亿": 1, "万": 10_000}


def build_page_url(page_index):
    """Return the listing URL for the zero-based *page_index*.

    Index 0 maps to the un-suffixed first page; every other index is appended
    as ``_<index>``, exactly as the original script did.
    """
    # NOTE(review): confirm that "index_zhuce_1.html" exists on the site; some
    # paginated sites number their second page "_2".
    if page_index == 0:
        return BASE_URL + ".html"
    return BASE_URL + "_" + str(page_index) + ".html"


def fetch_page(url):
    """Download *url* and return its decoded HTML text.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx HTTP status.
    """
    # Imported here so the pure parsing helpers in this module can be used
    # without the third-party dependency installed.
    import requests

    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # When the server declares no charset, requests falls back to ISO-8859-1
    # for text responses, which would garble the Chinese text the patterns
    # rely on; use the encoding detected from the body instead.
    if response.encoding is None or response.encoding.lower() == "iso-8859-1":
        response.encoding = response.apparent_encoding
    return response.text


def extract_companies(html, name_pattern=NAME_PATTERN, money_pattern=MONEY_PATTERN):
    """Extract company names and capital strings from one page of HTML.

    Args:
        html: Page source to search.
        name_pattern: Regex for company names.  If it has several capture
            groups, the second group is taken as the name (matching the
            original script); with a single group the match itself is used.
        money_pattern: Regex whose capture group is the capital string.

    Returns:
        A ``(names, capitals)`` tuple of two lists of strings.
    """
    names = [
        match[1] if isinstance(match, tuple) else match
        for match in re.findall(name_pattern, html)
    ]
    capitals = re.findall(money_pattern, html)
    return names, capitals


def parse_capital(text):
    """Convert a capital string such as ``"3500亿"`` to a float in units of 亿.

    The last character is the unit (亿 or 万) and the rest is the number.
    ``float`` is used instead of ``eval`` because the text comes from an
    untrusted web page.

    Raises:
        ValueError: if the unit is unknown or the number cannot be parsed.
    """
    cleaned = text.strip().replace(",", "")
    if not cleaned:
        raise ValueError("empty capital string")
    unit = cleaned[-1]
    if unit not in _UNIT_DIVISORS:
        raise ValueError(f"unknown capital unit in {text!r}")
    return float(cleaned[:-1]) / _UNIT_DIVISORS[unit]


def top_companies(pairs, limit=TOP_N):
    """Return the *limit* largest ``(name, capital_in_yi)`` pairs.

    Args:
        pairs: Iterable of ``(name, capital_string)`` tuples.
        limit: Maximum number of entries to return.

    Entries whose capital string cannot be parsed are reported and skipped.
    The sort is stable, so equal capitals keep their scraped order.
    """
    parsed = []
    for name, capital_text in pairs:
        try:
            parsed.append((name, parse_capital(capital_text)))
        except ValueError as exc:
            print(f"Skipping {name!r}: {exc}")
    parsed.sort(key=lambda pair: pair[1], reverse=True)
    return parsed[:limit]


def plot_top_companies(rows):
    """Draw a bar chart for *rows*, a list of ``(name, capital_in_yi)`` pairs."""
    if not rows:
        print("No company data to plot.")
        return

    import matplotlib.pyplot as plt

    plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
    companies, capitals = zip(*rows)
    plt.figure(figsize=(10, 6))
    plt.bar(companies, capitals)
    plt.title("Top 20 Companies by Registered Capital")
    plt.xlabel("Company Name")
    plt.ylabel("Registered Capital (a hundred million)")
    plt.xticks(rotation=90)
    plt.show()


def main():
    """Scrape every listing page, print the raw lists, and plot the top 20."""
    import requests

    company_name_list = []
    company_money_list = []
    for page_index in range(PAGE_COUNT):
        url = build_page_url(page_index)
        try:
            html = fetch_page(url)
        except requests.RequestException as exc:
            # Best effort: one bad page should not abort the whole run.
            print(f"Failed to fetch {url}: {exc}")
            continue
        names, capitals = extract_companies(html)
        company_name_list.extend(names)
        company_money_list.extend(capitals)

    print(company_name_list)
    print(company_money_list)

    if len(company_name_list) != len(company_money_list):
        # zip() below truncates silently, so the pairing may be misaligned.
        print("Warning: name and capital counts differ; pairs may be misaligned.")

    plot_top_companies(top_companies(zip(company_name_list, company_money_list)))


if __name__ == "__main__":
    main()