# Provenance (git patch header preserved from the original file):
#   From 7463dc6adb60e447d89cc38e14dab2a271cbc6e7 Mon Sep 17 00:00:00 2001
#   From: pfwvrj5cf <1076978369@qq.com>
#   Date: Sun, 11 Dec 2022 17:50:30 +0800
#   Subject: [PATCH] ADD file via upload
#   Adds: "https:/top.chinaz.com/gongsi/index_zhuce.html" (new file, mode 100644)
"""Scrape the chinaz company ranking and plot the first 20 entries.

The script has two stages:

1. Data crawling: download every ranking page, extract (company name,
   registered capital) pairs with regular expressions, and write them to
   ``contents.csv``.
2. Plotting: read ``contents.csv`` back and draw a bar chart of the first
   20 rows.

The original was written as notebook cells. The IPython-only lines
(``!cat contents.csv`` and ``%matplotlib inline``) are replaced by plain
Python so the file also runs as a regular script; in a notebook, run
``%matplotlib inline`` yourself before calling :func:`plot_top`.
"""

import csv
import re
import sys

# First ranking page, and the URL template for every later page.
BASE_URL = "https://top.chinaz.com/gongsi/index_zhuce.html"
PAGED_URL = "https://top.chinaz.com/gongsi/index_zhuce_{}.html"

# The original script states that the ranking spans 903 pages in total.
PAGE_COUNT = 903

CSV_PATH = "contents.csv"

# Number of leading rows to chart; the chart title hard-codes "top20".
TOP_N = 20

# Request headers: present the crawler as a desktop browser.
HEADERS = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.42"
}

# Compiled once instead of on every page.
# Company name: the text between a target="_blank" link opening and CRLF.
COMPANY_RE = re.compile(r'target="_blank">(.*?)\r\n')
# Registered capital: the text preceding "注册资本" on a line of its own.
# NOTE(review): in the original file this string literal was split across
# physical lines, which suggests surrounding HTML markup was lost when the
# file was extracted. The visible newlines are kept literally here; confirm
# the pattern against the live page markup.
MONEY_RE = re.compile("\n(.*?)注册资本\n")


def page_url(page):
    """Return the URL of the zero-based ranking page *page*.

    Page 0 is the bare index; page ``k`` (k >= 1) is ``index_zhuce_{k+1}``.
    """
    if page == 0:
        return BASE_URL
    return PAGED_URL.format(page + 1)


def parse_page(html):
    """Extract ``(company, money)`` string pairs from one page of HTML.

    Names and amounts are matched independently and paired positionally;
    if the two counts differ, the surplus items are dropped (``zip``).
    """
    return list(zip(COMPANY_RE.findall(html), MONEY_RE.findall(html)))


def parse_amount(text):
    """Convert a scraped amount to ``float`` by dropping its last character.

    The last character is presumably a one-character unit suffix (the chart
    axis is labelled in 亿元) -- verify against the scraped data.

    Raises:
        ValueError: if the remainder is not a valid float literal.
    """
    return float(text[:-1])


def scrape(pages=PAGE_COUNT, timeout=30):
    """Download *pages* ranking pages and return all extracted rows.

    Args:
        pages: number of pages to fetch, starting from the first.
        timeout: per-request timeout in seconds, so a stalled connection
            cannot hang the crawl indefinitely.

    Returns:
        A list of ``(company, money)`` string tuples in page order.
    """
    # Imported here so the parsing helpers stay usable without the HTTP stack.
    import requests

    rows = []
    for page in range(pages):
        url = page_url(page)
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        if not response.ok:
            # Keep the crawl best-effort, but do not silently parse an
            # error page as if it were ranking data.
            print(
                f"skipping {url}: HTTP {response.status_code}",
                file=sys.stderr,
            )
            continue
        rows.extend(parse_page(response.text))
    return rows


def save_rows(rows, path=CSV_PATH):
    """Write *rows* to *path* as UTF-8 CSV.

    ``newline=""`` is what the ``csv`` module requires to avoid blank lines
    on Windows, and the explicit encoding matches the UTF-8 read in
    :func:`plot_top`.
    """
    with open(path, "w", newline="", encoding="utf-8") as f:
        csv.writer(f).writerows(rows)


def print_file(path=CSV_PATH):
    """Print the contents of *path* (plain-Python stand-in for ``!cat``)."""
    with open(path, encoding="utf-8") as f:
        print(f.read(), end="")


def plot_top(path=CSV_PATH):
    """Draw a bar chart of the first ``TOP_N`` rows of the CSV at *path*.

    The rows are taken in file order, so the chart shows the largest
    companies only if the site already lists them in that order -- the
    code does not sort.
    """
    # Imported here so scraping does not require the plotting stack.
    import matplotlib.pyplot as plt
    import pandas as pd

    df = pd.read_csv(path, names=["company", "money"], encoding="utf-8")
    top = df.head(TOP_N)
    companies = top["company"].tolist()
    amounts = [parse_amount(value) for value in top["money"]]
    positions = range(len(companies))

    # Use the SimHei font so the Chinese labels render.
    plt.rcParams["font.sans-serif"] = ["SimHei"]
    plt.figure(figsize=(20, 10))
    # Bars are placed by position (not by name) so they stay aligned with
    # the tick labels and value annotations even if a name repeats.
    plt.bar(positions, amounts)
    plt.xticks(positions, companies, rotation=90)
    # Annotate each bar with its value.
    for x, y in enumerate(amounts):
        plt.text(x, y, y, ha="center", va="bottom")
    plt.title("注册资金最多的公司 top20")
    plt.xlabel("公司")
    plt.ylabel("注册资金(亿元)")
    plt.show()


def main():
    """Run the full pipeline: crawl, save, echo the CSV, then plot."""
    save_rows(scrape())
    print_file()
    plot_top()


if __name__ == "__main__":
    main()