From 7463dc6adb60e447d89cc38e14dab2a271cbc6e7 Mon Sep 17 00:00:00 2001 From: pfwvrj5cf <1076978369@qq.com> Date: Sun, 11 Dec 2022 17:50:30 +0800 Subject: [PATCH] ADD file via upload --- "\"https:/top.chinaz.com/gongsi/index_zhuce.html\"" | 1 + 1 file changed, 1 insertion(+) create mode 100644 "\"https:/top.chinaz.com/gongsi/index_zhuce.html\"" diff --git "a/\"https:/top.chinaz.com/gongsi/index_zhuce.html\"" "b/\"https:/top.chinaz.com/gongsi/index_zhuce.html\"" new file mode 100644 index 0000000..6e22625 --- /dev/null +++ "b/\"https:/top.chinaz.com/gongsi/index_zhuce.html\"" @@ -0,0 +1 @@ +#### 代码窗口 #1.数据爬取 import requests import re import csv #构建地址 url = "https://top.chinaz.com/gongsi/index_zhuce.html" # 设置请求头信息 # 设置请求头信息 headers = { "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.42" } # 使用reqeusts模快发起 GET 请求 response = requests.get(url, headers=headers) # 获取请求的返回结果 html = response.text # 存储内容 message = [] # 总共903个页面的数据 for page in range(903): # 组装url if page == 0: url = "https://top.chinaz.com/gongsi/index_zhuce.html" else: url = "https://top.chinaz.com/gongsi/index_zhuce_{}.html".format(page + 1) # 使用reqeusts模快发起 GET 请求 response = requests.get(url, headers=headers) html = response.text # 使用 findall 函数来获取数据 # 找公司名 company = re.findall('target="_blank">(.*?)\r\n', html) # 找注册资金 money = re.findall('