diff --git "a/\"https:/top.chinaz.com/gongsi/index_zhuce.html\"" "b/\"https:/top.chinaz.com/gongsi/index_zhuce.html\"" new file mode 100644 index 0000000..6e22625 --- /dev/null +++ "b/\"https:/top.chinaz.com/gongsi/index_zhuce.html\"" @@ -0,0 +1 @@ +#### 代码窗口 #1.数据爬取 import requests import re import csv #构建地址 url = "https://top.chinaz.com/gongsi/index_zhuce.html" # 设置请求头信息 # 设置请求头信息 headers = { "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.42" } # 使用reqeusts模快发起 GET 请求 response = requests.get(url, headers=headers) # 获取请求的返回结果 html = response.text # 存储内容 message = [] # 总共903个页面的数据 for page in range(903): # 组装url if page == 0: url = "https://top.chinaz.com/gongsi/index_zhuce.html" else: url = "https://top.chinaz.com/gongsi/index_zhuce_{}.html".format(page + 1) # 使用reqeusts模快发起 GET 请求 response = requests.get(url, headers=headers) html = response.text # 使用 findall 函数来获取数据 # 找公司名 company = re.findall('target="_blank">(.*?)\r\n', html) # 找注册资金 money = re.findall('