You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
54 lines
4.0 KiB
54 lines
4.0 KiB
import requests
|
|
import re
|
|
from bs4 import BeautifulSoup
|
|
import csv
|
|
|
|
# Scrape job listings from the 58.com Beijing portal into my.csv.
#
# Flow: append the CSV header, fetch the home page, pull the recruitment
# category links out of it with a regex, then fetch every category page and
# append one CSV row per listing found on it.

CSV_PATH = "my.csv"
# Explicit encoding so the Chinese text does not depend on the platform's
# default locale; the BOM written at the start of a new file lets spreadsheet
# tools detect UTF-8.
CSV_ENCODING = "utf-8-sig"
HOME_URL = "https://bj.58.com/"
BASE_URL = "https://bj.58.com"
# Without a timeout requests waits indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10  # seconds
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0"
)
# SECURITY NOTE(review): this is a captured login session hard-coded into the
# source. It is kept verbatim to preserve behaviour, but it should be rotated
# and loaded from configuration or the environment instead of being committed.
SESSION_COOKIE = (
    "commontopbar_new_city_info=1%7C%E5%8C%97%E4%BA%AC%7Cbj; f=n; id58=CocGoGZhUbuBt0Zw3NTxAg==; city=bj; 58tj_uuid=d37953ae-6bb8-4fcf-adcf-977f5cfb37a8; als=0; myfeet_tooltip=end; 58home=bj; xxzl_deviceid=ofCiQEjCMkBdfoHqOBulLDRpu%2BPi2oGqf%2Bppd0H9RkNHHC%2B%2BcTSvC5StXC76o3Es; wmda_uuid=be12674ea0b5a6543158dec5f330f772; wmda_new_uuid=1; xxzlclientid=3f6c1cf7-7ebc-4ef9-9178-1717657287321; xxzlxxid=pfmxkiCBzeFitT1K7WNk5mrRWumzPCUnUXMdMYmp2umhPmEh5zN3jOjvSTx1s1PPVgpO; commontopbar_new_city_info=1%7C%E5%8C%97%E4%BA%AC%7Cbj; commontopbar_ipcity=nc%7C%E5%8D%97%E6%98%8C%7C0; fzq_h=f04c1b62d2e70e47fd9d2d9b72fc5f4b_1718247894223_b4f4f54e15b24e1eb5c1c74e0c9f0712_3663660715; ppStore_fingerprint=A05E1ED78DA7B8581D2FB6DA8C7B593CE1760E79A99F4813%EF%BC%BF1718248246337; PPU.sig=7ZAEeOc_JFR8qS8V1IMhxkANY8M; wmda_session_id_1731916484865=1718258091827-2d7299d1-2a6a-c9f6; new_uv=9; utm_source=; spm=; init_refer=https%253A%252F%252Fcallback.58.com%252F; new_session=0; qz_gdt=; wmda_session_id_10104579731767=1718258612228-4cd813d2-aa34-0aaf; wmda_visited_projects=%3B1731916484865%3B10104579731767%3B3381039819650; xxzl_smartid=79a743e8569e9e1aa09d653e7f9143b0; sessionid=d1ebe3c6-015c-459c-866a-197ae7ab243e; f=n; xxzl_cid=424e1a7c01fb4e9b99e4e425b69fd73a; xxzl_deviceid=dus4dqX7JjcBMr/wo9StNXIVLkUYWxILaP0bHsIWk+QMBlwMpOfzLsFCyCVNuFWY; crmvip=; dk_cookie=; PPU=UID=104501224598337&UN=qljj06467&TT=7b0e727a2572a55b4cac02517988085f&PBODY=Md8bYlI99GepTo0UDLP9Ng7J5KrCWX6kCrrCQ0MPVULFyzuYx8wV7q1Wg-LhDID86IbYHFQy3xKhx-dXQ3YZwZyc1taB4tDH76-fMK7pxemR19DHil9oJt2WEBF45ijgxMWiX1H8HJ5BntpFgBtlgqcEFmc3QvQl84iYNksiVv0&VER=1&CUID=YSAw2vReBhlHEu3U-r6eAA; www58com=UserID=104501224598337&UserName=qljj06467; 58cooper=userid=104501224598337&username=qljj06467; 58uname=qljj06467; passportAccount=atype=0&bstate=0; JSESSIONID=3570957C6A773A2E89A30F01CD43C2BB; fzq_js_zhaopin_list_pc=d4d7e616131ed8cd326618f7a0caa4cb_1718261773271_7; "
    "xxzlbbid=pfmbM3wxMDI5MnwxLjcuMHwxNzE4MjYxNzc0NTc0fGRIbWNPZkNINEUxeGltMkMvZXRtbEUxWWdGbHdrYUplZDdjdDl2dmdmTkE9fDg1OGZmNTg2ZDNkMGJiZDIzOTY0MzQ2YTRlZGY5YTJhXzE3MTgyNjE3NzQ3NjBfMzVjMDMxNWUzNjNmNGFiNDlkZThiZTFlMjFlODNkYTdfMzY2MzY2MDcxNXxkMmMyOTUzNmUxNTAxODRmMmQ3ZWJkYmI5MWEyNDE0M18xNzE4MjYxNzc0MjQ0XzI1Ng=="
)
# Raw string: "\s" inside a normal string literal is an invalid escape sequence.
CATEGORY_LINK_PATTERN = r'<a[\s]*?href="(.*?)"[\s]*?tongji_tag="pc_home_zp_lldz41">'
# One CSS selector per CSV column, in the same order as the header row below.
COLUMN_SELECTORS = (
    "span.address",
    "span.name",
    "p.job_salary",
    "a.fl",
    "span.cate",
    "span.xueli",
    "span.jingyan",
)

# NOTE(review): the file is opened in append mode, so the header row is added
# again on every run (unchanged from the original) - confirm this is wanted.
with open(CSV_PATH, "a", newline="", encoding=CSV_ENCODING) as file:
    csv.writer(file).writerow(
        ["地区", "职位说明", "工资", "公司名称", "职位", "学历", "经验"]
    )

home_response = requests.get(
    HOME_URL,
    headers={"User-Agent": USER_AGENT},
    timeout=REQUEST_TIMEOUT,
)
category_paths = re.findall(CATEGORY_LINK_PATTERN, home_response.text, re.S)

page_headers = {"User-Agent": USER_AGENT, "Cookie": SESSION_COOKIE}

# NOTE(review): the first matched link is skipped, exactly as the original
# range(1, len(...)) did - confirm this is intentional.
for category_path in category_paths[1:]:
    page_url = BASE_URL + category_path
    try:
        page_response = requests.get(
            url=page_url, headers=page_headers, timeout=REQUEST_TIMEOUT
        )
    except requests.RequestException as error:
        # One unreachable page should not discard the remaining categories.
        print(f"Skipping {page_url}: {error}")
        continue

    soup = BeautifulSoup(page_response.text, "lxml")
    columns = [soup.select(selector) for selector in COLUMN_SELECTORS]

    # zip() stops at the shortest column, so a selector that matches fewer
    # elements than the others can no longer raise IndexError.
    # NOTE(review): the columns are selected independently of each other, so
    # rows are paired purely by position - verify against the page markup.
    rows = [
        [tag.get_text(strip=True) for tag in listing]
        for listing in zip(*columns)
    ]
    print(rows)

    # Append after every page so finished pages survive a later failure.
    with open(CSV_PATH, "a", newline="", encoding=CSV_ENCODING) as file:
        csv.writer(file).writerows(rows)