from time import sleep
import csv
import os

from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common import exceptions
from pypinyin import pinyin, Style

from sql import sql


def get_dbname(city, job):
    """Convert the Chinese city and job strings to pinyin and join them with
    an underscore to build a table name, e.g. ('北京', 'python') -> 'beijing_python'."""
    pinyin_list = pinyin(city, style=Style.NORMAL)
    city = ''.join([p[0] for p in pinyin_list])
    pinyin_list = pinyin(job, style=Style.NORMAL)
    job = ''.join([p[0] for p in pinyin_list])
    return city + '_' + job


def responsedata(job, city, pages):
    options = FirefoxOptions()
    # options.headless = True  # headless mode, left disabled here
    # Route traffic through a local proxy as an anti-detection measure
    options.add_argument("--proxy-server=127.0.0.1:7890")
    driver = Firefox(options=options)
    driver.maximize_window()
    driver.get('https://www.zhipin.com/web/geek/job?query=' + job + '&city=' + str(city) + '&page=1')
    # Implicit wait of 10s
    driver.implicitly_wait(10)
    sleep(2)
    dics = []
    for page in range(1, pages + 1):
        # k caps how many listings are scraped per page: each page holds 30,
        # but scraping everything with Selenium is slow, so only a few are
        # taken per page while testing
        k = 0
        sleep(2)
        # Scroll to the bottom so all job cards are rendered
        driver.execute_script('document.documentElement.scrollTop = document.documentElement.scrollHeight')
        li_lists = driver.find_elements(By.CSS_SELECTOR, '.job-card-wrapper')
        print(len(li_lists))
        for li in li_lists:
            job_name = li.find_element(By.CLASS_NAME, 'job-name').text
            job_area = li.find_element(By.CLASS_NAME, 'job-area').text
            salary = li.find_element(By.CLASS_NAME, 'salary').text
            job_tag = li.find_element(By.CSS_SELECTOR, '.job-card-wrapper .job-card-left .tag-list').text.replace('\n', ',')
            job_ability = li.find_element(By.XPATH, './div[2]/ul').text
            company_name = li.find_element(By.CLASS_NAME, 'company-name').text
            welfare = li.find_element(By.CLASS_NAME, 'info-desc').text
            link = li.find_element(By.CLASS_NAME, 'job-card-left').get_attribute('href')
            try:
                # Open the detail page
                clic = li.find_element(By.CSS_SELECTOR, '.job-card-left')
                driver.execute_script('arguments[0].click()', clic)
                # Switch to the most recently opened window
                driver.switch_to.window(driver.window_handles[-1])
                driver.implicitly_wait(5)
                driver.execute_script('document.documentElement.scrollTop = document.documentElement.scrollHeight')
                job_des = driver.find_element(By.XPATH, '//*[@id="main"]/div[3]/div/div[2]/div[1]/div[2]').text.replace('\n', ' ')
            except Exception as e:
                print("Error:", e)
                print('A few detail pages fail to load because of network problems; this one is skipped')
                company_info = ''
                company_type = ''
                address = ''
                job_des = ''
                dic = {
                    "职位名称": job_name,
                    "地区": job_area,
                    "薪水": salary,
                    "标签": job_tag,
                    "能力要求": job_ability,
                    "公司名字": company_name,
                    "公司介绍": company_info,
                    "福利待遇": welfare,
                    "职位描述": job_des,
                    "企业类型": company_type,
                    "工作地址": address,
                    "详情链接": link,
                }
                # Record the partial row
                k += 1
                print(dic)
                dics.append(dic)
                # Close the detail window only if one actually opened;
                # otherwise this would close the listing window itself
                if driver.current_window_handle != driver.window_handles[0]:
                    driver.close()
                driver.switch_to.window(driver.window_handles[0])
                continue
            try:
                # Some companies have no introduction
                company_info = driver.find_element(By.CSS_SELECTOR, '.job-body-wrapper .company-info-box .fold-text').text.replace('\n', ' ')
            except exceptions.NoSuchElementException:
                company_info = ''
            try:
                company_type = driver.find_element(By.CLASS_NAME, 'company-type').text.replace('企业类型\n', '')
            except exceptions.NoSuchElementException:
                company_type = ''
            address = driver.find_element(By.CLASS_NAME, 'location-address').text
{ "职位名称": job_name, "地区": job_area, "薪水": salary, "标签": job_tag, "能力要求": job_ability, "公司名字": company_name, "公司介绍": company_info, "福利待遇": welfare, "职位描述": job_des, "企业类型": company_type, "工作地址": address, "详情链接": link } # 写入数据 k += 1 print(dic) dics.append(dic) driver.close() # 窗口切换到第一个页面 driver.switch_to.window(driver.window_handles[0]) if k == 2: # 每页爬取5条数据 break sleep(2) try: # 点击下一页,这里下一页的按钮不好定位,用XPATH的话只对1-4页有用,第五页后面要改成a[11] if page<=4: c = driver.find_element(By.XPATH, '//*[@class="options-pages"]/a[10]') driver.execute_script('arguments[0].click()', c) elif page<7: c = driver.find_element(By.XPATH, '//*[@class="options-pages"]/a[11]') driver.execute_script('arguments[0].click()', c) else: c = driver.find_element(By.XPATH, '//*[@class="options-pages"]/a[10]/i') driver.execute_script('arguments[0].click()', c) WebDriverWait(driver, 100).until( EC.presence_of_element_located((By.CLASS_NAME, 'job-card-wrapper')) ) except: driver.close() driver.quit() driver.get('https://www.zhipin.com/web/geek/job?query=' + job + "&city=" + str(city) + '&page=1') driver.implicitly_wait(10) if page<=4: c = driver.find_element(By.XPATH, '//*[@class="options-pages"]/a[10]') driver.execute_script('arguments[0].click()', c) elif page<7: c = driver.find_element(By.XPATH, '//*[@class="options-pages"]/a[11]') driver.execute_script('arguments[0].click()', c) else: c = driver.find_element(By.XPATH, '//*[@class="options-pages"]/a[10]/i') driver.execute_script('arguments[0].click()', c) WebDriverWait(driver, 100).until( EC.presence_of_element_located((By.CLASS_NAME, 'job-card-wrapper')) ) driver.close() driver.quit() return dics def createfile(dicts): f = open('BOSS直聘1.csv', mode='a', encoding='utf-8-sig', newline='') csv_writer = csv.DictWriter(f, fieldnames=["职位名称", "地区", "薪水", "标签", "能力要求", "公司名字", "公司介绍", "福利待遇", "职位描述", "企业类型", "工作地址", "详情链接"]) csv_writer.writeheader() # 写入表头 for dic in dicts: csv_writer.writerow(dic) def createsql(city,job,dicts): table_name = get_dbname(city, job) sql().create_table(table_name) sql().Insert_datas(table_name,dicts) def crawl(city,job,pages): city_form = {'全国': 100010000, '北京': 101010100, '上海': 101020100, '广州': 101280100, '深圳': 101280600, '杭州': 101210100, '天津': 101030100, '西安': 101110100, '苏州': 101190400, '武汉': 101200100, '厦门': 101230200, '长沙': 101250100, '成都': 101270100, '郑州': 101180100} for _ in city_form: if city == _: city_code = city_form[_] break dicts=responsedata(job,city_code,pages) createfile(dicts) createsql(city, job,dicts) return 1