master
Li 2 years ago
parent d6d1a05df1
commit bd612480a5

.idea/.gitignore vendored

@@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml

@@ -0,0 +1 @@
main.py

@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9" project-jdk-type="Python SDK" />
<component name="PyCharmProfessionalAdvertiser">
<option name="shown" value="true" />
</component>
</project>

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/爬虫2.iml" filepath="$PROJECT_DIR$/.idea/爬虫2.iml" />
</modules>
</component>
</project>

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.9" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

Binary file not shown.

Binary file not shown.

@@ -0,0 +1,58 @@
from flask import Flask, render_template, request, url_for
from markupsafe import escape
from pypinyin import pinyin, Style
import crawler
import pymysql

app = Flask(__name__)
@app.route('/')
def index():
return render_template('index.html')
@app.route('/test')
def test():
    print('hello')
    # A Flask view must return a response, so echo the greeting back
    return 'hello'
@app.route('/result')
def send():
return render_template('send.html')
@app.route('/result',methods=['POST'])
def result():
if request.method == 'POST':
city = request.form['city']
job = request.form['job']
if city and job:
pinyin_list = pinyin(city, style=Style.NORMAL)
city = ''.join([p[0] for p in pinyin_list])
pinyin_list = pinyin(job, style=Style.NORMAL)
job = ''.join([p[0] for p in pinyin_list])
table_name=city+'_'+job
else:
return "城市信息不能为空!"
try:
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='root', charset="utf8", db="bossdb")
cursor = conn.cursor()
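            # Note: table_name is derived from user input above; validating it against the
            # tables actually created by the crawler (a hypothetical whitelist) would guard
            # this interpolated SELECT against SQL injection.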
sql = 'SELECT job_name, job_area, salary, job_tag, job_ability, company_name, company_info, welfare, job_des, company_type, address, link FROM '+table_name
cursor.execute(sql)
rows = cursor.fetchall()
return render_template('result.html', rows=rows)
except:
return "你输入的信息有误,请重新输入"
@app.route('/crawl', methods=['GET', 'POST'])
def crawl():
result='a'
if request.method == 'POST':
result=0
city = request.form['city']
job = request.form['job']
pages = int(request.form['pages'])
result=crawler.crawl(city, job, pages)
return render_template('crawl.html', result=result)
return render_template('crawl.html',result=result)

@@ -0,0 +1,196 @@
from selenium.webdriver import Firefox
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common import exceptions
from time import sleep
from sql import sql
from pypinyin import pinyin, Style
import csv
import traceback
def get_dbname(city,job):
"""
将中文字符串转换为拼音字符串并只保留每个汉字的第一个拼音字母
"""
pinyin_list = pinyin(city, style=Style.NORMAL)
city = ''.join([p[0] for p in pinyin_list])
pinyin_list = pinyin(job, style=Style.NORMAL)
job = ''.join([p[0] for p in pinyin_list])
return city+'_'+job
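
# Example: get_dbname('北京', 'python') would return 'beijing_python'
# (full pinyin syllables joined, with an underscore between city and job).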
def responsedata(job, city, pages):
    # Launch Firefox; uncomment the next option to run headless
    options = FirefoxOptions()
    # options.headless = True
    # Route traffic through a local proxy to reduce the chance of being blocked
    options.add_argument("--proxy-server=127.0.0.1:7890")
    driver = Firefox(options=options)
    driver.maximize_window()
    driver.get('https://www.zhipin.com/web/geek/job?query=' + job + "&city=" + str(city) + '&page=1')
    # Wait implicitly for up to 10 s for elements to appear
    driver.implicitly_wait(10)
    sleep(2)
    dics = []
    for page in range(1, pages + 1):
        # k caps how many listings are scraped per page (each page has ~30 cards);
        # scraping every card with Selenium is slow, so only a few are taken for testing
        k = 0
sleep(2)
        # Scroll to the bottom of the page so all job cards load
driver.execute_script('document.documentElement.scrollTop = document.documentElement.scrollHeight')
li_lists = driver.find_elements(By.CSS_SELECTOR, '.job-card-wrapper')
print(len(li_lists))
for li in li_lists:
job_name = li.find_element(By.CLASS_NAME, 'job-name').text
job_area = li.find_element(By.CLASS_NAME, 'job-area').text
salary = li.find_element(By.CLASS_NAME, 'salary').text
job_tag = li.find_element(By.CSS_SELECTOR, '.job-card-wrapper .job-card-left .tag-list').text.replace('\n', ',')
job_ability = li.find_element(By.XPATH, './div[2]/ul').text
company_name = li.find_element(By.CLASS_NAME, 'company-name').text
welfare = li.find_element(By.CLASS_NAME, 'info-desc').text
link = li.find_element(By.CLASS_NAME, 'job-card-left').get_attribute('href')
# WebDriverWait(driver, 100).until(
# EC.presence_of_element_located((By.XPATH, '//*[@id="main"]/div[3]/div/div[2]/div[1]/div[2]'))
# )
try:
                # Open the job detail page in a new tab
clic = li.find_element(By.CSS_SELECTOR, '.job-card-left')
driver.execute_script('arguments[0].click()', clic)
                # Switch to the most recently opened window
driver.switch_to.window(driver.window_handles[-1])
driver.implicitly_wait(5)
driver.execute_script('document.documentElement.scrollTop = document.documentElement.scrollHeight')
job_des = driver.find_element(By.XPATH, '//*[@id="main"]/div[3]/div/div[2]/div[1]/div[2]').text.replace('\n',' ')
except Exception as e:
print("Error:",e)
print('因网络问题,有个别页面无法加载,已进行跳过')
company_info = ''
company_type = ''
address=''
job_des=''
dic = {
"职位名称": job_name,
"地区": job_area,
"薪水": salary,
"标签": job_tag,
"能力要求": job_ability,
"公司名字": company_name,
"公司介绍": company_info,
"福利待遇": welfare,
"职位描述": job_des,
"企业类型": company_type,
"工作地址": address,
"详情链接": link
}
                # Record the data for this listing
k += 1
print(dic)
dics.append(dic)
driver.close()
driver.switch_to.window(driver.window_handles[0])
continue
            try:  # Some companies have no introduction section
company_info = driver.find_element(By.CSS_SELECTOR,'.job-body-wrapper .company-info-box .fold-text').text.replace('\n', ' ')
except exceptions.NoSuchElementException:
company_info = ''
try:
company_type = driver.find_element(By.CLASS_NAME, 'company-type').text.replace('企业类型\n', '')
except exceptions.NoSuchElementException:
company_type = ''
address = driver.find_element(By.CLASS_NAME, 'location-address').text
dic = {
"职位名称": job_name,
"地区": job_area,
"薪水": salary,
"标签": job_tag,
"能力要求": job_ability,
"公司名字": company_name,
"公司介绍": company_info,
"福利待遇": welfare,
"职位描述": job_des,
"企业类型": company_type,
"工作地址": address,
"详情链接": link
}
            # Record the data for this listing
k += 1
print(dic)
dics.append(dic)
driver.close()
            # Switch back to the listing window
driver.switch_to.window(driver.window_handles[0])
            if k == 2:  # Only scrape the first couple of listings per page
break
sleep(2)
        try:
            # Click "next page". The button is hard to locate: with this XPath, a[10]
            # works for pages 1-4; from page 5 onwards the button moves to a[11]
if page<=4:
c = driver.find_element(By.XPATH, '//*[@class="options-pages"]/a[10]')
driver.execute_script('arguments[0].click()', c)
elif page<7:
c = driver.find_element(By.XPATH, '//*[@class="options-pages"]/a[11]')
driver.execute_script('arguments[0].click()', c)
else:
c = driver.find_element(By.XPATH, '//*[@class="options-pages"]/a[10]/i')
driver.execute_script('arguments[0].click()', c)
WebDriverWait(driver, 100).until(
EC.presence_of_element_located((By.CLASS_NAME, 'job-card-wrapper'))
)
        except Exception:
            # Pagination click failed: reload the listing page and retry the click
            # (quitting the driver here would end the session before the retry)
            driver.get('https://www.zhipin.com/web/geek/job?query=' + job + "&city=" + str(city) + '&page=1')
            driver.implicitly_wait(10)
if page<=4:
c = driver.find_element(By.XPATH, '//*[@class="options-pages"]/a[10]')
driver.execute_script('arguments[0].click()', c)
elif page<7:
c = driver.find_element(By.XPATH, '//*[@class="options-pages"]/a[11]')
driver.execute_script('arguments[0].click()', c)
else:
c = driver.find_element(By.XPATH, '//*[@class="options-pages"]/a[10]/i')
driver.execute_script('arguments[0].click()', c)
WebDriverWait(driver, 100).until(
EC.presence_of_element_located((By.CLASS_NAME, 'job-card-wrapper'))
)
driver.close()
driver.quit()
return dics
def createfile(dicts):
    # Append the scraped records to a CSV file (utf-8-sig so Excel opens it cleanly)
    with open('BOSS直聘1.csv', mode='a', encoding='utf-8-sig', newline='') as f:
        csv_writer = csv.DictWriter(f, fieldnames=["职位名称", "地区", "薪水", "标签", "能力要求", "公司名字", "公司介绍",
                                                   "福利待遇", "职位描述", "企业类型", "工作地址", "详情链接"])
        csv_writer.writeheader()  # write the header row
        for dic in dicts:
            csv_writer.writerow(dic)
def createsql(city,job,dicts):
table_name = get_dbname(city, job)
sql().create_table(table_name)
sql().Insert_datas(table_name,dicts)
def crawl(city, job, pages):
    # Map the supported Chinese city names to zhipin.com city codes
    city_form = {'全国': 100010000, '北京': 101010100, '上海': 101020100, '广州': 101280100, '深圳': 101280600, '杭州': 101210100,
                 '天津': 101030100, '西安': 101110100, '苏州': 101190400, '武汉': 101200100, '厦门': 101230200, '长沙': 101250100,
                 '成都': 101270100, '郑州': 101180100}
    city_code = city_form.get(city)
    if city_code is None:
        # Unsupported city: report failure so the caller can show an error
        return 0
    dicts = responsedata(job, city_code, pages)
    createfile(dicts)
    createsql(city, job, dicts)
    return 1
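
# Hypothetical usage sketch (assumes Firefox + geckodriver on PATH, a proxy listening on
# 127.0.0.1:7890, and a reachable MySQL 'bossdb' database for the sql helper):
# if __name__ == '__main__':
#     crawl('北京', 'python', 1)   # scrape one page of Python listings in Beijing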

File diff suppressed because it is too large.

@@ -0,0 +1,74 @@
import pymysql
class sql():
def __init__(self):
        # Connect to the local MySQL server (the 'bossdb' database must already exist)
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='root', charset="utf8", db="bossdb")
cursor = conn.cursor()
self.conn = conn
self.cursor = cursor
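        # Note: the connection and cursor stay open for the lifetime of this object;
        # no close() method is provided, so each sql() instance holds one MySQL connection.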
def create_table(self,table_name):
try:
            # Reuse the cursor and connection created in __init__
cursor = self.cursor
conn = self.conn
            # Check whether the table already exists
result = cursor.execute("SHOW TABLES LIKE '{}'".format(table_name))
if result:
                # The table already exists: drop it and recreate it so stale rows
                # from a previous crawl are replaced
                sql = 'drop table {0};'.format(table_name)
                cursor.execute(sql)
create_table ='create table '+table_name+'(job_name varchar(20),job_area varchar(20),salary varchar(20),job_tag varchar(20),job_ability varchar(50),company_name varchar(20),company_info varchar(200),welfare varchar(100),job_des varchar(150),company_type varchar(60),address varchar(60),link varchar(150))ENGINE=InnoDB DEFAULT CHARSET=utf8;'
cursor.execute(create_table)
conn.commit()
print(table_name + '表创建完毕')
            # The table does not exist yet: create it
else:
create_table ='create table '+table_name+'(job_name varchar(20),job_area varchar(20),salary varchar(20),job_tag varchar(20),job_ability varchar(50),company_name varchar(20),company_info varchar(200),welfare varchar(100),job_des varchar(150),company_type varchar(60),address varchar(60),link varchar(150))ENGINE=InnoDB DEFAULT CHARSET=utf8;'
cursor.execute(create_table)
conn.commit()
print(table_name + '表创建完毕')
except:
print('创建数据表失败请查看mysql服务是否开启、密码是否正确')
def Insert_datas(self,table_name,dicts):
try:
n=0
for content in dicts:
job_name = content['职位名称']
job_area = content['地区']
salary = content['薪水']
job_tag = content['标签']
job_ability = content['能力要求']
company_name = content['公司名字']
company_info = content['公司介绍']
welfare = content['福利待遇']
job_des = content['职位描述']
company_type = content['企业类型']
address = content['工作地址']
link = content['详情链接']
                # Insert the record with a parameterized query so that quotes in the
                # scraped text cannot break (or inject into) the SQL statement
                cursor = self.cursor
                conn = self.conn
                sql = ('insert into ' + table_name +
                       '(job_name,job_area,salary,job_tag,job_ability,company_name,company_info,'
                       'welfare,job_des,company_type,address,link) '
                       'values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
                cursor.execute(sql, (job_name, job_area, salary, job_tag, job_ability, company_name,
                                     company_info, welfare, job_des, company_type, address, link))
conn.commit()
n+=1
print('本次程序已插入{0}条数据'.format(n))
except:
print('插入数据失败')
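
# Hypothetical usage sketch (assumes a local MySQL server with a 'bossdb' database and
# the root/root credentials used above; 'dicts' is the list produced by crawler.responsedata):
# db = sql()
# db.create_table('beijing_python')
# db.Insert_datas('beijing_python', dicts)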

@@ -0,0 +1,30 @@
<!DOCTYPE html>
<html>
<head>
<title>boss直聘爬取</title>
<link rel="stylesheet" href="{{ url_for('static', filename='crawl.css') }}" type="text/css">
</head>
<body>
{% if result == 1 %}
<script>alert("爬取成功");</script>
{% elif result == 0 %}
<script>alert("爬取失败");</script>
{% else %}
<script>alert("请输入信息");</script>
{% endif %}
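    <!-- result: 1 = crawl succeeded, 0 = crawl failed, any other value (initial GET) = prompt for input -->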
<h1 class="title">在线爬取</h1>
<form action="/crawl" method="post">
<label for="city">城市:</label>
<input type="text" id="city" name="city"><br><br>
<label for="job">工作:</label>
<input type="text" id="job" name="job"><br><br>
<label for="pages">页数:</label>
<input type="number" id="pages" name="pages"><br><br>
<input type="submit" value="提交">
</form>
</body>
</html>

@@ -0,0 +1,20 @@
<!DOCTYPE html>
<html lang="en">
<head>
<link rel="stylesheet" href="{{ url_for('static', filename='style.css') }}" type="text/css">
</head>
<body>
<h2>
<img alt="Avatar" class="avatar" src="{{ url_for('static', filename='images/avatar.png') }}">
首页
</h2>
<ul class="tools">
<li><a href="{{ url_for('crawl') }}" class="crawl-btn">爬取数据</a></li>
<li><a href="{{ url_for('result') }}" class="result-btn">查看数据</a></li>
</ul>
<img alt="Walking Totoro" class="totoro" src="{{ url_for('static', filename='images/totoro.gif') }}">
<footer>
<small>&copy; 2018 <a href="http://helloflask.com/book/3">HelloFlask</a></small>
</footer>
</body>
</html>

@@ -0,0 +1,55 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Jobs</title>
<link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='result.css') }}">
</head>
<body>
<div class="container">
<img alt="Avatar" class="avatar" src="{{ url_for('static', filename='/images/avatar.png') }}" alt="logo">
<h1 class="title">职位信息</h1>
<table class="jobs-table">
<thead>
<tr class="table-header-row">
<th class="job-title-col">工作名称</th>
<th class="job-location-col">地区</th>
<th class="job-salary-col">薪资</th>
<th class="job-labels-col">标签</th>
<th class="job-skills-col">能力要求</th>
<th class="company-name-col">公司名字</th>
<th class="company-info-col">公司信息</th>
<th class="company-benefits-col">公司福利</th>
<th class="company-description-col">公司描述</th>
<th class="company-nature-col">公司性质</th>
<th class="company-address-col">公司地址</th>
<th class="job-detail-link-col">详情链接</th>
</tr>
</thead>
<tbody>
{% for row in rows %}
<tr class="table-row">
<td class="job-title-col">{{ row[0] }}</td>
<td class="job-location-col">{{ row[1] }}</td>
<td class="job-salary-col">{{ row[2] }}</td>
<td class="job-labels-col">{{ row[3] }}</td>
<td class="job-skills-col">{{ row[4] }}</td>
<td class="company-name-col">{{ row[5] }}</td>
<td class="company-info-col">{{ row[6] }}</td>
<td class="company-benefits-col">{{ row[7] }}</td>
<td class="company-description-col">{{ row[8] }}</td>
<td class="company-nature-col">{{ row[9] }}</td>
<td class="company-address-col">{{ row[10] }}</td>
<td class="job-detail-link-col"><a href="{{ row[11] }}">点击查看</a></td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
<script src="{{ url_for('static', filename='js/myscript.js') }}"></script>
<div class="pagination">
<button id="prev-btn">上一页</button>
<button id="next-btn">下一页</button>
</div>
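    <!-- The prev/next buttons above are expected to be wired up by static/js/myscript.js, referenced earlier in this template. -->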
</body>
</html>

@@ -0,0 +1,25 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>数据查询 - 输入城市和工作</title>
<link rel="stylesheet" href="{{ url_for('static', filename='send.css') }}" type="text/css">
</head>
<body>
<div class="container">
<h1>数据查询 - 输入城市和工作</h1>
<form action="/result" method="post">
<label for="city">请输入城市:</label>
<input type="text" name="city" id="city" placeholder="例如:北京、上海、杭州等">
<label for="job">请输入职位:</label>
<input type="text" name="job" id="job" placeholder="例如Python开发工程师、Java软件工程师等">
<button type="submit">查询</button>
</form>
</div>
</body>
</html>