# -*- coding: utf-8 -*-

import getopt
import os
import re
import sys
from urllib.parse import quote

import requests

all_urls = list()  # result-page URLs visited during the crawl


class crawler:
    '''Crawler for Baidu search result pages.'''

    url = ''
    urls = []
    o_urls = []
    html = ''
    total_pages = 5
    current_page = 0
    next_page_url = ''
    timeout = 60  # default request timeout, in seconds

    # Request headers sent with each request, used to masquerade as a browser.
    headersParameters = {
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'User-Agent': 'Mozilla/6.1 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
    }
    # headersParameters = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',}

    def __init__(self, keyword):
        # quote() percent-encodes the keyword for the wd= query parameter
        self.url = 'https://www.baidu.com/baidu?wd=' + quote(keyword) + '&tn=monline_dg&ie=utf-8'

    def set_timeout(self, time):
        '''Set the request timeout, in seconds.'''
        try:
            self.timeout = int(time)
        except (TypeError, ValueError):
            pass

    def set_total_pages(self, num):
        '''Set the total number of pages to crawl.'''
        try:
            self.total_pages = int(num)
        except (TypeError, ValueError):
            pass

    def set_current_url(self, url):
        '''Set the current url.'''
        self.url = url

    def switch_url(self):
        '''Switch the current url to the next page's url;
        exit the program if there is no next page.'''
        if self.next_page_url == '':
            sys.exit()
        else:
            self.set_current_url(self.next_page_url)

    def is_finish(self):
        '''Return True once the requested number of pages has been crawled.'''
        return self.current_page >= self.total_pages

    def get_html(self):
        '''Fetch the page at the current url and store its content in self.html.'''
        r = requests.get(self.url, timeout=self.timeout, headers=self.headersParameters)
        if r.status_code == 200:
            self.html = r.text
            print(self.current_page)  # 0-based index of the page just fetched
            all_urls.append(self.url)
            self.current_page += 1
        else:
            self.html = ''
            print('[ERROR]', self.url, 'returned a non-200 HTTP status code')

    def get_urls(self):
        '''Parse the next-page url out of the current html.'''
        # Baidu marks the next-page link with class="n"
        next_links = re.findall(r' href="(/s\?wd=[\w%&=_\-]*?)" class="n"', self.html)
        if len(next_links) > 0:
            self.next_page_url = 'https://www.baidu.com' + next_links[-1]
        else:
            print('[WARN] no next-page link found')
            self.next_page_url = ''

    def run(self):
        while not self.is_finish():
            self.get_html()
            self.get_urls()
            self.switch_url()
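
# A minimal programmatic-use sketch (an illustration, not part of the original CLI flow;
# it assumes the file can be imported as a module, e.g. saved as baiduSpider.py):
#
#     from baiduSpider import crawler, all_urls
#     c = crawler('布偶猫')    # keyword is percent-encoded into the Baidu search URL
#     c.set_total_pages(2)     # crawl at most 2 result pages
#     c.run()                  # fetch pages, following the class="n" next-page link
#     print(all_urls)          # the result-page URLs that were actually visited
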
if __name__ == '__main__':
    help = 'baiduSpider.py -k <keyword> [-t <timeout> -p <total pages>]'
    keyword = None
    timeout = None
    totalpages = None
    try:
        opts, args = getopt.getopt(sys.argv[1:], "hk:t:p:",
                                   ["keyword=", "timeout=", "totalpages="])
    except getopt.GetoptError:
        print(help)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(help)
            sys.exit()
        elif opt in ("-k", "--keyword"):
            keyword = arg
        elif opt in ("-t", "--timeout"):
            timeout = arg
        elif opt in ("-p", "--totalpages"):
            totalpages = arg
    if keyword is None:
        print(help)
        sys.exit()

    c = crawler(keyword)
    if timeout is not None:
        c.set_timeout(timeout)
    if totalpages is not None:
        c.set_total_pages(totalpages)
    c.run()

    # Dump the visited result-page URLs; create the output directory if needed.
    os.makedirs('page_urls', exist_ok=True)
    with open(os.path.join('page_urls', str(keyword) + 'page_urls.txt'), 'w', encoding='UTF-8') as f:
        f.write(str(list(all_urls)))
    print(all_urls)

# Example: python 爬取网页.py -k 布偶猫 -p 2
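
# The output file holds the Python repr of a list of URLs; a sketch for reading it back
# (assumes the example invocation above, so the file is page_urls/布偶猫page_urls.txt):
#
#     import ast
#     with open('page_urls/布偶猫page_urls.txt', encoding='UTF-8') as f:
#         visited = ast.literal_eval(f.read())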