# -*- coding: utf-8 -*-
"""Crawl Baidu search-result pages for a keyword and record the page URLs.

Usage:
    python baiduSpider.py -k <keyword> [-t <timeout>] [-p <totalpages>]

Example:
    python 爬取网页.py -k 布偶猫 -p 2
"""

import getopt
import os
import re
import sys
from urllib.request import quote

# Search-result page URLs collected during a run; dumped to disk at the end.
all_urls = list()


class crawler:
    '''Crawler for Baidu search result pages.'''
    url = ''
    urls = []            # unused legacy buffer, kept for backward compatibility
    o_urls = []          # unused legacy buffer, kept for backward compatibility
    html = ''
    total_pages = 5      # default number of result pages to crawl
    current_page = 0     # number of pages fetched so far
    next_page_url = ''   # '' means "no further pages"
    timeout = 60         # default HTTP timeout, in seconds
    # HTTP headers sent with every request, to masquerade as a browser.
    headersParameters = {
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'User-Agent': 'Mozilla/6.1 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
    }

    def __init__(self, keyword):
        """Build the initial Baidu search URL for *keyword*."""
        self.url = ('https://www.baidu.com/baidu?wd=' + quote(keyword)
                    + '&tn=monline_dg&ie=utf-8')

    def set_timeout(self, time):
        '''Set the HTTP timeout in seconds; non-numeric values are ignored.'''
        try:
            self.timeout = int(time)
        except (TypeError, ValueError):
            # Deliberately best-effort: keep the previous timeout on bad input.
            pass

    def set_total_pages(self, num):
        '''Set how many result pages to crawl; non-numeric values are ignored.'''
        try:
            self.total_pages = int(num)
        except (TypeError, ValueError):
            pass

    def set_current_url(self, url):
        '''Set the URL that will be fetched next.'''
        self.url = url

    def switch_url(self):
        '''Switch the current URL to the next page's URL.

        Exits the program when there is no next page (legacy behavior).
        '''
        if self.next_page_url == '':
            sys.exit()
        else:
            self.set_current_url(self.next_page_url)

    def is_finish(self):
        '''Return True once the requested number of pages has been fetched.'''
        return self.current_page >= self.total_pages

    def get_html(self):
        '''Fetch the current URL and store the response body in self.html.

        On HTTP 200 the URL is appended to all_urls and the page counter
        advances; otherwise self.html is cleared and an error is printed.
        '''
        # Imported lazily so the module can be imported (and unit-tested)
        # without the third-party `requests` package installed.
        import requests
        r = requests.get(self.url, timeout=self.timeout,
                         headers=self.headersParameters)
        if r.status_code == 200:
            self.html = r.text
            print(self.current_page)
            all_urls.append(self.url)
            self.current_page += 1
        else:
            self.html = ''
            print('[ERROR]', self.url, 'get此url返回的http状态码不是200')

    def get_urls(self):
        '''Parse self.html for the "next page" link into self.next_page_url.'''
        # Baidu marks pager anchors with class="n"; the last match is the
        # "next page" link (the first can be "previous page").
        # Renamed from `next` to avoid shadowing the builtin.
        links = re.findall(r' href="(/s\?wd=[\w\d%&=_\-]*?)" class="n"',
                           self.html)
        if len(links) > 0:
            self.next_page_url = 'https://www.baidu.com' + links[-1]
        else:
            print('no')
            self.next_page_url = ''

    def run(self):
        '''Crawl pages until total_pages is reached or no next page exists.'''
        # BUG FIX: the original body called methods on the module-level
        # global `c` instead of `self`, so run() only worked on that one
        # global instance.
        while not self.is_finish():
            self.get_html()
            self.get_urls()
            if self.next_page_url == '':
                # BUG FIX: stop gracefully instead of letting switch_url()
                # sys.exit() the process before the results are written out.
                break
            self.switch_url()


if __name__ == '__main__':
    # Renamed from `help` to avoid shadowing the builtin.
    usage = 'baiduSpider.py -k <keyword> [-t <timeout>] [-p <totalpages>]'
    keyword = None
    timeout = None
    totalpages = None
    try:
        opts, args = getopt.getopt(sys.argv[1:], "hk:t:p:", [
            "keyword=", "timeout=", "totalpages="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-k", "--keyword"):
            keyword = arg
        elif opt in ("-t", "--timeout"):
            timeout = arg
        elif opt in ("-p", "--totalpages"):
            totalpages = arg
    if keyword is None:
        print(usage)
        sys.exit()

    c = crawler(keyword)
    if timeout is not None:
        c.set_timeout(timeout)
    if totalpages is not None:
        c.set_total_pages(totalpages)
    # BUG FIX: the original ran the crawl and wrote the output file at module
    # level (outside this __main__ guard), which would execute -- and crash on
    # the then-undefined `c` -- whenever this module was imported.
    c.run()
    # Portable output path (the original hard-coded a Windows '\\' separator).
    out_path = os.path.join('page_urls', str(keyword) + 'page_urls.txt')
    with open(out_path, 'w', encoding='UTF-8') as f:
        f.write(str(list(all_urls)))
    print(all_urls)