# -*- coding: utf-8 -*-

import getopt
import os
import re
import sys
from urllib.parse import quote

import requests

all_urls = list()  # result-page URLs visited during the crawl


class crawler:
    '''Crawler for Baidu search result pages.'''

    url = ''
    urls = []
    o_urls = []
    html = ''
    total_pages = 5
    current_page = 0
    next_page_url = ''
    timeout = 60  # default request timeout, in seconds

    # Request headers sent with each request, used to masquerade as a browser.
    headersParameters = {
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'User-Agent': 'Mozilla/6.1 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
    }
    # headersParameters = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',}

    def __init__(self, keyword):
        # quote() percent-encodes the keyword for the wd= query parameter
        self.url = 'https://www.baidu.com/baidu?wd=' + quote(keyword) + '&tn=monline_dg&ie=utf-8'

    def set_timeout(self, time):
        '''Set the request timeout, in seconds.'''
        try:
            self.timeout = int(time)
        except (TypeError, ValueError):
            pass

    def set_total_pages(self, num):
        '''Set the total number of pages to crawl.'''
        try:
            self.total_pages = int(num)
        except (TypeError, ValueError):
            pass

    def set_current_url(self, url):
        '''Set the current url.'''
        self.url = url

    def switch_url(self):
        '''Switch the current url to the next page's url;
        exit the program if there is no next page.'''
        if self.next_page_url == '':
            sys.exit()
        else:
            self.set_current_url(self.next_page_url)

    def is_finish(self):
        '''Return True once the requested number of pages has been crawled.'''
        return self.current_page >= self.total_pages

    def get_html(self):
        '''Fetch the page at the current url and store its content in self.html.'''
        r = requests.get(self.url, timeout=self.timeout, headers=self.headersParameters)
        if r.status_code == 200:
            self.html = r.text
            print(self.current_page)  # 0-based index of the page just fetched
            all_urls.append(self.url)
            self.current_page += 1
        else:
            self.html = ''
            print('[ERROR]', self.url, 'returned a non-200 HTTP status code')

    def get_urls(self):
        '''Parse the next-page url out of the current html.'''
        # Baidu marks the next-page link with class="n"
        next_links = re.findall(r' href="(/s\?wd=[\w%&=_\-]*?)" class="n"', self.html)
        if len(next_links) > 0:
            self.next_page_url = 'https://www.baidu.com' + next_links[-1]
        else:
            print('[WARN] no next-page link found')
            self.next_page_url = ''

    def run(self):
        while not self.is_finish():
            self.get_html()
            self.get_urls()
            self.switch_url()
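
# A minimal programmatic-use sketch (an illustration, not part of the original CLI flow;
# it assumes the file can be imported as a module, e.g. saved as baiduSpider.py):
#
#     from baiduSpider import crawler, all_urls
#     c = crawler('布偶猫')    # keyword is percent-encoded into the Baidu search URL
#     c.set_total_pages(2)     # crawl at most 2 result pages
#     c.run()                  # fetch pages, following the class="n" next-page link
#     print(all_urls)          # the result-page URLs that were actually visited
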
if __name__ == '__main__':
    help = 'baiduSpider.py -k <keyword> [-t <timeout> -p <total pages>]'
    keyword = None
    timeout = None
    totalpages = None
    try:
        opts, args = getopt.getopt(sys.argv[1:], "hk:t:p:",
                                   ["keyword=", "timeout=", "totalpages="])
    except getopt.GetoptError:
        print(help)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(help)
            sys.exit()
        elif opt in ("-k", "--keyword"):
            keyword = arg
        elif opt in ("-t", "--timeout"):
            timeout = arg
        elif opt in ("-p", "--totalpages"):
            totalpages = arg
    if keyword is None:
        print(help)
        sys.exit()

    c = crawler(keyword)
    if timeout is not None:
        c.set_timeout(timeout)
    if totalpages is not None:
        c.set_total_pages(totalpages)
    c.run()

    # Dump the visited result-page URLs; create the output directory if needed.
    os.makedirs('page_urls', exist_ok=True)
    with open(os.path.join('page_urls', str(keyword) + 'page_urls.txt'), 'w', encoding='UTF-8') as f:
        f.write(str(list(all_urls)))
    print(all_urls)

# Example: python 爬取网页.py -k 布偶猫 -p 2
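
# The output file holds the Python repr of a list of URLs; a sketch for reading it back
# (assumes the example invocation above, so the file is page_urls/布偶猫page_urls.txt):
#
#     import ast
#     with open('page_urls/布偶猫page_urls.txt', encoding='UTF-8') as f:
#         visited = ast.literal_eval(f.read())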