|
|
|
@ -0,0 +1,91 @@
|
|
|
|
|
import requests
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
def get_html(url):
    """Fetch *url* and return its body decoded as UTF-8 text.

    Uses the module-level ``headers`` dict (browser User-Agent) for the
    request.  On failure prints a diagnostic and returns ``None`` — the
    original returned ``None`` implicitly via its except path; this is
    now explicit.
    """
    try:
        response = requests.get(url, headers=headers)
        # Fix: the original assigned encoding 'GBK' and then immediately
        # overwrote it with 'utf-8'; only the final assignment ever took
        # effect, so the dead store is removed.
        response.encoding = 'utf-8'
        return response.text
    except requests.RequestException:
        # Fix: narrowed from a bare ``except:`` so programming errors and
        # KeyboardInterrupt are no longer silently swallowed; only
        # request/transport failures are handled here.
        print('请求网址出错')
        return None
|
|
|
|
|
|
|
|
|
|
def parse_url_dog(html):
    """Extract dog-category codes from the front-page HTML.

    Matches links of the form ``/vod/...`` whose anchor text is 狗狗
    followed by two characters, and keeps characters [-5:-1] of each
    captured path (the 4-character category code, e.g. 'vdys').
    """
    dog_link = re.compile(
        '<li><a href="/vod/(.*?)" target="_blank">狗狗..</a></li>', re.S)
    # Same trimming as the original in-place loop, expressed as a
    # comprehension over the matches.
    return [path[-5:-1] for path in dog_link.findall(html)]
|
|
|
|
|
|
|
|
|
|
def parse_url_cat(html):
    """Extract cat-category codes from the front-page HTML.

    Mirrors ``parse_url_dog`` but matches anchor text starting with 猫猫;
    returns the 4-character code (characters [-5:-1]) of each captured
    ``/vod/...`` path.
    """
    cat_link = re.compile(
        '<li><a href="/vod/(.*?)" target="_blank">猫猫..</a></li>', re.S)
    # Same trimming as the original in-place loop, expressed as a
    # comprehension over the matches.
    return [path[-5:-1] for path in cat_link.findall(html)]
|
|
|
|
|
|
|
|
|
|
def parse_url(html):
    """Parse a category page into ``[titles, article_ids]``.

    Returns a two-element list: first the page titles (text before
    '-我的宠舍' in <title>), then the numeric ids captured from
    ``/vod<id>`` article links.

    Fix: the patterns are now raw strings so that ``\\d`` is a regex
    escape rather than an invalid Python string escape (which raises a
    SyntaxWarning/DeprecationWarning on modern CPython).
    """
    pattern1 = re.compile(r'<li><a href="/vod(\d+)" target="_blank">.*?</i></div>', re.S)
    items1 = re.findall(pattern1, html)
    pattern2 = re.compile(r'<title>(.*?)-我的宠舍</title>', re.S)
    items2 = re.findall(pattern2, html)
    return [items2, items1]
|
|
|
|
|
|
|
|
|
|
def parse_result(html):
    """Return (question, answer) tuples scraped from an article page.

    The question is the <title> text before '-我的宠舍'; the answer is
    the first <p> body that follows it (DOTALL, so either may span
    multiple lines).
    """
    qa_pattern = re.compile(
        '<title>(.*?)-我的宠舍</title>.*?<p>(.*?)</p>', re.S)
    return qa_pattern.findall(html)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def write_item_to_file(item, name):
    """Append one Q&A pair to the file '问答\\<name>知识.txt'.

    item: a 2-sequence — item[0] is the question, item[1] the answer.
    name: category title used to build the output file name.

    NOTE(review): encoding='ANSI' is a Windows-only alias for the active
    code page; on other platforms open() raises LookupError — confirm the
    target platform before changing it.  The backslash path also assumes
    Windows separators.
    """
    with open('问答\\' + str(name) + '知识.txt', 'a', encoding='ANSI') as f:
        f.write('问:' + item[0] + '?')
        f.write('答:' + item[1] + '')
    # Fix: removed the redundant f.close() that sat inside the with
    # block — the context manager already closes the file on exit.
|
|
|
|
|
|
|
|
|
|
# Browser-like User-Agent so the site serves the scraper normal pages.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',}

# Front page of the pet site; category links are scraped from it.
url_main = 'https://www.chongshe.cn/'

html_main = get_html(url_main)

# 4-character category codes for dog and cat sections.
dog_urls = parse_url_dog(html_main) #['vdys', 'vdjk', 'vdxl', 'vdsh', 'vdfy']

cat_urls = parse_url_cat(html_main)

# `urls` alternates [titles, ids, titles, ids, ...]: parse_url returns a
# two-element list per category page, and list concatenation flattens it.
urls = []

for dog_url in dog_urls:
    tmp = 'https://www.chongshe.cn/vod/' + str(dog_url)
    html_dog = get_html(tmp)
    urls = urls + parse_url(html_dog)

for cat_url in cat_urls:
    tmp = 'https://www.chongshe.cn/vod/' + str(cat_url)
    html_cat = get_html(tmp)
    urls = urls + parse_url(html_cat)

# Odd indices hold the article-id lists; urls[i-1][0] is the matching
# category title used to name the output file.
for i in range(len(urls)):
    if i%2==1:
        for url in urls[i]:
            tmp = 'https://www.chongshe.cn/vod' + str(url)
            html = get_html(tmp)
            items = parse_result(html)
            for j in range(len(items)):
                # NOTE(review): eval(str(...)) round-trips the tuple
                # through its repr — a no-op for plain string tuples and
                # unsafe on untrusted text; items[j] alone looks
                # equivalent. Confirm before simplifying.
                write_item_to_file(eval(str(items[j])),urls[i-1][0])

# Add the scraped categories into the catalogue file (disabled draft).
'''
with open('catalogue.txt','r', encoding='ANSI') as f:
    line_data = ''
    for line in f:
        line_data += line
        if line == '狗狗\n':
            line_data += '\t常见问题\n'
            for i in range(0,10,2):
                line_data += ('\t\t' + urls[i][0] + '\n')
        if line == '猫猫\n':
            line_data += '\t常见问题\n'
            for i in range(10,len(urls),2):
                line_data += ('\t\t' + urls[i][0] + '\n')
    f.close()

with open('catalogue.txt','w', encoding='ANSI') as f:
    f.write(line_data)
    f.close()'''
|