爬虫2

5 years ago · 747e532c00
parent 013f9478e9
commit 747e532c00
1 changed files with 176 additions and 0 deletions
--- a/src/宠物资料.py
+++ b/src/宠物资料.py
@@ -0,0 +1,176 @@
+# -*- coding: utf-8 -*-
+
+from bs4 import BeautifulSoup
+import requests
+import re
+
+def get_html(url):
+    try:
+        response = requests.get(url,headers=headers)
+        response.encoding = 'GBK'
+        response.encoding = 'utf-8'
+#        response.encoding = 'gbk'
+        html = response.text
+        return html
+    except:
+        print('请求网址出错')
+
+headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',}
+
+
+def make(url):
+    soup = BeautifulSoup(get_html(url),'lxml')
+    cata = list()
+    subs = soup.find_all(class_ = ['para','anchor-list'])
+    
+    i=0
+    part1 = ''
+    num = 0
+    while i < len(subs):
+        #具体信息
+        part2 = ''
+        cont = ''
+        while(subs[i]['class'][0] == 'para'):
+            #过滤图片
+            [s.extract() for s in subs[i].find_all("div", {"class": "description"})]
+            txt = subs[i].text
+            while txt != '' and txt[0] == '\n':
+                txt = txt[1::] 
+            while txt != '' and txt[-1] == '\n':
+                txt = txt[:-1:] 
+            if txt != '':
+                cont = cont + '\t' + str(txt) + '\n'
+            i = i + 1
+        newcont = cont
+        #替换注释数字
+        res = re.findall('\n\[\d+\].*?\n',newcont,re.S)
+        for r in res:
+            cont = newcont.replace(r,'')
+            newcont = cont
+        #替换\u3000
+        cont = newcont.replace('\u3000','')
+        newcont = cont
+        #删除连续换行符
+        res = re.findall('\n\n+',newcont,re.S)
+        for r in res:
+            cont = newcont.replace(r,'')
+            newcont = cont
+        #添加
+        if newcont != '':
+            if newcont[-1] != '\n':
+                newcont = newcont + '\n'
+            part2 = newcont
+        if part1 != '' or part2 != '': 
+            cata.append([part1,part2,num])
+        #分类名
+        if subs[i]['class'][0] == 'anchor-list':
+            tag = subs[i].find_all('a')
+            if len(tag) >= 3:
+                if '_' in str(tag[0]['name']):
+                    pos = str(tag[0]['name']).find('_')
+                    num = int(str(tag[0]['name'])[pos+1::])
+                else:
+                    num = 0
+                part1 = str(tag[2]['name'])
+            i = i + 1
+    return cata
+
+def write(txt,name):
+    with open(name+'.txt', 'w', encoding='UTF-8') as f:
+        f.write(txt)
+        f.close()
+        
+def append(txt,name):
+    with open(name+'.txt', 'a', encoding='UTF-8') as f:
+        f.write(txt)
+        f.close()
+
+def unify(content,name,path):
+    cnt = 1
+    i = 0
+    while i < len(content):
+        notxt = True
+        if content[i][2] == 0:
+            txtname = path + name + '_(' + str(cnt) + ')' + content[i][0]
+            cnt += 1
+            if content[i][1] != '':
+                write(content[i][1],txtname)
+                notxt = False
+            i += 1
+        while i < len(content) and content[i][2] != 0:
+            if notxt:
+                if content[i][1] != '':
+                    notxt = False
+                    if content[i][0] == '':
+                        write(content[i][2] + '. ' + content[i][1],txtname)
+                    else:
+                        write(content[i][0] + ':\n' + content[i][1],txtname)
+            else:
+                if content[i][0] == '':
+                    append(content[i][2] + '. ' + content[i][1],txtname)
+                else:
+                    append(content[i][0] + ':\n' + content[i][1],txtname)
+            i += 1
+        if notxt:
+            delecnt.append(cnt-1)
+            
+def cataloguegenerate(path):
+    with open('catalogue.txt','a',encoding = 'UTF-8') as f:
+        f.write(path[0:2] + '\n')
+        f.close()
+
+def catalogueappend(content,name,path):
+    with open('catalogue.txt','a',encoding = 'UTF-8') as f:  
+        f.write('\t' + name + '\n')
+        cnt = 1
+        for i in range(len(content)):
+            if content[i][2] == 0 and content[i][0] != '':
+                cnt += 1
+                if not cnt in delecnt:
+                    f.write('\t\t' + content[i][0] + '\n')
+        f.close()
+        
+        
+with open("namelist.txt","w",encoding = "UTF-8") as namelist:
+    with open('dogs.txt','r',encoding = 'ANSI') as f:
+        path = '狗狗\\'
+        cataloguegenerate(path)
+        lines = f.readlines()
+        for i in range(len(lines)):
+            line = eval(lines[i])
+            #delecnt = list()
+            namelist.write("狗:"+line[0]+"\n")
+            #unify(make(line[1]),line[0],path)
+            #write(str(line[0])+'\n',"tmp\\"+line[0])
+            write(str(make(line[1])),"tmp\\" + line[0])
+            catalogueappend(make(line[1]),line[0],path)
+    f.close()
+    
+    with open('cats.txt','r',encoding = 'ANSI') as f:
+        path = '猫猫\\'
+        cataloguegenerate(path)
+        lines = f.readlines()
+        for i in range(len(lines)):
+            line = eval(lines[i])
+            #delecnt = list()
+            namelist.write("猫:"+line[0]+"\n")
+            #unify(make(line[1]),line[0],path)
+            #write(str(line[0])+'\n',"tmp\\"+line[0])
+            write(str(make(line[1])),"tmp\\" + line[0])
+            catalogueappend(make(line[1]),line[0],path)
+    f.close()
+    
+    with open('else.txt','r',encoding = 'ANSI') as f:
+        path = '其他动物\\'
+        cataloguegenerate(path)
+        lines = f.readlines()
+        for i in range(len(lines)):
+            line = eval(lines[i])
+            #delecnt = list()
+            namelist.write("另:"+line[0]+"\n")
+            #unify(make(line[1]),line[0],path)
+            #write(str(line[0])+'\n',"tmp\\"+line[0])
+            write(str(make(line[1])),"tmp\\" + line[0])
+            catalogueappend(make(line[1]),line[0],path)
+    f.close()
+namelist.close()