master
p36049127 5 years ago
parent 013f9478e9
commit 747e532c00

@ -0,0 +1,176 @@
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import re
def get_html(url):
    """Download *url* and return the response body as text.

    The request is sent with the module-level ``headers`` dict and the body
    is decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or ``None`` when the request fails (an error
        message is printed in that case).
    """
    try:
        # A timeout keeps one unresponsive server from hanging the whole run.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException:
        # Only network/HTTP-layer failures are treated as "bad URL"; things
        # like KeyboardInterrupt are no longer swallowed.
        print('请求网址出错')
        return None
    # The previous 'GBK' assignment was immediately overwritten, so only the
    # effective encoding is kept.
    response.encoding = 'utf-8'
    return response.text
# Browser-like request headers passed to requests.get() by get_html().
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    ),
}
def _clean_section_text(text):
    """Normalise the accumulated paragraph text of one section.

    Applies, in order: removal of footnote-style lines (a newline followed by
    ``[digits]`` up to the next newline), removal of U+3000 ideographic
    spaces, removal of runs of two or more newlines, and finally makes sure a
    non-empty result ends with a newline.
    """
    # Each match is removed together with its surrounding newlines, exactly
    # as the matched string was found.
    for ref in re.findall(r'\n\[\d+\].*?\n', text, re.S):
        text = text.replace(ref, '')
    text = text.replace('\u3000', '')
    for run in re.findall(r'\n\n+', text, re.S):
        text = text.replace(run, '')
    if text != '' and text[-1] != '\n':
        text = text + '\n'
    return text


def make(url):
    """Fetch a page and split it into titled sections.

    The page is scanned for elements whose CSS class is ``para`` or
    ``anchor-list``. Consecutive ``para`` elements are merged into one text
    block (each paragraph prefixed with a tab); an ``anchor-list`` element
    supplies the title and number used for the entries that follow it.

    Args:
        url: Address of the page to scrape.

    Returns:
        A list of ``[title, text, num]`` lists. ``num`` is the integer after
        the first ``_`` in the first anchor's ``name`` attribute, or 0 when
        that name contains no underscore. An empty list is returned when the
        page could not be downloaded.
    """
    html = get_html(url)
    if html is None:
        # get_html() already reported the failure; nothing to parse.
        return []
    soup = BeautifulSoup(html, 'lxml')
    cata = []
    subs = soup.find_all(class_=['para', 'anchor-list'])
    i = 0
    part1 = ''
    num = 0
    while i < len(subs):
        # Collect the run of paragraphs that belongs to the current heading.
        cont = ''
        # The bounds check matters: a page that ends with a paragraph would
        # otherwise index past the end of ``subs``.
        while i < len(subs) and subs[i]['class'][0] == 'para':
            # Drop embedded <div class="description"> blocks (the original
            # comment marks these as image-related) before reading the text.
            for caption in subs[i].find_all("div", {"class": "description"}):
                caption.extract()
            txt = subs[i].text.strip('\n')
            if txt != '':
                cont = cont + '\t' + txt + '\n'
            i = i + 1
        part2 = _clean_section_text(cont)
        if part1 != '' or part2 != '':
            cata.append([part1, part2, num])
        if i >= len(subs):
            break
        # Heading element: update the title/number for the next section.
        if subs[i]['class'][0] == 'anchor-list':
            tag = subs[i].find_all('a')
            if len(tag) >= 3:
                anchor = str(tag[0]['name'])
                if '_' in anchor:
                    num = int(anchor[anchor.find('_') + 1:])
                else:
                    num = 0
                part1 = str(tag[2]['name'])
        # Always advance, so an element whose first class is neither 'para'
        # nor 'anchor-list' cannot stall the loop.
        i = i + 1
    return cata
def write(txt, name):
    """Write *txt* to the file ``name + '.txt'``, replacing any old content.

    Args:
        txt: Text to store (encoded as UTF-8).
        name: Target path without the ``.txt`` extension.
    """
    # The with-statement closes the file; no explicit close() is needed.
    with open(name + '.txt', 'w', encoding='UTF-8') as f:
        f.write(txt)
def append(txt, name):
    """Append *txt* to the file ``name + '.txt'``, creating it if needed.

    Args:
        txt: Text to add at the end of the file (encoded as UTF-8).
        name: Target path without the ``.txt`` extension.
    """
    # The with-statement closes the file; no explicit close() is needed.
    with open(name + '.txt', 'a', encoding='UTF-8') as f:
        f.write(txt)
def _format_subsection(entry):
    """Return the text written for one numbered sub-section entry.

    Untitled entries are introduced by their number ("3. text"); titled ones
    by their title followed by a colon and a newline.
    """
    if entry[0] == '':
        # entry[2] is an int, so it has to be converted before concatenation.
        return str(entry[2]) + '. ' + entry[1]
    return entry[0] + ':\n' + entry[1]


def unify(content, name, path):
    """Write the sections in *content* to one text file per top-level group.

    An entry whose number (index 2) is 0 starts a new group; the entries with
    a non-zero number that follow it are added to the same file. Files are
    named ``path + name + '_(' + k + ')' + title`` with ``k`` counting groups
    from 1. A file is only created once a non-empty text is found; groups
    without any text are recorded by appending their index to the
    module-level list ``delecnt``.

    Args:
        content: List of ``[title, text, num]`` entries.
        name: Base name used in every output file name.
        path: Prefix (folder) prepended to every output file name.

    Note:
        ``delecnt`` must be defined at module level by the caller before this
        function runs.
    """
    cnt = 1
    i = 0
    while i < len(content):
        notxt = True
        if content[i][2] == 0:
            txtname = path + name + '_(' + str(cnt) + ')' + content[i][0]
            cnt += 1
            if content[i][1] != '':
                write(content[i][1], txtname)
                notxt = False
            i += 1
        else:
            # The list starts with a sub-section that has no preceding
            # top-level entry: give it an untitled group of its own so that
            # ``txtname`` is always defined below.
            txtname = path + name + '_(' + str(cnt) + ')'
            cnt += 1
        while i < len(content) and content[i][2] != 0:
            if notxt:
                # Nothing written yet: the first non-empty text creates
                # (overwrites) the file.
                if content[i][1] != '':
                    notxt = False
                    write(_format_subsection(content[i]), txtname)
            else:
                append(_format_subsection(content[i]), txtname)
            i += 1
        if notxt:
            delecnt.append(cnt - 1)
def cataloguegenerate(path):
    """Append a category heading line to ``catalogue.txt``.

    The heading is the first two characters of *path* followed by a newline.

    Args:
        path: Output folder prefix of the category.
    """
    # NOTE(review): only two characters are kept, so a longer folder name is
    # written truncated -- confirm this is intended.
    with open('catalogue.txt', 'a', encoding='UTF-8') as f:
        f.write(path[0:2] + '\n')
def catalogueappend(content, name, path):
    """Append one item and its top-level section titles to ``catalogue.txt``.

    The item name is written with one leading tab, followed by one line with
    two leading tabs for every entry that has number 0 and a non-empty title.
    A counter starting at 1 is incremented for each such entry; titles whose
    counter value appears in the module-level ``delecnt`` list are skipped.

    Args:
        content: List of ``[title, text, num]`` entries.
        name: Name of the item being catalogued.
        path: Unused; kept so existing callers keep working.
    """
    # ``delecnt`` is only filled by unify(); when the caller never defined
    # it, treat that as "nothing to skip" instead of raising NameError.
    skipped = globals().get('delecnt', ())
    with open('catalogue.txt', 'a', encoding='UTF-8') as f:
        f.write('\t' + name + '\n')
        cnt = 1
        for entry in content:
            if entry[2] == 0 and entry[0] != '':
                cnt += 1
                if cnt not in skipped:
                    f.write('\t\t' + entry[0] + '\n')
# Script body: for every category, read its list file, record each item name
# in namelist.txt, dump the scraped sections to tmp\<name>.txt and add the
# item to catalogue.txt.
#
# Each tuple is (list file, output folder prefix, label used in namelist.txt).
_CATEGORIES = (
    ('dogs.txt', '狗狗\\', '狗'),
    ('cats.txt', '猫猫\\', '猫'),
    ('else.txt', '其他动物\\', '另'),
)

with open("namelist.txt", "w", encoding="UTF-8") as namelist:
    for _listfile, path, _label in _CATEGORIES:
        with open(_listfile, 'r', encoding='ANSI') as f:
            # The heading is written only after the list file opened
            # successfully, as before.
            cataloguegenerate(path)
            lines = f.readlines()
        for _raw in lines:
            if not _raw.strip():
                # A blank line (e.g. at the end of the file) is not an entry.
                continue
            # SECURITY: eval() executes whatever the list file contains.
            # Only use trusted list files; if the lines are plain literals,
            # ast.literal_eval would be the safe replacement.
            line = eval(_raw)
            # catalogueappend() consults the module-level ``delecnt`` list.
            # Only unify() fills it, and unify() is not used in this
            # pipeline, so it starts empty for every item.
            delecnt = []
            namelist.write(_label + ":" + line[0] + "\n")
            # Scrape once and reuse the result; calling make() twice
            # downloaded every page twice.
            _sections = make(line[1])
            write(str(_sections), "tmp\\" + line[0])
            catalogueappend(_sections, line[0], path)
Loading…
Cancel
Save