parent
013f9478e9
commit
747e532c00
@ -0,0 +1,176 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
import re
|
||||
|
||||
def get_html(url, timeout=30):
    """Download *url* and return the response body decoded as UTF-8.

    The request is sent with the module-level ``headers`` dictionary.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so that an
            unresponsive host cannot block the script forever.

    Returns:
        The page text, or ``None`` when the request failed (an error message
        is printed in that case).
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Only network/HTTP-layer failures are treated as "request failed";
        # programming errors are no longer silently swallowed.
        print('请求网址出错')
        return None
    # Force UTF-8 decoding instead of relying on the guessed encoding.
    response.encoding = 'utf-8'
    return response.text
|
||||
|
||||
# HTTP request headers sent by get_html(); only a desktop-browser User-Agent
# string is set.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',}
|
||||
|
||||
|
||||
def _clean_section_text(text):
    """Return *text* with citation lines, ideographic spaces and blank-line runs removed."""
    # Remove lines that begin with a citation marker such as "[3]".  The match
    # includes the newline on both sides, so neighbouring lines are joined.
    for marker in re.findall(r'\n\[\d+\].*?\n', text, re.S):
        text = text.replace(marker, '')
    # Remove ideographic (full-width) spaces.
    text = text.replace('\u3000', '')
    # Remove every run of two or more consecutive newlines.
    for run in re.findall(r'\n\n+', text, re.S):
        text = text.replace(run, '')
    # A non-empty section always ends with exactly one trailing newline.
    if text != '' and text[-1] != '\n':
        text = text + '\n'
    return text


def make(url):
    """Scrape the page at *url* into a list of sections.

    The page is walked in document order over every element whose class list
    contains ``para`` or ``anchor-list``.  Consecutive ``para`` elements are
    merged into one text block; an ``anchor-list`` element starts a new
    section and supplies its title and number.

    Args:
        url: Address of the page to scrape.

    Returns:
        A list of ``[title, text, number]`` entries.  ``title`` is the
        ``name`` attribute of the third ``<a>`` in the preceding anchor list
        ('' before the first anchor), ``text`` is the cleaned paragraph text
        ('' when the section has none) and ``number`` is the integer after
        the first ``_`` in the first anchor's ``name`` (0 when there is no
        ``_``).  An empty list is returned when the page could not be
        downloaded.
    """
    html = get_html(url)
    if html is None:
        # get_html() already reported the failure; nothing to parse.
        return []

    soup = BeautifulSoup(html, 'lxml')
    cata = []
    subs = soup.find_all(class_=['para', 'anchor-list'])

    i = 0
    part1 = ''
    num = 0
    while i < len(subs):
        # Collect the text of every consecutive paragraph.  The bounds check
        # keeps a page that ends on a paragraph from indexing past the end.
        cont = ''
        while i < len(subs) and subs[i]['class'][0] == 'para':
            # Drop image captions before reading the paragraph text.
            for caption in subs[i].find_all("div", {"class": "description"}):
                caption.extract()
            txt = subs[i].text.strip('\n')
            if txt != '':
                cont = cont + '\t' + str(txt) + '\n'
            i = i + 1

        part2 = _clean_section_text(cont)
        if part1 != '' or part2 != '':
            cata.append([part1, part2, num])

        # Section heading: remember its title and number for the next entry.
        if i < len(subs) and subs[i]['class'][0] == 'anchor-list':
            tag = subs[i].find_all('a')
            if len(tag) >= 3:
                anchor_name = str(tag[0]['name'])
                if '_' in anchor_name:
                    pos = anchor_name.find('_')
                    num = int(anchor_name[pos + 1:])
                else:
                    num = 0
                part1 = str(tag[2]['name'])
        i = i + 1
    return cata
|
||||
|
||||
def write(txt, name):
    """Create or overwrite the file ``name + '.txt'`` with *txt*.

    Args:
        txt: Text to store; written as UTF-8.
        name: Target path without the ``.txt`` extension.
    """
    # The ``with`` block closes the file, so no explicit close() is needed.
    with open(name + '.txt', 'w', encoding='UTF-8') as f:
        f.write(txt)
|
||||
|
||||
def append(txt, name):
    """Append *txt* to the file ``name + '.txt'``, creating it if necessary.

    Args:
        txt: Text to add; written as UTF-8.
        name: Target path without the ``.txt`` extension.
    """
    # The ``with`` block closes the file, so no explicit close() is needed.
    with open(name + '.txt', 'a', encoding='UTF-8') as f:
        f.write(txt)
|
||||
|
||||
def unify(content, name, path):
    """Write each top-level section of *content* to its own text file.

    *content* is a list of ``[title, text, number]`` entries as returned by
    make().  An entry whose number is 0 starts a new file named
    ``path + name + '_(' + k + ')' + title`` (``k`` counts files from 1);
    the entries with a non-zero number that follow it are added to the same
    file, each introduced by ``title + ':'`` or, when it has no title, by
    ``number + '. '``.

    The 1-based index of every file that ended up with no text at all is
    appended to the module-level list ``delecnt``.

    Args:
        content: Section entries to distribute over files.
        name: Base name shared by all files of this page.
        path: Directory prefix prepended to every file name.
    """
    # ``delecnt`` is shared with catalogueappend(); create it on first use so
    # that calling unify() does not depend on the caller having defined it.
    empty_sections = globals().setdefault('delecnt', [])

    cnt = 1
    i = 0
    while i < len(content):
        notxt = True
        is_header = content[i][2] == 0
        # Subsections that appear before any top-level entry still need a
        # target file; they go to a numbered file without a title.
        txtname = path + name + '_(' + str(cnt) + ')' + (content[i][0] if is_header else '')
        cnt += 1
        if is_header:
            if content[i][1] != '':
                write(content[i][1], txtname)
                notxt = False
            i += 1

        while i < len(content) and content[i][2] != 0:
            title, text, number = content[i][0], content[i][1], content[i][2]
            # Until the file has received some text, empty entries are
            # skipped; afterwards every entry is appended.
            if not notxt or text != '':
                # ``number`` is an int, so it must be converted before it is
                # joined with the text.
                prefix = str(number) + '. ' if title == '' else title + ':\n'
                if notxt:
                    write(prefix + text, txtname)
                    notxt = False
                else:
                    append(prefix + text, txtname)
            i += 1

        if notxt:
            empty_sections.append(cnt - 1)
|
||||
|
||||
def cataloguegenerate(path):
    """Append a category heading line to ``catalogue.txt``.

    The heading is the first two characters of *path* followed by a newline.

    Args:
        path: Category directory prefix, e.g. ``'狗狗\\'``.
    """
    # NOTE(review): for a longer prefix such as '其他动物\\' only the first two
    # characters are written — confirm that the truncation is intended.
    with open('catalogue.txt', 'a', encoding='UTF-8') as f:
        f.write(path[0:2] + '\n')
|
||||
|
||||
def catalogueappend(content, name, path):
    """Append one page and its top-level section titles to ``catalogue.txt``.

    The page name is written with one leading tab, followed by one line with
    two leading tabs for every entry of *content* that has number 0 and a
    non-empty title.  Titled sections are counted starting from 2; a section
    whose count is listed in the module-level ``delecnt`` list is left out.

    Args:
        content: ``[title, text, number]`` entries as returned by make().
        name: Page name to list.
        path: Unused; kept so existing callers keep working.
    """
    # ``delecnt`` only exists if the script or unify() created it; a missing
    # list means that no section has been marked as empty.
    skipped = globals().get('delecnt', ())
    with open('catalogue.txt', 'a', encoding='UTF-8') as f:
        f.write('\t' + name + '\n')
        cnt = 1
        for entry in content:
            if entry[2] == 0 and entry[0] != '':
                cnt += 1
                if cnt not in skipped:
                    f.write('\t\t' + entry[0] + '\n')
|
||||
|
||||
|
||||
# One row per category: (list file to read, output directory prefix, label
# written in front of each name in namelist.txt).
_ANIMAL_LISTS = (
    ('dogs.txt', '狗狗\\', '狗'),
    ('cats.txt', '猫猫\\', '猫'),
    ('else.txt', '其他动物\\', '另'),
)

with open("namelist.txt", "w", encoding="UTF-8") as namelist:
    for list_file, path, label in _ANIMAL_LISTS:
        # NOTE(review): 'ANSI' is presumably the Windows code-page codec
        # alias — confirm before running this on another platform.
        with open(list_file, 'r', encoding='ANSI') as f:
            cataloguegenerate(path)
            lines = f.readlines()

        for raw in lines:
            # NOTE(security): eval() executes whatever the list file holds.
            # Each line is expected to evaluate to a (name, url) pair; only
            # run this on list files you trust.
            line = eval(raw)

            # Section numbers that unify() marks as empty; read by
            # catalogueappend().  Reset for every page.
            delecnt = []

            namelist.write(label + ":" + line[0] + "\n")

            # Scrape the page once and reuse the result for both outputs.
            content = make(line[1])
            # unify(content, line[0], path) is the per-section writer; it is
            # currently disabled in favour of one raw dump per page.
            write(str(content), "tmp\\" + line[0])
            catalogueappend(content, line[0], path)
|
||||
Loading…
Reference in new issue