|
|
|
@ -1,157 +0,0 @@
|
|
|
|
|
# @Time : 2021/11/5 13:57
|
|
|
|
|
# @Author :wenkaic
|
|
|
|
|
# @File : 002草稿
|
|
|
|
|
# @Project : pythonProject4
|
|
|
|
|
import urllib.request
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
from urllib import parse
|
|
|
|
|
from wordcloud import WordCloud
|
|
|
|
|
import numpy as np
|
|
|
|
|
from PIL import Image
|
|
|
|
|
from lxml import etree
|
|
|
|
|
|
|
|
|
|
# Convert an Arabic-digit number to Chinese numerals
|
|
|
|
|
def change(num):
    """Convert a string of decimal digits to a Chinese numeral string.

    Each non-zero digit is emitted followed by its positional unit
    (十, 百, 千, 万); the digit 2 is always rendered as "两". A run of
    zeros between non-zero digits is emitted as a single "零", and
    trailing zeros are dropped, e.g. "1005" -> "一千零五", "100" -> "一百".

    Args:
        num: The number as a string of ASCII digits (e.g. "1623").

    Returns:
        The Chinese numeral string. An empty ``num`` yields "", and an
        all-zero ``num`` (e.g. "0") yields "零".

    Raises:
        ValueError: If ``num`` contains a character ``int()`` cannot parse.
        KeyError: If a non-zero digit sits above the ten-thousands place
            (only five positional units are defined).
    """
    dic_num = {"1": "一", "2": "两", "3": "三", "4": "四", "5": "五",
               "6": "六", "7": "七", "8": "八", "9": "九"}
    dic_unit = {1: "", 2: "十", 3: "百", 4: "千", 5: "万"}

    fs = []
    lennum = len(num)  # position (counted from the right) of the current digit
    for item in num:
        if int(item) > 0:
            fs.append(dic_num[item])
            fs.append(dic_unit[lennum])
        elif not fs or fs[-1] != "零":
            # Collapse consecutive zeros so every "x00y" reads "x千零y",
            # not only the 1001-1009 range.
            fs.append("零")
        lennum -= 1

    # Trailing zeros are not spoken ("100" -> "一百"). The `fs and` guard
    # keeps an all-zero input from popping an empty list.
    while fs and fs[-1] == "零":
        fs.pop()

    if not fs:
        return "零" if num else ""
    return "".join(fs)
|
|
|
|
|
|
|
|
|
|
# Build the word cloud; needs the processed content
|
|
|
|
|
def Word_cloud():
    """Render a word cloud image from the saved text file.

    Reads the mask image '006.png' and the text in '003斗破苍穹.txt',
    builds a WordCloud with the configured font, size, contour and
    word limit, and writes the result to '004dou.png'. All paths are
    relative to the current working directory.
    """
    # Context managers make sure both handles are released even if
    # loading or rendering raises.
    with Image.open('006.png') as img:
        mask = np.array(img)
    with open('003斗破苍穹.txt', 'r', encoding='utf-8') as fp:
        text = fp.read()

    w = WordCloud(font_path='方正粗黑宋简体.ttf', width=650, height=700,
                  contour_color='yellow', contour_width=2, mask=mask,
                  max_words=500)
    w.generate(text)
    w.to_file('004dou.png')
|
|
|
|
|
|
|
|
|
|
# Save the scraped text to a file, or print it directly
|
|
|
|
|
def save(n, soup):
    """Write or print the text of every 'th'/'tr' element in ``soup``.

    Args:
        n: Command string. '1' appends the stripped text of each selected
            element to '003斗破苍穹.txt' (UTF-8, no separator between
            elements); '2' prints each stripped text on its own line;
            anything else prints a short error notice.
        soup: Object exposing ``select(css_selector)`` whose results
            expose ``get_text()`` (a BeautifulSoup document in this script).
    """
    bbs = soup.select('th,tr')

    if n == '1':
        # Open the file only when actually writing, so the print-only and
        # error paths no longer create an empty file; `with` closes the
        # handle even if a write raises.
        with open('003斗破苍穹.txt', 'a', encoding='utf-8') as fp:
            print('执行指令1')
            for obj in bbs:
                fp.write(obj.get_text().strip())
    elif n == '2':
        print('执行指令2')
        for obj in bbs:
            print(obj.get_text().strip())
    else:
        print('出现问题', n)
        print('没有执行')
|
|
|
|
|
# Index page of the novel; fetched once at start-up.
url = 'http://www.ddxs.com/doupocangqiong/'

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40",
}

# Build the request object with the custom headers.
request = urllib.request.Request(url=url, headers=headers)

# Fetch the raw (unprocessed) page; the `with` block closes the HTTP
# response once the body has been read.
with urllib.request.urlopen(request) as response:
    content = response.read().decode('utf-8')

# Parsed document used by the save/print commands.
soup = BeautifulSoup(content, 'lxml')
|
|
|
|
|
# Help text shown for command 5 and after an unknown command. Each option
# sits on its own line (options 5 and 6 used to run together because a
# newline was missing between them).
HELP_MENU = ('输入1:保存在文件夹中\n'
             '输入2:直接输出\n'
             '输入3:词云\n'
             '输入4:查询章节\n'
             '输入5:显示提示信息\n'
             '输入6:退出')

# Start-up menu. NOTE(review): it does not list option 6 (exit); left
# unchanged here.
print('输入1:保存在文件夹中\n'
      '输入2:直接输出\n'
      '输入3:词云(文件会下载到本地)\n'
      '输入4:查询章节\n'
      '输入5:显示提示信息')

# Interactive command loop: read a command, dispatch, repeat until '6'.
while True:
    print('请输入指令:')
    n = input()

    if n in ('1', '2'):
        # '1' appends the scraped text to the file, '2' prints it.
        save(n, soup)

    elif n == '3':
        # Build the word cloud: save the text first, then render from it.
        print('指令3执行')
        save('1', soup)
        Word_cloud()

    elif n == '4':
        # Look up a chapter; works on the unprocessed page content.
        print('指令4执行')
        tree = etree.HTML(content)
        link_texts = tree.xpath('//body//a/text()')

        # Convert the Arabic chapter number to Chinese numerals.
        num = input("\n请输入数字(1-1623):")
        try:
            chapter_no = int(num)
        except ValueError:
            # Non-numeric input used to crash the program; treat it like
            # an out-of-range number instead.
            chapter_no = 0
        if chapter_no < 1 or chapter_no > 1623:
            print("输入范围错误!\n")
            continue

        title = '第' + change(num) + '章'
        # Hard-coded replacements for three titles that do not follow
        # change()'s output (presumably matching how the site spells
        # them -- TODO confirm).
        if title == '第一百章':
            title = '一百零章'
        elif title == '第两百零两章':
            title = '第两百零二章'
        elif title == '第一千零一十一章':
            title = '第一千零十一章'

        print('n', title)
        print(link_texts)
        # Print the first link text containing the chapter title.
        for text in link_texts:
            if title in text:
                print(text)
                break

    elif n == '5':
        print(HELP_MENU)

    elif n == '6':
        print('退出程序')
        break

    else:
        print('!!输入了错误指令!!')
        print('==================')
        print(HELP_MENU)
        print('==================')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# url1 = 'http://www.ddxs.com/doupocangqiong/'
|
|
|
|
|
# url2 = '/doupocangqiong/1656.html'
|
|
|
|
|
# new = urllib.parse.urljoin(url1,url2)
|
|
|
|
|
# print(new)
|
|
|
|
|
# from urllib import parse
|
|
|
|
|
#
|
|
|
|
|
# page_url = 'http://www.ddxs.com/doupocangqiong/'
|
|
|
|
|
# new_url = '/doupocangqiong/1656.html'
|
|
|
|
|
#
|
|
|
|
|
# new_full_url = parse.urljoin(page_url, new_url)
|
|
|
|
|
# print(new_full_url)
|