You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
python_pac/最终版本1.0.py

158 lines
4.8 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# @Time : 2021/11/5 13:57
# @Author :wenkaic
# @File : 002草稿
# @Project : pythonProject4
import urllib.request
from bs4 import BeautifulSoup
from urllib import parse
from wordcloud import WordCloud
import numpy as np
from PIL import Image
from lxml import etree
# Convert Arabic numerals to Chinese numerals.
# NOTE(review): the numeral characters were blank in the reviewed copy of this
# file; the standard characters are restored here -- confirm they match the
# chapter titles used by the target site.
_CN_DIGITS = {"1": "一", "2": "二", "3": "三", "4": "四", "5": "五",
              "6": "六", "7": "七", "8": "八", "9": "九"}
# Unit suffix keyed by the 1-based position counted from the right.
_CN_UNITS = {1: "", 2: "十", 3: "百", 4: "千", 5: "万"}
_CN_ZERO = "零"


def change(num):
    """Return the Chinese-numeral spelling of a non-negative integer.

    The number is converted digit by digit: every non-zero digit is followed
    by its positional unit, any run of interior zeros becomes a single "零",
    and trailing zeros are dropped.  The leading "一" is kept for 10-19
    (``"10"`` -> ``"一十"``), as the digit-by-digit algorithm has always done.

    Args:
        num: Decimal digits as a string (an ``int`` is also accepted).
            Leading zeros are ignored.

    Returns:
        The Chinese numeral string, e.g. ``"1623"`` -> ``"一千六百二十三"``.

    Raises:
        ValueError: If ``num`` is empty, contains anything other than the
            ASCII digits 0-9, or has more than five significant digits.
    """
    digits = str(num).strip()
    if not (digits.isascii() and digits.isdigit()):
        raise ValueError(f"expected a string of decimal digits, got {num!r}")

    digits = digits.lstrip("0")
    if not digits:
        return _CN_ZERO
    if len(digits) > len(_CN_UNITS):
        raise ValueError(f"numbers above 99999 are not supported: {num!r}")

    parts = []
    for offset, digit in enumerate(digits):
        if digit == "0":
            # Collapse consecutive zeros so 1001 reads 一千零一, not 一千零零一.
            if parts[-1] != _CN_ZERO:
                parts.append(_CN_ZERO)
        else:
            parts.append(_CN_DIGITS[digit] + _CN_UNITS[len(digits) - offset])

    # A zero left at the end stands for trailing zeros, which are not spoken.
    if parts[-1] == _CN_ZERO:
        parts.pop()
    return "".join(parts)
# Building the word cloud needs the processed text that was saved to disk.
def Word_cloud():
    """Render a word cloud from the saved novel text.

    Reads the mask image '006.png' and the text file '003斗破苍穹.txt' from
    the current working directory and writes the rendered image to
    '004dou.png'.  Both input files are closed as soon as they have been
    read, even if rendering fails afterwards.
    """
    # Copy the pixels into an array before the image file is closed.
    with Image.open('006.png') as mask_image:
        mask = np.array(mask_image)

    with open('003斗破苍穹.txt', 'r', encoding='utf-8') as fp:
        text = fp.read()

    w = WordCloud(font_path='方正粗黑宋简体.ttf', width=650, height=700,
                  contour_color='yellow', contour_width=2, mask=mask,
                  max_words=500)
    w.generate(text)
    w.to_file('004dou.png')
# Save the scraped text to a file, or print it directly.
def save(n, soup):
    """Write or print the text of every ``th``/``tr`` element in ``soup``.

    Args:
        n: Command string.  '1' appends the text to '003斗破苍穹.txt';
            '2' prints each element's text on its own line; anything else
            only reports that nothing was done.
        soup: Parsed document exposing ``select()``; each selected element
            must provide ``get_text()``.
    """
    texts = [node.get_text().strip() for node in soup.select('th,tr')]

    if n == '1':
        print('执行指令1')
        # Open the file only when something is written, so the print-only
        # and error paths no longer create an empty output file.
        with open('003斗破苍穹.txt', 'a', encoding='utf-8') as fp:
            fp.writelines(texts)
    elif n == '2':
        print('执行指令2')
        for text in texts:
            print(text)
    else:
        print('出现问题', n)
        print('没有执行')
url = 'http://www.ddxs.com/doupocangqiong/'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40",
}

# Menu shown once at start-up.
_STARTUP_MENU = ('输入1保存在文件夹中\n'
                 '输入2直接输出\n'
                 '输入3词云(文件会下载到本地)\n'
                 '输入4查询章节\n'
                 '输入5显示提示信息\n'
                 '输入6退出')

# Menu shown for command 5 and after an unknown command.
_HELP_MENU = ('输入1保存在文件夹中\n'
              '输入2直接输出\n'
              '输入3词云\n'
              '输入4查询章节\n'
              '输入5显示提示信息\n'
              '输入6退出')

# Search keywords replaced before looking a chapter up in the link texts.
# NOTE(review): presumably these mirror irregular titles on the site; the
# second key contains 两, which change() may never produce -- verify.
_CHAPTER_KEYWORD_OVERRIDES = {
    '第一百章': '一百零章',
    '第两百零两章': '第两百零二章',
    '第一千零一十一章': '第一千零十一章',
}

_FIRST_CHAPTER = 1
_LAST_CHAPTER = 1623


def _fetch_index_page():
    """Download the page at ``url`` and return it decoded as UTF-8."""
    # Customise the request object with the browser-like headers.
    request = urllib.request.Request(url=url, headers=headers)
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')


def _chapter_keyword(chapter_no):
    """Return the text searched for in the link list for ``chapter_no``."""
    # The prefix/suffix were blank in the reviewed copy; 第…章 is restored to
    # match the override keys above.
    keyword = '第' + change(str(chapter_no)) + '章'
    return _CHAPTER_KEYWORD_OVERRIDES.get(keyword, keyword)


def _query_chapter(content):
    """Ask for a chapter number and print the first link text matching it.

    The lookup works on the raw, unprocessed page ``content``.
    """
    print('指令4执行')
    titles = etree.HTML(content).xpath('//body//a/text()')

    raw = input("\n请输入数字1-1623")
    try:
        chapter_no = int(raw)
    except ValueError:
        # Non-numeric input is reported like an out-of-range number.
        chapter_no = None
    if chapter_no is None or not _FIRST_CHAPTER <= chapter_no <= _LAST_CHAPTER:
        print("输入范围错误!\n")
        return

    # Convert the Arabic number into the Chinese numeral used for searching.
    keyword = _chapter_keyword(chapter_no)
    print('n', keyword)
    for title in titles:
        if keyword in title:
            print(title)
            break
    else:
        print('未找到章节', keyword)


def main():
    """Fetch the index page once, then serve menu commands until '6'."""
    # Raw page text, kept for the chapter query.
    content = _fetch_index_page()
    soup = BeautifulSoup(content, 'lxml')

    print(_STARTUP_MENU)
    while True:
        print('请输入指令:')
        command = input()
        if command in ('1', '2'):
            save(command, soup)
        elif command == '3':
            # The word cloud is built from the saved file, so save first.
            print('指令3执行')
            save('1', soup)
            Word_cloud()
        elif command == '4':
            _query_chapter(content)
        elif command == '5':
            print(_HELP_MENU)
        elif command == '6':
            print('退出程序')
            break
        else:
            print('!!输入了错误指令!!')
            print('==================')
            print(_HELP_MENU)
            print('==================')


if __name__ == '__main__':
    main()
# url1 = 'http://www.ddxs.com/doupocangqiong/'
# url2 = '/doupocangqiong/1656.html'
# new = urllib.parse.urljoin(url1,url2)
# print(new)
# from urllib import parse
#
# page_url = 'http://www.ddxs.com/doupocangqiong/'
# new_url = '/doupocangqiong/1656.html'
#
# new_full_url = parse.urljoin(page_url, new_url)
# print(new_full_url)