import re
import requests
from fontTools.ttLib import TTFont
from lxml import etree

# Output files (append mode): novel genre, collection count, author name.
fp1 = open("类型.txt", "a", encoding="utf-8")
fp2 = open("收藏数.txt", "a", encoding="utf-8")
fp3 = open("作者名.txt", "a", encoding="utf-8")

for i in range(1, 2):  # only page 1 for now; the commented URL below takes a page number
    # url = f'https://www.qidian.com/rank/vipcollect?page={i}'
    url = 'https://www.qidian.com/rank/vipcollect/'
    response = requests.get(url)
    html = response.text
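    # Assumption: qidian.com may serve an empty or blocked page to clients without a
    # browser-like User-Agent. If 解密前.html comes back empty, passing headers on the
    # request above is a common workaround, e.g.:
    #     response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})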
    # 'w': write-only, creating the file if it does not exist
    with open('解密前.html', mode='w', encoding='utf-8') as f:
        f.write(html)

    # Re-read the saved page, then locate and download the anti-spider font file.
    with open('解密前.html', mode='r', encoding='utf-8') as f:
        html = f.read()

    # Regex that extracts the font file's download URL from the page source.
    font_url = re.findall(r"\('eot'\); src: url\('(.*?)'\) format\('woff'\)", html)[0]
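    # Assumed markup: the page embeds an @font-face rule whose src chain looks roughly like
    #     ... format('eot'); src: url('https://qidian.gtimg.com/qd_anti_spider/XXXX.woff') format('woff') ...
    # so the capture group grabs the .woff URL between url(' and ').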
    # print(font_url)
    # font_url = 'https://qidian.gtimg.com/qd_anti_spider/fTDYLCSL.woff'
    # Download the font file.
    font_response = requests.get(font_url)
    font_path = font_url.split('/')[-1]  # split on '/' and keep the last element, e.g. 'piQOMNSL.woff', as the local file name
    with open(font_path, mode='wb') as f:  # binary mode
        f.write(font_response.content)
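    # Optional check (sketch): fail fast if the font download did not succeed.
    # font_response.raise_for_status()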
    # Inspect the font with fontTools.
    # font_path = 'piQOMNSL.woff'
    fi = TTFont(font_path)  # open font_path from the current directory, i.e. "piQOMNSL.woff"
    fi.saveXML('font.xml')  # save a readable copy as font.xml

    # Parse the font and build the glyph-name-to-digit mapping.
    font_map = fi['cmap'].getBestCmap()
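    # getBestCmap() returns a dict keyed by Unicode code point (int) with glyph names
    # as values, e.g. {100070: 'five', ...} for this anti-spider font.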
    d = {
        'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5', 'six': '6', 'seven': '7',
        'eight': '8', 'nine': '9', 'zero': '0', 'period': '.'
    }
    # Convert the dict values from glyph names to digit strings.
    for key in font_map.keys():
        # key: 100070
        # font_map[key]: 'five'
        # d[font_map[key]]: '5'
        font_map[key] = d[font_map[key]]
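    # Defensive variant (sketch): if the font ever ships a glyph name that is not in d,
    # the loop above raises KeyError; dict.get keeps the raw name visible instead:
    # font_map = {cp: d.get(name, name) for cp, name in fi['cmap'].getBestCmap().items()}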
    # Use font_map to replace the obfuscated numeric character references in the page
    # with the decoded digits, e.g. '&#100070;' becomes '5'.
    for key, value in font_map.items():
        # print(key, value)
        html = html.replace('&#' + str(key) + ';', str(value))

    with open('解密后.html', mode='w', encoding='utf-8') as f:
        f.write(html)
    # Use XPath to pull the decoded information out of the HTML.
    with open('解密后.html', 'r', encoding='utf-8') as f:
        html = etree.HTML(f.read())
    # h2: novel genres
    h2 = html.xpath('//*[@id="book-img-text"]/ul/li/div[2]/p[1]/a[2]/text()')
    # h1: collection counts
    h1 = html.xpath('//*[@id="book-img-text"]/ul/li/div[3]/div/p/span/span/text()')
    # h0: author names
    h0 = html.xpath('//*[@id="book-img-text"]/ul/li/div[2]/p[1]/a[1]/text()')
    h3 = h2
    h4 = h1
    h = h0
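    # The three lists are position-aligned, one entry per ranked novel, so an optional
    # sanity check could zip them into (author, genre, collect count) rows:
    # for author, genre, collects in zip(h, h3, h4):
    #     print(author, genre, collects)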
    # Loop over the genre list and write one element per line.
    for k in h3:
        fp1.write(str(k) + '\n')

    # Likewise for the collection counts.
    for e in h4:
        fp2.write(str(e) + '\n')

    # And for the author names.
    for q in h:
        fp3.write(str(q) + '\n')
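# Close the output files so all buffered rows are flushed to disk.
fp1.close()
fp2.close()
fp3.close()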