"""Scrape the Qidian VIP-collection ranking page.

The page shows its numbers through a custom web font: the HTML contains
numeric character references (``&#NNNNN;``) whose glyphs in the downloaded
font are named ``one``, ``two``, ... This script downloads that font, maps
every obfuscated code point back to a plain digit, and then appends each
book's category, collection count and author name to three text files.

Files written in the current directory:
    解密前.html   raw page as downloaded (snapshot for inspection)
    <font>.woff   the font referenced by the page
    font.xml      XML dump of that font (for inspection)
    解密后.html   page with the obfuscated digits replaced
    类型.txt / 收藏数.txt / 作者名.txt   appended results, one value per line
"""
import re

import requests
from fontTools.ttLib import TTFont
from lxml import etree

RANK_URL = 'https://www.qidian.com/rank/vipcollect/'

# Only page 1 is fetched.  The range is kept so pagination can be re-enabled
# later by putting the page number into the URL (e.g. ``?page={page}``).
PAGES = range(1, 2)

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# Captures the WOFF download link from the page's @font-face rule.
FONT_URL_PATTERN = re.compile(
    r"\('eot'\); src: url\('(.*?)'\) format\('woff'\)"
)

# Glyph names used by the font -> the character each glyph actually draws.
GLYPH_NAME_TO_CHAR = {
    'one': '1',
    'two': '2',
    'three': '3',
    'four': '4',
    'five': '5',
    'six': '6',
    'seven': '7',
    'eight': '8',
    'nine': '9',
    'zero': '0',
    'period': '.',
}

# NOTE: the ``['j']`` predicate is a non-empty string literal, which XPath
# evaluates as true, so these expressions select every <li> in the list.
CATEGORY_XPATH = '''//*[@id="book-img-text"]/ul/li['j']/div[2]/p[1]/a[2]/text()'''
COLLECT_COUNT_XPATH = '''//*[@id="book-img-text"]/ul/li['j']/div[3]/div/p/span/span/text()'''
AUTHOR_XPATH = '''//*[@id="book-img-text"]/ul/li['j']/div[2]/p[1]/a[1]/text()'''


def _fetch(url):
    """Return the HTTP response for *url*, raising on timeouts and HTTP errors."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response


def _download_font(html):
    """Download the WOFF font referenced by *html* and return its local path.

    The file is saved in the current directory under the last path segment
    of the font URL.

    Raises:
        RuntimeError: if the page contains no matching @font-face rule.
    """
    match = FONT_URL_PATTERN.search(html)
    if match is None:
        raise RuntimeError('font URL not found in the downloaded page')
    font_url = match.group(1)
    font_path = font_url.split('/')[-1]
    with open(font_path, mode='wb') as font_file:
        font_file.write(_fetch(font_url).content)
    return font_path


def _load_digit_map(font_path):
    """Return ``{code point: character}`` for the glyphs of the given font.

    Also dumps the font to ``font.xml``.  Glyphs whose name is not listed in
    ``GLYPH_NAME_TO_CHAR`` are skipped rather than aborting the run.
    """
    font = TTFont(font_path)
    try:
        font.saveXML('font.xml')
        # getBestCmap() may return None when the font has no Unicode cmap.
        cmap = font['cmap'].getBestCmap() or {}
    finally:
        font.close()
    return {
        code_point: GLYPH_NAME_TO_CHAR[glyph_name]
        for code_point, glyph_name in cmap.items()
        if glyph_name in GLYPH_NAME_TO_CHAR
    }


def _decode_html(html, digit_map):
    """Replace every ``&#<code point>;`` reference in *html* with its character."""
    for code_point, char in digit_map.items():
        html = html.replace(f'&#{code_point};', char)
    return html


def _append_lines(handle, values):
    """Write each value to *handle* on its own line."""
    handle.writelines(f'{value}\n' for value in values)


def main():
    """Fetch the ranking page(s), decode the numbers and append the results."""
    with open("类型.txt", "a", encoding="utf-8") as category_file, \
            open("收藏数.txt", "a", encoding="utf-8") as collect_file, \
            open("作者名.txt", "a", encoding="utf-8") as author_file:
        for _page in PAGES:
            html = _fetch(RANK_URL).text

            # Keep a copy of the page exactly as it was downloaded.
            with open('解密前.html', mode='w', encoding='utf-8') as raw_file:
                raw_file.write(html)

            digit_map = _load_digit_map(_download_font(html))
            decoded = _decode_html(html, digit_map)

            # Keep a copy of the page after the digits have been restored.
            with open('解密后.html', mode='w', encoding='utf_8') as decoded_file:
                decoded_file.write(decoded)

            tree = etree.HTML(decoded)
            _append_lines(category_file, tree.xpath(CATEGORY_XPATH))
            _append_lines(collect_file, tree.xpath(COLLECT_COUNT_XPATH))
            _append_lines(author_file, tree.xpath(AUTHOR_XPATH))


if __name__ == "__main__":
    main()