# Source path: selenium_Redis_scrapy/字体反爬 (1).py

import re
import requests
from fontTools.ttLib import TTFont
from lxml import  etree

# Scrape the Qidian "vipcollect" ranking page, undo its font-based digit
# obfuscation and append every book's category, favourite count and author
# name to three text files (one value per line).
#
# The page carries numeric character references (``&#NNNNNN;``) that are drawn
# with a downloadable web font whose glyphs are named after the character they
# render ("one", "two", ..., "period"). Reading the font's cmap table yields
# the code point -> character mapping needed to turn them back into digits.

# Glyph names used by the obfuscation font and the character each one draws.
GLYPH_TO_CHAR = {
    'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5',
    'six': '6', 'seven': '7', 'eight': '8', 'nine': '9', 'zero': '0',
    'period': '.',
}

# Captures the .woff URL from the page's font-face style declaration.
# Raw string: the backslashes are regex escapes, not Python string escapes.
FONT_URL_PATTERN = re.compile(
    r"\('eot'\); src: url\('(.*?)'\) format\('woff'\)"
)

# Seconds to wait on the server before failing instead of hanging forever.
REQUEST_TIMEOUT = 10

# The output files are opened in append mode; the `with` block guarantees they
# are flushed and closed even when a request or the parsing fails part-way.
with open("类型.txt", "a", encoding="utf-8") as fp1, \
        open("收藏数.txt", "a", encoding="utf-8") as fp2, \
        open("作者名.txt", "a", encoding="utf-8") as fp3:

    for page in range(1, 2):
        # Pagination is currently disabled: every iteration fetches the same
        # un-paginated URL (the paginated form would be
        # https://www.qidian.com/rank/vipcollect?page=<page>).
        url = 'https://www.qidian.com/rank/vipcollect/'
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        html = response.text

        # Keep the raw (still obfuscated) page on disk for inspection;
        # mode 'w' creates the file or truncates an existing one.
        with open('解密前.html', mode='w', encoding='utf-8') as f:
            f.write(html)

        # Locate the download address of the obfuscation font.
        font_match = FONT_URL_PATTERN.search(html)
        if font_match is None:
            raise RuntimeError(
                f"no woff font URL found in the response from {url}"
            )
        font_url = font_match.group(1)

        # Download the font, naming the local copy after the last segment of
        # its URL (e.g. 'piQOMNSL.woff').
        font_response = requests.get(font_url, timeout=REQUEST_TIMEOUT)
        font_response.raise_for_status()
        font_path = font_url.split('/')[-1]
        with open(font_path, mode='wb') as f:  # binary payload
            f.write(font_response.content)

        # Open the downloaded font, dump it as XML for manual inspection and
        # read its code point -> glyph name table. The context manager closes
        # the underlying font file afterwards.
        with TTFont(font_path) as font:
            font.saveXML('font.xml')
            # getBestCmap() can return None when no Unicode cmap is present.
            cmap = font['cmap'].getBestCmap() or {}

        # Translate glyph names into the characters they draw. Glyphs that
        # are not in the table are skipped rather than raising KeyError; their
        # character references are simply left untouched in the page.
        char_by_codepoint = {
            codepoint: GLYPH_TO_CHAR[glyph_name]
            for codepoint, glyph_name in cmap.items()
            if glyph_name in GLYPH_TO_CHAR
        }

        # Replace every obfuscated character reference with its real character.
        for codepoint, char in char_by_codepoint.items():
            html = html.replace(f'&#{codepoint};', char)

        # Keep the decoded page on disk for inspection as well.
        with open('解密后.html', mode='w', encoding='utf_8') as f:
            f.write(html)

        # Extract the decoded values with XPath, parsing the in-memory text
        # directly instead of re-reading the file that was just written.
        # NOTE: the ['j'] predicate is a non-empty string literal, which XPath
        # evaluates as true, so it selects every <li> under the list.
        tree = etree.HTML(html)
        # Book categories.
        categories = tree.xpath('''//*[@id="book-img-text"]/ul/li['j']/div[2]/p[1]/a[2]/text()''')
        # Favourite (collection) counts.
        favourite_counts = tree.xpath('''//*[@id="book-img-text"]/ul/li['j']/div[3]/div/p/span/span/text()''')
        # Author names.
        authors = tree.xpath('''//*[@id="book-img-text"]/ul/li['j']/div[2]/p[1]/a[1]/text()''')

        # One value per line in each output file.
        fp1.writelines(f"{category}\n" for category in categories)
        fp2.writelines(f"{count}\n" for count in favourite_counts)
        fp3.writelines(f"{author}\n" for author in authors)