You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.
"""
检查 robots.txt 收集子域名
"""
from client . subdomain . oneforall . common . module import Module
from client . subdomain . oneforall . common import utils
class CheckRobots(Module):
    """
    Collect subdomains of a target domain from its robots.txt file.

    Each candidate robots.txt URL (http/https, with and without a ``www.``
    prefix) is requested, and the response text is scanned with
    ``utils.match_subdomain``.
    """

    def __init__(self, domain: str) -> None:
        """
        Initialise the CheckRobots module.

        :param domain: target domain, e.g. ``'qq.com'``
        """
        super().__init__()  # run the parent Module initialiser
        # register() is provided by Module; whatever it returns is what the
        # rest of this class uses as the domain.
        self.domain = self.register(domain)
        self.module = 'Check'  # module name
        self.source = 'Robots'  # data-source identifier

    def check(self) -> None:
        """
        Request every candidate robots.txt URL and collect matched subdomains.

        Matches from all URLs that answer with a non-empty body are merged
        into a single set, which is stored on ``self.subdomains``.
        """
        # Candidate robots.txt locations, tried in this order.
        urls = [
            f'http://{self.domain}/robots.txt',
            f'https://{self.domain}/robots.txt',
            f'http://www.{self.domain}/robots.txt',
            f'https://www.{self.domain}/robots.txt',
        ]
        # NOTE(review): assumes utils.match_subdomain returns an iterable of
        # subdomain names and that a set is an acceptable type for
        # self.subdomains -- confirm against Module.
        found = set()
        for url in urls:
            # Header and proxy are fetched again for every request.
            self.header = self.get_header()
            self.proxy = self.get_proxy(self.source)
            # Redirects are deliberately not followed.
            response = self.get(url, check=False, allow_redirects=False)
            if not response:
                # A failure on one scheme/host variant says nothing about the
                # others, so try the next URL instead of aborting the check.
                continue
            if response.content:
                # Merge rather than overwrite, so a later URL with fewer (or
                # no) matches cannot discard what an earlier URL found.
                found.update(
                    utils.match_subdomain(self.domain, response.text))
                self.subdomains = found

    def run(self) -> None:
        """
        Entry point: run the check, then persist the results.
        """
        self.begin()  # start of execution
        self.check()  # perform the robots.txt check
        self.finish()  # end of execution
        # Save the results to file or database (the helpers themselves are
        # defined on Module).
        self.save_json()
        self.gen_result()
        self.save_db()
def do(domain):  # single entry point, convenient for multi-threaded callers
    """
    Unified call entry for the CheckRobots class.

    Builds a CheckRobots instance for the given domain and runs it.

    :param domain: domain name to check
    """
    CheckRobots(domain).run()
if __name__ == '__main__':
    # Example: run the robots.txt check against 'qq.com'.
    do('qq.com')