You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
MiaCTFer/client-1/subdomain/oneforall/modules/crawl/commoncrawl.py

62 lines
1.6 KiB

import cdx_toolkit
from tqdm import tqdm
from client.subdomain.oneforall.common.crawl import Crawl
from client.subdomain.oneforall.config import logger
class CommonCrawl(Crawl):
    """Subdomain collector that searches Common Crawl captures via cdx_toolkit."""

    def __init__(self, domain):
        """
        :param domain: domain that ``run`` starts crawling from
        """
        super().__init__()
        self.domain = domain
        self.module = 'Crawl'
        self.source = 'CommonCrawl'

    def crawl(self, domain, limit):
        """
        Query the capture index for ``*.{domain}/*`` and collect subdomains.

        Every capture whose status is not '301' or '302' has its text run
        through ``self.match``; the matches are merged into
        ``self.subdomains``.

        :param domain: domain used to build the query pattern
        :param limit: passed as ``limit`` to ``CDXFetcher.iter`` and used as
                      the progress-bar total
        """
        # Stored on the instance; they are not handed to the fetcher here.
        self.header = self.get_header()
        self.proxy = self.get_proxy(self.source)

        fetcher = cdx_toolkit.CDXFetcher()
        pattern = f'*.{domain}/*'
        estimate = fetcher.get_size_estimate(pattern)
        print(pattern, 'CommonCrawl size estimate', estimate)

        captures = fetcher.iter(pattern, limit=limit)
        for capture in tqdm(captures, total=limit):
            # Captures recorded with a 301/302 status are skipped.
            if capture.data.get('status') in ('301', '302'):
                continue
            registered = self.register(domain)
            found = self.match(registered, capture.text)
            # Merge this capture's matches into the accumulated results.
            self.subdomains = self.subdomains.union(found)

    def run(self):
        """
        Entry point that executes the whole collection for ``self.domain``.

        ``begin``, ``finish``, ``save_json``, ``gen_result`` and ``save_db``
        are helpers that are not defined in this class.
        """
        self.begin()
        self.crawl(self.domain, 50)

        # Crawl the subdomains found so far to discover further ones.
        # crawl() rebinds self.subdomains to the result of union(), so the
        # loop keeps walking the collection captured here.
        first_pass = self.subdomains
        for candidate in first_pass:
            if candidate == self.domain:
                continue
            self.crawl(candidate, 10)

        self.finish()
        self.save_json()
        self.gen_result()
        self.save_db()
def do(domain):  # shared entry-point name, convenient for multi-threaded callers
    """
    Unified entry point for invoking this module's class.

    :param str domain: domain name
    """
    CommonCrawl(domain).run()
# Run a sample collection when the file is executed directly as a script.
if __name__ == '__main__':
    do(domain='example.com')