You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
110 lines
3.9 KiB
110 lines
3.9 KiB
import time
|
|
from bs4 import BeautifulSoup
|
|
from client.subdomain.oneforall.common.search import Search
|
|
|
|
|
|
class Baidu(Search):
    """Subdomain search module that queries Baidu's web search."""

    def __init__(self, domain):
        """
        Configure the module for one target domain

        :param domain: domain whose subdomains are to be collected
        """
        Search.__init__(self)
        self.module = 'Search'
        self.source = 'BaiduSearch'
        self.init = 'https://www.baidu.com/'
        self.addr = 'https://www.baidu.com/s'
        self.domain = domain
        self.limit_num = 750  # upper bound on the number of results paged through

    def redirect_match(self, domain, html):
        """
        Collect the redirect URLs of a result page and resolve each of them

        Every ``<a class="c-showurl">`` link in the page is handed to
        ``self.match_location``; per the original note that helper follows
        the redirect with a HEAD request (its implementation is not part
        of this file, so treat that as an assumption).

        :param domain: domain name
        :param html: response body
        :return: set of subdomains
        """
        soup = BeautifulSoup(html, 'html.parser')
        collected = set()
        # Walk every redirect URL present in the search results
        for anchor in soup.find_all('a', {'class': 'c-showurl'}):
            collected.update(self.match_location(domain, anchor.get('href')))
        return collected

    def search(self, domain, filtered_subdomain='', full_search=False):
        """
        Send paged search requests and match subdomains in the responses

        Results are merged into ``self.subdomains``; nothing is returned.

        :param str domain: domain name
        :param str filtered_subdomain: filter statement appended to the query
        :param bool full_search: when true, keep paging even if a page
                                 contributes no new subdomains
        """
        self.page_num = 0  # start again from 0 for every search pass
        while True:
            time.sleep(self.delay)
            self.header = self.get_header()
            self.proxy = self.get_proxy(self.source)
            params = dict(wd='site:' + domain + filtered_subdomain,
                          pn=self.page_num,
                          rn=self.per_page_num)
            resp = self.get(self.addr, params)
            if not resp:
                return
            # Baidu does not display long domain names in full, so for those
            # the direct link is taken from the redirect URL (Location
            # response header) instead of the page text.
            matcher = self.redirect_match if len(domain) > 12 else self.match
            subdomains = matcher(domain, resp.text)
            # Stop when the page yielded no subdomains at all
            if not subdomains:
                break
            # Outside full-search mode, a page made up entirely of already
            # known subdomains ends the search
            if not full_search and subdomains.issubset(self.subdomains):
                break
            # Merge by rebinding to a new set (not in place): run() may be
            # iterating over the previous set object while calling search()
            self.subdomains = self.subdomains.union(subdomains)
            self.page_num += self.per_page_num
            next_page_marker = '&pn={next_pn}&'.format(next_pn=self.page_num)
            # Stop when the page has no link to the next page, or when the
            # result limit has been reached
            if (next_page_marker not in resp.text
                    or self.page_num >= self.limit_num):
                break

    def run(self):
        """
        Class execution entry point
        """
        self.begin()
        self.search(self.domain, full_search=True)
        # Exclude subdomains that dominate the results so that new
        # subdomains can surface
        for statement in self.filter(self.domain, self.subdomains):
            self.search(self.domain, filtered_subdomain=statement)

        # Recursively search the next layer of subdomains
        if self.recursive_search:
            base_dots = self.domain.count('.')
            # Starts at 1 because one layer of subdomain search has already
            # been done; the actual recursion depth is layer_num + 1
            for layer_num in range(1, self.recursive_times):
                for subdomain in self.subdomains:
                    # Only subdomains sitting exactly layer_num levels below
                    # the target domain qualify for the next-layer search
                    if subdomain.count('.') - base_dots != layer_num:
                        continue
                    self.search(subdomain)
        self.finish()
        self.save_json()
        self.gen_result()
        self.save_db()
|
|
|
|
|
|
def do(domain):  # uniform entry-point name, convenient for multithreaded callers
    """
    Unified entry point for invoking this module's class

    :param str domain: domain name
    """
    Baidu(domain).run()
|
|
|
|
|
|
# Manual run: execute the Baidu search against a sample domain when this
# file is started as a script.
if __name__ == '__main__':
    do('huayunshuzi.com')
|