from datetime import datetime

from bs4 import BeautifulSoup

import urllib.request, urllib.error
import ssl, xlwt, re

from Proxypool import Get_UA


def base_url():
    return 'https://www.dyttcn.com'


# Timestamp format shared by current_time() and time_diff().
TIME_FORMAT = '%a %b %d %H:%M:%S %Y'


def current_time():
    return datetime.now().strftime(TIME_FORMAT)


def time_diff(start, end):
    return datetime.strptime(end, TIME_FORMAT) - datetime.strptime(start, TIME_FORMAT)

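# Example usage of the two helpers above: time_diff() returns a
# datetime.timedelta, e.g. "0:00:42" for a 42-second crawl.
#   start = current_time()
#   ... crawl ...
#   print(time_diff(start, current_time()))
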
def visitURL(url: str, proxy_ip):
    """
    Request the given URL and return the page source.

    :param url: page URL to fetch
    :param proxy_ip: proxy address in "host:port" form
    :return: page source on success, False on failure
    """
    # Disable SSL certificate verification.
    ssl._create_default_https_context = ssl._create_unverified_context

    head = Get_UA()

    # Install an opener that routes requests through the proxy IP.
    # The site is served over https, so register the proxy for both schemes.
    proxy_support = urllib.request.ProxyHandler({'http': proxy_ip, 'https': proxy_ip})
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)

    request = urllib.request.Request(url=url, headers=head, method="GET")

    try:
        response = urllib.request.urlopen(request)
        # The site serves GBK-encoded pages.
        html = response.read().decode("gbk")
        return html
    except urllib.error.URLError as e:
        # URLError also covers HTTPError; not every instance carries both attributes.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return False

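# A minimal retry wrapper around visitURL(), on the assumption that a flaky
# proxy fails transiently; the attempt count and backoff are illustrative
# choices, not part of the original scraper.
def fetch_with_retry(url: str, proxy_ip, attempts: int = 3):
    import time
    for i in range(attempts):
        html = visitURL(url, proxy_ip)
        if html:
            return html
        time.sleep(2 ** i)  # exponential backoff: 1s, 2s, 4s, ...
    return False
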
def get_movie_info(data_queue: list, result_queue: list, proxy_ip):
    """
    Pull detail-page URLs from data_queue, scrape each page, and append one
    record per movie to result_queue.

    :param data_queue: list of detail-page URLs to crawl
    :param result_queue: list that collects the scraped records
    :param proxy_ip: proxy address passed through to visitURL()
    :return: None
    """
    # Regular expressions for the fields on a detail page (the Chinese
    # literals match the site's own markup and must stay as-is).
    findname = re.compile(r'<h1>.*?《(.*?)》.*?</h1>')  # title
    findtype = re.compile(r'<span>类型:(.*?)</span>')  # genre
    findlink = re.compile(r'豆瓣链接 (.*?)</p>', re.S)  # Douban link
    finddate = re.compile(r'<span class="updatetime">发布时间:(.*?)</span>')  # publish date

    while True:
        if not data_queue:
            break
        data = []
        url = data_queue.pop(0)
        html = visitURL(url, proxy_ip)
        if not html:
            # The request failed; skip this URL instead of crashing BeautifulSoup.
            continue
        soup = BeautifulSoup(html, "html.parser")
        item = str(soup)  # normalized page source for regex matching

        if re.findall(findname, item):  # title
            data.append(re.findall(findname, item)[0])
        else:
            data.append('')
        if re.findall(findtype, item):  # genre
            data.append(re.findall(findtype, item)[0])
        else:
            data.append('')
        data.append(url)  # page URL
        if re.findall(finddate, item):  # publish date
            data.append(re.findall(finddate, item)[0])
        else:
            data.append('')
        if re.findall(findlink, item):  # Douban link
            data.append(re.findall(findlink, item)[0])
        else:
            data.append('')
        result_queue.append(data)

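# get_movie_info() looks built to run on several threads at once, but plain
# lists are not safe work queues: the "if not data_queue" check followed by
# pop(0) can race. A minimal sketch of a thread-safe driver using queue.Queue;
# the worker count and wiring here are assumptions, not the original design.
import queue
import threading


def crawl_concurrently(urls, proxy_ip, workers: int = 4):
    tasks = queue.Queue()
    for u in urls:
        tasks.put(u)
    results = []  # list.append is atomic under CPython's GIL

    def worker():
        while True:
            try:
                url = tasks.get_nowait()  # atomic take: no check-then-pop race
            except queue.Empty:
                return
            # Reuse the scraper above on a single-element queue.
            get_movie_info([url], results, proxy_ip)

    threads = [threading.Thread(target=worker) for _ in range(workers)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return results
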
def save_xls(datalist: list, savepath: str):
    """
    Write the scraped records to an .xls workbook.

    :param datalist: scraped records, one list of fields per movie
    :param savepath: path of the output file
    :return: None
    """
    print("Saving data...")
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
    worksheet = workbook.add_sheet('sheet1', cell_overwrite_ok=True)
    # Column headers: title, genre, URL, publish date, Douban link.
    title = ["片名", "类型", "地址", "发布时间", "豆瓣链接"]
    for i in range(len(title)):
        worksheet.write(0, i, title[i])
    for i in range(len(datalist)):
        for j in range(len(datalist[i])):
            worksheet.write(i + 1, j, datalist[i][j])
    workbook.save(savepath)
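

# A minimal sketch of a driver tying the pieces together. The detail-page URL,
# proxy address, and output path are placeholders/assumptions; the real entry
# point is not part of this module.
if __name__ == '__main__':
    start = current_time()
    urls = [base_url() + '/dyzz/1.html']  # hypothetical detail-page URL
    results = []
    get_movie_info(urls, results, '127.0.0.1:8080')  # hypothetical proxy
    save_xls(results, 'movies.xls')
    print('Elapsed:', time_diff(start, current_time()))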