from datetime import datetime
from bs4 import BeautifulSoup
import urllib.request, urllib.error
import ssl, xlwt, re
from Proxypool import Get_UA
def base_url():
    """Return the base URL of the target site as a string."""
    root = 'https://www.dyttcn.com'
    return root
def current_time():
    """Return the current local time as text, e.g. ``Mon Jan 01 12:00:00 2024``."""
    stamp_format = '%a %b %d %H:%M:%S %Y'
    now = datetime.now()
    return now.strftime(stamp_format)
def time_diff(start, end):
    """
    Return the elapsed time between two timestamp strings.

    :param start: earlier timestamp, formatted as ``%a %b %d %H:%M:%S %Y``
    :param end: later timestamp, in the same format
    :return: ``end - start`` as a ``datetime.timedelta``
    """
    stamp_format = '%a %b %d %H:%M:%S %Y'
    # Parse ``end`` first, then ``start``, before subtracting.
    finished = datetime.strptime(end, stamp_format)
    began = datetime.strptime(start, stamp_format)
    return finished - began
def visitURL(url: str, proxy_ip):
    """
    Request the given URL and return the page source.

    :param url: address of the page to download
    :param proxy_ip: proxy address registered for the ``http`` scheme
    :return: the response body decoded as GBK (undecodable bytes are
        replaced with U+FFFD), or ``False`` when the request fails
    """
    # Disable SSL certificate verification. This replaces the process-wide
    # default HTTPS context used by urllib.
    ssl._create_default_https_context = ssl._create_unverified_context
    head = Get_UA()
    # Install an opener that uses the proxy IP.
    # NOTE(review): only the 'http' scheme is mapped here, so https:// URLs
    # are not routed through proxy_ip -- confirm this is intended.
    proxy_support = urllib.request.ProxyHandler({'http': proxy_ip})
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)
    request = urllib.request.Request(url=url, headers=head, method="GET")
    try:
        # The context manager closes the response even if read() raises.
        with urllib.request.urlopen(request) as response:
            raw = response.read()
    except urllib.error.URLError as e:
        # URLError is the base class of HTTPError, so connection-level
        # failures are reported too; only HTTPError carries a ``code``.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return False
    # A single malformed byte should not abort the whole download.
    return raw.decode("gbk", errors="replace")
def _first_capture(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or ''."""
    match = pattern.search(text)
    return match.group(1) if match else ''


def get_movie_info(data_queue: list, result_queue: list, proxy_ip):
    """
    Drain ``data_queue`` and append one result row per URL to ``result_queue``.

    Each row is ``[name, type, url, release date, Douban link]``. A field is
    the empty string when its pattern does not match, or when the page could
    not be downloaded.

    :param data_queue: list of page URLs; items are removed from the front
        as they are processed
    :param result_queue: list that receives one row per processed URL
    :param proxy_ip: proxy address passed through to ``visitURL``
    :return: None
    """
    # Regular expressions for the scraped fields.
    # NOTE(review): the name pattern appears in the file as one literal
    # spread over three physical lines; it is written here as the equivalent
    # single-line pattern. Several patterns also end in a lazy group with
    # nothing after it, which can only capture an empty string. Both suggest
    # the surrounding delimiters were lost -- confirm against the page HTML.
    findname = re.compile(r'\n.*?《(.*?)》.*?\n')
    findtype = re.compile(r'类型:(.*?)')
    findlink = re.compile(r'豆瓣链接 (.*?)', re.S)
    finddate = re.compile(r'发布时间:(.*?)')
    while data_queue:
        url = data_queue.pop(0)
        html = visitURL(url, proxy_ip)
        # visitURL returns False on a failed request; treat that as an empty
        # page so the row still records the URL with blank fields.
        item = str(BeautifulSoup(html, "html.parser")) if html else ''
        result_queue.append([
            _first_capture(findname, item),  # movie name
            _first_capture(findtype, item),  # movie type
            url,                             # page address
            # Release date: first match only, like every other field, so
            # the cell is a string rather than a list.
            _first_capture(finddate, item),
            _first_capture(findlink, item),  # Douban link
        ])
def save_xls(datalist: list, savepath: str):
    """
    Write the collected rows to a workbook and save it.

    :param datalist: rows to write; each row is a list of cell values
    :param savepath: path the workbook is saved to
    :return: None
    """
    print("开始保存信息。。。")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('sheet1', cell_overwrite_ok=True)
    headers = ["片名", "类型", "地址", "发布时间", "豆瓣链接"]
    # Row 0 holds the column headers.
    for col, header in enumerate(headers):
        sheet.write(0, col, header)
    # Data rows follow, starting at row 1.
    for row_no, row in enumerate(datalist, start=1):
        for col, value in enumerate(row):
            sheet.write(row_no, col, value)
    book.save(savepath)