from datetime import datetime
from bs4 import BeautifulSoup
import urllib.request, urllib.error
import ssl, xlwt, re
from Proxypool import Get_UA

# Timestamp format shared by current_time() and time_diff()
TIME_FORMAT = '%a %b %d %H:%M:%S %Y'

def base_url():
    return 'https://www.dyttcn.com'

def current_time():
    return datetime.now().strftime(TIME_FORMAT)

def time_diff(start, end):
    return datetime.strptime(end, TIME_FORMAT) - datetime.strptime(start, TIME_FORMAT)

def visitURL(url: str, proxy_ip):
    """
    Fetch the page source of the given URL through a proxy.
    :param url: the page to request
    :param proxy_ip: proxy address in 'host:port' form
    :return: the page source as a string, or False on HTTP error
    """
    # Disable SSL certificate verification
    ssl._create_default_https_context = ssl._create_unverified_context
    head = Get_UA()
    # Install an opener that routes requests through the proxy IP
    proxy_support = urllib.request.ProxyHandler({'http': proxy_ip})
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)
    request = urllib.request.Request(url=url, headers=head, method="GET")
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("gbk")
        return html
    except urllib.error.HTTPError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return False

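# Usage sketch (illustrative only): the detail-page path and the proxy
# address below are placeholders, not values supplied by this project.
#
#   html = visitURL(base_url() + '/dianying/12345.html', '127.0.0.1:8080')
#   if html:
#       print(html[:200])
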
def get_movie_info(data_queue: list, result_queue: list, proxy_ip):
    """
    Pop detail-page URLs from data_queue, scrape each page, and
    append one record of extracted fields to result_queue.
    :param data_queue: list of detail-page URLs still to be processed
    :param result_queue: list collecting one record per scraped page
    :return: None
    """
    # Regular expressions for the fields of interest
    findname = re.compile(r'<h1>.*?《(.*?)》.*?</h1>')
    findtype = re.compile(r'<span>类型:(.*?)</span>')
    findlink = re.compile(r'豆瓣链接 (.*?)</p>', re.S)
    finddate = re.compile(r'<span class="updatetime">发布时间:(.*?)</span>')
    while True:
        if not data_queue:
            break
        data = []
        url = data_queue.pop(0)
        html = visitURL(url, proxy_ip)
        if not html:
            # Request failed; skip this URL instead of feeding False to BeautifulSoup
            continue
        soup = BeautifulSoup(html, "html.parser")
        item = str(soup)
        if re.findall(findname, item):  # title
            data.append(re.findall(findname, item)[0])
        else:
            data.append('')
        if re.findall(findtype, item):  # genre
            data.append(re.findall(findtype, item)[0])
        else:
            data.append('')
        data.append(url)  # page URL
        if re.findall(finddate, item):  # release date
            # [0] keeps a plain string rather than the whole match list
            data.append(re.findall(finddate, item)[0])
        else:
            data.append('')
        if re.findall(findlink, item):  # Douban link
            data.append(re.findall(findlink, item)[0])
        else:
            data.append('')
        result_queue.append(data)

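# Worker sketch (hedged): get_movie_info is written so several threads can
# drain the same data_queue. A minimal driver under that assumption:
#
#   import threading
#   urls = [...]   # detail-page URLs; the collection step is not in this file
#   results = []
#   threads = [threading.Thread(target=get_movie_info,
#                               args=(urls, results, '127.0.0.1:8080'))
#              for _ in range(4)]
#   for t in threads: t.start()
#   for t in threads: t.join()
#
# Plain lists are not safe under heavy concurrency (the emptiness check and
# pop(0) can race); queue.Queue would be the sturdier choice for many workers.
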
def save_xls(datalist: list, savepath: str):
    """
    Write the scraped records to an .xls workbook.
    :param datalist: scraped records, one list of fields per movie
    :param savepath: path of the .xls file to write
    :return: None
    """
    print("Saving results...")
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
    worksheet = workbook.add_sheet('sheet1', cell_overwrite_ok=True)
    # Column headers: title, genre, URL, release date, Douban link
    title = ["片名", "类型", "地址", "发布时间", "豆瓣链接"]
    for i in range(len(title)):
        worksheet.write(0, i, title[i])
    for i in range(len(datalist)):
        for j in range(len(datalist[i])):
            worksheet.write(i + 1, j, datalist[i][j])
    workbook.save(savepath)

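# Minimal end-to-end sketch. Assumptions are flagged inline: the module that
# collects detail-page URLs is not part of this file, so the list below is a
# placeholder, and the proxy address is likewise illustrative.
if __name__ == "__main__":
    start = current_time()
    urls = []  # placeholder: fill with detail-page URLs from a list-page crawler
    results = []
    get_movie_info(urls, results, '127.0.0.1:8080')  # assumed proxy address
    save_xls(results, 'movies.xls')
    print("Elapsed:", time_diff(start, current_time()))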