"""Fetch movie detail pages, extract a few fields, and export them to .xls."""

import re
import ssl
import urllib.error
import urllib.request
from datetime import datetime

import xlwt
from bs4 import BeautifulSoup

from Proxypool import Get_UA

# Single timestamp layout shared by current_time() and time_diff() so that the
# string one produces can always be parsed by the other.
_TIME_FORMAT = '%a %b %d %H:%M:%S %Y'

# Field-extraction patterns. The runtime pattern text is kept exactly as found.
# NOTE(review): these patterns look as if the HTML tags that originally
# delimited them were stripped (the name/link patterns are bounded only by
# blank lines). As written, _FIND_TYPE and _FIND_DATE end in a lazy group with
# nothing after it, so the group always captures an empty string. Confirm the
# intended delimiters against the real page markup.
_FIND_NAME = re.compile('\n\n' r'.*?《(.*?)》.*?' '\n\n')
_FIND_TYPE = re.compile(r'类型:(.*?)')
_FIND_LINK = re.compile(r'豆瓣链接 (.*?)' '\n\n', re.S)
_FIND_DATE = re.compile(r'发布时间:(.*?)')


def base_url():
    """Return the root URL of the site being scraped."""
    return 'https://www.dyttcn.com'


def current_time():
    """Return the current local time formatted with the shared layout."""
    return datetime.now().strftime(_TIME_FORMAT)


def time_diff(start, end):
    """Return ``end - start`` as a ``timedelta``.

    :param start: timestamp string in the layout produced by current_time()
    :param end: timestamp string in the same layout
    :raises ValueError: if either string does not match the layout
    """
    return (datetime.strptime(end, _TIME_FORMAT)
            - datetime.strptime(start, _TIME_FORMAT))


def visitURL(url: str, proxy_ip):
    """Request ``url`` and return the page source.

    :param url: address to fetch
    :param proxy_ip: proxy address registered for the ``http`` scheme
    :return: the response body decoded as GBK, or ``False`` if the request
        failed with an HTTP or connection-level error
    """
    # Disable SSL certificate verification. This rebinds a module-level hook
    # in ``ssl``, so it affects every later HTTPS connection in the process.
    ssl._create_default_https_context = ssl._create_unverified_context
    head = Get_UA()

    # Install an opener that knows about the proxy.
    # NOTE(review): only the 'http' scheme is mapped, so https URLs (such as
    # base_url()) are presumably not routed through the proxy - confirm.
    proxy_support = urllib.request.ProxyHandler({'http': proxy_ip})
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)

    request = urllib.request.Request(url=url, headers=head, method="GET")
    try:
        # The context manager closes the connection even if decoding fails.
        with urllib.request.urlopen(request) as response:
            return response.read().decode("gbk")
    except urllib.error.URLError as e:
        # URLError is the base class of HTTPError: HTTP failures carry both
        # ``code`` and ``reason``, connection failures only ``reason``.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return False


def _first_match(pattern, text):
    """Return group 1 of the first match of ``pattern`` in ``text``, or ''."""
    found = pattern.search(text)
    return found.group(1) if found else ''


def get_movie_info(data_queue: list, result_queue: list, proxy_ip):
    """Drain ``data_queue`` and append one result row per URL.

    Each row is ``[name, type, url, release date, douban link]``; a field that
    cannot be found is stored as ''. A URL whose download failed still gets a
    row (containing only the URL) so the failure is visible in the output.

    :param data_queue: list of page URLs; consumed from the front
    :param result_queue: list that receives the extracted rows
    :param proxy_ip: proxy address forwarded to visitURL()
    :return: None
    """
    while True:
        try:
            url = data_queue.pop(0)
        except IndexError:
            # Queue exhausted.
            break

        html = visitURL(url, proxy_ip)
        # visitURL() returns False on failure; don't hand that to the parser.
        item = str(BeautifulSoup(html, "html.parser")) if html else ''

        result_queue.append([
            _first_match(_FIND_NAME, item),  # movie name
            _first_match(_FIND_TYPE, item),  # movie type
            url,                             # page address
            # First match only: storing the whole findall() list here would
            # put a list into a spreadsheet cell.
            _first_match(_FIND_DATE, item),  # release date
            _first_match(_FIND_LINK, item),  # douban link
        ])


def save_xls(datalist: list, savepath: str):
    """Write the collected rows to an .xls workbook.

    :param datalist: rows as produced by get_movie_info()
    :param savepath: path of the file to write
    :return: None
    """
    print("开始保存信息。。。")
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
    worksheet = workbook.add_sheet('sheet1', cell_overwrite_ok=True)

    title = ["片名", "类型", "地址", "发布时间", "豆瓣链接"]
    for col, heading in enumerate(title):
        worksheet.write(0, col, heading)

    # Data rows start at row 1, below the header.
    for row_index, row in enumerate(datalist, start=1):
        for col, value in enumerate(row):
            worksheet.write(row_index, col, value)

    workbook.save(savepath)