From 90a7776709c5920cdca949450d2eb0180d0c66f7 Mon Sep 17 00:00:00 2001
From: pc3vqetl4 <493801852@qq.com>
Date: Sat, 27 Apr 2024 22:46:17 +0800
Subject: [PATCH] ADD file via upload

---
 Common.py | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 Common.py

diff --git a/Common.py b/Common.py
new file mode 100644
index 0000000..43de68c
--- /dev/null
+++ b/Common.py
@@ -0,0 +1,109 @@
+from datetime import datetime
+from bs4 import BeautifulSoup
+import urllib.request, urllib.error
+import ssl, xlwt, re
+from Proxypool import Get_UA
+
+def base_url():
+    return 'https://www.dyttcn.com'
+
+def current_time():
+    time = '%a %b %d %H:%M:%S %Y'
+    return datetime.now().strftime(time)
+
+def time_diff(start, end):
+    time = '%a %b %d %H:%M:%S %Y'
+    return datetime.strptime(end, time) - datetime.strptime(start, time)
+
+def visitURL(url: str, proxy_ip):
+    """
+    Request the given URL and return the page source.
+    :param url: page URL to fetch
+    :param proxy_ip: proxy address to route the request through
+    :return: decoded page source on success, False on failure
+    """
+    # Disable SSL certificate verification
+    ssl._create_default_https_context = ssl._create_unverified_context
+
+    head = Get_UA()
+
+    # Install an opener that routes requests through the proxy IP
+    proxy_support = urllib.request.ProxyHandler({'http': proxy_ip})
+    opener = urllib.request.build_opener(proxy_support)
+    urllib.request.install_opener(opener)
+
+    request = urllib.request.Request(url=url, headers=head, method="GET")
+
+    try:
+        response = urllib.request.urlopen(request)
+        # the site serves GBK-encoded pages
+        html = response.read().decode("gbk")
+        return html
+    except urllib.error.URLError as e:
+        # URLError also covers HTTPError; only HTTPError carries .code
+        if hasattr(e, "code"):
+            print(e.code)
+        if hasattr(e, "reason"):
+            print(e.reason)
+        return False
+
+def get_movie_info(data_queue: list, result_queue: list, proxy_ip):
+    """
+    :param data_queue: URLs of detail pages waiting to be scraped
+    :param result_queue: list collecting one record per movie
+    :return: None
+    """
+    # Regex patterns; the HTML tag fragments are approximate and may
+    # need adjusting to the live page markup
+    findname = re.compile(r'《(.*?)》')            # title between 《 》
+    findtype = re.compile(r'类型:(.*?)<')          # genre, up to the next tag
+    findlink = re.compile(r'豆瓣链接(.*?)<', re.S)  # Douban link
+    finddate = re.compile(r'发布时间:(.*?)<')       # release date
+
+    while True:
+        if not data_queue:
+            break
+        data = []
+        url = data_queue.pop(0)
+        html = visitURL(url, proxy_ip)
+        if not html:  # download failed, skip this page
+            continue
+        soup = BeautifulSoup(html, "html.parser")
+        item = str(soup)
+
+        if re.findall(findname, item):  # title
+            data.append(re.findall(findname, item)[0])
+        else:
+            data.append('')
+        if re.findall(findtype, item):  # genre
+            data.append(re.findall(findtype, item)[0])
+        else:
+            data.append('')
+        data.append(url)  # page URL
+        if re.findall(finddate, item):  # release date
+            data.append(re.findall(finddate, item)[0])
+        else:
+            data.append('')
+        if re.findall(findlink, item):  # Douban link
+            data.append(re.findall(findlink, item)[0])
+        else:
+            data.append('')
+        result_queue.append(data)
+
+def save_xls(datalist: list, savepath: str):
+    """
+    :param datalist: scraped records
+    :param savepath: path of the output .xls file
+    :return: None
+    """
+    print("Saving results...")
+    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
+    worksheet = workbook.add_sheet('sheet1', cell_overwrite_ok=True)
+    # column headers: title, genre, URL, release date, Douban link
+    title = ["片名", "类型", "地址", "发布时间", "豆瓣链接"]
+    for i in range(len(title)):
+        worksheet.write(0, i, title[i])
+    for i in range(len(datalist)):
+        for j in range(len(datalist[i])):
+            worksheet.write(i + 1, j, datalist[i][j])
+    workbook.save(savepath)
\ No newline at end of file
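
For reference, a minimal driver for this module might look like the sketch below. It is an illustration only, not part of the patch: the detail-page URL and proxy address are placeholders, and Proxypool.Get_UA is assumed to return a headers dict containing a User-Agent, since Proxypool.py is not included here.

# sketch: hypothetical driver for Common.py (not part of the patch)
# Assumptions: Proxypool.Get_UA() returns a headers dict with a User-Agent;
# the seed URL and proxy address below are placeholders.
from Common import current_time, time_diff, get_movie_info, save_xls

if __name__ == "__main__":
    data_queue = [
        # detail-page URLs would normally be collected from an index page
        'https://www.dyttcn.com/dongzuodianying/12345.html',  # placeholder
    ]
    result_queue = []
    proxy_ip = '127.0.0.1:8080'  # placeholder proxy

    start = current_time()
    get_movie_info(data_queue, result_queue, proxy_ip)
    save_xls(result_queue, 'movies.xls')
    print('Elapsed:', time_diff(start, current_time()))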