"""Scrape a box-office table from boxofficecn.com into a text file.

The script downloads one page, walks every ``<tr>`` found under a
``<tbody>``, reads the text of the cells whose class is ``column-1`` to
``column-4`` and writes one ``---``-separated line per row to
``./movie.text``.

Recovered from the git patch "ADD file via upload" (commit 3729219,
2023-03-02), which added this script as ``main.py``.
"""

import requests
from lxml import etree

URL = "http://www.boxofficecn.com/the-red-box-office"
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/86.0.4240.111 Safari/537.36"
    ),
}
# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 30
OUTPUT_PATH = "./movie.text"
FIELD_SEPARATOR = "---"

# XPath candidates for each output field, tried in order.  The fields are
# labelled year, name, cast and num after the variables that hold them.
# Only the last column falls back to a nested <font> element.
COLUMN_PATHS = (
    ("./td[@class='column-1']/text()",),
    ("./td[@class='column-2']/text()",),
    ("./td[@class='column-3']/text()",),
    (
        "./td[@class='column-4']/text()",
        "./td[@class='column-4']/font/text()",
    ),
)


def _first_text(row, paths):
    """Return the first text node matched by any XPath in *paths*.

    Returns an empty string when none of the expressions match, so a blank
    cell no longer aborts the whole scrape with ``IndexError``.
    """
    for path in paths:
        matches = row.xpath(path)
        if matches:
            return matches[0]
    return ""


def fetch_rows(url=URL, timeout=REQUEST_TIMEOUT):
    """Download *url* and return its table rows as lists of four strings.

    Rows in which none of the four cells yields any text (for example rows
    without matching ``<td>`` cells) are skipped.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status.
    """
    response = requests.get(url=url, headers=HEADERS, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were data.
    response.raise_for_status()
    response.encoding = "utf-8"

    document = etree.HTML(response.text)
    if document is None:
        # Nothing parseable came back; treat it as an empty table.
        return []

    rows = []
    for tr in document.xpath("//tbody/tr"):
        fields = [_first_text(tr, paths) for paths in COLUMN_PATHS]
        if any(fields):
            rows.append(fields)
    return rows


def format_rows(rows):
    """Join the fields of each row with ``---`` and return the lines."""
    return [FIELD_SEPARATOR.join(fields) for fields in rows]


def save_lines(lines, path=OUTPUT_PATH):
    """Write *lines* to *path*, one per line, encoded as UTF-8.

    The encoding is explicit because the scraped text is non-ASCII and the
    platform default encoding is not UTF-8 everywhere.
    """
    with open(path, "w", encoding="utf-8") as out:
        out.writelines(f"{line}\n" for line in lines)


def main():
    """Scrape the table and write it to ``./movie.text``."""
    save_lines(format_rows(fetch_rows()))


if __name__ == "__main__":
    main()