parent
372921967f
commit
b108002e20
@ -1,37 +0,0 @@
|
|||||||
import requests
|
|
||||||
from lxml import etree
|
|
||||||
|
|
||||||
def _cell_text(row, column):
    """Return the first text node of the ``column-<column>`` cell in *row*.

    Some cells wrap their value in a ``<font>`` element instead of holding
    it as direct text, so that location is tried as a fallback. An empty
    string is returned when neither location yields any text, rather than
    raising ``IndexError`` on an empty XPath result.
    """
    cell = f"./td[@class='column-{column}']"
    for path in (f"{cell}/text()", f"{cell}/font/text()"):
        found = row.xpath(path)
        if found:
            return found[0]
    return ""


data_list = []

# Scrape the data.
url = "http://www.boxofficecn.com/the-red-box-office"
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
}
# Fetch the movie ranking page with a requests GET. The timeout keeps the
# script from hanging forever on an unresponsive server, and the status
# check stops an HTTP error page from being parsed as if it were data.
r = requests.get(url=url, headers=headers, timeout=10)
r.raise_for_status()
r.encoding = 'utf-8'
html = r.text
doc = etree.HTML(html)
trs = doc.xpath("//tbody/tr")
for tr in trs:
    year, name, cast, num = (_cell_text(tr, column) for column in range(1, 5))
    # A row with none of the four expected cells carries no record; skip it.
    if not (year or name or cast or num):
        continue
    data_list.append([year, name, cast, num])

# Process the data: join the four fields of each row with "---".
data_list_new = [
    f"{data[0]}---{data[1]}---{data[2]}---{data[3]}" for data in data_list
]

# Write the data to movie.text, one record per line. The encoding is set
# explicitly so the Chinese text is written correctly regardless of the
# platform's default locale encoding.
with open("./movie.text", "w", encoding="utf-8") as f:
    f.writelines(data + "\n" for data in data_list_new)
|
|
Loading…
Reference in new issue