You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
54 lines
1.9 KiB
54 lines
1.9 KiB
# coding:utf-8
|
|
from lxml import etree
|
|
import os
|
|
import csv
|
|
|
|
|
|
def get_filenames(path):
    """Return the names of regular files in *path* whose name contains "html".

    Args:
        path: Directory to scan. Only its direct entries are examined.

    Returns:
        A sorted list of bare file names (no directory prefix). Entries that
        are not regular files are skipped even if their name contains "html".

    Raises:
        OSError: Propagated from ``os.listdir`` when *path* cannot be listed.
    """
    # os.listdir() yields entries in arbitrary order; sorting keeps the result
    # (and anything built from it) reproducible from run to run.
    return sorted(
        name
        for name in os.listdir(path)
        if "html" in name and os.path.isfile(os.path.join(path, name))
    )
|
|
|
|
|
|
def analysis_html(file_list):
|
|
csv_title=["电影名","电影类型","上映时间","电影时长","票房"]
|
|
file_info=[]
|
|
box_title = ["电影名", "评论"]
|
|
box_info = []
|
|
for file_name in file_list:
|
|
# print(file_name)
|
|
fp=open("./file/"+file_name,"r",encoding="utf-8", errors='ignore')
|
|
html=fp.read()
|
|
fp.close()
|
|
# print(html)
|
|
h = etree.HTML(html)
|
|
title = h.xpath("//div[@class='movie-cn-name']/h1/text()")[0]
|
|
film_type = h.xpath("//div[@class='movie-type']/span/text()")[0]
|
|
film_address_time = h.xpath("//div[@class='movie-show-time']/span/text()")[0]
|
|
num=h.xpath("//div[@class='data-box']/div[3]//text()")
|
|
box_office=''.join(num)
|
|
film_address=film_address_time.split("/")[0]
|
|
film_time=film_address_time.split("/")[-1]
|
|
file_info.append([title,film_type,film_address,film_time,box_office])
|
|
with open('./file/filmInfo.csv', 'w', newline='')as f:
|
|
writer = csv.writer(f)
|
|
writer.writerow(csv_title)
|
|
writer.writerows(file_info)
|
|
for i in h.xpath("//div[@class='comments']/article"):
|
|
evaluate=i.xpath(".//span[@class='comment-content']/text()")[0]
|
|
# print([title,evaluate])
|
|
box_info.append([title,evaluate])
|
|
print(box_info)
|
|
with open('./file/box_info.csv', 'w', newline='', encoding='utf-8')as f:
|
|
writer = csv.writer(f)
|
|
writer.writerow(box_title)
|
|
writer.writerows(box_info)
|
|
|
|
if __name__ == '__main__':
    # Script entry point: gather the HTML file names found in ./file and pass
    # them straight to the extractor.
    analysis_html(get_filenames('./file'))
|