You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

54 lines
1.9 KiB

# coding:utf-8
from lxml import etree
import os
import csv
def get_filenames(path):
    """Return the names of regular files in *path* whose name contains "html".

    Only bare file names are returned (they are not joined with *path*), in
    the order produced by ``os.listdir``. Directories are ignored, even when
    their name contains "html".
    """
    return [
        entry
        for entry in os.listdir(path)
        if os.path.isfile(os.path.join(path, entry)) and "html" in entry
    ]
def analysis_html(file_list, base_dir="./file"):
    """Parse saved film pages and export film details and comments as CSV.

    Each name in *file_list* is read from *base_dir* and parsed as HTML.
    Two files are then written into *base_dir*:

    * ``filmInfo.csv`` - one row per film: name, type, release, duration,
      box office.
    * ``box_info.csv`` - one row per comment: film name, comment text.

    Both files are written once, after every page has been processed, and
    both use UTF-8 so the Chinese header row is encoded the same way
    regardless of the platform's default encoding.

    Args:
        file_list: Names (not paths) of the HTML files to read.
        base_dir: Directory holding the HTML files and receiving the CSVs.

    Pages in which no film title can be found are reported on stdout and
    skipped instead of aborting the whole run.
    """
    csv_title = ["电影名", "电影类型", "上映时间", "电影时长", "票房"]
    box_title = ["电影名", "评论"]
    file_info = []
    box_info = []

    for file_name in file_list:
        page_path = os.path.join(base_dir, file_name)
        # errors="ignore" drops undecodable bytes rather than failing the page.
        with open(page_path, "r", encoding="utf-8", errors="ignore") as fp:
            html = fp.read()

        parsed = _parse_film_page(html)
        if parsed is None:
            print("skipped (no movie title found): " + page_path)
            continue

        row, comments = parsed
        file_info.append(row)
        box_info.extend(comments)

    print(box_info)
    _write_csv(os.path.join(base_dir, "filmInfo.csv"), csv_title, file_info)
    _write_csv(os.path.join(base_dir, "box_info.csv"), box_title, box_info)


def _first_text(node, query, default=""):
    """Return the first result of XPath *query* on *node*, or *default*."""
    found = node.xpath(query)
    return found[0] if found else default


def _parse_film_page(html):
    """Extract one film's CSV row and its comment rows from page markup.

    Returns ``(row, comments)`` where *row* is
    ``[title, type, release, duration, box_office]`` and *comments* is a list
    of ``[title, comment_text]`` pairs. Returns ``None`` when the page has no
    film title. Other missing fields become empty strings.
    """
    tree = etree.HTML(html)
    # Guard in case the parser produced no tree at all (e.g. an empty file).
    if tree is None:
        return None

    title = _first_text(tree, "//div[@class='movie-cn-name']/h1/text()", None)
    if title is None:
        return None

    film_type = _first_text(tree, "//div[@class='movie-type']/span/text()")
    show_time = _first_text(tree, "//div[@class='movie-show-time']/span/text()")
    # The box-office figure is spread over several text nodes; concatenate them.
    box_office = "".join(tree.xpath("//div[@class='data-box']/div[3]//text()"))

    # The show-time text is split on "/": the first piece fills the release
    # column and the last piece fills the duration column.
    pieces = show_time.split("/")
    row = [title, film_type, pieces[0], pieces[-1], box_office]

    comments = []
    for article in tree.xpath("//div[@class='comments']/article"):
        text = _first_text(
            article, ".//span[@class='comment-content']/text()", None
        )
        # Comments without any text content are left out.
        if text is not None:
            comments.append([title, text])
    return row, comments


def _write_csv(path, header, rows):
    """Write *header* followed by *rows* to *path* as a UTF-8 CSV file."""
    # newline="" lets the csv module control line endings itself.
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)
if __name__ == "__main__":
    # Script entry point: gather the saved pages under ./file and analyse them.
    analysis_html(get_filenames('./file'))