# coding:utf-8
"""Extract movie details and user comments from saved HTML pages into CSV files."""
from lxml import etree
import os
import csv


def get_filenames(path):
    """Return the names of the regular files in *path* whose name contains "html".

    Only bare file names are returned (they are not joined with *path*), in
    the order produced by ``os.listdir``. Directories are ignored.
    """
    return [
        name
        for name in os.listdir(path)
        if "html" in name and os.path.isfile(os.path.join(path, name))
    ]


def _first(nodes, default=""):
    """Return the first item of an XPath result list, or *default* if it is empty."""
    return nodes[0] if nodes else default


def _write_csv(csv_path, header, rows):
    """Write *header* followed by *rows* to *csv_path* as UTF-8 encoded CSV."""
    # newline='' lets the csv module control line endings itself.
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)


def analysis_html(file_list, base_dir="./file"):
    """Parse saved movie pages and export their details and comments as CSV.

    For every name in *file_list* the file ``base_dir/name`` is read and parsed
    with lxml. Two files are then written into *base_dir*:

    * ``filmInfo.csv`` - one row per page: title, type, the part of the
      show-time text before the first "/", the part after the last "/", and
      the concatenated text of the third ``div`` inside ``data-box``.
    * ``box_info.csv`` - one row per comment: title and comment text.

    Args:
        file_list: File names (relative to *base_dir*) of the HTML pages.
        base_dir: Directory holding the pages and receiving the CSV output.
            Defaults to ``"./file"``, the location that used to be hard-coded.

    A field whose element is absent from a page is written as an empty
    string, and a comment entry without text is skipped, so one incomplete
    page does not abort the whole export.
    """
    csv_title = ["电影名", "电影类型", "上映时间", "电影时长", "票房"]
    box_title = ["电影名", "评论"]
    file_info = []
    box_info = []

    for file_name in file_list:
        page_path = os.path.join(base_dir, file_name)
        # errors='ignore' drops undecodable bytes instead of failing the page.
        with open(page_path, "r", encoding="utf-8", errors="ignore") as fp:
            html = fp.read()

        h = etree.HTML(html)
        title = _first(h.xpath("//div[@class='movie-cn-name']/h1/text()"))
        film_type = _first(h.xpath("//div[@class='movie-type']/span/text()"))
        show_time = _first(h.xpath("//div[@class='movie-show-time']/span/text()"))
        box_office = "".join(h.xpath("//div[@class='data-box']/div[3]//text()"))

        # The show-time text is split on "/": the first piece fills the
        # "上映时间" column and the last piece the "电影时长" column. Without a
        # "/" both columns receive the whole text.
        pieces = show_time.split("/")
        file_info.append([title, film_type, pieces[0], pieces[-1], box_office])

        for article in h.xpath("//div[@class='comments']/article"):
            texts = article.xpath(".//span[@class='comment-content']/text()")
            if texts:
                box_info.append([title, texts[0]])

    print(box_info)

    # Each CSV is written once, after all pages are parsed, rather than being
    # reopened and rewritten in full for every page.
    _write_csv(os.path.join(base_dir, "filmInfo.csv"), csv_title, file_info)
    _write_csv(os.path.join(base_dir, "box_info.csv"), box_title, box_info)


if __name__ == '__main__':
    file_list = get_filenames('./file')
    analysis_html(file_list)