chen/keshe/analysisHtml.py

# coding:utf-8
from lxml import etree
import os
import csv


def get_filenames(path):
    """Return the names of regular files in *path* whose name contains "html".

    Only bare file names are returned (they are not joined with *path*), in
    the order produced by ``os.listdir``. Sub-directories are ignored. The
    substring may appear anywhere in the name, not only as an extension.
    """
    return [
        entry
        for entry in os.listdir(path)
        if os.path.isfile(os.path.join(path, entry)) and "html" in entry
    ]


def _first_text(node, xpath, default=""):
    """Return the first result of *xpath* evaluated on *node*, or *default* if none."""
    found = node.xpath(xpath)
    return found[0] if found else default


def _write_csv(path, header, rows):
    """Write *header* followed by *rows* to *path* as a UTF-8 encoded CSV file."""
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)


def analysis_html(file_list, base_dir="./file"):
    """Extract movie details and comments from saved HTML pages into CSV files.

    Each name in *file_list* is read from *base_dir* and parsed with lxml.
    Two files are then written into *base_dir*:

    * ``filmInfo.csv`` - one row per page: title, type, the first and last
      "/"-separated parts of the ``movie-show-time`` text, and the joined
      text of the third ``div`` inside ``data-box``.
    * ``box_info.csv`` - one row per comment: title and comment text.

    Both files are always written (header only when nothing was extracted)
    and both use UTF-8. A field whose node is missing from a page is stored
    as an empty string instead of raising ``IndexError``.

    Args:
        file_list: File names (relative to *base_dir*) of the pages to parse.
        base_dir: Directory holding the pages and receiving the CSV output.

    Returns:
        None. The collected comment rows are also printed to stdout.
    """
    csv_title = ["电影名", "电影类型", "上映时间", "电影时长", "票房"]
    box_title = ["电影名", "评论"]
    file_info = []
    box_info = []

    for file_name in file_list:
        page_path = os.path.join(base_dir, file_name)
        # errors='ignore' drops undecodable bytes so a badly saved page still parses.
        with open(page_path, "r", encoding="utf-8", errors="ignore") as fp:
            html = fp.read()

        h = etree.HTML(html)
        if h is None:
            # NOTE(review): guards against the parser yielding no root element
            # for content-free input - confirm against the lxml version in use.
            continue

        title = _first_text(h, "//div[@class='movie-cn-name']/h1/text()")
        film_type = _first_text(h, "//div[@class='movie-type']/span/text()")
        film_address_time = _first_text(
            h, "//div[@class='movie-show-time']/span/text()"
        )
        box_office = "".join(h.xpath("//div[@class='data-box']/div[3]//text()"))

        # NOTE(review): the names suggest "region / time" while the CSV header
        # labels these columns release time and duration - confirm the labels.
        parts = film_address_time.split("/")
        film_address = parts[0]
        film_time = parts[-1]
        file_info.append([title, film_type, film_address, film_time, box_office])

        for article in h.xpath("//div[@class='comments']/article"):
            texts = article.xpath(".//span[@class='comment-content']/text()")
            if texts:  # skip comments that carry no text node
                box_info.append([title, texts[0]])

    print(box_info)
    # Write each CSV once after the loop rather than rewriting it per page.
    _write_csv(os.path.join(base_dir, "filmInfo.csv"), csv_title, file_info)
    _write_csv(os.path.join(base_dir, "box_info.csv"), box_title, box_info)

if __name__ == "__main__":
    # Script entry point: parse every saved page found in ./file.
    analysis_html(get_filenames('./file'))