import functools
import logging
from multiprocessing import Pool, Manager

import requests
from lxml import etree
import matplotlib.pyplot as plt

import dp

# Use a CJK-capable font so the Chinese chart title renders correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Keep the minus sign from rendering as a box when saving figures.
plt.rcParams['axes.unicode_minus'] = False

# File-backed logger; the logger name mirrors the log file by convention.
logger = logging.getLogger("films_log.txt")
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
file_handler = logging.FileHandler('films_log.txt')
file_handler.setFormatter(formatter)
logger.setLevel(logging.INFO)
logger.addHandler(file_handler)


def getPage(url):
    """Fetch *url* and return the response body, or None on a non-200 status."""
    headers = {
        'User-Agent': 'Mozilla/5.0(Macintosh;Intel Mac OS X 10_11_4)AppleWebKit/537.36(KHTML,like Gecko)Chrome/52.0.2743.116 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    return None


def insertInfo(item):
    """Insert one parsed film record (title/actor/time) into the films table.

    Logs the failing statement and parameters when the insert does not succeed.
    """
    db = dp.DBHelper()
    # BUG FIX: parse() emits the key 'title'; the original read item['item']
    # and raised KeyError on every record.
    _title = item['title']
    _actor = item['actor']
    _time = item['time']
    # Parameterized statement — values are bound by the driver, not interpolated.
    sql = 'insert into films(title, actor, time) values (%s, %s, %s);'
    params = (_title, _actor, _time)
    ok = db.execute(sql, params)
    if not ok:
        logger.error(str(sql))
        # BUG FIX: the original passed an extra positional argument with no
        # %s placeholder, so the params never appeared in the log line.
        logger.error('params: %s', params)
        logger.error('插入失败')
        print('插入失败')


def parse(html):
    """Extract film dicts (title / actor / time) from a Maoyan board page."""
    items = []
    tree = etree.HTML(html)
    titles = tree.xpath('/html/body/div[4]/div/div/div[1]/dl/dd/div/div/div[1]/p[1]/a/text()')
    actors = tree.xpath('/html/body/div[4]/div/div/div[1]/dl/dd/div/div/div[1]/p[2]/text()')
    times = tree.xpath('/html/body/div[4]/div/div/div[1]/dl/dd/div/div/div[1]/p[3]/text()')
    # zip truncates to the shortest list, guarding against ragged XPath results
    # (the original indexed all three by range(len(titles)) and could raise
    # IndexError when the page layout varies).
    for raw_title, raw_actor, raw_time in zip(titles, actors, times):
        items.append({
            'title': raw_title.strip(),
            # Drop the leading "主演:" label (3 characters).
            'actor': raw_actor.strip()[3:],
            # Drop the leading "上映时间:" label (5 characters).
            'time': raw_time.strip()[5:]
        })
    print(items)
    return items


def analysisCounry():
    """Plot a pie chart of film counts per country queried from the DB."""
    db = dp.DBHelper()
    total = db.fetchCount("select count(*) from films;")
    Am = db.fetchCount('select count(*) from films where time like "%美国%";')
    Ch = db.fetchCount('select count(*) from films where time like "%中国%";')
    Jp = db.fetchCount('select count(*) from films where time like "%日本%";')
    # NOTE(review): fetchCount appears to return a row tuple — first element is
    # the count; confirm against dp.DBHelper.
    Other = total[0] - Am[0] - Ch[0] - Jp[0]
    sizes = Am[0], Ch[0], Jp[0], Other
    labels = 'America', 'China', 'Japan', 'Others'
    colors = 'cyan', 'red', 'yellow', 'lightgreen'
    explode = 0, 0, 0, 0
    plt.title("热门电影分类信息图")
    plt.pie(sizes, explode=explode, labels=labels, colors=colors,
            autopct='%d%%', shadow=True)
    plt.show()


def CrawlMovieInfo(lock, offset):
    """Crawl one board page (title, actors, release time) and store its rows.

    Intended for Pool.map with *lock* pre-bound via functools.partial; the
    shared manager lock serializes DB writes across worker processes.
    """
    url = 'https://www.maoyan.com/board/4?offset=' + str(offset)
    html = getPage(url)
    # ROBUSTNESS FIX: getPage returns None on a non-200 response; the original
    # fed that None straight into etree.HTML and crashed.
    if html is None:
        logger.error('fetch failed: %s', url)
        return
    for item in parse(html):
        lock.acquire()
        try:
            insertInfo(item)
        finally:
            # ROBUSTNESS FIX: always release, even if the insert raises,
            # so sibling workers are not deadlocked.
            lock.release()


if __name__ == '__main__':
    # A Manager lock is picklable, so it can be shared with pool workers.
    lock = Manager().Lock()
    partial_CrawlMovieInfo = functools.partial(CrawlMovieInfo, lock)
    pool = Pool()
    # Top-100 board is paginated 10 per page via the offset query parameter.
    pool.map(partial_CrawlMovieInfo, [i * 10 for i in range(10)])
    pool.close()
    pool.join()
    logger.removeHandler(file_handler)
    analysisCounry()