diff --git a/pythonProject/__pycache__/dp.cpython-312.pyc b/pythonProject/__pycache__/dp.cpython-312.pyc
new file mode 100644
index 0000000..4e91af8
Binary files /dev/null and b/pythonProject/__pycache__/dp.cpython-312.pyc differ
diff --git a/pythonProject/cf..py b/pythonProject/cf..py
deleted file mode 100644
index 0cff032..0000000
--- a/pythonProject/cf..py
+++ /dev/null
@@ -1,25 +0,0 @@
-import requests
-
-url = 'https://codeforces.com/search?by='
-
-name = input()
-
-params = {
-    'query' : name
-}
-
-headers = {
-    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0'
-}
-
-response = requests.get(url=url, params=params, headers=headers)
-
-page_text = response.text
-
-File = name + '.html'
-
-with open(File, 'w', encoding='utf-8') as fp:
-    fp.write(page_text)
-
-
-
diff --git a/pythonProject/db_log.txt b/pythonProject/db_log.txt
index e69de29..192c3d2 100644
--- a/pythonProject/db_log.txt
+++ b/pythonProject/db_log.txt
@@ -0,0 +1 @@
+2024-05-29 18:05:36,839 ERROR connect Error
diff --git a/pythonProject/dp.py b/pythonProject/dp.py
index 1cce2ac..0787d48 100644
--- a/pythonProject/dp.py
+++ b/pythonProject/dp.py
@@ -50,6 +50,16 @@ class DBHelper:
             return False
         return True
 
+    def fetchCount(self, sql, params=None):
+        """Run a query and return its first row, or False on failure."""
+        if not self.connect():
+            return False
+        # Callers treat a False result from execute() as failure; without
+        # this check fetchone() would run after a statement that failed.
+        if self.execute(sql, params) is False:
+            return False
+        return self.cur.fetchone()
+
     def close(self):
         self.cur.close()
         self.conn.close()
@@ -57,7 +67,6 @@ class DBHelper:
 
 if __name__ == '__main__':
     db = DBHelper()
     sql = 'create table films(title varchar(50), actor varchar(200), time varchar(100));'
-    db.execute(sql)
+    db.connect()
+    logger.removeHandler(file_handler)
-    db.close()
-    logger.removeHandler(file_handler)
\ No newline at end of file
diff --git a/pythonProject/films.py b/pythonProject/films.py
new file mode 100644
index 0000000..177e8b6
--- /dev/null
+++ b/pythonProject/films.py
@@ -0,0 +1,135 @@
+import requests
+import dp
+from lxml import etree
+# Process pool and manager
+from multiprocessing import Pool, Manager
+import functools
+import matplotlib.pyplot as plt
+import logging
+
+plt.rcParams['font.sans-serif'] = ['SimHei'] # default font
+plt.rcParams['axes.unicode_minus'] = False # keep the minus sign '-' from showing as a box in saved figures
+
+# Get the logger instance
+logger = logging.getLogger("films_log.txt")
+
+# Set the logger output format
+formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
+
+# File log handler
+file_handler = logging.FileHandler('films_log.txt')
+file_handler.setFormatter(formatter)
+
+logger.setLevel(logging.INFO)
+logger.addHandler(file_handler)
+
+def getPage(url):
+    """Return the body of url, or None when the request fails."""
+    headers = {
+        'User-Agent':'Mozilla/5.0(Macintosh;Intel Mac OS X 10_11_4)AppleWebKit/537.36(KHTML,like Gecko)Chrome/52.0.2743.116 Safari/537.36'
+    }
+    try:
+        # Without a timeout a stalled server blocks the worker forever.
+        response = requests.get(url, headers=headers, timeout=10)
+    except requests.RequestException as exc:
+        logger.error('request to %s failed: %s', url, exc)
+        return None
+    if response.status_code == 200:
+        return response.text
+    return None
+
+def insertInfo(item):
+    """Insert one parsed film into the films table and log a failure."""
+    db = dp.DBHelper()
+    sql = 'insert into films(title, actor, time) values (%s, %s, %s);'
+    # parse() builds its dicts with the keys 'title', 'actor' and 'time'.
+    params = (item['title'], item['actor'], item['time'])
+    ok = db.execute(sql, params)
+
+    if ok is False:
+        logger.error(str(sql))
+        # Logging needs a placeholder for each extra argument.
+        logger.error('params: %s', params)
+        logger.error('插入失败')
+        print('插入失败')
+
+def parse(html):
+    """Return a list of {'title', 'actor', 'time'} dicts parsed from html."""
+    tree = etree.HTML(html)
+    titles = tree.xpath('/html/body/div[4]/div/div/div[1]/dl/dd/div/div/div[1]/p[1]/a/text()')
+    actors = tree.xpath('/html/body/div[4]/div/div/div[1]/dl/dd/div/div/div[1]/p[2]/text()')
+    times = tree.xpath('/html/body/div[4]/div/div/div[1]/dl/dd/div/div/div[1]/p[3]/text()')
+
+    # zip stops at the shortest list, so a missing field cannot raise
+    # IndexError the way indexing by len(titles) could.
+    items = [
+        {
+            'title': title.strip(),
+            # Slices drop a fixed-width prefix (presumably the field label).
+            'actor': actor.strip()[3:],
+            'time': release.strip()[5:],
+        }
+        for title, actor, release in zip(titles, actors, times)
+    ]
+
+    print(items)
+    return items
+
+def analysisCounry():
+    """Show a pie chart of the stored films grouped by country."""
+    # Query the per-country film counts from the database table
+    db = dp.DBHelper()
+
+    total = db.fetchCount("select count(*) from films;")
+    Am = db.fetchCount('select count(*) from films where time like "%美国%";')
+    Ch = db.fetchCount('select count(*) from films where time like "%中国%";')
+    Jp = db.fetchCount('select count(*) from films where time like "%日本%";')
+    # fetchCount returns False when the database cannot be reached;
+    # indexing that result would raise TypeError.
+    if not (total and Am and Ch and Jp):
+        logger.error('count query failed, chart skipped')
+        return
+    if total[0] == 0:
+        logger.warning('films table is empty, chart skipped')
+        return
+    # A film matching several patterns can be counted more than once, so
+    # clamp at zero; plt.pie rejects negative wedge sizes.
+    Other = max(total[0] - Am[0] - Ch[0] - Jp[0], 0)
+    sizes = Am[0], Ch[0], Jp[0], Other
+    labels = 'America', 'China', 'Japan', 'Others'
+    colors = 'cyan', 'red', 'yellow', 'lightgreen'
+    explode = 0, 0, 0, 0
+
+    # Draw the pie chart
+    plt.title("热门电影分类信息图")
+    plt.pie(sizes, explode=explode, labels=labels, colors=colors,
+            autopct='%d%%', shadow=True)
+
+    plt.show()
+
+def CrawlMovieInfo(lock, offset):
+    """Scrape one board page and store its films, one insert at a time."""
+    # Scrape film title, leading actors and release time
+    url = 'https://www.maoyan.com/board/4?offset=' + str(offset)
+    # Fetch the current page
+    html = getPage(url)
+    if html is None:
+        # getPage signals failure with None, which parse() cannot handle.
+        logger.error('failed to fetch %s', url)
+        return
+
+    for item in parse(html):
+        # The with block releases the lock even when insertInfo raises.
+        with lock:
+            insertInfo(item)
+
+if __name__ == '__main__':
+    lock = Manager().Lock()
+    partial_CrawlMovieInfo = functools.partial(CrawlMovieInfo, lock)
+    pool = Pool() # create a process pool
+    pool.map(partial_CrawlMovieInfo, [i * 10 for i in range(10)]) # process in parallel
+    pool.close()
+    pool.join()
+    # Chart first so that its log messages still reach the log file.
+    analysisCounry()
+    logger.removeHandler(file_handler)
diff --git a/pythonProject/test.py b/pythonProject/test.py
deleted file mode 100644
index 5376f0b..0000000
--- a/pythonProject/test.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import requests
-from lxml import etree
-
-url = 'https://www.maoyan.com/board/4?offset=0'
-
-
-
-headers = {
-    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0'
-}
-
-response = requests.get(url=url, headers=headers)
-
-tree = etree.HTML(response.text)
-
-titles = tree.xpath('/html/body/div[4]/div/div/div[1]/dl/dd/div/div/div[1]/p[1]/a/text()')
-actors = tree.xpath('/html/body/div[4]/div/div/div[1]/dl/dd/div/div/div[1]/p[2]/text()')
-times = tree.xpath('/html/body/div[4]/div/div/div[1]/dl/dd/div/div/div[1]/p[3]/text()')
-
-items = []
-
-for i in range(len(titles)):
-    title = titles[i].strip()
-    actors = actors[i].strip()
-    times = times[i].strip()
-
-    items.append({
-        'title': titles,
-        'actor': actors[3:],
-        'time': times[5:]
-    })
-
-for i in items:
-    print(i)
\ No newline at end of file