parent
b73d6a0fba
commit
0c6941f5b4
Binary file not shown.
@ -1,25 +0,0 @@
|
|||||||
import requests
|
|
||||||
|
|
||||||
# Download the Codeforces search result page for a query read from stdin and
# save it as "<query>.html" in the current directory.

# Search endpoint; the query itself is sent as the `query` URL parameter.
url = 'https://codeforces.com/search?by='

name = input()

params = {
    'query': name,
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0',
}

# requests never times out on its own; without an explicit timeout a stalled
# connection would block this script forever.
response = requests.get(url=url, params=params, headers=headers, timeout=10)

page_text = response.text

# The output file is named after the raw query string.
File = name + '.html'

with open(File, 'w', encoding='utf-8') as fp:
    fp.write(page_text)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -0,0 +1 @@
|
|||||||
|
2024-05-29 18:05:36,839 ERROR connect Error
|
@ -0,0 +1,110 @@
|
|||||||
|
import requests
|
||||||
|
import dp
|
||||||
|
from lxml import etree
|
||||||
|
# 进程池和管理工具
|
||||||
|
from multiprocessing import Pool, Manager
|
||||||
|
import functools
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import logging
|
||||||
|
|
||||||
|
plt.rcParams['font.sans-serif'] = ['SimHei']  # default font used for chart text
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign '-' from rendering as a box in saved figures

# Module-level logger instance.
logger = logging.getLogger("films_log.txt")

# Output format shared by every record: timestamp, level, message.
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')

# File log only; no terminal (stream) handler is attached.
# The encoding is pinned to UTF-8 because this program logs non-ASCII
# (Chinese) messages; the locale default encoding varies between platforms.
file_handler = logging.FileHandler('films_log.txt', encoding='utf-8')
file_handler.setFormatter(formatter)

logger.setLevel(logging.INFO)
logger.addHandler(file_handler)
|
||||||
|
|
||||||
|
def getPage(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without it a stalled connection
            would block the calling worker forever.

    Returns:
        The response text when the server answers with HTTP 200,
        otherwise ``None``.

    Raises:
        requests.RequestException: Propagated unchanged on connection
            failures and timeouts.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0(Macintosh;Intel Mac OS X 10_11_4)AppleWebKit/537.36(KHTML,like Gecko)Chrome/52.0.2743.116 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code == 200:
        return response.text
    return None
|
||||||
|
|
||||||
|
def insertInfo(item):
    """Insert one film record into the ``films`` table.

    Args:
        item: Mapping with ``'title'``, ``'actor'`` and ``'time'`` keys, the
            shape produced by ``parse``.

    A failure reported by ``DBHelper.execute`` is written to the log and
    echoed to stdout; this function does not raise for it.
    """
    db = dp.DBHelper()
    # The film name lives under 'title'; the previously used 'item' key is
    # never produced by parse() and raised KeyError on every call.
    _title = item['title']
    _actor = item['actor']
    _time = item['time']
    sql = 'insert into films(title, actor, time) values (%s, %s, %s);'
    params = (_title, _actor, _time)
    ok = db.execute(sql, params)

    # NOTE(review): DBHelper.execute's return contract is not visible from
    # this file, so the original `== False` comparison is kept as-is rather
    # than widened to `not ok` (which would also treat None as a failure).
    if ok == False:  # noqa: E712
        logger.error(str(sql))
        # The value must go through a %s placeholder; passing it as a bare
        # extra argument makes the logging module report a formatting error.
        logger.error('params: %s', params)
        logger.error('插入失败')
        print('插入失败')
|
||||||
|
|
||||||
|
def parse(html):
    """Extract film records from the markup of one board page.

    Args:
        html: Page markup as text. ``None`` or an empty string (e.g. when
            the download failed) yields an empty result instead of an error.

    Returns:
        A list of dicts with ``'title'``, ``'actor'`` and ``'time'`` keys,
        one per film row found by the XPath queries. The actor text has its
        first 3 characters removed and the time text its first 5.
    """
    if not html:
        return []

    tree = etree.HTML(html)
    titles = tree.xpath('/html/body/div[4]/div/div/div[1]/dl/dd/div/div/div[1]/p[1]/a/text()')
    actors = tree.xpath('/html/body/div[4]/div/div/div[1]/dl/dd/div/div/div[1]/p[2]/text()')
    times = tree.xpath('/html/body/div[4]/div/div/div[1]/dl/dd/div/div/div[1]/p[3]/text()')

    # zip stops at the shortest list, so a page where one query returns
    # fewer nodes no longer raises IndexError.
    # NOTE(review): the slices presumably drop a fixed label prefix in front
    # of the actor and release-time text — confirm against the live page.
    items = [
        {
            'title': title.strip(),
            'actor': actor.strip()[3:],
            'time': released.strip()[5:],
        }
        for title, actor, released in zip(titles, actors, times)
    ]

    print(items)
    return items
|
||||||
|
|
||||||
|
def analysisCounry():
    """Show a pie chart of how the stored films split across countries.

    Row counts are read from the ``films`` table through ``dp.DBHelper``:
    the overall total plus one ``LIKE`` match on the ``time`` column per
    tracked country. Whatever is left over is reported as "Others".
    """
    db = dp.DBHelper()

    # Same query order as before: total first, then America, China, Japan.
    total_row = db.fetchCount("select count(*) from films;")
    country_queries = (
        'select count(*) from films where time like "%美国%";',
        'select count(*) from films where time like "%中国%";',
        'select count(*) from films where time like "%日本%";',
    )
    country_rows = [db.fetchCount(query) for query in country_queries]

    # The first column of each result row holds the count.
    remainder = total_row[0]
    country_counts = []
    for row in country_rows:
        remainder = remainder - row[0]
        country_counts.append(row[0])

    sizes = (*country_counts, remainder)
    labels = ('America', 'China', 'Japan', 'Others')
    colors = ('cyan', 'red', 'yellow', 'lightgreen')
    explode = (0, 0, 0, 0)

    # Draw the pie chart of the statistics.
    plt.title("热门电影分类信息图")
    plt.pie(
        sizes,
        explode=explode,
        labels=labels,
        colors=colors,
        autopct='%d%%',
        shadow=True,
    )

    plt.show()
|
||||||
|
|
||||||
|
def CrawlMovieInfo(lock, offset):
    """Crawl one board page and store every film found on it.

    Args:
        lock: Lock shared by the worker processes; held around each
            database insert so that inserts do not run concurrently.
        offset: Value of the ``offset`` query parameter selecting the page.
    """
    # Fetch title, leading actors and release time from this page.
    url = 'https://www.maoyan.com/board/4?offset=' + str(offset)
    html = getPage(url)
    if html is None:
        # getPage returns None for any non-200 answer; there is nothing to
        # parse in that case, so record the failure and skip this page.
        logger.error('failed to fetch %s', url)
        return

    for item in parse(html):
        # `with` guarantees the lock is released even when insertInfo
        # raises; a bare acquire()/release() pair would leave it held and
        # block every other worker forever.
        with lock:
            insertInfo(item)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # One manager-backed lock is shared by all workers. It is bound as the
    # first argument so that pool.map only has to supply the page offset.
    shared_lock = Manager().Lock()
    crawl_page = functools.partial(CrawlMovieInfo, shared_lock)

    # Offsets 0, 10, ..., 90: ten pages in total.
    page_offsets = list(range(0, 100, 10))

    pool = Pool()  # create the process pool
    pool.map(crawl_page, page_offsets)  # crawl the pages in parallel
    pool.close()
    pool.join()

    logger.removeHandler(file_handler)

    analysisCounry()
|
@ -1,34 +0,0 @@
|
|||||||
import requests
|
|
||||||
from lxml import etree
|
|
||||||
|
|
||||||
# Fetch the first board page and print one record per film found on it.
url = 'https://www.maoyan.com/board/4?offset=0'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0'
}

# An explicit timeout keeps the script from hanging on a stalled connection.
response = requests.get(url=url, headers=headers, timeout=10)

tree = etree.HTML(response.text)

titles = tree.xpath('/html/body/div[4]/div/div/div[1]/dl/dd/div/div/div[1]/p[1]/a/text()')
actors = tree.xpath('/html/body/div[4]/div/div/div[1]/dl/dd/div/div/div[1]/p[2]/text()')
times = tree.xpath('/html/body/div[4]/div/div/div[1]/dl/dd/div/div/div[1]/p[3]/text()')

items = []

# Per-row values get their own names. The earlier version rebound the
# `actors` / `times` lists to a single string inside the loop, which corrupted
# every iteration after the first, and stored the whole `titles` list under
# 'title' instead of the stripped title.
for raw_title, raw_actor, raw_time in zip(titles, actors, times):
    items.append({
        'title': raw_title.strip(),
        'actor': raw_actor.strip()[3:],
        'time': raw_time.strip()[5:],
    })

for i in items:
    print(i)
|
|
Loading…
Reference in new issue