import re

import pymysql
import requests
from bs4 import BeautifulSoup
from lxml import etree  # noqa: F401 -- kept from original; lxml backs the 'lxml' parser


def getDB():
    '''连接数据库 -- open and return a fresh connection to the local `douban` database.'''
    return pymysql.connect(
        host='localhost',
        user='root',
        password='shengji.',
        database='douban',
    )


def Agent_info():
    ''' 用于保存cookies,url,user-agent信息 -- headers required by movie.douban.com.'''
    return {
        'Cookie': 'douban-fav-remind=1; bid=LP2o54Mcp34; ll="118268"; ap_v=0,6.0',
        'Host': 'movie.douban.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62',
    }


# 获取电影详情url地址列表和外国电影名字
def get_url(url):
    '''Scrape one Top250 list page.

    Returns (film_urls, movie_list): detail-page URLs and the foreign
    (second) title of each movie, with its leading " / " stripped.
    '''
    print('抓取网址:', url)
    headers = Agent_info()
    request = requests.get(url, headers=headers)
    soup = BeautifulSoup(request.text, 'lxml')

    # 电影详情url地址列表 -- one <div class="pic"><a href=...> per movie
    film_urls = [pic.a.get('href') for pic in soup.find_all(attrs={'class': 'pic'})]

    # 外国电影名字 -- contents[3] is the second <span> of the title anchor;
    # its text starts with "/ ", which the [2:] slice removes
    movie_list = []
    for each in soup.find_all('div', class_='hd'):
        movie = each.a.contents[3].text.strip()
        movie_list.append(movie[2:])

    return film_urls, movie_list


# 获取电影信息
def get_url_info(film_url, film_name_en, id):
    '''Scrape one movie detail page and insert it into the database.

    Writes one row to `movies`, one to `moviehash`, and one per genre to
    `movietype`; rolls back all three on any failure.
    '''
    print('抓取网址:', film_url)
    headers = Agent_info()
    request = requests.get(film_url, headers=headers)
    soup = BeautifulSoup(request.text, 'lxml')

    # 排名 -- "No.NN" text, keep the part after the dot
    ranks = soup.find(attrs={'class': "top250-no"}).text.split(".")[1]
    # 电影中文名 -- first space-separated token of the full title
    film_name = soup.find(attrs={'property': 'v:itemreviewed'}).text.split(' ')[0]

    # The #info panel is parsed positionally by line number; hoist the split
    # once instead of re-querying/re-splitting for every field (original did
    # the same fixed-index parse up to 8 times).
    info_lines = soup.find(attrs={'id': "info"}).text.split('\n')
    # 导演
    director = info_lines[1].split(':')[1].strip()
    # 编剧
    scriptwriter = info_lines[2].split(':')[1].strip()
    # 主演
    actor = info_lines[3].split(':')[1].strip()
    # 类型 -- slash-separated list of genres
    filmtype = info_lines[4].split(':')[1].strip()
    types = filmtype.split("/")

    # Some entries carry an extra "官方网站" (official site) line at index 5,
    # which shifts the remaining fields down by one.
    base = 6 if info_lines[5].split(':')[0] == '官方网站' else 5
    # 制片国家/地区
    area = info_lines[base].split(':')[1].strip()
    # 语言
    language = info_lines[base + 1].split(':')[1].strip()
    # 上映日期 -- drop the trailing "(地区)" qualifier
    initialrReleaseDate = info_lines[base + 2].split(':')[1].split('(')[0].strip()

    # 片长
    runtime = soup.find(attrs={'property': "v:runtime"}).text
    # 评分(平均分)
    rating_num = soup.find(attrs={'property': "v:average"}).text
    # 五星评分比例 -- first .rating_per element is the 5-star row
    stars5_rating_per = soup.find(attrs={'class': "rating_per"}).text
    # 评价人数
    rating_people = soup.find(attrs={'property': "v:votes"}).text
    # 剧情简介
    summary = soup.find(attrs={'property': "v:summary"}).text

    # 存到数据库 -- parameterized query: the original built SQL with
    # str.format and only escaped `summary`, so any quote in a scraped
    # field (names, titles) broke the INSERT / allowed injection.
    sql = (
        'insert into movies(film_name,director,scriptwriter,actor,filmtype,'
        'area,language,initialrReleaseDate,ranks,runtime,rating_num,'
        'stars5_rating_per,rating_people,summary,film_name_en,links) '
        'values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);'
    )
    db = getDB()
    cursor = db.cursor()  # created outside try so the finally-close is always valid
    try:
        cursor.execute(sql, (
            film_name, director, scriptwriter, actor, filmtype, area,
            language, initialrReleaseDate, ranks, runtime, rating_num,
            stars5_rating_per, rating_people, summary, film_name_en, film_url,
        ))
        cursor.execute('insert into moviehash(movieid) values(%s)', (id,))
        for one_type in types:
            cursor.execute(
                'insert into movietype(movieid,filmtype) values(%s,%s)',
                (id, one_type.strip()),
            )
        db.commit()
    except Exception as e:
        print(e)
        db.rollback()
    finally:
        # original closed the cursor unconditionally AFTER the except block,
        # which raised NameError if db.cursor() itself had failed
        cursor.close()
        db.close()


if __name__ == '__main__':
    print('开始抓取')
    db = getDB()
    cursor = db.cursor()
    # Top250 is paginated 25 per page via the `start` query parameter.
    for start in range(0, 250, 25):
        film_urls, movie_list = get_url(
            "https://movie.douban.com/top250?start=" + str(start) + "&filter=")
        for film_url, film_name_en in zip(film_urls, movie_list):
            # numeric subject id from the detail URL, e.g. .../subject/1291561/
            movie_id = re.search(r'\d\d+', film_url).group()
            # skip movies already recorded in moviehash (resumable scraping)
            cursor.execute(
                'select movieid from moviehash where movieid = %s', (movie_id,))
            if not cursor.fetchall():
                get_url_info(film_url, film_name_en, movie_id)
    cursor.close()
    db.close()
    # get_url_info("https://movie.douban.com/subject/1291561/","111","1291561")