import re

import pymysql
import requests
from bs4 import BeautifulSoup


def getDB():
    # Open a connection to the local MySQL database "douban"
    db = pymysql.connect(host='localhost', user='root', password='123456', database='douban')
    return db


def Agent_info():
    # Request headers: Cookie and User-Agent mimic a logged-in browser session
    headers = {
        'Cookie': '118268"; bid=IO1Dg04kIm4; _vwo_uuid_v2=D1A645C6CFFBAF77D4DD98C62F188490B|754ea089c6c61f5af161e2912f2d4bee; __yadk_uid=NpVAgGKfMbTI28NFopWDALVWvzGMJt3S; _pk_id.100001.4cf6=095af3751c7a7a20.1681143032.; ap_v=0,6.0; __utmc=30149280; __utmc=223695111; dbcl2="279593631:HhdIjxDt0FA"; ck=XIW8; __utma=30149280.966668946.1681143033.1712632454.1712639313.6; __utmb=30149280.0.10.1712639313; __utmz=30149280.1712639313.6.2.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=223695111.1690211447.1681143033.1712632454.1712639313.6; __utmb=223695111.0.10.1712639313; __utmz=223695111.1712639313.6.2.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1712639313%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_ses.100001.4cf6=1; push_noty_num=0; push_doumail_num=0; __gads=ID=d8a2141b0213474a-229414c42add00a6:T=1681143032:RT=1712639314:S=ALNI_Mb89dGhTs42z60R9TMxDscyQIzA8A; __gpi=UID=00000bf05307ad13:T=1681143032:RT=1712639314:S=ALNI_MbkC2b_Z_7nO1PL2HHsgHolhWs0iw; __eoi=ID=2f9ca57c63f42bd7:T=1712496871:RT=1712639314:S=AA-AfjbV9P_SdwHly0Xzv8gyJ7ZR',
        'Host': 'movie.douban.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0'
    }
    return headers


def get_url(url):
    # Fetch one list page and return the detail-page links of all films on it
    print("Fetching URL", url)
    headers = Agent_info()
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    items = soup.find_all(attrs={'class': 'item'})
    film_urls = []
    for item in items:
        href = item.a.get('href')
        film_urls.append(href)
    return film_urls


def get_url_info(film_url, id):
    # Fetch one film detail page, parse the fields, and insert them into the database
    print("Fetching URL", film_url)
    headers = Agent_info()
    response = requests.get(film_url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    ranks = soup.find(attrs={'class': 'top250-no'}).text.split('.')[1]  # "No.1" -> "1"
    film_name = soup.find(attrs={'property': 'v:itemreviewed'}).text
    info = soup.find(attrs={'id': 'info'}).text
    director = info.split('\n')[1].split(':')[1].strip()
    actor = info.split('\n')[3].split(':')[1].strip().split('/')
    actor = pymysql.converters.escape_string(str(actor))
    language = info.split('\n')[6].split(':')[1].strip()
    rating_num = soup.find(attrs={'property': 'v:average'}).text
    summary = soup.find(attrs={'property': 'v:summary'}).text
    summary = pymysql.converters.escape_string(summary)
    sql = ('insert into movies (film_name,director,actor,language,ranks,rating_num,summary,links) '
           'values ("{}","{}","{}","{}","{}","{}","{}","{}")').format(
        film_name, director, actor, language, ranks, rating_num, summary, film_url)
    db = getDB()
    cursor = db.cursor()
    try:
        cursor.execute(sql)
        # Record the crawled movie id so it is skipped on later runs
        cursor.execute('insert into moviehash(movieid) values ("{}")'.format(id))
        db.commit()
    except Exception as e:
        print(e)
        db.rollback()
    cursor.close()
    db.close()


if __name__ == '__main__':
    print("Start crawling")
    db = getDB()
    cursor = db.cursor()
    # Each list page holds 25 films; range(0, 50, 25) crawls the first two pages
    # (use range(0, 250, 25) for the full Top 250)
    for i in range(0, 50, 25):
        film_urls = get_url("https://movie.douban.com/top250?start=" + str(i) + "&filter=")
        for film_url in film_urls:
            id = re.search(r'\d\d+', film_url).group()
            # Skip films whose id is already recorded in moviehash
            sql = 'select movieid from moviehash where movieid="{}";'.format(id)
            cursor.execute(sql)
            data = cursor.fetchall()
            if not data:
                get_url_info(film_url, id)
    cursor.close()
    db.close()
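

# Note: the INSERT statements above assume that the tables `movies` and `moviehash`
# already exist in the `douban` database. The original script does not include the
# table definitions; the helper below is only a minimal sketch inferred from the
# column names used above, and every column type is an assumption. It is never
# called by the script itself -- run it manually once before the first crawl.
def create_tables():
    db = getDB()
    cursor = db.cursor()
    cursor.execute("""
        create table if not exists movies (
            id int auto_increment primary key,
            film_name varchar(255),
            director varchar(255),
            actor text,
            language varchar(255),
            ranks int,
            rating_num varchar(16),
            summary text,
            links varchar(255)
        )
    """)
    cursor.execute("""
        create table if not exists moviehash (
            movieid varchar(32) primary key
        )
    """)
    db.commit()
    cursor.close()
    db.close()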