diff --git a/flaskProject/.idea/.gitignore b/flaskProject/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/flaskProject/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/flaskProject/.idea/flaskProject.iml b/flaskProject/.idea/flaskProject.iml new file mode 100644 index 0000000..bed7d6e --- /dev/null +++ b/flaskProject/.idea/flaskProject.iml @@ -0,0 +1,21 @@ + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/flaskProject/.idea/inspectionProfiles/profiles_settings.xml b/flaskProject/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/flaskProject/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/flaskProject/.idea/misc.xml b/flaskProject/.idea/misc.xml new file mode 100644 index 0000000..26eb547 --- /dev/null +++ b/flaskProject/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/flaskProject/.idea/modules.xml b/flaskProject/.idea/modules.xml new file mode 100644 index 0000000..2c2d842 --- /dev/null +++ b/flaskProject/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/flaskProject/__pycache__/app.cpython-38.pyc b/flaskProject/__pycache__/app.cpython-38.pyc new file mode 100644 index 0000000..602bc16 Binary files /dev/null and b/flaskProject/__pycache__/app.cpython-38.pyc differ diff --git a/flaskProject/app.py b/flaskProject/app.py new file mode 100644 index 0000000..0dc62f8 --- /dev/null +++ b/flaskProject/app.py @@ -0,0 +1,185 @@ +from flask import Flask, render_template, request +import requests +from lxml import etree +import pymysql + +app = Flask(__name__) + +def getDB(): + db = 
pymysql.connect(host='localhost',user='root',password='123456',database='douban') + return db + + + +@app.route('/movies') +def movies(): + db=getDB() + cursor=db.cursor() + sql="select film_name,director,actor,language,ranks,rating_num,summary,links from movies" + cursor.execute(sql) + data=cursor.fetchall() + datalist=[] + for item in data: + datalist.append(item) + cursor.close() + db.close() + return render_template('movies.html',movies=datalist) + +@app.route('/index') +def index(): + return render_template('index.html') + + +@app.route('/index1') +def index1(): + uname=request.args.get('u') + psw=request.args.get('p') + if uname=="肖旺" and psw=="123": + return render_template('index.html') + else: + return render_template('regist.html') + +@app.route('/login') +def login(): + return render_template('login.html') + +@app.route('/regist') +def regist(): + return render_template('login.html') + +@app.route('/get_nbadata') +def get_nbadata(): + db=getDB() + cursor=db.cursor() + sql="select no,name,score,team,info from nba" + cursor.execute(sql) + data=cursor.fetchall() + datalist=[] + for item in data: + datalist.append(item) + cursor.close() + db.close() + return render_template('nba.html',nbadata=datalist) + + +'''@app.route('/get_nbadata') +def get_nbadata(): + url='https://nba.hupu.com/stats/players' + header={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0'} + res=requests.get(url,headers=header) + e=etree.HTML(res.text) + names=e.xpath('//table[@class="players_table"]//tr/td[2]/a/text()') + teams=e.xpath('//table[@class="players_table"]//tr/td[3]/a/text()') + nos=e.xpath('//table[@class="players_table"]//tr/td[1]/text()') + nos=nos[1::] + scores=e.xpath('//table[@class="players_table"]//tr/td[4]/text()') + scores=scores[1::] + + infos=e.xpath('//table[@class="players_table"]//tr/td[2]/a/@href') + total_ls = [] + infos_ls=[] + for no, name, team, score in zip(nos, 
names, teams, scores): + st1 = f"排名:{no} 姓名:{name} 队伍:{team} 得分:{score}" + total_ls.append(st1) + for info in infos: + infos_ls.append(info) + return render_template('nba.html',total_ls=total_ls,infos_ls=infos_ls) + + +@app.route('/music') +def get_music(): + + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0' + } + url = 'https://www.kugou.com/yy/html/rank.html' + res = requests.get(url=url, headers=headers) + e = etree.HTML(res.text) + music_names = e.xpath('//div[@id="rankWrap"]//ul/li/a/text()') + music_urls = e.xpath('//div[@id="rankWrap"]//ul/li/a/@href') + return render_template('music.html', music_urls=music_urls, music_names=music_names) +''' + +@app.route('/music') +def music(): + db=getDB() + cursor=db.cursor() + sql="select name,info from music" + cursor.execute(sql) + data=cursor.fetchall() + datalist=[] + for item in data: + datalist.append(item) + cursor.close() + db.close() + return render_template('music.html',datalist=datalist) + + +@app.route('/get_text') +def get_text(): + headers= { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36' + } + url = 'https://www.biqg.cc/top/' + res = requests.get(url=url,headers=headers) + e = etree.HTML(res.text) + text_names=e.xpath('//div[@class="wrap rank"]//li/a/text()') + text_urls=e.xpath('//div[@class="wrap rank"]//li/a/@href') + text_authors=e.xpath('//div[@class="wrap rank"]//li/text()') + text_authors2 = [] + for i in range(len(text_authors)): + x = str(text_authors[i]).strip('/').strip('\n').rstrip('\n ') + if x=='': + text_authors2.append('匿名作者') + else: + text_authors2.append(x) + print(text_authors2) + return render_template('text.html',text_names=text_names,text_authors2=text_authors2,text_urls=text_urls) + + + +@app.route('/manh') +def get_url_info(): + db = getDB() + cursor = db.cursor() + sql = "select 
names,ranks,renqi,shoucang,authors,urls from manh" + cursor.execute(sql) + data = cursor.fetchall() + datalist = [] + for item in data: + datalist.append(item) + cursor.close() + db.close() + return render_template('manh.html', datalist=datalist) + +@app.route('/qm') +def get_qm_info(): + db = getDB() + cursor = db.cursor() + sql = "select names,ranks,authors,summarys,urls from xiaoshuo" + cursor.execute(sql) + data = cursor.fetchall() + datalist = [] + for item in data: + datalist.append(item) + cursor.close() + db.close() + return render_template('qm.html', datalist=datalist) + + + +@app.route('/about') +def about(): + return render_template('about.html') + +@app.route('/talk') +def talk(): + return render_template('talk.html') + +@app.route('/zhanghao') +def zhanghao(): + return render_template('login.html') + + +if __name__=='__main__': + app.run() diff --git a/flaskProject/getData.py b/flaskProject/getData.py new file mode 100644 index 0000000..0f7a89b --- /dev/null +++ b/flaskProject/getData.py @@ -0,0 +1,71 @@ +import pymysql +import requests +import re +from bs4 import BeautifulSoup + +def getDB(): + db = pymysql.connect(host='localhost',user='root',password='123456',database='douban') + return db + +def Agent_info(): + headers={ + 'Cookie':'118268"; bid=IO1Dg04kIm4; _vwo_uuid_v2=D1A645C6CFFBAF77D4DD98C62F188490B|754ea089c6c61f5af161e2912f2d4bee; __yadk_uid=NpVAgGKfMbTI28NFopWDALVWvzGMJt3S; _pk_id.100001.4cf6=095af3751c7a7a20.1681143032.; ap_v=0,6.0; __utmc=30149280; __utmc=223695111; dbcl2="279593631:HhdIjxDt0FA"; ck=XIW8; __utma=30149280.966668946.1681143033.1712632454.1712639313.6; __utmb=30149280.0.10.1712639313; __utmz=30149280.1712639313.6.2.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=223695111.1690211447.1681143033.1712632454.1712639313.6; __utmb=223695111.0.10.1712639313; __utmz=223695111.1712639313.6.2.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; 
_pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1712639313%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_ses.100001.4cf6=1; push_noty_num=0; push_doumail_num=0; __gads=ID=d8a2141b0213474a-229414c42add00a6:T=1681143032:RT=1712639314:S=ALNI_Mb89dGhTs42z60R9TMxDscyQIzA8A; __gpi=UID=00000bf05307ad13:T=1681143032:RT=1712639314:S=ALNI_MbkC2b_Z_7nO1PL2HHsgHolhWs0iw; __eoi=ID=2f9ca57c63f42bd7:T=1712496871:RT=1712639314:S=AA-AfjbV9P_SdwHly0Xzv8gyJ7ZR', + 'Host':'movie.douban.com', + 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0' + } + return headers + +def get_url(url): + print("抓取网址",url) + headers=Agent_info() + request=requests.get(url,headers=headers) + soup=BeautifulSoup(request.text,'lxml') + pic=soup.find_all(attrs={'class' :'item'}) + film_urls=[] + for x in pic: + href=x.a.get('href') + film_urls.append(href) + + return film_urls + +def get_url_info(film_url,id): + print("抓取网址", film_url) + headers = Agent_info() + request = requests.get(film_url, headers=headers) + soup = BeautifulSoup(request.text, 'lxml') + ranks=soup.find(attrs={'class':'top250-no'}).text.split('.')[1] + film_name=soup.find(attrs={'property':'v:itemreviewed'}).text + director=soup.find(attrs={'id':'info'}).text.split('\n')[1].split(':')[1].strip() + actor=soup.find(attrs={'id':'info'}).text.split('\n')[3].split(':')[1].strip().split('/') + actor= str(actor) + actor = pymysql.converters.escape_string(actor) + language=soup.find(attrs={'id':'info'}).text.split('\n')[6].split(':')[1].strip() + rating_num = soup.find(attrs={'property':'v:average'}).text + summary = soup.find(attrs={'property': 'v:summary'}).text + summary=pymysql.converters.escape_string(summary) + sql = 'insert into movies (film_name,director,actor,language,ranks,rating_num,summary,links) values ("{}","{}","{}","{}","{}","{}","{}","{}")'.format(film_name,director,actor,language,ranks,rating_num,summary,film_url) + db = getDB() + cursor = 
db.cursor() + try: + cursor.execute(sql) + cursor.execute('insert into moviehash(movieid) values ("{}")'.format(id)) + db.commit() + except Exception as e: + print(e) + db.rollback() + cursor.close() + db.close() + +if __name__ == '__main__': + print("开始抓取") + db=getDB() + cursor=db.cursor() + for i in range(0,50,25): + film_urls= get_url("https://movie.douban.com/top250?start="+str(i)+"&filter=") + for film_url in range(len(film_urls)): + id=re.search('\d\d+',film_urls[film_url]).group() + sql='select movieid from moviehash where movieid="{}";'.format(id) + cursor.execute(sql) + data=cursor.fetchall() + if not data: + get_url_info(film_urls[film_url],id) \ No newline at end of file diff --git a/flaskProject/get_music.py b/flaskProject/get_music.py new file mode 100644 index 0000000..7e1038d --- /dev/null +++ b/flaskProject/get_music.py @@ -0,0 +1,35 @@ +import pymysql +import requests +from lxml import etree +def getDB(): + db = pymysql.connect(host='localhost',user='root',password='123456',database='douban') + return db +def Agent_info(): + headers={ + 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0' + } + return headers +def get_url_info(music_url): + print("抓取网址", music_url) + music_url = 'https://www.kugou.com/yy/html/rank.html' + headers = Agent_info() + res = requests.get(url=music_url, headers=headers) + e = etree.HTML(res.text) + music_singer =e.xpath('//div[@id="rankWrap"]//ul/li/a/@title') + music_urls = e.xpath('//div[@id="rankWrap"]//ul/li/a/@href') + for i in range(len(music_singer)): + sql = 'insert into music (name,info) values ("{}","{}")'.format(music_singer[i],music_urls[i]) + db = getDB() + cursor = db.cursor() + try: + cursor.execute(sql) + db.commit() + except Exception as e: + print(e) + db.rollback() + result = cursor.fetchone() + cursor.close() + db.close() + +if __name__ == '__main__': + get_url_info(music_url= 
'https://www.kugou.com/yy/html/rank.html') \ No newline at end of file diff --git a/flaskProject/get_nbadata.py b/flaskProject/get_nbadata.py new file mode 100644 index 0000000..6f9b342 --- /dev/null +++ b/flaskProject/get_nbadata.py @@ -0,0 +1,38 @@ +import pymysql +import requests +from lxml import etree +def getDB(): + db = pymysql.connect(host='localhost',user='root',password='123456',database='douban') + return db +def Agent_info(): + headers={ + 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0' + } + return headers +def get_url_info(nba_url): + print("抓取网址", nba_url) + headers = Agent_info() + res = requests.get(nba_url, headers=headers) + e = etree.HTML(res.text) + names = e.xpath('//table[@class="players_table"]//tr/td[2]/a/text()') + teams = e.xpath('//table[@class="players_table"]//tr/td[3]/a/text()') + nos = e.xpath('//table[@class="players_table"]//tr/td[1]/text()') + nos = nos[1::] + scores = e.xpath('//table[@class="players_table"]//tr/td[4]/text()') + scores = scores[1::] + infos = e.xpath('//table[@class="players_table"]//tr/td[2]/a/@href') + for i in range(len(names)): + sql = 'insert into nba (no,name,team,score,info) values ("{}","{}","{}","{}","{}")'.format(nos[i],names[i],teams[i],scores[i],infos[i]) + db = getDB() + cursor = db.cursor() + try: + cursor.execute(sql) + db.commit() + except Exception as e: + print(e) + db.rollback() + result = cursor.fetchone() + cursor.close() + db.close() +if __name__ == '__main__': + get_url_info(nba_url="https://nba.hupu.com/stats/players") \ No newline at end of file diff --git a/flaskProject/get_xiaoshuo.py b/flaskProject/get_xiaoshuo.py new file mode 100644 index 0000000..c12c31f --- /dev/null +++ b/flaskProject/get_xiaoshuo.py @@ -0,0 +1,62 @@ +import pymysql +import requests +from lxml import etree +import re + + +def getDB(): + db = pymysql.connect(host='localhost',user='root',password='123456',database='douban') 
+ return db + +def Agent_info(): + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36' + } + return headers + +def get_url(): + headers=Agent_info() + manh_url = 'https://www.qimao.com/paihang' + res = requests.get(url=manh_url, headers=headers) + htmldata=res.text + urls=re.findall(' + + + + Title + + + +
+
+ +
+ +
+

       我们网站提供关于爬虫的相关信息和资源,旨在帮助用户了解爬虫技术并学习如何使用它们。爬虫是一种自动化的程序,用于在互联网上收集和提取数据。它可以模拟人类的浏览行为,访问网页并抓取所需的信息。爬虫技术在数据分析、市场研究、竞争情报等领域具有广泛应用。在我们的网站上,用户可以找到有关爬虫的教程、指南和实例代码。我们提供入门级别的教程,帮助初学者了解爬虫的基本原理和操作步骤。我们还提供高级教程,介绍更复杂的爬虫技术和应用场景。此外,我们还提供爬虫工具和框架的推荐和评估。用户可以了解不同的爬虫工具,选择适合自己需求的工具来开展爬虫项目。我们还提供爬虫实战案例和经验分享,让用户可以学习和借鉴成功的爬虫项目。我们的目标是帮助用户掌握爬虫技术,并将其应用于实际项目中。我们鼓励用户遵守法律法规和道德准则,在使用爬虫技术时保护个人隐私和网络安全。感谢您选择我们的网站,我们期待为您提供有关爬虫的知识和资源。如有任何问题或需求,请随时联系我们。

+ + \ No newline at end of file diff --git a/flaskProject/templates/index.html b/flaskProject/templates/index.html new file mode 100644 index 0000000..27a8933 --- /dev/null +++ b/flaskProject/templates/index.html @@ -0,0 +1,54 @@ + + + + + index + + + + +
+ + +
+
+ +
+ +
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+ + + \ No newline at end of file diff --git a/flaskProject/templates/login.html b/flaskProject/templates/login.html new file mode 100644 index 0000000..1e9f892 --- /dev/null +++ b/flaskProject/templates/login.html @@ -0,0 +1,18 @@ + + + + +登录界面 + + + +
+

阿肖的爬虫项目

+
+ + + +
+
+ + \ No newline at end of file diff --git a/flaskProject/templates/manh.html b/flaskProject/templates/manh.html new file mode 100644 index 0000000..d000964 --- /dev/null +++ b/flaskProject/templates/manh.html @@ -0,0 +1,38 @@ + + + + + Title + + + +
+
+

腾讯漫画

+
+
+ +
+
+ + + + + + + + + + {% for manh in datalist %} + + + + + + + + + {% endfor %} +
评分名称人气收藏作者地址
{{ manh[1] }}{{ manh[0] }}{{ manh[2] }}{{ manh[3] }}{{ manh[4] }}去腾讯动漫看
+ + \ No newline at end of file diff --git a/flaskProject/templates/movies.html b/flaskProject/templates/movies.html new file mode 100644 index 0000000..bed5264 --- /dev/null +++ b/flaskProject/templates/movies.html @@ -0,0 +1,43 @@ + + + + + Title + + + +
+
+

豆瓣电影top50排行榜

+
+
+ +
+
+ + + + + + + + + + + + {% for movie in movies %} + + + + + + + + + + + {% endfor %} +
排名评分电影中文名称导演语言演员简介播放地址
{{ movie[4] }}{{ movie[5] }}{{ movie[0] }}{{ movie[1] }}{{ movie[3] }}{{ movie[2] }}{{ movie[6] }}去豆瓣看
+ + + \ No newline at end of file diff --git a/flaskProject/templates/music.html b/flaskProject/templates/music.html new file mode 100644 index 0000000..f16c66e --- /dev/null +++ b/flaskProject/templates/music.html @@ -0,0 +1,30 @@ + + + + + Title + + + +
+
+

酷狗音乐排行榜

+
+
+ +
+
+ + + + + + {% for music in datalist %} + + + + + {% endfor %} +
歌名播放地址
{{ music[0] }}播放
+ + \ No newline at end of file diff --git a/flaskProject/templates/nba.html b/flaskProject/templates/nba.html new file mode 100644 index 0000000..e414a1d --- /dev/null +++ b/flaskProject/templates/nba.html @@ -0,0 +1,37 @@ + + + + + Title + + + +
+
+

NBA球员top50排行榜

+
+
+ +
+
+ + + + + + + + + + {% for nba in nbadata %} + + + + + + + + {% endfor %} +
排名姓名得分队伍球员信息
{{ nba[0] }}{{ nba[1] }}{{ nba[2] }}{{ nba[3] }}去虎扑网看
+ + \ No newline at end of file diff --git a/flaskProject/templates/qm.html b/flaskProject/templates/qm.html new file mode 100644 index 0000000..179c4d4 --- /dev/null +++ b/flaskProject/templates/qm.html @@ -0,0 +1,36 @@ + + + + + Title + + + +
+
+

七猫小说

+
+
+ +
+
+ + + + + + + + + {% for qm in datalist %} + + + + + + + + {% endfor %} +
评分名称作者简介地址
{{ qm[1] }}{{ qm[0] }}{{ qm[2] }}{{ qm[3] }}去七猫小说看
+ + \ No newline at end of file diff --git a/flaskProject/templates/regist.html b/flaskProject/templates/regist.html new file mode 100644 index 0000000..9ea25b3 --- /dev/null +++ b/flaskProject/templates/regist.html @@ -0,0 +1,36 @@ + + + + + + Document + + + + +
+

输入有误,请重新登录

+ +
+ + \ No newline at end of file diff --git a/flaskProject/templates/talk.html b/flaskProject/templates/talk.html new file mode 100644 index 0000000..2aa5008 --- /dev/null +++ b/flaskProject/templates/talk.html @@ -0,0 +1,56 @@ + + + + + + + + +
+

评论区


+ + +
+
+ + + + diff --git a/flaskProject/templates/text.html b/flaskProject/templates/text.html new file mode 100644 index 0000000..62f4abc --- /dev/null +++ b/flaskProject/templates/text.html @@ -0,0 +1,40 @@ + + + + + Title + + + +
+
+

笔趣阁小说

+
+ +
+
+ +
+ + {% for name in text_names %} + + + + {% endfor %} +
{{ name }}
+ + {% for author in text_authors2 %} + + + + {% endfor %} +
{{ author }}
+ + {% for url in text_urls %} + + + + {% endfor %} +
看小说
+ + \ No newline at end of file diff --git a/flaskProject/test.py b/flaskProject/test.py new file mode 100644 index 0000000..edd7f98 --- /dev/null +++ b/flaskProject/test.py @@ -0,0 +1,26 @@ +import pymysql +import requests +from lxml import etree +import re + +def get_news(): + headers= { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36' + } + url = 'https://news.sina.com.cn/hotnews/#1' + res = requests.get(url=url,headers=headers) + e = res.json() + + return e + + + +if __name__ == '__main__': + urls=get_news() + print(urls) + + + + + +