diff --git a/flaskProject/.idea/.gitignore b/flaskProject/.idea/.gitignore
new file mode 100644
index 0000000..13566b8
--- /dev/null
+++ b/flaskProject/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/flaskProject/.idea/flaskProject.iml b/flaskProject/.idea/flaskProject.iml
new file mode 100644
index 0000000..bed7d6e
--- /dev/null
+++ b/flaskProject/.idea/flaskProject.iml
@@ -0,0 +1,21 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/flaskProject/.idea/inspectionProfiles/profiles_settings.xml b/flaskProject/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/flaskProject/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/flaskProject/.idea/misc.xml b/flaskProject/.idea/misc.xml
new file mode 100644
index 0000000..26eb547
--- /dev/null
+++ b/flaskProject/.idea/misc.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/flaskProject/.idea/modules.xml b/flaskProject/.idea/modules.xml
new file mode 100644
index 0000000..2c2d842
--- /dev/null
+++ b/flaskProject/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/flaskProject/__pycache__/app.cpython-38.pyc b/flaskProject/__pycache__/app.cpython-38.pyc
new file mode 100644
index 0000000..602bc16
Binary files /dev/null and b/flaskProject/__pycache__/app.cpython-38.pyc differ
diff --git a/flaskProject/app.py b/flaskProject/app.py
new file mode 100644
index 0000000..0dc62f8
--- /dev/null
+++ b/flaskProject/app.py
@@ -0,0 +1,185 @@
+from flask import Flask, render_template, request
+import requests
+from lxml import etree
+import pymysql
+
+app = Flask(__name__)
+
def getDB():
    """Open and return a fresh connection to the local `douban` MySQL database."""
    # NOTE(review): credentials are hard-coded — move to env/config for deployment.
    return pymysql.connect(host='localhost', user='root',
                           password='123456', database='douban')
+
+
+
@app.route('/movies')
def movies():
    """Render movies.html from rows previously scraped into the `movies` table."""
    db = getDB()
    try:
        cursor = db.cursor()
        try:
            cursor.execute(
                "select film_name,director,actor,language,ranks,"
                "rating_num,summary,links from movies"
            )
            # fetchall() already yields a sequence of row tuples; copy it with
            # list() instead of an element-by-element append loop.
            datalist = list(cursor.fetchall())
        finally:
            cursor.close()
    finally:
        # Close the connection even when the query raises, so errors do not
        # leak MySQL connections (the original closed only on success).
        db.close()
    return render_template('movies.html', movies=datalist)
+
@app.route('/index')
def index():
    """Serve the site's landing page."""
    return render_template('index.html')
+
+
@app.route('/index1')
def index1():
    """Validate the login query parameters and route to the proper page.

    NOTE(review): credentials are hard-coded and travel as GET query
    parameters (visible in URLs and server logs) — presumably demo-only code;
    confirm before any real deployment.
    """
    username = request.args.get('u')
    password = request.args.get('p')
    # Successful login goes to the main page; anything else falls back to
    # the registration page.
    if username == "肖旺" and password == "123":
        return render_template('index.html')
    return render_template('regist.html')
+
@app.route('/login')
def login():
    """Serve the login page."""
    return render_template('login.html')
+
@app.route('/regist')
def regist():
    """Serve the registration page.

    Bug fix: this route previously rendered login.html (apparent copy-paste
    from /login); regist.html — the registration template already used by
    /index1's failure branch — is the intended page.
    """
    return render_template('regist.html')
+
@app.route('/get_nbadata')
def get_nbadata():
    """Render nba.html from rows previously scraped into the `nba` table."""
    db = getDB()
    try:
        cursor = db.cursor()
        try:
            cursor.execute("select no,name,score,team,info from nba")
            # list() replaces the manual append loop over fetchall().
            datalist = list(cursor.fetchall())
        finally:
            cursor.close()
    finally:
        # Guarantee cleanup on error paths too; the original leaked the
        # connection when execute() raised.
        db.close()
    return render_template('nba.html', nbadata=datalist)
+
+
# Removed: a commented-out (triple-quoted, unused) earlier version of the
# /get_nbadata and /music routes that scraped hupu.com and kugou.com live on
# every request. The scraping logic now lives in get_nbadata.py and
# get_music.py, which persist into MySQL; the routes in this file read from
# the database instead. The old code remains available in version control.
+
@app.route('/music')
def music():
    """Render music.html from rows previously scraped into the `music` table."""
    db = getDB()
    try:
        cursor = db.cursor()
        try:
            cursor.execute("select name,info from music")
            # list() replaces the manual append loop over fetchall().
            datalist = list(cursor.fetchall())
        finally:
            cursor.close()
    finally:
        # Close even when the query fails so connections are never leaked.
        db.close()
    return render_template('music.html', datalist=datalist)
+
+
@app.route('/get_text')
def get_text():
    """Scrape the biqg.cc novel ranking live and render text.html.

    Passes parallel lists of novel names, cleaned author names, and
    detail-page URLs to the template.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
    }
    url = 'https://www.biqg.cc/top/'
    res = requests.get(url=url, headers=headers)
    e = etree.HTML(res.text)
    text_names = e.xpath('//div[@class="wrap rank"]//li/a/text()')
    text_urls = e.xpath('//div[@class="wrap rank"]//li/a/@href')
    text_authors = e.xpath('//div[@class="wrap rank"]//li/text()')
    text_authors2 = []
    # The raw text nodes look like "/author\n"; strip the separators and
    # substitute a placeholder when nothing but whitespace remains.
    # (Iterates the list directly instead of range(len(...)); the leftover
    # debug print() was removed.)
    for raw in text_authors:
        author = str(raw).strip('/').strip('\n').rstrip('\n ')
        text_authors2.append(author if author else '匿名作者')
    return render_template('text.html', text_names=text_names,
                           text_authors2=text_authors2, text_urls=text_urls)
+
+
+
@app.route('/manh')
def get_url_info():
    """Render manh.html from rows previously scraped into the `manh` table."""
    db = getDB()
    try:
        cursor = db.cursor()
        try:
            cursor.execute(
                "select names,ranks,renqi,shoucang,authors,urls from manh")
            # list() replaces the manual append loop over fetchall().
            datalist = list(cursor.fetchall())
        finally:
            cursor.close()
    finally:
        # Guaranteed cleanup: the original leaked the connection on errors.
        db.close()
    return render_template('manh.html', datalist=datalist)
+
@app.route('/qm')
def get_qm_info():
    """Render qm.html from rows previously scraped into the `xiaoshuo` table."""
    db = getDB()
    try:
        cursor = db.cursor()
        try:
            cursor.execute(
                "select names,ranks,authors,summarys,urls from xiaoshuo")
            # list() replaces the manual append loop over fetchall().
            datalist = list(cursor.fetchall())
        finally:
            cursor.close()
    finally:
        # Guaranteed cleanup: the original leaked the connection on errors.
        db.close()
    return render_template('qm.html', datalist=datalist)
+
+
+
@app.route('/about')
def about():
    """Serve the static "about" page."""
    return render_template('about.html')
+
@app.route('/talk')
def talk():
    """Serve the discussion/feedback page."""
    return render_template('talk.html')
+
@app.route('/zhanghao')
def zhanghao():
    """Account entry point — currently reuses the login template."""
    return render_template('login.html')
+
+
if __name__ == '__main__':
    # Start Flask's built-in development server on the default host/port.
    app.run()
diff --git a/flaskProject/getData.py b/flaskProject/getData.py
new file mode 100644
index 0000000..0f7a89b
--- /dev/null
+++ b/flaskProject/getData.py
@@ -0,0 +1,71 @@
+import pymysql
+import requests
+import re
+from bs4 import BeautifulSoup
+
def getDB():
    """Create and return a new connection to the local `douban` database."""
    # NOTE(review): hard-coded credentials, duplicated across modules —
    # a shared config would be cleaner.
    return pymysql.connect(host='localhost', user='root',
                           password='123456', database='douban')
+
def Agent_info():
    """Return the HTTP request headers used for movie.douban.com requests.

    NOTE(review): the Cookie value embeds a logged-in douban session
    (dbcl2/ck tokens) — it will expire and should not live in source
    control; presumably douban requires a session for some pages — confirm.
    """
    headers={
        'Cookie':'118268"; bid=IO1Dg04kIm4; _vwo_uuid_v2=D1A645C6CFFBAF77D4DD98C62F188490B|754ea089c6c61f5af161e2912f2d4bee; __yadk_uid=NpVAgGKfMbTI28NFopWDALVWvzGMJt3S; _pk_id.100001.4cf6=095af3751c7a7a20.1681143032.; ap_v=0,6.0; __utmc=30149280; __utmc=223695111; dbcl2="279593631:HhdIjxDt0FA"; ck=XIW8; __utma=30149280.966668946.1681143033.1712632454.1712639313.6; __utmb=30149280.0.10.1712639313; __utmz=30149280.1712639313.6.2.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=223695111.1690211447.1681143033.1712632454.1712639313.6; __utmb=223695111.0.10.1712639313; __utmz=223695111.1712639313.6.2.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1712639313%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_ses.100001.4cf6=1; push_noty_num=0; push_doumail_num=0; __gads=ID=d8a2141b0213474a-229414c42add00a6:T=1681143032:RT=1712639314:S=ALNI_Mb89dGhTs42z60R9TMxDscyQIzA8A; __gpi=UID=00000bf05307ad13:T=1681143032:RT=1712639314:S=ALNI_MbkC2b_Z_7nO1PL2HHsgHolhWs0iw; __eoi=ID=2f9ca57c63f42bd7:T=1712496871:RT=1712639314:S=AA-AfjbV9P_SdwHly0Xzv8gyJ7ZR',
        'Host':'movie.douban.com',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0'
    }
    return headers
+
def get_url(url):
    """Fetch one Top-250 listing page and return the movie detail-page URLs."""
    print("抓取网址", url)
    headers = Agent_info()
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    # Each ranked movie sits in an element with class "item"; its first <a>
    # links to the movie's detail page.
    return [item.a.get('href')
            for item in soup.find_all(attrs={'class': 'item'})]
+
def get_url_info(film_url, id):
    """Scrape one douban movie detail page and persist it.

    Inserts a row into `movies` and records the douban id in `moviehash`
    (committed together) so the same movie is not scraped twice.
    """
    print("抓取网址", film_url)
    headers = Agent_info()
    response = requests.get(film_url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    ranks = soup.find(attrs={'class': 'top250-no'}).text.split('.')[1]
    film_name = soup.find(attrs={'property': 'v:itemreviewed'}).text
    # The "info" panel is parsed positionally: line 1 = director,
    # line 3 = actors, line 6 = language (fragile if douban changes layout).
    info_lines = soup.find(attrs={'id': 'info'}).text.split('\n')
    director = info_lines[1].split(':')[1].strip()
    actor = str(info_lines[3].split(':')[1].strip().split('/'))
    language = info_lines[6].split(':')[1].strip()
    rating_num = soup.find(attrs={'property': 'v:average'}).text
    summary = soup.find(attrs={'property': 'v:summary'}).text
    # Parameterized queries let the driver do the quoting — this replaces the
    # old string-formatted SQL, which was injection-prone and needed manual
    # pymysql escape_string() calls.
    sql = ('insert into movies (film_name,director,actor,language,ranks,'
           'rating_num,summary,links) values (%s,%s,%s,%s,%s,%s,%s,%s)')
    db = getDB()
    cursor = db.cursor()
    try:
        cursor.execute(sql, (film_name, director, actor, language, ranks,
                             rating_num, summary, film_url))
        cursor.execute('insert into moviehash(movieid) values (%s)', (id,))
        db.commit()
    except Exception as e:
        print(e)
        db.rollback()
    finally:
        # Cleanup now runs on every path (the original skipped it when an
        # unexpected error escaped before close()).
        cursor.close()
        db.close()
+
if __name__ == '__main__':
    print("开始抓取")
    db = getDB()
    cursor = db.cursor()
    try:
        # Two listing pages (start=0 and start=25), 25 movies each.
        for start in range(0, 50, 25):
            film_urls = get_url(
                "https://movie.douban.com/top250?start=" + str(start) + "&filter=")
            # Iterate the URLs directly instead of indexing by range(len(...)).
            for film_url in film_urls:
                # Numeric douban id embedded in the detail URL; the raw string
                # fixes the invalid-escape-sequence warning of '\d\d+'.
                movie_id = re.search(r'\d\d+', film_url).group()
                # Parameterized lookup replaces string-formatted SQL.
                cursor.execute(
                    'select movieid from moviehash where movieid=%s',
                    (movie_id,))
                if not cursor.fetchall():
                    get_url_info(film_url, movie_id)
    finally:
        # The original never closed the connection; make cleanup explicit.
        cursor.close()
        db.close()
diff --git a/flaskProject/get_music.py b/flaskProject/get_music.py
new file mode 100644
index 0000000..7e1038d
--- /dev/null
+++ b/flaskProject/get_music.py
@@ -0,0 +1,35 @@
+import pymysql
+import requests
+from lxml import etree
def getDB():
    """Return a new pymysql connection to the local `douban` database."""
    # NOTE(review): hard-coded credentials duplicated across scraper modules.
    return pymysql.connect(host='localhost', user='root',
                           password='123456', database='douban')
def Agent_info():
    """Return the request headers (desktop-browser User-Agent) for kugou.com."""
    return {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0'
    }
def get_url_info(music_url):
    """Scrape the kugou ranking page and store (title, link) rows in `music`.

    Bug fix: the function used to overwrite its `music_url` argument with a
    hard-coded URL; the parameter is now honoured (the existing caller
    already passes that same URL, so default behavior is unchanged).
    """
    print("抓取网址", music_url)
    headers = Agent_info()
    res = requests.get(url=music_url, headers=headers)
    e = etree.HTML(res.text)
    music_singer = e.xpath('//div[@id="rankWrap"]//ul/li/a/@title')
    music_urls = e.xpath('//div[@id="rankWrap"]//ul/li/a/@href')
    # One connection for the whole batch instead of reconnecting per row, and
    # parameterized SQL instead of injection-prone string formatting. The
    # dead fetchone() after each insert was removed.
    db = getDB()
    cursor = db.cursor()
    try:
        for title, link in zip(music_singer, music_urls):
            try:
                cursor.execute(
                    'insert into music (name,info) values (%s,%s)',
                    (title, link))
                db.commit()
            except Exception as err:
                print(err)
                db.rollback()
    finally:
        cursor.close()
        db.close()
+
if __name__ == '__main__':
    # Scrape the kugou ranking page once when run as a script.
    get_url_info(music_url='https://www.kugou.com/yy/html/rank.html')
\ No newline at end of file
diff --git a/flaskProject/get_nbadata.py b/flaskProject/get_nbadata.py
new file mode 100644
index 0000000..6f9b342
--- /dev/null
+++ b/flaskProject/get_nbadata.py
@@ -0,0 +1,38 @@
+import pymysql
+import requests
+from lxml import etree
def getDB():
    """Open a new connection to the local `douban` MySQL database."""
    # NOTE(review): same hard-coded credentials as the other scraper modules.
    return pymysql.connect(host='localhost', user='root',
                           password='123456', database='douban')
def Agent_info():
    """Return the request headers (desktop-browser User-Agent) for hupu.com."""
    return {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0'
    }
def get_url_info(nba_url):
    """Scrape hupu's NBA player-stats table and store rows in the `nba` table."""
    print("抓取网址", nba_url)
    headers = Agent_info()
    res = requests.get(nba_url, headers=headers)
    e = etree.HTML(res.text)
    names = e.xpath('//table[@class="players_table"]//tr/td[2]/a/text()')
    teams = e.xpath('//table[@class="players_table"]//tr/td[3]/a/text()')
    # td[1]/td[4] text() also match the header row — drop the first entry so
    # ranks and scores line up with the player names.
    nos = e.xpath('//table[@class="players_table"]//tr/td[1]/text()')[1:]
    scores = e.xpath('//table[@class="players_table"]//tr/td[4]/text()')[1:]
    infos = e.xpath('//table[@class="players_table"]//tr/td[2]/a/@href')
    # One connection for the whole batch instead of reconnecting per row;
    # parameterized SQL replaces injection-prone string formatting; the dead
    # fetchone() after each insert was removed; zip() replaces range(len()).
    db = getDB()
    cursor = db.cursor()
    try:
        for no, name, team, score, info in zip(nos, names, teams, scores, infos):
            try:
                cursor.execute(
                    'insert into nba (no,name,team,score,info) '
                    'values (%s,%s,%s,%s,%s)',
                    (no, name, team, score, info))
                db.commit()
            except Exception as err:
                print(err)
                db.rollback()
    finally:
        cursor.close()
        db.close()
if __name__ == '__main__':
    # Scrape hupu's player-stats ranking once when run as a script.
    get_url_info(nba_url="https://nba.hupu.com/stats/players")
\ No newline at end of file
diff --git a/flaskProject/get_xiaoshuo.py b/flaskProject/get_xiaoshuo.py
new file mode 100644
index 0000000..c12c31f
--- /dev/null
+++ b/flaskProject/get_xiaoshuo.py
@@ -0,0 +1,62 @@
+import pymysql
+import requests
+from lxml import etree
+import re
+
+
def getDB():
    """Connect to the local `douban` MySQL database and return the connection."""
    # NOTE(review): hard-coded credentials, same as sibling scraper modules.
    return pymysql.connect(host='localhost', user='root',
                           password='123456', database='douban')
+
def Agent_info():
    """Return the request headers (desktop-browser User-Agent) for qimao.com."""
    return {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
    }
+
+def get_url():
+ headers=Agent_info()
+ manh_url = 'https://www.qimao.com/paihang'
+ res = requests.get(url=manh_url, headers=headers)
+ htmldata=res.text
+ urls=re.findall('
+
+
+
+ Title
+
+
+
+
+
+
+
+        我们网站提供关于爬虫的相关信息和资源,旨在帮助用户了解爬虫技术并学习如何使用它们。爬虫是一种自动的程序,用于在互联网上收集和提取数据。它可以模拟人类的浏览行为,访问网页并抓取所需的信息。爬虫技术在数据分析、市场研究、竞争情报等领域具有广泛应用。在我们的网站上,用户可以找到有关爬虫的教程、指南和实例代码。我们提供入门级别的教程,帮助初学者了解爬虫的基本原理和操作步骤。我们还提供高级教程,介绍更复杂的爬虫技术和应用场景。此外,我们还提供爬虫工具和框架的推荐和评估。用户可以了解不同的爬虫工具,选择适合自己需求的工具来开展爬虫项目。我们还提供爬虫实战案例和经验分享,让用户可以学习和借鉴成功的爬虫项目。我们的目标是帮助用户掌握爬虫技术,并将其应用于实际项目中。我们鼓励用户遵守法律法规和道德准则,在使用爬虫技术时保护个人隐私和网络安全。感谢您选择我们的网站,我们期待为您提供有关爬虫的知识和资源。如有任何问题或需求,请随时联系我们。
+
+
\ No newline at end of file
diff --git a/flaskProject/templates/index.html b/flaskProject/templates/index.html
new file mode 100644
index 0000000..27a8933
--- /dev/null
+++ b/flaskProject/templates/index.html
@@ -0,0 +1,54 @@
+
+
+
+
+ index
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/flaskProject/templates/login.html b/flaskProject/templates/login.html
new file mode 100644
index 0000000..1e9f892
--- /dev/null
+++ b/flaskProject/templates/login.html
@@ -0,0 +1,18 @@
+
+
+
+
+登录界面
+
+
+
+
+
阿肖的爬虫项目
+
+
+
+
\ No newline at end of file
diff --git a/flaskProject/templates/manh.html b/flaskProject/templates/manh.html
new file mode 100644
index 0000000..d000964
--- /dev/null
+++ b/flaskProject/templates/manh.html
@@ -0,0 +1,38 @@
+
+
+
+
+ Title
+
+
+
+
+
+
+ 评分 |
+ 名称 |
+ 人气 |
+ 收藏 |
+ 作者 |
+ 地址 |
+
+ {% for manh in datalist %}
+
+ {{ manh[1] }} |
+ {{ manh[0] }} |
+ {{ manh[2] }} |
+ {{ manh[3] }} |
+ {{ manh[4] }} |
+ 去腾讯动漫看 |
+
+ {% endfor %}
+
+
+
\ No newline at end of file
diff --git a/flaskProject/templates/movies.html b/flaskProject/templates/movies.html
new file mode 100644
index 0000000..bed5264
--- /dev/null
+++ b/flaskProject/templates/movies.html
@@ -0,0 +1,43 @@
+
+
+
+
+ Title
+
+
+
+
+
+
+ 排名 |
+ 评分 |
+ 电影中文名称 |
+ 导演 |
+ 语言 |
+ 演员 |
+ 简介 |
+ 播放地址 |
+
+ {% for movie in movies %}
+
+ {{ movie[4] }} |
+ {{ movie[5] }} |
+ {{ movie[0] }} |
+ {{ movie[1] }} |
+ {{ movie[3] }} |
+ {{ movie[2] }} |
+ {{ movie[6] }} |
+ 去豆瓣看 |
+
+ {% endfor %}
+
+