ADD file via upload

1 year ago · 706570807f
parent f89c39db89
commit 706570807f
1 changed files with 238 additions and 0 deletions
--- a/douban.movie.py
+++ b/douban.movie.py
@@ -0,0 +1,238 @@
+from bs4 import BeautifulSoup  # 网页解析，获取数据
+import re  # 正则表达式，进行文字匹配`
+import urllib.request, urllib.error  # 制定URL，获取网页数据
+import xlwt  # 进行excel操作
+import sqlite3  # 进行SQLite数据库操作
+from concurrent.futures import ThreadPoolExecutor #优化，使用了异步IO和线程池，可以提高程序的并发处理能力和性能。
+import asyncio                 #使用了异步IO和线程池，可以提高程序的并发处理能力和性能
+import matplotlib.pyplot as plt
+import numpy as np
+from tqdm import tqdm
+
+
+# Pre-compiled patterns that parse_page applies to the HTML text of a single
+# movie entry (one <div class="item"> element converted to a string).
+findLink = re.compile(r'<a href="(.*?)">')  # href of an <a> tag; parse_page takes the first match as the detail-page link
+findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)  # src of the <img> tag; re.S lets "." cross line breaks
+findTitle = re.compile(r'<span class="title">(.*)</span>')  # text of each "title" span; parse_page handles one or two matches
+findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')  # text of the "rating_num" span
+findJudge = re.compile(r'<span>(\d*)人评价</span>')  # digits in front of "人评价" (number of raters)
+findInq = re.compile(r'<span class="inq">(.*)</span>')  # text of the "inq" span; parse_page tolerates it being absent
+findBd = re.compile(r'<p class="">(.*?)</p>', re.S)  # content of the <p class=""> paragraph; re.S because it spans lines
+
+
+def askURL(url, timeout=10):
+    """Fetch ``url`` and return the response body decoded as UTF-8.
+
+    The request is sent with a browser-like ``User-Agent`` header. Failures
+    are reported on stdout and swallowed, so callers always get a string.
+
+    Args:
+        url: Address to request.
+        timeout: Seconds a blocking network operation may take before it is
+            abandoned. Without a timeout a stalled connection would block
+            the calling thread indefinitely.
+
+    Returns:
+        The decoded page text, or ``""`` when the request failed.
+    """
+    head = {  # browser-like header sent with the request
+        "User-Agent": "Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 80.0.3987.122  Safari / 537.36"
+    }
+    request = urllib.request.Request(url, headers=head)
+    html = ""
+    try:
+        # The with-block closes the response even if read() or decode() fails.
+        with urllib.request.urlopen(request, timeout=timeout) as response:
+            html = response.read().decode("utf-8")
+    except urllib.error.URLError as e:
+        # HTTPError carries a status code; every URLError carries a reason.
+        if hasattr(e, "code"):
+            print(e.code)
+        if hasattr(e, "reason"):
+            print(e.reason)
+    except OSError as e:
+        # Socket errors raised while the response is being read (timeouts,
+        # resets) are not wrapped in URLError; keep the best-effort contract.
+        print(e)
+    return html
+
+def parse_page(html):
+    """Extract one record per movie from the HTML of a listing page.
+
+    Args:
+        html: HTML text of the page.
+
+    Returns:
+        A list holding one 8-item list per ``<div class="item">`` element,
+        in page order. The items are, in order: detail link, image link,
+        first title, second title (``' '`` when there are not exactly two
+        titles), rating, number of raters, quote (``" "`` when absent) and
+        the info paragraph. Every item is a string. A page without such
+        elements gives ``[]``.
+
+    Raises:
+        IndexError: If an entry has no link, image, title, rating, rater
+            count or info paragraph.
+    """
+    soup = BeautifulSoup(html, "html.parser")
+    data_list = []
+    for item in soup.find_all('div', class_="item"):
+        item = str(item)  # the patterns run on the entry's HTML text
+        link = findLink.findall(item)[0]
+        img_src = findImgSrc.findall(item)[0]
+
+        titles = findTitle.findall(item)
+        if len(titles) == 2:
+            c_title = titles[0]
+            # Remove "/" characters (presumably the separator shown
+            # between the two titles).
+            o_title = titles[1].replace("/", "")
+        else:
+            c_title = titles[0]
+            o_title = ' '
+
+        rating = findRating.findall(item)[0]
+        judge_num = findJudge.findall(item)[0]
+
+        quotes = findInq.findall(item)
+        inq = quotes[0].replace("。", "") if quotes else " "
+
+        bd = findBd.findall(item)[0]
+        # Raw string: "\s" in a normal string literal is an invalid escape.
+        bd = re.sub(r'<br(\s+)?/>(\s+)?', "", bd)  # drop <br/> tags and the whitespace after them
+        bd = bd.replace("/", "")
+
+        data_list.append(
+            [link, img_src, c_title, o_title, rating, judge_num, inq, bd.strip()]
+        )
+    return data_list
+
+
+def visualize(ranting):
+    """Show a pie chart of how the ratings are spread over four score bands.
+
+    Args:
+        ranting: Sequence of numeric ratings. The bands cover 7 to 10;
+            values outside that range fall in no band and are not counted.
+
+    Returns:
+        None. When no rating falls in any band a message is printed and no
+        chart is drawn.
+    """
+    # Band edges and the label for each band between consecutive edges.
+    bins = [7, 8.5, 9, 9.5, 10]
+    labels = ['7-8.5', '8.5-9', '9-9.5', '9.5-10']
+
+    # Number of ratings in each band.
+    counts, _ = np.histogram(ranting, bins=bins)
+
+    # With every count at zero there are no proportions to draw.
+    if not counts.any():
+        print("No ratings between 7 and 10 to plot.")
+        return
+
+    plt.figure(figsize=(8, 8))  # 8x8 inch figure
+    # autopct prints each share as a percentage with one decimal place;
+    # startangle rotates the start of the first wedge to 140 degrees.
+    plt.pie(counts, labels=labels, autopct='%1.1f%%', startangle=140)
+    plt.axis('equal')  # equal axis scaling keeps the pie circular
+    plt.title('Distribution of Movie Ratings')
+    plt.tight_layout()
+    plt.show()
+
+
+def saveData(datalist, savepath):
+    """Write the scraped rows into an Excel workbook saved at ``savepath``.
+
+    Args:
+        datalist: Iterable of rows; each row is an iterable of cell values.
+            A cell that is itself a list is spread over consecutive columns
+            starting at the cell's own column.
+        savepath: Path of the workbook file to write.
+    """
+    print("Saving data...")
+    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
+    worksheet = workbook.add_sheet('豆瓣电影Top250', cell_overwrite_ok=True)
+    # NOTE(review): 12 titles are written here, while parse_page in this
+    # file produces 8 fields per row - confirm the intended column layout.
+    headers = ["电影详情链接", "图片链接", "影片中文名", "影片外国名", "评分", "评价数", "概况", "导演", "主演", "年份", "国家", "类型"]
+
+    # Row 0 carries the column titles.
+    for column, title in enumerate(headers):
+        worksheet.write(0, column, title)
+
+    # Data rows start at row 1 so the title row is left intact.
+    for position, record in enumerate(datalist):
+        row_number = position + 1
+        for column, cell in enumerate(record):
+            values = cell if isinstance(cell, list) else [cell]
+            # NOTE(review): values of a list cell land in the columns that
+            # later cells of the same row also write to; with
+            # cell_overwrite_ok=True the later write wins.
+            for offset, value in enumerate(values):
+                worksheet.write(row_number, column + offset, value)
+
+    workbook.save(savepath)
+
+
+def init_db(dbpath):
+    """Create the ``movie250`` table in the SQLite database at ``dbpath``.
+
+    The statement uses ``if not exists``, so running it against a database
+    that already contains the table leaves that table untouched instead of
+    raising ``sqlite3.OperationalError``.
+
+    Args:
+        dbpath: Path of the SQLite database file; created if missing.
+    """
+    sql = '''
+        create table if not exists movie250(
+        id integer  primary  key autoincrement,
+        info_link text,
+        pic_link text,
+        cname varchar,
+        ename varchar ,
+        score numeric,
+        rated numeric,
+        instroduction text,
+        info text
+        )
+    '''
+    conn = sqlite3.connect(dbpath)
+    try:
+        conn.execute(sql)
+        conn.commit()
+    finally:
+        # Release the connection even when the statement fails.
+        conn.close()
+
+def saveData2DB(datalist, dbpath):
+    """Insert the scraped rows into the ``movie250`` table at ``dbpath``.
+
+    The table is created first via ``init_db``. Values are passed as bound
+    parameters, so quotes inside a title or description cannot break or
+    alter the statement, and the rows in ``datalist`` are left unmodified.
+    All rows are written in one transaction: either every row is stored or,
+    if an insert fails, none is.
+
+    Args:
+        datalist: Iterable of 8-item rows ordered as info_link, pic_link,
+            cname, ename, score, rated, instroduction, info.
+        dbpath: Path of the SQLite database file.
+
+    Raises:
+        sqlite3.Error: If a row cannot be inserted, e.g. it does not have
+            exactly eight values.
+    """
+    init_db(dbpath)
+    sql = '''
+        insert into movie250(
+        info_link,pic_link,cname,ename,score,rated,instroduction,info)
+        values (?,?,?,?,?,?,?,?)'''
+    conn = sqlite3.connect(dbpath)
+    try:
+        # "with conn" commits on success and rolls back on an exception.
+        # score and rated are declared numeric in init_db, so SQLite stores
+        # numeric-looking text in those columns as numbers.
+        with conn:
+            conn.executemany(sql, datalist)
+    finally:
+        conn.close()
+
+def search(db_path="movie.db", min_score=9.0):
+    """Print every ``movie250`` row whose score is greater than ``min_score``.
+
+    Args:
+        db_path: Path of the SQLite database file to query.
+        min_score: Exclusive lower bound on the ``score`` column.
+
+    Raises:
+        sqlite3.Error: If the query fails, e.g. the table does not exist.
+    """
+    query = "SELECT * FROM movie250 WHERE score > ?"
+    conn = sqlite3.connect(db_path)
+    try:
+        rows = conn.execute(query, (min_score,)).fetchall()
+    finally:
+        # Close the connection whether or not the query succeeded.
+        conn.close()
+    for row in rows:
+        print(row)
+
+
+async def main():
+    """Scrape the Top250 pages, store the rows as the user chooses, then chart the ratings.
+
+    The user is asked to pick Excel ('a') or SQLite ('b') storage; the
+    SQLite path also prints the result of ``search()``. The rating chart is
+    shown regardless of the choice.
+    """
+    baseurl = "https://movie.douban.com/top250?start="
+    urls = [baseurl + str(i) for i in range(0, 250, 25)]  # ten pages of 25 entries
+    pages = await main_async(urls)  # one list of rows per page, fully fetched and parsed
+    datas = [item for sublist in pages for item in sublist]  # flatten pages into one list of rows
+
+    # No extra waiting is needed before saving: the await above only
+    # returns once every page has been downloaded and parsed.
+    choice = input('请选择存储方式：a:excel表，b:数据库')
+    if choice == 'a':
+        savepath = "豆瓣电影Top250.xls"  # workbook created in the current directory
+        saveData(datas, savepath)
+    elif choice == 'b':
+        dbpath = "movie.db"  # database created in the current directory
+        saveData2DB(datas, dbpath)
+        search()
+    else:
+        print('请选择a或者b')
+
+    ratings = [float(movie[4]) for movie in datas]  # item 4 of each row is the rating text
+    visualize(ratings)
+
+    print("爬取完毕！")
+
+
+async def main_async(urls):
+    """Download every URL on a thread pool and parse each page.
+
+    Args:
+        urls: URLs to fetch with ``askURL``.
+
+    Returns:
+        A list with the ``parse_page`` result of each URL, in the same
+        order as ``urls``.
+    """
+    loop = asyncio.get_running_loop()
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        # The bar advances as each download finishes, so it reflects the
+        # real progress of the fetch.
+        with tqdm(total=len(urls), desc="正在爬取网页", unit="页") as progress:
+            # Run the blocking askURL calls on the pool's worker threads.
+            tasks = [loop.run_in_executor(executor, askURL, url) for url in urls]
+            for task in tasks:
+                task.add_done_callback(lambda _finished: progress.update(1))
+            # gather returns the results in the order the tasks were given.
+            htmls = await asyncio.gather(*tasks)
+
+    return [parse_page(html) for html in htmls]
+
+
+if __name__ == "__main__":
+    # Start the scraper only when the file is executed as a script;
+    # asyncio.run drives the main() coroutine to completion.
+    asyncio.run(main())
+
+