import csv
import re

import requests
from parsel import Selector


class NBASpider:
    """Scrape NBA statistics tables from basketball-reference.com.

    Supports three crawls, each saved to a CSV file in the working directory:
      * per-game team and opponent stats (``crawl_team_opponent``)
      * month-by-month game schedules (``crawl_schedule``)
      * the Advanced Team stats table (``crawl_advanced_team``)

    NOTE(review): the regex patterns in this class were reconstructed after the
    original HTML-tag literals were lost; verify each selector against the live
    page markup before relying on the output.
    """

    def __init__(self):
        # Per-game team/opponent stats page (2021 season).
        self.url = "https://www.basketball-reference.com/leagues/NBA_2021.html"
        # Monthly schedule pages; ``{}`` is filled with a lowercase month name.
        # NOTE(review): these two URLs target the 2016 season while ``self.url``
        # targets 2021 — confirm the mismatch is intentional.
        self.schedule_url = "https://www.basketball-reference.com/leagues/NBA_2016_games-{}.html"
        self.advanced_team_url = "https://www.basketball-reference.com/leagues/NBA_2016.html"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 "
                          "Safari/537.36"
        }

    # Send the request and fetch the page.
    def send(self, url):
        """GET *url* with the spider's headers and return the body as text.

        :param url: absolute URL to fetch
        :return: decoded response body (forced to UTF-8)
        """
        response = requests.get(url, headers=self.headers, timeout=30)
        response.encoding = 'utf-8'
        return response.text

    # Parse the league summary page.
    def parse(self, html):
        """Extract both the team and the opponent per-game tables.

        :param html: raw HTML of the league summary page
        :return: (team_heads, team_datas, opponent_heads, opponent_datas)
        """
        team_heads, team_datas = self.get_team_info(html)
        opponent_heads, opponent_datas = self.get_opponent_info(html)
        return team_heads, team_datas, opponent_heads, opponent_datas

    def get_team_info(self, html):
        """Regex-extract the header row and data rows of the team table.

        :param html: crawled page HTML
        :return: (team_heads, team_datas) — list of column names and a
                 generator of row-value lists
        """
        # 1. Locate the table holding the data (id reconstructed — confirm).
        team_table = re.search(
            r'<table[^>]*id="per_game-team"[^>]*>(.*?)</table>', html, re.S).group(1)
        # 2. Pull the column names out of the table header.
        team_head = re.search(r'<thead>(.*?)</thead>', team_table, re.S).group(1)
        team_heads = re.findall(r'<th[^>]*>(.*?)</th>', team_head, re.S)
        # 3. Pull the individual data rows out of the table body.
        team_datas = self.get_datas(team_table)
        return team_heads, team_datas

    # Parse the opponent table.
    def get_opponent_info(self, html):
        """Regex-extract the header row and data rows of the opponent table.

        :param html: crawled page HTML
        :return: (opponent_heads, opponent_datas)
        """
        # 1. Locate the table holding the data (id reconstructed — confirm).
        opponent_table = re.search(
            r'<table[^>]*id="per_game-opponent"[^>]*>(.*?)</table>', html, re.S).group(1)
        # 2. Pull the column names out of the table header.
        opponent_head = re.search(r'<thead>(.*?)</thead>', opponent_table, re.S).group(1)
        opponent_heads = re.findall(r'<th[^>]*>(.*?)</th>', opponent_head, re.S)
        # 3. Pull the individual data rows out of the table body.
        opponent_datas = self.get_datas(opponent_table)
        return opponent_heads, opponent_datas

    # Extract the table-body rows.
    def get_datas(self, table_html):
        """Yield cleaned row data (tags stripped) from a stats table.

        :param table_html: inner HTML of one stats ``<table>``
        :return: generator of lists — [rank, team, stat, stat, ...]
        """
        tbody = re.search(r'<tbody>(.*?)</tbody>', table_html, re.S).group(1)
        rows = re.findall(r'<tr[^>]*>(.*?)</tr>', tbody, re.S)
        for row in rows:
            # The row header cell carries the rank.
            rk = re.findall(r'<th[^>]*>(.*?)</th>', row)
            datas = re.findall(r'<td[^>]*>(.*?)</td>', row, re.S)
            # The first data cell is the team name wrapped in a link.
            datas[0] = re.search(r'<a[^>]*>(.*?)</a>', datas[0]).group(1)
            datas.insert(0, rk[0])
            # ``yield`` makes this method a generator returning one row at a time.
            yield datas

    def get_schedule_datas(self, table_html):
        """Yield cleaned row data from a monthly schedule table.

        :param table_html: inner HTML of the schedule ``<table>``
        :return: generator of lists — [date, time, visitor, pts, home, pts, ...]
        """
        tbody = re.search(r'<tbody>(.*?)</tbody>', table_html, re.S).group(1)
        rows = re.findall(r'<tr[^>]*>(.*?)</tr>', tbody, re.S)
        for row in rows:
            rk = re.findall(r'<th[^>]*>(.*?)</th>', row)
            datas = re.findall(r'<td[^>]*>(.*?)</td>', row, re.S)
            if datas and len(datas) > 0:
                # Cells 1/3/5 (time, visitor, home — per original index math)
                # contain anchor tags; strip them down to the link text.
                datas[1] = re.search(r'<a[^>]*>(.*?)</a>', datas[1]).group(1)
                datas[3] = re.search(r'<a[^>]*>(.*?)</a>', datas[3]).group(1)
                datas[5] = re.search(r'<a[^>]*>(.*?)</a>', datas[5]).group(1)
                datas.insert(0, rk[0])
                # ``yield`` makes this method a generator returning one row at a time.
                yield datas

    def get_advanced_team_datas(self, table):
        """Yield row data from the advanced-team table via XPath.

        :param table: parsel SelectorList positioned on the ``<table>``
        :return: generator of lists — [rank, team, stat, stat, ...]
        """
        trs = table.xpath('./tbody/tr')
        for tr in trs:
            rk = tr.xpath('./th/text()').get()
            # Skip spacer columns the page marks with data-stat="DUMMY".
            datas = tr.xpath('./td[@data-stat!="DUMMY"]/text()').getall()
            # The first cell's text lives inside an <a>, so fetch it separately.
            datas[0] = tr.xpath('./td/a/text()').get()
            datas.insert(0, rk)
            yield datas

    def parse_schedule_info(self, html):
        """Regex-extract the header and data rows of a schedule page.

        :param html: crawled page HTML
        :return: (heads, datas) — column names and a generator of rows
        """
        # 1. Locate the schedule table (id reconstructed — confirm).
        table = re.search(
            r'<table[^>]*id="schedule"[^>]*>(.*?)</table>', html, re.S).group(1)
        # 2. Pull the column names out of the table header.
        head = re.search(r'<thead>(.*?)</thead>', table, re.S).group(1)
        heads = re.findall(r'<th[^>]*>(.*?)</th>', head, re.S)
        # 3. Pull the individual data rows out of the table body.
        datas = self.get_schedule_datas(table)
        return heads, datas

    def parse_advanced_team(self, html):
        """XPath-extract the header and data rows of the advanced-team table.

        :param html: crawled page HTML
        :return: (heads, table_data) — column names and a generator of rows
        """
        selector = Selector(text=html)
        # 1. Locate the table by its id.
        table = selector.xpath('//table[@id="advanced-team"]')
        # 2. Read the second header row; drop non-breaking-space spacer cells.
        res = table.xpath('./thead/tr')[1].xpath('./th/text()').getall()
        heads = [head for head in res if '\xa0' not in head]
        # 3. Extract the data rows.
        table_data = self.get_advanced_team_datas(table)
        return heads, table_data

    # Persist rows to a CSV file.
    def save_csv(self, title, heads, rows):
        """Write *heads* then *rows* to ``<title>.csv`` (UTF-8).

        :param title: output filename without extension
        :param heads: iterable of column names
        :param rows: iterable of row-value lists
        """
        # ``with`` guarantees the file is closed even if a row write fails.
        with open(title + '.csv', mode='w', encoding='utf-8', newline='') as f:
            csv_writer = csv.writer(f)
            csv_writer.writerow(heads)
            csv_writer.writerows(rows)

    def crawl_team_opponent(self):
        """Fetch the summary page and save team/opponent stats as CSV."""
        # 1. Send the request.
        res = self.send(self.url)
        # 2. Parse the data.
        team_heads, team_datas, opponent_heads, opponent_datas = self.parse(res)
        # 3. Save the data as CSV.
        self.save_csv("team", team_heads, team_datas)
        self.save_csv("opponent", opponent_heads, opponent_datas)

    def crawl_schedule(self):
        """Fetch each month's schedule page and save it as CSV."""
        months = ["october", "november", "december", "january", "february",
                  "march", "april", "may", "june"]
        for month in months:
            html = self.send(self.schedule_url.format(month))
            heads, datas = self.parse_schedule_info(html)
            # Save the data as CSV, one file per month.
            self.save_csv("schedule_" + month, heads, datas)

    def crawl_advanced_team(self):
        """Fetch the advanced-team page and save its table as CSV."""
        # 1. Send the request.
        res = self.send(self.advanced_team_url)
        # 2. Parse the data.
        heads, datas = self.parse_advanced_team(res)
        # 3. Save the data as CSV.
        self.save_csv("advanced_team", heads, datas)

    def crawl(self):
        """Entry point; the other crawls are intentionally disabled here."""
        # 1. Crawl per-team info.
        # self.crawl_team_opponent()
        # 2. Crawl the schedule tables.
        # self.crawl_schedule()
        # 3. Crawl the Advanced Team table.
        self.crawl_advanced_team()


if __name__ == '__main__':
    # Run the spider.
    spider = NBASpider()
    spider.crawl()