You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
zj_1_git/src/predict.spider.py

198 lines
7.5 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
import re
import csv
from parsel import Selector
class NBASpider:
    """Scraper for basketball-reference.com.

    Crawls the per-game team/opponent tables, the monthly schedule pages and
    the "Advanced Team" table, and writes each result out as a CSV file.
    """

    def __init__(self):
        # Per-game team & opponent stats page (2021 season).
        self.url = "https://www.basketball-reference.com/leagues/NBA_2021.html"
        # Month name is substituted via str.format() in crawl_schedule().
        self.schedule_url = "https://www.basketball-reference.com/leagues/NBA_2016_games-{}.html"
        self.advanced_team_url = "https://www.basketball-reference.com/leagues/NBA_2016.html"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 "
                          "Safari/537.36"
        }

    def send(self, url):
        """Send a GET request and return the page body.

        :param url: the page URL to fetch
        :return: response body decoded as UTF-8 text
        """
        response = requests.get(url, headers=self.headers, timeout=30)
        response.encoding = 'utf-8'
        return response.text

    def parse(self, html):
        """Parse both the team table and the opponent table out of one page.

        :param html: crawled page source
        :return: (team_heads, team_datas, opponent_heads, opponent_datas)
        """
        team_heads, team_datas = self.get_team_info(html)
        opponent_heads, opponent_datas = self.get_opponent_info(html)
        return team_heads, team_datas, opponent_heads, opponent_datas

    def get_team_info(self, html):
        """Extract the team table's header and body rows with regexes.

        :param html: crawled page source
        :return: (team_heads, team_datas) — header labels and a generator
                 yielding one row (list of strings) per team
        """
        # 1. Locate the table that holds the data.
        team_table = re.search(r'<table.*?id="per_game-team".*?>(.*?)</table>', html, re.S).group(1)
        # 2. Pull the column headers out of <thead>.
        team_head = re.search(r'<thead>(.*?)</thead>', team_table, re.S).group(1)
        team_heads = re.findall(r'<th.*?>(.*?)</th>', team_head, re.S)
        # 3. Extract each data row from the table body (lazily).
        team_datas = self.get_datas(team_table)
        return team_heads, team_datas

    def get_opponent_info(self, html):
        """Extract the opponent table's header and body rows with regexes.

        :param html: crawled page source
        :return: (opponent_heads, opponent_datas) — header labels and a
                 generator yielding one row per opponent
        """
        # 1. Locate the table that holds the data.
        opponent_table = re.search(r'<table.*?id="per_game-opponent".*?>(.*?)</table>', html, re.S).group(1)
        # 2. Pull the column headers out of <thead>.
        opponent_head = re.search(r'<thead>(.*?)</thead>', opponent_table, re.S).group(1)
        opponent_heads = re.findall(r'<th.*?>(.*?)</th>', opponent_head, re.S)
        # 3. Extract each data row from the table body (lazily).
        opponent_datas = self.get_datas(opponent_table)
        return opponent_heads, opponent_datas

    def get_datas(self, table_html):
        """Yield one cleaned row per <tr> in the table body, tags stripped.

        :param table_html: inner HTML of a stats table
        :return: generator of rows; each row is [rank, team, stat, ...]
        """
        tbody = re.search(r'<tbody>(.*?)</tbody>', table_html, re.S).group(1)
        for row_html in re.findall(r'<tr.*?>(.*?)</tr>', tbody, re.S):
            rank_cells = re.findall(r'<th.*?>(.*?)</th>', row_html)
            datas = re.findall(r'<td.*?>(.*?)</td>', row_html, re.S)
            # The first <td> is the team name wrapped in a link; keep only the text.
            datas[0] = re.search(r'<a.*?>(.*?)</a>', datas[0]).group(1)
            datas.insert(0, rank_cells[0])
            # Generator: rows are produced lazily, one at a time.
            yield datas

    def get_schedule_datas(self, table_html):
        """Yield one cleaned schedule row per <tr> in the table body.

        :param table_html: inner HTML of the schedule table
        :return: generator of rows; link-wrapped cells (date, teams) are
                 reduced to their text
        """
        tbody = re.search(r'<tbody>(.*?)</tbody>', table_html, re.S).group(1)
        for row_html in re.findall(r'<tr.*?>(.*?)</tr>', tbody, re.S):
            date_cells = re.findall(r'<th.*?><a.*?>(.*?)</a></th>', row_html)
            datas = re.findall(r'<td.*?>(.*?)</td>', row_html, re.S)
            # Skip spacer/header rows that carry no <td> cells.
            if datas:
                # Columns 1, 3 and 5 are wrapped in <a> links; keep only the text.
                datas[1] = re.search(r'<a.*?>(.*?)</a>', datas[1]).group(1)
                datas[3] = re.search(r'<a.*?>(.*?)</a>', datas[3]).group(1)
                datas[5] = re.search(r'<a.*?>(.*?)</a>', datas[5]).group(1)
                datas.insert(0, date_cells[0])
                # Generator: rows are produced lazily, one at a time.
                yield datas

    def get_advanced_team_datas(self, table):
        """Yield one row per <tr> of the advanced-team table via XPath.

        :param table: parsel SelectorList positioned on the table element
        :return: generator of rows; DUMMY spacer columns are excluded
        """
        for tr in table.xpath('./tbody/tr'):
            rank = tr.xpath('./th/text()').get()
            datas = tr.xpath('./td[@data-stat!="DUMMY"]/text()').getall()
            # First data column is the linked team name; take the link text.
            datas[0] = tr.xpath('./td/a/text()').get()
            datas.insert(0, rank)
            yield datas

    def parse_schedule_info(self, html):
        """Extract the schedule table's header and body rows with regexes.

        :param html: crawled page source
        :return: (heads, datas) — header labels and a generator of rows
        """
        # 1. Locate the table that holds the data.
        table = re.search(r'<table.*?id="schedule" data-cols-to-freeze=",1">(.*?)</table>', html, re.S).group(1)
        # The page's </tbody> falls outside the match; restore it so the
        # tbody regex in get_schedule_datas can terminate.
        table = table + "</tbody>"
        # 2. Pull the column headers out of <thead>.
        head = re.search(r'<thead>(.*?)</thead>', table, re.S).group(1)
        heads = re.findall(r'<th.*?>(.*?)</th>', head, re.S)
        # 3. Extract each data row from the table body (lazily).
        datas = self.get_schedule_datas(table)
        return heads, datas

    def parse_advanced_team(self, html):
        """Extract the advanced-team table's header and body rows via XPath.

        :param html: crawled page source
        :return: (heads, table_data) — header labels and a generator of rows
        """
        selector = Selector(text=html)
        # 1. Locate the target table.
        table = selector.xpath('//table[@id="advanced-team"]')
        # 2. Read the second header row; drop non-breaking-space spacer cells.
        res = table.xpath('./thead/tr')[1].xpath('./th/text()').getall()
        heads = [head for head in res if '\xa0' not in head]
        # 3. Extract each data row (lazily).
        table_data = self.get_advanced_team_datas(table)
        return heads, table_data

    def save_csv(self, title, heads, rows):
        """Write the header and data rows to '<title>.csv' (UTF-8).

        :param title: output file name without the .csv extension
        :param heads: iterable of column labels
        :param rows: iterable of data rows
        """
        # 'with' guarantees the handle is closed even if writing raises.
        with open(title + '.csv', mode='w', encoding='utf-8', newline='') as f:
            csv_writer = csv.writer(f)
            csv_writer.writerow(heads)
            csv_writer.writerows(rows)

    def crawl_team_opponent(self):
        """Crawl the team and opponent per-game tables and save them as CSV."""
        # 1. Fetch the page.
        res = self.send(self.url)
        # 2. Parse both tables.
        team_heads, team_datas, opponent_heads, opponent_datas = self.parse(res)
        # 3. Save each table as a CSV file.
        self.save_csv("team", team_heads, team_datas)
        self.save_csv("opponent", opponent_heads, opponent_datas)

    def crawl_schedule(self):
        """Crawl each month's schedule page and save one CSV per month."""
        months = ["october", "november", "december", "january", "february", "march", "april", "may", "june"]
        for month in months:
            html = self.send(self.schedule_url.format(month))
            heads, datas = self.parse_schedule_info(html)
            self.save_csv("schedule_" + month, heads, datas)

    def crawl_advanced_team(self):
        """Crawl the Advanced Team table and save it as CSV."""
        # 1. Fetch the page.
        res = self.send(self.advanced_team_url)
        # 2. Parse the table.
        heads, datas = self.parse_advanced_team(res)
        # 3. Save it as a CSV file.
        self.save_csv("advanced_team", heads, datas)

    def crawl(self):
        """Run the crawl pipeline (uncomment steps to enable them).

        1. team/opponent tables:  self.crawl_team_opponent()
        2. schedule tables:       self.crawl_schedule()
        3. Advanced Team table:   crawled below
        """
        self.crawl_advanced_team()
if __name__ == '__main__':
    # Entry point: build the spider and kick off the crawl pipeline.
    nba_spider = NBASpider()
    nba_spider.crawl()