import requests
from lxml import etree
import re
import csv
from bs4 import BeautifulSoup

url="https://www.sina.com.cn/"
h={'User-Agent':
   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0'}

# Fetch the page source and status code
r=requests.get(url,headers=h)
c=r.status_code
r.encoding="utf-8"
ym=r.text
# print(r.text,c)

# Extract the top headlines' titles and links with regular expressions
tt=[]
lj=r'<a target="_blank" class="linkNewsTopBold" href=[\'"](.*?)[\'"]>'
bt=r'<a target="_blank" class="linkNewsTopBold" href=".*?">(.*?)</a>'
resultlj=re.findall(lj,ym)
resultbt=re.findall(bt,ym)
# print(resultlj)
# print(resultbt)
for i in range(len(resultlj)):
    tt.append({
        '链接': resultlj[i],
        '标题': resultbt[i]
    })
# print(*tt,sep="\n")
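
# The same headlines can also be pulled with BeautifulSoup (imported above) instead
# of hand-written regexes; a minimal sketch matching on the same CSS class.
# bs_tt is a new name used here for illustration, not part of the original script.
soup = BeautifulSoup(ym, 'html.parser')
bs_tt = [{'标题': a.get_text(strip=True), '链接': a.get('href')}
         for a in soup.find_all('a', class_='linkNewsTopBold')]
# print(*bs_tt, sep="\n")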

# Extract the same news titles and links with XPath
xw=[]
dom=etree.HTML(ym)
lj1=dom.xpath('//div[@class="top_newslist"]/ul/li/a/@href')
bt1=dom.xpath('//div[@class="top_newslist"]/ul/li/a/text()')
# print(lj1)
# print(bt1)
for i in range(len(lj1)):
    xw.append({
        '标题': bt1[i],
        '链接': lj1[i]
    })
# print(*xw,sep="\n")

# csv.DictWriter is a helper class in the csv module for writing dicts to a CSV file.
# newline="" ensures line endings are handled correctly on different operating systems.
ttname="新闻.csv"
# utf-8-sig keeps the Chinese headers readable when the file is opened in Excel
with open(ttname, "w", newline="", encoding="utf-8-sig") as file:
    writer = csv.DictWriter(file, fieldnames=["标题","链接"])
    writer.writeheader()  # write the header row
    for item in xw:
        writer.writerow(item)
print(f"Saved the news list to {ttname}.")

# Collect the image links from the car-finder section into a list
tp=dom.xpath('//div[@class="carbrand-logo clearfix"]/a/img')
tp_urls = []
for img in tp:
    tp_url = img.get('src')
    if tp_url:
        tp_urls.append(tp_url)
# for tp_url in tp_urls:
#     print(tp_url)

# # Download and save a single sample image
# urls1='https://k.sinaimg.cn/auto4/autoimg/brand/07/07/64a7d61acc5fc8040707_95.jpg/w49h49l50t50q80a38.jpg'
# resp=requests.get(urls1,headers=h)
# # print(resp.content)
# with open('img.jpg', 'wb') as file:
#     # write the response body to the file
#     file.write(resp.content)

# Fetch the data for every image link in the list
for tp_url in tp_urls:
    # the src values are protocol-relative ("//..."), so prepend a scheme first
    tp_url1='http:'+tp_url
    # use resp rather than re, so the regular-expression module is not shadowed
    resp=requests.get(tp_url1,headers=h)
    a=resp.content
    # print(a)
    # The commented loop below would create the image file: 'w' opens for writing, 'b' for binary mode
    # with open('png' + tp_url.split('/')[-1], 'wb') as f:
    #     f.write(a)
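
# A minimal sketch of actually saving each logo, adapted from the commented-out
# lines above. The 'logos' directory and the numbered filenames are assumptions
# for illustration; judging by the sample URL earlier, the last path segment is
# a shared resize suffix, so it would not make a unique filename.
import os
os.makedirs('logos', exist_ok=True)
for i, tp_url in enumerate(tp_urls):
    resp = requests.get('http:' + tp_url, headers=h)
    with open(os.path.join('logos', f'{i}.jpg'), 'wb') as f:
        f.write(resp.content)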
jfurl = "http://api.sports.sina.com.cn/?p=sports&s=sport_client&a=index&_sport_t_=football&_sport_s_=opta&_sport_a_=teamOrder&type=4"
response = requests.get(jfurl,headers=h)
data = response.json()
# Parse the payload
result = data.get("result", {}).get("data", [])
# for team in result:
#     print(f"排名:{team['team_order']},球队:{team['team_cn']},积分:{team['score']}")
name = "英超积分榜.csv"
with open(name, "w", newline="", encoding="utf-8-sig") as file:
    writer = csv.writer(file)
    writer.writerow(["排名", "球队", "积分"])  # write the header row
    for team in result:
        writer.writerow([team["team_order"], team["team_cn"], team["score"]])
# print(f"Saved the standings to {name}.")