You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

98 lines
3.4 KiB

This file contains invisible Unicode characters!

This file contains invisible Unicode characters that may be processed differently from what appears below. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to reveal hidden characters.

'''
爬虫业务
'''
import requests,re,json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
'''基于css选择器来实现目标的获取解析'''
class Scrawler:
    """CSS-selector based scraping helpers built on requests / BeautifulSoup / Selenium.

    All ``reqbs*`` methods are best-effort: on any request or parse failure
    they swallow the error and return ``None`` (original behavior kept).
    """

    def __init__(self):
        # Browser-like User-Agent so target sites are less likely to reject us.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.36'
        }

    def reqbsGetText(self, url=None, cssselector=None):
        """Fetch *url* and return stripped text of every node matching *cssselector*.

        Returns a list of strings, or None on any failure.
        """
        try:
            rs = requests.get(url, headers=self.headers)
            rs.encoding = rs.apparent_encoding  # let requests guess charset from body
            bs = BeautifulSoup(rs.text, 'html.parser')
            return [str(item.text).strip() for item in bs.select(selector=cssselector)]
        except Exception:  # narrowed from bare except; still best-effort
            pass

    def reqbsGetJSON(self, url=None):
        """Fetch a JSONP-style payload and return it as a dict.

        Strips the first 13 characters and the last 2 characters of the body
        (the ``var xxx={...};`` wrapper) before json.loads. Returns None on failure.
        """
        try:
            rs = requests.get(url, headers=self.headers).text
            rs = str(rs)[13:][:-2]  # drop JSONP prefix/suffix around the JSON object
            return dict(json.loads(rs))
        except Exception:
            pass

    def reqbsGetImg(self, url=None, cssselector=None):
        """Return ``(src_urls, alt_titles)`` for every element matching *cssselector*.

        Returns None on failure.
        """
        try:
            rs = requests.get(url, headers=self.headers)
            rs.encoding = rs.apparent_encoding
            bs = BeautifulSoup(rs.text, 'html.parser')
            matches = bs.select(selector=cssselector)  # select once, reuse for both lists
            urls = [item.attrs['src'] for item in matches]
            print(urls)
            # FIX: original read ``item.atrrs['alt']`` (typo) which raised
            # AttributeError and was silently swallowed, so this method
            # always returned None.
            titles = [item.attrs['alt'] for item in matches]
            return urls, titles
        except Exception:
            pass

    def reqbsGetHref(self, url=None, cssselector=None):
        """Return the ``href`` attribute of every element matching *cssselector*.

        Returns a list of URLs, or None on failure.
        """
        try:
            rs = requests.get(url, headers=self.headers)
            rs.encoding = rs.apparent_encoding
            bs = BeautifulSoup(rs.text, 'html.parser')
            return [item.attrs['href'] for item in bs.select(selector=cssselector)]
        except Exception:
            pass

    def seleniumGetText(self, url, cssselector=None):
        """Render *url* in a real Chrome browser and return stripped text of matches.

        Useful for JS-rendered pages that plain requests cannot see.
        """
        chrome = webdriver.Chrome()
        try:
            chrome.get(url)
            chrome.implicitly_wait(3)  # wait up to 3s for elements to appear
            res = chrome.find_elements(by=By.CSS_SELECTOR, value=cssselector)
            return [str(item.text).strip() for item in res]
        finally:
            chrome.quit()  # FIX: original leaked the browser process
# 获取株洲本地的温度
def getWeatherInfo():
    """Fetch the current weather for Hetang, Zhuzhou (station 101250304).

    Hits weather.com.cn's live-conditions endpoint, whose body looks like
    ``var dataSK={"time":"14:25","temp":"27.6",...}``, and extracts two fields
    by plain string splitting.

    Returns:
        (timeinfo, temp): observation time ("HH:MM") and temperature string,
        or None for either field if it is absent from the payload.
    """
    # FIX: original URL literal had a leading space.
    url = "http://d1.weather.com.cn/sk_2d/101250304.html?_=1716271809611"
    # Host/Referer headers are sent to look like an on-site browser request.
    headers = {
        "Host": "d1.weather.com.cn",
        "Referer": "http://www.weather.com.cn/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36"
    }
    # FIX: renamed local from ``str`` — it shadowed the builtin.
    text = requests.get(url, headers=headers).text.replace('"', '')
    # FIX: pre-initialize so a payload missing these fields cannot raise
    # UnboundLocalError at the return statement.
    timeinfo = temp = None
    for item in text.split(","):
        if 'time' in item:
            timeinfo = item[5:]  # strip the leading 'time:' prefix
        elif 'temp:' in item:  # 'temp:' (with colon) avoids matching 'tempf'
            temp = item.split(":")[1]
    return timeinfo, temp
# 获取期刊名
if __name__ == '__main__':
    # print(getWeatherInfo())
    # Sample dataSK payload captured from weather.com.cn, used to exercise the
    # field-extraction logic offline (same parsing as getWeatherInfo).
    # FIX: renamed from ``str`` — it shadowed the builtin.
    raw = 'var dataSK={"nameen":"hetang","cityname":"荷塘","city":"101250304","temp":"27.6","tempf":"81.7","WD":"东北风","wde":"NE","WS":"1级","wse":"4km\/h","SD":"69%","sd":"69%","qy":"1002","njd":"12km","time":"14:25","rain":"0","rain24h":"0","aqi":"49","aqi_pm25":"49","weather":"阴","weathere":"Overcast","weathercode":"d02","limitnumber":"","date":"05月21日(星期二)"}'
    cleaned = raw.replace('"', '')
    # FIX: pre-initialize so a payload missing the fields cannot raise
    # UnboundLocalError at the print below.
    timeinfo = temp = None
    for item in cleaned.split(","):
        if 'time' in item:
            timeinfo = item[5:]  # strip the leading 'time:' prefix
        elif 'temp:' in item:  # colon included to avoid matching 'tempf'
            temp = item.split(":")[1]
    print(timeinfo, temp)