"""Scraping business logic.

CSS-selector based helpers for fetching text, JSON(P), images and links,
via requests + BeautifulSoup, with a Selenium fallback for JS-rendered pages.
"""
import requests, re, json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By


class Scrawler():
    """Target acquisition and parsing driven by CSS selectors.

    Every `reqbs*` method is best-effort: on any fetch/parse failure it
    returns None (the original used bare `except: pass`; the handlers below
    keep that contract but only catch plausible runtime failures instead of
    swallowing programming errors too).
    """

    def __init__(self):
        # Browser-like UA so sites that reject the default python-requests
        # user agent still respond.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.36'
        }

    def reqbsGetText(self, url=None, cssselector=None):
        """Return the stripped text of every element matching *cssselector*,
        or None if the request/parse fails."""
        try:
            rs = requests.get(url, headers=self.headers)
            # Let requests guess the real charset (many target pages are GBK).
            rs.encoding = rs.apparent_encoding
            bs = BeautifulSoup(rs.text, 'html.parser')
            return [str(item.text).strip() for item in bs.select(selector=cssselector)]
        except (requests.RequestException, TypeError, ValueError):
            return None

    def reqbsGetJSON(self, url=None):
        """Fetch a JSONP-style payload (e.g. ``var dataSK={...};``) and
        return the embedded object as a dict, or None on failure."""
        try:
            rs = requests.get(url, headers=self.headers).text
            # BUG FIX: the original hard-coded slice str(rs)[13:][:-2] broke
            # whenever the variable-name prefix length differed; locate the
            # JSON object body instead.
            start, end = rs.find('{'), rs.rfind('}')
            if start == -1 or end == -1:
                return None
            return dict(json.loads(rs[start:end + 1]))
        except (requests.RequestException, ValueError, TypeError):
            return None

    def reqbsGetImg(self, url=None, cssselector=None):
        """Return (src_list, alt_list) for all elements matching
        *cssselector*, or None on failure."""
        try:
            rs = requests.get(url, headers=self.headers)
            rs.encoding = rs.apparent_encoding
            bs = BeautifulSoup(rs.text, 'html.parser')
            # Select once and reuse; the original parsed the page twice.
            items = bs.select(selector=cssselector)
            urls = [item.attrs['src'] for item in items]
            print(urls)
            # BUG FIX: original read `item.atrrs['alt']` (typo), raising an
            # AttributeError that the bare except silently hid.
            titles = [item.attrs['alt'] for item in items]
            return urls, titles
        except (requests.RequestException, KeyError, TypeError, ValueError):
            return None

    def reqbsGetHref(self, url=None, cssselector=None):
        """Return the href attribute of every element matching
        *cssselector*, or None on failure."""
        try:
            rs = requests.get(url, headers=self.headers)
            rs.encoding = rs.apparent_encoding
            bs = BeautifulSoup(rs.text, 'html.parser')
            return [item.attrs['href'] for item in bs.select(selector=cssselector)]
        except (requests.RequestException, KeyError, TypeError, ValueError):
            return None

    def seleniumGetText(self, url, cssselector=None):
        """Render *url* in headful Chrome and return the stripped text of
        elements matching *cssselector* (for JS-generated content)."""
        chrome = webdriver.Chrome()
        try:
            chrome.get(url)
            chrome.implicitly_wait(3)  # give dynamic content time to appear
            res = chrome.find_elements(by=By.CSS_SELECTOR, value=cssselector)
            return [str(item.text).strip() for item in res]
        finally:
            # BUG FIX: the original leaked a Chrome process per call.
            chrome.quit()


# Get the local temperature for Zhuzhou
def getWeatherInfo():
    """Fetch weather.com.cn's JSONP feed for station 101250304 (Hetang,
    Zhuzhou) and return (observation_time, temperature) as strings.

    Either value is None if the corresponding field is absent.
    """
    # BUG FIX: the original URL started with a space, which requests rejects.
    url = "http://d1.weather.com.cn/sk_2d/101250304.html?_=1716271809611"
    headers = {
        "Host": "d1.weather.com.cn",
        # Referer is required: the server refuses requests without it.
        "Referer": "http://www.weather.com.cn/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36",
    }
    # Strip quotes so fields look like `key:value` (renamed from `str`,
    # which shadowed the builtin).
    payload = requests.get(url, headers=headers).text.replace('"', '')
    timeinfo = temp = None  # avoid UnboundLocalError when a field is missing
    for item in payload.split(","):
        if 'time' in item:
            timeinfo = item[5:]  # drop the 'time:' prefix
        elif 'temp:' in item:  # 'temp:' (not 'temp') so 'tempf' doesn't match
            temp = item.split(":")[1]
    return timeinfo, temp


# Get the journal name
if __name__ == '__main__':
    # print(getWeatherInfo())
    # Offline sample of the JSONP payload, used to exercise the parsing
    # logic without touching the network (renamed from `str`).
    sample = 'var dataSK={"nameen":"hetang","cityname":"荷塘","city":"101250304","temp":"27.6","tempf":"81.7","WD":"东北风","wde":"NE","WS":"1级","wse":"4km\/h","SD":"69%","sd":"69%","qy":"1002","njd":"12km","time":"14:25","rain":"0","rain24h":"0","aqi":"49","aqi_pm25":"49","weather":"阴","weathere":"Overcast","weathercode":"d02","limitnumber":"","date":"05月21日(星期二)"}'
    cleaned = sample.replace('"', '')
    timeinfo = temp = None  # avoid UnboundLocalError when a field is missing
    for item in cleaned.split(","):
        if 'time' in item:
            timeinfo = item[5:]
        elif 'temp:' in item:
            temp = item.split(":")[1]
    print(timeinfo, temp)