|
|
@ -1,97 +0,0 @@
|
|
|
|
'''
|
|
|
|
|
|
|
|
爬虫业务
|
|
|
|
|
|
|
|
'''
|
|
|
|
|
|
|
|
import json
import logging
import re

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
'''基于css选择器来实现目标的获取解析'''
|
|
|
|
|
|
|
|
class Scrawler():
    """Scraping helpers that locate page content with CSS selectors.

    The ``reqbs*`` methods download a page with ``requests`` and query it with
    BeautifulSoup (or decode a wrapped JSON body). They are best-effort: any
    failure is logged and reported to the caller as ``None``.
    ``seleniumGetText`` drives a Chrome browser instead and lets errors
    propagate.
    """

    _logger = logging.getLogger(__name__)

    def __init__(self, timeout=10):
        """Create a scraper.

        Args:
            timeout: Seconds to wait for each ``requests`` call before giving
                up, so that an unresponsive server cannot hang the caller.
        """
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.36'
        }
        self.timeout = timeout

    def _fetchSoup(self, url):
        """Download *url* and return it parsed by BeautifulSoup's 'html.parser'."""
        rs = requests.get(url, headers=self.headers, timeout=self.timeout)
        # Let requests guess the charset from the body instead of trusting the
        # response headers.
        rs.encoding = rs.apparent_encoding
        return BeautifulSoup(rs.text, 'html.parser')

    @staticmethod
    def _parseWrappedJSON(text, prefix_len, suffix_len):
        """Drop a fixed-size wrapper around a JSON document and decode it.

        Args:
            text: Raw response text.
            prefix_len: Number of leading characters to discard.
            suffix_len: Number of trailing characters to discard.

        Returns:
            The decoded document converted with ``dict()``.

        Raises:
            ValueError: If the remaining text is not valid JSON.
        """
        # Compute the end index explicitly: a plain ``[:-suffix_len]`` slice
        # would return an empty string when suffix_len is 0.
        end = max(len(text) - suffix_len, 0)
        return dict(json.loads(text[prefix_len:end]))

    def reqbsGetText(self, url=None, cssselector=None):
        """Return the stripped text of every element matching *cssselector*.

        Args:
            url: Page to download.
            cssselector: CSS selector passed to ``BeautifulSoup.select``.

        Returns:
            A list of strings, or ``None`` if the page could not be fetched
            or queried.
        """
        try:
            soup = self._fetchSoup(url)
            return [str(item.text).strip() for item in soup.select(selector=cssselector)]
        except Exception:
            self._logger.warning("reqbsGetText failed for %r", url, exc_info=True)
            return None

    def reqbsGetJSON(self, url=None, prefix_len=13, suffix_len=2):
        """Fetch *url* and decode a JSON body wrapped in extra text.

        Args:
            url: Endpoint to download.
            prefix_len: Characters to drop from the start of the body before
                decoding (default 13, the value this method always used).
            suffix_len: Characters to drop from the end of the body (default 2).

        Returns:
            The decoded data as a ``dict``, or ``None`` if the request or the
            decoding failed.
        """
        try:
            text = requests.get(url, headers=self.headers, timeout=self.timeout).text
            return self._parseWrappedJSON(str(text), prefix_len, suffix_len)
        except Exception:
            self._logger.warning("reqbsGetJSON failed for %r", url, exc_info=True)
            return None

    def reqbsGetImg(self, url=None, cssselector=None):
        """Collect image sources and their alt texts.

        Args:
            url: Page to download.
            cssselector: CSS selector for the image elements.

        Returns:
            A ``(urls, titles)`` tuple of equally long lists, or ``None`` if
            the page could not be fetched or queried. Elements without a
            ``src`` attribute are skipped; a missing ``alt`` becomes ``''``.
        """
        try:
            soup = self._fetchSoup(url)
            # Build both lists from the same filtered sequence so that
            # urls[i] and titles[i] always describe the same element.
            pairs = [
                (item.attrs['src'], item.attrs.get('alt', ''))
                for item in soup.select(selector=cssselector)
                if 'src' in item.attrs
            ]
            urls = [src for src, _ in pairs]
            titles = [alt for _, alt in pairs]
            self._logger.debug("reqbsGetImg found urls: %s", urls)
            return urls, titles
        except Exception:
            self._logger.warning("reqbsGetImg failed for %r", url, exc_info=True)
            return None

    def reqbsGetHref(self, url=None, cssselector=None):
        """Return the ``href`` of every element matching *cssselector*.

        Args:
            url: Page to download.
            cssselector: CSS selector for the link elements.

        Returns:
            A list of ``href`` values (elements without one are skipped), or
            ``None`` if the page could not be fetched or queried.
        """
        try:
            soup = self._fetchSoup(url)
            return [
                item.attrs['href']
                for item in soup.select(selector=cssselector)
                if 'href' in item.attrs
            ]
        except Exception:
            self._logger.warning("reqbsGetHref failed for %r", url, exc_info=True)
            return None

    def seleniumGetText(self, url, cssselector=None):
        """Open *url* in Chrome and return the stripped text of matching elements.

        Args:
            url: Page to open.
            cssselector: CSS selector for the elements to read.

        Returns:
            A list of strings, one per matching element.

        Errors raised by Selenium are not caught; the browser is closed in
        every case.
        """
        chrome = webdriver.Chrome()
        try:
            chrome.get(url)
            chrome.implicitly_wait(3)
            res = chrome.find_elements(by=By.CSS_SELECTOR, value=cssselector)
            # Read the text while the session is still open; elements become
            # unusable once the driver has quit.
            return [str(item.text).strip() for item in res]
        finally:
            chrome.quit()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 获取株洲本地的温度
|
|
|
|
|
|
|
|
def parseWeatherInfo(payload):
    """Extract the observation time and temperature from a weather payload.

    The payload is expected to contain one JSON object, optionally wrapped in
    JavaScript such as ``var dataSK={...}``.

    Args:
        payload: Response text of the ``sk_2d`` weather endpoint.

    Returns:
        A ``(time, temp)`` tuple of strings taken from the object's ``time``
        and ``temp`` fields.

    Raises:
        ValueError: If no JSON object can be decoded or a field is missing.
    """
    start = payload.find("{")
    end = payload.rfind("}")
    if start == -1 or end < start:
        raise ValueError("weather payload contains no JSON object: %r" % payload[:80])
    try:
        data = json.loads(payload[start:end + 1])
    except ValueError as err:  # json.JSONDecodeError is a ValueError subclass
        raise ValueError("weather payload is not valid JSON: %s" % err) from err
    try:
        return str(data["time"]), str(data["temp"])
    except KeyError as err:
        raise ValueError("weather payload is missing field %s" % err) from err


def getWeatherInfo(city_code="101250304"):
    """Fetch the current observation time and temperature for a city.

    Args:
        city_code: weather.com.cn station code placed in the request URL. The
            default is the code this function originally hard-coded.

    Returns:
        A ``(time, temp)`` tuple of strings.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
        ValueError: If the response cannot be parsed.
    """
    # NOTE(review): the "_" query value looks like a fixed cache-busting
    # timestamp; kept unchanged from the original request.
    url = "http://d1.weather.com.cn/sk_2d/%s.html?_=1716271809611" % city_code
    headers = {
        "Host": "d1.weather.com.cn",
        "Referer": "http://www.weather.com.cn/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36"
    }
    payload = requests.get(url, headers=headers, timeout=10).text
    return parseWeatherInfo(payload)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 获取期刊名
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Offline demo: parse a captured response instead of calling the service.
    # For a live request use: print(getWeatherInfo())
    #
    # Raw string so the JSON escape "\/" reaches the parser unchanged and is
    # not treated as an (invalid) Python escape sequence.
    sample = r'var dataSK={"nameen":"hetang","cityname":"è·å¡","city":"101250304","temp":"27.6","tempf":"81.7","WD":"ä¸åé£","wde":"NE","WS":"1级","wse":"4km\/h","SD":"69%","sd":"69%","qy":"1002","njd":"12km","time":"14:25","rain":"0","rain24h":"0","aqi":"49","aqi_pm25":"49","weather":"é´","weathere":"Overcast","weathercode":"d02","limitnumber":"","date":"05æ21æ¥(ææäº)"}'
    # The payload is a JavaScript assignment; only the {...} part is JSON.
    data = json.loads(sample[sample.index('{'):sample.rindex('}') + 1])
    print(data['time'], data['temp'])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|