Lenovo 8 months ago
parent 6fe3154c2f
commit 37349ea7ae

@@ -0,0 +1,106 @@
'''
Handles the data-warehousing (insert) business logic
'''
from Model import DataDB as db
from Controller import DataFetch
import logging
from datetime import datetime
scrawler = DataFetch.Scrawler()
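# A single shared Scrawler instance is used by all the insert helpers below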
'''Warehousing operations'''
'''1. News insert: insertData(tableName, source, ctime, title, url)'''
def newsAdd(id):
    if id == '0':
        '''HUT (工大) news URL'''
        source = "工大新闻"
        url = 'http://news.hut.edu.cn/'
        count = 0
        for i in range(0, 5):
            ctime = scrawler.reqbsGetText(url, cssselector='.news-list .date')[i]
            title = scrawler.reqbsGetText(url, cssselector='.news-list .info h2')[i]
            urls = url + scrawler.reqbsGetHref(url, cssselector='.news-list .info a')[i]
            if db.insertData('news', source, ctime, title, urls):
                count = count + 1
            else:
                count = 0
        # All five items must insert successfully for the run to count as a success
        if count == 5:
            return True
        else:
            return False
    elif id == '1':
        '''PKU (北大) news'''
        source = "北大新闻"
        url = 'https://news.pku.edu.cn/ttxw/index.htm'
        count = 0
        for i in range(0, 5):
            ctime = scrawler.reqbsGetText(url, cssselector='.articleList01 .item-date')[i]
            title = scrawler.reqbsGetText(url, cssselector='.articleList01 .item-txt h3')[i]
            urls = url + scrawler.reqbsGetHref(url, cssselector='.articleList01 .item-txt h3 a')[i]
            if db.insertData('news', source, ctime, title, urls):
                count = count + 1
            else:
                count = 0
        if count == 5:
            return True
        else:
            return False
'''2. Epidemic (疫情) data insert'''
def yqAdd(source):
    if source == '1':
        # Source: Sina epidemic data
        url = 'http://zj.sina.com.cn/zt_d/zjyiqing/?hk=/*'
        res = scrawler.reqbsGetJSON(url=url)
        ctime = res['data']['mtime']
        cnumber = res['data']['gntotal']
        gnumber = res['data']['deathtotal']
        if db.insertData('yq', source, ctime, cnumber, gnumber):
            return True
        else:
            return False
    elif source == '0':
        # Source: Tencent epidemic data
        url = 'https://www.qq.com/'
        res = scrawler.reqbsGetJSON(url=url)
        ctime = res['data']['mtime']
        cnumber = res['data']['gntotal']
        gnumber = res['data']['deathtotal']
        if db.insertData('yq', source, ctime, cnumber, gnumber):
            return True
        else:
            return False
'''
Log-handling helpers
'''
class MyLogHandler(logging.Handler, object):
    """ Custom logging handler """
    def __init__(self):
        logging.Handler.__init__(self)
    def emit(self, record):
        """ `record` is a message object carrying attributes such as name, asctime,
        lineno and funcName. emit() is the method a custom handler class must override;
        the message can be processed here as needed, e.g. sent to a server.
        (Emit a record) """
        try:
            db.log2db(str(datetime.now()), record.lineno,
                      record.funcName, record.getMessage())
        except Exception:
            self.handleError(record)
def mylog():
    # Create a logger
    log = logging.getLogger("test_logger")
    log.setLevel(logging.DEBUG)
    # Create a log handler
    logHandler = MyLogHandler()
    logHandler.setLevel(logging.INFO)
    # Create a log formatter
    formats = logging.Formatter('%(asctime)s - %(name)s - %(lineno)d: %(message)s')
    # Attach the formatter to the handler
    logHandler.setFormatter(formats)
    # Attach the handler to the logger
    log.addHandler(logHandler)
    return log
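# A minimal usage sketch (assumes Model.DataDB.log2db and its table exist):
# every log.info(...) call is routed through MyLogHandler.emit() and ends up
# as a row in the database.
#   log = mylog()
#   log.info('news insert finished')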
if __name__ == '__main__':
    print(newsAdd('0'))  # '0' -> 工大新闻 (HUT news)
    print(yqAdd('1'))    # '1' -> Sina (新浪) source
    # print(db.getAllData('yq'))

@@ -0,0 +1,100 @@
'''
Elasticsearch (es) operations
'''
from elasticsearch import Elasticsearch
import requests
from bs4 import BeautifulSoup
import datetime
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36',
}
def get_target(url,selector):
try:
response = requests.get(url=url, headers=headers)
response.encoding = 'UTF-8'
bs = BeautifulSoup(response.text,'html.parser')
return [item.text for item in bs.select(selector=selector)]
except Exception as e:
pass
class esController():
    def __init__(self, index_name, index_type):
        '''
        Create the connection object and fix which index and which type it targets.
        :param index_name: roughly equivalent to a database
        :param index_type: roughly equivalent to a table
        '''
        self.es = Elasticsearch(hosts="localhost:9200")  # create the connection object
        self.index_name = index_name
        self.index_type = index_type
    def create_index(self):
        '''
        Create the index; if it already exists, delete it first, then recreate it.
        :return:
        '''
        if self.es.indices.exists(index=self.index_name):
            self.es.indices.delete(index=self.index_name)
        self.es.indices.create(index=self.index_name)
    def delete_index(self):
        '''
        Delete the index.
        :return:
        '''
        try:
            self.es.indices.delete(index=self.index_name)
        except:
            pass
    def search_index(self, keyword, fields, count):
        '''
        Query the contents of the index.
        :param keyword: the query text
        :param fields: the field name(s) to search in
        :param count: how many hits to return
        :return:
        '''
        body = {
            "query": {
                "multi_match": {
                    "query": keyword,  # the query text; note: it will be analyzed (tokenized)
                    "fields": fields   # restrict the query to these fields
                }
            }
        }
        return self.es.search(index=self.index_name, body=body, size=count)
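    # Note: the __main__ block below passes a single field name ('title');
    # multi_match also accepts a list of fields, e.g. ["title", "content"]
    # ("content" is a hypothetical field name here).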
    def get_doc(self, id):
        '''
        Fetch a single document from the index.
        :param id:
        :return:
        '''
        return self.es.get(index=self.index_name, id=id)
    def insert_one(self, doc: dict):
        '''
        Insert one document into the index.
        :param doc: the document as a dict of key-value pairs
        :return:
        '''
        self.es.index(index=self.index_name, doc_type=self.index_type, body=doc)
    def insert_array(self, docs: list):
        '''
        Insert multiple documents into the index.
        :param docs:
        :return:
        '''
        for doc in docs:
            self.es.index(index=self.index_name, doc_type=self.index_type, body=doc)
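# A minimal (re)indexing sketch, assuming one document per scraped title
# (the selector is the HUT news selector used elsewhere in this project):
#   es = esController("tust", "news")
#   es.create_index()
#   titles = get_target('http://news.hut.edu.cn/', '.news-list .info h2')
#   es.insert_array([{"title": t} for t in titles])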
if __name__ == '__main__':
    # Search the indexed news
    es = esController("tust", "news")
    res = es.search_index("实验", 'title', 10)
    print("{} results in total".format(res['hits']['total']['value']))
    for item in res['hits']['hits']:
        print(item['_score'], item['_source'])

@@ -0,0 +1,97 @@
'''
Scraper business logic
'''
import requests,re,json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
'''Target acquisition and parsing based on CSS selectors'''
class Scrawler():
def __init__(self):
self.headers={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.36'
}
    def reqbsGetText(self, url=None, cssselector=None):
        '''Fetch text content'''
        try:
            rs = requests.get(url, headers=self.headers)
            rs.encoding = rs.apparent_encoding
            bs = BeautifulSoup(rs.text, 'html.parser')
            return [str(item.text).strip() for item in bs.select(selector=cssselector)]
        except:
            pass
    def reqbsGetJSON(self, url=None):
        '''Fetch JSON text'''
        try:
            rs = requests.get(url, headers=self.headers).text
            # Strip the fixed-length JSONP-style wrapper (a JS prefix and trailing
            # characters) so that only the JSON object itself remains
            rs = str(rs)[13:][:-2]
            return dict(json.loads(rs))
        except:
            pass
    def reqbsGetImg(self, url=None, cssselector=None):
        '''Fetch images'''
        try:
            rs = requests.get(url, headers=self.headers)
            rs.encoding = rs.apparent_encoding
            bs = BeautifulSoup(rs.text, 'html.parser')
            urls = [item.attrs['src'] for item in bs.select(selector=cssselector)]
            titles = [item.attrs['alt'] for item in bs.select(selector=cssselector)]
            return urls, titles
        except:
            pass
    def reqbsGetHref(self, url=None, cssselector=None):
        '''Fetch link hrefs'''
        try:
            rs = requests.get(url, headers=self.headers)
            rs.encoding = rs.apparent_encoding
            bs = BeautifulSoup(rs.text, 'html.parser')
            urls = [item.attrs['href'] for item in bs.select(selector=cssselector)]
            return urls
        except:
            pass
def seleniumGetText(self,url,cssselector=None):
chrome = webdriver.Chrome()
chrome.get(url)
chrome.implicitly_wait(3)
res = chrome.find_elements(by=By.CSS_SELECTOR,value=cssselector)
return [str(item.text).strip() for item in res]
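# A minimal usage sketch of the scraper (selectors taken from the news pages
# used elsewhere in this project):
#   s = Scrawler()
#   titles = s.reqbsGetText('http://news.hut.edu.cn/', cssselector='.news-list .info h2')
#   links = s.reqbsGetHref('http://news.hut.edu.cn/', cssselector='.news-list .info a')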
# Fetch the local temperature for Zhuzhou
def getWeatherInfo():
    url = "http://d1.weather.com.cn/sk_2d/101250304.html?_=1716271809611"
    headers = {
        "Host": "d1.weather.com.cn",
        "Referer": "http://www.weather.com.cn/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36"
    }
    # The endpoint returns a 'var dataSK={...}' JavaScript snippet (see the sample
    # in __main__ below); parse it crudely by splitting on commas
    text = requests.get(url, headers=headers).text.replace('"', '')
    for item in text.split(","):
        if 'time' in item:
            timeinfo = item[5:]
        elif 'temp:' in item:
            temp = item.split(":")[1]
    return timeinfo, temp
# Fetch journal names
if __name__ == '__main__':
    # print(getWeatherInfo())
    raw = 'var dataSK={"nameen":"hetang","cityname":"荷塘","city":"101250304","temp":"27.6","tempf":"81.7","WD":"东北风","wde":"NE","WS":"1级","wse":"4km\/h","SD":"69%","sd":"69%","qy":"1002","njd":"12km","time":"14:25","rain":"0","rain24h":"0","aqi":"49","aqi_pm25":"49","weather":"阴","weathere":"Overcast","weathercode":"d02","limitnumber":"","date":"05月21日(星期二)"}'
    raw1 = raw.replace('"', '')
    items = raw1.split(",")
    for item in items:
        if 'time' in item:
            timeinfo = item[5:]
        elif 'temp:' in item:
            temp = item.split(":")[1]
    print(timeinfo, temp)

@@ -0,0 +1,53 @@
import platform,psutil
from datetime import datetime
from Model import DataDB as db
def get_size(bytes, suffix="B"):
"""
Scale bytes to its proper format
e.g:
1253656 => '1.20MB'
1253656678 => '1.17GB'
"""
factor = 1024
for unit in ["", "K", "M", "G", "T", "P"]:
if bytes < factor:
return f"{bytes:.2f}{unit}{suffix}"
bytes /= factor
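# e.g. get_size(psutil.virtual_memory().total) might return '15.88GB'
# (the value is machine-dependent)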
def getsysInfo():
    '''
    System hardware and software information
    :return:
    '''
    #1. Runtime environment
data={}
data['system']=platform.uname().system
data['node']=platform.uname().node
data['version']=platform.uname().version
    #2. Boot time
data['boottime']=datetime.fromtimestamp(psutil.boot_time())
    #3. Memory
data['memoryused']=get_size(psutil.virtual_memory().used)
data['memoryfree']=get_size(psutil.virtual_memory().free)
data['memorytotal']=get_size(psutil.virtual_memory().total)
data['memorypercent']=psutil.virtual_memory().percent
    #4. CPU
data['cpucount']=psutil.cpu_count(logical=False)
data['cpupercent']=psutil.cpu_percent()
data['cpucurrent']=psutil.cpu_freq().current
    #5. Disk
data['diskdevice']=[item.device for item in psutil.disk_partitions()]
data['diskfstype']=[item.fstype for item in psutil.disk_partitions()]
data['diskusage']=[psutil.disk_usage(item.mountpoint) for item in psutil.disk_partitions()]
data['diskMountpoint'] = [item.mountpoint for item in psutil.disk_partitions()]
data['diskpercent'] = [psutil.disk_usage(item).percent for item in data['diskMountpoint']]
data['disktotal'] = [get_size(psutil.disk_usage(item).total) for item in data['diskMountpoint']]
data['diskfree'] = [get_size(psutil.disk_usage(item).free) for item in data['diskMountpoint']]
data['diskused'] = [get_size(psutil.disk_usage(item).used) for item in data['diskMountpoint']]
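    # Note: the disk* lists above are index-aligned, one entry per partition
    # returned by psutil.disk_partitions()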
return data
# Log-reading function
def getsysLog(page):
return db.getAllDataByPage("dblog",page)
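# A minimal usage sketch (assumes Model.DataDB and its dblog table are available):
#   info = getsysInfo()
#   print(info['memorypercent'], info['cpupercent'])
#   print(getsysLog(1))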