进一步完善爬取结果写入数据库

master
wufayuan 3 years ago
parent 9501253095
commit 2dbf99feda

@ -276,3 +276,101 @@
2022-03-28 15:56:47.696 | INFO | dcs.tests.requestHandler:report_state:64 - [RESPONSE] report free: success marked ['127.0.0.1', 7777]
2022-03-28 15:56:50.660 | INFO | dcs.tests.requestHandler:crawl_zhiwang:47 - [REQUEST] crawl zhiwang
2022-03-28 15:56:50.660 | INFO | dcs.tests.spider:run:91 - crawling...
2022-03-29 10:10:01.148 | INFO | __main__:<module>:8 - reading config args...
2022-03-29 10:10:01.150 | INFO | __main__:<module>:15 - starting the server...
2022-03-29 10:10:08.502 | INFO | dcs.tests.requestHandler:report_state:57 - [REQUEST] report free
2022-03-29 10:10:08.502 | INFO | dcs.tests.requestHandler:report_state:64 - [RESPONSE] report free: success marked ['127.0.0.1', 7777]
2022-03-29 10:10:16.933 | INFO | dcs.tests.requestHandler:crawl_zhiwang:47 - [REQUEST] crawl zhiwang
2022-03-29 10:10:16.934 | INFO | dcs.tests.spider:run:95 - crawling...
2022-03-29 10:18:56.694 | INFO | __main__:<module>:8 - reading config args...
2022-03-29 10:18:56.694 | INFO | __main__:<module>:15 - starting the server...
2022-03-29 10:18:58.534 | INFO | dcs.tests.requestHandler:report_state:57 - [REQUEST] report free
2022-03-29 10:18:58.536 | INFO | dcs.tests.requestHandler:report_state:64 - [RESPONSE] report free: success marked ['127.0.0.1', 7777]
2022-03-29 10:19:01.418 | INFO | dcs.tests.requestHandler:crawl_zhiwang:47 - [REQUEST] crawl zhiwang
2022-03-29 10:19:01.419 | INFO | dcs.tests.spider:run:96 - crawling...
2022-03-29 10:24:22.241 | INFO | __main__:<module>:8 - reading config args...
2022-03-29 10:24:22.245 | INFO | __main__:<module>:15 - starting the server...
2022-03-29 10:24:29.596 | INFO | dcs.tests.requestHandler:report_state:57 - [REQUEST] report free
2022-03-29 10:24:29.597 | INFO | dcs.tests.requestHandler:report_state:64 - [RESPONSE] report free: success marked ['127.0.0.1', 7777]
2022-03-29 10:24:34.228 | INFO | dcs.tests.requestHandler:crawl_zhiwang:47 - [REQUEST] crawl zhiwang
2022-03-29 10:24:34.228 | INFO | dcs.tests.spider:run:97 - crawling...
2022-03-29 10:26:58.927 | INFO | __main__:<module>:8 - reading config args...
2022-03-29 10:26:58.927 | INFO | __main__:<module>:15 - starting the server...
2022-03-29 10:27:00.869 | INFO | dcs.tests.requestHandler:report_state:57 - [REQUEST] report free
2022-03-29 10:27:00.869 | INFO | dcs.tests.requestHandler:report_state:64 - [RESPONSE] report free: success marked ['127.0.0.1', 7777]
2022-03-29 10:27:03.378 | INFO | dcs.tests.requestHandler:crawl_zhiwang:47 - [REQUEST] crawl zhiwang
2022-03-29 10:27:03.379 | INFO | dcs.tests.spider:run:97 - crawling...
2022-03-29 10:32:16.896 | INFO | __main__:<module>:8 - reading config args...
2022-03-29 10:32:16.896 | INFO | __main__:<module>:15 - starting the server...
2022-03-29 10:32:19.506 | INFO | dcs.tests.requestHandler:report_state:57 - [REQUEST] report free
2022-03-29 10:32:19.506 | INFO | dcs.tests.requestHandler:report_state:64 - [RESPONSE] report free: success marked ['127.0.0.1', 7777]
2022-03-29 10:32:21.844 | INFO | dcs.tests.requestHandler:crawl_zhiwang:47 - [REQUEST] crawl zhiwang
2022-03-29 10:32:21.844 | INFO | dcs.tests.spider:run:97 - crawling...
2022-03-29 10:34:29.185 | INFO | __main__:<module>:8 - reading config args...
2022-03-29 10:34:29.186 | INFO | __main__:<module>:15 - starting the server...
2022-03-29 10:34:31.100 | INFO | dcs.tests.requestHandler:report_state:57 - [REQUEST] report free
2022-03-29 10:34:31.100 | INFO | dcs.tests.requestHandler:report_state:64 - [RESPONSE] report free: success marked ['127.0.0.1', 7777]
2022-03-29 10:34:33.383 | INFO | dcs.tests.requestHandler:crawl_zhiwang:47 - [REQUEST] crawl zhiwang
2022-03-29 10:34:33.384 | INFO | dcs.tests.spider:run:98 - crawling...
2022-03-29 10:36:09.692 | INFO | __main__:<module>:8 - reading config args...
2022-03-29 10:36:09.692 | INFO | __main__:<module>:15 - starting the server...
2022-03-29 10:36:11.844 | INFO | dcs.tests.requestHandler:report_state:57 - [REQUEST] report free
2022-03-29 10:36:11.844 | INFO | dcs.tests.requestHandler:report_state:64 - [RESPONSE] report free: success marked ['127.0.0.1', 7777]
2022-03-29 10:36:14.414 | INFO | dcs.tests.requestHandler:crawl_zhiwang:47 - [REQUEST] crawl zhiwang
2022-03-29 10:36:14.414 | INFO | dcs.tests.spider:run:97 - crawling...
2022-03-29 10:39:56.639 | INFO | __main__:<module>:8 - reading config args...
2022-03-29 10:39:56.639 | INFO | __main__:<module>:15 - starting the server...
2022-03-29 10:42:38.230 | INFO | __main__:<module>:8 - reading config args...
2022-03-29 10:42:38.231 | INFO | __main__:<module>:15 - starting the server...
2022-03-29 10:42:39.993 | INFO | dcs.tests.requestHandler:report_state:57 - [REQUEST] report free
2022-03-29 10:42:39.993 | INFO | dcs.tests.requestHandler:report_state:64 - [RESPONSE] report free: success marked ['127.0.0.1', 7777]
2022-03-29 10:42:42.088 | INFO | dcs.tests.requestHandler:crawl_zhiwang:47 - [REQUEST] crawl zhiwang
2022-03-29 10:42:42.088 | INFO | dcs.tests.spider:run:96 - crawling...
2022-03-29 10:43:21.679 | INFO | __main__:<module>:8 - reading config args...
2022-03-29 10:43:21.679 | INFO | __main__:<module>:15 - starting the server...
2022-03-29 10:43:24.139 | INFO | dcs.tests.requestHandler:report_state:57 - [REQUEST] report free
2022-03-29 10:43:24.139 | INFO | dcs.tests.requestHandler:report_state:64 - [RESPONSE] report free: success marked ['127.0.0.1', 7777]
2022-03-29 10:43:26.130 | INFO | dcs.tests.requestHandler:crawl_zhiwang:47 - [REQUEST] crawl zhiwang
2022-03-29 10:43:26.131 | INFO | dcs.tests.spider:run:98 - crawling...
2022-03-29 10:47:13.491 | INFO | __main__:<module>:8 - reading config args...
2022-03-29 10:47:13.492 | INFO | __main__:<module>:15 - starting the server...
2022-03-29 10:47:15.946 | INFO | dcs.tests.requestHandler:report_state:57 - [REQUEST] report free
2022-03-29 10:47:15.946 | INFO | dcs.tests.requestHandler:report_state:64 - [RESPONSE] report free: success marked ['127.0.0.1', 7777]
2022-03-29 10:47:18.123 | INFO | dcs.tests.requestHandler:crawl_zhiwang:47 - [REQUEST] crawl zhiwang
2022-03-29 10:47:18.124 | INFO | dcs.tests.spider:run:98 - crawling...
2022-03-29 10:48:55.960 | INFO | __main__:<module>:8 - reading config args...
2022-03-29 10:48:55.961 | INFO | __main__:<module>:15 - starting the server...
2022-03-29 10:49:00.887 | INFO | dcs.tests.requestHandler:report_state:57 - [REQUEST] report free
2022-03-29 10:49:00.888 | INFO | dcs.tests.requestHandler:report_state:64 - [RESPONSE] report free: success marked ['127.0.0.1', 7777]
2022-03-29 10:49:12.173 | INFO | dcs.tests.requestHandler:crawl_zhiwang:47 - [REQUEST] crawl zhiwang
2022-03-29 10:49:12.174 | INFO | dcs.tests.spider:run:96 - crawling...
2022-03-29 10:50:41.292 | INFO | dcs.tests.requestHandler:crawl_zhiwang:54 - [RESPONSE] crawl zhiwang: success
2022-03-29 10:50:41.293 | INFO | dcs.tests.requestHandler:report_state:57 - [REQUEST] report free
2022-03-29 10:50:41.294 | INFO | dcs.tests.requestHandler:report_state:64 - [RESPONSE] report free: success marked ['127.0.0.1', 7777]
2022-03-29 10:50:41.295 | INFO | dcs.tests.server:run:36 - [REQUEST] end
2022-03-29 10:50:41.295 | WARNING | dcs.tests.server:run:37 - communication over!
2022-03-29 10:50:41.296 | WARNING | __main__:<module>:21 - Overing...
2022-03-29 10:56:28.466 | INFO | __main__:<module>:8 - reading config args...
2022-03-29 10:56:28.466 | INFO | __main__:<module>:15 - starting the server...
2022-03-29 10:56:32.107 | INFO | dcs.tests.requestHandler:report_state:57 - [REQUEST] report free
2022-03-29 10:56:32.107 | INFO | dcs.tests.requestHandler:report_state:64 - [RESPONSE] report free: success marked ['127.0.0.1', 7777]
2022-03-29 10:56:34.888 | INFO | dcs.tests.requestHandler:crawl_zhiwang:47 - [REQUEST] crawl zhiwang
2022-03-29 10:56:34.888 | INFO | dcs.tests.spider:run:101 - crawling...
2022-03-29 10:58:18.658 | INFO | dcs.tests.requestHandler:crawl_zhiwang:54 - [RESPONSE] crawl zhiwang: success
2022-03-29 10:58:18.660 | INFO | dcs.tests.requestHandler:report_state:57 - [REQUEST] report free
2022-03-29 10:58:18.660 | INFO | dcs.tests.requestHandler:report_state:64 - [RESPONSE] report free: success marked ['127.0.0.1', 7777]
2022-03-29 10:58:18.661 | INFO | dcs.tests.server:run:36 - [REQUEST] end
2022-03-29 10:58:18.661 | WARNING | dcs.tests.server:run:37 - communication over!
2022-03-29 10:58:18.662 | WARNING | __main__:<module>:21 - Overing...
2022-03-29 11:08:25.653 | INFO | __main__:<module>:8 - reading config args...
2022-03-29 11:08:25.654 | INFO | __main__:<module>:15 - starting the server...
2022-03-29 11:08:29.895 | INFO | dcs.tests.requestHandler:report_state:57 - [REQUEST] report free
2022-03-29 11:08:29.896 | INFO | dcs.tests.requestHandler:report_state:64 - [RESPONSE] report free: success marked ['127.0.0.1', 7777]
2022-03-29 11:08:32.321 | INFO | dcs.tests.requestHandler:crawl_zhiwang:47 - [REQUEST] crawl zhiwang
2022-03-29 11:08:32.321 | INFO | dcs.tests.spider:run:101 - crawling...
2022-03-29 11:09:56.280 | INFO | dcs.tests.requestHandler:crawl_zhiwang:54 - [RESPONSE] crawl zhiwang: success
2022-03-29 11:09:56.281 | INFO | dcs.tests.requestHandler:report_state:57 - [REQUEST] report free
2022-03-29 11:09:56.281 | INFO | dcs.tests.requestHandler:report_state:64 - [RESPONSE] report free: success marked ['127.0.0.1', 7777]
2022-03-29 11:09:56.282 | INFO | dcs.tests.server:run:36 - [REQUEST] end
2022-03-29 11:09:56.282 | WARNING | dcs.tests.server:run:37 - communication over!
2022-03-29 11:09:56.283 | WARNING | __main__:<module>:21 - Overing...

@ -81,27 +81,69 @@ def cancel(u_name):
print(e)
def write_result2database(res: list, table_name: str): #id 需修改
def get_last_crawl_id(table_name: str) -> int:
    """Return the crawl sequence number of the most recent crawl for a user.

    Creates the result table first if it does not exist yet, then reads the
    ``crawl_id`` of the newest row (by ``time``).

    :param table_name: per-user crawl-result table
    :return: the last crawl_id stored, or 0 if the table is empty or on error
    """
    # Initialise before the try so an early failure (e.g. mysql_conn())
    # cannot leave the name unbound at the return statement.
    last_crawl_id = 0
    try:
        conn = mysql_conn()
        cur = conn.cursor()
        # Keep this schema identical to create_crawl_result_table(): with
        # "if not exists", whichever function runs first fixes the schema,
        # so the two definitions must agree.
        # NOTE(review): table_name is interpolated into SQL — it must never
        # come from untrusted input.
        create_sql = f'create table if not exists {table_name} (' \
                     f'id int primary key not null auto_increment,' \
                     f'crawl_id int not null,' \
                     f'time timestamp not null,' \
                     f'name varchar(100),' \
                     f'college varchar(200),' \
                     f'major varchar(200),' \
                     f'paper varchar(200)' \
                     f')'
        get_id_sql = f'SELECT crawl_id from {table_name} where time = (SELECT max(time) FROM {table_name})'
        cur.execute(create_sql)
        conn.commit()
        cur.execute(get_id_sql)
        row = cur.fetchone()
        if row is not None:
            last_crawl_id = int(row[0])
        # (Removed a leftover insert_sql that was built here but never
        # executed — this function only reads; writes belong to
        # write_result2database.)
        cur.close()
        conn.close()
    except Exception as e:
        print(e)
    return last_crawl_id
def drop_table(table_name: str):
    """Remove the given crawl-result table from the database, if present.

    :param table_name: table to drop; errors are printed, not raised
    """
    try:
        connection = mysql_conn()
        cursor = connection.cursor()
        cursor.execute(f'drop table if exists {table_name}')
        connection.commit()
        cursor.close()
        connection.close()
    except Exception as err:
        print(err)
def create_crawl_result_table(table_name: str):
    """Create the per-user crawl-result table when it does not exist yet.

    Columns: auto-increment row id, crawl sequence number, insert timestamp,
    and the scraped author/paper fields.

    :param table_name: table to create; errors are printed, not raised
    """
    columns = (
        'id int primary key not null auto_increment,'
        'crawl_id int not null,'
        'time timestamp not null,'
        'name varchar(100),'
        'college varchar(200),'
        'major varchar(200),'
        'paper varchar(200)'
    )
    try:
        connection = mysql_conn()
        cursor = connection.cursor()
        cursor.execute(f'create table if not exists {table_name} ({columns})')
        connection.commit()
        cursor.close()
        connection.close()
    except Exception as err:
        print(err)
def write_result2database(res: list, table_name: str, last_crawl_id: int):
try:
conn = mysql_conn()
cur = conn.cursor()
insert_sql = f"insert into {table_name} (name,college,major,paper,crawl_id,time) values ('%s','%s','%s','%s',%s,now())" % (
res[0], res[1], res[2], res[3], last_crawl_id + 1)
cur.execute(insert_sql)
conn.commit()
cur.close()
@ -114,7 +156,8 @@ def write_result2database(res: list, table_name: str): #id 需修改
if __name__ == '__main__':
print(write_result2database(['name', 'college', 'major', 'paper'], "table_name"))
create_crawl_result_table('table_name')
print(write_result2database(['name', 'college', 'major', 'paper'], "table_name", last_crawl_id=0))
pass
'''
u_name = input('请输入用户名')

@ -7,7 +7,7 @@ from msedge.selenium_tools import EdgeOptions
from dcs.tests.zhiwang import *
from loguru import logger
from dcs.tests.database import write_result2database
from dcs.tests.database import write_result2database, get_last_crawl_id, create_crawl_result_table
def translate(word):
@ -34,23 +34,33 @@ def crawl_zhiwang(word, pages_start=1, pages_end=1):
edge_options.add_argument('headless')
driver = Edge(options=edge_options, executable_path=r'G:\Users\god\PycharmProjects\dcs\bin\msedgedriver.exe')
soup = driver_open(driver, word) # 搜索word
papers = [] # 用于保存爬取到的论文
table_name = 'wufayuan_crawl_result'
create_crawl_result_table(table_name=table_name)
last_crawl_id = get_last_crawl_id(table_name=table_name)
paper_id = 0
# 爬取第一篇
if pages_start == 1:
spider(driver, soup, papers)
pages_start += 1
write2database(papers[-1])
while paper_id < len(papers):
write2database(papers[paper_id], table_name=table_name, last_crawl_id=last_crawl_id)
paper_id += 1
for pn in range(pages_start, pages_end):
content = change_page(driver, pn)
spider(driver, content, papers)
write2database(papers[-1])
while paper_id < len(papers):
write2database(papers[paper_id], table_name=table_name, last_crawl_id=last_crawl_id)
paper_id += 1
driver.close()
def write2database(paper: Paper, table_name: str, last_crawl_id: int):
    """Persist every named author of *paper* into the crawl-result table.

    One row is written per author via write_result2database; rows are tagged
    with crawl sequence number last_crawl_id + 1 (handled by the writer).

    :param paper: crawled Paper (title plus its Author list)
    :param table_name: destination MySQL table
    :param last_crawl_id: sequence number of the previous crawl for this user
    """
    # (The span contained stale pre-change diff lines — the old one-argument
    # signature and a print()-wrapped call to the "test_user" table; only the
    # post-change function is kept.)
    for author in paper.authors:
        # Authors scraped without a name are skipped.
        if author.name:
            write_result2database([author.name, author.college, author.major, paper.title],
                                  table_name, last_crawl_id)
def write2csv(papers: list, file_name='./paper_author.csv'):

@ -3,10 +3,10 @@
'''
from bs4 import BeautifulSoup
from dcs.tests.database import write_result2database
import time
import requests
# 定义论文类
from selenium.webdriver.common.by import By
@ -16,8 +16,12 @@ class Paper:
self.title = title
self.authors = authors
def __str__(self):
return f'{self.title}, authors'
# 定义作者类
# 定义作者类
class Author:
def __init__(self, name, college, major):
self.name = name
@ -34,7 +38,7 @@ def driver_open(driver, key_word):
time.sleep(2)
# 点击搜索按钮
driver.find_element(by=By.CSS_SELECTOR, value=
'body > div.wrapper.section1 > div.searchmain > div > div.input-box > input.search-btn').click()
'body > div.wrapper.section1 > div.searchmain > div > div.input-box > input.search-btn').click()
time.sleep(5)
content = driver.page_source.encode('utf-8')
# driver.close()
@ -46,7 +50,8 @@ def spider(driver, soup, papers):
tbody = soup.find_all('tbody')
try:
tbody = BeautifulSoup(str(tbody[0]), 'lxml')
except:return
except:
return
tr = tbody.find_all('tr')
for item in tr:
tr_bf = BeautifulSoup(str(item), 'lxml')

Loading…
Cancel
Save