实现基本爬虫功能, 爬取知网数据

developer_wufayuan
wufayuan 3 years ago
parent e94c3026bc
commit 271dda7b12

Binary file not shown.

@ -22,3 +22,89 @@
2022-03-16 15:56:17.146 | INFO | dcs.tests.server:run:36 - [REQUEST] end
2022-03-16 15:56:17.147 | WARNING | dcs.tests.server:run:37 - communication over!
2022-03-16 15:56:17.149 | WARNING | __main__:<module>:21 - Overing...
2022-03-17 16:33:41.255 | INFO | __main__:<module>:8 - reading config args...
2022-03-17 16:33:41.256 | INFO | __main__:<module>:15 - starting the server...
2022-03-17 16:33:49.912 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test
2022-03-17 16:33:49.913 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22
2022-03-17 16:33:57.605 | INFO | dcs.tests.requestHandler:run:33 - [REQUEST] translate
2022-03-17 16:33:57.606 | INFO | dcs.tests.spider:run:45 - crawling...
2022-03-17 16:38:20.919 | INFO | __main__:<module>:8 - reading config args...
2022-03-17 16:38:20.920 | INFO | __main__:<module>:15 - starting the server...
2022-03-17 16:38:26.531 | INFO | __main__:<module>:8 - reading config args...
2022-03-17 16:38:26.531 | INFO | __main__:<module>:15 - starting the server...
2022-03-17 16:38:29.286 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test
2022-03-17 16:38:29.287 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22
2022-03-17 16:38:33.326 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang
2022-03-17 16:38:33.327 | INFO | dcs.tests.spider:run:45 - crawling...
2022-03-17 16:41:03.903 | INFO | __main__:<module>:8 - reading config args...
2022-03-17 16:41:03.903 | INFO | __main__:<module>:15 - starting the server...
2022-03-17 16:41:06.026 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test
2022-03-17 16:41:06.027 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22
2022-03-17 16:41:09.812 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang
2022-03-17 16:41:09.812 | INFO | dcs.tests.spider:run:49 - crawling...
2022-03-17 16:43:18.535 | INFO | __main__:<module>:8 - reading config args...
2022-03-17 16:43:18.535 | INFO | __main__:<module>:15 - starting the server...
2022-03-17 16:43:22.518 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test
2022-03-17 16:43:22.518 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22
2022-03-17 16:43:26.977 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang
2022-03-17 16:43:26.977 | INFO | dcs.tests.spider:run:49 - crawling...
2022-03-17 16:44:01.823 | INFO | __main__:<module>:8 - reading config args...
2022-03-17 16:44:01.824 | INFO | __main__:<module>:15 - starting the server...
2022-03-17 16:44:04.256 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test
2022-03-17 16:44:04.257 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22
2022-03-17 16:44:08.634 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang
2022-03-17 16:44:08.634 | INFO | dcs.tests.spider:run:49 - crawling...
2022-03-17 16:44:55.914 | INFO | __main__:<module>:8 - reading config args...
2022-03-17 16:44:55.914 | INFO | __main__:<module>:15 - starting the server...
2022-03-17 16:44:57.966 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test
2022-03-17 16:44:57.967 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22
2022-03-17 16:45:01.569 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang
2022-03-17 16:45:01.570 | INFO | dcs.tests.spider:run:49 - crawling...
2022-03-17 16:45:59.893 | INFO | __main__:<module>:8 - reading config args...
2022-03-17 16:45:59.894 | INFO | __main__:<module>:15 - starting the server...
2022-03-17 16:46:03.158 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test
2022-03-17 16:46:03.158 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22
2022-03-17 16:46:06.376 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang
2022-03-17 16:46:06.377 | INFO | dcs.tests.spider:run:49 - crawling...
2022-03-17 16:48:25.032 | INFO | __main__:<module>:8 - reading config args...
2022-03-17 16:48:25.033 | INFO | __main__:<module>:15 - starting the server...
2022-03-17 16:48:26.903 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test
2022-03-17 16:48:26.904 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22
2022-03-17 16:48:29.504 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang
2022-03-17 16:48:29.505 | INFO | dcs.tests.spider:run:49 - crawling...
2022-03-17 16:50:28.903 | INFO | __main__:<module>:8 - reading config args...
2022-03-17 16:50:28.905 | INFO | __main__:<module>:15 - starting the server...
2022-03-17 16:50:30.975 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test
2022-03-17 16:50:30.975 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22
2022-03-17 16:50:33.959 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang
2022-03-17 16:50:33.960 | INFO | dcs.tests.spider:run:49 - crawling...
2022-03-17 16:56:08.569 | INFO | __main__:<module>:8 - reading config args...
2022-03-17 16:56:08.569 | INFO | __main__:<module>:15 - starting the server...
2022-03-17 16:56:19.722 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test
2022-03-17 16:56:19.722 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22
2022-03-17 16:56:21.588 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang
2022-03-17 16:56:21.589 | INFO | dcs.tests.spider:run:48 - crawling...
2022-03-17 16:58:01.057 | INFO | __main__:<module>:8 - reading config args...
2022-03-17 16:58:01.058 | INFO | __main__:<module>:15 - starting the server...
2022-03-17 16:58:04.142 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test
2022-03-17 16:58:04.143 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22
2022-03-17 16:58:09.346 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang
2022-03-17 16:58:09.347 | INFO | dcs.tests.spider:run:48 - crawling...
2022-03-17 17:06:58.479 | INFO | __main__:<module>:8 - reading config args...
2022-03-17 17:06:58.480 | INFO | __main__:<module>:15 - starting the server...
2022-03-17 17:07:02.340 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test
2022-03-17 17:07:02.341 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22
2022-03-17 17:07:06.076 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang
2022-03-17 17:07:06.076 | INFO | dcs.tests.spider:run:49 - crawling...
2022-03-17 17:08:39.065 | INFO | __main__:<module>:8 - reading config args...
2022-03-17 17:08:39.066 | INFO | __main__:<module>:15 - starting the server...
2022-03-17 17:08:42.427 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test
2022-03-17 17:08:42.428 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22
2022-03-17 17:08:45.521 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang
2022-03-17 17:08:45.522 | INFO | dcs.tests.spider:run:49 - crawling...
2022-03-17 17:09:23.242 | INFO | __main__:<module>:8 - reading config args...
2022-03-17 17:09:23.243 | INFO | __main__:<module>:15 - starting the server...
2022-03-17 17:09:25.024 | INFO | dcs.tests.requestHandler:run:21 - [REQUEST] test
2022-03-17 17:09:25.025 | INFO | dcs.tests.requestHandler:run:31 - [RESPONSE] test: hello TEST, header size: 22
2022-03-17 17:09:28.188 | INFO | dcs.tests.requestHandler:run:46 - [REQUEST] crawl zhiwang
2022-03-17 17:09:28.189 | INFO | dcs.tests.spider:run:49 - crawling...

@ -85,6 +85,23 @@ class Client(Thread):
return responseJson['translate']
def crawling(self, word: str) -> 'int':
    """Ask the server to crawl zhiwang for ``word`` and return its reply.

    Opens a TCP connection to ``(self.ip, self.port)``, sends a request
    whose ``action`` is ``'crawl zhiwang'``, then reads a response made of
    an 8-byte big-endian length header (``'!Q'``) followed by a UTF-8
    encoded JSON body.

    :param word: search keyword forwarded to the server.
    :return: the value stored under the ``'crawl zhiwang'`` key of the
        decoded response.
        NOTE(review): the annotation says ``int`` but the value is whatever
        the server put under that key -- confirm against the handler.
    :raises KeyError: if the response has no ``'crawl zhiwang'`` key.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP) as socket_to_server:
        socket_to_server.connect((self.ip, self.port))
        request = {'action': 'crawl zhiwang', 'word': word}
        socket_to_server.sendall(generate_request(request))

        # A bare recv(8) may return fewer than 8 bytes on a stream socket,
        # which would make struct.unpack fail. Read the header through the
        # same helper used for the body (presumably it loops until the
        # requested number of bytes has arrived -- verify read_bytes).
        header = read_bytes(socket_to_server, 8)
        (body_len,) = struct.unpack('!Q', header)
        body = read_bytes(socket_to_server, body_len)

        responseJson = JSONDecoder().decode(body.decode("utf-8"))
        return responseJson['crawl zhiwang']
def end(self):
"""
结束通信
@ -103,7 +120,7 @@ class Client(Thread):
# Thread entry point: exercises the client's request methods in sequence and
# prints each result, then ends the communication.
# NOTE(review): this text comes from a rendered diff without +/- markers; the
# translate line and the crawling line may be the removed/added pair of a
# single change rather than two consecutive statements -- confirm against the
# real file.
def run(self) -> None:
# Connectivity check against the server.
print(self.test())
# Prompt on stdin for a word and print the server's 'translate' reply.
print(self.translate(input("word:")))
# Prompt on stdin for a word and print the server's 'crawl zhiwang' reply.
print(self.crawling(input("word:")))
# Tell the server the session is over.
self.end()

@ -42,5 +42,18 @@ class RequestHandler(threading.Thread):
response_binary = response_binary_len_binary + response_binary
self.client_socket.sendall(response_binary)
logger.info(f"[RESPONSE] translate: {response['translate']}, header size: {response_binary_len}")
elif self.request_map['action'] == 'crawl zhiwang':
logger.info(f"[REQUEST] crawl zhiwang")
spider = Spider(self.request_map['word'])
response = {
'crawl zhiwang': spider.run()
}
response_binary = json.JSONEncoder().encode(response).encode("utf-8")
response_binary_len = len(response_binary)
response_binary_len_binary = struct.pack("!Q", response_binary_len)
response_binary = response_binary_len_binary + response_binary
self.client_socket.sendall(response_binary)
logger.info(f"[RESPONSE] crawl zhiwang: {response['crawl zhiwang']}, header size: {response_binary_len}")
finally:
self.client_socket.close()

@ -1,9 +1,12 @@
import threading
import requests
from msedge.selenium_tools import Edge
from msedge.selenium_tools import EdgeOptions
from dcs.tests.zhiwang import *
from loguru import logger
def crawl(word):
def translate(word):
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
data = {'i': word,
'from': 'AUTO',
@ -21,8 +24,22 @@ def crawl(word):
return result
def crawl_zhiwang(word, pages_start=2, pages_end=3,
                  driver_path=r'G:\Users\god\PycharmProjects\dcs\bin\msedgedriver.exe'):
    """Search zhiwang for ``word`` in a headless Edge browser and scrape results.

    The page returned by the search is scraped first; afterwards every page
    number in ``range(pages_start, pages_end)`` is opened with
    ``change_page`` and scraped as well.

    :param word: search keyword passed to ``driver_open``.
    :param pages_start: first additional page number to visit (inclusive).
    :param pages_end: page number at which to stop (exclusive).
    :param driver_path: location of the msedgedriver executable; defaults to
        the path that was previously hard-coded.
    :return: the list that ``spider`` filled with the scraped papers
        (previously the list was built and then discarded).
    """
    edge_options = EdgeOptions()
    edge_options.use_chromium = True
    edge_options.add_argument('headless')
    driver = Edge(options=edge_options, executable_path=driver_path)
    try:
        soup = driver_open(driver, word)  # run the search for `word`
        papers = []  # collects the papers scraped from every page
        spider(driver, soup, papers)
        for pn in range(pages_start, pages_end):
            content = change_page(driver, pn)
            spider(driver, content, papers)
        return papers
    finally:
        # Always release the browser, even when scraping raises. quit() ends
        # the whole WebDriver session (all windows plus the driver process),
        # whereas close() only closes the current window.
        driver.quit()
class Spider(threading.Thread):
def __init__(self, word: 'string'):
def __init__(self, word: str):
super().__init__()
self.word = word
self.daemon = True
@ -30,5 +47,4 @@ class Spider(threading.Thread):
def run(self) -> None:
logger.info('crawling...')
return crawl(self.word)
crawl_zhiwang(word=self.word)

@ -3,11 +3,8 @@
'''
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import time
import requests
import csv
# 定义论文类
@ -47,7 +44,9 @@ def driver_open(driver, key_word):
def spider(driver, soup, papers):
tbody = soup.find_all('tbody')
try:
tbody = BeautifulSoup(str(tbody[0]), 'lxml')
except:return
tr = tbody.find_all('tr')
for item in tr:
tr_bf = BeautifulSoup(str(item), 'lxml')
@ -121,6 +120,7 @@ def get_author_info(skey, code):
if __name__ == '__main__':
'''
browser = Service('../../bin/msedgedriver.exe')
driver = webdriver.Edge(service=browser)
soup = driver_open(driver, '知识图谱') # 搜索知识图谱
@ -147,3 +147,4 @@ if __name__ == '__main__':
# 关闭文件
f_papers_authors.close()
'''

@ -1,2 +1,6 @@
loguru~=0.6.0
requests~=2.27.1
pandas~=1.3.4
bs4~=0.0.1
beautifulsoup4~=4.10.0
selenium~=4.1.3
Loading…
Cancel
Save