|
|
|
@ -284,7 +284,7 @@ from lxml import etree
|
|
|
|
|
import requests
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def begin_spider(url, web_name):
|
|
|
|
|
def begin_spider(url, web_name, web=1):
|
|
|
|
|
url = "https://guilin.zbj.com/search/f/?type=new&kw=saas"
|
|
|
|
|
# 设置headers,防止UA验证,Host为要爬取的域名,通过浏览器F12获取User-Agent
|
|
|
|
|
headers = {
|
|
|
|
@ -314,10 +314,10 @@ def begin_spider(url, web_name):
|
|
|
|
|
def main():
|
|
|
|
|
begin_spider("https://guilin.zbj.com/search/f/?type=new&kw=saas", "猪八戒")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
'''
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|
|
main()
|
|
|
|
|
|
|
|
|
|
'''
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json
|
|
|
|
@ -358,15 +358,15 @@ def begin_spider(page, url, web_name):
|
|
|
|
|
f.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
|
|
|
|
|
def main_a():
|
|
|
|
|
for i in range(0, 100, 20):
|
|
|
|
|
begin_spider(i,"https://music.163.com/","网易云")
|
|
|
|
|
time.sleep(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
'''
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|
|
main()
|
|
|
|
|
|
|
|
|
|
'''
|
|
|
|
|
import re # 正则表达式进行文字匹配
|
|
|
|
|
from bs4 import BeautifulSoup # 网页解析获取数据
|
|
|
|
|
import urllib.error,urllib.request
|
|
|
|
@ -374,7 +374,7 @@ import xlwt # 进行excel操作
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
|
|
|
|
|
def main_b():
|
|
|
|
|
baseurl = "https://movie.douban.com/top250?start="
|
|
|
|
|
# 1.爬取网页
|
|
|
|
|
name= '豆瓣top250'
|
|
|
|
@ -505,11 +505,11 @@ def saveData(datalist, savepath):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
'''
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
|
main()
|
|
|
|
|
print("爬取完毕!")
|
|
|
|
|
|
|
|
|
|
'''
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|