diff --git a/SpiderWebsite/demo/views.py b/SpiderWebsite/demo/views.py
index 4048fda..30f11a2 100644
--- a/SpiderWebsite/demo/views.py
+++ b/SpiderWebsite/demo/views.py
@@ -284,7 +284,7 @@
 from lxml import etree
 import requests
 
-def begin_spider(url, web_name):
+def begin_spider(url, web_name, web=1):
     url = "https://guilin.zbj.com/search/f/?type=new&kw=saas"
     # Set request headers to pass the User-Agent check; Host is the domain being crawled; copy the User-Agent from the browser's F12 dev tools
     headers = {
@@ -314,10 +314,10 @@ def begin_spider(url, web_name):
 def main():
     begin_spider("https://guilin.zbj.com/search/f/?type=new&kw=saas", "猪八戒")
 
 
-
+'''
 if __name__ == '__main__':
     main()
-
+'''
 
 import json
@@ -358,15 +358,15 @@ def begin_spider(page, url, web_name):
     f.close()
 
 
-def main():
+def main_a():
     for i in range(0, 100, 20):
         begin_spider(i,"https://music.163.com/","网易云")
         time.sleep(1)
 
-
+'''
 if __name__ == '__main__':
     main()
-
+'''
 import re  # regular expressions for text matching
 from bs4 import BeautifulSoup  # HTML parsing to extract data
 import urllib.error,urllib.request
@@ -374,7 +374,7 @@ import xlwt  # for Excel operations
 
 
 
 
-def main():
+def main_b():
     baseurl = "https://movie.douban.com/top250?start="  # 1. crawl the pages
     name= '豆瓣top250'
@@ -505,11 +505,11 @@ def saveData(datalist, savepath):
 
 
 
 
 
 
-
+'''
 if __name__ == "__main__":
     main()
     print("爬取完毕!")
-
+'''
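
Review note: besides adding a web parameter (default 1) to the first begin_spider, the change renames the second and third main definitions to main_a and main_b, so the later definitions no longer silently shadow the earlier ones, and disables the three script-style entry points by turning each if __name__ == '__main__' block into a bare '''...''' string literal. The two begin_spider definitions still collide the same way the main functions did: the later def begin_spider(page, url, web_name) shadows the patched def begin_spider(url, web_name, web=1) once demo.views is imported. If the intent is to trigger these crawls through the Django app rather than from the command line, a view along the following lines could do it; this is a minimal sketch under assumptions, and the view name, query parameter, and response shape are illustrative rather than part of this diff.

# Sketch only (assumed usage, not part of this diff): a hypothetical view that
# could live in SpiderWebsite/demo/views.py next to the functions shown above.
from django.http import JsonResponse


def run_netease_spider(request):
    # Same page/offset value that main_a() passes in; default to the first page.
    page = int(request.GET.get("page", 0))
    # This call resolves to the later def begin_spider(page, url, web_name),
    # which shadows the earlier begin_spider(url, web_name, web=1) at import time.
    begin_spider(page, "https://music.163.com/", "网易云")
    return JsonResponse({"status": "started", "page": page})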