diff --git a/SpiderWebsite/demo/views.py b/SpiderWebsite/demo/views.py
index 3e05c02..084bbf9 100644
--- a/SpiderWebsite/demo/views.py
+++ b/SpiderWebsite/demo/views.py
@@ -279,3 +279,76 @@
 def begin_spider(url, web_name):
     datalist = getData(url)
     saveData(savepath,datalist,web_name)
+import csv
+import json
+import time
+
+import requests
+from lxml import etree
+
+
+def begin_zbj_spider(url, web_name):
+    # Spoof a real browser User-Agent (copied from the browser's F12 devtools)
+    # so the site does not reject the request on its User-Agent check.
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51"
+    }
+    response = requests.get(url=url, headers=headers)
+    html = etree.HTML(response.text)  # parse the page text with lxml's etree
+    # Absolute XPath of the search-result list, copied from the browser's F12 devtools.
+    divs = html.xpath("/html/body/div[6]/div/div/div[2]/div[6]/div[1]/div")
+    with open("data.csv", "w", encoding="utf-8", newline="") as f:
+        csv_writer = csv.writer(f)
+        for div in divs:
+            price_temp = div.xpath("./div/div/a[1]/div[2]/div[1]/span[1]/text()")  # price node
+            title_temp = div.xpath("./div/div/a[1]/div[2]/div[2]/p/text()")  # project-title node
+            company_temp = div.xpath("./div/div/a[2]/div[1]/p/text()")  # company-name node
+            address_temp = div.xpath("./div/div/a[2]/div[1]/div/span/text()")  # company-location node
+            # Some result cards are missing fields; skip those.
+            if price_temp and title_temp and company_temp and address_temp:
+                price = price_temp[0].strip("¥")  # drop the currency sign
+                # The page highlights the search keyword, which splits the title
+                # into text fragments; join them back around "SAAS".
+                title = "SAAS".join(title_temp)
+                company = company_temp[0]  # first text node holds the name
+                address = address_temp[0]  # first text node holds the location
+                csv_writer.writerow([price, title, company, address])
+
+
+# The NetEase comment API additionally checks the Host header.
+netease_headers = {
+    "Host": "music.163.com",
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51"
+}
+
+
+def begin_netease_spider(page, url, web_name):
+    """Fetch one page of comments from the NetEase Cloud Music comment API."""
+    # Comment API for song R_SO_4_483671599; `page` is the pagination offset.
+    url = 'https://music.163.com/api/v1/resource/comments/R_SO_4_483671599?limit=10&offset=' + str(page)
+    response = requests.get(url=url, headers=netease_headers)
+    print(response.status_code)  # quick sanity check of the HTTP status
+    result = json.loads(response.text)  # parse the JSON body
+    items = result['comments']  # the comment list (debug here if the structure is unclear)
+    with open('test.csv', 'a', encoding='utf-8-sig') as f:  # append this page's rows
+        for item in items:
+            # Swap ASCII commas for full-width ones so they do not break the
+            # comma-separated line written below.
+            user_name = item['user']['nickname'].replace(',', ',')
+            comment = item['content'].strip().replace('\n', ' ').replace(',', ',')
+            praise = str(item['likedCount'])  # like count
+            # 'time' is a millisecond timestamp; convert it to a readable date.
+            date = time.localtime(item['time'] // 1000)
+            date = time.strftime("%Y-%m-%d %H:%M:%S", date)
+            f.write(user_name + ',' + comment + ',' + praise + ',' + date + '\n')
+
+
+def main():
+    begin_zbj_spider("https://guilin.zbj.com/search/f/?type=new&kw=saas", "猪八戒")
+    for i in range(0, 100, 10):  # step matches limit=10 so no comments are skipped
+        begin_netease_spider(i, "https://music.163.com/", "网易云")
+        time.sleep(1)  # pause between paginated requests
+
+
+if __name__ == '__main__':
+    main()
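
A note on the hard-coded XPath in begin_zbj_spider: an absolute path like /html/body/div[6]/... copied from devtools breaks as soon as the page layout shifts. A relative, attribute-based query is usually more durable. The sketch below shows the pattern only; the class name "search-result-list" is a hypothetical placeholder, not zbj.com's actual markup, so inspect the live page for the real attribute.

    import requests
    from lxml import etree

    def find_result_cards(page_text):
        html = etree.HTML(page_text)
        # Select by a stable class attribute instead of absolute position;
        # the class name below is a made-up placeholder for illustration.
        return html.xpath('//div[contains(@class, "search-result-list")]/div')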
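The NetEase scraper builds each CSV line by string concatenation, which is why it has to swap commas in nicknames and comments for full-width ones. Handing the same four fields to csv.writer would quote them properly and keep the original text intact. A minimal sketch, assuming the same (user_name, comment, praise, date) tuples; write_comment_rows is a hypothetical helper name:

    import csv

    def write_comment_rows(rows, path='test.csv'):
        # newline='' stops the csv module from emitting blank lines on Windows;
        # utf-8-sig keeps the BOM that Excel expects, as in the original code.
        with open(path, 'a', encoding='utf-8-sig', newline='') as f:
            writer = csv.writer(f)
            writer.writerows(rows)  # each row is (user_name, comment, praise, date)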
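Both spiders also assume every request succeeds: begin_netease_spider prints the status code but parses the body regardless, so a 403 or rate-limit page would make json.loads fail with a confusing error. A small guard, sketched here with a hypothetical fetch_json helper, fails fast instead:

    import requests

    def fetch_json(url, headers):
        # timeout avoids hanging forever; raise_for_status() surfaces 4xx/5xx
        # responses instead of letting JSON parsing choke on an error page.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        return response.json()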