|
|
@ -279,3 +279,90 @@ def begin_spider(url, web_name):
|
|
|
|
datalist = getData(url)
|
|
|
|
datalist = getData(url)
|
|
|
|
saveData(savepath,datalist,web_name)
|
|
|
|
saveData(savepath,datalist,web_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import csv
|
|
|
|
|
|
|
|
from lxml import etree
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def begin_spider(url, web_name):
    """Scrape the zbj.com SaaS search results page and save rows to ``data.csv``.

    Every listing for which a price, a title, a company name and an address
    could all be extracted is written as one CSV row in the order
    ``[price, title, company, address]``. ``data.csv`` is overwritten on
    each call.

    Args:
        url: Accepted for interface compatibility. NOTE: the value is not
            used; the hard-coded search URL below is always requested.
        web_name: Accepted for interface compatibility; not used here.
    """
    # Hard-coded target page; it replaces whatever was passed in as ``url``.
    url = "https://guilin.zbj.com/search/f/?type=new&kw=saas"
    # Send a browser-like User-Agent to get past UA checks (the value can be
    # copied from the browser's F12 developer tools).
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51"
    }
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Parse the response text into an element tree.
    html = etree.HTML(response.text)
    # Absolute XPath of the result cards, taken from the browser dev tools.
    divs = html.xpath("/html/body/div[6]/div/div/div[2]/div[6]/div[1]/div")

    # ``with`` guarantees the file is flushed and closed; ``newline=''`` is
    # what the csv module requires to avoid doubled line endings on Windows.
    with open("data.csv", 'w', encoding='utf-8', newline='') as f:
        csv_writer = csv.writer(f)
        for div in divs:
            # XPath of the price
            price_temp = div.xpath("./div/div/a[1]/div[2]/div[1]/span[1]/text()")
            # XPath of the project title
            title_temp = div.xpath("./div/div/a[1]/div[2]/div[2]/p/text()")
            # XPath of the company name
            company_temp = div.xpath("./div/div/a[2]/div[1]/p/text()")
            # XPath of the company location
            address_temp = div.xpath("./div/div/a[2]/div[1]/div/span/text()")

            # Skip cards where any of the four fields is missing.
            if not (price_temp and title_temp and company_temp and address_temp):
                continue

            # Strip the currency symbol from the price.
            price = price_temp[0].strip("¥")
            # Glue the title fragments back together with "SAAS"
            # (presumably the search keyword sits in its own child element,
            # so text() only returns the pieces around it - TODO confirm).
            title = "SAAS".join(title_temp)
            # xpath() returns lists; take the first match as the plain value.
            company = company_temp[0]
            address = address_temp[0]
            csv_writer.writerow([price, title, company, address])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Run the zbj.com spider against the SaaS search results page."""
    search_url = "https://guilin.zbj.com/search/f/?type=new&kw=saas"
    site_label = "猪八戒"
    begin_spider(search_url, site_label)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: start the crawl only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
|
|
|
import time
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Request headers: a browser-like User-Agent to get past UA checks, and Host
# set to the domain being scraped. The User-Agent value can be copied from
# the browser's F12 developer tools.
headers = {
    "Host": "music.163.com",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51"
    ),
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def begin_spider(page, url, web_name):
    """Fetch one page of song comments and append them to ``test.csv``.

    Requests ten comments from the NetEase Cloud Music comment API starting
    at offset ``page``, prints the HTTP status code, and appends one CSV row
    per comment: ``nickname, content, liked count, local timestamp``.

    Args:
        page: Comment offset passed to the API's ``offset`` query parameter.
        url: Accepted for interface compatibility; not used, the API URL is
            built from a hard-coded endpoint.
        web_name: Accepted for interface compatibility; not used here.

    Raises:
        KeyError: If the decoded response has no ``comments`` entry.
    """
    # Imported locally so the block does not rely on a module-level csv import.
    import csv

    # Comment API endpoint for the hard-coded song id.
    api_url = 'https://music.163.com/api/v1/resource/comments/R_SO_4_483671599?limit=10&offset=' + str(page)
    # A timeout keeps the crawl from hanging forever on a stalled connection.
    response = requests.get(url=api_url, headers=headers, timeout=10)
    print(response.status_code)
    # Decode the JSON body and pull out the list of comment objects.
    result = json.loads(response.text)
    items = result['comments']

    # Open the output once per page instead of once per comment.
    # csv.writer quotes fields containing commas, quotes or newlines, which
    # the previous manual string concatenation did not (its replace() calls
    # were no-ops), so such comments no longer corrupt the row layout.
    with open('test.csv', 'a', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        for item in items:
            # User nickname
            user_name = item['user']['nickname']
            # Comment text, without surrounding whitespace
            comment = item['content'].strip()
            # Number of likes on the comment
            praise = str(item['likedCount'])
            # 'time' is in milliseconds; convert to whole seconds first.
            seconds = int(item['time']) // 1000
            date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(seconds))
            writer.writerow([user_name, comment, praise, date])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Crawl the first 100 comments page by page, pausing between requests."""
    # Must equal the ``limit=10`` in begin_spider's request URL. The previous
    # step of 20 skipped every other block of ten comments.
    page_size = 10
    for offset in range(0, 100, page_size):
        begin_spider(offset, "https://music.163.com/", "网易云")
        # Wait a second between requests to avoid hammering the server.
        time.sleep(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: start the comment crawl only when this file is run
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|
|