个人代码

master
pc7lxt4iy 4 years ago
parent 7afc85e45b
commit 86df8a89cf

@ -279,3 +279,90 @@ def begin_spider(url, web_name):
datalist = getData(url) datalist = getData(url)
saveData(savepath,datalist,web_name) saveData(savepath,datalist,web_name)
import csv
from lxml import etree
import requests
def begin_spider(url, web_name):
    """Scrape a ZBJ (zbj.com) search-result page and save listings to data.csv.

    For every result card found on the page, write one CSV row of
    [price, title, company, address] to ``data.csv`` (overwritten each run).

    :param url: search-result page to fetch (e.g. the 'saas' keyword search).
    :param web_name: site label supplied by the caller; currently unused here.
    """
    # Browser-like User-Agent so the request passes the site's UA check
    # (value taken from a real browser via F12 dev tools).
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51"
    }
    # NOTE(fix): the original shadowed `url` with a hard-coded address,
    # making the parameter dead; the parameter is now honored. The only
    # caller passes that same address, so behavior is unchanged.
    response = requests.get(url=url, headers=headers)
    html = etree.HTML(response.text)  # parse the HTML body with lxml
    # Absolute XPath of the result list, copied from the browser's F12 panel.
    # Fragile by nature: any page-layout change breaks it.
    divs = html.xpath("/html/body/div[6]/div/div/div[2]/div[6]/div[1]/div")
    # `newline=''` is required by the csv module to avoid blank rows on Windows;
    # `with` guarantees the file is closed (the original leaked the handle).
    with open("data.csv", 'w', encoding='utf-8', newline='') as f:
        csv_writer = csv.writer(f)
        for div in divs:
            # Relative XPaths into one result card.
            price_temp = div.xpath("./div/div/a[1]/div[2]/div[1]/span[1]/text()")
            title_temp = div.xpath("./div/div/a[1]/div[2]/div[2]/p/text()")
            Company_temp = div.xpath("./div/div/a[2]/div[1]/p/text()")
            Address_temp = div.xpath("./div/div/a[2]/div[1]/div/span/text()")
            # Some cards (ads, placeholders) miss one of the fields — skip them.
            if price_temp and title_temp and Company_temp and Address_temp:
                price = price_temp[0].strip("¥")        # drop the currency sign
                title = "SAAS".join(title_temp)          # re-join title fragments around the keyword
                Company = Company_temp[0]
                Address = Address_temp[0]
                csv_writer.writerow([price, title, Company, Address])
def main():
    """Entry point: crawl the ZBJ 'saas' search results."""
    target = "https://guilin.zbj.com/search/f/?type=new&kw=saas"
    begin_spider(target, "猪八戒")


if __name__ == '__main__':
    main()
import json
import time
import requests
# Request headers shared by every API call below: a fixed Host plus a
# browser-like User-Agent (copied from a real browser via F12 dev tools)
# so the request passes the site's UA check.
headers = {
'Host': 'music.163.com',
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51"
}
def begin_spider(page, url, web_name):
    """Fetch one page of NetEase Cloud Music comments and append them to test.csv.

    Each comment becomes one CSV line: ``user_name,comment,likes,datetime``.
    Commas inside fields are stripped so the hand-written CSV stays parseable.

    :param page: offset into the comment list (the API's ``offset`` parameter).
    :param url: currently ignored — the comment-API endpoint (song id
        R_SO_4_483671599) is hard-coded below; kept for signature compatibility.
    :param web_name: site label supplied by the caller; currently unused here.
    """
    # Hard-coded comment API for one specific song; `page` selects the offset.
    api_url = ('https://music.163.com/api/v1/resource/comments/'
               'R_SO_4_483671599?limit=10&offset=' + str(page))
    response = requests.get(url=api_url, headers=headers)
    print(response.status_code)  # quick visual progress / failure indicator
    # Parse the JSON payload; the comment list lives under the 'comments' key.
    result = json.loads(response.text)
    items = result['comments']
    # FIX: open the output file once per call instead of re-opening it in
    # append mode for every single comment; the redundant f.close() inside
    # the original `with` block and the no-op .replace('', '') are gone.
    with open('test.csv', 'a', encoding='utf-8-sig') as f:
        for item in items:
            # Strip commas from free-text fields so they can't break the CSV.
            user_name = item['user']['nickname'].replace(',', '')
            comment = item['content'].strip().replace(',', '')
            praise = str(item['likedCount'])
            # API timestamps are in milliseconds; the first 10 digits are the
            # Unix seconds, formatted as a local "YYYY-mm-dd HH:MM:SS".
            date = time.localtime(int(str(item['time'])[:10]))
            date = time.strftime("%Y-%m-%d %H:%M:%S", date)
            f.write(user_name + ',' + comment + ',' + praise + ',' + date + '\n')
def main():
    """Crawl comment pages at offsets 0, 20, ..., 80, pausing 1s between requests."""
    for offset in range(0, 100, 20):
        begin_spider(offset, "https://music.163.com/", "网易云")
        time.sleep(1)  # be polite to the server


if __name__ == '__main__':
    main()

Loading…
Cancel
Save