# Scrape the Kugou TOP500 chart with requests + BeautifulSoup

import time

import requests
from bs4 import BeautifulSoup

# A desktop User-Agent so the site serves the normal HTML page
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
}


def get_info(url):
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'html.parser')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('div.pc_temp_songlist > ul > li > a')
    durations = soup.select('span.pc_temp_tips_r > span')
    # `duration` rather than `time`, so the loop variable does not shadow the time module
    for rank, title, duration in zip(ranks, titles, durations):
        # Each link title reads "singer - song"; split once and guard
        # against titles that contain no dash
        parts = title.get_text().split('-')
        data = {
            'rank': rank.get_text().strip(),
            'singer': parts[0].strip(),
            'song': parts[1].strip() if len(parts) > 1 else '',
            'time': duration.get_text().strip()
        }
        print(data)


if __name__ == '__main__':
    # The chart spans 23 pages: 1-8888.html through 23-8888.html
    urls = ['http://www.kugou.com/yy/rank/home/{}-8888.html'.format(i) for i in range(1, 24)]
    for url in urls:
        get_info(url)
        time.sleep(1)  # pause between requests to stay polite
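
To keep the rows instead of just printing them, one option is to collect the data dicts and dump them to CSV. A minimal sketch with the standard csv module; save_rows and the output filename are illustrative, not part of the original script:

import csv

def save_rows(rows, path='kugou_top500.csv'):
    # rows: a list of dicts shaped like the `data` dict above
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['rank', 'singer', 'song', 'time'])
        writer.writeheader()
        writer.writerows(rows)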

The same kind of scraping can also be done as a Scrapy project. Here the project is named movie and the spider meiju; it collects the new-drama chart from meijutt.tv. Two pieces of setup after generating the project:

In items.py, delete the # in front of the name field so the item actually defines it.
In settings.py, uncomment lines 65-67 (the commented-out ITEM_PIPELINES block) so the pipeline below is registered.
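
For reference, after those edits the two files look roughly like this (a sketch of the default Scrapy scaffold; only the name field and the pipeline registration matter here):

# items.py
import scrapy

class MovieItem(scrapy.Item):
    name = scrapy.Field()

# settings.py (the uncommented block)
ITEM_PIPELINES = {
    'movie.pipelines.MoviePipeline': 300,
}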

meiju.py

import scrapy
from movie.items import MovieItem


class MeijuSpider(scrapy.Spider):
    name = "meiju"
    # Must match the domain in start_urls, otherwise the offsite
    # middleware drops follow-up requests (the original said meijutt.com)
    allowed_domains = ["meijutt.tv"]
    start_urls = ['https://www.meijutt.tv/topiclist/2024xinjutop.html']

    def parse(self, response):
        # One <ul> per chart entry inside the topic_box container
        movies = response.xpath('//div[@class="topic_box"]/ul')
        for each_movie in movies:
            item = MovieItem()
            # extract_first() returns None instead of raising when the node is missing
            item['name'] = each_movie.xpath('./li[@class="font_14"]/a/@title').extract_first()
            yield item
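
To sanity-check those XPath expressions before a full crawl, scrapy shell is handy. Start it against the page from start_urls, then evaluate the combined expression inside the shell:

scrapy shell 'https://www.meijutt.tv/topiclist/2024xinjutop.html'
# inside the shell, this should print the first title:
response.xpath('//div[@class="topic_box"]/ul/li[@class="font_14"]/a/@title').extract_first()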

pipelines.py

import json


class MoviePipeline(object):

    def open_spider(self, spider):
        self.file = open('log.txt', 'w', encoding='utf-8')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        # One JSON object per line; ensure_ascii=False keeps Chinese titles readable
        # (the original wrote str(item), leaving the json import unused)
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        # A pipeline must return the item so later pipeline stages still see it
        return item
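
With the pipeline registered in settings.py, the crawl is started from the project root in the usual Scrapy way; each title then lands as one JSON line in log.txt:

scrapy crawl meiju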

A separate snippet: reading an Excel file with pandas.

import pandas as pd

# Path to the Excel file
excel_file_path = 'path_to_your_excel_file.xlsx'

# Read the workbook; this assumes it has a single sheet
# (with several sheets, pick one via the sheet_name parameter)
df = pd.read_excel(excel_file_path)

# Print the DataFrame
print(df)
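
When the workbook does have several sheets, sheet_name selects one by name or index, and sheet_name=None loads all of them as a dict of DataFrames. A short sketch; the sheet name 'Sheet1' is illustrative:

# Load a single named sheet
df_one = pd.read_excel(excel_file_path, sheet_name='Sheet1')

# Load every sheet: returns {sheet name: DataFrame}
all_sheets = pd.read_excel(excel_file_path, sheet_name=None)
for name, sheet_df in all_sheets.items():
    print(name, sheet_df.shape)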