diff --git a/码源.txt b/码源.txt
new file mode 100644
index 0000000..2aea657
--- /dev/null
+++ b/码源.txt
@@ -0,0 +1,75 @@
+import requests
+from bs4 import BeautifulSoup
+import time
+headers = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
+}
+def get_info(url):
+    wb_data = requests.get(url, headers=headers)
+    soup = BeautifulSoup(wb_data.text, 'html.parser')
+    ranks = soup.select('span.pc_temp_num')
+    titles = soup.select('div.pc_temp_songlist > ul > li > a')
+    times = soup.select('span.pc_temp_tips_r > span')
+    # pair each rank with its "singer - song" title and its duration
+    for rank, title, song_time in zip(ranks, titles, times):
+        data = {
+            'rank': rank.get_text().strip(),
+            'singer': title.get_text().split('-')[0].strip(),
+            'song': title.get_text().split('-')[1].strip() if len(title.get_text().split('-')) > 1 else '',
+            'time': song_time.get_text().strip()
+        }
+        print(data)
+if __name__ == '__main__':
+    urls = ['http://www.kugou.com/yy/rank/home/{}-8888.html'.format(str(i)) for i in range(1, 24)]
+    for url in urls:
+        get_info(url)
+        time.sleep(1)
+
+
+
+
+In items.py, delete the # in front of name.
+In settings.py, remove the comment markers on lines 65-67.
+meiju.py
+import scrapy
+from movie.items import MovieItem
+class MeijuSpider(scrapy.Spider):
+    name = "meiju"
+    allowed_domains = ["meijutt.tv"]
+    start_urls = ['https://www.meijutt.tv/topiclist/2024xinjutop.html']
+
+    def parse(self, response):
+        movies = response.xpath('//div[@class="topic_box"]/ul')
+        for each_movie in movies:
+            item = MovieItem()
+            item['name'] = each_movie.xpath('./li[@class="font_14"]/a/@title').extract()[0]
+            yield item
+
+pipelines.py
+import json
+class MoviePipeline(object):
+    # def process_item(self, item, spider):
+    #     return item
+    def open_spider(self, spider):
+        self.file = open('log.txt', 'w', encoding='utf-8')
+    def close_spider(self, spider):
+        self.file.close()
+    def process_item(self, item, spider):
+        self.file.write(str(item) + '\n')
+        return item
+
+
+
+
+import pandas as pd
+
+# Path to the Excel file
+excel_file_path = 'path_to_your_excel_file.xlsx'
+
+# Read the Excel file with pandas.
+# This assumes the workbook contains a single sheet; if there are several, select one via the sheet_name parameter.
+df = pd.read_excel(excel_file_path)
+
+# Print the contents of the DataFrame
+print(df)
+
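
A note on the Scrapy part above: the edits to items.py and settings.py amount to declaring the name field and enabling the pipeline. Below is a minimal sketch of what the two files would look like after those edits, assuming the project was created with scrapy startproject movie (the project name movie is implied by the import from movie.items); the exact line numbers of the commented ITEM_PIPELINES block vary between Scrapy versions.

items.py
import scrapy

class MovieItem(scrapy.Item):
    # uncommented so the spider can assign item['name']
    name = scrapy.Field()

settings.py (the block that was commented out, now active)
ITEM_PIPELINES = {
    'movie.pipelines.MoviePipeline': 300,
}

With these in place, running scrapy crawl meiju from the project root makes MoviePipeline append every scraped item to log.txt.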
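
On the pandas snippet at the end: the sheet_name parameter mentioned in its comment selects which sheet to load. A small usage sketch, where the sheet name 'Sheet1' and the file path are placeholders to adjust:

import pandas as pd

excel_file_path = 'path_to_your_excel_file.xlsx'
# load one sheet by name (an integer position such as sheet_name=0 also works)
df = pd.read_excel(excel_file_path, sheet_name='Sheet1')
# sheet_name=None loads every sheet into a dict of {sheet name: DataFrame}
all_sheets = pd.read_excel(excel_file_path, sheet_name=None)
print(df.head())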