"""Scrape the stockstar.com A-share rank list and export it.

Recovered from a patch that added this script as ``爬虫代码.py``.

The script downloads a range of rank-list pages from quote.stockstar.com,
pulls the text of every table cell out of the HTML, groups the cells into
rows, and writes the rows to ``stockFile.txt`` (tab separated), to
``newestStockInfo.xls`` and to the console.
"""

import random
import re
import time
import urllib
import urllib.error
import urllib.request

PAGE_URL_TEMPLATE = 'http://quote.stockstar.com/stock/ranklist_a_3_1_{page}.html'
TXT_PATH = "stockFile.txt"
XLS_PATH = r"newestStockInfo.xls"

# Number of cells consumed per output row.
# NOTE(review): an earlier comment in this script said the page has 13
# columns, yet the code has always stepped by 6 -- confirm against the page.
COLUMNS_PER_ROW = 6

COLUMN_TITLES = ('序号', '代码', '简称', '最新价', '涨跌幅', '涨跌额', '5分钟涨幅')
TXT_HEADER = '序号\t代码\t简称\t最新价\t\t涨跌幅\t\t涨跌额\t\t5分钟涨幅'
CONSOLE_HEADER = '序号\t代码\t\t简称\t\t\t最新价\t\t\t涨跌幅\t\t\t涨跌额\t\t5分钟涨幅'

# One entry per browser.  Every entry ends with a comma: the previous version
# missed several, so adjacent literals were silently concatenated into one
# oversized, invalid User-Agent value.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64)',
    'Mozilla/5.0 (Windows NT 6.3; WOW64)',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
    'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
    'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
    'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) '
    'AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) '
    'AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) '
    'AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
    'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) '
    'AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 '
    'Safari/6533.18.5',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
]

# NOTE(review): the first pattern was an empty string in the previous
# version, which made every page yield zero cells.  It looks like the markup
# inside the literal was stripped at some point; it is restored here to span
# the table body -- confirm against the live page.
TABLE_BODY_RE = re.compile(r'<tbody[\s\S]*</tbody>')
# Text sitting between a closing '>' and the next '<'.
CELL_TEXT_RE = re.compile(r'>(.*?)<')


def set_style(font_name, font_height, bold=False):
    """Build an xlwt cell style with the given font and a border on all sides.

    Args:
        font_name: Font family name, e.g. ``'Times New Roman'``.
        font_height: Value assigned to the xlwt font ``height`` attribute.
        bold: Whether the font is bold.

    Returns:
        A new ``xlwt.XFStyle``.
    """
    # Imported here so the parsing helpers stay importable without xlwt.
    import xlwt

    style = xlwt.XFStyle()

    font = xlwt.Font()
    font.name = font_name
    font.height = font_height
    font.bold = bold
    font.colour_index = 4

    borders = xlwt.Borders()
    borders.left = 6
    borders.right = 6
    borders.top = 6
    borders.bottom = 6

    style.font = font
    style.borders = borders
    return style


def fetch_page(page):
    """Download one rank-list page and return its text, or None on failure.

    A random entry of ``USER_AGENTS`` is sent as the User-Agent header and
    echoed to the console.  HTTP and URL errors are printed and reported as
    ``None`` so the caller can skip the page instead of reusing a stale (or
    undefined) response.
    """
    url = PAGE_URL_TEMPLATE.format(page=page)
    agent = random.choice(USER_AGENTS)
    print(agent)
    request = urllib.request.Request(url=url, headers={"User-Agent": agent})

    try:
        with urllib.request.urlopen(request) as response:
            raw = response.read()
    except urllib.error.HTTPError as e:  # must precede URLError (its parent)
        print('page=', page, '', e.code)
        return None
    except urllib.error.URLError as e:
        print('page=', page, '', e.reason)
        return None

    return raw.decode('gbk')


def extract_cells(content):
    """Return the non-empty texts found between tags inside the table body.

    Returns an empty list when ``content`` holds no ``<tbody>...</tbody>``.
    """
    body = TABLE_BODY_RE.search(content)
    if body is None:
        return []
    return [text for text in CELL_TEXT_RE.findall(body.group(0)) if text != '']


def group_rows(cells, width=COLUMNS_PER_ROW):
    """Split ``cells`` into consecutive lists of ``width`` items.

    A trailing group shorter than ``width`` is dropped rather than raising
    ``IndexError`` further down.
    """
    complete = len(cells) - len(cells) % width
    return [cells[start:start + width] for start in range(0, complete, width)]


def format_txt_row(index, row):
    """Return the tab-separated text-file line for one row."""
    return '\t'.join([str(index)] + list(row))


def format_console_row(index, row):
    """Return the console line for one six-cell row."""
    return "{}\t{}\t{}\t\t{} \t{} \t{} \t{}".format(index, *row)


def scrape_pages(pages):
    """Fetch every page number in ``pages`` and return all extracted cells."""
    cells = []
    for page in pages:
        content = fetch_page(page)
        if content is not None:
            print('get page', page)
            cells.extend(extract_cells(content))
        # Pause a random 1-3 seconds between requests.
        time.sleep(random.randrange(1, 4))
    return cells


def write_txt(rows, path=TXT_PATH):
    """Write the header and all rows to ``path`` as UTF-8, one per line."""
    lines = [TXT_HEADER]
    lines.extend(format_txt_row(index, row) for index, row in enumerate(rows))
    with open(path, "w", encoding="utf-8") as out:
        out.write("\n".join(lines) + "\n")


def write_xls(rows, path=XLS_PATH):
    """Write the header and all rows to an .xls workbook at ``path``."""
    import xlwt

    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("SheetName_test")

    # Two shared styles instead of a fresh style object for every cell.
    header_style = set_style("Times New Roman", 220, True)
    cell_style = set_style("Times New Roman", 220, False)

    for col, title in enumerate(COLUMN_TITLES):
        sheet.write(0, col, title, header_style)
    for index, row in enumerate(rows):
        for col, value in enumerate([str(index)] + list(row)):
            sheet.write(index + 1, col, value, cell_style)

    workbook.save(path)


def print_table(rows):
    """Print the header and all rows to the console."""
    print(CONSOLE_HEADER)
    for index, row in enumerate(rows):
        print(format_console_row(index, row))


def main():
    """Scrape pages 1-9 and export the result to txt, xls and the console."""
    cells = scrape_pages(range(1, 10))
    leftover = len(cells) % COLUMNS_PER_ROW
    if leftover:
        print('warning: dropping', leftover, 'trailing cells (incomplete row)')
    rows = group_rows(cells)

    write_txt(rows)
    write_xls(rows)
    print_table(rows)

    input("按回车退出。")


if __name__ == "__main__":
    main()