#encoding:utf-8
"""Scrape per-stock quote figures from hq.gucheng.com into an .xls workbook.

The script downloads the stock index page, follows the stock links found
there (at most ``MAX_STOCKS`` of them), extracts the quote figures from each
stock page and writes one spreadsheet row per successfully fetched stock.
"""
import sys
from urllib.parse import urljoin

import requests
import xlwt
from lxml import etree

# Index page listing the links to the individual stock pages.
INDEX_URL = 'https://hq.gucheng.com/gpdmylb.html'

# Request headers sent with every HTTP request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}

# Upper bound on the number of stock pages fetched in one run.
MAX_STOCKS = 100

# Destination of the generated workbook.
OUTPUT_PATH = './text.xls'

# Titles written into row 0, after the leading name column.
COLUMN_TITLES = ['当前', '最高', '最低', '今开', '昨收', '涨停', '跌停', '换手率', '振幅', '成交量', '成交额', '内盘', '外盘', '量比', '涨跌幅', '市盈率(动)', '市净率', '流通市值', '总市值']


def write(booksheet, name, data, n):
    """Write one row into the sheet.

    Args:
        booksheet: Worksheet-like object exposing ``write(row, col, value)``.
        name: Value placed in column 0 of the row.
        data: Sequence of values placed in columns 1, 2, ... in order.
        n: Zero-based index of the row to write.
    """
    booksheet.write(n, 0, name)
    for column, value in enumerate(data, start=1):
        booksheet.write(n, column, value)


def gettext(url, headers):
    """Download ``url`` and return its body as text.

    Args:
        url: Address to fetch.
        headers: Mapping of HTTP headers sent with the request.

    Returns:
        The decoded response text, or ``None`` when the request fails
        (network error, timeout, invalid URL) or the status code is not 200.
    """
    try:
        response = requests.get(url=url, headers=headers, timeout=10)
    except requests.RequestException:
        # A single unreachable page must not abort the whole crawl; report
        # it through the same None signal used for non-200 responses.
        return None

    if response.status_code != 200:
        return None

    # Decode with the encoding detected from the content rather than the
    # one derived from the response headers.
    response.encoding = response.apparent_encoding
    return response.text


def getdata(url, headers):
    """Fetch one stock page and extract its quote figures.

    Args:
        url: Address of the stock page.
        headers: Mapping of HTTP headers sent with the request.

    Returns:
        A list with the text nodes matched by the three XPath queries, in
        query order, or ``None`` when the page could not be downloaded.
    """
    newtext = gettext(url, headers)
    if newtext is None:
        return None

    tree = etree.HTML(newtext)
    section = '//*[@id="stock_info"]/div[1]/section'
    current = tree.xpath(section + '/div[1]/em[1]/text()')
    summary = tree.xpath(section + '/dl/dd/text()')
    details = tree.xpath(section + '/div[2]/dl/dd/text()')
    return current + summary + details


def main():
    """Crawl the stock index and save the collected rows to ``OUTPUT_PATH``."""
    htmltext = gettext(INDEX_URL, HEADERS)
    if htmltext is None:
        print('打开失败')
        sys.exit(-1)

    html = etree.HTML(htmltext)
    links = '//*[@id="stock_index_right"]/div[3]/section/a'
    urls = html.xpath(links + '/@href')
    texts = html.xpath(links + '/text()')

    # Open the workbook and write the header row.
    workbook = xlwt.Workbook(encoding='utf-8')
    booksheet = workbook.add_sheet('Sheet 1', cell_overwrite_ok=True)
    write(booksheet, "股票", COLUMN_TITLES, 0)

    goodnum = 0
    badnum = 0
    # zip() stops at the shorter list and the slice caps the run, so an index
    # page with fewer than MAX_STOCKS links no longer raises IndexError.
    for text, href in list(zip(texts, urls))[:MAX_STOCKS]:
        # hrefs may be relative; resolve them against the index page URL
        # (absolute hrefs pass through unchanged).
        data = getdata(urljoin(INDEX_URL, href), HEADERS)
        if data is None:
            print(text, 'bad')
            badnum += 1
            continue
        print(text, 'good')
        goodnum += 1
        # Row 0 holds the titles, so the k-th successful stock goes in row k.
        write(booksheet, text, data, goodnum)

    print('成功:', goodnum, '\n失败:', badnum)
    workbook.save(OUTPUT_PATH)


if __name__ == '__main__':
    main()