1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from openpyxl import workbook


class GanJi():
    """Scrape second-hand housing listings from a Ganji city page into Excel.

    The listing index is paginated by appending ``o<page>`` to the base URL
    (``.../fang5/o1``, ``.../fang5/o2``, ...). Each listing becomes one row of
    ``ershoufang.xlsx`` in the current working directory.
    """

    # Browser-like User-Agent sent with every request.
    USER_AGENT = ('Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36')
    # Page size used to turn the total listing count into a page count.
    LISTINGS_PER_PAGE = 32
    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 30
    # Workbook file written by get().
    OUTPUT_FILE = 'ershoufang.xlsx'

    def __init__(self):
        super(GanJi, self).__init__()

    def get(self, url, max_pages=5):
        """Download listing pages under *url* and save them to a workbook.

        Args:
            url: Base listing URL ending in ``/``, e.g.
                ``"http://changde.ganji.com/fang5/"``.
            max_pages: Upper bound on the number of pages fetched. Defaults
                to 5; pass ``None` to fetch every page the site reports.

        Raises:
            requests.RequestException: On network failure, timeout or an
                HTTP error status.
            ValueError: If the total-listings counter cannot be found or
                parsed on the first page.
        """
        headers = {'User-Agent': self.USER_AGENT}

        first_page = self._fetch_soup(url + 'o1', headers)
        total = self._parse_total(first_page)
        # Integer ceiling division: a partially filled last page still counts.
        page_count = ((total + self.LISTINGS_PER_PAGE - 1)
                      // self.LISTINGS_PER_PAGE)
        print("一共%d页数据,共%s条" % (page_count, total))

        # Never request pages beyond what the site says exists.
        if max_pages is None:
            pages_to_fetch = page_count
        else:
            pages_to_fetch = min(max_pages, page_count)

        wb = workbook.Workbook()
        ws = wb.active
        ws.append(['名称', '描述', '地址', '总价', '均价', '链接'])

        for page in range(1, pages_to_fetch + 1):
            soup = self._fetch_soup(url + 'o' + str(page), headers)
            container = soup.find('div', class_="f-main-list")
            if container is None:
                # Layout changed or we ran past the last page: stop paging
                # but still save the rows collected so far.
                break
            items = container.find_all(
                'div', class_="f-list-item ershoufang-list")
            for item in items:
                ws.append(self._parse_listing(item, url))
            print("完成第" + str(page) + "页数据收集")

        wb.save(self.OUTPUT_FILE)

    def _fetch_soup(self, page_url, headers):
        """Fetch *page_url* and return its HTML parsed with lxml."""
        response = requests.get(
            page_url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'lxml')

    @staticmethod
    def _parse_total(soup):
        """Return the total listing count shown in ``<span class="num">``."""
        counter = soup.find('span', class_="num")
        if counter is None:
            raise ValueError("listing counter <span class='num'> not found")
        # The counter text carries a trailing measure word ("套").
        return int(counter.text.replace("套", ""))

    @staticmethod
    def _parse_listing(item, base_url):
        """Turn one listing element into a worksheet row.

        Returns ``[name, describe, address, total_price, avg_price, link]``.
        """

        def compact(text):
            # Drop the spaces and newlines used for HTML layout.
            return text.replace(" ", "").replace("\n", "")

        title_link = item.find('a', class_="js-title value title-font")
        name = title_link.text

        size_spans = item.find('dd', class_="dd-item size").find_all('span')
        describe = "".join(
            span.text.replace("\n", "") for span in size_spans
            if span.text != '')

        address = compact(item.find('span', class_="area").text)
        total_price = compact(item.find('div', class_="price").text)
        # The per-area price lives in the element with class "time".
        avg_price = compact(item.find('div', class_="time").text)

        # Prefer the href on the item itself; fall back to the title link.
        # NOTE(review): confirm which element carries the href on the live page.
        href = item.get('href') or title_link.get('href', '')
        # Resolve against the scraped URL so links point at the same city
        # host instead of a hard-coded one.
        link = urljoin(base_url, href)

        return [name, describe, address, total_price, avg_price, link]


if __name__ == '__main__':
    temp = GanJi()
    temp.get("http://changde.ganji.com/fang5/")
|