python爬虫练习

爬虫练习

爬虫练习1-爬取淘宝和千图网图片

import requests
import urllib
import re
import time
from xpinyin import Pinyin
def main(search_name='奶粉', save_dir="E:/temp/img/", pages=2):
    """Download the product images listed on Taobao search result pages.

    Args:
        search_name: Search keyword; it is percent-encoded into the query URL.
        save_dir: String prefix for the saved files. It is concatenated
            directly with the file name, so it should end with a path
            separator.
        pages: Number of result pages to fetch. Page ``i`` is requested with
            the offset parameter ``s = i * 44``.

    Every image is stored as ``<save_dir><page index><image index>.jpg``.
    The function sleeps 1 second after each image and 2 seconds after each
    page.
    """
    # Imported locally: a bare ``import urllib`` does not guarantee that the
    # ``urllib.parse`` / ``urllib.request`` submodules are loaded.
    import os
    import urllib.parse
    import urllib.request

    key = urllib.parse.quote(search_name)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    # Compiled once instead of on every page.
    imgre = re.compile(r'"pic_url":"(.*?)"', re.S)

    # urlretrieve cannot create missing directories, so make sure the target
    # directory exists before the first download.
    target_dir = os.path.dirname(save_dir)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    for i in range(pages):
        url = "https://s.taobao.com/search?q=" + key + "&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20180729&ie=utf8&bcoffset=3&ntoffset=3&p4ppushleft=1%2C48&s=" + str(i*44)
        req = requests.get(url, headers=headers)
        html = req.content.decode('utf-8', 'ignore')
        for j, img in enumerate(imgre.findall(html)):
            # The captured value has no scheme, so "http:" is prepended.
            imgurl = "http:" + img
            filename = save_dir + str(i) + str(j) + ".jpg"
            urllib.request.urlretrieve(imgurl, filename=filename)
            time.sleep(1)
        print("第%s页的图片已经获取完成，等待2秒" % str(i+1))
        time.sleep(2)
def get_58pic(keyword):
    """Print the image URLs found on the 58pic page for ``keyword``.

    A ``str`` keyword is converted with ``get_pinyin`` before it is placed in
    the page URL; any other value is inserted via ``str()``.
    """
    slug = get_pinyin(keyword) if isinstance(keyword, str) else str(keyword)
    page_url = "http://www.58pic.com/tupian/" + slug + ".html"
    request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    response = requests.get(page_url, headers=request_headers)
    # The page body is decoded as GBK; undecodable bytes are dropped.
    page = response.content.decode('gbk', 'ignore')
    pattern = re.compile(r'data-original="(.*?)"', re.S)
    for match in pattern.findall(page):
        # Keep only the part of the attribute value before the first "!".
        print(match.split("!")[0])
def get_pinyin(word):
    """Return the pinyin of ``word`` joined with an empty separator."""
    return Pinyin().get_pinyin(word, "")
# Script entry point: only the 58pic lookup is run here; main() is not called.
if __name__ == '__main__':
    get_58pic("七夕")

爬虫练习2-爬取赶集网二手房并存储到Excel

import requests
import os
from openpyxl import workbook  # 写入Excel表所用
from bs4 import BeautifulSoup
class GanJi():
    """Scrape Ganji housing listings and export them to an Excel workbook."""

    def __init__(self):
        super(GanJi, self).__init__()

    @staticmethod
    def _page_count(total, page_size=32):
        """Return how many pages of ``page_size`` items hold ``total`` items."""
        return (int(total) + page_size - 1) // page_size

    def get(self, url, max_pages=5):
        """Collect listings starting at ``url`` and save them to ershoufang.xlsx.

        Args:
            url: Listing base URL; page ``n`` is requested as ``url + 'o<n>'``.
            max_pages: Upper bound on the number of pages to fetch. ``None``
                fetches every page reported by the site.

        The workbook gets one header row followed by one row per listing
        (name, description, address, total price, average price, link).
        """
        # Imported locally so the block does not rely on a file-level import.
        from urllib.parse import urljoin

        user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'
        headers = {'User-Agent': user_agent}

        # The first page carries the total listing count, which determines
        # how many pages exist.
        first_page = requests.get(url + 'o1', headers=headers).text
        soup = BeautifulSoup(first_page, 'lxml')
        total = soup.find('span', class_="num").text.replace("套", "")
        page_count = self._page_count(total)
        print("一共%d页数据，共%s条" % (page_count, total))

        # Never request more pages than the site reports.
        pages_to_fetch = page_count if max_pages is None else min(page_count, max_pages)

        wb = workbook.Workbook()  # create the Excel workbook
        ws = wb.active  # the worksheet currently being written
        ws.append(['名称', '描述', '地址', '总价', '均价', '链接'])

        for page in range(1, pages_to_fetch + 1):
            page_html = requests.get(url + 'o' + str(page), headers=headers).text
            soup = BeautifulSoup(page_html, 'lxml')
            items = soup.find('div', class_="f-main-list").find_all('div', class_="f-list-item ershoufang-list")
            for item in items:
                name = item.find('a', class_="js-title value title-font").text  # listing title
                # Join the non-empty detail spans into one description string.
                spans = item.find('dd', class_="dd-item size").find_all('span')
                describe = "".join(
                    span.text.replace("\n", "") for span in spans if span.text != ''
                )
                address = item.find('span', class_="area").text.replace(" ", "").replace("\n", "")
                all_money = item.find('div', class_="price").text.replace(" ", "").replace("\n", "")
                ave_money = item.find('div', class_="time").text.replace(" ", "").replace("\n", "")
                # Resolve the link against the requested site instead of a
                # hard-coded host that belongs to a different city.
                house_url = urljoin(url, item['href'])
                ws.append([name, describe, address, all_money, ave_money, house_url])
            print("完成第" + str(page) + "页数据收集")
        wb.save('ershoufang.xlsx')
# Script entry point: run the scraper against the changde.ganji.com/fang5/ listing URL.
if __name__ == '__main__':
    temp = GanJi()
    temp.get("http://changde.ganji.com/fang5/")

爬虫练习3-使用Fiddler获取登录信息，利用cookie登录GitHub

获取cookie

import urllib.error, urllib.request, urllib.parse
import http.cookiejar
# Reference: https://www.cnblogs.com/xiaoxi-3-/p/7586072.html
# Use HTTPS: over plain HTTP the credentials travel unencrypted, and urllib
# follows the resulting redirect as a GET without the form body, so the login
# POST would never reach the HTTPS endpoint.
LOGIN_URL = 'https://github.com/session'
# get_url is the page fetched with the saved cookies; it requires a prior login.
get_url = 'https://github.com/mutoulazy?tab=repositories'
# Login form fields. NOTE(review): the token and credentials are hard-coded;
# the password is a placeholder and must be filled in before running.
values = {
    'commit':'Sign in',
    'utf8':'✓',
    'authenticity_token':'rep+oaSnsp9OMoZ8pvfBc/Qs54Fck3z5n+NOWiKEiQEc29zDuYNreqvKiBPZ8CRHToIBYRxdtB1SnCeZxXQkvw==',
    'login':'mutoulazy',
    'password':'*********'
    }
postdata = urllib.parse.urlencode(values).encode()
user_agent = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' \
             r' (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
headers = {'User-Agent':user_agent, 'Connection':'keep-alive'}
# Cookies are persisted locally in githubCookie.txt.
cookie_filename = 'githubCookie.txt'
cookie_aff = http.cookiejar.MozillaCookieJar(cookie_filename)
handler = urllib.request.HTTPCookieProcessor(cookie_aff)
opener = urllib.request.build_opener(handler)
request = urllib.request.Request(LOGIN_URL, postdata, headers)
try:
    response = opener.open(request)
except urllib.error.URLError as e:
    # Best effort: report the failure and continue with whatever cookies exist.
    print(e.reason)
# Keep session cookies and expired cookies too, so the file can be reloaded later.
cookie_aff.save(ignore_discard=True, ignore_expires=True)
for item in cookie_aff:
    print('Name ='+ item.name)
    print('Value ='+ item.value)
# Fetch get_url through the same opener so the login cookies are sent.
get_request = urllib.request.Request(get_url,headers=headers)
get_response = opener.open(get_request)
print(get_response.read().decode())

使用cookie反复登陆

import urllib.request, urllib.parse
import http.cookiejar
# Page to fetch with the previously saved login cookies.
get_url = 'https://github.com/mutoulazy?tab=repositories'
cookie_filename = 'githubCookie.txt'

# Load the saved cookies; load() without a filename falls back to the file
# name given to the jar's constructor.
cookie_aff = http.cookiejar.MozillaCookieJar(cookie_filename)
cookie_aff.load(ignore_discard=True, ignore_expires=True)

# Build an opener that attaches the loaded cookies to every request.
handler = urllib.request.HTTPCookieProcessor(cookie_aff)
opener = urllib.request.build_opener(handler)

# Request get_url with the cookies and print the decoded page.
get_request = urllib.request.Request(get_url)
get_response = opener.open(get_request)
page_bytes = get_response.read()
print(page_bytes.decode())