Web Scraping Exercises

Exercise 1: Scraping product images from Taobao and 千图网 (58pic.com)

import os
import re
import time
import urllib.parse
import urllib.request

import requests
from xpinyin import Pinyin


def main():
    """Download product thumbnails from Taobao search results."""
    search_name = '奶粉'
    key = urllib.parse.quote(search_name)
    file = "E:/temp/img/"
    os.makedirs(file, exist_ok=True)  # make sure the target directory exists
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    for i in range(0, 2):
        # Taobao paginates via the "s" parameter, 44 items per page
        url = "https://s.taobao.com/search?q=" + key + "&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20180729&ie=utf8&bcoffset=3&ntoffset=3&p4ppushleft=1%2C48&s=" + str(i*44)
        req = requests.get(url, headers=headers)
        html = req.content.decode('utf-8', 'ignore')
        # thumbnail URLs sit in the embedded JSON as "pic_url":"..."
        imgre = re.compile(r'"pic_url":"(.*?)"', re.S)
        imgs = re.findall(imgre, html)
        for j in range(len(imgs)):
            imgurl = "http:" + imgs[j]
            filename = file + str(i) + str(j) + ".jpg"
            urllib.request.urlretrieve(imgurl, filename=filename)
            time.sleep(1)
        print("Finished downloading images of page %s, waiting 2 seconds" % str(i+1))
        time.sleep(2)


def get_58pic(keyword):
    """Print the image URLs found for a keyword on 千图网 (58pic.com)."""
    if isinstance(keyword, str):
        url = "http://www.58pic.com/tupian/" + get_pinyin(keyword) + ".html"
    else:
        url = "http://www.58pic.com/tupian/" + str(keyword) + ".html"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    req = requests.get(url, headers=headers)
    html = req.content.decode('gbk', 'ignore')
    # lazily loaded images keep their real URL in the data-original attribute
    imgre = re.compile(r'data-original="(.*?)"', re.S)
    imgs = re.findall(imgre, html)
    for img in imgs:
        imgurl = img.split("!")[0]  # strip the "!..." suffix to get the original image
        print(imgurl)


def get_pinyin(word):
    """Convert a Chinese word into the pinyin string used in 58pic URLs."""
    pin = Pinyin()
    wordpinyin = pin.get_pinyin(word, "")
    return wordpinyin


if __name__ == '__main__':
    main()              # Taobao images
    get_58pic("七夕")   # 58pic image URLs
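
get_58pic only prints the image URLs it finds. As a small extension, here is a minimal sketch of saving those URLs to disk with requests; the folder path and file naming are assumptions, not part of the original script:

import os
import requests

def save_images(img_urls, folder="E:/temp/58pic/"):
    # folder is an assumed local path; adjust as needed
    os.makedirs(folder, exist_ok=True)
    for idx, imgurl in enumerate(img_urls):
        resp = requests.get(imgurl, timeout=10)
        if resp.status_code == 200:
            with open(os.path.join(folder, "%d.jpg" % idx), "wb") as f:
                f.write(resp.content)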

Exercise 2: Scraping second-hand housing listings from Ganji and saving them to Excel

import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook  # used to write the Excel file
from urllib.parse import urljoin


class GanJi():
    """Scraper for Ganji second-hand housing listings."""
    def __init__(self):
        super(GanJi, self).__init__()

    def get(self, url):
        user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'
        headers = {'User-Agent': user_agent}

        # First page: read the total number of listings shown on the page
        webData = requests.get(url + 'o1', headers=headers).text
        soup = BeautifulSoup(webData, 'lxml')
        total = soup.find('span', class_="num").text.replace("套", "")
        ave = int(total) / 32      # 32 listings per page
        forNum = int(ave)
        if forNum < ave:           # round up to cover the last partial page
            forNum = forNum + 1
        print("Found %d pages, %s listings in total" % (forNum, total))

        wb = Workbook()            # create the Excel workbook
        ws = wb.active             # the active worksheet
        ws.append(['Name', 'Description', 'Address', 'Total price', 'Unit price', 'Link'])

        for x in range(5):         # only the first 5 pages are collected here
            webData = requests.get(url + 'o' + str(x + 1), headers=headers).text
            soup = BeautifulSoup(webData, 'lxml')
            find_list = soup.find('div', class_="f-main-list").find_all('div', class_="f-list-item ershoufang-list")
            for dl in find_list:
                name = dl.find('a', class_="js-title value title-font").text  # listing title
                # the "size" dd holds several <span>s describing the flat
                describe = ""
                tempDD = dl.find('dd', class_="dd-item size").find_all('span')
                for tempSpan in tempDD:
                    if not tempSpan.text == '':
                        describe += tempSpan.text.replace("\n", "")
                address = dl.find('span', class_="area").text.replace(" ", "").replace("\n", "")   # address
                allMoney = dl.find('div', class_="price").text.replace(" ", "").replace("\n", "")  # total price
                aveMoney = dl.find('div', class_="time").text.replace(" ", "").replace("\n", "")   # average price
                hourseUrl = urljoin(url, dl['href'])  # absolute link built from the entry URL
                ws.append([name, describe, address, allMoney, aveMoney, hourseUrl])
            print("Finished collecting page " + str(x + 1))
        wb.save('ershoufang.xlsx')


if __name__ == '__main__':
    temp = GanJi()
    temp.get("http://changde.ganji.com/fang5/")
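
The page count above is computed with manual round-up arithmetic, and only the first five pages are actually fetched. Below is a minimal sketch of the same calculation with math.ceil, assuming the same 32-listings-per-page layout; how the loop would use it is only indicated in the comment:

import math

def page_count(total_listings, per_page=32):
    # per_page=32 mirrors the assumption baked into the division above
    return math.ceil(total_listings / per_page)

# Hypothetical use inside GanJi.get(), replacing the fixed range(5):
#   for x in range(page_count(int(total))):
#       ...fetch and parse page x + 1...
print(page_count(230))  # -> 8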

Exercise 3: Capturing login data with Fiddler and logging in to GitHub with cookies

Getting and saving the cookie

import http.cookiejar
import urllib.error
import urllib.parse
import urllib.request

# Reference: https://www.cnblogs.com/xiaoxi-3-/p/7586072.html
LOGIN_URL = 'https://github.com/session'
# get_url is a page that can only be accessed after logging in
get_url = 'https://github.com/mutoulazy?tab=repositories'

# Form fields captured with Fiddler; authenticity_token is GitHub's CSRF token
values = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': 'rep+oaSnsp9OMoZ8pvfBc/Qs54Fck3z5n+NOWiKEiQEc29zDuYNreqvKiBPZ8CRHToIBYRxdtB1SnCeZxXQkvw==',
    'login': 'mutoulazy',
    'password': '*********'
}
postdata = urllib.parse.urlencode(values).encode()

user_agent = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' \
             r' (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
headers = {'User-Agent': user_agent, 'Connection': 'keep-alive'}

# Save the cookies locally as githubCookie.txt
cookie_filename = 'githubCookie.txt'
cookie_aff = http.cookiejar.MozillaCookieJar(cookie_filename)
handler = urllib.request.HTTPCookieProcessor(cookie_aff)
opener = urllib.request.build_opener(handler)

request = urllib.request.Request(LOGIN_URL, postdata, headers)
try:
    response = opener.open(request)
except urllib.error.URLError as e:
    print(e.reason)

cookie_aff.save(ignore_discard=True, ignore_expires=True)
for item in cookie_aff:
    print('Name = ' + item.name)
    print('Value = ' + item.value)

# Use the freshly obtained cookies to request get_url
get_request = urllib.request.Request(get_url, headers=headers)
get_response = opener.open(get_request)
print(get_response.read().decode())
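
The authenticity_token above was captured once with Fiddler and is tied to that browser session, so it goes stale. A minimal sketch of fetching a fresh token before posting the form; it assumes the token is still exposed as a hidden input named authenticity_token on https://github.com/login, which may change:

import re
import urllib.request

def fetch_csrf_token(opener, headers):
    # Load the login form and pull the hidden authenticity_token field;
    # the regex is an assumption about GitHub's current markup.
    req = urllib.request.Request('https://github.com/login', headers=headers)
    html = opener.open(req).read().decode('utf-8', 'ignore')
    match = re.search(r'name="authenticity_token" value="(.*?)"', html)
    return match.group(1) if match else None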

Reusing the saved cookie to log in again

import http.cookiejar
import urllib.request

get_url = 'https://github.com/mutoulazy?tab=repositories'

# Load the cookies saved by the previous script
cookie_filename = 'githubCookie.txt'
cookie_aff = http.cookiejar.MozillaCookieJar(cookie_filename)
cookie_aff.load(ignore_discard=True, ignore_expires=True)

handler = urllib.request.HTTPCookieProcessor(cookie_aff)
opener = urllib.request.build_opener(handler)

# Request get_url with the restored session cookies
get_request = urllib.request.Request(get_url)
get_response = opener.open(get_request)
print(get_response.read().decode())
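
For comparison, the saved cookie file can also be replayed with requests, which accepts a standard CookieJar through its cookies argument. A minimal sketch assuming the githubCookie.txt produced above:

import http.cookiejar
import requests

cookies = http.cookiejar.MozillaCookieJar('githubCookie.txt')
cookies.load(ignore_discard=True, ignore_expires=True)

headers = {'User-Agent': 'Mozilla/5.0'}
# requests sends any CookieJar passed via the cookies argument
resp = requests.get('https://github.com/mutoulazy?tab=repositories',
                    headers=headers, cookies=cookies)
print(resp.status_code)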