1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
import datetime
import re
from urllib import request

import xlwt
# Scrape the Douban Read provider (publisher) index page and export one row
# per provider, plus a total and a collection timestamp, to an .xls workbook.

PROVIDER_LIST_URL = "https://read.douban.com/provider/all"
SITE_ROOT = "https://read.douban.com"
OUTPUT_PREFIX = "d:/豆瓣出版社汇总表"
HEADERS = ("序号", "出版社-URL", "LOGO-URL", "出版社名称", "在售作品数量")

# Capture groups, in order: relative provider link, logo URL, provider name,
# number of works on sale.
_PROVIDER_RE = re.compile(
    r'<a href="(.*?)" class="provider-item">'
    r'<div class="col-media">'
    r'<div class="cm-left avatar">'
    r'<div class="avatar"><img src="(.*?)"/></div></div>'
    r'<div class="cm-body">'
    r'<div class="name">(.*?)</div>'
    r'<div class="works-num">(.*?) 部作品在售</div>'
    r'</div></div></a>'
)


def fetch_html(url=PROVIDER_LIST_URL):
    """Download *url* and return its body decoded as UTF-8."""
    # The context manager closes the HTTP response even if read() fails.
    with request.urlopen(url) as response:
        return response.read().decode("utf-8")


def parse_providers(html):
    """Extract provider entries from the index page markup.

    Returns a list of ``(absolute_url, logo_url, name, works_on_sale)``
    tuples in page order; the list is empty when nothing matches.

    Raises:
        ValueError: if a captured works count is not an integer.
    """
    return [
        (SITE_ROOT + str(href), logo, name, int(count))
        for href, logo, name, count in _PROVIDER_RE.findall(html)
    ]


def build_workbook(providers, collected_at):
    """Build the report workbook for *providers*.

    ``collected_at`` is the ``datetime`` written into the "采集时间" cell.
    Returns ``(workbook, total_works_on_sale)``.
    """
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('sheet1', cell_overwrite_ok=True)

    # Every cell, header and data alike, uses the same bold style.
    font = xlwt.Font()
    font.name = 'Times New Roman'
    font.bold = True
    style = xlwt.XFStyle()
    style.font = font

    for col, title in enumerate(HEADERS):
        sheet.write(0, col, title, style)

    # Row 0 holds the headers, so data rows (and the serial number) start at 1.
    for row, (url, logo, name, count) in enumerate(providers, start=1):
        sheet.write(row, 0, row, style)
        sheet.write(row, 1, url, style)
        sheet.write(row, 2, logo, style)
        sheet.write(row, 3, name, style)
        sheet.write(row, 4, count, style)

    total = sum(entry[3] for entry in providers)

    # The summary rows are written unconditionally, so an empty scrape still
    # yields a valid report (total 0) instead of failing later on.
    summary_row = len(providers) + 1
    sheet.write(summary_row, 3, "合计", style)
    sheet.write(summary_row, 4, total, style)
    sheet.write(summary_row + 1, 3, "采集时间", style)
    sheet.write(
        summary_row + 1, 4, collected_at.strftime("%Y-%m-%d %H:%M:%S"), style
    )
    return workbook, total


def main():
    """Fetch the provider list, write the workbook, and print a summary."""
    providers = parse_providers(fetch_html())

    # A single timestamp keeps the sheet cell and the file name in agreement.
    now = datetime.datetime.now()
    workbook, total = build_workbook(providers, now)
    workbook.save(OUTPUT_PREFIX + now.strftime("%Y%m%d%H%M%S") + ".xls")

    print("数据写入excel文件完毕!")
    print("在售书数量合计:" + str(total))


if __name__ == "__main__":
    main()
|