爬虫目标
博主在假期遇到有家长拿着老师布置的作业来打印,结果是如下这样的电子书,还不能下载,所以打算编写爬虫进行下载。分析后发现书籍的链接全都通过 Js 加密了,这不能忍,怎么能阻止小学生做作业呢。
网站:https://mp.zhizhuma.com/book/shelf.htm?id=4872
所用模块:
# -*- coding:utf-8 -*- import requests import time import re import random import base64 import json import hashlib from Crypto.Cipher import AES from os import makedirs
爬虫结构
# Outline of the crawler. The three functions below are placeholders that only
# show the overall structure; their implementations are given in the sections
# that follow.


def get_encryptedData(ebookId):
    """Fetch the JSON that holds the link of every page of the book."""
    pass


def get_auth_key(data, differenceDate):
    """Build the real, time-limited link for one page."""
    pass


def download_and_save(datadict, differenceDate):
    """Download one page and save it to disk."""
    pass


if __name__ == '__main__':
    # Share link of the book to download.
    share_url = 'https://mp.zhizhuma.com/book/sample2.htm?id=52753&shelfId=4872'
    id_match = re.search(r'id=(\d+)', share_url)
    if id_match is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError('no "id=<digits>" parameter found in share_url')
    ebookId = id_match.group(1)

    # Fetch the page list of the book.
    encryptedData = get_encryptedData(ebookId)
    # Timestamp parameter returned with the page list; it is also used as the
    # name of the output folder.
    differenceDate = encryptedData.get('timestamp')
    # Create the output folder; exist_ok avoids a crash when re-running.
    makedirs(str(differenceDate), exist_ok=True)

    for data in encryptedData.get('data'):
        # Build the signed link for this page.
        pageNo_url = get_auth_key(data=data, differenceDate=differenceDate)
        print(pageNo_url)
        # Download the page.
        download_and_save(pageNo_url, differenceDate=differenceDate)
get_encryptedData()
通过分析网站的 Js,找到了这段数据的解密逻辑;我们使用 Python 来实现它,逻辑并不复杂。
def _pkcs7_unpad(padded, block_size=16):
    """Strip PKCS#7 padding from *padded* and return the payload bytes.

    Raises:
        ValueError: if the data is empty, not block-aligned, or the trailing
            bytes are not a valid PKCS#7 pad.
    """
    if not padded or len(padded) % block_size:
        raise ValueError("decrypted data is empty or not block-aligned")
    pad_len = padded[-1]
    if not 1 <= pad_len <= block_size:
        raise ValueError("invalid PKCS#7 padding length")
    if padded[-pad_len:] != bytes([pad_len]) * pad_len:
        raise ValueError("invalid PKCS#7 padding bytes")
    return padded[:-pad_len]


def get_encryptedData(ebookId):
    """Fetch and decrypt the page list of an e-book.

    Posts *ebookId* to the ``queryAllPageByEbookId`` endpoint, takes the
    base64 text in the ``encryptedData`` field of the JSON reply, decrypts it
    with AES-ECB and parses the plaintext as JSON.

    Args:
        ebookId: id of the book, as taken from the share link.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: if the reply has no ``encryptedData`` field or the
            decrypted data is not validly padded.
    """
    url = 'https://biz.zhizhuma.com/ebookpageservices/queryAllPageByEbookId.do'
    # NOTE(review): _timestamp/_nonce/_sign are fixed values, presumably
    # captured from a real browser request -- confirm they stay accepted.
    data = {
        "ebookId": ebookId,
        "_timestamp": "1586101527",
        "_nonce": "24430072-41ad-48cb-9c7f-880f990c0886",
        "_sign": "975F1339ED050BB789CD51D66E40DD6B",
    }
    response = requests.post(url=url, data=data, timeout=30)
    response.raise_for_status()
    encrypted = response.json().get('encryptedData')
    if not encrypted:
        raise ValueError("response does not contain 'encryptedData'")

    # AES decryption; the key was found in the site's JavaScript.
    cipher = AES.new("Suj4XDDt3jPsH9Jj".encode(), AES.MODE_ECB)
    decrypted = cipher.decrypt(base64.b64decode(encrypted))
    # Remove the padding by its declared length instead of stripping a fixed
    # byte value, so any plaintext length decodes correctly.
    # NOTE(review): assumes the service pads with PKCS#7 -- confirm.
    raw_data = _pkcs7_unpad(decrypted).decode("utf-8")
    return json.loads(raw_data)
get_auth_key()
auth_key 是拼接在图片链接后面的参数;不带这个参数,就获取不到图片。
auth_key 参数的生成,也是模仿 Js 的逻辑来写就好。
def get_auth_key(data, differenceDate):
    """Build the real (time-limited) image link for one page.

    Args:
        data: page entry; its ``pageNo`` and ``imgurl`` values are read.
        differenceDate: number subtracted from the current time in
            milliseconds when deriving the timestamp part of the key.

    Returns:
        A dict with the page number under ``"pageNo"`` and the image link
        with its ``auth_key`` query string under ``"url"``.
    """
    cdn_host = "https://cdnyuntisyspro.bookln.cn"
    user_id = "0"

    page_number = data.get('pageNo')
    # Keep only the part of the image link that follows the CDN host.
    image_path = data.get('imgurl').split(cdn_host)[1]
    nonce = str(random.random())

    # Current time in seconds, minus the whole seconds elapsed since
    # differenceDate, plus a fixed 15 second allowance.
    now_seconds = int(time.time())
    elapsed_seconds = int((int(time.time() * 1000) - differenceDate) / 1000)
    stamp = str(now_seconds - elapsed_seconds + 15)

    # The signature is the MD5 of path, timestamp, nonce, uid and a fixed
    # trailing token, joined with dashes.
    to_sign = "-".join([image_path, stamp, nonce, user_id]) + "-69731cbade6a64b58d60"
    signature = hashlib.md5(to_sign.encode()).hexdigest()

    query = 'auth_key=' + "-".join([stamp, nonce, user_id, signature])
    return {"pageNo": page_number, "url": cdn_host + image_path + "?" + query}
download_and_save()
def download_and_save(datadict, differenceDate):
    """Download one page image and save it as ``<differenceDate>/<pageNo>.png``.

    Args:
        datadict: dict with the image link under ``"url"`` and the page
            number under ``"pageNo"``.
        differenceDate: name of the (already existing) output folder.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = requests.get(url=datadict.get('url'), timeout=30)
    # Fail loudly on an error status so an error page is never written to
    # disk under a .png name.
    response.raise_for_status()
    target = str(differenceDate) + "/" + str(datadict.get('pageNo')) + ".png"
    with open(target, "wb") as file:
        file.write(response.content)
运行
可以很容易扩展成多线程。
大数据男孩原创,仅用于交流学习之用
版权声明:《 【爬虫项目】书链平台试卷下载 》为明妃原创文章,转载请注明出处!
最后编辑:2020-4-6 07:04:24
2022-03-17 15:15
2022-03-24 21:54