说明在许多情况下,您需要将错误通知给正在使用API的客户端。FastAPI使用Python的HTTPException异常进行处理,所以raise它。实例fromfastapiimportFastAPI,HTTPExceptionimportuvicornapp=FastAPI()@app.get("/user")asyncdefread_item(age:int,name:str):ifage<18:#返回客户端500状态码及错误原因raiseHTTPException(status_code=500,detail="age小于18")return{"name":name,"age":age}if__name__=='__main__':uvicorn.run(app=app,host="127.0.0.1",port=80,)添加自定义Headers@app.get("/user")asyncdefread_item(age:int,name:str):ifage<18:#返回客户端500状态码及错误原因raiseHTTPException(status_code=500,#detail可以是list、dict、strdetail={"msg":"age小于18"},#自定义headersheaders={"X-Error":"Theregoesmyerror"})return{"name":name,"age":age}自定义异常处理程序fromfastapiimportFastAPI,Requestfromfastapi.responsesimportJSONResponseimportuvicornapp=FastAPI()#异常参数类classUnicornException(Exception):def__init__(self,status_code:int,msg:str):#错误状态码self.status_code=status_code#错误内容self.msg=msg#异常响应@app.exception_handler(UnicornException)asyncdefunicorn_exception_handler(request:Request,exc:UnicornException):returnJSONResponse(#错误响应状态码status_code=exc.status_code,#错误返回内容格式content={"msg":exc.msg},)@app.get("/user/{name}")asyncdefread_item(name:str):ifname=="bigdata":#错误提示raiseUnicornException(status_code=404,msg="用户名错误,没有该用户")return{"name":name}if__name__=='__main__':uvicorn.run(app=app,host="127.0.0.1",port=80,)测试URL:http://127.0.0.1/user/bigdata?name=bigdataboy
实例代码fromfastapiimportFastAPIimportuvicornapp=FastAPI()@app.post("/user/{name}")#路径参数asyncdefread_item(name:str):return{"name":name}if__name__=='__main__':uvicorn.run(app=app,host="127.0.0.1",port=80,)导入验证fromfastapiimportFastAPI,Query可选参数name可传可不传#正常写法name为可选参数@app.get("/user/")asyncdefread_item(age:int,name:str=None):return{"name":name,"age":age}#验证写法name为可选参数@app.get("/user/")asyncdefread_item(age:int,name:str=Query(None)):return{"name":name,"age":age}必选参数name必须传#正常写法name为必选参数asyncdefread_item(age:int,name:str):return{"name":name,"age":age}#验证写法name为必选参数asyncdefread_item(age:int,name:str=Query(...)):return{"name":name,"age":age}默认参数值#正常写法name默认值asyncdefread_item(age:int,name:str="bigdataboy"):return{"name":name,"age":age}#验证写法name默认值asyncdefread_item(age:int,name:str=Query("bigdataboy")):return{"name":name,"age":age}Query验证Query主要支持以下验证长度验证:max_length、min_length正则验证:regex@app.get("/user")#路径参数asyncdefread_item(age:int,name:str=Query(...,#该参数必选min_length=1,#最小长度为1max_length=50,#最大长度为50#regex=re.compile(".+bigdataboy.+").pattern#使用编译的正则表达式需要导入re库regex=".+bigdataboy.+"#正则表达式匹配验证)):return{"name":name,"age":age}其他参数这些参数主要是对接口进行描述@app.get("/user")#路径参数asyncdefread_item(age:int,name:str=Query(...,#该参数必选title="name",#标题description="用户的新名字",#参数作用的描述alias="new-name",#接口参数别名,URL参数就使用该别名deprecated=True,#代表该参数,即将弃用)):return{"name":name,"age":age}
POST请求需要首先定义参数模型fromfastapiimportFastAPIfrompydanticimportBaseModelimportuvicorn#声明参数模型classItem(BaseModel):name:strdescription:str=Noneapp=FastAPI()#接受POST类型@app.post("/")asyncdefread_item(item:Item):#参数类型是returnitemif__name__=='__main__':uvicorn.run(app=app,host="127.0.0.1",port=80,)参数模型#导入模型类frompydanticimportBaseModelclassItem(BaseModel):#必要参数name:str#可选参数description:str=None参看文档http://127.0.0.1/docs请求测试importrequestsurl="http://127.0.0.1"data={"name":"bigdataboy","description":"",}r=requests.post(url=url,json=data)print(r.json())高级用法与路径参数一起使用fromfastapiimportFastAPIfrompydanticimportBaseModelimportuvicorn#声明参数类classItem(BaseModel):name:strdescription:str=Noneapp=FastAPI()@app.post("/user/{id}")#路径参数asyncdefread_item(id:int,item:Item):return{"id":id,**item.dict()}if__name__=='__main__':uvicorn.run(app=app,host="127.0.0.1",port=80,)
路径参数比如访问:127.0.0.1/user/123/fromfastapiimportFastAPIimportuvicornapp=FastAPI()@app.get("/user/{id}/")asyncdefmain(id:int):return{"id":id}if__name__=='__main__':uvicorn.run(app=app,host="127.0.0.1",port=80,)API文档地址:http://127.0.0.1/docs固定的路径参数@app.get("/user/id/")asyncdefmain():return{"id":"thecurrentuser"}@app.get("/user/{id}/")asyncdefmain(id:int):return{"id":id}预留路径值传入的路径参数只能在预留路径里,不然就报错fromfastapiimportFastAPI#导入枚举类fromenumimportEnumimportuvicorn#预留路径类继承str,文档将能够知道这些值的类型classModelName(str,Enum):#预留的路径alexnet="alexnet"resnet="resnet"lenet="lenet"app=FastAPI()@app.get("/model/{model_name}")asyncdefget_model(model_name:ModelName):#参数类型是预留路径类#第一种判定写法ifmodel_name==ModelName.alexnet:return{"model_name":model_name,"message":"DeepLearningFTW!"}#第二种判定写法ifmodel_name.value=="lenet":return{"model_name":model_name,"message":"LeCNNalltheimages"}#默认返回return{"model_name":model_name,"message":"Havesomeresiduals"}if__name__=='__main__':uvicorn.run(app=app,host="127.0.0.1",port=80,)路径转换器访问/files/bigdataboy/a.txt,file_path将能得到bigdataboy/a.txt#添加上:path即可@app.get("/files/{file_path:path}")asyncdefread_file(file_path:str):return{"file_path":file_path}查询参数查询参数是指?后面的&分割的键值对http://127.0.0.1:8000/items/?skip=0&limit=2fromfastapiimportFastAPIimportuvicornapp=FastAPI()fake_items_db=[{"item_name":"Foo"},{"item_name":"Bar"},{"item_name":"Baz"}]@app.get("/items/")#设置的默认值,当默认值是None时,表示该值为可选asyncdefread_item(skip:int=0,limit:int=2,default:str=None):#一个列表切片returnfake_items_db[skip:skip+limit]if__name__=='__main__':uvicorn.run(app=app,host="127.0.0.1",port=80,)注意如果声明可选时出现以下类似错误#声明可选limit:int=None#出现以下类似错误Incompatibletypesinassignment(expressionhastype"None",variablehastype"int")这时你需要这样做fromtypingimportOptionallimit:Optional[int]=None完整实例fromfastapiimportFastAPIfromtypingimportOptionalimportuvicornapp=FastAPI()@app.get("/items/")asyncdefread_item(skip:str,limit:Optional[int]=None):item={"skip":skip,"limit":limit}returnitemif__name__=='__main__':uvicorn.run(app=app,host="127.0.0.1",port=80,)
安装FastAPI#下载fastapi模块pipinstallfastapi-ihttps://pypi.tuna.tsinghua.edu.cn/simple#安装uvicorn用作服务器pipinstalluvicorn-ihttps://pypi.tuna.tsinghua.edu.cn/simpleget请求测试importuvicornfromfastapiimportFastAPIapp=FastAPI()@app.get("/")asyncdefget_demo():return{"message":"HelloWorld"}if__name__=='__main__':uvicorn.run(app=app,host="127.0.0.1",port=5000,debug=True)查看Api文档所有的Api接口文档,都会自动生成,不需要额外的Api测试软件http://127.0.0.1:5000/docs
Scrapy选择器xpath选择器xpath匹配后返回的结果是一个列表get提取list()里面的值#extractextract_first已被getall()get()方法取代defparse(self,response):#获取第一个结果,没有结果返回default设置的默认值Noneresponse.xpath("").get(default=0)#获取全部匹配结果response.xpath("").getall()正则获取list()里面的值defparse(self,response):#正则匹配返回list()的第一个值,没有匹配返回默认值response.xpath('').re_first(r'.',default=0)#匹配list()里面的所有值response.xpath('').re(r'href="(.+?)"')其他方法index(value)返回value所在列表位置的索引值。defparse(self,response):response.xpath('').index("大数据男孩")count(value)返回value出现的个数。defparse(self,response):response.xpath('').count("大数据男孩")css选择器提取css选择器匹配结果的方法同上defparse(self,response):response.css('a::attr(href)').getall()
安装&加载pip3installrequests-ihttps://mirrors.aliyun.com/pypi/simple/importrequestsGET请求#普通请求r=requests.get('https://bigdataboy.cn/')#带Query参数,等价于https://bigdataboy.cn/?key1=value1&key2=value2payload={'key1':'value1','key2':'value2'}r=requests.get('https://bigdataboy.cn/',params=payload)#带Headersheaders={'user-agent':'anoyi-app/0.0.1'}r=requests.get('https://bigdataboy.cn/',headers=headers)#带BasicAuthenticationr=requests.get('https://bigdataboy.cn/',auth=('user','pass'))POST请求POST请求-表单提交r=requests.post('https://bigdataboy.cn/',data={'key':'value'})POST请求-x-www-form-urlencodedheaders={'content-type':'application/x-www-form-urlencoded;charset=UTF-8'}r=requests.post('https://bigdataboy.cn/',headers=headers,data='key=value')POST请求-application/jsonpayload={'some':'data'}r=requests.post('https://bigdataboy.cn/',json=payload)其他请求#PUTr=requests.put('https://bigdataboy.cn/',data={'key':'value'})#DELETEr=requests.delete('https://bigdataboy.cn/')#HEADr=requests.head('https://bigdataboy.cn/')#OPTIONSr=requests.options('https://bigdataboy.cn/')网络响应-Response基本信息#状态码r.status_code#响应头r.headers#响应Cookier.cookies返回结果#文本内容r.text#二进制r.content#JSONr.json()#流r=requests.get('https://bigdataboy.cn/',stream=True)r.raw.read(10)常用方法URL编码fromrequests.utilsimportquotequote('a b')->'a%20b'URL解码fromrequests.utilsimportunquoteunquote('a%20b')->'a b'自动推断响应编码r.encoding=r.apparent_encoding下载文件r=requests.get('https://bigdataboy.cn/')open('bigdataboy.html','wb').write(r.content)上传文件files={'file':open('report.xls','rb')}r=requests.post(url,files=files)超时设置#单位:秒requests.get('https://bigdataboy.cn/',timeout=0.001)
说明selenium是一个自动化测试工具,而爬虫中使用它主要是为了解决requests无法执行JavaScript代码的问题。优点通过驱动浏览器,完全模拟浏览器的操作,比如跳转、输入、点击、下拉等…进而拿到网页渲染之后的结果,可支持多种浏览器,真正做到可见即可爬。缺点使用selenium本质上是驱动浏览器对目标站点发送请求,那浏览器在访问目标站点的时候,需要把静态资源都加载完毕,比如html、css、js这些文件,加载完成后,等待浏览器执行,最后呈现在页面。所以用它的坏处就是效率极低!所以我们一般用它来做登录验证,Js特别复杂,没有必要去解密的场景等。使用下载驱动以谷歌浏览器为例驱动下载地址:http://chromedriver.storage.googleapis.com/index.html找到本机安装的谷歌浏览器,与之对应的版本。(虽然驱动是32位,但是支持64位浏览器的)基本使用打开大数据男孩网站fromseleniumimportwebdriver#驱动的路径bro=webdriver.Chrome(executable_path="./chromedriver_win32/chromedriver.exe")#打开这个网址bro.get(url="https://bigdataboy.cn/")找到搜索按钮,并点击fromseleniumimportwebdriver#驱动的地址bro=webdriver.Chrome(executable_path="./chromedriver_win32/chromedriver.exe")bro.get(url="https://bigdataboy.cn/")#找到搜索按钮search_tab=bro.find_element_by_xpath('//span[@class="icon-search"]')#点击搜索按钮search_tab.click()输入搜索内容在输入内容之前,需要判断内容是否加载完成,给出的判断方案是:定时去查找某个标签,如果该标签存在了,则认为页面加载完成。fromseleniumimportwebdriver#驱动的地址bro=webdriver.Chrome(executable_path="./chromedriver_win32/chromedriver.exe")bro.get(url="https://bigdataboy.cn/")#找到搜索按钮search_tab=bro.find_element_by_xpath('//span[@class="icon-search"]')#点击搜索按钮search_tab.click()fromselenium.webdriver.common.byimportByfromselenium.webdriver.supportimportexpected_conditionsasECfromselenium.webdriver.support.uiimportWebDriverWait"""判断搜索页面的加载情况1.默认0.5秒判断一次。Wait()里poll_frequency参数设置2.判断标签的方法有:ID、NAME、CLASS_NAME、XPATH、TAG_NAME、LINK_TEXT、PARTIAL_LINK_TEXT、CSS_SELECTOR"""#60秒没有检测到id为keyword的标签表示,加载失败WebDriverWait(bro,60).until(EC.presence_of_element_located((By.ID,"keyword")))#找到输入框input_tab=bro.find_element_by_xpath('//input[@id="keyword"]')#输入值input_tab.send_keys("大数据男孩")#关闭浏览器#bro.close()
高阶函数使用—-函数式编程Python函数的参数可以是:字符串,整数…普通的变量,但Python函数的参数还能是一个函数,返回值也能是一个函数。函数作为参数函数fun()接受一个函数f作为参数首先定义一个主函数函数的第一个参数需要传入一个函数deffun(f,a:int,b:int):returnf(a)+f(b)定义参数函数,只是简单的返回参数deff(num):returnnum使用主函数print(fun(f,1,2))-------------------3函数作为返回值定义返回值函数f()deff(num):returnnum**num定义主函数fun(),直接返回f()函数deffun():#不能加括号,带括号就是调用returnf使用主函数第一个括号调用fun()函数,第二个括号调用返回的函数foo=fun()print(foo)#一个函数print(foo(2))#合并写法print(fun()(2))---------------<functionfat0x000001FD81A62268>44内置的高阶函数sort()排序普通排序使用list=[1,5,6,8,7,6,5,5,2]list.sort(reverse=True)#降序print(list)---------------------------[8,7,6,6,5,5,5,2,1]高级使用—按照指定规则排序按照首位数字的大小排序#作为参数的函数deffun(num):returnstr(num)[0]list=[11,51,61,82,72,64,52,512,21]list.sort(key=fun,reverse=True)print(list)------------------------------------[82,72,61,64,51,52,512,21,11]#使用匿名函数合并list=[11,51,61,82,72,64,52,512,21]list.sort(key=lambdanum:str(num)[0],reverse=True)print(list)------------------------------------[82,72,61,64,51,52,512,21,11]map()&filter()函数编程语言通常都会提供map,filter,reduce三个高阶函数。在Python3中,map和filter仍然是内置函数,但是由于引入了列表推导和生成器表达式,他们变得没有那么重要了。列表推导和生成器表达式具有了map和filter两个函数的功能,而且更易于阅读。map生成新的迭代类型list_m=map(lambdanum:num**2,[1,2,3,4,5,6,7])print(list(list_m))print(type(list_m))-------------------[1,4,9,16,25,36,49]<class'map'>列表推导实现map()list_m=[num**2fornumin[1,2,3,4,5,6,7]]print(list_m)-------------[1,4,9,16,25,36,49]同时迭代多个list,生成新的迭代类型的长度是最短list的长度list1=[10,20,30,40]list2=[1,2]list_m=map(lambdaa,b:a+b,list1,list2)print(list(list_m))-------------------[11,22]注意列表推导中这里是使用的笛卡尔积list1=[10,20,30,40]list2=[1,2]list_m=[a+bforainlist1forbinlist2]print(list_m)-------------[11,12,21,22,31,32,41,42]filter过滤filter参数函数的返回值需要是布尔值list_m=filter(lambdanum:num%2,[1,2,3,4,5,6,7])print(list(list_m))-------------------[1,3,5,7]列表推导实现filter()list_m=[numfornumin[1,2,3,4,5,6,7]ifnum%2==1]print(list_m)-------------[1,3,5,7]reducePython3中reduce被移动到了functools模块中fromfunctoolsimportreduce用法两两做运算fromfunctoolsimportreducenum=reduce(lambdaa,b:a+b,[1,2,3,4,5,6,7],0)#0为a的初始值print(num)--------
--28
爬虫目标博主在假期遇到有家长,拿着老师布置的作业来打印,结果是如下这样的电子书,还不能下载,所以打算编写爬虫进行下载,发现有书籍的链接全都通过Js加密了,这不能忍,怎么能阻止小学生做作业呢。网站:https://mp.zhizhuma.com/book/shelf.htm?id=4872所用模块:#-*-coding:utf-8-*-importrequestsimporttimeimportreimportrandomimportbase64importjsonimporthashlibfromCrypto.CipherimportAESfromosimportmakedirs爬虫结构#获取数据每一页的链接的Jsondefget_encryptedData(ebookId):pass#获取真正的连接,有时效defget_auth_key(data,differenceDate):#下载保存连接defdownload_and_save(datadict,differenceDate):if__name__=='__main__':#书籍的分享链接share_url='https://mp.zhizhuma.com/book/sample2.htm?id=52753&shelfId=4872'ebookId=re.search(r'id=(\d+)',share_url).group(1)#获取书籍的URLencryptedData=get_encryptedData(ebookId)#获取时间戳参数differenceDate=encryptedData.get('timestamp')#创建文件夹makedirs(str(differenceDate))fordatainencryptedData.get('data'):#获取加密的连接pageNo_url=get_auth_key(data=data,differenceDate=differenceDate)print(pageNo_url)#下载书籍download_and_save(pageNo_url,differenceDate=differenceDate)get_encryptedData()通过Js分析,发现这段Js的解密,我们使用Python来实现,逻辑并不复杂。defget_encryptedData(ebookId):url='https://biz.zhizhuma.com/ebookpageservices/queryAllPageByEbookId.do'data={"ebookId":ebookId,"_timestamp":"1586101527","_nonce":"24430072-41ad-48cb-9c7f-880f990c0886","_sign":"975F1339ED050BB789CD51D66E40DD6B",}j=requests.post(url=url,data=data).json().get('encryptedData')#AES解密密钥Js寻找cipher=AES.new("Suj4XDDt3jPsH9Jj".encode(),AES.MODE_ECB)raw_data=cipher.decrypt(base64.decodebytes(bytes(j,encoding='utf8'))).rstrip(b'\x0f').decode("utf-8")[:-1]j_data=json.loads(raw_data)returnj_dataget_auth_key()拼接图片链接后的参数,不带这个参数,获取不到图片auth_key参数的生成,也是模仿Js的来写就好。#获取真正的连接,有时效defget_auth_key(data,differenceDate):#页数pageNo=data.get('pageNo')#解密构造参数imgurl=data.get('imgurl').split('https://cdnyuntisyspro.bookln.cn')[1]uid="0"rand=str(random.random())timestamp=str(int(time.time())-int((int(time.time()*1000)-differenceDate)/1000)+15)sstring=imgurl+"-"+timestamp+"-"+rand+"-"+uid+"-69731cbade6a64b58d60"md5=hashlib.md5()md5.update(sstring.encode())md5hash=md5.hexdigest()authKey='auth_key='+time
stamp+"-"+rand+"-"+uid+"-"+md5hashurl="https://cdnyuntisyspro.bookln.cn"+imgurl+"?"+authKeyreturn{"pageNo":pageNo,"url":url}download_and_save()#下载保存defdownload_and_save(datadict,differenceDate):c=requests.get(url=datadict.get('url')).contentwithopen(str(differenceDate)+"/"+str(datadict.get('pageNo'))+".png","wb")asfile:file.write(c)运行可以很容易扩展成多线程。大数据男孩原创,仅用于交流学习之用
爬虫目标所用模块:requests、os、gevent爬虫结构项目结构classWZSpider():def__init__(self):#英雄数据self.herolist="https://pvp.qq.com/web201605/js/herolist.json"#英雄详情页self.infor="https://pvp.qq.com/web201605/herodetail/{}.shtml"#图片链接self.img_url="http://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{}/{}-bigskin-{}.jpg"#存储英雄详情页链接self.infor_urls=requests.get(url=self.herolist).json()defdownload(self,infor):defrun(self):下载方法把下载方法单独提出,能进行扩展协程,多线程等defdownload(self,infor):ename=infor.get("ename")cname=infor.get("cname")#创建目录path="infor/{}".format(cname)makedirs(path)try:#皮肤名字skins=infor.get("skin_name").split("|")exceptAttributeError:returnelse:forindex,skininenumerate(skins):img_url=self.img_url.format(ename,ename,index+1)img=requests.get(url=img_url)print(cname,skin,img_url)withopen("infor/"+cname+"/"+skin+".jpg","wb")asfile:file.write(img.content)调用defrun(self):glist=[gevent.spawn(self.download,url)forurlinself.infor_urls]gevent.joinall(glist)使用if__name__=='__main__':wzs=WZSpider()wzs.run()
爬虫目标通过抖音的分享链接,获取抖音的原视频(无水印)使用模块:requests、re爬虫结构主要为了最后获取视频时所要的参数classDYSpider():def__init__(self,share_url):#get获取3个参数item_idsmidu_codeself.share_url=share_urlself.item_ids=""self.mid=""self.u_code=""#get获取参数dytk后面大括号需要item_idsself.dytk_url="https://www.iesdouyin.com/share/video/{}/"self.dytk=""#获取信息接口getself.infor_url="https://www.iesdouyin.com/web/api/v2/aweme/iteminfo/"self.headers={"user-agent":"Mozilla/5.0(iPhone;CPUiPhoneOS11_0likeMacOSX)AppleWebKit/604.1.38(KHTML,likeGecko)Version/11.0Mobile/15A372Safari/604.1",}defget_imu(self):passdefget_dytk(self):passdefget_infor(self):passdefrun(self):passget_imu()获取item_idsmidu_code三个参数defget_imu(self):r=requests.get(url=self.share_url)dytk_data=r.urlself.item_ids=re.search(r'video/(.+?)/',dytk_data).group(1)self.mid=re.search(r'mid=(.+?)&',dytk_data).group(1)self.u_code=re.search(r'u_code=(.+?)&',dytk_data).group(1)get_dytk()为了获取dytk参数defget_dytk(self):url=self.dytk_url.format(self.item_ids)r=requests.get(url=url,headers=self.headers,params={"region":"CN","mid":self.mid,"u_code":self.u_code,"titleType":"title","utm_source":"copy_link","utm_campaign":"client_share","utm_medium":"android","app":"aweme",})self.dytk=re.search(r'dytk:"(.+?)"}\);',r.text).group(1)get_infor()没有对返回的json进行处理defget_infor(self):r=requests.get(url=self.infor_url,headers=self.headers,params={"item_ids":self.item_ids,"dytk":self.dytk,})print(r.json())运行方法defrun(self):self.get_imu()self.get_dytk()self.get_infor()使用dys=DYSpider("https://v.douyin.com/Wf6Rsa/")dys.run()