Python + Scrapy Proxy Integration: 讯代理 (Xun Proxy)

The previous article covered integrating 聚合代理 (Juhe proxy); this time we move on to 讯代理 (Xun proxy). The middleware code is as follows:

import json

import requests


class MaoyanXunProxyMiddleware(object):
    """
    Xun proxy (讯代理).
    Note: each API call fetches 10 proxy IPs at a time.
    """

    # ============== Initialize the middleware ==============
    def __init__(self):
        # Proxy API
        self.get_url = "?token=4nZufMcvqklMfwNjmiIXSseJ&num=10"
        # Test URL used to verify that a proxy works
        self.teep_url = ""
        # Proxy IP pool
        self.ip_list = []
        # Number of proxy IPs fetched per API call (adjust as needed)
        self.num = 10
        # Index of the proxy IP currently in use
        self.count = 0
        # How many times the current IP has been used
        self.evecount = 0

    # ============== Fetch proxy IPs ==============
    def getIPData(self):
        teep_data = requests.get(url=self.get_url).text
        self.ip_list.clear()
        for eve_ip in json.loads(teep_data)["RESULT"]:
            self.ip_list.append({"ip": eve_ip["ip"], "port": eve_ip["port"]})

    # ============== Apply the proxy to the outgoing request ==============
    def changeProxy(self, request):
        ip = self.ip_list[self.count - 1]["ip"]
        port = self.ip_list[self.count - 1]["port"]
        request.meta["proxy"] = "http://" + str(ip) + ":" + str(port)

    # ============== Verify the proxy IP ==============
    def verification(self):
        ip = self.ip_list[self.count - 1]["ip"]
        port = self.ip_list[self.count - 1]["port"]
        # Check that the proxy IP is usable, with a 5-second timeout
        requests.get(
            url=self.teep_url,
            proxies={"http": "http://" + str(ip) + ":" + str(port)},
            timeout=5,
        )

    # ============== Switch to another proxy IP ==============
    def ifUsed(self, request):
        # Handle the case where the current proxy IP is unusable
        try:
            self.changeProxy(request)
            self.verification()
        except Exception:
            if self.count == 0 or self.count == self.num:
                # Pool exhausted: fetch a fresh batch and start from the first IP
                self.getIPData()
                self.count = 1
            else:
                self.count = self.count + 1
            self.ifUsed(request)

    def process_request(self, request, spider):
        if self.count == 0 or self.count == self.num:
            self.getIPData()  # fetch proxy IP information
            self.count = 1
        # Rotate to the next IP after the current one has been used 3 times
        if self.evecount == 3:
            self.count = self.count + 1
            self.evecount = 0
        else:
            self.evecount = self.evecount + 1
        self.ifUsed(request)  # apply / switch the proxy IP
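
The getIPData method assumes the Xun proxy API returns JSON containing a RESULT array of objects with ip and port fields; that shape is inferred from how the middleware parses the response rather than stated in the post. A minimal, self-contained sketch of just that parsing step, using a hypothetical sample response:

import json

# Hypothetical response body in the shape the middleware expects;
# the real Xun proxy API response may differ.
sample_response = '{"RESULT": [{"ip": "117.90.1.1", "port": "9000"}, {"ip": "117.90.1.2", "port": "9001"}]}'

ip_list = [
    {"ip": item["ip"], "port": item["port"]}
    for item in json.loads(sample_response)["RESULT"]
]
print(ip_list)
# [{'ip': '117.90.1.1', 'port': '9000'}, {'ip': '117.90.1.2', 'port': '9001'}]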

Next, add the middleware to the settings file:

DOWNLOADER_MIDDLEWARES = {
    # 'maoyan.middlewares.MaoyanDownloaderMiddleware': 543,
    'maoyan.middlewares.MaoyanXunProxyMiddleware': 543,
}
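
To confirm the middleware is actually injecting a proxy, a throwaway spider like the sketch below can be dropped into the project; the spider name and the httpbin.org/ip endpoint are illustrative choices, not part of the original post. That endpoint echoes the caller's IP, so with a working proxy it should report the proxy's address rather than your own:

import scrapy

class ProxyCheckSpider(scrapy.Spider):
    # Hypothetical spider used only to verify that the proxy middleware works
    name = "proxy_check"
    start_urls = ["https://httpbin.org/ip"]

    def parse(self, response):
        # httpbin.org/ip returns the IP address the server saw for this request
        self.logger.info("Origin IP seen by the server: %s", response.text)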

Finally, with the proxy hooked up, the crawler keeps pulling data without a hitch. Pretty satisfying!