Several ways to attach a proxy to requests in the Scrapy framework

Requirement: redis [every method below uses Redis]. Usable proxies are stored as a Redis set; unusable proxies are stored as plain string keys.
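For reference, here is a minimal sketch of how the two Redis databases could be seeded. The db numbers (14 for the available set, 13 for the unavailable strings) and the set key "httpsproxy" match the middleware code below; the sample proxy addresses are placeholders to be replaced with proxies from your own provider.

import redis

# Usable proxies live in db 14 as a set under the key "httpsproxy";
# unusable proxies are later written into db 13 as plain string keys by rm_proxy().
r_available = redis.Redis(host="127.0.0.1", port=6379, db=14)

# Placeholder addresses -- replace with real proxies from your provider.
r_available.sadd("httpsproxy",
                 "https://10.0.0.1:8888",
                 "https://10.0.0.2:8888")

print(r_available.smembers("httpsproxy"))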

1. All requests share the same proxy IP. The proxy is replaced when a response comes back with a non-200 status, when the response body is shorter than a given length, or when any of various exceptions is raised. The failed proxy is moved to an "unavailable" pool and kept there for 24 hours so it will not be picked again during that period.

import redis
from scrapy.http import HtmlResponse
from twisted.internet.error import TimeoutError, DNSLookupError, \
    ConnectionRefusedError, ConnectionDone, ConnectError, \
    ConnectionLost, TCPTimedOutError
from scrapy.core.downloader.handlers.http11 import TunnelError
from twisted.internet import defer
from twisted.web.client import ResponseFailed

redis_pag = {
    "host": "127.0.0.1",
    "port": 6379,
    "db": 14
}
redis_rm = {
    "host": "127.0.0.1",
    "port": 6379,
    "db": 13
}
pool = redis.ConnectionPool(**redis_pag)
r_insert = redis.Redis(connection_pool=pool)
pool_rm = redis.ConnectionPool(**redis_rm)
rm = redis.Redis(connection_pool=pool_rm)


def select_proxy_list():
    """Return the proxies that are currently unavailable."""
    lists = rm.keys()
    lists = [str(proxy, encoding="utf-8") for proxy in lists]
    val_list = rm.mget(*lists)
    val_list = [str(proxy, encoding="utf-8") for proxy in val_list]
    return val_list


def rm_proxy(value):
    """
    1. Remove the expired/unusable proxy from the proxy pool.
    2. Store it in the "unavailable" pool.
    3. Give the unavailable entry a 24-hour TTL.
    """
    r_insert.srem("httpsproxy", value)
    proxskey = value.replace(".", "").replace(":", "").replace("//", "")
    rm.set(proxskey, value)
    rm.pexpire(proxskey, 24 * 60 * 60 * 1000)  # 24 hours, in milliseconds


def redom_proxy():
    """Return a random proxy from the pool."""
    proxy = r_insert.srandmember("httpsproxy")
    proxy = str(proxy, encoding="utf-8")
    return proxy


class ProxiesMiddleware:
    ALL_EXCEPTIONS = (defer.TimeoutError, TimeoutError, DNSLookupError,
                      ConnectionRefusedError, ConnectionDone, ConnectError,
                      ConnectionLost, TCPTimedOutError, ResponseFailed,
                      IOError, TunnelError)

    def __init__(self):
        self.proxy = redom_proxy()  # pick one random proxy for all requests

    def process_request(self, request, spider):
        spider.logger.info("[proxy] {}".format(self.proxy))
        request.meta["proxy"] = self.proxy

    def process_response(self, request, response, spider):
        # Some anti-bot pages return 200 but are always shorter than 3000 characters.
        if len(response.text) < 3000 or response.status in [403, 400, 405, 301, 302]:
            spider.logger.info("[proxy failed] {}".format(self.proxy))
            rm_proxy(self.proxy)
            while True:
                new_proxy = redom_proxy()
                if new_proxy not in select_proxy_list():
                    self.proxy = new_proxy
                    spider.logger.info("[switched to new proxy] {}".format(self.proxy))
                    break
            new_request = request.copy()
            new_request_l = new_request.replace(url=request.url)
            return new_request_l
        return response

    def process_exception(self, request, exception, spider):
        # Catch almost every network-related exception.
        if isinstance(exception, self.ALL_EXCEPTIONS):
            # Log the exception type.
            spider.logger.info("[Got exception] {}".format(exception))
            spider.logger.info("[retrying with a new proxy] {}".format(self.proxy))
            rm_proxy(self.proxy)
            while True:
                new_proxy = redom_proxy()
                if new_proxy not in select_proxy_list():
                    self.proxy = new_proxy
                    spider.logger.info("[switched to new proxy] {}".format(self.proxy))
                    break
            new_request = request.copy()
            new_request_l = new_request.replace(url=request.url)
            return new_request_l
        # Log any exception that was not caught above.
        spider.logger.info("[not contained exception] {}".format(exception))
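To take effect, the middleware has to be registered with the downloader in the project's settings. A minimal example, assuming the class lives in a module called middlewares.py inside a project named myproject (both names are placeholders):

# settings.py -- enable the proxy middleware.
# "myproject.middlewares" is a placeholder module path; point it at wherever
# ProxiesMiddleware is actually defined in your project.
DOWNLOADER_MIDDLEWARES = {
    "myproject.middlewares.ProxiesMiddleware": 543,
}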

2. Use a different random proxy for every request [if you also want to handle bad responses and exceptions, reuse the process_response and process_exception methods from the example above].

class ProxiesMiddleware:

    def process_request(self, request, spider):
        """Attach a freshly picked proxy to every request."""
        proxin = redom_proxy()  # helper defined in the first example
        spider.logger.info("[proxy] {}".format(proxin))
        request.meta["proxy"] = proxin
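Because process_request runs again for every retried request, each retry automatically picks up a fresh proxy, so this variant combines naturally with Scrapy's built-in RetryMiddleware. A sketch of the relevant settings; the concrete values are only illustrative:

# settings.py -- let the built-in RetryMiddleware re-issue failed requests;
# every retry passes through process_request again and gets a new proxy.
RETRY_ENABLED = True
RETRY_TIMES = 3                                   # illustrative value
RETRY_HTTP_CODES = [403, 400, 405, 500, 502, 503]  # illustrative value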

3. Force a proxy change after every 10000 requests [on errors and exceptions, switch proxies immediately].

class ProxiesMiddleware:
    ALL_EXCEPTIONS = (defer.TimeoutError, TimeoutError, DNSLookupError,
                      ConnectionRefusedError, ConnectionDone, ConnectError,
                      ConnectionLost, TCPTimedOutError, ResponseFailed,
                      IOError, TunnelError)

    def __init__(self):
        self.proxy = redom_proxy()  # pick one random proxy to start with
        self.count = 0

    def process_request(self, request, spider):
        if self.count % 10000 == 0:
            self.proxy = redom_proxy()
        self.count += 1
        spider.logger.info("[proxy] {}".format(self.proxy))
        request.meta["proxy"] = self.proxy

    def process_response(self, request, response, spider):
        # Some anti-bot pages return 200 but are always shorter than 3000 characters.
        if len(response.text) < 3000 or response.status in [403, 400, 405, 301, 302]:
            spider.logger.info("[proxy failed] {}".format(self.proxy))
            rm_proxy(self.proxy)
            while True:
                new_proxy = redom_proxy()
                if new_proxy not in select_proxy_list():
                    self.proxy = new_proxy
                    spider.logger.info("[switched to new proxy] {}".format(self.proxy))
                    break
            new_request = request.copy()
            new_request_l = new_request.replace(url=request.url)
            return new_request_l
        return response

    def process_exception(self, request, exception, spider):
        # Catch almost every network-related exception.
        if isinstance(exception, self.ALL_EXCEPTIONS):
            # Log the exception type.
            spider.logger.info("[Got exception] {}".format(exception))
            spider.logger.info("[retrying with a new proxy] {}".format(self.proxy))
            rm_proxy(self.proxy)
            while True:
                new_proxy = redom_proxy()
                if new_proxy not in select_proxy_list():
                    self.proxy = new_proxy
                    spider.logger.info("[switched to new proxy] {}".format(self.proxy))
                    break
            new_request = request.copy()
            new_request_l = new_request.replace(url=request.url)
            return new_request_l
        # Log any exception that was not caught above.
        spider.logger.info("[not contained exception] {}".format(exception))
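The 10000-request threshold is hard-coded above. If you prefer to keep it in the project settings, the middleware can read it through the standard from_crawler hook; a minimal sketch, assuming a custom setting named PROXY_CHANGE_EVERY (not a built-in Scrapy setting):

class ProxiesMiddleware:
    def __init__(self, change_every=10000):
        self.proxy = redom_proxy()
        self.count = 0
        self.change_every = change_every

    @classmethod
    def from_crawler(cls, crawler):
        # PROXY_CHANGE_EVERY is a custom project setting, not a Scrapy built-in.
        return cls(change_every=crawler.settings.getint("PROXY_CHANGE_EVERY", 10000))

    def process_request(self, request, spider):
        if self.count % self.change_every == 0:
            self.proxy = redom_proxy()
        self.count += 1
        request.meta["proxy"] = self.proxy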

4. Force a proxy change after a fixed time interval [on errors and exceptions, switch proxies immediately].

import time
import random


class ProxiesMiddleware:
    # Note: this variant assumes a redom_proxy() that also accepts a spiderhost
    # argument, so each source can draw from its own proxy pool.
    ALL_EXCEPTIONS = (defer.TimeoutError, TimeoutError, DNSLookupError,
                      ConnectionRefusedError, ConnectionDone, ConnectError,
                      ConnectionLost, TCPTimedOutError, ResponseFailed,
                      IOError, TunnelError)

    def __init__(self):
        self.proxy = redom_proxy()
        self.count = 0
        self.information = self.information_func()

    def information_func(self):
        """
        Compute the next point in time at which the proxy must be rotated:
        current time + 7000 seconds + a random integer between 50 and 200.
        """
        return time.time() + 7000 + random.randint(50, 200)

    def agentExecutable(self, name):
        """Called for every request to check whether it is time to rotate the proxy."""
        if time.time() > self.information:
            self.proxy = redom_proxy(spiderhost=name)
            self.information = self.information_func()

    def process_request(self, request, spider):
        source = request.meta.get("source")  # passed in from the spider to identify which source this crawl belongs to
        if self.count % 10000 == 0 and self.count != 0:
            spider.logger.info("[10000 requests reached, rotating proxy]")
            self.proxy = redom_proxy(spiderhost=source)
        self.count += 1
        self.agentExecutable(source)  # decide whether the proxy needs to be rotated
        spider.logger.info("[request url] {}".format(request.url))
        spider.logger.info("[proxy] {}".format(self.proxy))
        request.meta["proxy"] = self.proxy

    def process_response(self, request, response, spider):
        if len(response.text) < 3000 or response.status in [403, 400, 405, 301, 302]:
            source = request.meta.get("source")
            spider.logger.info("[proxy failed] {}".format(self.proxy))
            new_proxy = redom_proxy(spiderhost=source)
            self.proxy = new_proxy
            spider.logger.info("[switched to new proxy] {}".format(self.proxy))
            new_request = request.copy()
            new_request_l = new_request.replace(url=request.url)
            return new_request_l
        return response

    def process_exception(self, request, exception, spider):
        # Catch almost every network-related exception.
        if isinstance(exception, self.ALL_EXCEPTIONS):
            # Log the exception type.
            source = request.meta.get("source")
            spider.logger.info("[Got exception] {}".format(exception))
            spider.logger.info("[retrying with a new proxy] {}".format(self.proxy))
            new_proxy = redom_proxy(spiderhost=source)
            self.proxy = new_proxy
            spider.logger.info("[switched to new proxy] {}".format(self.proxy))
            new_request = request.copy()
            new_request_l = new_request.replace(url=request.url)
            return new_request_l
        # Log any exception that was not caught above.
        spider.logger.info("[not contained exception] {}".format(exception))
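This middleware expects every request to carry a "source" key in its meta, which the spider has to supply when it yields requests. A minimal sketch of what that looks like on the spider side; the spider name, URL and source label are placeholders:

import scrapy

class ExampleSpider(scrapy.Spider):
    # Spider name, start URL and the "source" label are placeholders.
    name = "example"

    def start_requests(self):
        yield scrapy.Request(
            "https://example.com/list",
            callback=self.parse,
            meta={"source": "example"},  # read by ProxiesMiddleware.process_request
        )

    def parse(self, response):
        pass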

Open problem I am still thinking about [assigning each Scrapy thread its own proxy and keeping that proxy in use persistently; I have not found a way to do this yet].
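One possible direction, untested and offered only as a sketch: if "thread" here really means Scrapy's concurrent per-domain download slots rather than OS threads, the middleware can keep a dict from hostname to proxy, so every request to the same host keeps reusing the same proxy until it fails.

from urllib.parse import urlparse


class SlotProxiesMiddleware:
    """Untested sketch: one sticky proxy per host (download slot)."""

    def __init__(self):
        self.slot_proxies = {}  # hostname -> proxy

    def process_request(self, request, spider):
        host = urlparse(request.url).hostname
        if host not in self.slot_proxies:
            self.slot_proxies[host] = redom_proxy()  # helper from the first example
        request.meta["proxy"] = self.slot_proxies[host]

    def process_exception(self, request, exception, spider):
        # On failure, drop the sticky proxy for this host so the next
        # request to it picks a fresh one.
        host = urlparse(request.url).hostname
        self.slot_proxies.pop(host, None)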