爬虫代理IP池构建(并可用性验证)

一、采集网上公示的免费代理ip:

目标网站:

- http://www.66ip.cn/
- http://www.ip3366.net/
- http://www.xiladaili.com/

代理IP采集代码如下(仅供参考,请勿给目标网站造成压力,博主概不负责哦~),代码很简单,就不详细说明了:

# Excerpt: the three scraper methods of the crawler class. They rely on
# `requests`, `lxml.etree`, `self.headers` (HTTP request headers) and
# `self.ips` (a queue of "ip:port" strings awaiting validation).
# NOTE(review): the page URLs below were reconstructed from the list of target
# sites given earlier in this article -- confirm them against the live sites.

# http://www.66ip.cn/
def ip66(self):
    """Scrape the 66ip front page and enqueue every "ip:port" it lists."""
    url = "http://www.66ip.cn/index.html"
    # The page is decoded as gb2312; undecodable bytes are dropped rather
    # than aborting the whole scrape.
    respones = requests.get(url=url, headers=self.headers, timeout=10).content.decode("gb2312", errors="ignore")
    html = etree.HTML(respones)
    jx = html.xpath('//table[@bordercolor="#6699ff"]/tr')
    # The first row is skipped (treated as the table header).
    for i in jx[1:]:
        host = i.xpath("./td[1]/text()")
        port = i.xpath("./td[2]/text()")
        if not host or not port:
            continue  # malformed row: one of the two cells has no text
        # ip
        ip = str(host[0]).strip() + ":" + str(port[0]).strip()
        self.ips.put(ip)


# http://www.ip3366.net/
def yun(self):
    """Scrape the ip3366 front page and enqueue every "ip:port" it lists."""
    url = "http://www.ip3366.net/"
    respones = requests.get(url=url, headers=self.headers, timeout=10).content.decode("gb2312", errors="ignore")
    html = etree.HTML(respones)
    jx = html.xpath('//table[@class="table table-bordered table-striped"]//tbody/tr')
    for i in jx:
        host = i.xpath("./td[1]/text()")
        port = i.xpath("./td[2]/text()")
        if not host or not port:
            continue  # malformed row: one of the two cells has no text
        # ip
        ip = str(host[0]).strip() + ":" + str(port[0]).strip()
        self.ips.put(ip)


# http://www.xiladaili.com/
def xiladaili(self):
    """Scrape the xiladaili "gaoni" listing and enqueue its first-column text."""
    url = "http://www.xiladaili.com/gaoni/"
    respones = requests.get(url=url, headers=self.headers, timeout=10).content.decode(errors="ignore")
    html = etree.HTML(respones)
    jx = html.xpath('//table[@class="fl-table"]//tbody/tr')
    for i in jx:
        cell = i.xpath("./td[1]/text()")
        if not cell:
            continue  # malformed row: the first cell has no text
        # IP address; the first cell is enqueued as-is, without appending a
        # port -- presumably it already reads "ip:port"; verify against the page.
        ip = str(cell[0]).strip()
        self.ips.put(ip)

二、代理ip的可用性验证:

IP可用性验证,我这里抓包了讯代理的API接口,如下所示:

API详情如下:

URL:http://www.xdaili.cn/ipagent/checkIp/ipList?ip_ports[]=<ip:端口号>(在 `ip_ports[]=` 之后传入待验证的 ip:端口号)

请求方式:GET(HTTP 协议)

三、完整代码如下:

import threading
import time  # kept from the original listing; no longer used after run() stopped sleeping
from concurrent.futures import ThreadPoolExecutor
from queue import Empty, Queue

import requests
from lxml import etree


class agent(object):
    """Collect free HTTP proxies from three public sites and keep the usable ones.

    The three scraper methods push candidate "ip:port" strings onto
    ``self.ips``; ``validation`` checks one candidate through the xdaili
    check API and appends usable ones to ``self.iplist``; ``run`` wires the
    two stages together.

    NOTE(review): the page URLs were reconstructed from the list of target
    sites and the API description in the accompanying article -- confirm
    them against the live sites before relying on them.
    """

    # Seconds to wait on any single HTTP request before giving up.
    REQUEST_TIMEOUT = 10
    # Upper bound on concurrent validation requests.
    MAX_VALIDATION_WORKERS = 16
    # Proxy check endpoint; the candidate "ip:port" is appended to it.
    CHECK_URL = "http://www.xdaili.cn/ipagent/checkIp/ipList?ip_ports[]="

    def __init__(self):
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"}
        # Candidates waiting to be validated.
        self.ips = Queue(1000)
        # Candidates the check API reported as usable.
        self.iplist = []

    def _fetch_rows(self, url, row_xpath, encoding="utf-8"):
        """Download ``url`` and return the elements matching ``row_xpath``.

        Returns an empty list when the request or the HTML parse fails, so a
        single unreachable site does not kill its scraper thread with a
        traceback.
        """
        try:
            response = requests.get(url=url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as exc:
            print("Failed to fetch {}: {}".format(url, exc))
            return []
        # Undecodable bytes are dropped rather than aborting the scrape.
        text = response.content.decode(encoding, errors="ignore")
        try:
            html = etree.HTML(text)
        except etree.LxmlError as exc:
            print("Failed to parse {}: {}".format(url, exc))
            return []
        if html is None:
            return []
        return html.xpath(row_xpath)

    @staticmethod
    def _cell_text(row, column):
        """Return the stripped first text node of ``row``'s ``column``-th <td>, or None."""
        found = row.xpath("./td[{}]/text()".format(column))
        return str(found[0]).strip() if found else None

    def _enqueue_ip_port_rows(self, rows):
        """Enqueue "ip:port" built from the first two cells of each row."""
        for row in rows:
            host = self._cell_text(row, 1)
            port = self._cell_text(row, 2)
            if host and port:
                self.ips.put(host + ":" + port)

    # Free xiladaili proxies
    def xiladaili(self):
        """Scrape the xiladaili "gaoni" listing and enqueue its first-column text."""
        rows = self._fetch_rows(
            "http://www.xiladaili.com/gaoni/",
            '//table[@class="fl-table"]//tbody/tr',
        )
        for row in rows:
            # The first cell is enqueued as-is, without appending a port --
            # presumably it already reads "ip:port"; verify against the page.
            ip = self._cell_text(row, 1)
            if ip:
                self.ips.put(ip)

    # 66 free proxy site
    def ip66(self):
        """Scrape the 66ip front page and enqueue every "ip:port" it lists."""
        rows = self._fetch_rows(
            "http://www.66ip.cn/index.html",
            '//table[@bordercolor="#6699ff"]/tr',
            encoding="gb2312",
        )
        # The first row is skipped (treated as the table header).
        self._enqueue_ip_port_rows(rows[1:])

    # Yun proxy (ip3366)
    def yun(self):
        """Scrape the ip3366 front page and enqueue every "ip:port" it lists."""
        rows = self._fetch_rows(
            "http://www.ip3366.net/",
            '//table[@class="table table-bordered table-striped"]//tbody/tr',
            encoding="gb2312",
        )
        self._enqueue_ip_port_rows(rows)

    # Proxy availability check
    def validation(self, ipp):
        """Check one "ip:port" candidate and record it in ``self.iplist`` if usable.

        A candidate is dropped when the API reports "请求超时" in the ``time``
        field, and also when the request fails or the response does not have
        the expected ``RESULT[0]`` structure.
        """
        url = self.CHECK_URL + ipp
        try:
            respones = requests.get(url=url, headers=self.headers, timeout=self.REQUEST_TIMEOUT).json()
            result = respones["RESULT"][0]
            if result["time"] == "请求超时":
                return
            # format() also copes with a numeric port in the JSON payload.
            available_ip = "{}:{}".format(result["ip"], result["port"])
        except (requests.RequestException, ValueError, KeyError, IndexError, TypeError) as exc:
            print("Could not validate {}: {}".format(ipp, exc))
            return
        # Usable proxy
        self.iplist.append(available_ip)
        print("[{}]可用,已入库~".format(available_ip))

    # Entry point
    def run(self):
        """Run the three scrapers and validate every candidate they produce.

        Candidates are validated while the scrapers are still running, so the
        bounded queue cannot fill up and block them. The method returns once
        all scrapers have finished, the queue is drained, and every pending
        validation has completed.
        """
        scrapers = [
            threading.Thread(target=source)
            for source in (self.xiladaili, self.ip66, self.yun)
        ]
        for worker in scrapers:
            worker.start()

        with ThreadPoolExecutor(max_workers=self.MAX_VALIDATION_WORKERS) as pool:
            while True:
                try:
                    candidate = self.ips.get(timeout=0.5)
                except Empty:
                    # Check liveness BEFORE emptiness: once every scraper is
                    # dead nothing can be enqueued any more, so an empty
                    # queue at that point really means "done".
                    if not any(worker.is_alive() for worker in scrapers) and self.ips.empty():
                        break
                    continue
                pool.submit(self.validation, candidate)


if __name__ == "__main__":
    agent().run()

注:以上内容仅供学习,请勿非法使用,一切责任与博主无关