Batch-grabbing IP proxies with a Python crawler

When scraping data with a crawler, you often need a pool of IP proxies so that no single IP hits the target site frequently enough to get banned.

IP proxies can be obtained from the xicidaili (西刺代理) site, which is what the script below scrapes.

So here is a Python program that fetches IP proxies and saves them to a local file.

Python version: 3.6.3

#grab ip proxies from xicidaili
import sys, time, requests
from multiprocessing.dummy import Pool as ThreadPool
from lxml import etree

IP_POOL = "ip_pool.py"                   #default output file
URL = "https://www.xicidaili.com/nn/"    #IP proxies, high anonymity (path assumed)
#URL = "https://www.xicidaili.com/wt/"   #IP proxies, http (path assumed)
RUN_TIME = time.strftime("%Y-%m-%d %H:%M", time.localtime())  #run time

#valid ip proxies, grouped by scheme
alive_ip = {"http": [], "https": []}
#thread pool with 20 workers
pool = ThreadPool(20)

#return the html text of a page
def get_html(url):
    #browser-like headers; the Referer value is assumed
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Referer": "https://www.xicidaili.com/",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1"
    }
    r = requests.get(url, headers=headers)
    r.encoding = "utf-8"
    return r.text

#test whether an ip proxy is alive
def test_alive(proxy):
    global alive_ip
    #key the proxies dict by the proxy's own scheme so requests actually uses it
    scheme = "https" if proxy.startswith("https") else "http"
    proxies = {scheme: proxy}
    try:
        #any stable, reachable page works as the test target; baidu.com is an assumption
        r = requests.get(scheme + "://www.baidu.com", proxies=proxies, timeout=3)
        if r.status_code == 200:
            alive_ip[scheme].append(proxy)
    except requests.RequestException:
        print("%s is invalid!" % proxy)

#parse the html and collect candidate ip proxies
def get_alive_ip_address():
    iplist = []
    html = get_html(URL)
    selector = etree.HTML(html)
    table = selector.xpath('//table[@id="ip_list"]')[0]
    lines = table.xpath('./tr')[1:]   #skip the header row
    for line in lines:
        speed, connect_time = line.xpath('.//div/@title')   #e.g. "0.123秒"
        data = line.xpath('./td')
        ip = data[1].xpath('./text()')[0]
        port = data[2].xpath('./text()')[0]
        anonymous = data[4].xpath('./text()')[0]
        ip_type = data[5].xpath('./text()')[0]
        #skip slow proxies and any that are not high-anonymity ("高匿")
        if float(speed[:-1]) > 1 or float(connect_time[:-1]) > 1 or anonymous != "高匿":
            continue
        iplist.append(ip_type.lower() + "://" + ip + ":" + port)
    pool.map(test_alive, iplist)

#write the valid ip proxies to a local file
def write_txt(output_file):
    with open(output_file, "w") as f:
        f.write("#create time: %s\n\n" % RUN_TIME)
        f.write("http_ip_pool = \\\n")
        f.write(str(alive_ip["http"]).replace(",", ",\n"))
        f.write("\n\n")
    with open(output_file, "a") as f:
        f.write("https_ip_pool = \\\n")
        f.write(str(alive_ip["https"]).replace(",", ",\n"))
    print("write successful: %s" % output_file)

def main():
    get_alive_ip_address()
    write_txt(output_file)

if __name__ == "__main__":
    try:
        output_file = sys.argv[1]   #first command-line argument is the output file name
    except IndexError:
        output_file = IP_POOL
    main()
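The liveness checks run in parallel through multiprocessing.dummy, which exposes the multiprocessing Pool interface backed by threads rather than processes, so pool.map(test_alive, iplist) spreads the network-bound checks across the 20 workers. A minimal sketch of that pattern, with a toy function standing in for test_alive:

from multiprocessing.dummy import Pool as ThreadPool

def check(x):
    #stand-in for a network-bound task such as test_alive
    return x * x

pool = ThreadPool(4)               #4 worker threads
print(pool.map(check, [1, 2, 3]))  #[1, 4, 9]; blocks until every task finishes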

Run the program:

root@c:test$ python get_ip_proxies.py
write successful: ip_pool.py

View the file (the IP addresses are redacted here, leaving only the ports):

root@c:test$ vim ip_pool.py

#create time: 2019-03-14 19:53

http_ip_pool = \
[:9999,
 :9999,
 :9999,
 :9999,
 :8118,
 :61320,
 :9999,
 :9999,
 :9999,
 :40274,
 :9999,
 :9999,
 :9999,
 :9999,
 :9999,
 :9999,
 :9999,
 :9999,
 :9999,
 :8118,
 :9999,
 :9999,
 :9999,
 :9999,
 :9999,
 :8123,
 :8123,
 :9999,
 :9999,
 :9999,
 :1133,
 :8010,
 :9999,
 :9999,
 :9999]

https_ip_pool = \
[:9999,
 :808,
 :39894,
 :42788,
 :33323,
 :9999,
 :9999,
 :3128,
 :52699,
 :9999,
 :43296,
 :50465,
 :9999,
 :32741,
 :9999,
 :9999,
 :9999,
 :9999,
 :9999,
 :9999,
 :9999,
 :41306,
 :9999,
 :9999,
 :9999,
 :1133,
 :56882,
 :9999,
 :8118,
 :8118,
 :9999,
 :9999,
 :9999,
 :53281,
 :9999,
 :32431,
 :61408]

After that, the two pools can be imported and used directly:

from ip_pool import http_ip_pool, https_ip_pool
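For example, a minimal sketch that picks a random proxy from the pool for each request (the target URL is a placeholder):

import random
import requests
from ip_pool import http_ip_pool

#choose a different proxy per request to spread traffic across IPs
proxy = random.choice(http_ip_pool)
r = requests.get("http://example.com", proxies={"http": proxy}, timeout=3)
print(r.status_code)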
