After my IP got banned: scraping pages with coroutines using aiohttp and asyncio

I practised until my IP got banned, so it was time to find a proxy.

Still the Douban Top 250. If anything here falls short, I'd appreciate pointers; teaching myself is painful.

import aiohttp
import asyncio
from lxml import etree
import time
import json
import requests
import random
from fake_useragent import UserAgent

ua = UserAgent()
headers = {'User-Agent': ua.random}
result = []
urls = []
# the proxy provider's API (base address not shown); each visit returns a new IP, valid for about a minute
url_ip = '?act=get&num=1&time=60&plat=1&re=0&type=0&ow=1&prov=%E5%AE%89%E5%BE%BD%E7%9C%81'

for i in range(10):
    urls.append('https://movie.douban.com/top250?start={}&filter='.format(i * 25))
print(urls)

def get_proxies():  # fetch a new proxy IP
    proxy = requests.get(url_ip).text
    proxies = {
        'http': 'http://' + proxy,
        'https': 'https://' + proxy
    }
    return proxies

p = get_proxies()  # set the initial proxy address

async def get_title(url, p):  # the actual scraping logic
    s_time = time.time()
    async with aiohttp.ClientSession() as session:
        async with session.get(url, proxy=p['http'], headers=headers) as resp:
            try:
                print(url + " connected OK")
                html = await resp.read()  # can't use .text() here
                titles = etree.HTML(html).xpath('.//div[@id="content"]/div/div[1]/ol')
                for i in titles[0]:  # the [0] is essential
                    dict = {}
                    dict['paiming'] = int(i.xpath('.//div/div[1]/em/text()')[0])
                    dict['rank'] = i.xpath('.//div/div[2]/div[2]/div/span[2]/text()')[0]
                    dict['title'] = i.xpath('.//div[@class="hd"]/a/span[1]/text()')[0]
                    in_url = i.xpath('.//div[@class="pic"]/a/@href')[0]
                    async with session.get(in_url, proxy=p['http'], headers=headers) as resp2:  # scrape the second-level page
                        html2 = await resp2.read()
                        content = etree.HTML(html2).xpath('//*[@id="link-report"]/span[1]/text()')
                        if content != ['\n', '\n', '\n']:
                            pass
                        else:
                            # the synopsis sits in one of two places
                            content = etree.HTML(html2).xpath('//*[@id="link-report"]/span[1]/span/text()')
                        dict['content'] = content
                    result.append(dict)
                    print(dict['title'])
            except:
                print("problem with this IP, switching to a new one")
                get_title(url, p=get_proxies())  # on failure, grab a fresh IP and retry
            e_time = time.time()
            print(url, "finished scraping this URL, took: %f" % float(e_time - s_time))

def main():
    tasks = [get_title(url, p) for url in urls]
    asyncio.run(asyncio.wait(tasks))  # new-style call
    result.sort(key=lambda x: x["paiming"])  # sort by rank
    s = json.dumps(result, indent=4, ensure_ascii=False)
    with open('xiecheng', 'w', encoding='utf-8') as f:
        f.write(s)

if __name__ == '__main__':
    start = time.time()
    main()
    print('total time: %.5f s' % float(time.time() - start))
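Since the proxy pool hands out short-lived IPs, it can help to sanity-check a freshly fetched proxy before launching the crawl. This helper is not part of the post's code; the test URL and timeout below are arbitrary choices, just a sketch.

import requests

def proxy_works(proxies, test_url='http://httpbin.org/ip', timeout=5):
    # True if a simple request through the proxy succeeds within `timeout` seconds
    try:
        requests.get(test_url, proxies=proxies, timeout=timeout)
        return True
    except requests.RequestException:
        return False

# possible use with the post's get_proxies():
# p = get_proxies()
# while not proxy_works(p):
#     p = get_proxies()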

Earlier I set the initial proxy to 1.1.1.1:80 to see whether the scrape could still succeed when the very first proxy is unusable. It failed, with an error saying it could not connect:

import aiohttp
import asyncio
from lxml import etree
import time
import json
import requests
import random
from fake_useragent import UserAgent

ua = UserAgent()
headers = {'User-Agent': ua.random}
result = []
urls = []
# the proxy provider's API (base address not shown); each visit returns a new IP, valid for about a minute
url_ip = '?act=get&num=1&time=60&plat=1&re=0&type=0&ow=1&prov=%E5%AE%89%E5%BE%BD%E7%9C%81'

for i in range(10):
    urls.append('https://movie.douban.com/top250?start={}&filter='.format(i * 25))
print(urls)

def get_proxies():  # fetch a new proxy IP
    proxy = requests.get(url_ip).text
    proxies = {
        'http': 'http://' + proxy,
        'https': 'https://' + proxy
    }
    return proxies

p = {
    'http': 'http://1.1.1.1:80',
    'https': 'https://1.1.1.1:80'
}  # set the initial proxy address

async def get_title(url, p):  # the actual scraping logic
    s_time = time.time()
    async with aiohttp.ClientSession() as session:
        async with session.get(url, proxy=p['http'], headers=headers) as resp:
            try:
                print(url + " connected OK")
                html = await resp.read()  # can't use .text() here
                titles = etree.HTML(html).xpath('.//div[@id="content"]/div/div[1]/ol')
                for i in titles[0]:  # the [0] is essential
                    dict = {}
                    dict['paiming'] = int(i.xpath('.//div/div[1]/em/text()')[0])
                    dict['rank'] = i.xpath('.//div/div[2]/div[2]/div/span[2]/text()')[0]
                    dict['title'] = i.xpath('.//div[@class="hd"]/a/span[1]/text()')[0]
                    in_url = i.xpath('.//div[@class="pic"]/a/@href')[0]
                    async with session.get(in_url, proxy=p['http'], headers=headers) as resp2:  # scrape the second-level page
                        html2 = await resp2.read()
                        content = etree.HTML(html2).xpath('//*[@id="link-report"]/span[1]/text()')
                        if content != ['\n', '\n', '\n']:
                            pass
                        else:
                            # the synopsis sits in one of two places
                            content = etree.HTML(html2).xpath('//*[@id="link-report"]/span[1]/span/text()')
                        dict['content'] = content
                    result.append(dict)
                    print(dict['title'])
            except:
                print("problem with this IP, switching to a new one")
                get_title(url, p=get_proxies())  # on failure, grab a fresh IP and retry
            e_time = time.time()
            print(url, "finished scraping this URL, took: %f" % float(e_time - s_time))

def main():
    tasks = [get_title(url, p) for url in urls]
    asyncio.run(asyncio.wait(tasks))  # new-style call
    result.sort(key=lambda x: x["paiming"])  # sort by rank
    s = json.dumps(result, indent=4, ensure_ascii=False)
    with open('xiecheng', 'w', encoding='utf-8') as f:
        f.write(s)

if __name__ == '__main__':
    start = time.time()
    main()
    print('total time: %.5f s' % float(time.time() - start))

The error:

aiohttp.client_exceptions.ClientProxyConnectionError: Cannot connect to host 1.1.1.1:80 ssl:None [Connect call failed (1.1.1.1, 80)]
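For reference, a stripped-down sketch of where that exception comes from: aiohttp raises ClientProxyConnectionError while the request context manager is being entered, so only a try that encloses the `async with session.get(...)` line can catch it. The proxy address below is just a local port that is presumably closed, and the URL is the one this post scrapes.

import asyncio
import aiohttp

async def fetch(url, proxy):
    async with aiohttp.ClientSession() as session:
        try:
            # the proxy connection is opened right here, on entering the context manager
            async with session.get(url, proxy=proxy) as resp:
                return await resp.read()
        except aiohttp.ClientProxyConnectionError as e:
            print("proxy is unreachable:", e)

asyncio.run(fetch('https://movie.douban.com/top250', 'http://127.0.0.1:1'))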

So I changed it to:

import aiohttp
import asyncio
from lxml import etree
import time
import json
import requests
import random
from fake_useragent import UserAgent

ua = UserAgent()
headers = {'User-Agent': ua.random}
result = []
urls = []
# the proxy provider's API (base address not shown); each visit returns a new IP, valid for about a minute
url_ip = '?act=get&num=1&time=60&plat=1&re=0&type=0&ow=1&prov=%E5%AE%89%E5%BE%BD%E7%9C%81'

for i in range(10):
    urls.append('https://movie.douban.com/top250?start={}&filter='.format(i * 25))
print(urls)

def get_proxies():  # fetch a new proxy IP
    proxy = requests.get(url_ip).text
    proxies = {
        'http': 'http://' + proxy,
        'https': 'https://' + proxy
    }
    return proxies

p = {
    'http': 'http://1.1.1.1:80',
    'https': 'https://1.1.1.1:80'
}  # set the initial proxy address

async def get_title(url, p):  # the actual scraping logic
    s_time = time.time()
    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(url, proxy=p['http'], headers=headers) as resp:
                print(url + " connected OK")
                html = await resp.read()  # can't use .text() here
                titles = etree.HTML(html).xpath('.//div[@id="content"]/div/div[1]/ol')
                for i in titles[0]:  # the [0] is essential
                    dict = {}
                    dict['paiming'] = int(i.xpath('.//div/div[1]/em/text()')[0])
                    dict['rank'] = i.xpath('.//div/div[2]/div[2]/div/span[2]/text()')[0]
                    dict['title'] = i.xpath('.//div[@class="hd"]/a/span[1]/text()')[0]
                    in_url = i.xpath('.//div[@class="pic"]/a/@href')[0]
                    async with session.get(in_url, proxy=p['http'], headers=headers) as resp2:  # scrape the second-level page
                        html2 = await resp2.read()
                        content = etree.HTML(html2).xpath('//*[@id="link-report"]/span[1]/text()')
                        if content != ['\n', '\n', '\n']:
                            pass
                        else:
                            # the synopsis sits in one of two places
                            content = etree.HTML(html2).xpath('//*[@id="link-report"]/span[1]/span/text()')
                        dict['content'] = content
                    result.append(dict)
                    print(dict['title'])
                e_time = time.time()
                print(url, "finished scraping this URL, took: %f" % float(e_time - s_time))
        except:
            print("problem with this IP, switching to a new one")
            await get_title(url, p=get_proxies())  # on failure, grab a fresh IP and retry

def main():
    tasks = [get_title(url, p) for url in urls]
    asyncio.run(asyncio.wait(tasks))  # new-style call
    result.sort(key=lambda x: x["paiming"])  # sort by rank
    s = json.dumps(result, indent=4, ensure_ascii=False)
    with open('xiecheng', 'w', encoding='utf-8') as f:
        f.write(s)

if __name__ == '__main__':
    start = time.time()
    main()
    print('total time: %.5f s' % float(time.time() - start))

I moved the try so that it wraps the `async with session.get(...)` itself, and added an await in front of the get_title call under except, and that fixed it. The connection error is raised while the request context manager is being entered, so a try nested inside it can never catch it; and without await, the recursive get_title(...) call only creates a coroutine object that never actually runs, so the retry silently did nothing.
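The await point is easy to verify in isolation. A tiny demo (not part of the scraper): calling a coroutine function only builds a coroutine object, and nothing inside it runs until that object is awaited.

import asyncio

async def retry():
    print("retrying")

async def main():
    retry()        # only creates a coroutine object; Python warns "coroutine 'retry' was never awaited"
    await retry()  # actually runs and prints "retrying"

asyncio.run(main())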

I found that some pages would occasionally take forever to come down, so I set a timeout:

import aiohttp
import asyncio
from lxml import etree
import time
import json
import requests
import random
from fake_useragent import UserAgent

ua = UserAgent()
headers = {'User-Agent': ua.random}
result = []
urls = []
# the proxy provider's API (base address not shown); each visit returns a new IP, valid for about a minute
url_ip = '?act=get&num=1&time=60&plat=1&re=0&type=0&ow=1&prov=%E5%AE%89%E5%BE%BD%E7%9C%81'

for i in range(10):
    urls.append('https://movie.douban.com/top250?start={}&filter='.format(i * 25))
print(urls)

def get_proxies():  # fetch a new proxy IP
    proxy = requests.get(url_ip).text
    proxies = {
        'http': 'http://' + proxy,
        'https': 'https://' + proxy
    }
    return proxies

p = {
    'http': 'http://1.1.1.1:80',
    'https': 'https://1.1.1.1:80'
}  # set the initial proxy address

async def get_title(url, p):  # the actual scraping logic
    s_time = time.time()
    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(url, proxy=p['http'], headers=headers, timeout=10) as resp:
                print(url + " is being scraped")
                html = await resp.read()  # can't use .text() here
                titles = etree.HTML(html).xpath('.//div[@id="content"]/div/div[1]/ol')
                for i in titles[0]:  # the [0] is essential
                    dict = {}
                    dict['paiming'] = int(i.xpath('.//div/div[1]/em/text()')[0])
                    dict['rank'] = i.xpath('.//div/div[2]/div[2]/div/span[2]/text()')[0]
                    dict['title'] = i.xpath('.//div[@class="hd"]/a/span[1]/text()')[0]
                    in_url = i.xpath('.//div[@class="pic"]/a/@href')[0]
                    async with session.get(in_url, proxy=p['http'], headers=headers, timeout=5) as resp2:  # scrape the second-level page
                        html2 = await resp2.read()
                        content = etree.HTML(html2).xpath('//*[@id="link-report"]/span[1]/text()')
                        if content != ['\n', '\n', '\n']:
                            pass
                        else:
                            # the synopsis sits in one of two places
                            content = etree.HTML(html2).xpath('//*[@id="link-report"]/span[1]/span/text()')
                        dict['content'] = content
                    result.append(dict)
                    print(dict['title'])
                e_time = time.time()
                print(url, "finished scraping this URL, took: %f" % float(e_time - s_time))
        except:
            print("timed out connecting to %s, switching to a new IP ---------------------------------------------------" % url)
            await get_title(url, p=get_proxies())  # on failure, grab a fresh IP and retry

def main():
    tasks = [get_title(url, p) for url in urls]
    asyncio.run(asyncio.wait(tasks))  # new-style call
    result.sort(key=lambda x: x["paiming"])  # sort by rank
    s = json.dumps(result, indent=4, ensure_ascii=False)
    with open('xiecheng', 'w', encoding='utf-8') as f:
        f.write(s)

if __name__ == '__main__':
    start = time.time()
    main()
    print('total time: %.5f s' % float(time.time() - start))
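One thing to watch with this version: every failure recurses into get_title with a fresh proxy, so a long streak of bad IPs can recurse without bound. A possible variant, sketched below under assumed values (the 10-second ClientTimeout, the retry cap of 3, and the get_new_proxy parameter standing in for the post's get_proxies), caps the retries and sets the timeout once on the session instead of per request.

import asyncio
import aiohttp

TIMEOUT = aiohttp.ClientTimeout(total=10)  # assumed 10-second total budget per request

async def fetch(url, proxy, get_new_proxy, retries=3):
    # fetch `url` through `proxy`; on any client error or timeout, swap proxies at most `retries` times
    async with aiohttp.ClientSession(timeout=TIMEOUT) as session:
        try:
            async with session.get(url, proxy=proxy) as resp:
                return await resp.read()
        except (aiohttp.ClientError, asyncio.TimeoutError):
            if retries > 0:
                return await fetch(url, get_new_proxy(), get_new_proxy, retries - 1)
            print(url, "gave up after repeated proxy failures")
            return None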