Without further ado, straight to the code:
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import sys, requests, lxml, re
# Use UTF-8 as the default string encoding (Python 2)
reload(sys)
sys.setdefaultencoding("utf-8")
# Request headers
headers={"User-Agent":"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36",
"Accept":"*/*",
"Accept-Language":"zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
"Accept-Encoding":"gzip, deflate",
"Content-Type":"application/x-www-form-urlencoded; charset=UTF-8",
"X-Requested-With":"XMLHttpRequest",
"Connection":"keep-alive"
}
# Proxy validation: proxies() takes a dict such as {"http": "http://ip:port"}
def proxies(urls={"http": ":81"}):
    proxies = urls
    # timeout=60       give up after 60 seconds
    # res.status_code  HTTP status code of the response
    # verify=False     skip certificate verification
    try:
        res = requests.get(url="", proxies=proxies, verify=False, timeout=60, headers=headers)
        #print u"connection OK!!!"
        #print res.content
        if res.status_code == 200:
            #print u"proxy is usable!"
            #print res.content
            ##with open("1.txt", "wb") as f:
            ##    f.write(res.content)
            print urls
            print u"Access OK, returning the proxy dict"
            return proxies
        else:
            print urls
            print u"Proxy not usable, returning False"
            return False
    except Exception, e:
        print urls
        print u"Request raised an exception, returning False"
        return False
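# Example call (the IP and port below are hypothetical):
#   proxies({"http": "http://1.2.3.4:8080"})
# returns the dict unchanged when the proxy answers with HTTP 200, otherwise False.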
# Get the number of list pages and build one URL per page
def get_list_page(listurl=""):
    # Read how many pages the paginated list has
    doc = requests.get(url=listurl, headers=headers).text
    soup = BeautifulSoup(doc, "lxml")
    page_html = soup.find("div", class_="pagination")
    page_list = re.findall(r"\d+", str(page_html))
    page_max = int(page_list[-2])
    # Build the URL of every page by swapping in the page number
    list_all = []
    for i in xrange(1, page_max + 1):
        url = re.sub(r"\d+", "%d" % i, listurl + "1")
        #print url
        list_all.append(url)
    else:
        #print list_all
        return list_all
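# Example (hypothetical listurl, only to illustrate the substitution above):
#   listurl = "http://example.com/list/" -> "http://example.com/list/1",
#   "http://example.com/list/2", ... "http://example.com/list/<page_max>"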
# Scrape the proxy fields from one list page
def page_data(url=""):
    result = []
    html = requests.get(url, headers=headers).text
    soup = BeautifulSoup(html, "lxml")
    table = soup.select("table tr")
    for tr in table:
        #print tr
        td = tr.select("td")
        iplist = []
        for ip in td:
            #print ip.string
            iplist.append(ip.string)
        #print iplist
        if iplist:
            result.append(iplist[5].lower() + ":" + iplist[5].lower() + "://" + iplist[1] + ":" + iplist[2])
    return result
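# Example entry (hypothetical IP and port): "http:http://1.2.3.4:8080".
# The main loop below splits it on ":" and rebuilds {"http": "http://1.2.3.4:8080"}
# before handing it to proxies().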
# Fetch the data
# Append the collected data to a file
def save_ip(ip):
    with open("ip.txt", "a") as f:
        f.writelines(ip)
#proxies()
#print get_list_page("")
#print page_data()
list_url = get_list_page(listurl ="")
for url in list_url:
    iplist = page_data(url)
    #print iplist
    #exit()
    for ip in iplist:
        arr = re.split(":", ip)
        #print type(arr),arr,arr[0],arr[1],arr[2],arr[3]
        parame = {arr[0]: arr[1] + ":" + arr[2] + ":" + arr[3]}
        res = proxies(parame)
        if res:
            #print u"file_put"  # write to file
            save_ip(str(arr[1] + ":" + arr[2] + ":" + arr[3]) + "\r\n")
        else:
            # unusable proxies end up here
            pass
if __name__ == "__main__":
    #print "main"
    pass
# # domestic high-anonymity proxies
# # domestic regular proxies
# # foreign high-anonymity proxies
# # foreign regular proxies
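To put the harvested list to work, the saved file can be read back and each entry passed to requests through the proxies parameter. The following is only a rough sketch, assuming each line of ip.txt looks like "http://1.2.3.4:8080" as written by save_ip() above; the target URL is just a placeholder.
# Rough sketch (not part of the script above): reuse the proxies saved in ip.txt
import requests

with open("ip.txt") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        scheme = line.split("://")[0]      # "http" or "https"
        proxy = {scheme: line}             # e.g. {"http": "http://1.2.3.4:8080"}
        try:
            r = requests.get("http://example.com", proxies=proxy, timeout=10)
            print proxy, r.status_code
        except Exception, e:
            print proxy, u"request failed"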