代理实际上指的就是代理服务器,它的功能是代替网络用户去获取网络信息,也可以说它是网络信息的中转站。无私分享全套Python爬虫干货,如果你也想学习Python,可以私信小编获取。
我们已经了解了代理池的四大问题,因此可以根据这四个问题来分析、设计一个代理池框架,并把它分成四个模块:获取模块、检测模块、存储模块和接口模块。这样不仅有利于维护,也能让我们更高效地完成需求。
代码模块
获取模块
import requests
import chardet
import traceback
from lxml import etree
class Downloader(object):
    """Fetch a proxy-listing web page and extract ``ip:port`` entries from it."""

    def __init__(self, timeout=10):
        """Create a downloader.

        Args:
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot block the caller forever.
        """
        self.timeout = timeout
        # Browser-like User-Agent sent with every request.
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/68.0.3440.106 Safari/537.36"
            ),
        }

    def download(self, url):
        """Download *url* and return the proxies parsed from the page.

        Returns:
            The list built by :meth:`xpath_parse`, or ``None`` when the
            request fails or the status code is not 200. Failures are
            printed together with a traceback instead of being raised.
        """
        print("正在下载页面:{}".format(url))
        try:
            resp = requests.get(url, headers=self.headers, timeout=self.timeout)
            # Check the status first: there is no point in sniffing the
            # encoding of an error page.
            if resp.status_code != 200:
                raise ConnectionError
            # Decode with the encoding detected from the raw bytes rather than
            # the one announced in the response headers.
            resp.encoding = chardet.detect(resp.content)["encoding"]
            return self.xpath_parse(resp.text)
        except Exception:
            print("下载页面出错:{}".format(url))
            traceback.print_exc()
            return None

    def xpath_parse(self, resp):
        """Extract proxies from the HTML text *resp*.

        The first two cells of every row matched by
        ``//div[@id="list"]/table/tbody/tr`` are read as IP and port.

        Returns:
            A list of ``{"proxy": "ip:port"}`` dicts, or ``None`` if parsing
            raises (the error is printed together with a traceback).
        """
        try:
            page = etree.HTML(resp)
            trs = page.xpath('//div[@id="list"]/table/tbody/tr')
            proxy_list = []
            for tr in trs:
                ip = tr.xpath("./td[1]/text()")[0]
                port = tr.xpath("./td[2]/text()")[0]
                proxy_list.append({"proxy": ip + ":" + port})
            return proxy_list
        except Exception:
            print("解析IP地址出错")
            traceback.print_exc()
            return None
if __name__ == "__main__":
    # NOTE(review): the URL argument was lost from the original text (the call
    # was cut off right after `download(`). The address below is a placeholder
    # for a listing page whose markup is assumed to match the XPath used by
    # Downloader.xpath_parse -- confirm or replace before running.
    print(Downloader().download("https://www.kuaidaili.com/free/inha/1/"))
存储模块
import pymongo
from pymongo.errors import DuplicateKeyError
class MongoDB(object):
    """Thin wrapper around the ``proxies`` collection of the ``proxypool3`` database."""

    def __init__(self):
        # MongoClient() without arguments uses the driver's default connection settings.
        self.client = pymongo.MongoClient()
        self.db = self.client["proxypool3"]
        self.proxies = self.db["proxies"]
        # Unique index on "proxy": storing the same address twice raises
        # DuplicateKeyError, which insert() relies on to skip duplicates.
        # create_index replaces the removed ensure_index and needs the key.
        self.proxies.create_index("proxy", unique=True)

    def insert(self, proxy):
        """Store the *proxy* document; an already stored proxy is skipped silently."""
        try:
            self.proxies.insert_one(proxy)
        except DuplicateKeyError:
            return
        print("插入成功:{}".format(proxy))

    def delete(self, conditions):
        """Delete every document matching the *conditions* filter."""
        self.proxies.delete_many(conditions)
        print("删除成功:{}".format(conditions))

    def update(self, conditions, values):
        """Set the fields in *values* on one document matching *conditions*."""
        self.proxies.update_one(conditions, {"$set": values})
        print("更新成功:{},{}".format(conditions, values))

    def get(self, count, conditions=None):
        """Return up to *count* documents matching *conditions*, ordered by ascending ``delay``.

        Args:
            count: Maximum number of documents; converted with ``int()``, so a
                numeric string is accepted and anything else raises ValueError.
            conditions: Optional query filter; ``None`` or empty matches all.

        Returns:
            A list of the matching documents.
        """
        conditions = conditions or {}
        count = int(count)
        cursor = self.proxies.find(conditions, limit=count).sort("delay", pymongo.ASCENDING)
        return list(cursor)

    def get_count(self):
        """Return the number of documents in the collection."""
        return self.proxies.count_documents({})
if __name__ == "__main__":
    # Manual smoke test: print up to three stored proxies.
    m = MongoDB()
    print(m.get(3))
检测模块
import requests
import time
import traceback
from requests.exceptions import ProxyError, ConnectionError
from db.mongo_db import MongoDB
from multiprocessing.pool import ThreadPool
def valid_many(proxy_list, method):
    """Validate every proxy in *proxy_list* concurrently on a 16-thread pool.

    Args:
        proxy_list: Iterable of proxy dicts handed to ``valid_one``. ``None``
            or an empty sequence is accepted and means there is nothing to do.
        method: Passed through unchanged to ``valid_one`` for every proxy.

    Blocks until all submitted validations have finished. Results of the
    individual calls are not collected.
    """
    # A failed page download yields no list at all; treat that as "no work"
    # instead of raising TypeError when iterating None.
    if not proxy_list:
        return
    pool = ThreadPool(16)
    try:
        for proxy in proxy_list:
            pool.apply_async(valid_one, args=(proxy, method))
    finally:
        # Always shut the pool down, even if submitting a task failed.
        pool.close()
        pool.join()
# NOTE(review): the default value of `url` was lost from the original text
# (the signature read `url=:`). The address below is a placeholder -- confirm
# which site the proxies are supposed to be tested against.
def valid_one(proxy, method, url="https://www.baidu.com"):
    """Test one proxy by requesting *url* through it and record the outcome.

    Args:
        proxy: Dict with a ``"proxy"`` entry of the form ``ip:port``. On
            success a ``"delay"`` entry (seconds, two decimals) is added to it.
        method: ``"insert"`` stores a working proxy; ``"check"`` updates the
            delay of a working proxy and deletes one that no longer works.
            Any other value only performs the request.
        url: Page fetched through the proxy.

    Unexpected errors are printed with a traceback and not raised.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/68.0.3440.106 Safari/537.36"
        ),
    }
    # Both schemes are routed through the proxy using its plain-HTTP endpoint.
    proxies = {
        "http": "http://" + proxy["proxy"],
        "https": "http://" + proxy["proxy"],
    }
    try:
        start_time = time.time()
        # verify=False: certificate checks are skipped for the probe request.
        resp = requests.get(url, headers=headers, proxies=proxies, timeout=5, verify=False)
        delay = round(time.time() - start_time, 2)
        if resp.status_code == 200:
            proxy["delay"] = delay
            if method == "insert":
                MongoDB().insert(proxy)
            elif method == "check":
                MongoDB().update({"proxy": proxy["proxy"]}, {"delay": proxy["delay"]})
        elif method == "check":
            MongoDB().delete({"proxy": proxy["proxy"]})
    except (ProxyError, ConnectionError, requests.exceptions.Timeout):
        # A read timeout is not a ConnectionError subclass; without listing
        # Timeout here a proxy that accepts the connection but never answers
        # would only print a traceback and stay in the pool.
        if method == "check":
            MongoDB().delete({"proxy": proxy["proxy"]})
    except Exception:
        traceback.print_exc()
API接口模块
import flask
import json
from db.mongo_db import MongoDB
# Flask application object; the route decorators that follow register their handlers on it.
app = flask.Flask(__name__)
@app.route("/one")
def get_one():
    """Return a JSON array with the ``ip:port`` of the single proxy given by ``MongoDB().get(1)``.

    The array is empty when no proxy is stored.
    """
    proxies = MongoDB().get(1)
    result = [proxy["proxy"] for proxy in proxies]
    # Declare the payload as JSON; returning a bare string would be served as text/html.
    return flask.Response(json.dumps(result), mimetype="application/json")
@app.route("/many")
def get_many():
    """Return a JSON array with up to ``count`` proxies as ``ip:port`` strings.

    ``count`` is read from the query string. A missing or non-integer value
    is answered with HTTP 400.
    """
    # type=int makes get() return None for a missing or non-numeric value, so
    # bad input is rejected here instead of surfacing later as a server error.
    count = flask.request.args.get("count", type=int)
    if count is None:
        flask.abort(400)
    proxies = MongoDB().get(count)
    result = [proxy["proxy"] for proxy in proxies]
    # Declare the payload as JSON; returning a bare string would be served as text/html.
    return flask.Response(json.dumps(result), mimetype="application/json")
def run(host=None, port=None):
    """Start the application with Flask's built-in server.

    Args:
        host: Interface to bind; ``None`` keeps Flask's default.
        port: Port to listen on; ``None`` keeps Flask's default.
    """
    app.run(host=host, port=port)
为了帮助大家更轻松地学好Python,我给大家分享一套Python学习资料,希望对正在学习的你有所帮助!
获取方式:关注并私信小编 “ 学习 ”,即可免费获取!