Use Chromium in headless mode to crawl Twitter and monitor security news there through a set of keywords.
The full reference code is given below. You need Python 3.7 or later, an installed Chromium/Chrome, and the supporting libraries:
pip3 install pyquery pyppeteer
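Before wiring up the full spider, a quick smoke test (a minimal sketch; the executablePath below is an assumption, point it at your own Chrome/Chromium) confirms that pyppeteer can actually drive the local browser:

#!/usr/bin/env python3
# smoke_test.py -- check that pyppeteer can launch and drive the local browser
import asyncio
from pyppeteer import launch

async def main():
    # executablePath is an assumption; adjust it to your own install
    browser = await launch(headless=True,
                           executablePath="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome")
    page = await browser.newPage()
    await page.goto("https://example.com")
    print(await page.title())  # expected: "Example Domain"
    await browser.close()

asyncio.get_event_loop().run_until_complete(main())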
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time: 2020/3/16 5:14 PM
# @Author: w8ay
# @File: main.py
import asyncio
import hashlib
from urllib.parse import urljoin
from pyppeteer import launch
from pyquery import PyQuery as pq
def getTwitter(html):
doc = pq(html)
items = doc("section.css-1dbjc4n article")
# print(len(items))
r = []
for item in list(items.items()):
article = item(".r-1iusvr4")
alink = article("a.css-4rbku5.css-18t94o4.css-901oao.r-1re7ezh.r-1loqt21.r-1q142lx.r-1qd0xha")
href = alink.attr("href")
        url = urljoin("https://twitter.com/", href)  # href is site-relative, e.g. /<user>/status/<id>
# name = article(
# "div.css-901oao.css-bfa6kz.r-hkyrab.r-1qd0xha.r-a023e6.r-vw2c0b.r-ad9z0x.r-bcqeeo.r-3s2u2q.r-qvutc0").text()
name2 = article(
"div .css-1dbjc4n.r-18u37iz.r-1wbh5a2.r-1f6r7vd .css-901oao.css-16my406.r-1qd0xha.r-ad9z0x.r-bcqeeo.r-qvutc0").text()
# print(name)
content = article(
"div .css-1dbjc4n .css-901oao.r-hkyrab.r-1qd0xha.r-a023e6.r-16dba41.r-ad9z0x.r-bcqeeo.r-bnwqim.r-qvutc0").text()
# print(url, name2, content)
        print("URL: {}\nUser: {}\nContent: {}\n".format(url, name2, content))
r.append((url, name2, content))
return r
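# Note: the long class chains used above are Twitter's auto-generated, minified
# CSS class names; they can change whenever Twitter redeploys its front end. If
# the spider suddenly returns nothing, re-inspect the search page in DevTools
# and update these selectors.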
async def request_check(req):
if req.resourceType in ["image", "media", "eventsource", "websocket", "stylesheet", "font"]:
await req.abort()
else:
await req.continue_()
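# The interceptor above aborts requests for static resources (images, media,
# fonts, stylesheets, ...) to save bandwidth and speed up loading; scripts are
# still allowed through, since the tweets are rendered client-side by JavaScript.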
async def twitter_spider(browser, keyword):
result = []
    url = "https://twitter.com/search?q={}&src=typd".format(keyword)
page = await browser.newPage()
    page.setDefaultNavigationTimeout(1000 * 60 * 5)  # 5 minutes, in milliseconds
await page.setRequestInterception(True)
    page.on("request", lambda req: asyncio.ensure_future(request_check(req)))
    waitUntil = [
        "load",
        "domcontentloaded",
        # "networkidle0",
        # "networkidle2",
    ]
await page.goto(url, waitUntil=waitUntil)
    # wait until the tweet timeline has rendered, then pause briefly
    await page.waitForSelector("#react-root section .r-my5ep6")
    await page.waitFor(1000 * 2)
hash_set = set()
    for i in range(1, 20):
        # scroll a bit to trigger infinite scrolling, then wait for new tweets to lazy-load
        js = "window.scrollBy(0, 400)"
        await page.evaluate(js)
        await page.waitFor(1200)
content = await page.content()
rlist = getTwitter(content)
for item in rlist:
url, name2, content = item
h1 = hashlib.md5()
            h1.update(content.encode("utf-8"))
            # MD5 of the tweet text, used to de-duplicate across scroll passes
            md5 = h1.hexdigest()
if md5 in hash_set:
continue
hash_set.add(md5)
result.append(item)
await page.close()
return result
async def twitter_start(executablePath, keywords, debug=False):
    browser = await launch(headless=not debug,  # show the browser window when debugging
                           ignoreHTTPSErrors=True, executablePath=executablePath, autoClose=True,
                           args=[
                               "--disable-gpu",
                               "--disable-web-security",
                               "--disable-xss-auditor",  # disable the XSS Auditor
                               "--no-sandbox",
                               "--disable-setuid-sandbox",
                               "--allow-running-insecure-content",  # allow insecure (mixed) content
                               "--disable-webgl",
                               "--window-size=1250,600",
                               "--disable-popup-blocking",
                               # uncomment the line below to route traffic through a proxy
                               # "--proxy-server=socks5://127.0.0.1:1080",
                           ],
                           timeout=60 * 1000, devtools=debug)  # launch timeout in milliseconds
ret = []
for keyword in keywords:
ret2 = await twitter_spider(browser, keyword)
ret.extend(ret2)
await browser.close()
print(ret)
return ret
if __name__ == "__main__":
executablePath = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
debug = True
    keywords = ["i was awarded bugbounty", "xss", "sqli"]
    ret = asyncio.get_event_loop().run_until_complete(twitter_start(executablePath, keywords, debug))
print(len(ret))
executablePath is the path to the Chrome executable. When debug is True, the browser window is shown, which makes debugging easier; when it is False, the browser runs without a window. keywords is the list of search keywords.
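Typical Chrome/Chromium locations on other platforms (these are common defaults, not guarantees; verify against your own installation):

executablePath = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"  # macOS
executablePath = "/usr/bin/chromium-browser"  # Linux (Debian/Ubuntu package)
executablePath = r"C:\Program Files\Google\Chrome\Application\chrome.exe"  # Windows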
Screenshot of the results:
You can then set up a scheduled script and a push channel of your choice to get new results delivered daily, as sketched below.
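As a minimal sketch of such a daily push (it assumes the spider above is saved as main.py; WEBHOOK_URL and the payload format are placeholders for whatever bot or IM webhook you use):

#!/usr/bin/env python3
# daily_push.py -- minimal sketch: re-run the spider once a day and push results.
import asyncio
import time

import requests  # extra dependency: pip3 install requests

from main import twitter_start

executablePath = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
keywords = ["i was awarded bugbounty", "xss", "sqli"]
WEBHOOK_URL = "https://example.com/your-webhook"  # placeholder, replace with your own

def push(results):
    # format the collected tweets and POST them to the webhook
    text = "\n\n".join("{}\n{}\n{}".format(url, name, content)
                       for url, name, content in results)
    requests.post(WEBHOOK_URL, json={"text": text}, timeout=30)

while True:
    ret = asyncio.get_event_loop().run_until_complete(
        twitter_start(executablePath, keywords))
    if ret:
        push(ret)
    time.sleep(24 * 60 * 60)  # once a day; a cron job works just as well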