7.7 KiB
7.7 KiB
Anti-Scraping 参考模式库
1. 常见爬虫特征指纹
User-Agent 黑名单(正则)
/**
 * User-Agent blocklist: case-insensitive patterns, any one of which marks a
 * request as coming from an automated client.
 *
 * Entries are written as regex sources and compiled with the `i` flag below.
 */
const BOT_UA_PATTERNS: RegExp[] = [
  // Scraping frameworks and HTTP client libraries
  'python-requests',
  'scrapy',
  'beautifulsoup',
  'selenium',
  'playwright',
  'puppeteer',
  'mechanize',
  'httpclient',
  'java\\/\\d', // "Java/<digit>", the slash and digit are required
  'go-http-client',
  'ruby',

  // Command-line tools
  'curl\\/',
  'wget\\/',
  'httpie',
  'insomnia',

  // Headless-browser markers
  'headlesschrome',
  'phantomjs',
  'slimerjs',

  // Known data-collection crawlers
  'dataprovider',
  'yandexbot',
  'mj12bot',
  'ahrefsbot',
  'semrushbot',
  'dotbot',
].map((source) => new RegExp(source, 'i'));
/**
 * Reports whether a User-Agent string matches the bot blocklist.
 *
 * @param ua - Raw User-Agent header value.
 * @returns `true` if at least one entry of `BOT_UA_PATTERNS` matches `ua`,
 *   otherwise `false`.
 */
export function isBotUA(ua: string): boolean {
  for (const pattern of BOT_UA_PATTERNS) {
    if (pattern.test(ua)) {
      return true;
    }
  }
  return false;
}
允许的搜索引擎爬虫白名单
// Legitimate crawlers. A matching User-Agent is not enough on its own: the
// sender's authenticity still has to be verified with a reverse DNS lookup.
const ALLOWED_BOTS: { name: string; ua: RegExp; rdns: string }[] = [
  {
    name: 'Googlebot',
    ua: /googlebot/i,
    rdns: 'googlebot.com',
  },
  {
    name: 'Bingbot',
    ua: /bingbot/i,
    rdns: 'search.msn.com',
  },
  {
    name: 'Baidu Spider',
    ua: /baiduspider/i,
    rdns: 'crawl.baidu.com',
  },
];
/**
 * Checks whether a request that claims to be an allowed crawler really
 * originates from that crawler's network.
 *
 * The User-Agent only selects which `ALLOWED_BOTS` entry to verify against;
 * the decision is based on the reverse DNS name of the client IP, because a
 * User-Agent header can be forged freely.
 *
 * @param ua - Raw User-Agent header value.
 * @param ip - Client IP address to resolve.
 * @returns `true` only if `ua` matches an allowed bot and the IP's reverse DNS
 *   name is that bot's domain or a subdomain of it.
 */
async function isLegitimateBot(ua: string, ip: string): Promise<boolean> {
  const bot = ALLOWED_BOTS.find(b => b.ua.test(ua));
  if (!bot) return false;

  // Reverse DNS verification (guards against a forged User-Agent).
  const hostname = await reverseDNS(ip);
  if (!hostname) return false;

  // DNS names are case-insensitive and PTR answers may carry a trailing root dot.
  const host = hostname.toLowerCase().replace(/\.$/, '');

  // Match on a label boundary. A bare `endsWith(bot.rdns)` would also accept
  // attacker-registered names such as "evilgooglebot.com".
  // NOTE(review): a PTR record is controlled by whoever owns the IP's reverse
  // zone; consider also forward-resolving `host` and confirming it maps back
  // to `ip` before trusting the result.
  return host === bot.rdns || host.endsWith(`.${bot.rdns}`);
}
2. 风险评分算法
综合评分模型
/** Per-signal inputs to the combined risk score. Each value is expected in 0-100. */
interface RiskFactors {
  /** Fingerprint signal, 0-100. */
  fingerprintScore: number;
  /** Rate signal, 0-100 (increases when the client exceeds the rate limit). */
  rateScore: number;
  /** IP signal, 0-100 (data center / Tor / VPN). */
  ipScore: number;
  /** Behavior signal, 0-100 (behavioral anomalies). */
  behaviorScore: number;
}

/**
 * Combines the individual risk signals into one weighted score.
 *
 * Weights: fingerprint 0.35, rate 0.30, IP 0.25, behavior 0.10.
 *
 * Each factor is first forced into its documented 0-100 range. A factor that
 * is NaN counts as 0, so one missing signal cannot turn the whole score into
 * NaN (which every `score >= threshold` comparison would treat as "below").
 *
 * @param factors - The four per-signal scores.
 * @returns An integer in the range 0-100.
 */
function calculateRiskScore(factors: RiskFactors): number {
  const weights = {
    fingerprint: 0.35,
    rate: 0.30,
    ip: 0.25,
    behavior: 0.10,
  };

  // `value > 0` is false for NaN and for negatives, so both collapse to 0;
  // anything above 100 (including Infinity) is capped at 100.
  const normalize = (value: number): number => (value > 0 ? Math.min(value, 100) : 0);

  const weighted =
    normalize(factors.fingerprintScore) * weights.fingerprint +
    normalize(factors.rateScore) * weights.rate +
    normalize(factors.ipScore) * weights.ip +
    normalize(factors.behaviorScore) * weights.behavior;

  // Guard against floating-point drift pushing the rounded sum past 100.
  return Math.min(Math.round(weighted), 100);
}
// Response policy: translate a risk score into the action to take.
// Tiers are checked from most to least severe; the first one whose minimum
// the score reaches wins, and anything below every tier is allowed.
function getResponseStrategy(score: number): 'allow' | 'slowdown' | 'challenge' | 'block' {
  const tiers: Array<[number, 'slowdown' | 'challenge' | 'block']> = [
    [80, 'block'],
    [50, 'challenge'],
    [30, 'slowdown'],
  ];

  for (const [minimum, action] of tiers) {
    if (score >= minimum) {
      return action;
    }
  }
  return 'allow';
}
3. Redis 数据结构设计
# 速率限制(滑动窗口)
ZSET rl:{ip} → { member: 请求唯一 ID, score: 请求时间戳(毫秒) }
ZSET rl:{ip}:{endpoint} → { member: 请求唯一 ID, score: 请求时间戳(毫秒) }
# IP 黑白名单
SET ip:blocklist → { ip1, ip2, ... }
SET ip:allowlist → { ip1, ip2, ... }(合法爬虫白名单)
SET ip:tor-exit → { ip1, ip2, ... }
SET ip:datacenter → { ip1, ip2, ... }
# 蜜罐触发记录
HASH honeypot:hits → { ip: count }
# CAPTCHA 通过记录(防止重复挑战)
STRING captcha:passed:{ip} → "1"(TTL 1 小时)
# 行为画像
HASH behavior:{ip} → {
first_seen: timestamp,
request_count: number,
path_entropy: number, # 访问路径多样性(低=爬虫)
referer_missing_ratio: number, # 缺少 Referer 比例(高=爬虫)
}
4. Nginx 层防护(可选,性能最佳)
# /etc/nginx/conf.d/anti-scraping.conf
# Rate-limit zone definitions. Every zone is keyed by the client address
# ($binary_remote_addr) and uses a 10 MB shared-memory area.
# "api": 30 requests per minute per client.
limit_req_zone $binary_remote_addr zone=api:10m rate=30r/m;
# "login": 5 requests per minute per client.
limit_req_zone $binary_remote_addr zone=login:10m rate=5r/m;
# "conn": tracks concurrent connections per client (the limit itself is set with limit_conn).
limit_conn_zone $binary_remote_addr zone=conn:10m;
server {
    # Connection limit: at most 20 concurrent connections per client IP.
    limit_conn conn 20;

    # User-Agent blocklist (case-insensitive match anywhere in the header).
    if ($http_user_agent ~* "(python|curl|wget|scrapy|selenium)") {
        return 403;
    }

    # Reject requests with an empty User-Agent.
    if ($http_user_agent = "") {
        return 403;
    }

    # General API traffic: "api" zone, bursts of up to 10 served without delay,
    # excess requests answered with 429 Too Many Requests.
    location /api/ {
        limit_req zone=api burst=10 nodelay;
        limit_req_status 429;
        proxy_pass http://app;
    }

    # Auth endpoints: stricter "login" zone with a burst of 2.
    location /api/auth/ {
        limit_req zone=login burst=2 nodelay;
        limit_req_status 429;
        proxy_pass http://app;
    }

    # Honeypot route (real users never visit it).
    location /admin-backup/ {
        # Log every visitor to a dedicated file, then answer with fake data.
        access_log /var/log/nginx/honeypot.log;
        # The body produced by `return` is sent with the location's default
        # MIME type, so declare JSON here. `add_header Content-Type ...` would
        # only append a second Content-Type header next to the default one.
        default_type application/json;
        return 200 '{"status":"ok"}';
    }
}
5. Cloudflare Workers 方案(Edge 层,最推荐)
// workers/anti-scraping.ts
export default {
  /**
   * Edge filter that runs before the request is forwarded.
   *
   * Checks, in order:
   *  1. `CF-Threat-Score` header above 30 → 403.
   *  2. `CF-Bot-Score` header below 30 → 403 (low score = likely bot).
   *  3. Per-IP request counter in the `RATE_LIMIT` store above 60 for the
   *     current one-minute window → 429 with `Retry-After: 60`.
   * Otherwise the counter is incremented and the request is passed through.
   *
   * NOTE(review): both score headers default to a "pass" value when absent;
   * confirm they are actually populated on incoming requests.
   *
   * @param request - Incoming request.
   * @param env - Worker bindings; `env.RATE_LIMIT` is used for `get` / `put`.
   * @returns The blocking response, or the result of forwarding the request.
   */
  async fetch(request: Request, env: Env): Promise<Response> {
    const ip = request.headers.get('CF-Connecting-IP') ?? '';

    // Use Cloudflare's threat score.
    const cfThreatScore = Number(request.headers.get('CF-Threat-Score') ?? 0);
    if (cfThreatScore > 30) {
      return new Response('Forbidden', { status: 403 });
    }

    // Use Cloudflare's Bot Management score (requires Bot Management).
    const cfBotScore = Number(request.headers.get('CF-Bot-Score') ?? 100);
    if (cfBotScore < 30) {
      // Low score = high likelihood of a bot.
      return new Response('Forbidden', { status: 403 });
    }

    // Custom rate limit (Durable Objects or KV).
    // The key embeds the current one-minute window. With a single `rl:{ip}`
    // key, every write would push the 60 s expiry forward again, so the
    // counter of a client sending even one request per minute would never
    // reset and that client would eventually be blocked for good.
    const windowId = Math.floor(Date.now() / 60_000);
    const rateLimitKey = `rl:${ip}:${windowId}`;

    // NOTE(review): get-then-put is not atomic, so concurrent requests can
    // undercount; treat this as an approximate limit.
    // NOTE(review): `count > 60` lets 61 requests through per window; use
    // `>=` if the intended limit is exactly 60.
    const count = Number(await env.RATE_LIMIT.get(rateLimitKey) ?? 0);
    if (count > 60) {
      return new Response('Too Many Requests', {
        status: 429,
        headers: { 'Retry-After': '60' },
      });
    }

    await env.RATE_LIMIT.put(rateLimitKey, String(count + 1), { expirationTtl: 60 });
    return fetch(request);
  },
};
6. 监控 Dashboard(Datadog / Grafana 指标)
// lib/metrics.ts: key instrumentation points
export const antiScrapingMetrics = {
  /** Counts a blocked request, tagged with the reason it was blocked. */
  blocked(reason: 'fingerprint' | 'rate' | 'honeypot' | 'ip' | 'captcha') {
    metrics.increment('anti_scraping.blocked', { reason });
  },

  /** Records one risk score into the score-distribution histogram. */
  scoreDistribution(score: number) {
    metrics.histogram('anti_scraping.risk_score', score);
  },

  // CAPTCHA shown / passed / failed counters. Each returns whatever
  // `metrics.increment` returns.
  captchaImpressed() {
    return metrics.increment('anti_scraping.captcha.impressed');
  },
  captchaPassed() {
    return metrics.increment('anti_scraping.captcha.passed');
  },
  captchaFailed() {
    return metrics.increment('anti_scraping.captcha.failed');
  },

  /** False-positive monitoring: bumps the counter and logs the affected user id. */
  falsePositive(userId: string) {
    metrics.increment('anti_scraping.false_positive');
    logger.warn({ userId }, 'Possible false positive in anti-scraping');
  },
};
7. 测试用例
// __tests__/anti-scraping.test.ts
describe('Anti-Scraping', () => {
  describe('Fingerprint Analysis', () => {
    // NOTE(review): `analyzeFingerpring` looks like a misspelling of
    // "analyzeFingerprint"; confirm against the actual export before renaming.
    it('should flag Python requests as high risk', () => {
      const score = analyzeFingerpring(mockRequest({
        'user-agent': 'python-requests/2.28.0',
      }));
      expect(score).toBeGreaterThanOrEqual(50);
    });

    it('should not flag normal Chrome browser', () => {
      const score = analyzeFingerpring(mockRequest({
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'accept-language': 'zh-CN,zh;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'accept': 'text/html,application/xhtml+xml',
      }));
      expect(score).toBeLessThan(20);
    });
  });

  describe('Rate Limiting', () => {
    it('should block after exceeding limit', async () => {
      const ip = '192.168.1.100';
      // One shared options object for every call, so the final assertion is
      // evaluated against the same window and limit as the warm-up requests
      // instead of whatever defaults `checkRateLimit` happens to have.
      const options = { windowMs: 60_000, limit: 60 };

      // Send 61 requests.
      for (let i = 0; i < 61; i++) {
        await checkRateLimit(ip, options);
      }

      const { allowed } = await checkRateLimit(ip, options);
      expect(allowed).toBe(false);
    });
  });

  describe('Honeypot', () => {
    it('should block IP that triggers honeypot', async () => {
      const ip = '10.0.0.1';
      await triggerHoneypot(ip);

      // The test expects `sismember` to answer 1 when the IP is in the set.
      const isBlocked = await redis.sismember('ip:blocklist', ip);
      expect(isBlocked).toBe(1);
    });
  });
});