# Anti-Scraping 参考模式库

## 1. 常见爬虫特征指纹

### User-Agent 黑名单(正则)

```typescript
const BOT_UA_PATTERNS = [
  // Scraping frameworks and HTTP client libraries
  /python-requests/i,
  /scrapy/i,
  /beautifulsoup/i,
  /selenium/i,
  /playwright/i,
  /puppeteer/i,
  /mechanize/i,
  /httpclient/i,
  /java\/\d/i,
  /go-http-client/i,
  /ruby/i,
  // Command-line tools
  /curl\//i,
  /wget\//i,
  /httpie/i,
  /insomnia/i,
  // Headless-browser markers
  /headlesschrome/i,
  /phantomjs/i,
  /slimerjs/i,
  // Known data-collection crawlers
  /dataprovider/i,
  /yandexbot/i,
  /mj12bot/i,
  /ahrefsbot/i,
  /semrushbot/i,
  /dotbot/i,
];

/** Returns true when the User-Agent string matches any blacklist pattern. */
export function isBotUA(ua: string): boolean {
  return BOT_UA_PATTERNS.some(p => p.test(ua));
}
```

### 允许的搜索引擎爬虫白名单

```typescript
// Legitimate crawlers: their authenticity must be verified (reverse DNS lookup)
const ALLOWED_BOTS = [
  { name: 'Googlebot', ua: /googlebot/i, rdns: 'googlebot.com' },
  { name: 'Bingbot', ua: /bingbot/i, rdns: 'search.msn.com' },
  { name: 'Baidu Spider', ua: /baiduspider/i, rdns: 'crawl.baidu.com' },
];

/**
 * Checks whether a request claiming to be a search-engine crawler really
 * originates from that crawler.
 *
 * @param ua - User-Agent header of the request.
 * @param ip - Client IP address used for the reverse DNS lookup.
 * @returns true only when the UA matches an allowed bot AND the reverse DNS
 *          hostname is that bot's domain or one of its subdomains.
 */
async function isLegitimateBot(ua: string, ip: string): Promise<boolean> {
  const bot = ALLOWED_BOTS.find(b => b.ua.test(ua));
  if (!bot) return false;

  // Reverse DNS verification (guards against a spoofed UA)
  const hostname = await reverseDNS(ip);
  if (!hostname) return false;

  // Defensive normalization: tolerate upper-case names and a trailing dot.
  const host = hostname.toLowerCase().replace(/\.$/, '');

  // Match on a label boundary. A bare endsWith(bot.rdns) would also accept
  // attacker-controlled names such as "evilgooglebot.com".
  // NOTE(review): a PTR record alone can be forged by whoever controls the
  // IP's reverse zone; consider also forward-resolving `host` and checking
  // that it maps back to `ip`.
  return host === bot.rdns || host.endsWith(`.${bot.rdns}`);
}
```

---

## 2. 风险评分算法

### 综合评分模型

```typescript
interface RiskFactors {
  fingerprintScore: number; // 0-100
  rateScore: number;        // 0-100 (increases when the rate limit is exceeded)
  ipScore: number;          // 0-100 (datacenter / Tor / VPN)
  behaviorScore: number;    // 0-100 (behavioral anomalies)
}

/**
 * Combines the individual factor scores into a single risk score.
 *
 * @returns The weighted sum of the four factors, rounded to an integer and
 *          capped at 100.
 */
function calculateRiskScore(factors: RiskFactors): number {
  const weights = {
    fingerprint: 0.35,
    rate: 0.30,
    ip: 0.25,
    behavior: 0.10,
  };

  return Math.min(
    Math.round(
      factors.fingerprintScore * weights.fingerprint +
      factors.rateScore * weights.rate +
      factors.ipScore * weights.ip +
      factors.behaviorScore * weights.behavior
    ),
    100
  );
}

/**
 * Response strategy: maps a risk score to an action.
 * >= 80 block, >= 50 challenge, >= 30 slowdown, otherwise allow.
 */
function getResponseStrategy(score: number): 'allow' | 'slowdown' | 'challenge' | 'block' {
  if (score >= 80) return 'block';
  if (score >= 50) return 'challenge';
  if (score >= 30) return 'slowdown';
  return 'allow';
}
```

---

## 3. 
Redis 数据结构设计

```
# 速率限制(滑动窗口)
ZSET rl:{ip} → { timestamp: score }
ZSET rl:{ip}:{endpoint} → { timestamp: score }

# IP 黑白名单
SET ip:blocklist → { ip1, ip2, ... }
SET ip:allowlist → { ip1, ip2, ... }(合法爬虫白名单)
SET ip:tor-exit → { ip1, ip2, ... }
SET ip:datacenter → { ip1, ip2, ... }

# 蜜罐触发记录
HASH honeypot:hits → { ip: count }

# CAPTCHA 通过记录(防止重复挑战)
STRING captcha:passed:{ip} → "1"(TTL 1 小时)

# 行为画像
HASH behavior:{ip} → {
  first_seen: timestamp,
  request_count: number,
  path_entropy: number,           # 访问路径多样性(低=爬虫)
  referer_missing_ratio: number,  # 缺少 Referer 比例(高=爬虫)
}
```

---

## 4. Nginx 层防护(可选,性能最佳)

```nginx
# /etc/nginx/conf.d/anti-scraping.conf

# Rate-limit zone definitions
limit_req_zone $binary_remote_addr zone=api:10m rate=30r/m;
limit_req_zone $binary_remote_addr zone=login:10m rate=5r/m;
limit_conn_zone $binary_remote_addr zone=conn:10m;

server {
    # Connection limit (at most 20 concurrent connections per IP)
    limit_conn conn 20;

    # UA blacklist
    if ($http_user_agent ~* "(python|curl|wget|scrapy|selenium)") {
        return 403;
    }

    # Reject requests with an empty UA
    if ($http_user_agent = "") {
        return 403;
    }

    location /api/ {
        limit_req zone=api burst=10 nodelay;
        limit_req_status 429;
        proxy_pass http://app;
    }

    location /api/auth/ {
        limit_req zone=login burst=2 nodelay;
        limit_req_status 429;
        proxy_pass http://app;
    }

    # Honeypot route (real users never visit it)
    location /admin-backup/ {
        access_log /var/log/nginx/honeypot.log;
        # Log the visitor's IP and return fake data.
        # The body produced by `return` takes its Content-Type from
        # default_type; add_header would emit a second Content-Type header.
        default_type application/json;
        return 200 '{"status":"ok"}';
    }
}
```

---

## 5. Cloudflare Workers 方案(Edge 层,最推荐)

```typescript
// workers/anti-scraping.ts
export default {
  async fetch(request: Request, env: Env): Promise<Response> {
    const ip = request.headers.get('CF-Connecting-IP') ?? '';
    const ua = request.headers.get('User-Agent') ?? '';

    // NOTE(review): Cloudflare normally exposes threat/bot signals on
    // `request.cf` rather than as request headers; the header names used
    // below assume a rule injects them — confirm before relying on this.

    // Use Cloudflare's threat score
    const cfThreatScore = Number(request.headers.get('CF-Threat-Score') ?? 0);
    if (cfThreatScore > 30) {
      return new Response('Forbidden', { status: 403 });
    }

    // Use Cloudflare's Bot Management score (requires Bot Management to be enabled)
    const cfBotScore = Number(request.headers.get('CF-Bot-Score') ??
100);
    if (cfBotScore < 30) { // low score = high likelihood of a bot
      return new Response('Forbidden', { status: 403 });
    }

    // Custom rate limiting (using Durable Objects or KV).
    // Fixed one-minute window: the window id is part of the key, so each
    // minute starts from zero. Re-writing a single `rl:${ip}` key with a
    // fresh TTL on every request would keep extending the window and would
    // eventually block steady low-rate clients.
    // NOTE: KV is eventually consistent and this read-modify-write is not
    // atomic, so the count is approximate; use a Durable Object when a
    // strict limit is required.
    const windowId = Math.floor(Date.now() / 60_000);
    const rateLimitKey = `rl:${ip}:${windowId}`;
    const count = Number(await env.RATE_LIMIT.get(rateLimitKey) ?? 0);

    // `>=` so that at most 60 requests pass per window (`>` allowed 61).
    if (count >= 60) {
      return new Response('Too Many Requests', {
        status: 429,
        headers: { 'Retry-After': '60' },
      });
    }

    // Keep the counter slightly longer than its window; stale windows
    // expire on their own.
    await env.RATE_LIMIT.put(rateLimitKey, String(count + 1), { expirationTtl: 120 });

    return fetch(request);
  },
};
```

---

## 6. 监控 Dashboard(Datadog / Grafana 指标)

```typescript
// lib/metrics.ts — key instrumentation points
export const antiScrapingMetrics = {
  // A request was blocked
  blocked: (reason: 'fingerprint' | 'rate' | 'honeypot' | 'ip' | 'captcha') => {
    metrics.increment('anti_scraping.blocked', { reason });
  },

  // Risk score distribution
  scoreDistribution: (score: number) => {
    metrics.histogram('anti_scraping.risk_score', score);
  },

  // CAPTCHA impressions and outcomes
  captchaImpressed: () => metrics.increment('anti_scraping.captcha.impressed'),
  captchaPassed: () => metrics.increment('anti_scraping.captcha.passed'),
  captchaFailed: () => metrics.increment('anti_scraping.captcha.failed'),

  // False-positive monitoring
  falsePositive: (userId: string) => {
    metrics.increment('anti_scraping.false_positive');
    logger.warn({ userId }, 'Possible false positive in anti-scraping');
  },
};
```

---

## 7. 
测试用例

```typescript
// __tests__/anti-scraping.test.ts
describe('Anti-Scraping', () => {
  describe('Fingerprint Analysis', () => {
    it('should flag Python requests as high risk', () => {
      const score = analyzeFingerprint(mockRequest({
        'user-agent': 'python-requests/2.28.0',
      }));
      expect(score).toBeGreaterThanOrEqual(50);
    });

    it('should not flag normal Chrome browser', () => {
      const score = analyzeFingerprint(mockRequest({
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'accept-language': 'zh-CN,zh;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'accept': 'text/html,application/xhtml+xml',
      }));
      expect(score).toBeLessThan(20);
    });
  });

  describe('Rate Limiting', () => {
    it('should block after exceeding limit', async () => {
      const ip = '192.168.1.100';
      // Use the same options for every call so the final assertion does not
      // depend on checkRateLimit's defaults.
      const options = { windowMs: 60_000, limit: 60 };

      // Send 61 requests
      for (let i = 0; i < 61; i++) {
        await checkRateLimit(ip, options);
      }

      const { allowed } = await checkRateLimit(ip, options);
      expect(allowed).toBe(false);
    });
  });

  describe('Honeypot', () => {
    it('should block IP that triggers honeypot', async () => {
      const ip = '10.0.0.1';
      await triggerHoneypot(ip);

      const isBlocked = await redis.sismember('ip:blocklist', ip);
      expect(isBlocked).toBe(1);
    });
  });
});
```