# Anti-Scraping 参考模式库
|
||
|
||
## 1. 常见爬虫特征指纹
|
||
|
||
### User-Agent 黑名单(正则)
|
||
|
||
```typescript
|
||
/**
 * Case-insensitive User-Agent signatures treated as automated clients.
 * Every pattern is unanchored, so it matches anywhere inside the header value.
 */
const BOT_UA_PATTERNS: readonly RegExp[] = [
  // Scraping frameworks and HTTP client libraries
  /python-requests/i,
  /scrapy/i,
  /beautifulsoup/i,
  /selenium/i,
  /playwright/i,
  /puppeteer/i,
  /mechanize/i,
  /httpclient/i,
  /java\/\d/i,
  /go-http-client/i,
  /ruby/i,

  // Command-line tools
  /curl\//i,
  /wget\//i,
  /httpie/i,
  /insomnia/i,

  // Headless browser markers
  /headlesschrome/i,
  /phantomjs/i,
  /slimerjs/i,

  // Known data-harvesting crawlers
  /dataprovider/i,
  /yandexbot/i,
  /mj12bot/i,
  /ahrefsbot/i,
  /semrushbot/i,
  /dotbot/i,
];

/**
 * Tells whether a User-Agent header matches any blacklisted signature.
 *
 * @param ua - Raw User-Agent header value.
 * @returns `true` as soon as one pattern matches; `false` when none do
 *          (which includes the empty string).
 */
export function isBotUA(ua: string): boolean {
  for (const signature of BOT_UA_PATTERNS) {
    if (signature.test(ua)) {
      return true;
    }
  }
  return false;
}
|
||
```
|
||
|
||
### 允许的搜索引擎爬虫白名单
|
||
|
||
```typescript
|
||
/**
 * Legitimate crawlers. A matching User-Agent is never trusted on its own:
 * the client IP must also reverse-resolve to a host under `rdns`.
 */
const ALLOWED_BOTS = [
  { name: 'Googlebot', ua: /googlebot/i, rdns: 'googlebot.com' },
  { name: 'Bingbot', ua: /bingbot/i, rdns: 'search.msn.com' },
  { name: 'Baidu Spider', ua: /baiduspider/i, rdns: 'crawl.baidu.com' },
];

/**
 * Checks whether a request that claims to be an allowed crawler really
 * originates from that crawler's network.
 *
 * @param ua - User-Agent header of the request.
 * @param ip - Client IP address, passed to `reverseDNS`.
 * @returns `true` only when the UA matches an entry of `ALLOWED_BOTS` and the
 *          reverse-DNS hostname is that entry's `rdns` domain or a subdomain of it.
 */
async function isLegitimateBot(ua: string, ip: string): Promise<boolean> {
  const bot = ALLOWED_BOTS.find(candidate => candidate.ua.test(ua));
  if (!bot) return false;

  // Reverse DNS verification (guards against a forged User-Agent).
  // NOTE(review): a PTR record is controlled by whoever owns the IP block;
  // consider also forward-resolving the hostname and confirming it maps back to `ip`.
  const hostname = await reverseDNS(ip);
  if (!hostname) return false;

  // Hostnames are case-insensitive and may carry a trailing root dot.
  const host = hostname.toLowerCase().replace(/\.$/, '');

  // Match on a label boundary: a bare suffix test would accept
  // "evilgooglebot.com" for "googlebot.com".
  return host === bot.rdns || host.endsWith(`.${bot.rdns}`);
}
|
||
```
|
||
|
||
---
|
||
|
||
## 2. 风险评分算法
|
||
|
||
### 综合评分模型
|
||
|
||
```typescript
|
||
/** Per-signal risk inputs; each one is expected to be on a 0-100 scale. */
interface RiskFactors {
  /** 0-100 */
  fingerprintScore: number;
  /** 0-100 (grows when the client exceeds the rate limit) */
  rateScore: number;
  /** 0-100 (datacenter / Tor / VPN address) */
  ipScore: number;
  /** 0-100 (behavioural anomalies) */
  behaviorScore: number;
}

/**
 * Combines the individual risk signals into one weighted score.
 *
 * Each factor is first forced into [0, 100] so that a single out-of-range
 * input cannot push the total below 0 or dominate the other signals; a NaN
 * factor contributes 0 instead of turning the whole score into NaN.
 *
 * @param factors - Individual signal scores.
 * @returns An integer in the range 0-100.
 */
function calculateRiskScore(factors: RiskFactors): number {
  // Weights sum to 1.0, so in-range inputs always yield a total within 0-100.
  const weights = {
    fingerprint: 0.35,
    rate: 0.30,
    ip: 0.25,
    behavior: 0.10,
  };

  const clamp = (value: number): number =>
    Number.isNaN(value) ? 0 : Math.min(Math.max(value, 0), 100);

  const weighted =
    clamp(factors.fingerprintScore) * weights.fingerprint +
    clamp(factors.rateScore) * weights.rate +
    clamp(factors.ipScore) * weights.ip +
    clamp(factors.behaviorScore) * weights.behavior;

  // The outer bounds only absorb floating-point drift after rounding.
  return Math.min(Math.max(Math.round(weighted), 0), 100);
}
|
||
|
||
/**
 * Response strategy: maps a risk score to the action taken for the request.
 *
 * Thresholds are inclusive lower bounds: 80 and above blocks, 50 and above
 * challenges, 30 and above slows down, anything else is allowed.
 *
 * @param score - Risk score to classify.
 * @returns The action for the first threshold the score reaches, or `'allow'`.
 */
function getResponseStrategy(score: number): 'allow' | 'slowdown' | 'challenge' | 'block' {
  // Ordered from strictest to most lenient; the first match wins.
  const tiers = [
    { floor: 80, action: 'block' },
    { floor: 50, action: 'challenge' },
    { floor: 30, action: 'slowdown' },
  ] as const;

  for (const tier of tiers) {
    if (score >= tier.floor) {
      return tier.action;
    }
  }
  return 'allow';
}
|
||
```
|
||
|
||
---
|
||
|
||
## 3. Redis 数据结构设计
|
||
|
||
```
|
||
# 速率限制(滑动窗口)
|
||
ZSET rl:{ip} → { timestamp: score }
|
||
ZSET rl:{ip}:{endpoint} → { timestamp: score }
|
||
|
||
# IP 黑白名单
|
||
SET ip:blocklist → { ip1, ip2, ... }
|
||
SET ip:allowlist → { ip1, ip2, ... }(合法爬虫白名单)
|
||
SET ip:tor-exit → { ip1, ip2, ... }
|
||
SET ip:datacenter → { ip1, ip2, ... }
|
||
|
||
# 蜜罐触发记录
|
||
HASH honeypot:hits → { ip: count }
|
||
|
||
# CAPTCHA 通过记录(防止重复挑战)
|
||
STRING captcha:passed:{ip} → "1"(TTL 1 小时)
|
||
|
||
# 行为画像
|
||
HASH behavior:{ip} → {
|
||
first_seen: timestamp,
|
||
request_count: number,
|
||
path_entropy: number, # 访问路径多样性(低=爬虫)
|
||
referer_missing_ratio: number, # 缺少 Referer 比例(高=爬虫)
|
||
}
|
||
```
|
||
|
||
---
|
||
|
||
## 4. Nginx 层防护(可选,性能最佳)
|
||
|
||
```nginx
|
||
# /etc/nginx/conf.d/anti-scraping.conf

# Rate-limit and connection-limit zones, all keyed by client address
limit_req_zone $binary_remote_addr zone=api:10m rate=30r/m;
limit_req_zone $binary_remote_addr zone=login:10m rate=5r/m;
limit_conn_zone $binary_remote_addr zone=conn:10m;

server {
    # Connection limit (at most 20 concurrent connections per IP)
    limit_conn conn 20;

    # User-Agent blacklist (case-insensitive substring match)
    if ($http_user_agent ~* "(python|curl|wget|scrapy|selenium)") {
        return 403;
    }

    # Reject requests with an empty User-Agent
    if ($http_user_agent = "") {
        return 403;
    }

    location /api/ {
        limit_req zone=api burst=10 nodelay;
        limit_req_status 429;

        proxy_pass http://app;
    }

    location /api/auth/ {
        limit_req zone=login burst=2 nodelay;
        limit_req_status 429;

        proxy_pass http://app;
    }

    # Honeypot route (real users never visit it)
    location /admin-backup/ {
        access_log /var/log/nginx/honeypot.log;
        # Log the visitor's IP and return fake data.
        # A `return <code> <text>` body is served with the location's default
        # MIME type, so it is set with default_type; `add_header Content-Type`
        # would append a second Content-Type header instead of replacing it.
        default_type application/json;
        return 200 '{"status":"ok"}';
    }
}
|
||
```
|
||
|
||
---
|
||
|
||
## 5. Cloudflare Workers 方案(Edge 层,最推荐)
|
||
|
||
```typescript
|
||
// workers/anti-scraping.ts

/**
 * Converts a raw header / KV string to a number.
 *
 * @param raw - Value as read from the header or KV store; may be absent.
 * @param fallback - Returned when `raw` is absent or does not parse to a number.
 */
function toNumberOr(raw: string | null | undefined, fallback: number): number {
  if (raw == null) return fallback;
  const parsed = Number(raw);
  return Number.isNaN(parsed) ? fallback : parsed;
}

export default {
  /**
   * Edge filter: rejects requests by threat score, then by bot score, then by a
   * per-IP request counter, and otherwise forwards the request unchanged.
   *
   * @returns 403 when either score check fails, 429 (with `Retry-After: 60`)
   *          when the counter exceeds 60, otherwise the upstream response.
   */
  async fetch(request: Request, env: Env): Promise<Response> {
    const ip = request.headers.get('CF-Connecting-IP') ?? '';

    // Use Cloudflare's threat score (a missing header counts as 0).
    const cfThreatScore = toNumberOr(request.headers.get('CF-Threat-Score'), 0);
    if (cfThreatScore > 30) {
      return new Response('Forbidden', { status: 403 });
    }

    // Use Cloudflare's bot-management score (requires Bot Management to be
    // enabled); a missing header counts as 100.
    const cfBotScore = toNumberOr(request.headers.get('CF-Bot-Score'), 100);
    if (cfBotScore < 30) {
      // Low score = high likelihood of a bot
      return new Response('Forbidden', { status: 403 });
    }

    // Custom rate limiting (using Durable Objects or KV).
    // NOTE(review): when CF-Connecting-IP is absent every such request shares
    // the key "rl:" — confirm that is intended.
    const rateLimitKey = `rl:${ip}`;
    // A corrupted counter falls back to 0; otherwise NaN would pass the
    // comparison below and be written back as "NaN".
    const count = toNumberOr(await env.RATE_LIMIT.get(rateLimitKey), 0);
    if (count > 60) {
      return new Response('Too Many Requests', {
        status: 429,
        headers: { 'Retry-After': '60' },
      });
    }

    // NOTE(review): each write sets a fresh 60-second TTL and the read/write
    // pair is two separate calls, so the window slides while traffic continues
    // and concurrent requests may under-count — verify this is acceptable.
    await env.RATE_LIMIT.put(rateLimitKey, String(count + 1), { expirationTtl: 60 });
    return fetch(request);
  },
};
|
||
```
|
||
|
||
---
|
||
|
||
## 6. 监控 Dashboard(Datadog / Grafana 指标)
|
||
|
||
```typescript
|
||
// lib/metrics.ts — key instrumentation points

/**
 * Thin wrappers around the metrics client so call sites emit consistently
 * named anti-scraping counters and histograms.
 */
export const antiScrapingMetrics = {
  /** Counts a blocked request, tagged with the reason it was blocked. */
  blocked(reason: 'fingerprint' | 'rate' | 'honeypot' | 'ip' | 'captcha') {
    const tags = { reason };
    metrics.increment('anti_scraping.blocked', tags);
  },

  /** Records one sample of the risk-score distribution. */
  scoreDistribution(score: number) {
    metrics.histogram('anti_scraping.risk_score', score);
  },

  /** CAPTCHA shown to the client. */
  captchaImpressed() {
    return metrics.increment('anti_scraping.captcha.impressed');
  },

  /** CAPTCHA solved. */
  captchaPassed() {
    return metrics.increment('anti_scraping.captcha.passed');
  },

  /** CAPTCHA failed. */
  captchaFailed() {
    return metrics.increment('anti_scraping.captcha.failed');
  },

  /** False-positive monitoring: bumps the counter and logs a warning with the user id. */
  falsePositive(userId: string) {
    metrics.increment('anti_scraping.false_positive');
    const context = { userId };
    logger.warn(context, 'Possible false positive in anti-scraping');
  },
};
|
||
```
|
||
|
||
---
|
||
|
||
## 7. 测试用例
|
||
|
||
```typescript
|
||
// __tests__/anti-scraping.test.ts
describe('Anti-Scraping', () => {
  // NOTE(review): `analyzeFingerpring` looks like a misspelling of
  // `analyzeFingerprint`; confirm against the name the implementation exports.
  describe('Fingerprint Analysis', () => {
    it('should flag Python requests as high risk', () => {
      const score = analyzeFingerpring(mockRequest({
        'user-agent': 'python-requests/2.28.0',
      }));
      expect(score).toBeGreaterThanOrEqual(50);
    });

    it('should not flag normal Chrome browser', () => {
      const score = analyzeFingerpring(mockRequest({
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'accept-language': 'zh-CN,zh;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'accept': 'text/html,application/xhtml+xml',
      }));
      expect(score).toBeLessThan(20);
    });
  });

  describe('Rate Limiting', () => {
    it('should block after exceeding limit', async () => {
      const ip = '192.168.1.100';
      // One options object for every call, so the final assertion is checked
      // against the same window and limit that the loop consumed.
      const options = { windowMs: 60_000, limit: 60 };

      // Send 61 requests
      for (let i = 0; i < 61; i++) {
        await checkRateLimit(ip, options);
      }

      const { allowed } = await checkRateLimit(ip, options);
      expect(allowed).toBe(false);
    });
  });

  describe('Honeypot', () => {
    it('should block IP that triggers honeypot', async () => {
      const ip = '10.0.0.1';
      await triggerHoneypot(ip);

      // The test expects a blocklisted member to be reported as 1.
      const isBlocked = await redis.sismember('ip:blocklist', ip);
      expect(isBlocked).toBe(1);
    });
  });
});
|
||
```
|