初始化
This commit is contained in:
@@ -0,0 +1,306 @@
|
||||
# Anti-Scraping 参考模式库
|
||||
|
||||
## 1. 常见爬虫特征指纹
|
||||
|
||||
### User-Agent 黑名单(正则)
|
||||
|
||||
```typescript
|
||||
/**
 * Case-insensitive User-Agent signatures of automated clients.
 * A request whose User-Agent matches any entry is treated as a bot.
 */
const BOT_UA_PATTERNS: readonly RegExp[] = [
  // Scraping frameworks and HTTP client libraries
  /python-requests/i,
  /scrapy/i,
  /beautifulsoup/i,
  /selenium/i,
  /playwright/i,
  /puppeteer/i,
  /mechanize/i,
  /httpclient/i,
  /java\/\d/i,
  /go-http-client/i,
  /ruby/i,

  // Command-line tools
  /curl\//i,
  /wget\//i,
  /httpie/i,
  /insomnia/i,

  // Headless browser signatures
  /headlesschrome/i,
  /phantomjs/i,
  /slimerjs/i,

  // Known data-harvesting crawlers
  /dataprovider/i,
  /yandexbot/i,
  /mj12bot/i,
  /ahrefsbot/i,
  /semrushbot/i,
  /dotbot/i,
];

/**
 * Reports whether a User-Agent string looks like an automated client.
 *
 * @param ua - Raw `User-Agent` header value.
 * @returns `true` when `ua` is blank or matches any entry of `BOT_UA_PATTERNS`.
 */
export function isBotUA(ua: string): boolean {
  // A missing/blank User-Agent is itself a bot signal; `??` guards untyped
  // JavaScript callers that pass a null header value straight through.
  const normalized = (ua ?? '').trim();
  if (normalized === '') return true;

  // The patterns carry no `g`/`y` flag, so `test` keeps no state between calls.
  return BOT_UA_PATTERNS.some((pattern) => pattern.test(normalized));
}
|
||||
```
|
||||
|
||||
### 允许的搜索引擎爬虫白名单
|
||||
|
||||
```typescript
|
||||
// 合法爬虫:需要验证真实性(反向 DNS 查找)
|
||||
/**
 * Search-engine crawlers that are allowed through, each with the DNS domain
 * its reverse-DNS (PTR) hostname must belong to.
 */
const ALLOWED_BOTS = [
  { name: 'Googlebot', ua: /googlebot/i, rdns: 'googlebot.com' },
  { name: 'Bingbot', ua: /bingbot/i, rdns: 'search.msn.com' },
  { name: 'Baidu Spider', ua: /baiduspider/i, rdns: 'crawl.baidu.com' },
];

/**
 * Checks that `hostname` is `domain` itself or a sub-domain of it.
 * The match is anchored on a label boundary, so `evilgooglebot.com` does not
 * count as belonging to `googlebot.com`.
 */
function hostnameBelongsTo(hostname: string, domain: string): boolean {
  // PTR names may be fully qualified (trailing dot) and are case-insensitive.
  const host = hostname.trim().toLowerCase().replace(/\.+$/, '');
  const root = domain.toLowerCase();
  return host === root || host.endsWith(`.${root}`);
}

/**
 * Verifies that a request claiming to be an allowed crawler really comes
 * from that crawler's network.
 *
 * @param ua - `User-Agent` header of the request.
 * @param ip - Client IP address of the request.
 * @returns `true` only when the UA matches an allowed bot and the IP's
 *   reverse-DNS hostname lies inside that bot's domain.
 */
async function isLegitimateBot(ua: string, ip: string): Promise<boolean> {
  const bot = ALLOWED_BOTS.find((candidate) => candidate.ua.test(ua));
  if (!bot) return false;

  // Reverse-DNS check: the UA alone is trivially forged.
  // NOTE(review): a PTR record is controlled by the IP's owner; full
  // verification also resolves the hostname forward and confirms it maps back
  // to `ip`. That needs a forward-lookup helper not visible here.
  const hostname: unknown = await reverseDNS(ip);
  if (typeof hostname !== 'string') return false;

  return hostnameBelongsTo(hostname, bot.rdns);
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 2. 风险评分算法
|
||||
|
||||
### 综合评分模型
|
||||
|
||||
```typescript
|
||||
/** Individual risk signals, each expressed on a 0-100 scale. */
interface RiskFactors {
  fingerprintScore: number; // 0-100
  rateScore: number; // 0-100 (rises when the client exceeds rate limits)
  ipScore: number; // 0-100 (data center / Tor / VPN)
  behaviorScore: number; // 0-100 (behavioural anomalies)
}

/**
 * Combines the individual risk signals into one weighted score.
 *
 * Each factor is clamped to 0-100 before weighting so a single out-of-range
 * input cannot push the result negative or swamp the other signals. A `NaN`
 * factor contributes 0 rather than turning the whole score into `NaN`, which
 * would compare false against every threshold downstream.
 *
 * @param factors - The four risk signals.
 * @returns An integer in the range 0-100.
 */
function calculateRiskScore(factors: RiskFactors): number {
  // Weights sum to 1, so in-range inputs yield an in-range score.
  const weights = {
    fingerprint: 0.35,
    rate: 0.30,
    ip: 0.25,
    behavior: 0.10,
  };

  const clamp = (value: number): number =>
    Number.isNaN(value) ? 0 : Math.min(Math.max(value, 0), 100);

  const weighted =
    clamp(factors.fingerprintScore) * weights.fingerprint +
    clamp(factors.rateScore) * weights.rate +
    clamp(factors.ipScore) * weights.ip +
    clamp(factors.behaviorScore) * weights.behavior;

  // The outer bounds guard against floating-point drift around the edges.
  return Math.min(Math.max(Math.round(weighted), 0), 100);
}
|
||||
|
||||
// 响应策略
|
||||
/**
 * Maps a risk score to the action taken against the request.
 *
 * @param score - Combined risk score.
 * @returns `'block'` from 80, `'challenge'` from 50, `'slowdown'` from 30,
 *   otherwise `'allow'`.
 */
function getResponseStrategy(score: number): 'allow' | 'slowdown' | 'challenge' | 'block' {
  // Ordered strictest-first: the first tier whose floor the score reaches wins.
  const tiers = [
    { floor: 80, action: 'block' },
    { floor: 50, action: 'challenge' },
    { floor: 30, action: 'slowdown' },
  ] as const;

  for (const tier of tiers) {
    if (score >= tier.floor) {
      return tier.action;
    }
  }
  return 'allow';
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3. Redis 数据结构设计
|
||||
|
||||
```
|
||||
# 速率限制(滑动窗口)
|
||||
ZSET rl:{ip} → { member: 请求唯一 ID, score: 请求时间戳(ms) }
|
||||
ZSET rl:{ip}:{endpoint} → { member: 请求唯一 ID, score: 请求时间戳(ms) }
|
||||
|
||||
# IP 黑白名单
|
||||
SET ip:blocklist → { ip1, ip2, ... }
|
||||
SET ip:allowlist → { ip1, ip2, ... }(合法爬虫白名单)
|
||||
SET ip:tor-exit → { ip1, ip2, ... }
|
||||
SET ip:datacenter → { ip1, ip2, ... }
|
||||
|
||||
# 蜜罐触发记录
|
||||
HASH honeypot:hits → { ip: count }
|
||||
|
||||
# CAPTCHA 通过记录(防止重复挑战)
|
||||
STRING captcha:passed:{ip} → "1"(TTL 1 小时)
|
||||
|
||||
# 行为画像
|
||||
HASH behavior:{ip} → {
|
||||
first_seen: timestamp,
|
||||
request_count: number,
|
||||
path_entropy: number, # 访问路径多样性(低=爬虫)
|
||||
referer_missing_ratio: number, # 缺少 Referer 比例(高=爬虫)
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 4. Nginx 层防护(可选,性能最佳)
|
||||
|
||||
```nginx
|
||||
# /etc/nginx/conf.d/anti-scraping.conf

# Rate-limit zones, keyed by client address (10 MB of shared state each).
limit_req_zone $binary_remote_addr zone=api:10m rate=30r/m;
limit_req_zone $binary_remote_addr zone=login:10m rate=5r/m;
limit_conn_zone $binary_remote_addr zone=conn:10m;

server {
    # Connection cap: at most 20 concurrent connections per client IP.
    limit_conn conn 20;

    # User-Agent blocklist (case-insensitive match).
    if ($http_user_agent ~* "(python|curl|wget|scrapy|selenium)") {
        return 403;
    }

    # Reject requests that send an empty User-Agent.
    if ($http_user_agent = "") {
        return 403;
    }

    location /api/ {
        limit_req zone=api burst=10 nodelay;
        limit_req_status 429;

        proxy_pass http://app;
    }

    # Longest-prefix match: requests under /api/auth/ are handled here only,
    # so they get the stricter login zone instead of the general api zone.
    location /api/auth/ {
        limit_req zone=login burst=2 nodelay;
        limit_req_status 429;

        proxy_pass http://app;
    }

    # Honeypot route (real users never request it).
    location /admin-backup/ {
        access_log /var/log/nginx/honeypot.log;

        # Log the visitor's IP and answer with decoy data.
        # The body produced by `return` takes its Content-Type from
        # default_type; setting it via add_header would emit a second,
        # conflicting Content-Type header.
        default_type application/json;
        return 200 '{"status":"ok"}';
    }
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 5. Cloudflare Workers 方案(Edge 层,最推荐)
|
||||
|
||||
```typescript
|
||||
// workers/anti-scraping.ts
|
||||
/** Reads `key` from `source` when it is a non-null object; `undefined` otherwise. */
function readProp(source: unknown, key: string): unknown {
  return typeof source === 'object' && source !== null ? Reflect.get(source, key) : undefined;
}

/** Converts a number or non-blank numeric string to a finite number; `undefined` otherwise. */
function toFiniteNumber(value: unknown): number | undefined {
  if (typeof value === 'number') return Number.isFinite(value) ? value : undefined;
  if (typeof value === 'string' && value.trim() !== '') {
    const parsed = Number(value);
    return Number.isFinite(parsed) ? parsed : undefined;
  }
  return undefined;
}

/**
 * Edge worker that rejects likely scrapers before proxying to the origin.
 *
 * Order of checks: Cloudflare threat score, Cloudflare bot score, then a
 * per-IP request counter kept in the `RATE_LIMIT` binding.
 */
const antiScrapingWorker = {
  /**
   * @param request - Incoming request.
   * @param env - Worker bindings; `RATE_LIMIT` must offer `get` / `put`.
   * @returns 403 for high-threat or bot-like clients, 429 once the per-minute
   *   counter exceeds 60, otherwise the origin's response.
   */
  async fetch(request: Request, env: Env): Promise<Response> {
    const ip = request.headers.get('CF-Connecting-IP') ?? '';

    // Cloudflare attaches request metadata as `request.cf`. Prefer it, and
    // fall back to the CF-* headers for zones that inject them via a header
    // transform. Unparseable values fall back to the permissive default
    // instead of becoming NaN.
    const cf: unknown = Reflect.get(request, 'cf');

    // Cloudflare threat score (higher = riskier).
    // NOTE(review): confirm `threatScore` is still populated on `request.cf`.
    const cfThreatScore =
      toFiniteNumber(readProp(cf, 'threatScore')) ??
      toFiniteNumber(request.headers.get('CF-Threat-Score')) ??
      0;
    if (cfThreatScore > 30) {
      return new Response('Forbidden', { status: 403 });
    }

    // Bot Management score (requires Bot Management): low = likely a bot.
    const cfBotScore =
      toFiniteNumber(readProp(readProp(cf, 'botManagement'), 'score')) ??
      toFiniteNumber(request.headers.get('CF-Bot-Score')) ??
      100;
    if (cfBotScore < 30) {
      return new Response('Forbidden', { status: 403 });
    }

    // Custom rate limit (Durable Objects or KV). The key is bucketed by
    // minute so the window resets on its own; with a bare `rl:{ip}` key every
    // write renews the TTL and a steady trickle of requests is eventually
    // blocked forever.
    // NOTE(review): get-then-put is not atomic, so concurrent requests can
    // under-count; use a Durable Object where strict limits matter.
    const windowId = Math.floor(Date.now() / 60_000);
    const rateLimitKey = `rl:${ip}:${windowId}`;
    const count = toFiniteNumber(await env.RATE_LIMIT.get(rateLimitKey)) ?? 0;
    if (count > 60) {
      return new Response('Too Many Requests', {
        status: 429,
        headers: { 'Retry-After': '60' },
      });
    }

    await env.RATE_LIMIT.put(rateLimitKey, String(count + 1), { expirationTtl: 60 });
    return fetch(request);
  },
};

export default antiScrapingWorker;
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 6. 监控 Dashboard(Datadog / Grafana 指标)
|
||||
|
||||
```typescript
|
||||
// lib/metrics.ts — 关键埋点
|
||||
/** Instrumentation hooks for the anti-scraping pipeline. */
export const antiScrapingMetrics = {
  /** Counts a blocked request, tagged with the reason it was blocked. */
  blocked(reason: 'fingerprint' | 'rate' | 'honeypot' | 'ip' | 'captcha'): void {
    const tags = { reason };
    metrics.increment('anti_scraping.blocked', tags);
  },

  /** Records one risk score into the score-distribution histogram. */
  scoreDistribution(score: number): void {
    metrics.histogram('anti_scraping.risk_score', score);
  },

  /** Counts a CAPTCHA being shown. */
  captchaImpressed() {
    return metrics.increment('anti_scraping.captcha.impressed');
  },

  /** Counts a CAPTCHA being solved. */
  captchaPassed() {
    return metrics.increment('anti_scraping.captcha.passed');
  },

  /** Counts a CAPTCHA being failed. */
  captchaFailed() {
    return metrics.increment('anti_scraping.captcha.failed');
  },

  /** Counts a suspected false positive and logs the affected user id. */
  falsePositive(userId: string): void {
    metrics.increment('anti_scraping.false_positive');
    logger.warn({ userId }, 'Possible false positive in anti-scraping');
  },
};
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 7. 测试用例
|
||||
|
||||
```typescript
|
||||
// __tests__/anti-scraping.test.ts
|
||||
// End-to-end expectations for the anti-scraping layers: fingerprint scoring,
// rate limiting, and the honeypot.
describe('Anti-Scraping', () => {
  describe('Fingerprint Analysis', () => {
    // NOTE(review): `analyzeFingerpring` looks like a typo of
    // `analyzeFingerprint`; the name is kept because the function is defined
    // outside this file. Confirm against its actual export.
    it('should flag Python requests as high risk', () => {
      const score = analyzeFingerpring(mockRequest({
        'user-agent': 'python-requests/2.28.0',
      }));
      expect(score).toBeGreaterThanOrEqual(50);
    });

    it('should not flag normal Chrome browser', () => {
      const score = analyzeFingerpring(mockRequest({
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'accept-language': 'zh-CN,zh;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'accept': 'text/html,application/xhtml+xml',
      }));
      expect(score).toBeLessThan(20);
    });
  });

  describe('Rate Limiting', () => {
    it('should block after exceeding limit', async () => {
      const ip = '192.168.1.100';
      // One shared options object, so the final check is evaluated against
      // the same window and limit as the warm-up requests rather than
      // whatever defaults `checkRateLimit` applies.
      const options = { windowMs: 60_000, limit: 60 };

      // Send 61 requests.
      for (let i = 0; i < 61; i++) {
        await checkRateLimit(ip, options);
      }
      const { allowed } = await checkRateLimit(ip, options);
      expect(allowed).toBe(false);
    });
  });

  describe('Honeypot', () => {
    it('should block IP that triggers honeypot', async () => {
      const ip = '10.0.0.1';
      await triggerHoneypot(ip);
      // The test expects the Redis client to return 1 for a set member.
      const isBlocked = await redis.sismember('ip:blocklist', ip);
      expect(isBlocked).toBe(1);
    });
  });
});
|
||||
```
|
||||
Reference in New Issue
Block a user