Files
vibe_coding/.cursor/skills/anti-scraping/references/anti-scraping-patterns.md
2026-03-05 21:27:11 +08:00

307 lines
7.7 KiB
Markdown
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Anti-Scraping 参考模式库
## 1. 常见爬虫特征指纹
### User-Agent 黑名单(正则)
```typescript
/**
 * Case-insensitive User-Agent signatures that identify automated clients.
 * A User-Agent is treated as a bot when any one of these patterns matches.
 */
const BOT_UA_PATTERNS: RegExp[] = [
  // Scraping frameworks, browser automation and generic HTTP libraries
  /python-requests/i,
  /scrapy/i,
  /beautifulsoup/i,
  /selenium/i,
  /playwright/i,
  /puppeteer/i,
  /mechanize/i,
  /httpclient/i,
  /java\/\d/i,
  /go-http-client/i,
  /ruby/i,

  // Command-line tools and API clients
  /curl\//i,
  /wget\//i,
  /httpie/i,
  /insomnia/i,

  // Headless-browser markers
  /headlesschrome/i,
  /phantomjs/i,
  /slimerjs/i,

  // Known data-collection crawlers
  /dataprovider/i,
  /yandexbot/i,
  /mj12bot/i,
  /ahrefsbot/i,
  /semrushbot/i,
  /dotbot/i,
];
/**
 * Reports whether a User-Agent string matches a known bot signature.
 *
 * @param ua Raw User-Agent header value.
 * @returns `true` as soon as one pattern in BOT_UA_PATTERNS matches, otherwise `false`.
 */
export function isBotUA(ua: string): boolean {
  for (const signature of BOT_UA_PATTERNS) {
    if (signature.test(ua)) {
      return true;
    }
  }
  return false;
}
```
### 允许的搜索引擎爬虫白名单
```typescript
// Legitimate crawlers. A matching User-Agent alone is not trusted: the claim
// has to be verified with a reverse DNS lookup against the `rdns` domain.
const ALLOWED_BOTS = [
  {
    name: 'Googlebot',
    ua: /googlebot/i,
    rdns: 'googlebot.com',
  },
  {
    name: 'Bingbot',
    ua: /bingbot/i,
    rdns: 'search.msn.com',
  },
  {
    name: 'Baidu Spider',
    ua: /baiduspider/i,
    rdns: 'crawl.baidu.com',
  },
];
/**
 * Checks whether a request that claims to be an allowed crawler really
 * originates from that crawler's network.
 *
 * The User-Agent selects an entry from ALLOWED_BOTS; the client IP is then
 * resolved with reverseDNS and the resulting hostname must be the entry's
 * `rdns` domain or a subdomain of it.
 *
 * @param ua Raw User-Agent header value.
 * @param ip Client IP address to resolve.
 * @returns `true` only when the UA matches an allowed bot and the reverse DNS
 *          hostname belongs to that bot's domain; `false` otherwise.
 */
async function isLegitimateBot(ua: string, ip: string): Promise<boolean> {
  const bot = ALLOWED_BOTS.find(b => b.ua.test(ua));
  if (!bot) return false;

  // Reverse DNS verification (guards against a forged User-Agent).
  // NOTE(review): PTR records are controlled by whoever owns the IP block, so
  // a forward lookup of the returned hostname back to `ip` is normally needed
  // as well — confirm whether reverseDNS already performs that step.
  const hostname = await reverseDNS(ip);
  if (typeof hostname !== 'string' || hostname === '') return false;

  // DNS names are case-insensitive and a PTR answer may end with the root dot.
  const host = hostname.toLowerCase().replace(/\.$/, '');
  const domain = bot.rdns.toLowerCase();

  // Match on a label boundary: a bare endsWith(domain) would also accept
  // look-alike hosts such as "evilgooglebot.com".
  return host === domain || host.endsWith(`.${domain}`);
}
```
---
## 2. 风险评分算法
### 综合评分模型
```typescript
/** Per-signal inputs to the risk model; every field is a score from 0 to 100. */
interface RiskFactors {
  /** Fingerprint signal, 0-100. */
  fingerprintScore: number;
  /** Rate signal, 0-100; increases when the client exceeds the rate limit. */
  rateScore: number;
  /** IP signal, 0-100; data center / Tor / VPN. */
  ipScore: number;
  /** Behavior signal, 0-100; anomalous behavior. */
  behaviorScore: number;
}
/**
 * Combines the four risk signals into a single weighted score.
 *
 * Each factor is first forced into its documented 0-100 range, so one
 * malformed signal can neither push the total outside 0-100 nor cancel out
 * the other signals. A NaN factor is counted as 0 (no evidence) so that the
 * result is always a real number.
 *
 * @param factors Individual signal scores, each expected in 0-100.
 * @returns Integer risk score in the range 0-100.
 */
function calculateRiskScore(factors: RiskFactors): number {
  // The weights sum to 1.0, so in-range inputs always give an in-range total.
  const weights = {
    fingerprint: 0.35,
    rate: 0.30,
    ip: 0.25,
    behavior: 0.10,
  };

  // Without this, a negative factor yields a negative score and a NaN factor
  // yields NaN, which compares false against every threshold downstream.
  const clamp = (value: number): number =>
    Number.isNaN(value) ? 0 : Math.min(Math.max(value, 0), 100);

  const weighted =
    clamp(factors.fingerprintScore) * weights.fingerprint +
    clamp(factors.rateScore) * weights.rate +
    clamp(factors.ipScore) * weights.ip +
    clamp(factors.behaviorScore) * weights.behavior;

  return Math.min(Math.round(weighted), 100);
}
/**
 * Response policy: maps a risk score to the action taken for the request.
 *
 * @param score Risk score to classify.
 * @returns 'block' at 80 and above, 'challenge' at 50 and above, 'slowdown'
 *          at 30 and above, and 'allow' for everything else.
 */
function getResponseStrategy(score: number): 'allow' | 'slowdown' | 'challenge' | 'block' {
  // Ordered from the highest threshold down; the first tier reached wins.
  const tiers: ReadonlyArray<readonly [number, 'slowdown' | 'challenge' | 'block']> = [
    [80, 'block'],
    [50, 'challenge'],
    [30, 'slowdown'],
  ];

  for (const [threshold, action] of tiers) {
    if (score >= threshold) {
      return action;
    }
  }
  return 'allow';
}
```
---
## 3. Redis 数据结构设计
```
# 速率限制(滑动窗口)
ZSET rl:{ip} → { timestamp: score }
ZSET rl:{ip}:{endpoint} → { timestamp: score }
# IP 黑白名单
SET ip:blocklist → { ip1, ip2, ... }
SET ip:allowlist → { ip1, ip2, ... }(合法爬虫白名单)
SET ip:tor-exit → { ip1, ip2, ... }
SET ip:datacenter → { ip1, ip2, ... }
# 蜜罐触发记录
HASH honeypot:hits → { ip: count }
# CAPTCHA 通过记录(防止重复挑战)
STRING captcha:passed:{ip} → "1"(TTL 1 小时)
# 行为画像
HASH behavior:{ip} → {
first_seen: timestamp,
request_count: number,
path_entropy: number, # 访问路径多样性(低=爬虫)
referer_missing_ratio: number, # 缺少 Referer 比例(高=爬虫)
}
```
---
## 4. Nginx 层防护(可选,性能最佳)
```nginx
# /etc/nginx/conf.d/anti-scraping.conf
# Rate-limit and connection-limit zone definitions.
# Every zone is keyed by the client address ($binary_remote_addr) and is
# allotted 10m of shared memory.
# api: 30 requests per minute per client address.
limit_req_zone $binary_remote_addr zone=api:10m rate=30r/m;
# login: 5 requests per minute per client address.
limit_req_zone $binary_remote_addr zone=login:10m rate=5r/m;
# conn: connection tracking per client address; the actual cap is set where
# limit_conn references this zone.
limit_conn_zone $binary_remote_addr zone=conn:10m;
server {
    # Connection limit: at most 20 concurrent connections per client IP.
    limit_conn conn 20;

    # User-Agent blacklist (case-insensitive match).
    if ($http_user_agent ~* "(python|curl|wget|scrapy|selenium)") {
        return 403;
    }

    # Reject requests whose User-Agent is empty.
    if ($http_user_agent = "") {
        return 403;
    }

    # General API: "api" rate zone, bursts of up to 10 served without delay,
    # excess requests answered with 429.
    location /api/ {
        limit_req zone=api burst=10 nodelay;
        limit_req_status 429;
        proxy_pass http://app;
    }

    # Auth endpoints: stricter "login" rate zone, burst of 2.
    location /api/auth/ {
        limit_req zone=login burst=2 nodelay;
        limit_req_status 429;
        proxy_pass http://app;
    }

    # Honeypot route (real users never visit it).
    location /admin-backup/ {
        # Log every hit to a dedicated file so visitor IPs can be collected.
        access_log /var/log/nginx/honeypot.log;
        # Return fake data as JSON. default_type sets the Content-Type of the
        # `return` response itself; using add_header for Content-Type would
        # emit a second Content-Type header next to the default one.
        default_type application/json;
        return 200 '{"status":"ok"}';
    }
}
```
---
## 5. Cloudflare Workers 方案(Edge 层,最推荐)
```typescript
// workers/anti-scraping.ts
export default {
  /**
   * Edge-level request filter.
   *
   * Rejects the request with 403 when the threat-score header is above 30 or
   * the bot-score header is below 30, rejects with 429 once the client IP has
   * used up its per-minute quota, and otherwise forwards the request with
   * fetch().
   *
   * @param request Incoming request.
   * @param env Worker bindings; RATE_LIMIT is used as the counter store.
   * @returns The upstream response, or a 403 / 429 response.
   */
  async fetch(request: Request, env: Env): Promise<Response> {
    const ip = request.headers.get('CF-Connecting-IP') ?? '';

    // Cloudflare threat score; a missing header counts as 0 (no threat).
    // NOTE(review): confirm that the CF-Threat-Score / CF-Bot-Score headers are
    // actually populated on requests reaching this Worker — the Workers runtime
    // documents the bot score under request.cf.botManagement.score.
    const cfThreatScore = Number(request.headers.get('CF-Threat-Score') ?? 0);
    if (cfThreatScore > 30) {
      return new Response('Forbidden', { status: 403 });
    }

    // Cloudflare Bot Management score (requires Bot Management to be enabled);
    // a missing header counts as 100.
    const cfBotScore = Number(request.headers.get('CF-Bot-Score') ?? 100);
    if (cfBotScore < 30) {
      // Low score = high likelihood of a bot.
      return new Response('Forbidden', { status: 403 });
    }

    // Custom rate limit (Durable Objects or KV): fixed one-minute windows.
    const WINDOW_SECONDS = 60;
    const MAX_REQUESTS = 60;
    const nowSeconds = Math.floor(Date.now() / 1000);

    // The window index is part of the key, so each window starts from zero.
    // With a single key whose TTL is renewed on every write, the counter would
    // never reset for a client that keeps sending requests.
    const windowIndex = Math.floor(nowSeconds / WINDOW_SECONDS);
    const rateLimitKey = `rl:${ip}:${windowIndex}`;

    // A missing or non-numeric stored value counts as 0.
    const stored = Number(await env.RATE_LIMIT.get(rateLimitKey) ?? 0);
    const count = Number.isFinite(stored) ? stored : 0;

    // ">=" so that exactly MAX_REQUESTS requests pass per window.
    if (count >= MAX_REQUESTS) {
      const secondsUntilNextWindow = WINDOW_SECONDS - (nowSeconds % WINDOW_SECONDS);
      return new Response('Too Many Requests', {
        status: 429,
        headers: { 'Retry-After': String(secondsUntilNextWindow) },
      });
    }

    // NOTE(review): get-then-put is not atomic, so concurrent requests can
    // undercount — confirm whether a Durable Object is needed for strict limits.
    // The TTL outlives the window so a bucket is never dropped mid-window.
    await env.RATE_LIMIT.put(rateLimitKey, String(count + 1), {
      expirationTtl: WINDOW_SECONDS * 2,
    });

    return fetch(request);
  },
};
```
---
## 6. 监控 Dashboard(Datadog / Grafana 指标)
```typescript
// lib/metrics.ts — key instrumentation points
export const antiScrapingMetrics = {
  /** Counts a blocked request, tagged with the reason it was blocked. */
  blocked(reason: 'fingerprint' | 'rate' | 'honeypot' | 'ip' | 'captcha') {
    metrics.increment('anti_scraping.blocked', { reason });
  },

  /** Records one risk score in the score-distribution histogram. */
  scoreDistribution(score: number) {
    metrics.histogram('anti_scraping.risk_score', score);
  },

  // CAPTCHA shown / passed / failed counters; each returns whatever
  // metrics.increment returns.
  captchaImpressed() {
    return metrics.increment('anti_scraping.captcha.impressed');
  },
  captchaPassed() {
    return metrics.increment('anti_scraping.captcha.passed');
  },
  captchaFailed() {
    return metrics.increment('anti_scraping.captcha.failed');
  },

  /** False-positive monitoring: bumps the counter and logs a warning with the user id. */
  falsePositive(userId: string) {
    metrics.increment('anti_scraping.false_positive');
    logger.warn({ userId }, 'Possible false positive in anti-scraping');
  },
};
```
---
## 7. 测试用例
```typescript
// __tests__/anti-scraping.test.ts
describe('Anti-Scraping', () => {
  describe('Fingerprint Analysis', () => {
    // NOTE(review): `analyzeFingerpring` looks like a misspelling of
    // `analyzeFingerprint` — confirm the exported name before renaming.
    it('should flag Python requests as high risk', () => {
      const score = analyzeFingerpring(mockRequest({
        'user-agent': 'python-requests/2.28.0',
      }));
      expect(score).toBeGreaterThanOrEqual(50);
    });

    it('should not flag normal Chrome browser', () => {
      const score = analyzeFingerpring(mockRequest({
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'accept-language': 'zh-CN,zh;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'accept': 'text/html,application/xhtml+xml',
      }));
      expect(score).toBeLessThan(20);
    });
  });

  describe('Rate Limiting', () => {
    it('should block after exceeding limit', async () => {
      const ip = '192.168.1.100';
      const options = { windowMs: 60_000, limit: 60 };

      // Send 61 requests, one more than the configured limit.
      for (let i = 0; i < 61; i++) {
        await checkRateLimit(ip, options);
      }

      // Reuse the same options so the assertion is evaluated against the
      // limit configured above rather than checkRateLimit's defaults.
      const { allowed } = await checkRateLimit(ip, options);
      expect(allowed).toBe(false);
    });
  });

  describe('Honeypot', () => {
    it('should block IP that triggers honeypot', async () => {
      const ip = '10.0.0.1';
      await triggerHoneypot(ip);

      // The test expects sismember to report membership as 1.
      const isBlocked = await redis.sismember('ip:blocklist', ip);
      expect(isBlocked).toBe(1);
    });
  });
});
```