初始化

2026-03-05 21:27:11 +08:00
commit 130de0fd5d
140 changed files with 21972 additions and 0 deletions
--- a/.cursor/skills/anti-scraping/references/anti-scraping-patterns.md
+++ b/.cursor/skills/anti-scraping/references/anti-scraping-patterns.md
@@ -0,0 +1,306 @@
+# Anti-Scraping 参考模式库
+
+## 1. 常见爬虫特征指纹
+
+### User-Agent 黑名单（正则）
+
+```typescript
+/**
+ * User-Agent patterns that identify automated clients.
+ *
+ * Every pattern is case-insensitive and unanchored, so it matches anywhere
+ * in the header value. None of them uses the `g`/`y` flags, so `test()`
+ * keeps no state between calls.
+ */
+const BOT_UA_PATTERNS: readonly RegExp[] = [
+  // Scraping frameworks and HTTP libraries
+  /python-requests/i,
+  /scrapy/i,
+  /beautifulsoup/i,
+  /selenium/i,
+  /playwright/i,
+  /puppeteer/i,
+  /mechanize/i,
+  /httpclient/i,
+  /java\/\d/i,
+  /go-http-client/i,
+  // NOTE(review): very broad substring match — confirm it does not hit
+  // legitimate clients whose UA merely contains "ruby".
+  /ruby/i,
+
+  // Command-line tools and API clients
+  /curl\//i,
+  /wget\//i,
+  /httpie/i,
+  /insomnia/i,
+
+  // Headless browser signatures
+  /headlesschrome/i,
+  /phantomjs/i,
+  /slimerjs/i,
+
+  // Known data-collection crawlers
+  /dataprovider/i,
+  /yandexbot/i,
+  /mj12bot/i,
+  /ahrefsbot/i,
+  /semrushbot/i,
+  /dotbot/i,
+];
+
+/**
+ * Reports whether a User-Agent header value looks like an automated client.
+ *
+ * @param ua - Raw User-Agent header value.
+ * @returns `true` when the value is missing, empty or whitespace-only, or
+ *   when it matches any entry of `BOT_UA_PATTERNS`; otherwise `false`.
+ */
+export function isBotUA(ua: string): boolean {
+  // A blank User-Agent is itself a bot signal; the Nginx layer described in
+  // this document rejects empty UAs too, so both layers now agree.
+  if (typeof ua !== 'string' || ua.trim() === '') return true;
+
+  return BOT_UA_PATTERNS.some(pattern => pattern.test(ua));
+}
+```
+
+### 允许的搜索引擎爬虫白名单
+
+```typescript
+// Legitimate crawlers: the UA alone is trivially forged, so each entry also
+// names the DNS domain its reverse lookup must fall under.
+const ALLOWED_BOTS = [
+  { name: 'Googlebot', ua: /googlebot/i, rdns: 'googlebot.com' },
+  { name: 'Bingbot', ua: /bingbot/i, rdns: 'search.msn.com' },
+  { name: 'Baidu Spider', ua: /baiduspider/i, rdns: 'crawl.baidu.com' },
+];
+
+/**
+ * Checks that `hostname` is `domain` itself or a subdomain of it.
+ *
+ * A bare suffix test is not enough: "evilgooglebot.com" ends with
+ * "googlebot.com" but is a different registrable domain. The match must sit
+ * on a label boundary.
+ */
+function hostBelongsToDomain(hostname: string, domain: string): boolean {
+  // PTR answers may be mixed-case and may carry the trailing root dot.
+  const host = hostname.trim().toLowerCase().replace(/\.$/, '');
+  const expected = domain.toLowerCase();
+  return host === expected || host.endsWith(`.${expected}`);
+}
+
+/**
+ * Decides whether a request claiming to be an allowed search-engine crawler
+ * really originates from that crawler's network.
+ *
+ * @param ua - User-Agent header of the request.
+ * @param ip - Client IP address passed to `reverseDNS`.
+ * @returns `true` only when the UA matches an `ALLOWED_BOTS` entry and the
+ *   reverse-DNS hostname of `ip` lies inside that entry's `rdns` domain.
+ */
+async function isLegitimateBot(ua: string, ip: string): Promise<boolean> {
+  const bot = ALLOWED_BOTS.find(candidate => candidate.ua.test(ua));
+  if (!bot) return false;
+
+  // Reverse DNS check (guards against a forged UA).
+  // NOTE(review): a PTR record is controlled by whoever owns the IP block, so
+  // the hostname should also be resolved forward and compared with `ip`; no
+  // forward-lookup helper is visible here — confirm one exists and add it.
+  const hostname = await reverseDNS(ip);
+  if (typeof hostname !== 'string') return false;
+
+  return hostBelongsToDomain(hostname, bot.rdns);
+}
+```
+
+---
+
+## 2. 风险评分算法
+
+### 综合评分模型
+
+```typescript
+/** Per-signal risk inputs; each one is expected to lie in the range 0-100. */
+interface RiskFactors {
+  fingerprintScore: number;    // 0-100
+  rateScore: number;           // 0-100 (rises when the client exceeds rate limits)
+  ipScore: number;             // 0-100 (data center / Tor / VPN)
+  behaviorScore: number;       // 0-100 (behavioral anomalies)
+}
+
+/**
+ * Forces a single factor into the documented 0-100 range.
+ * `NaN` becomes 0 so one broken signal cannot poison the whole sum.
+ */
+function clampScore(value: number): number {
+  if (Number.isNaN(value)) return 0;
+  return Math.min(Math.max(value, 0), 100);
+}
+
+/**
+ * Combines the individual risk factors into one weighted score.
+ *
+ * Each factor is clamped to 0-100 before weighting. Without that, a single
+ * `NaN` factor makes the result `NaN` (which compares false against every
+ * threshold), and a negative or oversized factor can cancel or swamp the
+ * other signals.
+ *
+ * @param factors - The four per-signal scores.
+ * @returns An integer in the range 0-100.
+ */
+function calculateRiskScore(factors: RiskFactors): number {
+  // The weights sum to 1, so clamped inputs keep the result within 0-100.
+  const weights = {
+    fingerprint: 0.35,
+    rate: 0.30,
+    ip: 0.25,
+    behavior: 0.10,
+  };
+
+  const weighted =
+    clampScore(factors.fingerprintScore) * weights.fingerprint +
+    clampScore(factors.rateScore) * weights.rate +
+    clampScore(factors.ipScore) * weights.ip +
+    clampScore(factors.behaviorScore) * weights.behavior;
+
+  // Final clamp absorbs floating-point drift just above 100.
+  return Math.min(Math.max(Math.round(weighted), 0), 100);
+}
+
+/**
+ * Maps a risk score to the action taken for the request.
+ *
+ * Thresholds are checked from highest to lowest; the first one the score
+ * reaches wins. Anything below the lowest threshold, including `NaN`
+ * (which compares false against every threshold), yields 'allow'.
+ *
+ * @param score - Combined risk score.
+ * @returns 'block' at 80 and above, 'challenge' at 50 and above,
+ *   'slowdown' at 30 and above, otherwise 'allow'.
+ */
+function getResponseStrategy(score: number): 'allow' | 'slowdown' | 'challenge' | 'block' {
+  const tiers = [
+    [80, 'block'],
+    [50, 'challenge'],
+    [30, 'slowdown'],
+  ] as const;
+
+  for (const [threshold, action] of tiers) {
+    if (score >= threshold) return action;
+  }
+  return 'allow';
+}
+```
+
+---
+
+## 3. Redis 数据结构设计
+
+```
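+# 速率限制（滑动窗口）：score 为请求时间戳（毫秒），member 必须对每次请求唯一（如 "{timestamp}:{随机串}"）
+# 若直接以时间戳作为 member，同一毫秒内的多次请求会被合并成一条记录，导致计数偏低
+ZSET  rl:{ip}                     → { member: 请求唯一 ID, score: timestamp }
+ZSET  rl:{ip}:{endpoint}          → { member: 请求唯一 ID, score: timestamp }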
+
+# IP 黑白名单
+SET   ip:blocklist                → { ip1, ip2, ... }
+SET   ip:allowlist                → { ip1, ip2, ... }（合法爬虫白名单）
+SET   ip:tor-exit                 → { ip1, ip2, ... }
+SET   ip:datacenter               → { ip1, ip2, ... }
+
+# 蜜罐触发记录
+HASH  honeypot:hits               → { ip: count }
+
+# CAPTCHA 通过记录（防止重复挑战）
+STRING captcha:passed:{ip}        → "1"（TTL 1 小时）
+
+# 行为画像
+HASH  behavior:{ip}               → {
+  first_seen: timestamp,
+  request_count: number,
+  path_entropy: number,           # 访问路径多样性（低=爬虫）
+  referer_missing_ratio: number,  # 缺少 Referer 比例（高=爬虫）
+}
+```
+
+---
+
+## 4. Nginx 层防护（可选，性能最佳）
+
+```nginx
+# /etc/nginx/conf.d/anti-scraping.conf
+#
+# Edge protection in Nginx: per-IP request and connection limits, a
+# User-Agent deny list, and a honeypot route.
+
+# Rate-limit zones, keyed by client address
+limit_req_zone $binary_remote_addr zone=api:10m rate=30r/m;
+limit_req_zone $binary_remote_addr zone=login:10m rate=5r/m;
+limit_conn_zone $binary_remote_addr zone=conn:10m;
+
+server {
+  # Connection limit (at most 20 concurrent connections per IP)
+  limit_conn conn 20;
+
+  # User-Agent deny list (case-insensitive match)
+  if ($http_user_agent ~* "(python|curl|wget|scrapy|selenium)") {
+    return 403;
+  }
+
+  # Reject requests with an empty User-Agent
+  if ($http_user_agent = "") {
+    return 403;
+  }
+
+  location /api/ {
+    limit_req zone=api burst=10 nodelay;
+    limit_req_status 429;
+
+    proxy_pass http://app;
+  }
+
+  # Longest-prefix match sends /api/auth/ here rather than to /api/, so only
+  # the stricter login zone applies to these requests.
+  location /api/auth/ {
+    limit_req zone=login burst=2 nodelay;
+    limit_req_status 429;
+
+    proxy_pass http://app;
+  }
+
+  # Honeypot route (real users never request it)
+  location /admin-backup/ {
+    access_log /var/log/nginx/honeypot.log;
+    # Log the visitor's IP and answer with fake data.
+    # The body produced by `return` takes its Content-Type from default_type;
+    # using add_header here would emit a second Content-Type header instead
+    # of replacing that one.
+    default_type application/json;
+    return 200 '{"status":"ok"}';
+  }
+}
+```
+
+---
+
+## 5. Cloudflare Workers 方案（Edge 层，最推荐）
+
+```typescript
+// workers/anti-scraping.ts
+
+/** Length of one rate-limit window, in seconds. */
+const RATE_LIMIT_WINDOW_SECONDS = 60;
+
+/** Number of requests allowed per client IP within one window. */
+const RATE_LIMIT_MAX_REQUESTS = 60;
+
+/**
+ * Edge filter that runs before the origin: rejects requests with a poor
+ * threat or bot score, applies a per-IP fixed-window rate limit backed by
+ * `env.RATE_LIMIT`, and forwards everything else unchanged.
+ *
+ * Responses: 403 for a threat score above 30 or a bot score below 30,
+ * 429 with `Retry-After` once the window quota is used up, otherwise the
+ * upstream response.
+ */
+export default {
+  async fetch(request: Request, env: Env): Promise<Response> {
+    const ip = request.headers.get('CF-Connecting-IP') ?? '';
+
+    // Cloudflare threat score.
+    // NOTE(review): confirm that the CF-Threat-Score and CF-Bot-Score headers
+    // are actually present on requests reaching the Worker; Bot Management
+    // scores are normally read from `request.cf.botManagement.score`.
+    const cfThreatScore = Number(request.headers.get('CF-Threat-Score') ?? 0);
+    if (cfThreatScore > 30) {
+      return new Response('Forbidden', { status: 403 });
+    }
+
+    // Cloudflare Bot Management score (requires Bot Management).
+    // A low score means the request is likely automated.
+    const cfBotScore = Number(request.headers.get('CF-Bot-Score') ?? 100);
+    if (cfBotScore < 30) {
+      return new Response('Forbidden', { status: 403 });
+    }
+
+    // Custom rate limit. The window index is part of the key so the counter
+    // starts from zero each window. With a single key whose TTL is renewed
+    // on every write, the count never resets while traffic keeps arriving.
+    // NOTE(review): a KV read followed by a write is not atomic and KV is
+    // eventually consistent, so this count is approximate; use a Durable
+    // Object if exact limits are required.
+    const nowSeconds = Math.floor(Date.now() / 1000);
+    const windowIndex = Math.floor(nowSeconds / RATE_LIMIT_WINDOW_SECONDS);
+    const rateLimitKey = `rl:${ip}:${windowIndex}`;
+
+    // A corrupt stored value would parse to NaN and switch the limiter off
+    // for good, so fall back to zero.
+    const stored = Number(await env.RATE_LIMIT.get(rateLimitKey) ?? 0);
+    const count = Number.isFinite(stored) ? stored : 0;
+
+    // `count` is the number of requests already served in this window, so
+    // the quota is exhausted as soon as it reaches the maximum.
+    if (count >= RATE_LIMIT_MAX_REQUESTS) {
+      const secondsLeft = RATE_LIMIT_WINDOW_SECONDS - (nowSeconds % RATE_LIMIT_WINDOW_SECONDS);
+      return new Response('Too Many Requests', {
+        status: 429,
+        headers: { 'Retry-After': String(secondsLeft) },
+      });
+    }
+
+    // Keep the key for two windows so it outlives the window it counts.
+    await env.RATE_LIMIT.put(rateLimitKey, String(count + 1), {
+      expirationTtl: RATE_LIMIT_WINDOW_SECONDS * 2,
+    });
+    return fetch(request);
+  },
+};
+```
+
+---
+
+## 6. 监控 Dashboard（Datadog / Grafana 指标）
+
+```typescript
+// lib/metrics.ts — key instrumentation points
+
+/** Reason label attached to the "blocked" counter. */
+type BlockReason = 'fingerprint' | 'rate' | 'honeypot' | 'ip' | 'captcha';
+
+/** Thin wrappers around `metrics` / `logger` for the anti-scraping pipeline. */
+export const antiScrapingMetrics = {
+  /** Counts a blocked request, tagged with the reason it was blocked. */
+  blocked(reason: BlockReason) {
+    metrics.increment('anti_scraping.blocked', { reason });
+  },
+
+  /** Records one risk score into the score histogram. */
+  scoreDistribution(score: number) {
+    metrics.histogram('anti_scraping.risk_score', score);
+  },
+
+  /** Counts a CAPTCHA being shown. */
+  captchaImpressed() {
+    return metrics.increment('anti_scraping.captcha.impressed');
+  },
+
+  /** Counts a CAPTCHA being solved. */
+  captchaPassed() {
+    return metrics.increment('anti_scraping.captcha.passed');
+  },
+
+  /** Counts a CAPTCHA being failed. */
+  captchaFailed() {
+    return metrics.increment('anti_scraping.captcha.failed');
+  },
+
+  /** Counts a suspected false positive and logs the affected user id. */
+  falsePositive(userId: string) {
+    metrics.increment('anti_scraping.false_positive');
+    logger.warn({ userId }, 'Possible false positive in anti-scraping');
+  },
+};
+```
+
+---
+
+## 7. 测试用例
+
+```typescript
+// __tests__/anti-scraping.test.ts
+// Example suite covering fingerprint scoring, rate limiting and the honeypot.
+describe('Anti-Scraping', () => {
+  describe('Fingerprint Analysis', () => {
+    // NOTE(review): the helper was spelled `analyzeFingerpring` here; this
+    // assumes the implementation is named `analyzeFingerprint` — confirm.
+    it('should flag Python requests as high risk', () => {
+      const score = analyzeFingerprint(mockRequest({
+        'user-agent': 'python-requests/2.28.0',
+      }));
+      expect(score).toBeGreaterThanOrEqual(50);
+    });
+
+    it('should not flag normal Chrome browser', () => {
+      const score = analyzeFingerprint(mockRequest({
+        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
+        'accept-language': 'zh-CN,zh;q=0.9',
+        'accept-encoding': 'gzip, deflate, br',
+        'accept': 'text/html,application/xhtml+xml',
+      }));
+      expect(score).toBeLessThan(20);
+    });
+  });
+
+  describe('Rate Limiting', () => {
+    it('should block after exceeding limit', async () => {
+      const ip = '192.168.1.100';
+      // Use one options object for every call so the final check is judged
+      // against the same window and limit that the loop filled, rather than
+      // whatever defaults `checkRateLimit` applies.
+      const options = { windowMs: 60_000, limit: 60 };
+
+      // Send 61 requests, one more than the limit.
+      for (let i = 0; i < 61; i++) {
+        await checkRateLimit(ip, options);
+      }
+      const { allowed } = await checkRateLimit(ip, options);
+      expect(allowed).toBe(false);
+    });
+  });
+
+  describe('Honeypot', () => {
+    it('should block IP that triggers honeypot', async () => {
+      const ip = '10.0.0.1';
+      await triggerHoneypot(ip);
+      // The test expects the triggering IP to be a member of the blocklist set.
+      const isBlocked = await redis.sismember('ip:blocklist', ip);
+      expect(isBlocked).toBe(1);
+    });
+  });
+});
+```