Anti-Scraping 参考模式库

1. 常见爬虫特征指纹

User-Agent 黑名单（正则）

// Scraping / automation frameworks and generic HTTP client libraries.
const FRAMEWORK_UA_PATTERNS = [
  /python-requests/i,
  /scrapy/i,
  /beautifulsoup/i,
  /selenium/i,
  /playwright/i,
  /puppeteer/i,
  /mechanize/i,
  /httpclient/i,
  /java\/\d/i,
  /go-http-client/i,
  /ruby/i,
];

// Command-line and desktop HTTP tools.
const CLI_TOOL_UA_PATTERNS = [/curl\//i, /wget\//i, /httpie/i, /insomnia/i];

// Headless browser markers.
const HEADLESS_BROWSER_UA_PATTERNS = [/headlesschrome/i, /phantomjs/i, /slimerjs/i];

// Known data-collection crawlers.
const DATA_COLLECTOR_UA_PATTERNS = [
  /dataprovider/i,
  /yandexbot/i,
  /mj12bot/i,
  /ahrefsbot/i,
  /semrushbot/i,
  /dotbot/i,
];

/**
 * Case-insensitive User-Agent deny-list, assembled from the category groups
 * above in their declared order. None of the patterns carries the `g` flag,
 * so calling `test()` on them repeatedly is stateless.
 */
const BOT_UA_PATTERNS = [
  ...FRAMEWORK_UA_PATTERNS,
  ...CLI_TOOL_UA_PATTERNS,
  ...HEADLESS_BROWSER_UA_PATTERNS,
  ...DATA_COLLECTOR_UA_PATTERNS,
];

/**
 * Reports whether a User-Agent string matches any entry of BOT_UA_PATTERNS.
 *
 * @param ua - Raw User-Agent header value.
 * @returns `true` on the first matching pattern, `false` when none match
 *   (which includes the empty string).
 */
export function isBotUA(ua: string): boolean {
  for (const pattern of BOT_UA_PATTERNS) {
    if (pattern.test(ua)) {
      return true;
    }
  }
  return false;
}

允许的搜索引擎爬虫白名单

// Search-engine crawlers that are allowed through. A matching User-Agent is
// not sufficient on its own: each entry carries the domain (`rdns`) that the
// caller's reverse-DNS hostname is checked against.
const ALLOWED_BOTS = [
  {
    name: 'Googlebot',
    ua: /googlebot/i,
    rdns: 'googlebot.com',
  },
  {
    name: 'Bingbot',
    ua: /bingbot/i,
    rdns: 'search.msn.com',
  },
  {
    name: 'Baidu Spider',
    ua: /baiduspider/i,
    rdns: 'crawl.baidu.com',
  },
];

/**
 * Checks whether a request that claims to be an allowed search-engine crawler
 * really originates from that crawler's network.
 *
 * The User-Agent selects an entry from ALLOWED_BOTS; the caller's IP is then
 * resolved via `reverseDNS` and the hostname must be the entry's `rdns` domain
 * itself or a subdomain of it.
 *
 * @param ua - Raw User-Agent header value.
 * @param ip - Client IP address to resolve.
 * @returns `true` only when the UA matches an allowed bot AND the reverse-DNS
 *   hostname falls under that bot's domain; `false` otherwise (including when
 *   no hostname is returned).
 */
async function isLegitimateBot(ua: string, ip: string): Promise<boolean> {
  const bot = ALLOWED_BOTS.find(b => b.ua.test(ua));
  if (!bot) return false;

  // Reverse-DNS check guards against a spoofed User-Agent.
  // NOTE(review): a PTR record is controlled by whoever owns the IP block, so
  // a forward lookup of the returned hostname should also resolve back to
  // `ip` — confirm whether `reverseDNS` already does this.
  const hostname = await reverseDNS(ip);
  if (!hostname) return false;

  // Hostnames are case-insensitive and may come back fully qualified with a
  // trailing dot; normalise both before comparing.
  const host = hostname.toLowerCase().replace(/\.$/, '');
  const domain = bot.rdns.toLowerCase();

  // Require a label boundary: a bare suffix test would accept
  // "evilgooglebot.com" for "googlebot.com".
  return host === domain || host.endsWith(`.${domain}`);
}

2. 风险评分算法

综合评分模型

/** Per-signal inputs to the combined risk score; each field is a 0-100 value. */
interface RiskFactors {
  /** Request fingerprint signal, 0-100. */
  fingerprintScore: number;
  /** Request-rate signal, 0-100; rises when the client exceeds its rate. */
  rateScore: number;
  /** IP reputation signal, 0-100 (data center / Tor / VPN). */
  ipScore: number;
  /** Behavioural anomaly signal, 0-100. */
  behaviorScore: number;
}

/**
 * Combines the individual risk signals into one integer score in [0, 100].
 *
 * Weights: fingerprint 0.35, rate 0.30, ip 0.25, behavior 0.10 (sum 1.0).
 * Each factor is clamped to [0, 100] before weighting so that an out-of-range
 * value cannot cancel or inflate the other signals. A NaN factor contributes
 * 0: left alone it would turn the whole score into NaN, and every `>=`
 * threshold comparison against NaN is false.
 *
 * @param factors - The four per-signal scores, each expected in 0-100.
 * @returns The weighted score, rounded to the nearest integer, within 0-100.
 */
function calculateRiskScore(factors: RiskFactors): number {
  const weights = {
    fingerprint: 0.35,
    rate: 0.30,
    ip: 0.25,
    behavior: 0.10,
  };

  // Math.max/Math.min propagate NaN, so it has to be handled explicitly.
  const clamp = (value: number): number =>
    Number.isNaN(value) ? 0 : Math.min(Math.max(value, 0), 100);

  const weighted =
    clamp(factors.fingerprintScore) * weights.fingerprint +
    clamp(factors.rateScore) * weights.rate +
    clamp(factors.ipScore) * weights.ip +
    clamp(factors.behaviorScore) * weights.behavior;

  // The outer cap only guards against floating-point drift above 100.
  return Math.min(Math.round(weighted), 100);
}

/**
 * Maps a risk score to the response strategy.
 *
 * Thresholds are checked from highest to lowest: >= 80 'block',
 * >= 50 'challenge', >= 30 'slowdown'; anything else (including NaN, which
 * fails every comparison) is 'allow'.
 *
 * @param score - Combined risk score.
 * @returns The strategy for the first threshold the score reaches.
 */
function getResponseStrategy(score: number): 'allow' | 'slowdown' | 'challenge' | 'block' {
  const tiers: ReadonlyArray<readonly [number, 'slowdown' | 'challenge' | 'block']> = [
    [80, 'block'],
    [50, 'challenge'],
    [30, 'slowdown'],
  ];

  for (const [floor, action] of tiers) {
    if (score >= floor) {
      return action;
    }
  }
  return 'allow';
}

3. Redis 数据结构设计

# 速率限制（滑动窗口）
ZSET  rl:{ip}                     → { member: 唯一请求标识, score: 请求时间戳 }
ZSET  rl:{ip}:{endpoint}          → { member: 唯一请求标识, score: 请求时间戳 }

# IP 黑白名单
SET   ip:blocklist                → { ip1, ip2, ... }
SET   ip:allowlist                → { ip1, ip2, ... }（合法爬虫白名单）
SET   ip:tor-exit                 → { ip1, ip2, ... }
SET   ip:datacenter               → { ip1, ip2, ... }

# 蜜罐触发记录
HASH  honeypot:hits               → { ip: count }

# CAPTCHA 通过记录（防止重复挑战）
STRING captcha:passed:{ip}        → "1"（TTL 1 小时）

# 行为画像
HASH  behavior:{ip}               → {
  first_seen: timestamp,
  request_count: number,
  path_entropy: number,           # 访问路径多样性（低=爬虫）
  referer_missing_ratio: number,  # 缺少 Referer 比例（高=爬虫）
}

4. Nginx 层防护（可选，性能最佳）

# /etc/nginx/conf.d/anti-scraping.conf

# Rate-limit zone definitions, keyed by client address ($binary_remote_addr).
# "api" allows 30 requests per minute and "login" 5 requests per minute per
# address; each zone is given 10m of shared memory for its state.
limit_req_zone $binary_remote_addr zone=api:10m rate=30r/m;
limit_req_zone $binary_remote_addr zone=login:10m rate=5r/m;
# Connection-count zone, also keyed by client address (10m of shared memory).
limit_conn_zone $binary_remote_addr zone=conn:10m;

server {
  # Connection limit: at most 20 concurrent connections per client IP.
  limit_conn conn 20;

  # User-Agent deny-list (case-insensitive regex match).
  if ($http_user_agent ~* "(python|curl|wget|scrapy|selenium)") {
    return 403;
  }

  # Reject requests whose User-Agent is empty or missing.
  if ($http_user_agent = "") {
    return 403;
  }

  # General API traffic: "api" zone with a burst of 10, excess answered with 429.
  location /api/ {
    limit_req zone=api burst=10 nodelay;
    limit_req_status 429;

    proxy_pass http://app;
  }

  # Auth endpoints: stricter "login" zone with a burst of 2. As the longer
  # prefix, this location wins over /api/ for matching requests.
  location /api/auth/ {
    limit_req zone=login burst=2 nodelay;
    limit_req_status 429;

    proxy_pass http://app;
  }

  # Honeypot route (real users never visit it).
  location /admin-backup/ {
    # Record the visitor in a dedicated log, then return fake data.
    access_log /var/log/nginx/honeypot.log;
    # `return` with a body takes its Content-Type from default_type;
    # add_header would append a second Content-Type header instead of
    # setting the type.
    default_type application/json;
    return 200 '{"status":"ok"}';
  }
}

5. Cloudflare Workers 方案（Edge 层，最推荐）

// workers/anti-scraping.ts
/**
 * Edge worker that screens each request before forwarding it to the origin.
 *
 * Checks, in order:
 *  1. `CF-Threat-Score` header above 30 -> 403.
 *  2. `CF-Bot-Score` header below 30 -> 403 (low score = likely bot).
 *  3. More than 60 counted requests from the client IP in the current
 *     60-second window -> 429.
 * Requests that pass all checks are counted and forwarded with `fetch`.
 *
 * NOTE(review): the two scores are read from request headers here. Confirm
 * these headers are actually injected for this zone; the Workers runtime
 * normally exposes bot data on `request.cf.botManagement`.
 */
export default {
  async fetch(request: Request, env: Env): Promise<Response> {
    const ip = request.headers.get('CF-Connecting-IP') ?? '';

    // Cloudflare threat score; treated as 0 when the header is absent.
    const cfThreatScore = Number(request.headers.get('CF-Threat-Score') ?? 0);
    if (cfThreatScore > 30) {
      return new Response('Forbidden', { status: 403 });
    }

    // Cloudflare Bot Management score (requires Bot Management to be
    // enabled); treated as 100 when the header is absent.
    const cfBotScore = Number(request.headers.get('CF-Bot-Score') ?? 100);
    if (cfBotScore < 30) {
      // Low score = high likelihood of a bot.
      return new Response('Forbidden', { status: 403 });
    }

    // Custom rate limit. The key includes the current window index so each
    // window starts from zero. With one key per IP, every put() renewed the
    // TTL, the counter never expired for a client that kept sending requests,
    // and slow but steady legitimate traffic eventually hit the limit.
    // NOTE(review): get-then-put is not atomic, so concurrent requests can
    // undercount; a Durable Object would give an exact counter.
    const windowSeconds = 60;
    const windowIndex = Math.floor(Date.now() / (windowSeconds * 1000));
    const rateLimitKey = `rl:${ip}:${windowIndex}`;
    const count = Number(await env.RATE_LIMIT.get(rateLimitKey) ?? 0);
    if (count > 60) {
      return new Response('Too Many Requests', {
        status: 429,
        headers: { 'Retry-After': '60' },
      });
    }

    // The TTL only garbage-collects finished windows; it is kept longer than
    // the window so the key cannot vanish while its window is still current.
    await env.RATE_LIMIT.put(rateLimitKey, String(count + 1), {
      expirationTtl: windowSeconds * 2,
    });
    return fetch(request);
  },
};

6. 监控 Dashboard（Datadog / Grafana 指标）

// lib/metrics.ts — key instrumentation points
/**
 * Thin wrappers around the `metrics` client (and `logger` for false
 * positives) so call sites emit consistently named anti-scraping metrics.
 */
export const antiScrapingMetrics = {
  /** Counts a blocked request, tagged with the reason it was blocked. */
  blocked(reason: 'fingerprint' | 'rate' | 'honeypot' | 'ip' | 'captcha') {
    metrics.increment('anti_scraping.blocked', { reason });
  },

  /** Records one risk score into the score-distribution histogram. */
  scoreDistribution(score: number) {
    metrics.histogram('anti_scraping.risk_score', score);
  },

  /** Counts a CAPTCHA being shown. */
  captchaImpressed() {
    return metrics.increment('anti_scraping.captcha.impressed');
  },

  /** Counts a passed CAPTCHA. */
  captchaPassed() {
    return metrics.increment('anti_scraping.captcha.passed');
  },

  /** Counts a failed CAPTCHA. */
  captchaFailed() {
    return metrics.increment('anti_scraping.captcha.failed');
  },

  /** Counts a suspected false positive and logs a warning with the user id. */
  falsePositive(userId: string) {
    metrics.increment('anti_scraping.false_positive');
    logger.warn({ userId }, 'Possible false positive in anti-scraping');
  },
};

7. 测试用例

// __tests__/anti-scraping.test.ts
// Test suite covering fingerprint scoring, rate limiting and the honeypot.
describe('Anti-Scraping', () => {
  describe('Fingerprint Analysis', () => {
    // NOTE(review): `analyzeFingerpring` looks like a misspelling of
    // `analyzeFingerprint`; confirm against the helper's actual exported name
    // before renaming it here.
    it('should flag Python requests as high risk', () => {
      const score = analyzeFingerpring(mockRequest({
        'user-agent': 'python-requests/2.28.0',
      }));
      expect(score).toBeGreaterThanOrEqual(50);
    });

    it('should not flag normal Chrome browser', () => {
      const score = analyzeFingerpring(mockRequest({
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'accept-language': 'zh-CN,zh;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'accept': 'text/html,application/xhtml+xml',
      }));
      expect(score).toBeLessThan(20);
    });
  });

  describe('Rate Limiting', () => {
    it('should block after exceeding limit', async () => {
      const ip = '192.168.1.100';
      // The same options go to every call so the final assertion is checked
      // against the limit this test sets, not the helper's defaults.
      const options = { windowMs: 60_000, limit: 60 };
      // Send 61 requests, one more than the limit.
      for (let i = 0; i < 61; i++) {
        await checkRateLimit(ip, options);
      }
      const { allowed } = await checkRateLimit(ip, options);
      expect(allowed).toBe(false);
    });
  });

  describe('Honeypot', () => {
    it('should block IP that triggers honeypot', async () => {
      const ip = '10.0.0.1';
      await triggerHoneypot(ip);
      // NOTE(review): assumes the Redis client returns the integer 1 for
      // membership — confirm against the client in use.
      const isBlocked = await redis.sismember('ip:blocklist', ip);
      expect(isBlocked).toBe(1);
    });
  });
});

7.7 KiB Raw Blame History Unescape Escape

Anti-Scraping 参考模式库

1. 常见爬虫特征指纹

User-Agent 黑名单（正则）

允许的搜索引擎爬虫白名单

2. 风险评分算法

综合评分模型

3. Redis 数据结构设计

4. Nginx 层防护（可选，性能最佳）

5. Cloudflare Workers 方案（Edge 层，最推荐）

6. 监控 Dashboard（Datadog / Grafana 指标）

7. 测试用例

7.7 KiB

Raw Blame History