# robots.txt for analyticslegends.ai
# Version: v72.70.4 · Updated: 2026-04-26
#
# v72.49.1 hotfix — operator directive 2026-04-25: a sibling Claude
# conversation reported "Same wall" minutes after v72.49 deployed.
# Diagnosis: although section 1 groups the 5 Anthropic UAs under one
# shared Disallow/Allow set, some crawlers implement only a simplified
# subset of RFC 9309 and match a shared multi-UA group less reliably
# than a dedicated per-UA block.
# v72.49.1 prepends section 0: separate per-User-agent blocks for
# ClaudeBot, Claude-Web, Claude-SearchBot, Claude-User, anthropic-ai
# AND Anthropic-AI (the capitalised variant), each with its own
# explicit "Allow: /". Section 1 is unchanged. Policy is identical
# to v72.49 — same five UAs allowed, training prohibited per ToS.
#
# Policy (v72.49, partial reversal of v70.84):
# - Operator directive (2026-04-25, urgent): "make sure claude can
# access https://analyticslegends.ai". v70.84 had blanket-blocked
# every named AI crawler including Anthropic's ClaudeBot family.
# v72.49 carves Anthropic out of that block — ClaudeBot, Claude-Web,
# Claude-SearchBot, Claude-User and the anthropic-ai user-agent are
# now explicitly ALLOWED on public pages, while the same private-data
# directories (headhunter master, consultants, agencies, stakeholders)
# stay blocked for them, exactly as they are for traditional search
# engines.
#
# - The carve-out aligns with /api/about.json + /.well-known/agent.json
# (shipped in v72.48), whose policy block declares
# `training_data_use: "prohibited"` and
# `answer_retrieval_via_manifest: "permitted_with_attribution"`.
# ClaudeBot is now welcome to retrieve content for AI-answer
# responses with attribution; training is still forbidden by the
# Terms of Service and the noai meta tag.
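# For reference, the relevant policy block looks roughly like this
# (field names as quoted above; the authoritative copy lives in
# /.well-known/agent.json, and the surrounding structure shown here is
# an assumption):
#   "policy": {
#     "training_data_use": "prohibited",
#     "answer_retrieval_via_manifest": "permitted_with_attribution"
#   }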
#
# - Traditional search engines (Googlebot, Bingbot, DuckDuckBot,
# etc.) remain welcome on public marketing pages so the site
# remains discoverable by humans via web search. They are blocked
# from the proprietary data directories.
#
# - Other AI / LLM crawlers remain blocked from EVERY path. The
# v70.84 directive ("no AI agent may pull content") still applies
# to every vendor except Anthropic. If a future operator directive
# extends the carve-out to other vendors, add them to section 1
# and remove them from section 2.
#
# - Abusive SEO link-graph scrapers (Semrush, Ahrefs, etc.) remain
# blocked — no change.
#
# The Terms of Service explicitly prohibit AI-training extraction
# regardless of crawler identity. Permitted use for ClaudeBot et al.
# is real-time answer retrieval with source attribution — see
# /llms.txt and /.well-known/agent.json for the structured policy.
# ========================================================================
# 0. Anthropic (Claude) — per-User-agent explicit Allow blocks
# ========================================================================
# v72.49.1 hotfix — operator directive 2026-04-25 (verbatim: "make sure
# claude can access https://analyticslegends.ai"). A sibling Claude
# conversation still hit a wall in real time even after v72.49 deployed
# the carve-out. Diagnosis: some crawlers implement only a simplified
# subset of RFC 9309 and match a SHARED block (5 Claude UAs grouped
# under one Disallow/Allow set) less reliably than a SEPARATE block per
# UA. Section 0
# below states the Allow rule per-User-agent in the canonical RFC 9309
# format with explicit "Allow: /" lines, so any conformant parser sees
# an unambiguous green light. Each per-UA block lists the private-data
# Disallow rules ahead of its "Allow: /" so that first-match-wins
# parsers resolve them correctly too. Section 1 below keeps the grouped
# block; a conformant parser combines identical-UA groups into one
# (RFC 9309), so both routes yield the same private-data block.
#
# This is policy-equivalent to v72.49 — same five Anthropic UAs are
# allowed, same training-use prohibition stands per the Terms of
# Service and the noai meta tag.
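#
# To sanity-check how a given parser resolves these rules, a minimal
# sketch with Python's stdlib robotparser (note: urllib.robotparser
# applies first-match-wins inside a group and uses only the first
# matching group, not RFC 9309 longest-match merging, so its verdicts
# can differ from stricter parsers; that is the very variance this
# hotfix works around):
#
#   import urllib.robotparser
#
#   rp = urllib.robotparser.RobotFileParser()
#   rp.set_url("https://analyticslegends.ai/robots.txt")
#   rp.read()  # fetch and parse this file
#   print(rp.can_fetch("ClaudeBot", "https://analyticslegends.ai/"))
#   # expected: True  (public pages allowed under the carve-out)
#   print(rp.can_fetch("GPTBot", "https://analyticslegends.ai/"))
#   # expected: False (section 2 blocks every path)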
User-agent: ClaudeBot
Disallow: /api/consultants
Disallow: /api/consultant-matches.json
Disallow: /api/opportunity-matches.json
Disallow: /api/opportunity-matches.csv
Disallow: /api/stakeholders-firm.json
Disallow: /api/private/
Disallow: /data/private/
Disallow: /honeypot/
Allow: /
User-agent: Claude-Web
Disallow: /api/consultants
Disallow: /api/consultant-matches.json
Disallow: /api/opportunity-matches.json
Disallow: /api/opportunity-matches.csv
Disallow: /api/stakeholders-firm.json
Disallow: /api/private/
Disallow: /data/private/
Disallow: /honeypot/
Allow: /
User-agent: Claude-SearchBot
Disallow: /api/consultants
Disallow: /api/consultant-matches.json
Disallow: /api/opportunity-matches.json
Disallow: /api/opportunity-matches.csv
Disallow: /api/stakeholders-firm.json
Disallow: /api/private/
Disallow: /data/private/
Disallow: /honeypot/
Allow: /
User-agent: Claude-User
Disallow: /api/consultants
Disallow: /api/consultant-matches.json
Disallow: /api/opportunity-matches.json
Disallow: /api/opportunity-matches.csv
Disallow: /api/stakeholders-firm.json
Disallow: /api/private/
Disallow: /data/private/
Disallow: /honeypot/
Allow: /
User-agent: anthropic-ai
Disallow: /api/consultants
Disallow: /api/consultant-matches.json
Disallow: /api/opportunity-matches.json
Disallow: /api/opportunity-matches.csv
Disallow: /api/stakeholders-firm.json
Disallow: /api/private/
Disallow: /data/private/
Disallow: /honeypot/
Allow: /
# Capitalised variant — RFC 9309 user-agent matching is case-insensitive,
# but a few nonconformant parsers compare case-sensitively; an explicit
# dual-case entry costs nothing and removes that ambiguity.
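# (Illustration: a conformant matcher treats the product tokens
# "anthropic-ai", "Anthropic-AI" and "ANTHROPIC-AI" as identical.)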
User-agent: Anthropic-AI
Disallow: /api/consultants
Disallow: /api/consultant-matches.json
Disallow: /api/opportunity-matches.json
Disallow: /api/opportunity-matches.csv
Disallow: /api/stakeholders-firm.json
Disallow: /api/private/
Disallow: /data/private/
Disallow: /honeypot/
Allow: /
# ========================================================================
# 1. Traditional search engines + Anthropic — crawl public pages,
# block private data
# ========================================================================
# --- Traditional human-search engines ---
User-agent: Googlebot
User-agent: Googlebot-News
User-agent: Googlebot-Image
User-agent: Googlebot-Video
User-agent: Storebot-Google
User-agent: AdsBot-Google
User-agent: Bingbot
User-agent: MSNBot
User-agent: msnbot-media
User-agent: AdIdxBot
User-agent: DuckDuckBot
User-agent: Slurp
User-agent: YandexBot
User-agent: YandexImages
User-agent: Baiduspider
User-agent: Baiduspider-image
User-agent: Naverbot
User-agent: Yeti
User-agent: Seznam
User-agent: Qwantify
User-agent: MojeekBot
User-agent: ExaBot
User-agent: Ecosia
# --- Anthropic (Claude) — v72.49 carve-out ---
# These UAs are welcome to retrieve content for answer engines with
# attribution. Training use is still prohibited by the Terms of Service.
User-agent: ClaudeBot
User-agent: Claude-Web
User-agent: Claude-SearchBot
User-agent: Claude-User
User-agent: anthropic-ai
Disallow: /api/consultants
Disallow: /api/consultant-matches.json
Disallow: /api/opportunity-matches.json
Disallow: /api/opportunity-matches.csv
Disallow: /api/stakeholders-firm.json
Disallow: /api/private/
Disallow: /data/private/
Disallow: /honeypot/
Allow: /
Allow: /.well-known/agent.json
Allow: /api/about.json
Allow: /api/contracts.json
Allow: /api/contracts.meta.json
Allow: /api/agencies.json
Allow: /api/concepts.json
Allow: /api/academy.json
Allow: /api/news.json
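# Worked example of RFC 9309 most-specific (longest) match for this
# group, with illustrative paths: /api/private/keys.json matches both
# "Allow: /" (1 octet) and "Disallow: /api/private/" (13 octets); the
# longer rule wins, so it is blocked. A public page such as /pricing
# matches only "Allow: /" and is crawlable.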
# ========================================================================
# 2. Other AI agents — BLOCKED from every path (v70.84 stance preserved)
# ========================================================================
# Operator directive (2026-04-24, as amended by the 2026-04-25
# Anthropic carve-out): no AI agent other than Anthropic's may pull
# content from the platform. This covers LLM training crawlers,
# AI-answer engines, and AI-search bots from every vendor other than
# Anthropic. If your AI crawler is not listed below, you still fall
# under the catch-all (section 4) and the noai / noimageai meta
# directives. Please honour them.
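# For reference, the page-level signal referred to throughout is the
# meta tag shipped in index.html (exact markup here is an assumption):
#   <meta name="robots" content="noai, noimageai">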
# --- OpenAI (ChatGPT) ---
User-agent: GPTBot
User-agent: OAI-SearchBot
User-agent: ChatGPT-User
# --- Google (Gemini / Bard / Vertex) ---
User-agent: Google-Extended
User-agent: GoogleOther
User-agent: GoogleOther-Image
User-agent: GoogleOther-Video
# --- Microsoft (Copilot / Bing Chat) ---
User-agent: CopilotBot
User-agent: BingPreview
# --- Apple (Apple Intelligence) ---
User-agent: Applebot-Extended
# --- Amazon (Alexa / Rufus / Nova) ---
User-agent: Amazonbot
# --- Meta (Llama / Meta AI) ---
User-agent: FacebookBot
User-agent: Meta-ExternalAgent
User-agent: Meta-ExternalFetcher
# --- Perplexity ---
User-agent: PerplexityBot
User-agent: Perplexity-User
# --- Mistral AI ---
User-agent: MistralAI-User
User-agent: Mistral-Nemo
# --- Cohere ---
User-agent: cohere-ai
User-agent: cohere-training-data-crawler
User-agent: cohere-image-crawler
# --- xAI (Grok) ---
User-agent: xAI-Crawler
User-agent: Grok
User-agent: GrokBot
# --- You.com ---
User-agent: YouBot
# --- Kagi ---
User-agent: KagiBot
User-agent: Kagi-Assistant
# --- Phind ---
User-agent: PhindBot
# --- Poe (Quora) ---
User-agent: Poe
User-agent: PoeBot
# --- DeepSeek ---
User-agent: DeepSeekBot
# --- Yandex AI (YandexGPT) ---
User-agent: YandexAdditional
User-agent: YandexAdditionalBot
# --- ByteDance (Doubao) ---
User-agent: Bytespider
# --- Baidu (Ernie) ---
User-agent: Baidu-Ernie
# --- Allen Institute for AI ---
User-agent: AI2Bot
User-agent: AI2Bot-Dolma
# --- Hugging Face ---
User-agent: HuggingFace-Hub
# --- Tavily (AI search API) ---
User-agent: TavilyBot
# --- Webz.io (formerly OmgiliBot) ---
User-agent: Webzio-Extended
User-agent: OmgiliBot
User-agent: omgili
# --- Hive (image AI) ---
User-agent: ImagesiftBot
# --- Diffbot (knowledge graph / AI) ---
User-agent: Diffbot
# --- Common Crawl (feeds many LLMs) ---
User-agent: CCBot
# --- Timpi (decentralised AI search) ---
User-agent: TimpiBot
# --- FriendlyCrawler (AI training) ---
User-agent: FriendlyCrawler
# --- Scrapy-based AI crawlers ---
User-agent: Scrapy
# --- Other AI user-agents observed in logs ---
User-agent: iaskspider
User-agent: YisouSpider
User-agent: SummalyBot
User-agent: ICC-Crawler
User-agent: LinerBot
User-agent: Devin
User-agent: Factset_spyderbot
User-agent: AwarioBot
User-agent: Bravebot
User-agent: Brave-SearchBot
# v72.29 — added 2026-era emerging AI crawlers per operator anti-scraping
# directive ("please make sure you protect the web site of being scanned
# by another agent that will try to copy what we are doing").
# --- Reka AI ---
User-agent: RekaAI
User-agent: Reka-Crawler
# --- Adept ---
User-agent: AdeptBot
# --- Krea / Recraft (image AI) ---
User-agent: KreaBot
User-agent: RecraftBot
# --- Liner Search ---
User-agent: Liner-AI
User-agent: Liner-Search
# --- Grokipedia / xAI fleet ---
User-agent: Grokipedia
User-agent: GrokSearch
# --- Inflection (Pi) ---
User-agent: InflectionBot
User-agent: Pi-AI
# --- Aleph Alpha ---
User-agent: AlephAlphaBot
# --- 01.AI / Yi ---
User-agent: YiBot
User-agent: YiCrawler
# --- Stability AI ---
User-agent: StabilityBot
# --- Runway ---
User-agent: RunwayBot
# --- Character.AI ---
User-agent: CharacterAI-Crawler
# --- Pika / Sora-class video AI ---
User-agent: PikaBot
# --- Magic.dev ---
User-agent: MagicAI
# --- Cresta / contact-center AI ---
User-agent: CrestaBot
# --- Glean / enterprise AI search ---
User-agent: GleanBot
# --- Webdesigner.io / GPT-Crawler / open-source bot frameworks ---
User-agent: GPTCrawler
User-agent: AutoGPT
User-agent: AgentGPT
User-agent: BabyAGI
User-agent: LangChain
User-agent: LangChain-Crawler
User-agent: LlamaIndex
User-agent: HeadlessChrome
User-agent: PlaywrightBot
User-agent: PuppeteerBot
User-agent: Selenium
# --- Catch-all tokens for "AI" / "GPT" / "LLM" user-agents (effective
#     only with parsers that substring-match product tokens) ---
User-agent: GPT
User-agent: LLM
User-agent: AI-Crawler
User-agent: AI-Bot
User-agent: AIBot
Disallow: /
# ========================================================================
# 2b. Honeypot route — only a non-compliant bot ever reaches /honeypot/
# ========================================================================
# index.html includes a hidden anchor pointing at /honeypot/ that no
# human and no compliant bot ever visits. The "Disallow: /honeypot/"
# directive lives in the single catch-all group in section 4 and is
# restated in every named group above (sections 2 and 3 block "/"
# outright), because some parsers honour only the FIRST
# "User-agent: *" group they encounter; keeping a single catch-all
# group avoids splitting its rules. A hit on the path in the access
# log is therefore, by elimination, evidence of a non-compliant
# scraper. Cloudflare Page Rules can then auto-block the originating
# IP. The path itself returns a 200 with a stub HTML response advising
# the bot operator to consult the Terms of Service.
# (The X-Honeypot response header was previously set via public/_headers
# under the Netlify edge — that file was deleted in the v72.70.4 ship;
# GitHub Pages cannot add custom response headers, so the header
# signal is gone. Detection now relies on access-log post-processing.)
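# A minimal log post-processing sketch in Python (assumes the common
# nginx/Apache "combined" access-log format and a local file named
# access.log; both are assumptions, so adapt the pattern to the real
# log layout):
#
#   import re
#
#   hits = set()
#   with open("access.log") as log:
#       for line in log:
#           # combined format: IP ident user [time] "METHOD path ..."
#           m = re.match(r'(\S+) \S+ \S+ \[[^\]]*\] "\S+ (\S+)', line)
#           if m and m.group(2).startswith("/honeypot/"):
#               hits.add(m.group(1))  # client that ignored robots.txt
#   print("\n".join(sorted(hits)))  # candidate IPs for the blocklist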
# ========================================================================
# 3. Known abusive / bulk SEO scrapers — deny everything (unchanged)
# ========================================================================
User-agent: SemrushBot
User-agent: SemrushBot-SA
User-agent: SemrushBot-BA
User-agent: AhrefsBot
User-agent: AhrefsSiteAudit
User-agent: MJ12bot
User-agent: DotBot
User-agent: DataForSeoBot
User-agent: BLEXBot
User-agent: SerpstatBot
User-agent: Seekport
User-agent: PetalBot
User-agent: Sogou
User-agent: LinkpadBot
User-agent: MegaIndex
User-agent: LinkfluenceBot
User-agent: SeobilityBot
User-agent: barkrowler
Disallow: /
# ========================================================================
# 4. Catch-all — block proprietary data + signal noai to unnamed bots
# ========================================================================
# Traditional search engines we didn't name get the same treatment
# as the named ones (crawl public pages, block private data).
# If you are an AI agent not named above, please honour the noai /
# noimageai directive and the Terms of Service
# prohibition on AI-training extraction.
User-agent: *
Disallow: /api/consultants
Disallow: /api/consultant-matches.json
Disallow: /api/opportunity-matches.json
Disallow: /api/opportunity-matches.csv
Disallow: /api/stakeholders-firm.json
Disallow: /api/private/
Disallow: /data/private/
Disallow: /honeypot/
# v72.61 — `/api/agencies.json` was previously Disallowed in this catch-all
# section but is a PUBLIC endpoint per /api/about.json `public_data_endpoints`,
# /.well-known/agent.json `endpoints`, /llms.txt and /agents.md. Section 1
# (Googlebot + named search engines + Anthropic) already Allows it; the
# catch-all conflict meant unnamed search engines, archive bots, and
# standard checkers (e.g. Lighthouse's SEO audit) were told it was
# private. Removing the line
# aligns the catch-all with the rest of the public-endpoint manifest.
Allow: /
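# Resolution example for an unnamed bot under this group: the public
# endpoint /api/agencies.json now matches only "Allow: /" and is
# crawlable, while a path under /api/private/ still matches the longer
# "Disallow: /api/private/" and stays blocked.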
# ========================================================================
# Sitemaps
# ========================================================================
Sitemap: https://analyticslegends.ai/sitemap.xml