From ef2a6af4d53e5fac3822bdd606b2b130370e0e3b Mon Sep 17 00:00:00 2001
From: Emanuel Seemann <3380606+seemanne@users.noreply.github.com>
Date: Thu, 6 Feb 2025 10:01:06 +0100
Subject: [PATCH] add AI LLM tags to public taxonomy (#1240)

---
 taxonomy/classifications.json | 80 +++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/taxonomy/classifications.json b/taxonomy/classifications.json
index 525e6618cb0..134e60e0d6b 100644
--- a/taxonomy/classifications.json
+++ b/taxonomy/classifications.json
@@ -358,5 +358,85 @@
         "description": "IP uses rapidly changing user agents.",
         "label": "Spoofed User Agent",
         "name": "profile:spoofed_user_agent"
+    },
+    "ai-crawler:meta": {
+        "description": "This IP is used by Meta to scrape websites for LLM training.",
+        "label": "Meta AI crawler",
+        "name": "ai-crawler:meta"
+    },
+    "ai-search:duckduckgo": {
+        "description": "This IP is used by DuckDuckGo to enrich search results using an LLM.",
+        "label": "DuckDuckGo AI search agent",
+        "name": "ai-search:duckduckgo"
+    },
+    "ai-crawler:allenai": {
+        "description": "This IP is used by AllenAI to scrape websites for LLM training.",
+        "label": "AllenAI AI crawler",
+        "name": "ai-crawler:allenai"
+    },
+    "ai-crawler:apple": {
+        "description": "This IP is used by Apple to scrape websites for LLM training.",
+        "label": "Apple AI crawler",
+        "name": "ai-crawler:apple"
+    },
+    "ai-search:apple": {
+        "description": "This IP is used by Apple to enrich search results using an LLM.",
+        "label": "Apple AI search agent",
+        "name": "ai-search:apple"
+    },
+    "ai-crawler:bytedance": {
+        "description": "This IP is used by ByteDance to scrape websites for LLM training.",
+        "label": "ByteDance AI crawler",
+        "name": "ai-crawler:bytedance"
+    },
+    "ai-crawler:commoncrawl": {
+        "description": "This IP is used by Common Crawl to scrape websites for LLM training.",
+        "label": "Common Crawl AI crawler",
+        "name": "ai-crawler:commoncrawl"
+    },
+    "ai-crawler:anthropic": {
+        "description": "This IP is used by Anthropic to scrape websites for LLM training.",
+        "label": "Anthropic AI crawler",
+        "name": "ai-crawler:anthropic"
+    },
+    "ai-search:anthropic": {
+        "description": "This IP is used by Anthropic to enrich search results using an LLM.",
+        "label": "Anthropic AI search agent",
+        "name": "ai-search:anthropic"
+    },
+    "ai-crawler:cohere": {
+        "description": "This IP is used by Cohere to scrape websites for LLM training.",
+        "label": "Cohere AI crawler",
+        "name": "ai-crawler:cohere"
+    },
+    "ai-search:cohere": {
+        "description": "This IP is used by Cohere to enrich search results using an LLM.",
+        "label": "Cohere AI search agent",
+        "name": "ai-search:cohere"
+    },
+    "ai-crawler:openai": {
+        "description": "This IP is used by OpenAI to scrape websites for LLM training.",
+        "label": "OpenAI AI crawler",
+        "name": "ai-crawler:openai"
+    },
+    "ai-search:openai": {
+        "description": "This IP is used by OpenAI to enrich search results using an LLM.",
+        "label": "OpenAI AI search agent",
+        "name": "ai-search:openai"
+    },
+    "ai-crawler:huawei": {
+        "description": "This IP is used by Huawei to scrape websites for LLM training.",
+        "label": "Huawei AI crawler",
+        "name": "ai-crawler:huawei"
+    },
+    "ai-crawler:perplexity": {
+        "description": "This IP is used by Perplexity to scrape websites for LLM training.",
+        "label": "Perplexity AI crawler",
+        "name": "ai-crawler:perplexity"
+    },
+    "ai-search:perplexity": {
+        "description": "This IP is used by Perplexity to enrich search results using an LLM.",
+        "label": "Perplexity AI search agent",
+        "name": "ai-search:perplexity"
     }
 }
\ No newline at end of file