From ef2a6af4d53e5fac3822bdd606b2b130370e0e3b Mon Sep 17 00:00:00 2001 From: Emanuel Seemann <3380606+seemanne@users.noreply.github.com> Date: Thu, 6 Feb 2025 10:01:06 +0100 Subject: [PATCH] add AI LLM tags to public taxonomy (#1240) --- taxonomy/classifications.json | 80 +++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/taxonomy/classifications.json b/taxonomy/classifications.json index 525e6618cb0..134e60e0d6b 100644 --- a/taxonomy/classifications.json +++ b/taxonomy/classifications.json @@ -358,5 +358,85 @@ "description": "IP uses rapidly changing user agents.", "label": "Spoofed User Agent", "name": "profile:spoofed_user_agent" + }, + "ai-crawler:meta": { + "description": "This IP is used to scrape websites for LLM training by Meta", + "label": "Meta AI crawler", + "name": "ai-crawler:meta" + }, + "ai-search:duckduckgo": { + "description": "This IP is used to enrich search results using an LLM by DuckDuckGo", + "label": "DuckDuckGo AI search agent", + "name": "ai-search:duckduckgo" + }, + "ai-crawler:allenai": { + "description": "This IP is used to scrape websites for LLM training by AllenAI", + "label": "AllenAI AI crawler", + "name": "ai-crawler:allenai" + }, + "ai-crawler:apple": { + "description": "This IP is used to scrape websites for LLM training by Apple", + "label": "Apple AI crawler", + "name": "ai-crawler:apple" + }, + "ai-search:apple": { + "description": "This IP is used to enrich search results using an LLM by Apple", + "label": "Apple AI search agent", + "name": "ai-search:apple" + }, + "ai-crawler:bytedance": { + "description": "This IP is used to scrape websites for LLM training by Bytedance", + "label": "Bytedance AI crawler", + "name": "ai-crawler:bytedance" + }, + "ai-crawler:commoncrawl": { + "description": "This IP is used to scrape websites for LLM training by CommonCrawl", + "label": "CommonCrawl AI crawler", + "name": "ai-crawler:commoncrawl" + }, + "ai-crawler:anthropic": { + "description": "This IP is used to scrape websites for LLM training by Anthropic", + "label": "Anthropic AI crawler", + "name": "ai-crawler:anthropic" + }, + "ai-search:anthropic": { + "description": "This IP is used to enrich search results using an LLM by Anthropic", + "label": "Anthropic AI search agent", + "name": "ai-search:anthropic" + }, + "ai-crawler:cohere": { + "description": "This IP is used to scrape websites for LLM training by CohereAI", + "label": "CohereAI AI crawler", + "name": "ai-crawler:cohere" + }, + "ai-search:cohere": { + "description": "This IP is used to enrich search results using an LLM by CohereAI", + "label": "CohereAI AI search agent", + "name": "ai-search:cohere" + }, + "ai-crawler:openai": { + "description": "This IP is used to scrape websites for LLM training by OpenAI", + "label": "OpenAI AI crawler", + "name": "ai-crawler:openai" + }, + "ai-search:openai": { + "description": "This IP is used to enrich search results using an LLM by OpenAI", + "label": "OpenAI AI search agent", + "name": "ai-search:openai" + }, + "ai-crawler:huawei": { + "description": "This IP is used to scrape websites for LLM training by Huawei", + "label": "Huawei AI crawler", + "name": "ai-crawler:huawei" + }, + "ai-crawler:perplexity": { + "description": "This IP is used to scrape websites for LLM training by Perplexity", + "label": "Perplexity AI crawler", + "name": "ai-crawler:perplexity" + }, + "ai-search:perplexity": { + "description": "This IP is used to enrich search results using an LLM by Perplexity", + "label": "Perplexity AI search agent", + "name": "ai-search:perplexity" } } \ No newline at end of file