Adding more analytics (#242)
* adding scarf

* fixing scarf url

* adding textual

* fixing textual

* updating ruff

* Formatting

* Fix ruff error

* fixing pydantic issues affecting ragas logger

* Fix dependency install

---------

Co-authored-by: ethan-tonic <[email protected]>
akamor and ethan-tonic authored Nov 14, 2024
1 parent 17cf1af commit 940919d
Showing 14 changed files with 1,630 additions and 1,194 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-install-and-use-e2e.yml
@@ -31,7 +31,7 @@ jobs:
run: |
python -m pip install --upgrade pip
python -m pip install poetry
poetry install --with dev
poetry install --with validate_dev
shell: bash
- name: Lint with Ruff
run: |
4 changes: 3 additions & 1 deletion .gitignore
@@ -14,4 +14,6 @@ docs/build/
dmypy.json

.env
*.pkl
*.pkl

.ruff_cache/
7 changes: 4 additions & 3 deletions README.md
@@ -515,7 +515,7 @@ You can also view the results of an individual run in the UI as well.
<p align="right">(<a href="#readme-top">back to top</a>)</p>

### Telemetry
Tonic Validate collects minimal telemetry to help us figure out what users want and how they're using the product. We do not use any existing telemetry framework and instead created our own privacy focused setup. Only the following information is tracked
Tonic Validate collects minimal telemetry to help us figure out what users want and how they're using the product. Only the following information is tracked regarding usage of the product (additional information is also collected; see the details below):

* What metrics were used for a run
* Number of questions in a run
@@ -525,10 +525,11 @@ Tonic Validate collects minimal telemetry to help us figure out what users want

We do **NOT** track things such as the contents of the questions / answers, your scores, or any other sensitive information. For detecting CI/CD, we only check for common environment variables in different CI/CD environments. We do not log the values of these environment variables.
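For illustration only (this sketch is not part of the commit, and the specific variable names are assumptions), CI/CD detection of this kind typically just checks whether well-known environment variables are defined, without ever reading their values:

import os

# Hypothetical example: treat the run as CI/CD if any well-known variable is
# present; only the presence is checked, the values are never read or logged.
COMMON_CI_VARIABLES = ["CI", "GITHUB_ACTIONS", "GITLAB_CI", "CIRCLECI", "JENKINS_URL"]

def looks_like_ci() -> bool:
    return any(name in os.environ for name in COMMON_CI_VARIABLES)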

We also generate a random UUID to help us figure out how many users are using the product. This UUID is linked to your Validate account only to help track who is using the SDK and UI at once and to get user counts. If you want to see how we implemented telemetry, you can do so in the `tonic_validate/utils/telemetry.py` file
We also generate a random UUID to help us figure out how many users are using the product. This UUID is linked to your Validate account only to help track who is using the SDK and UI at once and to get user counts.

If you wish to opt out of telemetry, you only need to set the `TONIC_VALIDATE_DO_NOT_TRACK` environment variable to `True`.
We also collect information on the version of Python being used and characteristics of the machine (e.g. the values returned by `platform.system()` and `platform.machine()`). This information is sent to Scarf, which helps us better understand our open-source community.

If you wish to opt out of telemetry, you only need to set the `TONIC_VALIDATE_DO_NOT_TRACK` environment variable to `True`. If you want to see how we implemented telemetry, you can do so in the `tonic_validate/utils/telemetry.py` file.
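As a minimal sketch (not part of this commit), opting out from Python amounts to setting that variable before the SDK is used; `ValidateScorer` is assumed here to be importable from the package root:

import os

# Disable Tonic Validate telemetry for this process; the SDK checks the
# TONIC_VALIDATE_DO_NOT_TRACK flag before sending any usage data.
os.environ["TONIC_VALIDATE_DO_NOT_TRACK"] = "True"

from tonic_validate import ValidateScorer  # assumed root export; import after setting the flag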

<p align="right">(<a href="#readme-top">back to top</a>)</p>

2,672 changes: 1,524 additions & 1,148 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "tonic-validate"
version = "6.1.0"
version = "6.2.0"
description = "RAG evaluation metrics."
authors = ["Joe Ferrara <[email protected]>", "Ethan Philpott <[email protected]>", "Adam Kamor <[email protected]>"]
readme = "README.md"
@@ -18,14 +18,16 @@ litellm = "^1.35.8"
google-generativeai = { version = "^0.5.2", python = ">=3.9" }
aioboto3 = "^12.4.0"

[tool.poetry.group.dev.dependencies]
[tool.poetry.group.validate_dev.dependencies]
sphinx = "^7.0.0"
sphinx-rtd-theme = ">=1.2,<3.0"
ruff = ">=0.1.15,<0.5.0"
tonic-textual = ">=1.0.5,<3.0.0"
pytest = "^8.1.1"
pytest-asyncio = "^0.23.6"

[tool.ruff]
include = ["pyproject.toml", "tonic_validate/**/*.py"]

[build-system]
requires = ["poetry-core>=1.0.0"]
4 changes: 2 additions & 2 deletions tonic_validate/classes/run.py
@@ -66,8 +66,8 @@ class Run:

overall_scores: Dict[str, float]
run_data: List[RunData]
llm_evaluator: Optional[str]
id: Optional[UUID]
llm_evaluator: Optional[str] = None
id: Optional[UUID] = None

def to_df(self):
"""
2 changes: 1 addition & 1 deletion tonic_validate/metrics/answer_contains_pii_metric.py
@@ -26,7 +26,7 @@ def __init__(self, pii_types: List[str], textual_api_key: Optional[str] = None):
"""
try:
from tonic_textual.api import TonicTextual # type: ignore
from tonic_textual.redact_api import TonicTextual # type: ignore
except ImportError:
raise ImportError(
"You must install tonic-textual to use the AnswerContainsPiiMetric. You can install it via pip: pip install tonic-textual"
2 changes: 1 addition & 1 deletion tonic_validate/metrics/context_contains_pii_metric.py
@@ -26,7 +26,7 @@ def __init__(self, pii_types: List[str], textual_api_key: Optional[str] = None):
"""
try:
from tonic_textual.api import TonicTextual # type: ignore
from tonic_textual.redact_api import TonicTextual # type: ignore
except ImportError:
raise ImportError(
"You must install tonic-textual to use the ContextContainsPiiMetric. You can install it via pip: pip install tonic-textual"
28 changes: 17 additions & 11 deletions tonic_validate/services/litellm_service.py
@@ -53,41 +53,47 @@ def __init__(

def check_environment(self, model: str) -> None:
model_name_lower = model.lower()
if model_name_lower.startswith('gemini'):
if model_name_lower.startswith("gemini"):
if "GEMINI_API_KEY" not in os.environ:
raise Exception(
"GEMINI_API_KEY must be set in the environment when using Gemini"
)
elif model_name_lower.startswith('bedrock'):
if ("AWS_SECRET_ACCESS_KEY" not in os.environ or "AWS_ACCESS_KEY_ID" not in os.environ or "AWS_REGION_NAME"
not in os.environ):
elif model_name_lower.startswith("bedrock"):
if (
"AWS_SECRET_ACCESS_KEY" not in os.environ
or "AWS_ACCESS_KEY_ID" not in os.environ
or "AWS_REGION_NAME" not in os.environ
):
raise Exception(
"AWS_SECRET_ACCESS_KEY and AWS_ACCESS_KEY_ID and AWS_REGION_NAME must be set in the environment "
"when using bedrock models."
)
elif model_name_lower.startswith('sagemaker'):
if ("AWS_SECRET_ACCESS_KEY" not in os.environ or "AWS_ACCESS_KEY_ID" not in os.environ or "AWS_REGION_NAME"
not in os.environ):
elif model_name_lower.startswith("sagemaker"):
if (
"AWS_SECRET_ACCESS_KEY" not in os.environ
or "AWS_ACCESS_KEY_ID" not in os.environ
or "AWS_REGION_NAME" not in os.environ
):
raise Exception(
"AWS_SECRET_ACCESS_KEY and AWS_ACCESS_KEY_ID and AWS_REGION_NAME must be set in the environment "
"when using sagemaker models."
)
elif model_name_lower.startswith('claude'):
elif model_name_lower.startswith("claude"):
if "ANTHROPIC_API_KEY" not in os.environ:
raise Exception(
"ANTHROPIC_API_KEY must be set in the environment when using Claude"
)
elif model_name_lower.startswith('command'):
elif model_name_lower.startswith("command"):
if "COHERE_API_KEY" not in os.environ:
raise Exception(
"COHERE_API_KEY must be set in the environment when using command models"
)
elif model_name_lower.startswith('mistral'):
elif model_name_lower.startswith("mistral"):
if "MISTRAL_API_KEY" not in os.environ:
raise Exception(
"MISTRAL_API_KEY must be set in the environment when using mistral models"
)
elif model_name_lower.startswith('together_ai'):
elif model_name_lower.startswith("together_ai"):
if "TOGETHERAI_API_KEY" not in os.environ:
raise Exception(
"TOGETHERAI_API_KEY must be set in the environment when using together AI"
6 changes: 4 additions & 2 deletions tonic_validate/services/openai_service.py
@@ -47,8 +47,10 @@ def __init__(
elif "OPENAI_API_KEY" in os.environ:
self.client = AsyncOpenAI()
elif "OPENROUTR_API_KEY" in os.environ:
self.client = AsyncOpenAI(base_url="https://openrouter.ai/api/v1",
api_key=os.environ["OPENROUTR_API_KEY"])
self.client = AsyncOpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=os.environ["OPENROUTR_API_KEY"],
)
else:
raise Exception(
"OPENAI_API_KEY or AZURE_OPENAI_API_KEY must be set in the environment"
29 changes: 18 additions & 11 deletions tonic_validate/utils/llm_calls.py
@@ -8,7 +8,10 @@


async def similarity_score_call(
question: str, reference_answer: str, llm_answer: str, llm_service: Union[LiteLLMService, OpenAIService]
question: str,
reference_answer: str,
llm_answer: str,
llm_service: Union[LiteLLMService, OpenAIService],
) -> str:
"""Sends prompt for answer similarity score to OpenAI API, and returns response.
@@ -81,7 +84,9 @@ def similarity_score_prompt():


async def answer_consistent_with_context_call(
answer: str, context_list: List[str], llm_service: Union[LiteLLMService, OpenAIService]
answer: str,
context_list: List[str],
llm_service: Union[LiteLLMService, OpenAIService],
) -> str:
"""Sends prompt for answer consistency binary score and returns response.
@@ -281,7 +286,9 @@ def answer_contains_context_prompt():
return main_message


async def main_points_call(answer: str, llm_service: Union[LiteLLMService, OpenAIService]) -> str:
async def main_points_call(
answer: str, llm_service: Union[LiteLLMService, OpenAIService]
) -> str:
"""Sends prompt for main points in answer to Open AI API and returns response.
Parameters
@@ -296,9 +303,7 @@ async def main_points_call(answer: str, llm_service: Union[LiteLLMService, OpenA
str
Response from OpenAI API.
"""
logger.debug(
f"Asking {llm_service.model} for bullet list of main points in answer"
)
logger.debug(f"Asking {llm_service.model} for bullet list of main points in answer")
main_message = main_points_prompt()
main_message += f"\nANSWER: {answer}"

@@ -340,7 +345,9 @@ def main_points_prompt():


async def statement_derived_from_context_call(
statement: str, context_list: List[str], llm_service: Union[LiteLLMService, OpenAIService]
statement: str,
context_list: List[str],
llm_service: Union[LiteLLMService, OpenAIService],
) -> str:
"""Sends prompt for whether statement is derived from context and returns response.
@@ -480,7 +487,9 @@ def contains_duplicate_info_prompt():
return main_message


async def contains_hate_speech(statement: str, llm_service: Union[LiteLLMService, OpenAIService]) -> str:
async def contains_hate_speech(
statement: str, llm_service: Union[LiteLLMService, OpenAIService]
) -> str:
"""Sends prompt for whether statement contains hate speech and returns response.
Parameters
@@ -495,9 +504,7 @@ async def contains_hate_speech(statement: str, llm_service: Union[LiteLLMService
str
Response from OpenAI API.
"""
logger.debug(
f"Asking {llm_service.model} whether statement contains hate speech"
)
logger.debug(f"Asking {llm_service.model} whether statement contains hate speech")
main_message = contains_hate_speech_prompt()
main_message += f"\n\nSTATEMENT:\n{statement}\nEND OF STATEMENT"

43 changes: 38 additions & 5 deletions tonic_validate/utils/telemetry.py
@@ -1,11 +1,13 @@
import json
import os
import platform
from typing import List, Optional
import uuid
from tonic_validate.classes.user_info import UserInfo
from tonic_validate.config import Config
from tonic_validate.utils.http_client import HttpClient
from appdirs import user_data_dir
import requests

APP_DIR_NAME = "tonic-validate"

@@ -21,6 +23,8 @@


class Telemetry:
__has_called_scarf = False

def __init__(self, api_key: Optional[str] = None):
"""
Used to log telemetry data to the Tonic Validate server
@@ -33,6 +37,34 @@ def __init__(self, api_key: Optional[str] = None):
self.config = Config()
self.http_client = HttpClient(self.config.TONIC_VALIDATE_TELEMETRY_URL, api_key)

if not Telemetry.__has_called_scarf:
self.scarf_analytics()
Telemetry.__has_called_scarf = True

def scarf_analytics(self):
try:
if self.config.TONIC_VALIDATE_DO_NOT_TRACK:
return
try:
from importlib.metadata import version

sdk_version = version("tonic-validate")
except Exception:
sdk_version = "unknown"

requests.get(
"https://tonic.gateway.scarf.sh/tonic-validate?version="
+ sdk_version
+ "&platform="
+ platform.system()
+ "&python="
+ platform.python_version()
+ "&arch="
+ platform.machine(),
)
except Exception:
pass

def get_user(self) -> UserInfo:
"""
Retrieves the user information from the file. If the user does not exist, creates a new user
@@ -77,16 +109,17 @@ def log_run(self, num_of_questions: int, metrics: List[str], run_time: float):
metrics: List[str]
The metrics that were used to evaluate the run
run_time: float
The time taken to evaluate the run
The time taken to evaluate the run
"""
if self.config.TONIC_VALIDATE_DO_NOT_TRACK:
return
try:
from importlib.metadata import version
sdk_version = version('tonic-validate')

sdk_version = version("tonic-validate")
except Exception:
sdk_version = "unknown"

user_id = self.get_user()["user_id"]
self.http_client.http_post(
"/runs",
@@ -98,7 +131,7 @@ def log_run(self, num_of_questions: int, metrics: List[str], run_time: float):
"sdk_version": sdk_version,
"is_ci": self.__is_ci(),
"validate_gh_action": self.config.TONIC_VALIDATE_GITHUB_ACTION,
"backend": "validate"
"backend": "validate",
},
timeout=5,
)
@@ -122,7 +155,7 @@ def log_benchmark(self, num_of_questions: int):
"num_of_questions": num_of_questions,
"is_ci": self.__is_ci(),
"validate_gh_action": self.config.TONIC_VALIDATE_GITHUB_ACTION,
"backend": "validate"
"backend": "validate",
},
timeout=5,
)
1 change: 0 additions & 1 deletion tonic_validate/validate_monitorer.py
@@ -5,7 +5,6 @@

from tonic_validate.config import Config
from tonic_validate.metrics import (
AnswerSimilarityMetric,
RetrievalPrecisionMetric,
AugmentationPrecisionMetric,
AnswerConsistencyMetric,
18 changes: 13 additions & 5 deletions tonic_validate/validate_scorer.py
@@ -88,12 +88,20 @@ def __init__(
self.encoder = tiktoken.get_encoding("cl100k_base")

model_name_lower = self.model_evaluator.lower()
if (model_name_lower.startswith("gemini") or model_name_lower.startswith("claude") or
model_name_lower.startswith("command") or model_name_lower.startswith("mistral") or
model_name_lower.startswith("together_ai") or model_name_lower.startswith("bedrock") or
model_name_lower.startswith("sagemaker")):
if (
model_name_lower.startswith("gemini")
or model_name_lower.startswith("claude")
or model_name_lower.startswith("command")
or model_name_lower.startswith("mistral")
or model_name_lower.startswith("together_ai")
or model_name_lower.startswith("bedrock")
or model_name_lower.startswith("sagemaker")
):
self.llm_service = LiteLLMService(
self.encoder, self.model_evaluator, max_retries=self.max_llm_retries, model_id=model_id
self.encoder,
self.model_evaluator,
max_retries=self.max_llm_retries,
model_id=model_id,
)
else:
self.llm_service = OpenAIService(