
Commit

Add bottom_n, package_tuple_to_str, and restrict num_workers to 1 (#17222)
nerdai authored Dec 10, 2024
1 parent 4974348 commit acea53c
Showing 1 changed file with 65 additions and 11 deletions.
76 changes: 65 additions & 11 deletions scripts/integration_health_check.py
@@ -21,7 +21,9 @@
import ast
import json
from typing import Dict, List
import pandas as pd

from typing import Literal
from datetime import datetime, timedelta
from math import exp

@@ -48,10 +50,12 @@ def __init__(
repo_path: str,
metric_weights: Dict = DEFAULT_METRIC_WEIGHTS,
new_project_score: float = DEFAULT_SCORE_NEW_PROJECT,
verbose: bool = False,
):
self.package_name = package_name
self.repo_path = repo_path
self.metrics = {}
self.verbose = verbose
if sum(v for v in metric_weights.values()) != 1:
raise ValueError("Metric weights do not sum up to 1.")
self.metric_weights = metric_weights
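
For reference, the sum-to-1 check above expects a metric-weights mapping like the following sketch; the keys and values here are illustrative assumptions, not the actual DEFAULT_METRIC_WEIGHTS:

# Hypothetical metric weights that satisfy the sum-to-1 guard in __init__;
# the real DEFAULT_METRIC_WEIGHTS is not shown in this diff.
example_weights = {
    "download_trends": 0.5,
    "commit_activity": 0.25,
    "issue_activity": 0.25,
}
assert sum(example_weights.values()) == 1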
@@ -107,7 +111,12 @@ def get_download_trends(self) -> Dict[str, float]:

# We need at least 5 months of data; if not, it's too new to be considered
if len(downloads_per_month) < 5:
return None
self._is_new_project = True
return {
"growth_rate": 0,
"stability": 0,
"avg_monthly_downloads": 0,
}

# Apply time weights to downloads
weighted_downloads = []
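
Note that the young-package branch above now returns a zeroed metrics dict and flags the analyzer as a new project, instead of returning None, so the result can always be treated as a Dict[str, float]. A minimal sketch of a hypothetical caller that benefits from this:

# Hypothetical caller: with the previous `return None`, the lookups below
# would fail for packages with under 5 months of download history.
trends = analyzer.get_download_trends()
growth_rate = trends["growth_rate"]              # 0 for a too-new package
avg_downloads = trends["avg_monthly_downloads"]  # 0 for a too-new package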
@@ -254,13 +263,15 @@ def calculate_metrics(self) -> None:
Calculate relative health score compared to llama-index-core.
"""
if os.path.exists("./core_package_metrics.json"):
print(
"Loading cached existing core package metrics from ./core_package_metrics.json"
)
if self.verbose:
print(
"Loading cached existing core package metrics from ./core_package_metrics.json"
)
with open("./core_package_metrics.json") as f:
core_package_metrics = json.load(f)
else:
print("No cached existing core package metrics found, calculating...")
if self.verbose:
print("No cached existing core package metrics found, calculating...")
core_package_metrics = {
"downloads": IntegrationActivityAnalyzer(
repo_path="./llama-index-core", package_name="llama-index-core"
@@ -319,25 +330,31 @@ def calculate_metrics(self) -> None:
@property
def health_score(self) -> float:
if self._is_new_project:
return self._new_project_score
score = 0
for k, v in self.metrics.items():
score += v * self.metric_weights[k]
score = self._new_project_score
else:
score = 0
for k, v in self.metrics.items():
score += v * self.metric_weights[k]
return score
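
To make the refactored property concrete, a small worked example of the weighted score; the weights, metric values, and the assumption that the new-project fallback is 42 (suggested by the `== 42` check further down) are purely illustrative:

# Illustrative numbers only.
metric_weights = {"downloads": 0.5, "commits": 0.25, "issues": 0.25}
metrics = {"downloads": 0.8, "commits": 0.6, "issues": 0.4}
is_new_project = False

if is_new_project:
    score = 42  # assumed DEFAULT_SCORE_NEW_PROJECT
else:
    score = sum(metrics[k] * metric_weights[k] for k in metrics)

print(score)  # 0.8*0.5 + 0.6*0.25 + 0.4*0.25 ≈ 0.65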


def analyze_package(package_path: str) -> tuple[str, float]:
"""Analyze a single package. Helper function for parallel processing."""
package_name = package_path.strip().lstrip("./").rstrip("/").split("/")[-1]
logger.info(f"starting to analyze {package_name}")
analyzer = IntegrationActivityAnalyzer(package_name, package_path)
analyzer.calculate_metrics()
health_score = analyzer.health_score
if health_score == 42:
print(f"new package: {package_name}")
logger.info(f"health score for {package_name}: {health_score}")
return (package_name, health_score)


def analyze_multiple_packages(
package_paths: list[str],
bottom_percent: float | None = None,
bottom_n: int | None = None,
threshold: float | None = None,
) -> list[tuple[str, float]]:
"""Analyze multiple packages in parallel."""
@@ -348,7 +365,7 @@ def analyze_multiple_packages(
else:
print("No cached existing package metrics found, calculating...")
# Use ThreadPoolExecutor for parallel processing
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
results = list(executor.map(analyze_package, package_paths))

with open("./all_package_metrics.json", "w") as f:
@@ -357,9 +374,15 @@
# Sort by health score ascending
results.sort(key=lambda x: x[1])

# Print summary stats
scores = pd.Series([el[1] for el in results])
print(scores.describe())

# Calculate how many packages to return
if bottom_percent is not None:
num_packages = max(1, int(len(results) * bottom_percent))
elif bottom_n is not None:
num_packages = min(bottom_n, len(results))
elif threshold is not None:
num_packages = next(
(i for i, (_, score) in enumerate(results) if score >= threshold),
@@ -373,28 +396,59 @@
return results[:num_packages]
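
A small worked example of the three selection modes on a toy, already-sorted results list (package names and scores are invented):

# Toy results, sorted ascending by health score; values are invented.
results = [("pkg-a", 0.10), ("pkg-b", 0.25), ("pkg-c", 0.40), ("pkg-d", 0.90)]

# bottom_percent=0.5 -> max(1, int(4 * 0.5)) = 2         -> pkg-a, pkg-b
# bottom_n=3         -> min(3, 4) = 3                     -> pkg-a, pkg-b, pkg-c
# threshold=0.4      -> first score >= 0.4 is at index 2  -> pkg-a, pkg-b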


def package_tuple_to_str(
package_tuple: tuple[str, float], mode: Literal["default", "csv"] = "default"
):
if mode == "default":
return str(package_tuple)
elif mode == "csv":
name, score = package_tuple
return f"{name},{score}"
else:
raise ValueError(
"Unsupported str mode. Please enter `default` or `csv` as mode."
)
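
For illustration, the new helper in each output mode (the package tuple is made up):

package_tuple_to_str(("llama-index-llms-example", 0.37))
# "('llama-index-llms-example', 0.37)"
package_tuple_to_str(("llama-index-llms-example", 0.37), mode="csv")
# "llama-index-llms-example,0.37"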


if __name__ == "__main__":
arg = sys.argv[1]
try:
val = float(arg)
is_threshold = sys.argv[2] == "threshold"
is_percent = sys.argv[2] == "percent"
is_bottom_n = sys.argv[2] == "bottom_n"
try:
output_mode = sys.argv[3]
except IndexError:
output_mode = "default"
all_packages = []
for root, dirs, files in os.walk("./llama-index-integrations"):
if "pyproject.toml" in files:
all_packages.append(root)
for root, dirs, files in os.walk("./llama-index-packs"):
if "pyproject.toml" in files:
all_packages.append(root)

if is_percent:
packages_to_remove = analyze_multiple_packages(
all_packages, bottom_percent=val
)
elif is_bottom_n:
packages_to_remove = analyze_multiple_packages(
all_packages, bottom_n=int(val)
)
elif is_threshold:
packages_to_remove = analyze_multiple_packages(all_packages, threshold=val)
else:
raise ValueError("Invalid argument for bottom_percent or threshold")

print(f"Found {len(packages_to_remove)} packages to remove.")
print("\n".join([str(x) for x in packages_to_remove]))
print(
"\n".join(
[package_tuple_to_str(x, mode=output_mode) for x in packages_to_remove]
)
)

except ValueError:
package_path = sys.argv[1].strip().lstrip("./").rstrip("/")
package_name = package_path.split("/")[-1]
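
Example invocations of the updated CLI, assuming the script is run from the repository root (the numeric values are arbitrary): the first argument is the numeric value, the second selects percent, bottom_n, or threshold mode, and the optional third argument picks the default or csv output format.

python scripts/integration_health_check.py 0.1 percent        # bottom 10% of packages
python scripts/integration_health_check.py 25 bottom_n csv    # bottom 25 packages, CSV output
python scripts/integration_health_check.py 0.5 threshold      # packages scoring below 0.5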
