Skip to content

Commit

Permalink
Merge pull request #245 from MLSysOps/lefan/local-git
Browse files Browse the repository at this point in the history
[MRG] Add a command to generate report for local git repository
  • Loading branch information
huangyz0918 authored Oct 26, 2024
2 parents 2a87550 + 1c20f1e commit 6192884
Show file tree
Hide file tree
Showing 8 changed files with 336 additions and 23 deletions.
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,13 +92,24 @@ mle start
MLE agent can help you summarize your weekly report, including development progress, communication notes, references,
and to-do lists.

#### Mode 1: Web Application to Generate Report from GitHub

```bash
cd <project name>
mle report
```

Then, you can visit http://localhost:3000/ to generate your report locally.

#### Mode 2: CLI Tool to Generate Report from Local Git Repository
```bash
cd <project name>
mle report-local --email=<git email> --start-date=YYYY-MM-DD --end-date=YYYY-MM-DD <path_to_git_repo>
```

- `--start-date` and `--end-date` are optional parameters. If omitted, the command will generate a report for the default date range of the last 7 days.
- Replace `<git email>` with your Git email and `<path_to_git_repo>` with the path to your local Git repository.

### :trophy: Start with Kaggle Competition

MLE agent can participate in Kaggle competitions and finish coding and debugging from data preparation to model training
Expand Down
122 changes: 119 additions & 3 deletions mle/agents/summarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@
from rich.console import Console

from mle.function import *
from mle.integration import GitHubIntegration
from mle.integration import GitHubIntegration, GitIntegration


class SummaryAgent:
class GitHubSummaryAgent:

def __init__(self, model, github_repo: str = None, username: str = None, github_token: str = None, console=None):
"""
SummaryAgent: summary the workspace provided by the user.
GitHubSummaryAgent: summary the workspace provided by the user.
Args:
model: the model to use.
Expand Down Expand Up @@ -172,3 +172,119 @@ def kaggle_request_summarize(
function_call='auto',
functions=[schema_preview_csv_data]
)


class GitSummaryAgent:
    """Summarize a local Git repository, plus one contributor's commit activity, with an LLM."""

    def __init__(self, model, git_path: str = None, git_email: str = None, console=None):
        """
        GitSummaryAgent: summarize the local Git repository provided by the user.
        Args:
            model: the model to use.
            git_path: the path to the local Git repository.
            git_email: the email of the user.
            console: the console to use.
        """
        self.report = None
        self.model = model
        self.email = git_email
        self.chat_history = []
        self.git_path = git_path
        self.git = GitIntegration(git_path)
        self.console = console
        if not self.console:
            self.console = Console()
        self.sys_prompt = """
You are a software expert tasked with summarizing the Git repository information provided by the user. The
project may contain the dataset, the source code, and the documentation, etc.
Your capabilities include:
1. You need to summarize the basic project information, including the project name, the project description,
the technical stacks, etc.
2. You need to further analyze the project structure and the README file to understand the project business goal
and the purpose. And give a deep understanding of the project, draw a summary in the description.
3. You should read the README.md file and see if the project includes a dataset (or using a public dataset).
if so, you'd better give a brief introduction to the dataset.
4. Based on the information provided, you need to guess the technical hard parts and give suggestions.
5. You may use function `search_arxiv` and `search_github_repos` to search for the related papers and github
repos of the project using the project keywords and tech stacks. Do not directly search the project name.
"""
        # The example below is what the model is asked to imitate, so its quoting
        # is kept well-formed (a stray doubled quote after "images..." was removed).
        self.json_mode_prompt = """
JSON Output Format:
{
"summary": "The project is a ...",
"business_goal": ["The project aims to build an image classification model...", ...],
"dataset": [{"name": "CIFAR-10", "description": "The project uses CIFAR-10 dataset to train
the classification model. The dataset includes 10 classes of images..."}, ...],
"tech_stack": ["Python", "PyTorch", "MLFlow", ...],
"related_work": [{"title": "xxxx", "link":"https://arxiv.org/abs/xxx.xxxx"}, {"title": "xxx", "link": "https://github.com/xxx"}, ...],
}
"""
        self.functions = [
            schema_search_arxiv,
            schema_search_github_repos,
            schema_search_papers_with_code
        ]

        self.sys_prompt += self.json_mode_prompt
        self.chat_history.append({"role": 'system', "content": self.sys_prompt})

    def process_knowledge(self):
        """
        Build the text description of the repository that is sent to the model.
        Args: None
        Returns:
            A string holding the Git path, the README content returned by
            `self.git.get_readme()` and one entry per path returned by
            `self.git.get_structure()`.
        """
        readme_content = self.git.get_readme()
        repo_files = self.git.get_structure()

        sections = [
            f"\nGit path: {self.git_path}\n",
            f"\nREADME CONTENT:\n{readme_content}\n",
            "\nPROJECT STRUCTURE:\n",
        ]
        # One join instead of repeated `+=` keeps this linear in the number of files.
        sections.extend(f"\n{file}\n" for file in repo_files)

        return "".join(sections)

    def summarize(self, start_date=None, end_date=None):
        """
        Query the model for a project summary and attach the user's commit activity.
        Args:
            start_date: start of the activity period ('YYYY-MM-DD'), forwarded to
                `self.git.get_user_activity`.
            end_date: end of the activity period ('YYYY-MM-DD'), forwarded to
                `self.git.get_user_activity`.
        Returns:
            The dictionary parsed from the model's JSON reply, extended with the
            keys "git_path" and "user_activity".
        Raises:
            json.JSONDecodeError: if the model reply is not valid JSON.
        """
        with self.console.status("MLE summarizer is summarizing the project..."):
            self.chat_history.append({"role": "user", "content": self.process_knowledge()})
            text = self.model.query(
                self.chat_history,
                function_call='auto',
                functions=self.functions,
                response_format={"type": "json_object"}
            )

            self.chat_history.append({"role": "assistant", "content": text})
            summary = json.loads(text)
            summary.update({"git_path": self.git_path})

            # get user activity
            user_activity = self.git.get_user_activity(self.email, start_date, end_date)
            summary.update({"user_activity": user_activity})

        return summary
22 changes: 21 additions & 1 deletion mle/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,6 @@ def report(ctx, repo, model, user, visualize):
future2.result()
else:
if repo is None:
# TODO: support local project report
repo = questionary.text(
"What is your GitHub repository? (e.g., MLSysOps/MLE-agent)"
).ask()
Expand All @@ -110,6 +109,27 @@ def report(ctx, repo, model, user, visualize):
return workflow.report(os.getcwd(), repo, user, model)


@cli.command()
@click.pass_context
@click.argument('path', default='./')
@click.option('--email', default=None, help='The email of the user.')
@click.option('--start-date', default=None, help='The start date of the user activity (YYYY-MM-DD).')
@click.option('--end-date', default=None, help='The end date of the user activity (YYYY-MM-DD).')
def report_local(ctx, path, email, start_date, end_date):
    """
    report_local: generate report with LLM for local git repo.
    """
    # Nothing can be generated until the project configuration check passes.
    config_ready = check_config(console)
    if not config_ready:
        return

    # Ask interactively when --email was not supplied on the command line.
    if email is None:
        email_prompt = questionary.text(
            "What is your Git email? (e.g., [email protected])"
        )
        email = email_prompt.ask()

    # The dates stay None when omitted and are passed through unchanged.
    date_range = {"start_date": start_date, "end_date": end_date}
    return workflow.report_local(os.getcwd(), path, email, **date_range)


@cli.command()
@click.option('--model', default=None, help='The model to use for the chat.')
@click.option('--auto', is_flag=True, help='Use auto mode to generate the coding plan.')
Expand Down
152 changes: 138 additions & 14 deletions mle/integration/local_git.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
from git import Repo, NULL_TREE
from datetime import datetime, timedelta
from datetime import datetime, timezone, timedelta

import os
import fnmatch
import subprocess

class GitIntegration:
def __init__(self, path):
Expand Down Expand Up @@ -29,25 +32,44 @@ def get_repo_status(self):
except Exception as e:
return f"An error occurred: {str(e)}"

def get_commit_history(self, date_range=None, limit=None):
def get_commit_history(self, start_date=None, end_date=None, email=None, limit=None):
"""
Get commit history from a git repository
:param date_range: Number of days to look back from today (default None)
:param limit: Number of commits to retrieve (default None)
:return: List of commit history
Process commit history within a specified date range and for a specific user (email).
:param start_date: Start date for commit range (inclusive), in 'YYYY-MM-DD' format
:param end_date: End date for commit range (inclusive), in 'YYYY-MM-DD' format
:param username: GitHub username to filter commits (optional)
:param limit: Maximum number of commits to retrieve (default is None, which retrieves all commits in range)
:return: Dictionary of commits
"""
end_time = None
if end_date is not None:
end_time = f"{end_date}T23:59:59Z"

start_time = None
if start_date is not None:
start_time = f"{start_date}T00:00:00Z"

try:
commit_history = []
for commit in self.repo.iter_commits(max_count=limit):
commit_date = datetime.fromtimestamp(commit.committed_date)
if date_range is None or (datetime.now() - commit_date).days <= date_range:
commit_history.append({
'commit_hash': commit.hexsha,
'author': commit.author.name,
'email': commit.author.email,
'message': commit.message.strip(),
'date': commit_date.strftime("%Y-%m-%d %H:%M:%S")
})
commit_date = commit_date.replace(tzinfo=timezone.utc)
if start_time is not None and commit_date < datetime.fromisoformat(start_time):
continue

if end_time is not None and commit_date > datetime.fromisoformat(end_time):
continue

if email is not None and commit.author.email != email:
continue

commit_history.append({
'commit_hash': commit.hexsha,
'author': commit.author.name,
'email': commit.author.email,
'message': commit.message.strip(),
'date': commit_date.strftime("%Y-%m-%d %H:%M:%S")
})

return commit_history

Expand Down Expand Up @@ -111,3 +133,105 @@ def get_commit_diff(self, commit_hash, show_content=False):

except Exception as e:
return f"An error occurred: {str(e)}"

def get_source_code(self, file_pattern="*"):
"""
Process source code files in the repository.
:param file_pattern: Wildcard pattern to filter files (e.g., "*.py" for Python files)
:return: Dictionary with file paths as keys and file contents as values
"""

def get_contents(path="", file_pattern=file_pattern):
for root, _, files in os.walk(os.path.join(self.repo_path, path)):
for filename in fnmatch.filter(files, file_pattern):
file_path = os.path.join(root, filename)
with open(file_path, 'r') as f:
yield {
'path': os.path.relpath(file_path, self.repo_path),
'name': filename,
'content': f.read()
}

return {file['path']: file['content'] for file in get_contents()}

def get_readme(self):
"""
Get readme content of the repository.
:return: The readme content
"""
content = self.get_source_code("README.md")
if len(content):
return list(content.values())[0]
return None

def get_structure(self, path=''):
"""
Scan and return the file structure and file names of the Git repository as a list of paths.
:param path: The path to start scanning from (default is root)
:param branch: The branch to scan (if None, the repository's default branch will be used)
:param include_invisible: Whether to include invisible files/folders (starting with .) (default is False)
:return: A list of file paths in the repository
"""
result = subprocess.run(
["git", "-C", path, "ls-files"],
stdout=subprocess.PIPE,
text=True,
check=True
)

return result.stdout.splitlines()

def get_user_activity(self, email, start_date=None, end_date=None):
"""
Aggregate information about a user's activity within a specific time period.
:param email: User email to analyze
:param start_date: Start date for the analysis period, in 'YYYY-MM-DD' format
:param end_date: End date for the analysis period, in 'YYYY-MM-DD' format
:return: Dictionary containing aggregated user activity information, if the
start and end dates are not provided, the default period is the last 7 days.
"""
if end_date is None:
end_datetime = datetime.now(timezone.utc).replace(hour=23, minute=59, second=59, microsecond=0)
end_date = end_datetime.strftime("%Y-%m-%d")
else:
end_datetime = (datetime.strptime(end_date, "%Y-%m-%d")
.replace(hour=23, minute=59, second=59, tzinfo=timezone.utc))

if start_date is None:
start_datetime = end_datetime - timedelta(days=6)
start_date = start_datetime.strftime("%Y-%m-%d")

# Fetch data
commits = self.get_commit_history(start_date, end_date, email)

# Aggregate commit information
commit_count = len(commits)
commit_messages = [commit['message'] for commit in commits]

# Compile the report
report = {
'username': email,
'period': {
'start': start_date,
'end': end_date
},
'summary': {
'total_commits': commit_count,
'total_pull_requests': 0,
'total_issues': 0
},
'commits': {
'count': commit_count,
'messages': commit_messages
},
'pull_requests': {
'count': 0,
'details': []
},
'issues': {
'count': 0,
'details': []
}
}

return report
1 change: 1 addition & 0 deletions mle/workflow/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .baseline import baseline
from .report import report
from .report_local import report_local
from .kaggle import kaggle, auto_kaggle
from .chat import chat
Loading

0 comments on commit 6192884

Please sign in to comment.