Skip to content

Commit

Permalink
feat: Add a basic CLI
Browse files Browse the repository at this point in the history
The existing version has too many manual steps to run before the script can run its analysis. The new CLI is much easier to use.

Unfortunately, the old script was tightly coupled to a number of global variables, which made it hard to modify, so it was easier to rewrite it from scratch. As a result, the pseudonym and author-override configs are not yet supported in the new version.
  • Loading branch information
thehale committed Nov 10, 2024
1 parent 3533107 commit b655cc6
Show file tree
Hide file tree
Showing 5 changed files with 174 additions and 276 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# Generated Project Files
repo

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
12 changes: 5 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,11 @@ files for labelling the licenses under which contributors have shared their code
poetry install
```
4. Clone the repository you wish to analyze into the included `repo` folder.
5. [Optional] Make copies of the files in the `config` folder without the `dist` extension.
6. Run the analyzer with `make run`
- The first run will take a while as it computes an accurate `git blame` for
every file in your repository. At the end of the run, a cached blame file
will be generated in the `build` directory to speed up future runs.
4. Run `python ./git_authorship REPO_URL`
- Generates a treemap at `authorship.html`
- Also generates a JSON output at `authorship.json`

<!--
## Other Features
### Author Licenses
Expand Down Expand Up @@ -96,7 +94,7 @@ All files with a file path containing `target-path` as a substring will be
attributed to the named `actual-author` under the named software license.
_A list of SPDX license identifiers can be found here:
https://spdx.org/licenses/_
https://spdx.org/licenses/_ -->

## License
Copyright (c) 2022 Joseph Hale, All Rights Reserved
Expand Down
166 changes: 166 additions & 0 deletions git_authorship/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
# Copyright (c) 2024 Joseph Hale
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import argparse
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional
import plotly.graph_objects as go
import json

from git import Repo

# Directory names, resolved against the repository root, that are skipped when
# collecting the files to blame.
EXCLUDE_DIRS = [".git"]


def parse_args(argv: Optional[List[str]] = None):
    """
    Parse the command-line arguments of the CLI.

    Args:
        argv: Argument strings to parse. When ``None`` (the default), argparse
            reads them from ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with two attributes:
        - ``location``: the optional positional argument, ``"."`` by default.
        - ``clone_to``: the value of ``--clone-to``,
          ``"./repo/git_authorship"`` by default.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "location",
        nargs="?",
        default=".",
        help="URL or path of the repository to analyze (default: %(default)s)",
    )
    # No nargs="?" here: a bare `--clone-to` would otherwise be accepted and
    # produce None, which only fails later when it is used as a path.
    parser.add_argument(
        "--clone-to",
        default="./repo/git_authorship",
        help="directory the repository is cloned into (default: %(default)s)",
    )
    # TODO --branch (to analyze a specific branch)
    return parser.parse_args(argv)


def ensure_cloned_and_pulled(location: str, clone_to: str):
    """
    Make sure a clone exists at ``clone_to`` and return a ``Repo`` for it.

    If ``clone_to`` already exists, a ``git pull`` is run inside it; otherwise
    ``location`` is cloned into ``clone_to``.

    Args:
        location: Source passed to ``Repo.clone_from``.
        clone_to: Directory holding (or about to hold) the clone.

    Returns:
        A ``Repo`` opened on ``clone_to``.
    """
    already_cloned = Path(clone_to).exists()
    if already_cloned:
        Repo(clone_to).git.pull()
    else:
        Repo.clone_from(location, clone_to)

    return Repo(clone_to)


def iterfiles(dir: Path, exclude: Optional[List[Path]] = None):
    """
    Recursively yield every file located beneath ``dir``.

    Args:
        dir: Directory to walk.
        exclude: Directories to skip entirely, at any depth. Entries are
            compared by equality against the paths produced while walking, so
            they should be built from the same ``dir`` prefix. Defaults to
            excluding nothing.

    Yields:
        The path of each file found, in ``Path.iterdir`` order.
    """
    exclude = exclude or []
    for path in dir.iterdir():
        if path.is_file():
            yield path
        elif path.is_dir() and path not in exclude:
            # Forward the exclusions; without this they would only apply to the
            # direct children of the top-level directory.
            yield from iterfiles(path, exclude)


def iterdirs(dir: Path, exclude: Optional[List[Path]] = None):
    """
    Recursively yield every directory located beneath ``dir``.

    Each directory is yielded before its own sub-directories. ``dir`` itself is
    not yielded.

    Args:
        dir: Directory to walk.
        exclude: Directories to skip, together with everything below them, at
            any depth. Entries are compared by equality against the paths
            produced while walking. Defaults to excluding nothing.

    Yields:
        The path of each directory found.
    """
    exclude = exclude or []
    for path in dir.iterdir():
        if path.is_dir() and path not in exclude:
            yield path
            # Forward the exclusions; without this they would only apply to the
            # direct children of the top-level directory.
            yield from iterdirs(path, exclude)


# Type aliases describing the authorship data produced by this module.
FilePath = Path  # path of a file or folder, used as a key in RepoAuthorship
Author = str  # author identity, formatted as "Name <email>"
LineCount = int  # number of lines attributed to an author
Authorship = Dict[Author, LineCount]  # line count per author for one path
RepoAuthorship = Dict[FilePath, Authorship]  # authorship per path


def file_authorship(repo: Repo, path: Path) -> Authorship:
    """
    Count how many lines of one file are attributed to each author.

    The file is blamed at ``HEAD`` with ``-M -C -C -C`` passed as revision
    options.

    Args:
        repo: Repository to run the blame in.
        path: Path of the file, passed to ``repo.blame`` as a string.

    Returns:
        A mapping from ``"Name <email>"`` to the number of lines blamed on that
        author. It is empty when the blame yields nothing.
    """
    blame_entries = repo.blame(
        "HEAD", str(path), rev_opts=["-M", "-C", "-C", "-C"]
    )

    line_totals = defaultdict(int)
    # A falsy blame result is treated as "no lines".
    for commit, blamed_lines in blame_entries or []:
        who = f"{commit.author.name} <{commit.author.email}>"
        line_totals[who] += len(blamed_lines)

    return line_totals


def repo_authorship(repo: Repo) -> RepoAuthorship:
    """
    Calculates how many lines each author has contributed to the repo, with
    breakdowns by folder and file.

    Every file found under the repository's working directory (skipping the
    root-level directories named in ``EXCLUDE_DIRS``) is blamed, and its
    per-author line counts are added to the file itself and to each of its
    ancestor folders up to the root.

    e.g. For a repo with the following structure:
    ```
    .
    ├── folder1
    │   ├── file1.txt (author1: 25 lines, author2: 150 lines)
    │   └── file2.txt (author1: 25 lines)
    ├── folder2
    │   ├── file1.txt (author1: 25 lines, author2: 25 lines)
    │   └── file2.txt (author1: 25 lines, author2: 25 lines)
    ```

    The result will be
    ```
    {
        Path("."): { "author1": 100, "author2": 200 },
        Path("folder1"): { "author1": 50, "author2": 150 },
        Path("folder1/file1.txt"): { "author1": 25, "author2": 150 },
        Path("folder1/file2.txt"): { "author1": 25 },
        Path("folder2"): { "author1": 50, "author2": 50 },
        Path("folder2/file1.txt"): { "author1": 25, "author2": 25 },
        Path("folder2/file2.txt"): { "author1": 25, "author2": 25 },
    }
    ```

    Args:
        repo: Repository whose working directory is analyzed.

    Returns:
        A mapping from paths relative to the repository root to per-author
        line counts. Paths whose files have no blamed lines do not appear.
    """
    root = Path(repo.working_dir)
    excluded = [root / d for d in EXCLUDE_DIRS]

    totals: RepoAuthorship = defaultdict(lambda: defaultdict(int))
    for absolute in iterfiles(root, exclude=excluded):
        # relative_to() instead of string slicing: it does not depend on the
        # path separator or on how the root happens to be spelled.
        file = absolute.relative_to(root)
        authorship = file_authorship(repo, file)

        # Credit the root first, then each folder down to the file itself.
        # Path.parents lists the nearest ancestor first and ends with ".".
        ancestors = list(file.parents)[::-1]
        for node in [*ancestors, file]:
            for author, lines in authorship.items():
                totals[node][author] += lines

    return totals


def export_treemap(authorship: RepoAuthorship, output: Path = Path("authorship.html")):
    """
    Render the authorship data as a plotly treemap and write it as HTML.

    Each path becomes one treemap node whose value is the sum of its authors'
    line counts and whose hover text lists the per-author counts.

    Args:
        authorship: Per-path authorship data to plot.
        output: Destination of the HTML file.
    """
    ids = []
    labels = []
    parents = []
    values = []
    descriptions = []
    for node, per_author in authorship.items():
        node_id = str(node)
        ids.append(node_id)
        labels.append(node.name)
        # The root node "." has no parent in the treemap.
        parents.append("" if node_id == "." else str(node.parent))
        values.append(sum(per_author.values()))
        breakdown = [f"{author}: {lines}" for author, lines in per_author.items()]
        descriptions.append("<br>Authors:<br> - " + "<br> - ".join(breakdown))

    treemap = go.Treemap(
        ids=ids,
        labels=labels,
        parents=parents,
        values=values,
        maxdepth=3,
        branchvalues="total",
        text=descriptions,
        hovertemplate="%{label}<br><br>%{value} lines<br>%{text}",
        root_color="lightgrey",
    )
    fig = go.Figure(treemap)

    fig.update_layout(margin={"t": 50, "l": 25, "r": 25, "b": 25})
    fig.write_html(output)


def export_json(authorship: RepoAuthorship, output: Path = Path("authorship.json")):
    """
    Write the authorship data to ``output`` as JSON.

    Paths are converted to strings so they can serve as JSON object keys.

    Args:
        authorship: Per-path authorship data to serialize.
        output: Destination of the JSON file.
    """
    with open(output, "w") as f:
        serializable = {}
        for path, authors in authorship.items():
            serializable[str(path)] = authors
        json.dump(serializable, f)


if __name__ == "__main__":
    # Script entry point: parse the CLI options, sync the clone, compute the
    # authorship breakdown, then write both reports.
    cli_args = parse_args()
    cloned_repo = ensure_cloned_and_pulled(cli_args.location, cli_args.clone_to)
    line_counts = repo_authorship(cloned_repo)
    export_treemap(line_counts)
    export_json(line_counts)
Loading

0 comments on commit b655cc6

Please sign in to comment.