Skip to content

Commit

Permalink
Parallelize evaluation of llama2 rouge scores (#1995)
Browse files Browse the repository at this point in the history
* Parallelize evaluation of rouge scores

* use variable for repeated val

* [Automated Commit] Format Codebase

---------

Co-authored-by: mlcommons-bot <[email protected]>
Co-authored-by: Miro <[email protected]>
  • Loading branch information
3 people authored Jan 14, 2025
1 parent 88f4d23 commit af05f0d
Showing 1 changed file with 39 additions and 9 deletions.
48 changes: 39 additions & 9 deletions language/llama2-70b/evaluate-accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import evaluate
import numpy as np
import json
from multiprocessing import Pool, cpu_count


def get_args():
Expand Down Expand Up @@ -52,12 +53,21 @@ def postprocess_text(preds, targets):
return preds, targets


def compute_rouge_chunk(chunk):
    """Compute per-sample ROUGE scores for one chunk of data.

    Each worker process loads its own ROUGE metric instance
    (presumably because the metric object is not shared across
    processes — confirm against the `evaluate` library docs).

    Args:
        chunk: A ``(predictions, references)`` pair of equal-length
            sequences of strings.

    Returns:
        A dict mapping each ROUGE variant name to a list of per-sample
        scores (``use_aggregator=False`` disables averaging).
    """
    predictions, references = chunk
    rouge = evaluate.load("rouge")
    return rouge.compute(
        predictions=predictions,
        references=references,
        use_stemmer=True,
        use_aggregator=False,
    )


def main():

args = get_args()
dataset_path = args.dataset_file
checkpoint_path = args.checkpoint_path
metric = evaluate.load("rouge")
nltk.download("punkt")
nltk.download("punkt_tab")

Expand Down Expand Up @@ -103,23 +113,43 @@ def main():

preds, targets = postprocess_text(preds_decoded_text, target_required)

result = metric.compute(
predictions=preds, references=targets, use_stemmer=True, use_aggregator=False
)
result = {k: round(np.mean(v) * 100, 4) for k, v in result.items()}
# Split data into chunks for parallel processing
num_chunks = cpu_count() # Number of parallel processes
chunk_size = len(preds) // num_chunks + (len(preds) % num_chunks > 0)

chunks = [
(preds[i:i + chunk_size], targets[i:i + chunk_size])
for i in range(0, len(preds), chunk_size)
]

# Use multiprocessing Pool to compute ROUGE scores in parallel
with Pool(num_chunks) as pool:
results_list = pool.map(compute_rouge_chunk, chunks)

# Aggregate results from all chunks
aggregated_results = {}

for result in results_list:
for k, v in result.items():
if k not in aggregated_results:
aggregated_results[k] = []
aggregated_results[k].extend(v)

final_result = {k: round(np.mean(v) * 100, 4)
for k, v in aggregated_results.items()}

prediction_lens = [len(pred) for pred in preds]
gen_num = len(preds)

result = {
**result,
final_result.update({
"gen_len": np.sum(prediction_lens),
"gen_num": gen_num,
"gen_tok_len": gen_tok_len,
"tokens_per_sample": round(gen_tok_len / gen_num, 1),
}
})

print("\nResults\n")
print(result)
print(final_result)


if __name__ == "__main__":
Expand Down

0 comments on commit af05f0d

Please sign in to comment.