From b666f3af275aa499e3e9811bfb09a081dded7513 Mon Sep 17 00:00:00 2001
From: Ned Batchelder
Date: Tue, 28 May 2024 09:25:12 -0400
Subject: [PATCH] perf: it's faster in all versions if we don't cache tokenize
 #1791

---
 CHANGES.rst            |  6 ++++++
 coverage/phystokens.py | 14 ++++----------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index 5dac34fc0..c2673f325 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -28,6 +28,12 @@ Unreleased
   extreme case of combining 700+ data files, the time dropped from more than
   three hours to seven minutes. Thanks for Kraken Tech for funding the fix.
 
+- Performance improvements for generating HTML reports, with a side benefit of
+  reducing memory use, closing `issue 1791`_. Thanks to Daniel Diniz for
+  helping to diagnose the problem.
+
+.. _issue 1791: https://github.com/nedbat/coveragepy/issues/1791
+
 .. scriv-start-here
 
 
diff --git a/coverage/phystokens.py b/coverage/phystokens.py
index a42d184a6..8a7f9db6b 100644
--- a/coverage/phystokens.py
+++ b/coverage/phystokens.py
@@ -6,7 +6,6 @@
 from __future__ import annotations
 
 import ast
-import functools
 import io
 import keyword
 import re
@@ -163,20 +162,15 @@ def source_token_lines(source: str) -> TSourceTokenLines:
         yield line
 
 
-@functools.lru_cache(maxsize=100)
 def generate_tokens(text: str) -> TokenInfos:
-    """A cached version of `tokenize.generate_tokens`.
+    """A helper around `tokenize.generate_tokens`.
 
-    When reporting, coverage.py tokenizes files twice, once to find the
-    structure of the file, and once to syntax-color it. Tokenizing is
-    expensive, and easily cached.
+    Originally this was used to cache the results, but it didn't seem to make
+    reporting go faster, and caused issues with using too much memory.
 
-    Unfortunately, the HTML report code tokenizes all the files the first time
-    before then tokenizing them a second time, so we cache many. Ideally we'd
-    rearrange the code to tokenize each file twice before moving onto the next.
     """
     readline = io.StringIO(text).readline
-    return list(tokenize.generate_tokens(readline))
+    return tokenize.generate_tokens(readline)
 
 
 def source_encoding(source: bytes) -> str:
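
A quick illustration of what the change does (a standalone sketch, not coverage.py code): generate_tokens now returns the generator from tokenize.generate_tokens directly instead of a cached list, so any caller that iterates the result once sees the same tokens without a full token list ever being built or held by an lru_cache. The sample source string, the local TokenInfos alias, and the demo under __main__ below are illustrative assumptions, not part of the patch.

    # sketch.py -- standalone illustration; names and sample text are assumptions
    import io
    import tokenize
    from typing import Iterable

    # Assumed stand-in for the TokenInfos alias referenced in phystokens.py.
    TokenInfos = Iterable[tokenize.TokenInfo]

    def generate_tokens(text: str) -> TokenInfos:
        """Return the tokens of `text` lazily, one at a time."""
        readline = io.StringIO(text).readline
        return tokenize.generate_tokens(readline)

    if __name__ == "__main__":
        sample = "def add(a, b):\n    return a + b\n"   # hypothetical source text
        tokens = generate_tokens(sample)
        print(type(tokens))          # a generator object, not a list
        # Keywords and identifiers both arrive as NAME tokens.
        print([t.string for t in tokens if t.type == tokenize.NAME])

Per the new docstring and the changelog entry, the cache never paid for itself: reporting was not measurably faster with it, and the removed lru_cache(maxsize=100) could keep up to 100 full token lists alive at once. Returning the generator avoids building any list at all, which is where the memory saving noted in issue 1791 comes from.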