From 54a6759da80127305d891250a31fa0d2531cc203 Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Tue, 17 Mar 2020 15:21:49 -0700 Subject: [PATCH] Remove # of changes from priority calculation In practice, including the number of changes in the priority calculation has been more harmful than helpful. While some pages return the same HTML if their content hasn't changed, others return slightly different HTML for every response or for every different session (and every Wayback Memento we have is typically a unique session). That means that some pages have fewer changes over the week because they legitimately had few changes, while others have *many* changes over the course of the week even when they didn't change at all, massively skewing our priority numbers. The way we were *trying* to use the change count could be useful, but we'd need a more practical way to compare versions than by exact byte equality, which is all we have the capacity for right now. This commit removes the change count from priority to avoid the issue. Fixes #2. --- analyst_sheets/analyze.py | 10 ++++++---- analyst_sheets/sheets.py | 2 -- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/analyst_sheets/analyze.py b/analyst_sheets/analyze.py index 93a9e94..14c085f 100644 --- a/analyst_sheets/analyze.py +++ b/analyst_sheets/analyze.py @@ -245,6 +245,12 @@ def page_status_changed(page, a, b): return page_ok != first_ok +# NOTE: this function is not currently used. For it to be reasonably effective, +# we need a way to tell whether two HTML documents are *practically* the same, +# not just exactly the same (via hash, which is what we have been doing). 
For +# more on why using this without that capability is harmful rather than +# helpful, see: +# https://github.com/edgi-govdata-archiving/web-monitoring-task-sheets/issues/2 def analyze_change_count(page, after, before): """ Determine a factor for the number of changes that occurred during the time @@ -304,9 +310,6 @@ def analyze_page(page, after, before): if status_changed: priority = 1 - change_count_factor = analyze_change_count(page, after, before) - priority += 0.2 * change_count_factor - link_analysis = analyze_links(a, b) priority += 0.1 + 0.3 * priority_factor(link_analysis['diff_ratio']) # This most likely indicates a page was removed from navigation! Big deal. @@ -330,7 +333,6 @@ def analyze_page(page, after, before): return dict( priority=max(min(priority, 1), 0), versions=versions_count, - change_count_factor=change_count_factor, root_page=root_page, status=page['status'], status_changed=status_changed, diff --git a/analyst_sheets/sheets.py b/analyst_sheets/sheets.py index 29d25a1..8db8c39 100644 --- a/analyst_sheets/sheets.py +++ b/analyst_sheets/sheets.py @@ -36,7 +36,6 @@ 'Home page?', 'Changed status?', 'Status', - 'Version count factor', 'Readable?', 'Key Terms', '% Changed Text', @@ -98,7 +97,6 @@ def format_row(page, analysis, error, index, name, timestamp): analysis['root_page'], analysis['status_changed'], analysis['status'], - format(analysis['change_count_factor'], '.3f'), analysis['text']['readable'], ', '.join((f'{term}: {count}' for term, count in analysis['text']['key_terms'].items())), format(analysis['text']['percent_changed'], '.3f'),