Improvements to the PS review script

duckduckgo · Jan 18, 2025 · fc3a6b0
1 parent f95f022
commit fc3a6b0
Show file tree

Hide file tree

Showing 2 changed files with 84 additions and 100 deletions.
diff --git a/.github/workflows/ps-review-analysis.yml b/.github/workflows/ps-review-analysis.yml
@@ -4,6 +4,23 @@ on:
   schedule:
     - cron: "0 9 * * *"  # Runs daily at 9am UTC
   workflow_dispatch:  # Allow manual triggering of the workflow
+    inputs:
+      package_name:
+        description: "Package name of the app to analyze"
+        required: true
+        default: "com.duckduckgo.mobile.android"
+      langs:
+        description: "Comma-separated list of languages"
+        required: false
+        default: "en"
+      countries:
+        description: "Comma-separated list of countries"
+        required: false
+        default: "us"
+      count:
+        description: "Number of reviews to fetch per language/country"
+        required: false
+        default: "10000"
 
 jobs:
   analyze-reviews:
@@ -26,7 +43,11 @@ jobs:
       - name: Run Review Analysis Script
         id: run-script
         run: |
-          python scripts/ps-analysis/ps_review_anomaly.py --package_name com.duckduckgo.mobile.android > output.txt
+          python scripts/ps-analysis/ps_review_anomaly.py \
+            --package_name "${{ inputs.package_name || 'com.duckduckgo.mobile.android' }}" \
+            --langs "${{ inputs.langs || 'en' }}" \
+            --countries "${{ inputs.countries || 'us' }}" \
+            --count "${{ inputs.count || '10000' }}" > output.txt  # '||' fallbacks: inputs are empty on scheduled (cron) runs
           echo "script_output<<EOF" >> $GITHUB_ENV
           cat output.txt >> $GITHUB_ENV
           echo "EOF" >> $GITHUB_ENV
@@ -43,4 +64,4 @@ jobs:
           asana-section: ${{ vars.GH_ANDROID_APP_INCOMING_SECTION_ID }}
           asana-task-name: Google Play Review Analysis -- ${{ env.current_date }}
           asana-task-description: ${{ env.script_output }}
-          action: 'create-asana-task'
+          action: 'create-asana-task'
diff --git a/scripts/ps-analysis/ps_review_anomaly.py b/scripts/ps-analysis/ps_review_anomaly.py
@@ -8,15 +8,6 @@
 def fetch_reviews_all_languages_countries(package_name, langs, countries, count=10000):
     """
     Fetch the latest reviews for all specified languages and countries.
-
-    Args:
-        package_name (str): The app's package name.
-        langs (list): List of languages to fetch reviews for.
-        countries (list): List of countries to fetch reviews for.
-        count (int): Number of reviews to fetch per language/country.
-
-    Returns:
-        list: Combined list of deduplicated reviews from all languages and countries.
     """
     all_reviews = []
     review_ids = set()  # To track and deduplicate reviews
@@ -37,7 +28,7 @@ def fetch_reviews_all_languages_countries(package_name, langs, countries, count=
                         all_reviews.append(review)
                         review_ids.add(review['reviewId'])
                     else:
-                            dup_count += 1
+                        dup_count += 1
                 print(f"Fetched {len(reviews_data)} reviews for lang={lang}, country={country}.")
             except Exception as e:
                 print(f"Failed to fetch reviews for lang={lang}, country={country}: {e}")
@@ -49,31 +40,18 @@ def fetch_reviews_all_languages_countries(package_name, langs, countries, count=
 def analyze_anomalies(reviews_data):
     """
     Analyze reviews for anomalies in all star ratings on the most recent date.
-
-    Args:
-        reviews_data (list): List of review data.
-
-    Returns:
-        dict: Analysis results including the most recent date, anomalies, and statistics.
     """
-    # Group reviews by date
     date_reviews = {}
     for review in reviews_data:
         review_date = review['at'].date()
         if review_date not in date_reviews:
             date_reviews[review_date] = []
         date_reviews[review_date].append(review)
 
-    # Calculate the number of days covered by the reviews
     all_dates = sorted(date_reviews.keys())
     num_days = (all_dates[-1] - all_dates[0]).days + 1 if all_dates else 0
+    skew_warning = f"Warning: The reviews cover only {num_days} day(s), indicating potential skew." if num_days <= 3 else None
 
-    # Check for skew in review distribution
-    skew_warning = None
-    if num_days <= 3:
-        skew_warning = f"Warning: The reviews cover only {num_days} day(s), indicating potential skew."
-
-    # Analyze daily statistics for all star ratings
     daily_stats = []
     for date, reviews_on_date in date_reviews.items():
         star_counts = Counter([review['score'] for review in reviews_on_date])
@@ -83,47 +61,40 @@ def analyze_anomalies(reviews_data):
             **{f"{star}_star": star_counts.get(star, 0) for star in range(1, 6)}
         })
 
-    # Calculate averages and standard deviations for all star ratings
     star_stats = {}
     for star in range(1, 6):
         star_counts = [day[f"{star}_star"] for day in daily_stats]
         star_avg = mean(star_counts) if star_counts else 0
         star_std = stdev(star_counts) if len(star_counts) > 1 else 0
-        star_stats[star] = {
-            'avg': star_avg,
-            'std': star_std
-        }
+        star_stats[star] = {'avg': star_avg, 'std': star_std}
 
-    # Find the most recent date with reviews
     most_recent_date = max(date_reviews.keys()) if date_reviews else None
     recent_reviews = date_reviews[most_recent_date] if most_recent_date else []
 
-    # Analyze anomalies for the most recent date using Z-scores
     star_counts = Counter([review['score'] for review in recent_reviews])
     total_reviews = len(recent_reviews)
 
     anomalies = {}
+    star_z_scores = {}
     if total_reviews > 0:
         for star in range(1, 6):
             count = star_counts.get(star, 0)
             avg = star_stats[star]['avg']
             std = star_stats[star]['std']
 
-            if std > 0:
-                z_score = (count - avg) / std
-                # Adjust Z-score threshold dynamically based on variability
-                dynamic_threshold = 2 if std > 1 else 1.5
-                if abs(z_score) > dynamic_threshold:
-                    anomalies[star] = {
-                        'count': count,
-                        'z_score': z_score,
-                        'avg': avg,
-                        'std': std,
-                        'threshold': dynamic_threshold,
-                        'star': star  # Add star rating to anomaly data for later use
-                    }
-
-    # Include reasons why no anomalies were detected
+            z_score = (count - avg) / std if std > 0 else 0
+            star_z_scores[star] = z_score
+
+            dynamic_threshold = 2 if std > 1 else 1.5
+            if abs(z_score) > dynamic_threshold: # and z_score > 0:
+                anomalies[star] = {
+                    'count': count,
+                    'z_score': z_score,
+                    'avg': avg,
+                    'std': std,
+                    'threshold': dynamic_threshold
+                }
+
     no_anomaly_reasons = []
     if total_reviews == 0:
         no_anomaly_reasons.append("No reviews available for the most recent date.")
@@ -138,36 +109,16 @@ def analyze_anomalies(reviews_data):
         'no_anomaly_reasons': no_anomaly_reasons,
         'num_days': num_days,
         'skew_warning': skew_warning,
-        'star_stats': star_stats
+        'star_stats': star_stats,
+        'star_z_scores': star_z_scores
     }
 
-
 def main():
     parser = argparse.ArgumentParser(description="Analyze Google Play Store reviews for anomalies.")
-    parser.add_argument(
-        "--package_name",
-        type=str,
-        required=True,
-        help="The package name of the app (e.g., com.duckduckgo.mobile.android).",
-    )
-    parser.add_argument(
-        "--langs",
-        type=str,
-        default="en",
-        help="Comma-separated list of languages to fetch reviews for (default: 'en').",
-    )
-    parser.add_argument(
-        "--countries",
-        type=str,
-        default="us",
-        help="Comma-separated list of countries to fetch reviews for (default: 'us').",
-    )
-    parser.add_argument(
-        "--count",
-        type=int,
-        default=10000,
-        help="Number of reviews to fetch per language/country pair (default: 10000).",
-    )
+    parser.add_argument("--package_name", type=str, required=True, help="The package name of the app.")
+    parser.add_argument("--langs", type=str, default="en", help="Comma-separated list of languages.")
+    parser.add_argument("--countries", type=str, default="us", help="Comma-separated list of countries.")
+    parser.add_argument("--count", type=int, default=10000, help="Number of reviews to fetch per language/country pair.")
 
     args = parser.parse_args()
 
@@ -176,40 +127,52 @@ def main():
     countries = args.countries.split(",")
     count = args.count
 
-    print("Fetching the latest reviews...")
-    reviews_data = fetch_reviews_all_languages_countries(package_name, langs, countries, count=count)
-
-    print("Analyzing anomalies...")
+    reviews_data = fetch_reviews_all_languages_countries(package_name, langs, countries, count)
     analysis_results = analyze_anomalies(reviews_data)
 
     print(f"\n--- Analysis Results ---")
     print(f"Most Recent Date: {analysis_results['most_recent_date']}")
     print(f"Total Reviews on Most Recent Date: {analysis_results['total_reviews']}")
     print(f"Number of Days Covered: {analysis_results['num_days']}")
     if analysis_results['skew_warning']:
-        print(f"{analysis_results['skew_warning']}")
+        print(analysis_results['skew_warning'])
+
+    print("\n--- Star Ratings Summary (⚠️ means anomaly) ---")
+    for star, stats in analysis_results['star_stats'].items():
+        avg = stats['avg']
+        std = stats['std']
+        z_score = analysis_results['star_z_scores'].get(star, 0)
+        anomaly = analysis_results['anomalies'].get(star)
+        anomaly_flag = "⚠️" if anomaly else ""
+        threshold_str = f"(Threshold: {anomaly['threshold']:.2f})" if anomaly else ""
+        print(f"Average {star}-Star Reviews: {avg:.2f}, STD: {std:.2f}, Z-Score: {z_score:.2f} {anomaly_flag} {threshold_str}")
+
+    print("\n--- Conditional Review Display ---")
     for star, stats in analysis_results['star_stats'].items():
-        print(f"Average {star}-Star Reviews: {stats['avg']:.2f}, STD: {stats['std']:.2f}")
-
-    if analysis_results['anomalies']:
-        print("\nAnomalies Found:")
-        for star, data in analysis_results['anomalies'].items():
-            print(f"{star}-Star Reviews: {data['count']} (Z-Score: {data['z_score']:.2f}, Avg: {data['avg']:.2f}, STD: {data['std']:.2f}, Threshold: {data['threshold']})")
-
-        # Print reviews related to anomalies
-        print("\n--- Reviews Related to Anomalies ---")
-        for review in analysis_results['reviews_on_date']:
-            if review['score'] in analysis_results['anomalies']:
-                print(f"\nReview by {review['userName']} (Rating: {review['score']}):")
-                print(f"Date: {review['at']}")
-                print(f"Review: {review['content']}")
-    else:
-        print("\nNo anomalies detected.")
-        if analysis_results['no_anomaly_reasons']:
-            print("Reasons:")
-            for reason in analysis_results['no_anomaly_reasons']:
-                print(f"- {reason}")
+        z_score = analysis_results['star_z_scores'].get(star, 0)
+        anomaly = analysis_results['anomalies'].get(star)
+        if anomaly:
+            threshold = anomaly['threshold']
+            if z_score > 0 and abs(z_score) > threshold:
+                print(f"\nSignificant Positive Anomaly Detected for {star}-Star Reviews:")
+                for review in analysis_results['reviews_on_date']:
+                    if review['score'] == star:
+                        print(f"Date: {review['at']}, Score: {review['score']}, Review: {review['content']}")
+            elif z_score < 0 and abs(z_score) > threshold:
+                print(f"\nWarning: Significant Negative Anomaly Detected for {star}-Star Reviews.")
+                print(f"Reviews are NOT printed because Z-score ({z_score:.2f}) is negative.")
+
+
+    print("\n--- Z-Score Guide (Dynamic Threshold-Aware) ---")
+    print("    +-----------+----------------------------+------------------------------+")
+    print("    | Z-Score   | Deviation Level            | Interpretation               |")
+    print("    +-----------+----------------------------+------------------------------+")
+    print("    | 0         | None                       | No deviation                 |")
+    print("    | 0 to th   | Minimal                    | Within expected variation    |")
+    print("    | th to 2   | Moderate                   | Potential trend or anomaly   |")
+    print("    | >2        | Severe                     | Unusual, likely an anomaly   |")
+    print("    +-----------+----------------------------+------------------------------+")
+    print("    Note: 'th' represents the dynamic threshold: 2 if STD > 1, else 1.5.")
 
 if __name__ == "__main__":
     main()
-