Add PS analysis workflow

duckduckgo · Jan 15, 2025 · 4d80bbb · 4d80bbb
1 parent 27a830d
commit 4d80bbb
Show file tree

Hide file tree

Showing 3 changed files with 264 additions and 0 deletions.
diff --git a/.github/workflows/ps-review-analysis.yml b/.github/workflows/ps-review-analysis.yml
@@ -0,0 +1,45 @@
+name: Daily Review Analysis to Asana
+
+on:
+  schedule:
+    - cron: "0 0 * * *"  # Runs daily at midnight UTC
+  workflow_dispatch:  # Allow manual triggering of the workflow
+
+jobs:
+  analyze-reviews:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.9"
+
+      - name: Install Dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Run Review Analysis Script
+        id: run-script
+        run: |
+          python scripts/ps-analysis/ps_review_analysis.py --package_name com.duckduckgo.mobile.android > output.txt
+          echo "script_output<<EOF" >> $GITHUB_ENV
+          cat output.txt >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+      - name: Get Current Date
+        id: date
+        run: echo "current_date=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
+
+      - name: Sync Output to Asana
+        uses: duckduckgo/[email protected]
+        with:
+          asana-pat: ${{ secrets.GH_ASANA_SECRET }}
+          asana-project: ${{ vars.GH_ASANA_ANDROID_PS_REVIEWS_PROJECT }}
+          asana-task-name: "Google Play Review Analysis -- ${{ env.current_date }}"
+          asana-task-description: "${{ env.script_output }}"
+          action: 'create-asana-task'
diff --git a/scripts/ps-analysis/ps_review_anomaly.py b/scripts/ps-analysis/ps_review_anomaly.py
@@ -0,0 +1,215 @@
+import sys
+import argparse
+from datetime import datetime, timedelta
+from collections import Counter
+from statistics import mean, stdev
+from google_play_scraper import reviews, Sort
+
+def fetch_reviews_all_languages_countries(package_name, langs, countries, count=10000):
+    """
+    Fetch the latest reviews for all specified languages and countries.
+
+    Args:
+        package_name (str): The app's package name.
+        langs (list): List of languages to fetch reviews for.
+        countries (list): List of countries to fetch reviews for.
+        count (int): Number of reviews to fetch per language/country.
+
+    Returns:
+        list: Combined list of deduplicated reviews from all languages and countries.
+    """
+    all_reviews = []
+    review_ids = set()  # To track and deduplicate reviews
+    dup_count = 0
+
+    for lang in langs:
+        for country in countries:
+            try:
+                reviews_data, _ = reviews(
+                    package_name,
+                    count=count,
+                    lang=lang,
+                    country=country,
+                    sort=Sort.NEWEST
+                )
+                for review in reviews_data:
+                    if review['reviewId'] not in review_ids:
+                        all_reviews.append(review)
+                        review_ids.add(review['reviewId'])
+                    else:
+                            dup_count += 1
+                print(f"Fetched {len(reviews_data)} reviews for lang={lang}, country={country}.")
+            except Exception as e:
+                print(f"Failed to fetch reviews for lang={lang}, country={country}: {e}")
+
+    print(f"Total deduplicated reviews fetched: {len(all_reviews)}")
+    print(f"Total duplicates removed: {dup_count}")
+    return all_reviews
+
+def analyze_anomalies(reviews_data):
+    """
+    Analyze reviews for anomalies in all star ratings on the most recent date.
+
+    Args:
+        reviews_data (list): List of review data.
+
+    Returns:
+        dict: Analysis results including the most recent date, anomalies, and statistics.
+    """
+    # Group reviews by date
+    date_reviews = {}
+    for review in reviews_data:
+        review_date = review['at'].date()
+        if review_date not in date_reviews:
+            date_reviews[review_date] = []
+        date_reviews[review_date].append(review)
+
+    # Calculate the number of days covered by the reviews
+    all_dates = sorted(date_reviews.keys())
+    num_days = (all_dates[-1] - all_dates[0]).days + 1 if all_dates else 0
+
+    # Check for skew in review distribution
+    skew_warning = None
+    if num_days <= 3:
+        skew_warning = f"Warning: The reviews cover only {num_days} day(s), indicating potential skew."
+
+    # Analyze daily statistics for all star ratings
+    daily_stats = []
+    for date, reviews_on_date in date_reviews.items():
+        star_counts = Counter([review['score'] for review in reviews_on_date])
+        daily_stats.append({
+            'date': date,
+            'total_reviews': len(reviews_on_date),
+            **{f"{star}_star": star_counts.get(star, 0) for star in range(1, 6)}
+        })
+
+    # Calculate averages and standard deviations for all star ratings
+    star_stats = {}
+    for star in range(1, 6):
+        star_counts = [day[f"{star}_star"] for day in daily_stats]
+        star_avg = mean(star_counts) if star_counts else 0
+        star_std = stdev(star_counts) if len(star_counts) > 1 else 0
+        star_stats[star] = {
+            'avg': star_avg,
+            'std': star_std
+        }
+
+    # Find the most recent date with reviews
+    most_recent_date = max(date_reviews.keys()) if date_reviews else None
+    recent_reviews = date_reviews[most_recent_date] if most_recent_date else []
+
+    # Analyze anomalies for the most recent date using Z-scores
+    star_counts = Counter([review['score'] for review in recent_reviews])
+    total_reviews = len(recent_reviews)
+
+    anomalies = {}
+    if total_reviews > 0:
+        for star in range(1, 6):
+            count = star_counts.get(star, 0)
+            avg = star_stats[star]['avg']
+            std = star_stats[star]['std']
+
+            if std > 0:
+                z_score = (count - avg) / std
+                # Adjust Z-score threshold dynamically based on variability
+                dynamic_threshold = 2 if std > 1 else 1.5
+                if abs(z_score) > dynamic_threshold:
+                    anomalies[star] = {
+                        'count': count,
+                        'z_score': z_score,
+                        'avg': avg,
+                        'std': std,
+                        'threshold': dynamic_threshold,
+                        'star': star  # Add star rating to anomaly data for later use
+                    }
+
+    # Include reasons why no anomalies were detected
+    no_anomaly_reasons = []
+    if total_reviews == 0:
+        no_anomaly_reasons.append("No reviews available for the most recent date.")
+    elif not anomalies:
+        no_anomaly_reasons.append("No significant deviations (Z-scores within dynamic thresholds).")
+
+    return {
+        'most_recent_date': most_recent_date,
+        'total_reviews': total_reviews,
+        'anomalies': anomalies,
+        'reviews_on_date': recent_reviews,
+        'no_anomaly_reasons': no_anomaly_reasons,
+        'num_days': num_days,
+        'skew_warning': skew_warning,
+        'star_stats': star_stats
+    }
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Analyze Google Play Store reviews for anomalies.")
+    parser.add_argument(
+        "--package_name",
+        type=str,
+        required=True,
+        help="The package name of the app (e.g., com.duckduckgo.mobile.android).",
+    )
+    parser.add_argument(
+        "--langs",
+        type=str,
+        default="en",
+        help="Comma-separated list of languages to fetch reviews for (default: 'en').",
+    )
+    parser.add_argument(
+        "--countries",
+        type=str,
+        default="us",
+        help="Comma-separated list of countries to fetch reviews for (default: 'us').",
+    )
+    parser.add_argument(
+        "--count",
+        type=int,
+        default=10000,
+        help="Number of reviews to fetch per language/country pair (default: 10000).",
+    )
+
+    args = parser.parse_args()
+
+    package_name = args.package_name
+    langs = args.langs.split(",")
+    countries = args.countries.split(",")
+    count = args.count
+
+    print("Fetching the latest reviews...")
+    reviews_data = fetch_reviews_all_languages_countries(package_name, langs, countries, count=count)
+
+    print("Analyzing anomalies...")
+    analysis_results = analyze_anomalies(reviews_data)
+
+    print(f"\n--- Analysis Results ---")
+    print(f"Most Recent Date: {analysis_results['most_recent_date']}")
+    print(f"Total Reviews on Most Recent Date: {analysis_results['total_reviews']}")
+    print(f"Number of Days Covered: {analysis_results['num_days']}")
+    if analysis_results['skew_warning']:
+        print(f"{analysis_results['skew_warning']}")
+    for star, stats in analysis_results['star_stats'].items():
+        print(f"Average {star}-Star Reviews: {stats['avg']:.2f}, STD: {stats['std']:.2f}")
+
+    if analysis_results['anomalies']:
+        print("\nAnomalies Found:")
+        for star, data in analysis_results['anomalies'].items():
+            print(f"{star}-Star Reviews: {data['count']} (Z-Score: {data['z_score']:.2f}, Avg: {data['avg']:.2f}, STD: {data['std']:.2f}, Threshold: {data['threshold']})")
+
+        # Print reviews related to anomalies
+        print("\n--- Reviews Related to Anomalies ---")
+        for review in analysis_results['reviews_on_date']:
+            if review['score'] in analysis_results['anomalies']:
+                print(f"\nReview by {review['userName']} (Rating: {review['score']}):")
+                print(f"Date: {review['at']}")
+                print(f"Review: {review['content']}")
+    else:
+        print("\nNo anomalies detected.")
+        if analysis_results['no_anomaly_reasons']:
+            print("Reasons:")
+            for reason in analysis_results['no_anomaly_reasons']:
+                print(f"- {reason}")
+
+if __name__ == "__main__":
+    main()
+
diff --git a/scripts/ps-analysis/requirements.txt b/scripts/ps-analysis/requirements.txt
@@ -0,0 +1,4 @@
+google-play-scraper==1.2.7
+pyinstaller==6.11.1
+pyinstaller-hooks-contrib==2024.11
+setuptools==75.7.0