diff --git a/.github/workflows/ps-review-analysis.yml b/.github/workflows/ps-review-analysis.yml index f1d388ae0fcc..45c79a4b244e 100644 --- a/.github/workflows/ps-review-analysis.yml +++ b/.github/workflows/ps-review-analysis.yml @@ -4,6 +4,23 @@ on: schedule: - cron: "0 9 * * *" # Runs daily at 9am UTC workflow_dispatch: # Allow manual triggering of the workflow + inputs: + package_name: + description: "Package name of the app to analyze" + required: true + default: "com.duckduckgo.mobile.android" + langs: + description: "Comma-separated list of languages" + required: false + default: "en" + countries: + description: "Comma-separated list of countries" + required: false + default: "us" + count: + description: "Number of reviews to fetch per language/country" + required: false + default: "10000" jobs: analyze-reviews: @@ -26,7 +43,11 @@ jobs: - name: Run Review Analysis Script id: run-script run: | - python scripts/ps-analysis/ps_review_anomaly.py --package_name com.duckduckgo.mobile.android > output.txt + python scripts/ps-analysis/ps_review_anomaly.py \ + --package_name "${{ inputs.package_name }}" \ + --langs "${{ inputs.langs }}" \ + --countries "${{ inputs.countries }}" \ + --count "${{ inputs.count }}" > output.txt echo "script_output<<EOF" >> $GITHUB_ENV cat output.txt >> $GITHUB_ENV echo "EOF" >> $GITHUB_ENV @@ -43,4 +64,4 @@ jobs: asana-section: ${{ vars.GH_ANDROID_APP_INCOMING_SECTION_ID }} asana-task-name: Google Play Review Analysis -- ${{ env.current_date }} asana-task-description: ${{ env.script_output }} - action: 'create-asana-task' \ No newline at end of file + action: 'create-asana-task' diff --git a/scripts/ps-analysis/ps_review_anomaly.py b/scripts/ps-analysis/ps_review_anomaly.py index f7b331368240..a5aef5bad4ed 100644 --- a/scripts/ps-analysis/ps_review_anomaly.py +++ b/scripts/ps-analysis/ps_review_anomaly.py @@ -8,15 +8,6 @@ def fetch_reviews_all_languages_countries(package_name, langs, countries, count=10000): """ Fetch the latest reviews for all 
specified languages and countries. - - Args: - package_name (str): The app's package name. - langs (list): List of languages to fetch reviews for. - countries (list): List of countries to fetch reviews for. - count (int): Number of reviews to fetch per language/country. - - Returns: - list: Combined list of deduplicated reviews from all languages and countries. """ all_reviews = [] review_ids = set() # To track and deduplicate reviews @@ -37,7 +28,7 @@ def fetch_reviews_all_languages_countries(package_name, langs, countries, count= all_reviews.append(review) review_ids.add(review['reviewId']) else: - dup_count += 1 + dup_count += 1 print(f"Fetched {len(reviews_data)} reviews for lang={lang}, country={country}.") except Exception as e: print(f"Failed to fetch reviews for lang={lang}, country={country}: {e}") @@ -49,14 +40,7 @@ def fetch_reviews_all_languages_countries(package_name, langs, countries, count= def analyze_anomalies(reviews_data): """ Analyze reviews for anomalies in all star ratings on the most recent date. - - Args: - reviews_data (list): List of review data. - - Returns: - dict: Analysis results including the most recent date, anomalies, and statistics. """ - # Group reviews by date date_reviews = {} for review in reviews_data: review_date = review['at'].date() @@ -64,16 +48,10 @@ def analyze_anomalies(reviews_data): date_reviews[review_date] = [] date_reviews[review_date].append(review) - # Calculate the number of days covered by the reviews all_dates = sorted(date_reviews.keys()) num_days = (all_dates[-1] - all_dates[0]).days + 1 if all_dates else 0 + skew_warning = f"Warning: The reviews cover only {num_days} day(s), indicating potential skew." if num_days <= 3 else None - # Check for skew in review distribution - skew_warning = None - if num_days <= 3: - skew_warning = f"Warning: The reviews cover only {num_days} day(s), indicating potential skew." 
- - # Analyze daily statistics for all star ratings daily_stats = [] for date, reviews_on_date in date_reviews.items(): star_counts = Counter([review['score'] for review in reviews_on_date]) @@ -83,47 +61,40 @@ def analyze_anomalies(reviews_data): **{f"{star}_star": star_counts.get(star, 0) for star in range(1, 6)} }) - # Calculate averages and standard deviations for all star ratings star_stats = {} for star in range(1, 6): star_counts = [day[f"{star}_star"] for day in daily_stats] star_avg = mean(star_counts) if star_counts else 0 star_std = stdev(star_counts) if len(star_counts) > 1 else 0 - star_stats[star] = { - 'avg': star_avg, - 'std': star_std - } + star_stats[star] = {'avg': star_avg, 'std': star_std} - # Find the most recent date with reviews most_recent_date = max(date_reviews.keys()) if date_reviews else None recent_reviews = date_reviews[most_recent_date] if most_recent_date else [] - # Analyze anomalies for the most recent date using Z-scores star_counts = Counter([review['score'] for review in recent_reviews]) total_reviews = len(recent_reviews) anomalies = {} + star_z_scores = {} if total_reviews > 0: for star in range(1, 6): count = star_counts.get(star, 0) avg = star_stats[star]['avg'] std = star_stats[star]['std'] - if std > 0: - z_score = (count - avg) / std - # Adjust Z-score threshold dynamically based on variability - dynamic_threshold = 2 if std > 1 else 1.5 - if abs(z_score) > dynamic_threshold: - anomalies[star] = { - 'count': count, - 'z_score': z_score, - 'avg': avg, - 'std': std, - 'threshold': dynamic_threshold, - 'star': star # Add star rating to anomaly data for later use - } - - # Include reasons why no anomalies were detected + z_score = (count - avg) / std if std > 0 else 0 + star_z_scores[star] = z_score + + dynamic_threshold = 2 if std > 1 else 1.5 + if abs(z_score) > dynamic_threshold: # and z_score > 0: + anomalies[star] = { + 'count': count, + 'z_score': z_score, + 'avg': avg, + 'std': std, + 'threshold': dynamic_threshold + 
} + no_anomaly_reasons = [] if total_reviews == 0: no_anomaly_reasons.append("No reviews available for the most recent date.") @@ -138,36 +109,16 @@ def analyze_anomalies(reviews_data): 'no_anomaly_reasons': no_anomaly_reasons, 'num_days': num_days, 'skew_warning': skew_warning, - 'star_stats': star_stats + 'star_stats': star_stats, + 'star_z_scores': star_z_scores } - def main(): parser = argparse.ArgumentParser(description="Analyze Google Play Store reviews for anomalies.") - parser.add_argument( - "--package_name", - type=str, - required=True, - help="The package name of the app (e.g., com.duckduckgo.mobile.android).", - ) - parser.add_argument( - "--langs", - type=str, - default="en", - help="Comma-separated list of languages to fetch reviews for (default: 'en').", - ) - parser.add_argument( - "--countries", - type=str, - default="us", - help="Comma-separated list of countries to fetch reviews for (default: 'us').", - ) - parser.add_argument( - "--count", - type=int, - default=10000, - help="Number of reviews to fetch per language/country pair (default: 10000).", - ) + parser.add_argument("--package_name", type=str, required=True, help="The package name of the app.") + parser.add_argument("--langs", type=str, default="en", help="Comma-separated list of languages.") + parser.add_argument("--countries", type=str, default="us", help="Comma-separated list of countries.") + parser.add_argument("--count", type=int, default=10000, help="Number of reviews to fetch per language/country pair.") args = parser.parse_args() @@ -176,10 +127,7 @@ def main(): countries = args.countries.split(",") count = args.count - print("Fetching the latest reviews...") - reviews_data = fetch_reviews_all_languages_countries(package_name, langs, countries, count=count) - - print("Analyzing anomalies...") + reviews_data = fetch_reviews_all_languages_countries(package_name, langs, countries, count) analysis_results = analyze_anomalies(reviews_data) print(f"\n--- Analysis Results ---") @@ 
-187,29 +135,44 @@ def main(): print(f"Total Reviews on Most Recent Date: {analysis_results['total_reviews']}") print(f"Number of Days Covered: {analysis_results['num_days']}") if analysis_results['skew_warning']: - print(f"{analysis_results['skew_warning']}") + print(analysis_results['skew_warning']) + + print("\n--- Star Ratings Summary (⚠️ means anomaly) ---") + for star, stats in analysis_results['star_stats'].items(): + avg = stats['avg'] + std = stats['std'] + z_score = analysis_results['star_z_scores'].get(star, 0) + anomaly = analysis_results['anomalies'].get(star) + anomaly_flag = "⚠️" if anomaly else "" + threshold_str = f"(Threshold: {anomaly['threshold']:.2f})" if anomaly else "" + print(f"Average {star}-Star Reviews: {avg:.2f}, STD: {std:.2f}, Z-Score: {z_score:.2f} {anomaly_flag} {threshold_str}") + + print("\n--- Conditional Review Display ---") for star, stats in analysis_results['star_stats'].items(): - print(f"Average {star}-Star Reviews: {stats['avg']:.2f}, STD: {stats['std']:.2f}") - - if analysis_results['anomalies']: - print("\nAnomalies Found:") - for star, data in analysis_results['anomalies'].items(): - print(f"{star}-Star Reviews: {data['count']} (Z-Score: {data['z_score']:.2f}, Avg: {data['avg']:.2f}, STD: {data['std']:.2f}, Threshold: {data['threshold']})") - - # Print reviews related to anomalies - print("\n--- Reviews Related to Anomalies ---") - for review in analysis_results['reviews_on_date']: - if review['score'] in analysis_results['anomalies']: - print(f"\nReview by {review['userName']} (Rating: {review['score']}):") - print(f"Date: {review['at']}") - print(f"Review: {review['content']}") - else: - print("\nNo anomalies detected.") - if analysis_results['no_anomaly_reasons']: - print("Reasons:") - for reason in analysis_results['no_anomaly_reasons']: - print(f"- {reason}") + z_score = analysis_results['star_z_scores'].get(star, 0) + anomaly = analysis_results['anomalies'].get(star) + if anomaly: + threshold = anomaly['threshold'] 
+ if z_score > 0 and abs(z_score) > threshold: + print(f"\nSignificant Positive Anomaly Detected for {star}-Star Reviews:") + for review in analysis_results['reviews_on_date']: + if review['score'] == star: + print(f"Date: {review['at']}, Score: {review['score']}, Review: {review['content']}") + elif z_score < 0 and abs(z_score) > threshold: + print(f"\nWarning: Significant Negative Anomaly Detected for {star}-Star Reviews.") + print(f"Reviews are NOT printed because Z-score ({z_score:.2f}) is negative.") + + + print("\n--- Z-Score Guide (Dynamic Threshold-Aware) ---") + print(" +-----------+----------------------------+------------------------------+") + print(" | Z-Score | Deviation Level | Interpretation |") + print(" +-----------+----------------------------+------------------------------+") + print(" | 0 | None | No deviation |") + print(" | 0 to th | Minimal | Within expected variation |") + print(" | th to 2 | Moderate | Potential trend or anomaly |") + print(" | >2 | Severe | Unusual, likely an anomaly |") + print(" +-----------+----------------------------+------------------------------+") + print(" Note: 'th' represents the dynamic threshold: 2 if STD > 1, else 1.5.") if __name__ == "__main__": main() -