Skip to content

Commit

Permalink
Improvements to the PS (Play Store) review script
Browse files (browse the repository at this point in the history)
  • Loading branch information
aitorvs committed Jan 18, 2025
1 parent f95f022 commit b0431a8
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 100 deletions.
25 changes: 23 additions & 2 deletions .github/workflows/ps-review-analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,23 @@ on:
schedule:
- cron: "0 9 * * *" # Runs daily at 9am UTC
workflow_dispatch: # Allow manual triggering of the workflow
inputs:
package_name:
description: "Package name of the app to analyze"
required: true
default: "com.duckduckgo.mobile.android"
langs:
description: "Comma-separated list of languages"
required: false
default: "en"
countries:
description: "Comma-separated list of countries"
required: false
default: "us"
count:
description: "Number of reviews to fetch per language/country"
required: false
default: "10000"

jobs:
analyze-reviews:
Expand All @@ -26,7 +43,11 @@ jobs:
- name: Run Review Analysis Script
id: run-script
run: |
python scripts/ps-analysis/ps_review_anomaly.py --package_name com.duckduckgo.mobile.android > output.txt
python scripts/ps-analysis/ps_review_anomaly.py \
--package_name "${{ inputs.package_name }}" \
--langs "${{ inputs.langs }}" \
--countries "${{ inputs.countries }}" \
--count "${{ inputs.count }}" > output.txt
echo "script_output<<EOF" >> $GITHUB_ENV
cat output.txt >> $GITHUB_ENV
echo "EOF" >> $GITHUB_ENV
Expand All @@ -43,4 +64,4 @@ jobs:
asana-section: ${{ vars.GH_ANDROID_APP_INCOMING_SECTION_ID }}
asana-task-name: Google Play Review Analysis -- ${{ env.current_date }}
asana-task-description: ${{ env.script_output }}
action: 'create-asana-task'
action: 'create-asana-task'
159 changes: 61 additions & 98 deletions scripts/ps-analysis/ps_review_anomaly.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,6 @@
def fetch_reviews_all_languages_countries(package_name, langs, countries, count=10000):
"""
Fetch the latest reviews for all specified languages and countries.
Args:
package_name (str): The app's package name.
langs (list): List of languages to fetch reviews for.
countries (list): List of countries to fetch reviews for.
count (int): Number of reviews to fetch per language/country.
Returns:
list: Combined list of deduplicated reviews from all languages and countries.
"""
all_reviews = []
review_ids = set() # To track and deduplicate reviews
Expand All @@ -37,7 +28,7 @@ def fetch_reviews_all_languages_countries(package_name, langs, countries, count=
all_reviews.append(review)
review_ids.add(review['reviewId'])
else:
dup_count += 1
dup_count += 1
print(f"Fetched {len(reviews_data)} reviews for lang={lang}, country={country}.")
except Exception as e:
print(f"Failed to fetch reviews for lang={lang}, country={country}: {e}")
Expand All @@ -49,31 +40,18 @@ def fetch_reviews_all_languages_countries(package_name, langs, countries, count=
def analyze_anomalies(reviews_data):
"""
Analyze reviews for anomalies in all star ratings on the most recent date.
Args:
reviews_data (list): List of review data.
Returns:
dict: Analysis results including the most recent date, anomalies, and statistics.
"""
# Group reviews by date
date_reviews = {}
for review in reviews_data:
review_date = review['at'].date()
if review_date not in date_reviews:
date_reviews[review_date] = []
date_reviews[review_date].append(review)

# Calculate the number of days covered by the reviews
all_dates = sorted(date_reviews.keys())
num_days = (all_dates[-1] - all_dates[0]).days + 1 if all_dates else 0
skew_warning = f"Warning: The reviews cover only {num_days} day(s), indicating potential skew." if num_days <= 3 else None

# Check for skew in review distribution
skew_warning = None
if num_days <= 3:
skew_warning = f"Warning: The reviews cover only {num_days} day(s), indicating potential skew."

# Analyze daily statistics for all star ratings
daily_stats = []
for date, reviews_on_date in date_reviews.items():
star_counts = Counter([review['score'] for review in reviews_on_date])
Expand All @@ -83,47 +61,40 @@ def analyze_anomalies(reviews_data):
**{f"{star}_star": star_counts.get(star, 0) for star in range(1, 6)}
})

# Calculate averages and standard deviations for all star ratings
star_stats = {}
for star in range(1, 6):
star_counts = [day[f"{star}_star"] for day in daily_stats]
star_avg = mean(star_counts) if star_counts else 0
star_std = stdev(star_counts) if len(star_counts) > 1 else 0
star_stats[star] = {
'avg': star_avg,
'std': star_std
}
star_stats[star] = {'avg': star_avg, 'std': star_std}

# Find the most recent date with reviews
most_recent_date = max(date_reviews.keys()) if date_reviews else None
recent_reviews = date_reviews[most_recent_date] if most_recent_date else []

# Analyze anomalies for the most recent date using Z-scores
star_counts = Counter([review['score'] for review in recent_reviews])
total_reviews = len(recent_reviews)

anomalies = {}
star_z_scores = {}
if total_reviews > 0:
for star in range(1, 6):
count = star_counts.get(star, 0)
avg = star_stats[star]['avg']
std = star_stats[star]['std']

if std > 0:
z_score = (count - avg) / std
# Adjust Z-score threshold dynamically based on variability
dynamic_threshold = 2 if std > 1 else 1.5
if abs(z_score) > dynamic_threshold:
anomalies[star] = {
'count': count,
'z_score': z_score,
'avg': avg,
'std': std,
'threshold': dynamic_threshold,
'star': star # Add star rating to anomaly data for later use
}

# Include reasons why no anomalies were detected
z_score = (count - avg) / std if std > 0 else 0
star_z_scores[star] = z_score

dynamic_threshold = 2 if std > 1 else 1.5
if abs(z_score) > dynamic_threshold: # and z_score > 0:
anomalies[star] = {
'count': count,
'z_score': z_score,
'avg': avg,
'std': std,
'threshold': dynamic_threshold
}

no_anomaly_reasons = []
if total_reviews == 0:
no_anomaly_reasons.append("No reviews available for the most recent date.")
Expand All @@ -138,36 +109,16 @@ def analyze_anomalies(reviews_data):
'no_anomaly_reasons': no_anomaly_reasons,
'num_days': num_days,
'skew_warning': skew_warning,
'star_stats': star_stats
'star_stats': star_stats,
'star_z_scores': star_z_scores
}


def main():
parser = argparse.ArgumentParser(description="Analyze Google Play Store reviews for anomalies.")
parser.add_argument(
"--package_name",
type=str,
required=True,
help="The package name of the app (e.g., com.duckduckgo.mobile.android).",
)
parser.add_argument(
"--langs",
type=str,
default="en",
help="Comma-separated list of languages to fetch reviews for (default: 'en').",
)
parser.add_argument(
"--countries",
type=str,
default="us",
help="Comma-separated list of countries to fetch reviews for (default: 'us').",
)
parser.add_argument(
"--count",
type=int,
default=10000,
help="Number of reviews to fetch per language/country pair (default: 10000).",
)
parser.add_argument("--package_name", type=str, required=True, help="The package name of the app.")
parser.add_argument("--langs", type=str, default="en", help="Comma-separated list of languages.")
parser.add_argument("--countries", type=str, default="us", help="Comma-separated list of countries.")
parser.add_argument("--count", type=int, default=10000, help="Number of reviews to fetch per language/country pair.")

args = parser.parse_args()

Expand All @@ -176,40 +127,52 @@ def main():
countries = args.countries.split(",")
count = args.count

print("Fetching the latest reviews...")
reviews_data = fetch_reviews_all_languages_countries(package_name, langs, countries, count=count)

print("Analyzing anomalies...")
reviews_data = fetch_reviews_all_languages_countries(package_name, langs, countries, count)
analysis_results = analyze_anomalies(reviews_data)

print(f"\n--- Analysis Results ---")
print(f"Most Recent Date: {analysis_results['most_recent_date']}")
print(f"Total Reviews on Most Recent Date: {analysis_results['total_reviews']}")
print(f"Number of Days Covered: {analysis_results['num_days']}")
if analysis_results['skew_warning']:
print(f"{analysis_results['skew_warning']}")
print(analysis_results['skew_warning'])

print("\n--- Star Ratings Summary (⚠️ means anomaly) ---")
for star, stats in analysis_results['star_stats'].items():
avg = stats['avg']
std = stats['std']
z_score = analysis_results['star_z_scores'].get(star, 0)
anomaly = analysis_results['anomalies'].get(star)
anomaly_flag = "⚠️" if anomaly else ""
threshold_str = f"(Threshold: {anomaly['threshold']:.2f})" if anomaly else ""
print(f"Average {star}-Star Reviews: {avg:.2f}, STD: {std:.2f}, Z-Score: {z_score:.2f} {anomaly_flag} {threshold_str}")

print("\n--- Conditional Review Display ---")
for star, stats in analysis_results['star_stats'].items():
print(f"Average {star}-Star Reviews: {stats['avg']:.2f}, STD: {stats['std']:.2f}")

if analysis_results['anomalies']:
print("\nAnomalies Found:")
for star, data in analysis_results['anomalies'].items():
print(f"{star}-Star Reviews: {data['count']} (Z-Score: {data['z_score']:.2f}, Avg: {data['avg']:.2f}, STD: {data['std']:.2f}, Threshold: {data['threshold']})")

# Print reviews related to anomalies
print("\n--- Reviews Related to Anomalies ---")
for review in analysis_results['reviews_on_date']:
if review['score'] in analysis_results['anomalies']:
print(f"\nReview by {review['userName']} (Rating: {review['score']}):")
print(f"Date: {review['at']}")
print(f"Review: {review['content']}")
else:
print("\nNo anomalies detected.")
if analysis_results['no_anomaly_reasons']:
print("Reasons:")
for reason in analysis_results['no_anomaly_reasons']:
print(f"- {reason}")
z_score = analysis_results['star_z_scores'].get(star, 0)
anomaly = analysis_results['anomalies'].get(star)
if anomaly:
threshold = anomaly['threshold']
if z_score > 0 and abs(z_score) > threshold:
print(f"\nSignificant Positive Anomaly Detected for {star}-Star Reviews:")
for review in analysis_results['reviews_on_date']:
if review['score'] == star:
print(f"Date: {review['at']}, Score: {review['score']}, Review: {review['content']}")
elif z_score < 0 and abs(z_score) > threshold:
print(f"\nWarning: Significant Negative Anomaly Detected for {star}-Star Reviews.")
print(f"Reviews are NOT printed because Z-score ({z_score:.2f}) is negative.")


print("\n```--- Z-Score Guide (Dynamic Threshold-Aware) ---")
print("+-----------+----------------------------+------------------------------+")
print("| Z-Score | Deviation Level | Interpretation |")
print("+-----------+----------------------------+------------------------------+")
print("| 0 | None | No deviation |")
print("| 0 to th | Minimal | Within expected variation |")
print("| th to 2 | Moderate | Potential trend or anomaly |")
print("| >2 | Severe | Unusual, likely an anomaly |")
print("+-----------+----------------------------+------------------------------+")
print("Note: 'th' represents the dynamic threshold: 2 if STD > 1, else 1.5.```")

if __name__ == "__main__":
main()

0 comments on commit b0431a8

Please sign in to comment.