doc: Glides Full Data Analysis

mbta · Aug 21, 2024 · 108876c · 108876c
1 parent 9a1c3a2
commit 108876c
Showing 1 changed file with 136 additions and 0 deletions.
diff --git a/reports/glides_full_data_analysis.livemd b/reports/glides_full_data_analysis.livemd
@@ -0,0 +1,136 @@
+<!-- livebook:{"file_entries":[{"file":{"file_system_id":"local","file_system_type":"local","path":"/Users/pswartz/Dropbox/0-Inbox/trip-data-analytics-2024-08-12-2024-08-16/full_data.csv"},"name":"full_data.csv","type":"file"}]} -->
+
+# Glides Full Data Analysis
+
+```elixir
+Mix.install([
+  {:explorer, "~> 0.9.1"},
+  {:kino, "~> 0.13.2"}
+])
+```
+
+## Summary
+
+(This is based on data from 2024-08-12 through 2024-08-16)
+
+Overall, 20.8% of scheduled trips would have had an accurate prediction based solely on the schedule.
+
+Inspector-entered data (before the trip leaves) is better. 41.1% of scheduled trips had an accurate time entered by an inspector before the trip left.
+
+* dataset does not include ADDED trips (those that did not match to a scheduled departure)
+
+* 33% of scheduled departures did not have a recorded departure
+
+* of trips with recorded departures:
+
+  * ~50% leave more than a minute after the scheduled time
+  * ~50% leave between 15s after and 3 minutes after the scheduled time
+  * 31.1% would have had an accurate schedule prediction (30s before departure to 90s after)
+
+  Accuracy measurements are based on the [ETA Accuracy Benchmark](https://github.com/TransitApp/ETA-Accuracy-Benchmark?tab=readme-ov-file)
+
+## Data
+
+Fetch `full_data.csv` from the Glides report panel and add it to the workbook as a file reference.
+
+File documentation: https://www.notion.so/mbta-downtown-crossing/Trip-Data-Analytics-Export-Field-Descriptions-71f6e0fc443f4ca5aaae18183028dd0a
+
+```elixir
+require Explorer.DataFrame, as: DF
+alias Explorer.Series
+
+# Resolve the notebook file entry named "full_data.csv" and load it into a data frame.
+df =
+  "full_data.csv"
+  |> Kino.FS.file_path()
+  |> DF.from_csv!()
+
+# Kept for ad-hoc inspection of the raw rows:
+# DF.sort_by(df, [asc: service_date, asc: terminal, asc: scheduled_time])
+# |> Kino.DataTable.new()
+
+# End the cell with Kino.nothing() so the data frame itself is not the cell's result.
+Kino.nothing()
+```
+
+```elixir
+# Rebind `df` with derived columns: differences from the scheduled time, the
+# accuracy window bounds for each trip, and boolean accuracy flags.
+df = df
+|> DF.mutate(
+  # automatic_time minus scheduled_time.
+  # NOTE(review): units are presumably seconds — confirm against the export field docs.
+  schedule_inaccuracy: automatic_time - scheduled_time, 
+  # manual_time minus scheduled_time.
+  # NOTE(review): this measures the manual time against the schedule, not against
+  # automatic_time — confirm this is the intended basis for `manual_accurate`.
+  manual_inaccuracy: manual_time - scheduled_time,
+  # Upper bound used for manual_inaccuracy, chosen by final_lead_time bucket:
+  # > 10 => 90, > 6 => 60, > 3 => 60, otherwise 30.
+  # NOTE(review): buckets presumably follow the ETA Accuracy Benchmark ranges with
+  # final_lead_time in minutes — TODO confirm.
+  allowed_early: Series.select(
+    final_lead_time > 10,
+    Series.from_list([90]),
+    Series.select(
+      final_lead_time > 6,
+      Series.from_list([60]),
+      Series.select(
+        final_lead_time > 3,
+        Series.from_list([60]),
+        Series.from_list([30])
+      )
+    )
+  ),
+  # Lower bound used for manual_inaccuracy, chosen by the same buckets:
+  # > 10 => -270, > 6 => -210, > 3 => -150, otherwise -90.
+  allowed_late: Series.select(
+    final_lead_time > 10,
+    Series.from_list([-270]),
+    Series.select(
+      final_lead_time > 6,
+      Series.from_list([-210]),
+      Series.select(
+        final_lead_time > 3,
+        Series.from_list([-150]),
+        Series.from_list([-90])
+      )
+    )
+  )
+
+)
+# These flags read the columns created by the mutate above.
+|> DF.mutate(
+  # Dropped trips: accurate when automatic_time is nil.
+  # Other trips: accurate when schedule_inaccuracy is strictly between -90 and 30.
+  is_accurate: Series.select(dropped?, is_nil(automatic_time), schedule_inaccuracy > -90 and schedule_inaccuracy < 30),
+  # Rows where final_lead_time is not > 0 are always false. Otherwise:
+  # dropped trips are accurate when manual_time is nil; other trips when
+  # manual_inaccuracy is strictly between allowed_late and allowed_early.
+  manual_accurate: Series.select(
+    final_lead_time > 0,
+    Series.select(dropped?, 
+      is_nil(manual_time), 
+      manual_inaccuracy > allowed_late and manual_inaccuracy < allowed_early),
+    false)
+)
+
+
+# Summary over the whole data frame (no grouping), rendered as a data table.
+df 
+#|> DF.select([:scheduled_time, :automatic_time, :schedule_inaccuracy, :is_accurate])
+#|> DF.describe()
+|> DF.summarise(
+  # Rows with / without an automatic_time.
+  count: count(automatic_time), 
+  nil_count: nil_count(automatic_time), 
+  # Distribution of schedule_inaccuracy.
+  mean: mean(schedule_inaccuracy), 
+  std: standard_deviation(schedule_inaccuracy), 
+  p25: quantile(schedule_inaccuracy, 0.25), 
+  p50: median(schedule_inaccuracy), 
+  p75: quantile(schedule_inaccuracy, 0.75),
+  # Totals of the is_accurate / manual_accurate flags.
+  accurate_count: sum(is_accurate),
+  manual_count: sum(manual_accurate)
+)
+|> DF.mutate(
+  # Denominator is count + nil_count, so rows with a nil automatic_time are included.
+  # As computed these are ratios (no multiplication by 100), despite the `_pct` names.
+  accurate_pct: cast(accurate_count, {:u, 32}) / (count + nil_count),
+  manual_pct: cast(manual_count, {:u, 32}) / (count + nil_count)
+)
+|> Kino.DataTable.new()
+```
+
+```elixir
+# Same statistics as a whole-dataset summary, but computed per `terminal` group
+# and rendered as a data table.
+df
+|> DF.group_by(:terminal)
+|> DF.summarise(
+  # Rows with / without an automatic_time in each group.
+  count: count(automatic_time), 
+  nil_count: nil_count(automatic_time), 
+  # Distribution of schedule_inaccuracy in each group.
+  mean: mean(schedule_inaccuracy), 
+  std: standard_deviation(schedule_inaccuracy), 
+  p25: quantile(schedule_inaccuracy, 0.25), 
+  p50: median(schedule_inaccuracy), 
+  p75: quantile(schedule_inaccuracy, 0.75),
+  # Totals of the is_accurate / manual_accurate flags in each group.
+  accurate_count: sum(is_accurate),
+  manual_count: sum(manual_accurate)
+)
+|> DF.mutate(
+  # Denominator is count + nil_count, so rows with a nil automatic_time are included.
+  # As computed these are ratios (no multiplication by 100), despite the `_pct` names.
+  accurate_pct: cast(accurate_count, {:u, 32}) / (count + nil_count),
+  manual_pct: cast(manual_count, {:u, 32}) / (count + nil_count)
+)
+|> Kino.DataTable.new()
+```
+
+<!-- livebook:{"offset":4076,"stamp":{"token":"XCP.e1hm_TcJ7fZ6OIzYdFbe4NU81CyUD_ZAR0V41E0htcbyMkmezW2ouk2Yw28V_v_m-bKZIflA1yWgMSif87YpMRF9Dvzj4wFRJpy5fw","version":2}} -->