diff --git a/src/qp/metrics/parallel_metrics.ipynb b/src/qp/metrics/parallel_metrics.ipynb
index 3996f6a..27405eb 100644
--- a/src/qp/metrics/parallel_metrics.ipynb
+++ b/src/qp/metrics/parallel_metrics.ipynb
@@ -12,6 +12,7 @@
     "    PointSigmaIQR,\n",
     "    PointBias,\n",
     "    PointSigmaMAD,\n",
+    "    PointOutlierRate,\n",
     ")"
    ]
   },
@@ -142,7 +143,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "An example running the PointBias metric in directly and in parallel"
+    "### An example running the PointBias metric directly and in parallel"
    ]
   },
   {
@@ -167,7 +168,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "An example running PointSigmaMAD directly and in parallel"
+    "### An example running PointSigmaMAD directly and in parallel"
    ]
   },
   {
@@ -221,6 +222,66 @@
    "source": [
     "run_parallel_metric(PointSigmaMAD(), point_sigma_mad_data_chunks)"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### An example running the PointOutlierRate metric directly and in parallel"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# An example with PointOutlierRate\n",
+    "point_outlier_estimator = PointOutlierRate()\n",
+    "point_outlier_estimator_list = [point_outlier_estimator]*n_chunk\n",
+    "point_outlier_data_chunks = [chunk for chunk in zip(point_outlier_estimator_list, chunker(estimate, chunk_size), chunker(reference, chunk_size))]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "PointOutlierRate().evaluate(estimate, reference)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "por = PointOutlierRate()\n",
+    "centroids = por.accumulate(estimate, reference)\n",
+    "\n",
+    "por.finalize(centroids=[centroids])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The estimate of the metric trends closer to the analytic value as `compression` is increased.\n",
+    "\n",
+    "The default value of `compression` is 1000. If it is set to 10_000, the estimate becomes 0.13645.\n",
+    "\n",
+    "Note that setting `compression = 10_000` also increases runtime and memory usage."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "run_parallel_metric(PointOutlierRate(), point_outlier_data_chunks)"
+   ]
+  }
  ],
  "metadata": {
diff --git a/src/qp/metrics/point_estimate_metric_classes.py b/src/qp/metrics/point_estimate_metric_classes.py
index caf3748..5abf8b1 100644
--- a/src/qp/metrics/point_estimate_metric_classes.py
+++ b/src/qp/metrics/point_estimate_metric_classes.py
@@ -178,6 +178,37 @@ def evaluate(self, estimate, reference):
         outlier = np.sum(mask)
         return float(outlier) / float(num)
 
+    def accumulate(self, estimate, reference):
+        ez = (estimate - reference) / (1.0 + reference)
+        digest = TDigest.compute(ez, compression=1000)
+        centroids = digest.get_centroids()
+        return centroids
+
+    def finalize(self, centroids=None):
+        digests = (
+            TDigest.of_centroids(np.array(centroid), compression=1000)
+            for centroid in centroids
+        )
+        digest = reduce(add, digests)
+
+        # this replaces the call to PointSigmaIQR().evaluate()
+        x75, x25 = digest.inverse_cdf([0.75, 0.25])
+        iqr = x75 - x25
+        sigma_iqr = iqr / 1.349
+
+        three_sig = 3.0 * sigma_iqr
+        cut_criterion = np.maximum(0.06, three_sig)
+
+        # here the centroids (means and point counts) are used as an
+        # approximation of the full ez array
+        centroids = digest.get_centroids()
+        mask = np.fabs(centroids[:, 0]) > cut_criterion
+        outlier = np.sum(centroids[mask, 1])
+
+        # Since we use equal weights for all the values in the digest,
+        # digest.weight is the total number of values, stored as a float.
+        return float(outlier) / digest.weight
+
 
 class PointSigmaMAD(PointToPointMetric):
     """Function to calculate median absolute deviation and sigma
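# ---------------------------------------------------------------------------
# A minimal usage sketch (not part of the patch above) of how the new
# accumulate()/finalize() pair is intended to be driven over chunked data,
# using only the methods added in this diff. Assumptions: the class is
# importable from qp.metrics.point_estimate_metric_classes (matching the file
# path above), and the synthetic estimate/reference arrays and chunk size are
# illustrative placeholders, not values from the notebook.
import numpy as np

from qp.metrics.point_estimate_metric_classes import PointOutlierRate

rng = np.random.default_rng(42)
reference = rng.normal(loc=1.0, scale=0.5, size=10_000)
estimate = reference + rng.normal(scale=0.1, size=10_000)

metric = PointOutlierRate()
chunk_size = 2_500

# Each worker calls accumulate() on its own chunk; the return value is the
# t-digest centroid array for that chunk.
centroids_per_chunk = [
    metric.accumulate(estimate[i:i + chunk_size], reference[i:i + chunk_size])
    for i in range(0, len(estimate), chunk_size)
]

# The driver merges the per-chunk digests and computes the outlier rate from
# the combined centroids.
parallel_estimate = metric.finalize(centroids=centroids_per_chunk)

# For comparison, the exact serial computation on the full arrays. The two
# values agree only approximately; raising the hard-coded compression=1000 in
# accumulate()/finalize() tightens the approximation at the cost of runtime
# and memory, as noted in the notebook cell added above.
serial_estimate = PointOutlierRate().evaluate(estimate, reference)
print(parallel_estimate, serial_estimate)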