Skip to content

Commit

Permalink
Start refactor of ocelot.calculate
Browse files Browse the repository at this point in the history
  • Loading branch information
emilyhunt committed Jan 10, 2024
1 parent 28e1a2c commit 7468d80
Show file tree
Hide file tree
Showing 8 changed files with 228 additions and 321 deletions.
4 changes: 1 addition & 3 deletions src/ocelot/calculate/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1 @@
from .calculate import mean_distance, mean_proper_motion, mean_radius, mean_internal_velocity_dispersion, all_statistics
from .profile import king_surface_density, king_surface_density_fast, sample_2d_king_profile, sample_1d_king_profile
from . import random
# Todo decide what to make top-level here
321 changes: 3 additions & 318 deletions src/ocelot/calculate/calculate.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,6 @@
from .constants import mas_per_yr_to_rad_per_s, pc_to_m, deg_to_rad


def _weighted_standard_deviation(x, weights):
# See https://stackoverflow.com/a/52655244/12709989 for how the standard deviation is calculated in a weighted way
# Todo: not sure that this deals with small numbers of points correctly!
# See: unit test fails when v. few points used
return np.sqrt(np.cov(x, aweights=weights))


def _handle_ra_discontinuity(ra_data, middle_ras_raise_error=True):
"""Tries to detect when the ras in a field cross the (0, 360) ra discontinuity and returns corrected results. Will
raise an error if ras are all over the place (which will happen e.g. at very high declinations) in which you
Expand Down Expand Up @@ -67,158 +60,6 @@ def _handle_ra_discontinuity(ra_data, middle_ras_raise_error=True):
return ra_data


def mean_distance(
    data_gaia: pd.DataFrame,
    membership_probabilities: Optional[np.ndarray] = None,
    key_parallax: str = "parallax",
    key_parallax_error: str = "parallax_error",
    key_r_est: str = "r_est",
    key_r_low: str = "r_lo",
    key_r_high: str = "r_hi",
    key_result_flag: str = "result_flag",
    calculate_cbj_mean_distance: bool = False,
    **kwargs,
):
    """Produces mean parallax and distance statistics, including a basic handling of error.

    Done in a very basic, frequentist way, whereby means are weighted based on the magnitude of the inverse errors on
    parameters and the membership probabilities (if specified).

    Args:
        data_gaia (pd.DataFrame): Gaia data for the cluster in the standard format (e.g. as in DR2.)
        membership_probabilities (optional, np.ndarray): membership probabilities for the cluster. When specified,
            they can increase or decrease the effect of certain stars on the mean.
        key_parallax (str): Gaia parameter name.
        key_parallax_error (str): Gaia parameter name. (Currently unused; kept for interface compatibility.)
        key_r_est (str): Gaia parameter name. Corresponds to Bailer-Jones+18 distance column names.
        key_r_low (str): Gaia parameter name. Corresponds to Bailer-Jones+18 distance column names.
            (Currently unused; kept for interface compatibility.)
        key_r_high (str): Gaia parameter name. Corresponds to Bailer-Jones+18 distance column names.
            (Currently unused; kept for interface compatibility.)
        key_result_flag (str): Gaia parameter name. Corresponds to Bailer-Jones+18 distance column names.
        calculate_cbj_mean_distance (bool): whether or not to even bother calculating a mean CBJ distance.
            Default: False
        **kwargs: ignored; accepted so one shared kwargs dict can be passed to every statistics function.

    Returns:
        a dict, formatted with:
            {
            "parallax": weighted mean parallax
            "parallax_std": weighted standard deviation of the parallaxes
            "parallax_error": standard error on the weighted mean parallax
            "inverse_parallax": inverse of the weighted mean parallax, a naive distance estimate
            "inverse_parallax_l68": lower 1-sigma bound on the above
            "inverse_parallax_u68": upper 1-sigma bound on the above
            }
        and additionally, when calculate_cbj_mean_distance is True:
            {
            "distance": weighted mean CBJ distance
            "distance_std": weighted standard deviation of the CBJ distances
            "distance_error": (naive, non-Bayesian) weighted error on the mean distance
            }
    """
    inferred_parameters = {}
    sqrt_n_stars = np.sqrt(data_gaia.shape[0])

    # Mean parallax, with a standard error derived from the (weighted) scatter
    inferred_parameters["parallax"] = np.average(
        data_gaia[key_parallax], weights=membership_probabilities
    )
    inferred_parameters["parallax_std"] = _weighted_standard_deviation(
        data_gaia[key_parallax], membership_probabilities
    )
    inferred_parameters["parallax_error"] = (
        inferred_parameters["parallax_std"] / sqrt_n_stars
    )

    # The inverse too (which is a shitty proxy for distance when you're feeling too lazy to be a Bayesian)
    # The l68/u68 bounds come from inverting the parallax +/- its standard error, so l68 < u68 in distance.
    inferred_parameters["inverse_parallax"] = 1000 / inferred_parameters["parallax"]
    inferred_parameters["inverse_parallax_l68"] = 1000 / (
        inferred_parameters["parallax"] + inferred_parameters["parallax_error"]
    )
    inferred_parameters["inverse_parallax_u68"] = 1000 / (
        inferred_parameters["parallax"] - inferred_parameters["parallax_error"]
    )

    # Mean distance, but a bit shit for now lol
    if calculate_cbj_mean_distance:
        # Todo this could infer a mean/MAP value in a Bayesian way
        # We only want to work on stars with a result
        good_stars = data_gaia[key_result_flag] == 1
        r_est = data_gaia.loc[good_stars, key_r_est].values

        # Deal with the fact that dropping stars without an inferred distance means membership_probabilities might not
        # have the same length as our r_est. Scalar (float) weights apply uniformly and need no filtering.
        if (
            not isinstance(membership_probabilities, float)
            and membership_probabilities is not None
        ):
            membership_probabilities = membership_probabilities[good_stars]

        # NOTE(review): sqrt_n_stars still counts *all* stars here, including those dropped for having no
        # CBJ result — confirm whether distance_error should use the filtered count instead.
        inferred_parameters["distance"] = np.average(
            r_est, weights=membership_probabilities
        )
        inferred_parameters["distance_std"] = _weighted_standard_deviation(
            r_est, membership_probabilities
        )
        inferred_parameters["distance_error"] = (
            inferred_parameters["distance_std"] / sqrt_n_stars
        )

    return inferred_parameters


def mean_proper_motion(
    data_gaia: pd.DataFrame,
    membership_probabilities: Optional[np.ndarray] = None,
    key_pmra: str = "pmra",
    key_pmra_error: str = "pmra_error",
    key_pmdec: str = "pmdec",
    key_pmdec_error: str = "pmdec_error",
    **kwargs,
):
    """Calculates the weighted mean proper motion of a cluster, including error.

    Done in a very basic, frequentist way, whereby means are weighted based on the magnitude of the inverse errors on
    parameters and the membership probabilities (if specified).

    Args:
        data_gaia (pd.DataFrame): Gaia data for the cluster in the standard format (e.g. as in DR2.)
        membership_probabilities (optional, np.ndarray): membership probabilities for the cluster. When specified,
            they can increase or decrease the effect of certain stars on the mean.
        key_pmra (str): Gaia parameter name.
        key_pmra_error (str): Gaia parameter name. (Currently unused; kept for interface compatibility.)
        key_pmdec (str): Gaia parameter name.
        key_pmdec_error (str): Gaia parameter name. (Currently unused; kept for interface compatibility.)
        **kwargs: ignored; accepted so one shared kwargs dict can be passed to every statistics function.

    Returns:
        a dict, formatted with:
            {
            "pmra": weighted mean proper motion in the right ascension direction * cos declination
            "pmra_std": weighted standard deviation of the above
            "pmra_error": standard error on the weighted mean
            "pmdec": weighted mean proper motion in the declination direction
            "pmdec_std": weighted standard deviation of the above
            "pmdec_error": standard error on the weighted mean
            "pm_dispersion": quadrature sum of pmra_std and pmdec_std
            }
    """
    inferred_parameters = {}
    sqrt_n_stars = np.sqrt(data_gaia.shape[0])

    # Mean proper motion time!
    inferred_parameters["pmra"] = np.average(
        data_gaia[key_pmra], weights=membership_probabilities
    )
    inferred_parameters["pmra_std"] = _weighted_standard_deviation(
        data_gaia[key_pmra], membership_probabilities
    )
    inferred_parameters["pmra_error"] = inferred_parameters["pmra_std"] / sqrt_n_stars

    inferred_parameters["pmdec"] = np.average(
        data_gaia[key_pmdec], weights=membership_probabilities
    )
    inferred_parameters["pmdec_std"] = _weighted_standard_deviation(
        data_gaia[key_pmdec], membership_probabilities
    )
    inferred_parameters["pmdec_error"] = inferred_parameters["pmdec_std"] / sqrt_n_stars

    # Total dispersion: the two per-axis standard deviations added in quadrature
    inferred_parameters["pm_dispersion"] = np.sqrt(
        inferred_parameters["pmra_std"] ** 2 + inferred_parameters["pmdec_std"] ** 2
    )

    return inferred_parameters


def mean_radius(
data_gaia: pd.DataFrame,
membership_probabilities: Optional[np.ndarray] = None,
Expand Down Expand Up @@ -359,164 +200,8 @@ def mean_radius(
return inferred_parameters


def mean_internal_velocity_dispersion(
    data_gaia: pd.DataFrame,
    membership_probabilities: Optional[np.ndarray] = None,
    already_inferred_parameters: Optional[dict] = None,
    key_pmra: str = "pmra",
    key_pmra_error: str = "pmra_error",
    key_pmdec: str = "pmdec",
    key_pmdec_error: str = "pmdec_error",
    distance_to_use: str = "inverse_parallax",
    **kwargs,
):
    """Estimates the internal velocity dispersion of a cluster.

    Done in a very basic, frequentist way, whereby means are weighted based on the membership probabilities (if
    specified).

    N.B. unlike the above functions, errors do *not change the mean* as this would potentially bias the
    estimates towards being dominated by large, centrally-located stars within clusters (that have generally lower
    velocities.) Hence, estimates here will be less precise but hopefully more accurate.

    Todo: add error on velocity dispersion estimation to this function (hard)

    Args:
        data_gaia (pd.DataFrame): Gaia data for the cluster in the standard format (e.g. as in DR2.)
        membership_probabilities (optional, np.ndarray): membership probabilities for the cluster. When specified,
            they can increase or decrease the effect of certain stars on the mean.
        already_inferred_parameters (optional, dict): a parameter dictionary of the mean distance and proper motion.
            Otherwise, this function calculates a version.
        key_pmra (str): Gaia parameter name.
        key_pmra_error (str): Gaia parameter name. (Currently unused; kept for interface compatibility.)
        key_pmdec (str): Gaia parameter name.
        key_pmdec_error (str): Gaia parameter name. (Currently unused; kept for interface compatibility.)
        distance_to_use (str): which already inferred distance to use to convert proper motions to m/s velocities.
            Default: "inverse_parallax"
        **kwargs: ignored; accepted so one shared kwargs dict can be passed to every statistics function.

    Returns:
        a dict, formatted with:
            {
            "v_internal_tangential": weighted tangential velocity dispersion of the cluster
            "v_internal_tangential_error": error on the above (currently always np.nan)
            }
    """
    inferred_parameters = {}

    # Grab the distances and proper motions if they aren't specified - we'll need them in a moment!
    if already_inferred_parameters is None:
        already_inferred_parameters = {
            **mean_distance(data_gaia, membership_probabilities),
            **mean_proper_motion(data_gaia, membership_probabilities),
        }

    # Center the proper motions on the cluster
    pmra = data_gaia[key_pmra] - already_inferred_parameters["pmra"]
    pmdec = data_gaia[key_pmdec] - already_inferred_parameters["pmdec"]

    # Velocity dispersion time: convert each star's proper motion offset (mas/yr) into a tangential
    # velocity via the chosen mean cluster distance (pc -> m)
    pm_magnitude = np.sqrt(pmra**2 + pmdec**2)
    velocity_dispersion = (
        np.tan(pm_magnitude * mas_per_yr_to_rad_per_s)
        * already_inferred_parameters[distance_to_use]
        * pc_to_m
    )

    # Save the standard deviations of the sum of the squares of parameters as our velocity dispersions
    inferred_parameters["v_internal_tangential"] = _weighted_standard_deviation(
        velocity_dispersion, membership_probabilities
    )
    inferred_parameters["v_internal_tangential_error"] = np.nan

    return inferred_parameters


def all_statistics(
    data_gaia: pd.DataFrame,
    membership_probabilities: Optional[np.ndarray] = None,
    parameter_inference_mode: str = "mean",
    override_membership_probabilities_being_off: bool = False,
    middle_ras_raise_error: bool = True,
    **kwargs,
):
    """Wraps all subfunctions in ocelot.calculate and calculates all the stats you could possibly ever want.

    Todo: refactor this (and other high-level calculation methods).

    Args:
        data_gaia (pd.DataFrame): Gaia data for the cluster in the standard format (e.g. as in DR2.)
        membership_probabilities (optional, np.ndarray): membership probabilities for the cluster. When specified,
            they can increase or decrease the effect of certain stars on the mean. N.B. ignored unless
            override_membership_probabilities_being_off is True (see below).
        parameter_inference_mode (str): mode to use when inferring parameters. Only "mean" is implemented.
        override_membership_probabilities_being_off (bool): little check to stop membership probabilities from being
            used for now, as these actually mess up the
        middle_ras_raise_error (bool): whether or not a cluster having right ascensions in all ranges [0, 90), [90, 270]
            and (270, 360] raises an error. The error here indicates that this cluster has extreme spherical
            discontinuities (e.g. it's near a coordinate pole) and that the mean ra and mean dec will be inaccurate.
            Default: True
        **kwargs: keyword arguments to pass to the calculation functions this one calls.

    Returns:
        a dict, containing the following parameters for the cluster:
            'n_stars', 'ra', 'ra_error', 'dec', 'dec_error', 'ang_radius_50', 'ang_radius_50_error', 'ang_radius_c',
            'ang_radius_c_error', 'ang_radius_t', 'ang_radius_t_error', 'radius_50', 'radius_50_error', 'radius_c',
            'radius_c_error', 'radius_t', 'radius_t_error', 'parallax', 'parallax_error', 'inverse_parallax',
            'inverse_parallax_l68', 'inverse_parallax_u68', 'distance', 'distance_error', 'pmra', 'pmra_error', 'pmdec',
            'pmdec_error', 'v_internal_tangential', 'v_internal_tangential_error', 'parameter_inference_mode'
        where parameter_inference_mode is the only one added by this function itself, and is a copy of the keyword arg
        this function holds.

    Raises:
        ValueError: if parameter_inference_mode is anything other than "mean".
    """
    # Membership probabilities are disabled by default - see the docstring above.
    if not override_membership_probabilities_being_off:
        membership_probabilities = None

    if parameter_inference_mode == "mean":
        # Calculate all parameters! We incrementally add to the dictionary as some of the later functions require
        # parameters that we've already calculated. This is done in a weird order to make sure that the final dict is in
        # the right order.
        inferred_parameters = {}

        inferred_parameters.update(
            mean_distance(
                data_gaia, membership_probabilities=membership_probabilities, **kwargs
            )
        )

        # Todo: could also add the number of 1sigma+ member stars.

        # Rebuild the dict so that n_stars and the radius parameters come first, followed by the
        # distance parameters we already computed.
        inferred_parameters = {
            "n_stars": data_gaia.shape[0],
            **mean_radius(
                data_gaia,
                membership_probabilities=membership_probabilities,
                already_inferred_parameters=inferred_parameters,
                middle_ras_raise_error=middle_ras_raise_error,
                **kwargs,
            ),
            **inferred_parameters,
        }

        inferred_parameters.update(
            mean_proper_motion(
                data_gaia, membership_probabilities=membership_probabilities, **kwargs
            )
        )

        inferred_parameters.update(
            mean_internal_velocity_dispersion(
                data_gaia,
                membership_probabilities=membership_probabilities,
                already_inferred_parameters=inferred_parameters,
                **kwargs,
            )
        )

        inferred_parameters.update({"parameter_inference_mode": "mean"})

    else:
        raise ValueError(
            "the only currently implemented parameter_inference_mode for this function is 'mean'"
        )

    # Return it all as one big dict
    return inferred_parameters
1 change: 1 addition & 0 deletions src/ocelot/calculate/distance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Functions for working with cluster distances."""
Loading

0 comments on commit 7468d80

Please sign in to comment.