diff --git a/src/ocelot/calculate/__init__.py b/src/ocelot/calculate/__init__.py index 41d11cc..0c754ce 100644 --- a/src/ocelot/calculate/__init__.py +++ b/src/ocelot/calculate/__init__.py @@ -1,3 +1 @@ -from .calculate import mean_distance, mean_proper_motion, mean_radius, mean_internal_velocity_dispersion, all_statistics -from .profile import king_surface_density, king_surface_density_fast, sample_2d_king_profile, sample_1d_king_profile -from . import random +# Todo decide what to make top-level here diff --git a/src/ocelot/calculate/calculate.py b/src/ocelot/calculate/calculate.py index 8422aac..973aae7 100644 --- a/src/ocelot/calculate/calculate.py +++ b/src/ocelot/calculate/calculate.py @@ -12,13 +12,6 @@ from .constants import mas_per_yr_to_rad_per_s, pc_to_m, deg_to_rad -def _weighted_standard_deviation(x, weights): - # See https://stackoverflow.com/a/52655244/12709989 for how the standard deviation is calculated in a weighted way - # Todo: not sure that this deals with small numbers of points correctly! - # See: unit test fails when v. few points used - return np.sqrt(np.cov(x, aweights=weights)) - - def _handle_ra_discontinuity(ra_data, middle_ras_raise_error=True): """Tries to detect when the ras in a field cross the (0, 360) ra discontinuity and returns corrected results. Will raise an error if ras are all over the place (which will happen e.g. at very high declinations) in which you @@ -67,158 +60,6 @@ def _handle_ra_discontinuity(ra_data, middle_ras_raise_error=True): return ra_data -def mean_distance( - data_gaia: pd.DataFrame, - membership_probabilities: Optional[np.ndarray] = None, - key_parallax: str = "parallax", - key_parallax_error: str = "parallax_error", - key_r_est: str = "r_est", - key_r_low: str = "r_lo", - key_r_high: str = "r_hi", - key_result_flag: str = "result_flag", - calculate_cbj_mean_distance: bool = False, - **kwargs, -): - """Produces mean parallax and distance statistics, including a basic handling of error. - - Done in a very basic, frequentist way, whereby means are weighted based on the magnitude of the inverse errors on - parameters and the membership probabilities (if specified). - - Args: - data_gaia (pd.DataFrame): Gaia data for the cluster in the standard format (e.g. as in DR2.) - membership_probabilities (optional, np.ndarray): membership probabilities for the cluster. When specified, - they can increase or decrease the effect of certain stars on the mean. - key_parallax (str): Gaia parameter name. - key_parallax_error (str): Gaia parameter name. - key_r_est (str): Gaia parameter name. Corresponds to Bailer-Jones+18 distance column names. - key_r_low (str): Gaia parameter name. Corresponds to Bailer-Jones+18 distance column names. - key_r_high (str): Gaia parameter name. Corresponds to Bailer-Jones+18 distance column names. - key_result_flag (str): Gaia parameter name. Corresponds to Bailer-Jones+18 distance column names. - calculate_cbj_mean_distance (bool): whether or not to even bother calculating a mean CBJ distance. 
- Default: False - - Returns: - a dict, formatted with: - { - "parallax": weighted mean parallax - "parallax_error": error on the above - "inverse_parallax": inverse of the weighted mean parallax, a naive distance estimate - "distance": weighted mean distance - "distance_error": (naive, non-Bayesian) weighted error on mean_distance - } - - """ - inferred_parameters = {} - sqrt_n_stars = np.sqrt(data_gaia.shape[0]) - - # Mean parallax - inferred_parameters["parallax"] = np.average( - data_gaia[key_parallax], weights=membership_probabilities - ) - inferred_parameters["parallax_std"] = _weighted_standard_deviation( - data_gaia[key_parallax], membership_probabilities - ) - inferred_parameters["parallax_error"] = ( - inferred_parameters["parallax_std"] / sqrt_n_stars - ) - - # The inverse too (which is a shitty proxy for distance when you're feeling too lazy to be a Bayesian) - inferred_parameters["inverse_parallax"] = 1000 / inferred_parameters["parallax"] - inferred_parameters["inverse_parallax_l68"] = 1000 / ( - inferred_parameters["parallax"] + inferred_parameters["parallax_error"] - ) - inferred_parameters["inverse_parallax_u68"] = 1000 / ( - inferred_parameters["parallax"] - inferred_parameters["parallax_error"] - ) - - # Mean distance, but a bit shit for now lol - if calculate_cbj_mean_distance: - # Todo this could infer a mean/MAP value in a Bayesian way - # We only want to work on stars with a result - good_stars = data_gaia[key_result_flag] == 1 - r_est = data_gaia.loc[good_stars, key_r_est].values - - # Deal with the fact that dropping stars without an inferred distance means membership_probabilities might not - # have the same length as our r_est - if ( - type(membership_probabilities) is not float - and membership_probabilities is not None - ): - membership_probabilities = membership_probabilities[good_stars] - - inferred_parameters["distance"] = np.average( - r_est, weights=membership_probabilities - ) - inferred_parameters["distance_std"] = _weighted_standard_deviation( - r_est, membership_probabilities - ) - inferred_parameters["distance_error"] = ( - inferred_parameters["distance_std"] / sqrt_n_stars - ) - - return inferred_parameters - - -def mean_proper_motion( - data_gaia: pd.DataFrame, - membership_probabilities: Optional[np.ndarray] = None, - key_pmra: str = "pmra", - key_pmra_error: str = "pmra_error", - key_pmdec: str = "pmdec", - key_pmdec_error: str = "pmdec_error", - **kwargs, -): - """Calculates the weighted mean proper motion of a cluster, including error. - - Done in a very basic, frequentist way, whereby means are weighted based on the magnitude of the inverse errors on - parameters and the membership probabilities (if specified). - - Args: - data_gaia (pd.DataFrame): Gaia data for the cluster in the standard format (e.g. as in DR2.) - membership_probabilities (optional, np.ndarray): membership probabilities for the cluster. When specified, - they can increase or decrease the effect of certain stars on the mean. - key_pmra (str): Gaia parameter name. - key_pmra_error (str): Gaia parameter name. - key_pmdec (str): Gaia parameter name. - key_pmdec_error (str): Gaia parameter name. - - Returns: - a dict, formatted with: - { - "pmra": weighted mean proper motion in the right ascension direction * cos declination - "pmra_error": error on the above - "pmdec": weighted mean proper motion in the declination direction - "pmdec_error": error on the above - } - - """ - inferred_parameters = {} - sqrt_n_stars = np.sqrt(data_gaia.shape[0]) - - # Mean proper motion time! 
- inferred_parameters["pmra"] = np.average( - data_gaia[key_pmra], weights=membership_probabilities - ) - inferred_parameters["pmra_std"] = _weighted_standard_deviation( - data_gaia[key_pmra], membership_probabilities - ) - inferred_parameters["pmra_error"] = inferred_parameters["pmra_std"] / sqrt_n_stars - - inferred_parameters["pmdec"] = np.average( - data_gaia[key_pmdec], weights=membership_probabilities - ) - inferred_parameters["pmdec_std"] = _weighted_standard_deviation( - data_gaia[key_pmdec], membership_probabilities - ) - inferred_parameters["pmdec_error"] = inferred_parameters["pmdec_std"] / sqrt_n_stars - - inferred_parameters["pm_dispersion"] = np.sqrt( - inferred_parameters["pmra_std"] ** 2 + inferred_parameters["pmdec_std"] ** 2 - ) - - return inferred_parameters - - def mean_radius( data_gaia: pd.DataFrame, membership_probabilities: Optional[np.ndarray] = None, @@ -359,164 +200,8 @@ def mean_radius( return inferred_parameters -def mean_internal_velocity_dispersion( - data_gaia: pd.DataFrame, - membership_probabilities: Optional[np.ndarray] = None, - already_inferred_parameters: Optional[dict] = None, - key_pmra: str = "pmra", - key_pmra_error: str = "pmra_error", - key_pmdec: str = "pmdec", - key_pmdec_error: str = "pmdec_error", - distance_to_use: str = "inverse_parallax", - **kwargs, -): - """Estimates the internal velocity dispersion of a cluster. - - Done in a very basic, frequentist way, whereby means are weighted based on the membership probabilities (if - specified). - - N.B. unlike the above functions, errors do *not change the mean* as this would potentially bias the - estimates towards being dominated by large, centrally-located stars within clusters (that have generally lower - velocities.) Hence, estimates here will be less precise but hopefully more accurate. - - Todo: add error on velocity dispersion estimation to this function (hard) - - Args: - data_gaia (pd.DataFrame): Gaia data for the cluster in the standard format (e.g. as in DR2.) - membership_probabilities (optional, np.ndarray): membership probabilities for the cluster. When specified, - they can increase or decrease the effect of certain stars on the mean. - already_inferred_parameters (optional, dict): a parameter dictionary of the mean distance and proper motion. - Otherwise, this function calculates a version. - key_pmra (str): Gaia parameter name. - key_pmra_error (str): Gaia parameter name. - key_pmdec (str): Gaia parameter name. - key_pmdec_error (str): Gaia parameter name. - distance_to_use (str): which already inferred distance to use to convert proper motions to m/s velocities. - Default: "inverse_parallax" - - Returns: - a dict, formatted with: - { - "v_ra_dec": mean velocity dispersion of the cluster - "v_ra_dec_error": error on the above - } - +def all_statistics(): """ - inferred_parameters = {} - - # Grab the distances and proper motions if they aren't specified - we'll need them in a moment! 
- if already_inferred_parameters is None: - already_inferred_parameters = { - **mean_distance(data_gaia, membership_probabilities), - **mean_proper_motion(data_gaia, membership_probabilities), - } - - # Center the proper motions on the cluster - pmra = data_gaia[key_pmra] - already_inferred_parameters["pmra"] - pmdec = data_gaia[key_pmdec] - already_inferred_parameters["pmdec"] - - # Velocity dispersion time - pm_magnitude = np.sqrt(pmra**2 + pmdec**2) - velocity_dispersion = ( - np.tan(pm_magnitude * mas_per_yr_to_rad_per_s) - * already_inferred_parameters[distance_to_use] - * pc_to_m - ) - - # Save the standard deviations of the sum of the squares of parameters as our velocity dispersions - inferred_parameters["v_internal_tangential"] = _weighted_standard_deviation( - velocity_dispersion, membership_probabilities - ) - inferred_parameters["v_internal_tangential_error"] = np.nan - - return inferred_parameters - - -def all_statistics( - data_gaia: pd.DataFrame, - membership_probabilities: Optional[np.ndarray] = None, - parameter_inference_mode: str = "mean", - override_membership_probabilities_being_off: bool = False, - middle_ras_raise_error: bool = True, - **kwargs, -): - """Wraps all subfunctions in ocelot.calculate and calculates all the stats you could possibly ever want. - - Args: - data_gaia (pd.DataFrame): Gaia data for the cluster in the standard format (e.g. as in DR2.) - membership_probabilities (optional, np.ndarray): membership probabilities for the cluster. When specified, - they can increase or decrease the effect of certain stars on the mean. - parameter_inference_mode (str): mode to use when inferring parameters. - override_membership_probabilities_being_off (bool): little check to stop membership probabilities from being - used for now, as these actually mess up the - middle_ras_raise_error (bool): whether or not a cluster having right ascensions in all ranges [0, 90), [90, 270] - and (270, 360] raises an error. The error here indicates that this cluster has extreme spherical - discontinuities (e.g. it's near a coordinate pole) and that the mean ra and mean dec will be inaccurate. - Default: True - **kwargs: keyword arguments to pass to the calculation functions this one calls. - - - Returns: - a dict, containing the following parameters for the cluster: - 'n_stars', 'ra', 'ra_error', 'dec', 'dec_error', 'ang_radius_50', 'ang_radius_50_error', 'ang_radius_c', - 'ang_radius_c_error', 'ang_radius_t', 'ang_radius_t_error', 'radius_50', 'radius_50_error', 'radius_c', - 'radius_c_error', 'radius_t', 'radius_t_error', 'parallax', 'parallax_error', 'inverse_parallax', - 'inverse_parallax_l68', 'inverse_parallax_u68', 'distance', 'distance_error', 'pmra', 'pmra_error', 'pmdec', - 'pmdec_error', 'v_internal_tangential', 'v_internal_tangential_error', 'parameter_inference_mode' - where parameter_inference_mode is the only one added by this function itself, and is a copy of the keyword arg - this function holds. - """ - if not override_membership_probabilities_being_off: - membership_probabilities = None - - if parameter_inference_mode == "mean": - # Calculate all parameters! We incrementally add to the dictionary as some of the later functions require - # parameters that we've already calculated. This is done in a weird order to make sure that the final dict is in - # the right order. 
-        inferred_parameters = {}
-
-        inferred_parameters.update(
-            mean_distance(
-                data_gaia, membership_probabilities=membership_probabilities, **kwargs
-            )
-        )
-
-        # Todo: could also add the number of 1sigma+ member stars.
-
-        inferred_parameters = {
-            "n_stars": data_gaia.shape[0],
-            **mean_radius(
-                data_gaia,
-                membership_probabilities=membership_probabilities,
-                already_inferred_parameters=inferred_parameters,
-                middle_ras_raise_error=middle_ras_raise_error,
-                **kwargs,
-            ),
-            **inferred_parameters,
-        }
-
-        inferred_parameters.update(
-            mean_proper_motion(
-                data_gaia, membership_probabilities=membership_probabilities, **kwargs
-            )
-        )
-
-        inferred_parameters.update(
-            mean_internal_velocity_dispersion(
-                data_gaia,
-                membership_probabilities=membership_probabilities,
-                already_inferred_parameters=inferred_parameters,
-                **kwargs,
-            )
-        )
-
-        inferred_parameters.update({"parameter_inference_mode": "mean"})
-
-    else:
-        raise ValueError(
-            "the only currently implemented parameter_inference_mode for this function is 'mean'"
-        )
-
-    # Return it all as one big dict
-    return inferred_parameters
+    Todo: refactor this (and other high-level calculation methods).
+    """
+    pass
diff --git a/src/ocelot/calculate/distance.py b/src/ocelot/calculate/distance.py
new file mode 100644
index 0000000..002773e
--- /dev/null
+++ b/src/ocelot/calculate/distance.py
@@ -0,0 +1 @@
+"""Functions for working with cluster distances."""
\ No newline at end of file
diff --git a/src/ocelot/calculate/generic.py b/src/ocelot/calculate/generic.py
new file mode 100644
index 0000000..b4d2dfe
--- /dev/null
+++ b/src/ocelot/calculate/generic.py
@@ -0,0 +1,97 @@
+"""Generic calculation utilities used across the module."""
+import numpy as np
+from numpy.typing import ArrayLike, NDArray
+from typing import Optional, Union
+from ocelot.util.check import _check_matching_lengths_of_non_nones
+from scipy.stats import directional_stats
+
+
+def _weighted_standard_deviation(x: ArrayLike, weights: Optional[ArrayLike] = None):
+    """Computes the weighted standard deviation. Uses the method from
+    https://stackoverflow.com/a/52655244/12709989.
+    """
+    # Todo: not sure that this deals with small numbers of points correctly!
+    # See: unit test fails when v. few points used
+    return np.sqrt(np.cov(x, aweights=weights))
+
+
+def standard_error(
+    standard_deviation: Union[ArrayLike, float, int],
+    number_of_measurements: Union[ArrayLike, int],
+) -> float:
+    """Calculates the standard error on the mean of some parameter given its standard
+    deviation.
+
+    Parameters
+    ----------
+    standard_deviation : array-like, float, or int
+        Standard deviation(s).
+    number_of_measurements : array-like of ints, or int
+        Number of measurements used to find the standard deviation.
+
+    Returns
+    -------
+    standard_error : float
+    """
+    return standard_deviation / np.sqrt(number_of_measurements)
+
+
+def mean_and_deviation(
+    values: ArrayLike,
+    weights: Optional[ArrayLike] = None,
+) -> tuple[float, float]:
+    """Calculates the mean and standard deviation of some set of values.
+
+    Parameters
+    ----------
+    values : array-like
+        Values to calculate the mean and standard deviation of.
+    weights : array-like, optional
+        Array of weights to use to compute a weighted mean and standard deviation.
+
+    Returns
+    -------
+    mean : float
+        Mean of values.
+    std : float
+        Standard deviation of values.
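+
+    Examples
+    --------
+    A quick, illustrative sanity check of the unweighted case. `np.cov` uses
+    ddof=1 by default, so the sample standard deviation of [1, 2, 3] is 1
+    (exact scalar reprs may differ between numpy versions):
+
+    >>> mean_and_deviation([1.0, 2.0, 3.0])
+    (2.0, 1.0)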
+ """ + _check_matching_lengths_of_non_nones(values, weights) + + return ( + np.average(values, weights=weights), + _weighted_standard_deviation(values, weights), + ) + + +def lonlat_to_unitvec(longitudes: NDArray, latitudes: NDArray): + """Converts longitudes and latitudes to unit vectors on a unit sphere. Uses method + at https://en.wikipedia.org/wiki/Spherical_coordinate_system#Cartesian_coordinates. + Assumes that latitudes is in the range [-pi / 2, pi / 2] as is common in + astronomical unit systems. + """ + x = np.cos(latitudes) * np.cos(longitudes) + y = np.cos(latitudes) * np.sin(longitudes) + z = np.sin(latitudes) + return np.column_stack((x, y, z)) + + +def unitvec_to_lonlat(unit_vectors: NDArray): + """Converts unit vectors on a unit sphere to longitudes and latitudes. See + `lonlat_to_unitvec` for more details. + """ + x, y, z = [column.ravel() for column in np.hsplit(unit_vectors, 3)] + longitudes = np.arctan2(y, x) + latitudes = np.arcsin(z / np.sqrt(x**2 + y**2 + z**2)) + return longitudes, latitudes + + +def spherical_mean(longitudes: ArrayLike, latitudes: ArrayLike): + """Calculates the spherical mean of angular positions.""" + longitudes = np.asarray_chkfinite(longitudes) + latitudes = np.asarray_chkfinite(latitudes) + + unit_vectors = lonlat_to_unitvec(longitudes, latitudes) + mean_unit_vector = directional_stats(unit_vectors).mean_direction + mean_lon, mean_lat = unitvec_to_lonlat(mean_unit_vector) + return mean_lon[0], mean_lat[0] diff --git a/src/ocelot/calculate/motion.py b/src/ocelot/calculate/motion.py new file mode 100644 index 0000000..6914358 --- /dev/null +++ b/src/ocelot/calculate/motion.py @@ -0,0 +1,11 @@ +"""Tools for calculating values based on proper motions and/or velocities.""" + + +def radial_velocity(velocities, with_frame_correction=False): + # Todo + pass + + +def proper_motion(proper_motions, with_frame_correction=False): + # Todo + pass diff --git a/src/ocelot/calculate/position.py b/src/ocelot/calculate/position.py new file mode 100644 index 0000000..4b3b9a2 --- /dev/null +++ b/src/ocelot/calculate/position.py @@ -0,0 +1,100 @@ +"""Different methods for calculating the center of a cluster in spherical coordinates. +(This is actually oddly difficult, thanks to how spheres work. Damn spheres.) +""" +import numpy as np +from numpy.typing import ArrayLike + +from ocelot.calculate.generic import spherical_mean + + +# def shift_longitudinal_coordinates(longitudes: ArrayLike, middle_ras_raise_error=True): +# """Helper function that tries to move longitudinal coordinates + +# Args: +# ra_data (pd.Series or np.ndarray): data on ras. +# middle_ras_raise_error (bool): whether or not a cluster having right ascensions in all ranges [0, 90), [90, 270] +# and (270, 360] raises an error. The error here indicates that this cluster has extreme spherical +# discontinuities (e.g. it's near a coordinate pole) and that the mean ra and mean dec will be inaccurate. +# Default: True + +# Returns: +# ra_data but corrected for distortions. If values are both <90 and >270, the new ra data will be in the range +# (-90, 90). + +# """ +# # Firstly, check that the ras are valid ras +# if np.any(longitudes >= 360) or np.any(longitudes < 0): +# raise ValueError( +# "at least one input ra value was invalid! Ras must be in the range [0, 360)." 
+#         )

+#     # Next, grab all the locations of dodgy friends and check that all three aren't ever in use at the same time
+#     low_ra = longitudes < 90
+#     high_ra = longitudes > 270
+#     middle_ra = np.logical_not(np.logical_or(low_ra, high_ra))

+#     # Proceed if we have both low and high ras
+#     if np.any(low_ra) and np.any(high_ra):
+#         # Stop if we have middle too (would imply stars everywhere or an extreme dec value)
+#         if np.any(middle_ra) and middle_ras_raise_error:
+#             raise ValueError(
+#                 "ra values are in all three ranges: [0, 90), [90, 270] and (270, 360). This cluster can't "
+#                 "be processed by this function! Spherical distortions must be removed first."
+#             )

+#         # Otherwise, apply the discontinuity removal
+#         else:
+#             # Make a copy so nothing weird happens
+#             longitudes = longitudes.copy()

+#             # And remove the distortion for all high numbers
+#             longitudes[high_ra] = longitudes[high_ra] - 360

+#     return longitudes
+
+
+def mean_position(
+    longitudes: ArrayLike, latitudes: ArrayLike, degrees=True
+) -> tuple[float, float]:
+    """Calculates the spherical mean of angular positions, specified as longitudes and
+    latitudes. This uses directional statistics to do so in a way that is aware of
+    discontinuities, such as the fact that 0° = 360°.
+
+    Parameters
+    ----------
+    longitudes : array-like
+        Array of longitudinal positions of stars in your cluster (e.g. right ascensions
+        or galactic longitudes). Assumed to be in the range [0°, 360°].
+    latitudes : array-like
+        Array of latitudinal positions of stars in your cluster (e.g. declinations or
+        galactic latitudes). Assumed to be in the range [-90°, 90°].
+    degrees : bool
+        Whether longitudes and latitudes are in degrees, and whether to return an answer
+        in degrees. Defaults to True. If False, longitudes and latitudes are assumed to
+        be in radians, with ranges [0, 2π] and [-π/2, π/2] respectively.
+
+    Returns
+    -------
+    mean_longitude : float
+    mean_latitude : float
+
+    Notes
+    -----
+    This function explicitly assumes that your star cluster *has* a well-defined mean
+    position. Some configurations (such as points uniformly distributed in at least one
+    axis of a sphere) will not have a meaningful mean position.
+
+    Internally, this function uses `scipy.stats.directional_stats`, with a definition
+    taken from [1]. See [2] for more background.
+
+    References
+    ----------
+    [1] Mardia, Jupp. (2000). Directional Statistics (p. 163). Wiley.
+    [2] https://en.wikipedia.org/wiki/Directional_statistics
+    """
+    if degrees:
+        longitudes, latitudes = np.radians(longitudes), np.radians(latitudes)
+    mean_lon, mean_lat = spherical_mean(longitudes, latitudes)
+    if degrees:
+        return np.degrees(mean_lon), np.degrees(mean_lat)
+    return mean_lon, mean_lat
diff --git a/src/ocelot/util/__init__.py b/src/ocelot/util/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/ocelot/util/check.py b/src/ocelot/util/check.py
new file mode 100644
index 0000000..7698bca
--- /dev/null
+++ b/src/ocelot/util/check.py
@@ -0,0 +1,15 @@
+"""Utilities for checking that things have the correct shape, etc."""
+
+
+def _check_matching_lengths_of_non_nones(*args):
+    """Checks that all non-None arguments have the same length."""
+    args_excluding_nones = [arg for arg in args if arg is not None]
+    if len(args_excluding_nones) <= 1:
+        return
+
+    first_length = len(args_excluding_nones[0])
+    for i, arg in enumerate(args_excluding_nones[1:], start=2):
+        if len(arg) != first_length:
+            raise ValueError(
+                f"Length of argument {i} does not match length of first non-None argument."
+            )
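
A minimal usage sketch of the new helpers added above (illustrative only: it assumes this patch is applied, and the cluster data, membership probabilities, and seed are made up):

    import numpy as np

    from ocelot.calculate.generic import mean_and_deviation, standard_error
    from ocelot.calculate.position import mean_position

    rng = np.random.default_rng(seed=42)

    # A fake cluster straddling the 0/360 degree wraparound in right ascension.
    ra = rng.normal(0.0, 0.5, 100) % 360  # degrees; values near 0 and near 360
    dec = rng.normal(-25.0, 0.5, 100)  # degrees

    # mean_position uses directional statistics, so the wraparound is handled
    # correctly; a naive np.mean(ra) would land near 180 degrees instead.
    mean_ra, mean_dec = mean_position(ra, dec)

    # Weighted mean, standard deviation, and standard error of a parameter
    # (e.g. parallax), weighted by made-up membership probabilities.
    parallax = rng.normal(2.0, 0.1, 100)
    probabilities = rng.uniform(0.5, 1.0, 100)
    mean_plx, std_plx = mean_and_deviation(parallax, weights=probabilities)
    plx_error = standard_error(std_plx, len(parallax))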