diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 70db10c8..852f88a5 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,13 +4,15 @@ Changelog v0.5.0 (unreleased) ------------------- -Contributors to this version: Thomas-Charles Fortier Filion (:user:`TC-FF`). +Contributors to this version: Thomas-Charles Fortier Filion (:user:`TC-FF`), Gabriel Rondeau-Genesse (:user:`RondeauG`), Trevor James Smith (:user:`Zeitsperre`). Internal changes ^^^^^^^^^^^^^^^^ -* land_use_classification default collection has been changed to io-lulc-annual-v02 as previous one will be deprecated december 2024. (:pull:`227`). -* Also added some collection, year, resolution and history attributes to xarray output of land_use_classification. (:pull:`227`). -* Added a downloader agent to fix an issue related to ``pooch`` in recent ReadTheDocs builds. (:pull:`231`). +* `"land_use_classification"` default collection has been changed to `"io-lulc-annual-v02"`, as the previous one will be deprecated in December 2024. (:pull:`227`). +* Added some collection, year, resolution and history attributes to `xarray` output of `"land_use_classification"`. (:pull:`227`). +* Added a "User-Agent" to fix an issue related to `pooch` calls in the notebooks for recent ReadTheDocs builds. (:pull:`231`). +* Patched the ``xhydro.testing.helpers.deveraux()`` function to add a "User-Agent" by default. (:pull:`234`). +* Fixed the URL joining logic of the ``load_registry()`` and ``deveraux()`` functions in the `xhydro.testing.helpers` module. (:pull:`234`). 
v0.4.1 (2024-11-07) ------------------- diff --git a/docs/notebooks/climate_change.ipynb b/docs/notebooks/climate_change.ipynb index e13b493a..d75e474f 100644 --- a/docs/notebooks/climate_change.ipynb +++ b/docs/notebooks/climate_change.ipynb @@ -28,20 +28,15 @@ "from xhydro.testing.helpers import deveraux\n", "\n", "D = deveraux()\n", - "downloader = pooch.HTTPDownloader(headers={\"User-Agent\": f\"xHydro-{xh.__version__}\"})\n", "\n", "# Future streamflow file (1 file - Hydrotel driven by BCC-CSM-1.1(m))\n", - "streamflow_file = D.fetch(\n", - " \"cc_indicators/streamflow_BCC-CSM1.1-m_rcp45.nc\", downloader=downloader\n", - ")\n", + "streamflow_file = D.fetch(\"cc_indicators/streamflow_BCC-CSM1.1-m_rcp45.nc\")\n", "\n", "# Reference mean annual streamflow (QMOYAN) for 6 calibrations of Hydrotel\n", - "reference_files = D.fetch(\n", - " \"cc_indicators/reference.zip\", pooch.Unzip(), downloader=downloader\n", - ")\n", + "reference_files = D.fetch(\"cc_indicators/reference.zip\", pooch.Unzip())\n", "\n", "# Future deltas of QMOYAN (63 simulations x 6 calibrations of Hydrotel)\n", - "deltas_files = D.fetch(\"cc_indicators/deltas.zip\", pooch.Unzip(), downloader=downloader)" + "deltas_files = D.fetch(\"cc_indicators/deltas.zip\", pooch.Unzip())" ] }, { @@ -94,7 +89,7 @@ "id": "5", "metadata": {}, "source": [ - "Hydrological indicators can be separated in two broad categories: \n", + "Hydrological indicators can be separated in two broad categories:\n", "\n", "- Frequential indicators, such as the maximum 20-year flow (*Qmax20*) or the minimum 2-year 7-day averaged flow in summer (*Q7min2_summer*). Computing these is already covered in the [Local Frequency Analysis notebook](local_frequency_analysis.ipynb) notebook.\n", "- Non frequencial indicators, such as the average yearly flow.\n", @@ -180,7 +175,7 @@ "Since indicators could be output at varying frequencies, `compute_indicators` will return a dictionary where the keys are the output frequencies. 
In this example, we only have one key: `AS-JAN` (annual data starting in January). The keys follow the `pandas` nomenclature.\n", "\n", "The next step is to obtain averages over climatological periods. The `xh.cc.climatological_op` function can be called for this purpose. The inputs of that function are:\n", - " \n", + "\n", "- *ds*: Dataset to use for the computation.\n", "- *op*: Operation to perform over time. While other operations are technically possible, the following are recommended and tested: ['max', 'mean', 'median', 'min', 'std', 'sum', 'var', 'linregress'].\n", "- *window* (optional): Number of years to use for the rolling operation. If None, all the available data will be used.\n", @@ -220,7 +215,7 @@ "metadata": {}, "source": [ "Computing deltas is then as easy as calling `xh.cc.compute_deltas`. The inputs of that function are:\n", - " \n", + "\n", "- *ds*: Dataset to use for the computation.\n", "- *reference_horizon*: Either a YYYY-YYYY string corresponding to the 'horizon' coordinate of the reference period, or a xr.Dataset containing the climatological mean.\n", "- *kind*: ['+', '/', '%'] Whether to provide absolute, relative, or percentage deltas. Can also be a dictionary separated per variable name." 
diff --git a/docs/notebooks/hydrological_modelling.ipynb b/docs/notebooks/hydrological_modelling.ipynb index 06c040df..a7ccf4bf 100644 --- a/docs/notebooks/hydrological_modelling.ipynb +++ b/docs/notebooks/hydrological_modelling.ipynb @@ -112,12 +112,11 @@ "from xhydro.testing.helpers import deveraux\n", "\n", "D = deveraux()\n", - "downloader = pooch.HTTPDownloader(headers={\"User-Agent\": f\"xHydro-{xh.__version__}\"})\n", "\n", "# This notebook will use ERA5 data for a small watershed in Eastern Quebec, along with faked elevation data.\n", "\n", "# Streamflow file (1 file - Hydrotel driven by BCC-CSM-1.1(m))\n", - "meteo_file = D.fetch(\"hydro_modelling/ERA5_testdata.nc\", downloader=downloader)\n", + "meteo_file = D.fetch(\"hydro_modelling/ERA5_testdata.nc\")\n", "ds = xr.open_dataset(meteo_file)\n", "ds" ] diff --git a/docs/notebooks/local_frequency_analysis.ipynb b/docs/notebooks/local_frequency_analysis.ipynb index 74d7bc78..9aa63bb1 100644 --- a/docs/notebooks/local_frequency_analysis.ipynb +++ b/docs/notebooks/local_frequency_analysis.ipynb @@ -14,7 +14,7 @@ "outputs": [], "source": [ "# Basic imports\n", - "import hvplot.xarray\n", + "import hvplot.xarray # noqa\n", "import numpy as np\n", "import xarray as xr\n", "import xdatasets as xd\n", @@ -185,7 +185,6 @@ "outputs": [], "source": [ "# Create a mask beforehand\n", - "import random\n", "\n", "nyears = np.unique(ds.time.dt.year).size\n", "dom_start = xr.DataArray(\n", @@ -232,7 +231,7 @@ "\n", "# We use where() to mask the data that we want to ignore\n", "masked = ds.where(mask == 1)\n", - "# Since we masked almost all of the year, our tolerance for missing data should be changed accordingly\n", + "# Since we masked almost all the year, our tolerance for missing data should be changed accordingly\n", "missing = \"at_least_n\"\n", "missing_options = {\"n\": 45}\n", "\n", @@ -422,7 +421,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Lets plot the observations\n", + "# Let's plot the 
observations\n", "p1 = data.streamflow_max_spring.hvplot(\n", " x=\"return_period\", by=\"scipy_dist\", grid=True, groupby=[\"id\"], logx=True\n", ")\n", @@ -437,7 +436,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Lets now plot the distributions\n", + "# Let's now plot the distributions\n", "p2 = pp.hvplot.scatter(\n", " x=\"streamflow_max_spring_pp\",\n", " y=\"streamflow_max_spring\",\n", diff --git a/docs/notebooks/optimal_interpolation.ipynb b/docs/notebooks/optimal_interpolation.ipynb index 6dc66e80..ddc3dbb0 100644 --- a/docs/notebooks/optimal_interpolation.ipynb +++ b/docs/notebooks/optimal_interpolation.ipynb @@ -11,7 +11,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Optimal interpolation is a tool that allows combining a spatially distributed field (i.e. the \"background field\") with point observations in such a way that the entire field can be adjusted according to deviations between the observations and the field at the point of observations. For example, it can be used to combine a field of reanalysis precipitation (e.g. ERA5) with observation records, and thus adjust the reanalysis precipitation over the entire domain in a statistically optimal manner. \n", + "Optimal interpolation is a tool that allows combining a spatially distributed field (i.e. the \"background field\") with point observations in such a way that the entire field can be adjusted according to deviations between the observations and the field at the point of observations. For example, it can be used to combine a field of reanalysis precipitation (e.g. ERA5) with observation records, and thus adjust the reanalysis precipitation over the entire domain in a statistically optimal manner.\n", "\n", "This page demonstrates how to use `xhydro` to perform optimal interpolation using field-like simulations and point observations for hydrological modelling. 
In this case, the background field is a set of outputs from a distributed hydrological model and the observations correspond to real hydrometric stations. The aim is to correct the background field (i.e. the distributed hydrological simulations) using optimal interpolation, as in Lachance-Cloutier et al (2017).\n", "\n", @@ -27,7 +27,6 @@ "import datetime as dt\n", "from functools import partial\n", "from pathlib import Path\n", - "from zipfile import ZipFile\n", "\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", @@ -63,7 +62,7 @@ "* Observed data at the 3 gauged locations\n", "* Simulated data at the 5 locations\n", "\n", - "Let's define these now and show the stations on a map: " + "Let's define these now and show the stations on a map:" ] }, { @@ -164,7 +163,7 @@ "* Model 3: par[0] * exp(-h / par[1])\n", "* Model 4: par[0] * exp(-(h ** par[1]) / par[0])\n", "\n", - " We will use model #4, but you can change it below and see how it affects results. Parameters can also be changed to assess their impacts. " + " We will use model #4, but you can change it below and see how it affects results. Parameters can also be changed to assess their impacts." 
] }, { @@ -196,16 +195,16 @@ "metadata": {}, "outputs": [], "source": [ - "print(\"lat_est: \" + str(lat_est))\n", - "print(\"lon_est: \" + str(lon_est))\n", - "print(\"lat_obs: \" + str(lat_obs))\n", - "print(\"lon_obs: \" + str(lon_obs))\n", - "print(\"bg_departures: \" + str(departures))\n", - "print(\"bg_est: \" + str(scaled_simulated_flow))\n", - "print(\"bg_var_obs: \" + str(bg_var_obs))\n", - "print(\"bg_var_est: \" + str(bg_var_est))\n", - "print(\"var_obs: \" + str(var_obs))\n", - "print(\"ecf: \" + str(ecf))" + "print(f\"lat_est: {lat_est}\")\n", + "print(f\"lon_est: {lon_est}\")\n", + "print(f\"lat_obs: {lat_obs}\")\n", + "print(f\"lon_obs: {lon_obs}\")\n", + "print(f\"bg_departures: {departures}\")\n", + "print(f\"bg_est: {scaled_simulated_flow}\")\n", + "print(f\"bg_var_obs: {bg_var_obs}\")\n", + "print(f\"bg_var_est: {bg_var_est}\")\n", + "print(f\"var_obs: {var_obs}\")\n", + "print(f\"ecf: {ecf}\")" ] }, { @@ -249,9 +248,9 @@ "# Transform back into absolute values and rescale by the drainage area\n", "estimated_flow = np.exp(v_est) * drainage_area\n", "\n", - "print(\"Estimated values are: \" + str(estimated_flow))\n", - "print(\"Simulated values were: \" + str(simulated_flow))\n", - "print(\"Observed values are: \" + str(observed_flow))" + "print(f\"Estimated values are: {estimated_flow}\")\n", + "print(f\"Simulated values were: {simulated_flow}\")\n", + "print(f\"Observed values are: {observed_flow}\")" ] }, { @@ -271,7 +270,7 @@ "var_bg = np.var(departures) # Variance of the departures of the background field\n", "var_est = (\n", " var_est * var_bg\n", - ") # Complete error model that includes the interpolation variance and the departures variance.\n", + ") # Complete error model that includes the interpolation variance and the departure variance.\n", "\n", "# Using the uncertainty estimation, get the 25th percentile of the estimated flows, and un-transform\n", "percentile_values = norm.ppf(np.array(25.0) / 100.0, loc=v_est, 
scale=np.sqrt(var_est))\n", @@ -282,9 +281,9 @@ "# Get the values in real units and scale according to drainage area\n", "flows_75th_percentile = np.exp(percentile_values) * drainage_area\n", "\n", - "print(\"Estimated values for the 25th percentile are: \" + str(flows_25th_percentile))\n", - "print(\"Estimated values for the 50th percentile are: \" + str(estimated_flow))\n", - "print(\"Estimated values for the 75th percentile are: \" + str(flows_75th_percentile))" + "print(f\"Estimated values for the 25th percentile are: {flows_25th_percentile}\")\n", + "print(f\"Estimated values for the 50th percentile are: {estimated_flow}\")\n", + "print(f\"Estimated values for the 75th percentile are: {flows_75th_percentile}\")" ] }, { @@ -327,11 +326,9 @@ "outputs": [], "source": [ "# Get data\n", - "downloader = pooch.HTTPDownloader(headers={\"User-Agent\": f\"xHydro-{xh.__version__}\"})\n", "test_data_path = deveraux().fetch(\n", " \"optimal_interpolation/OI_data_corrected.zip\",\n", " pooch.Unzip(),\n", - " downloader=downloader,\n", ")\n", "directory_to_extract_to = Path(test_data_path[0]).parent\n", "\n", @@ -395,7 +392,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### IMPORTANT: \n", + "### IMPORTANT:\n", "Notice that there are a few keywords that are important in these files that the code expects:\n", "1. The streamflow observations must be in a data variable named \"streamflow\", with dimensions \"station\" and \"time\".\n", "2. 
There must be the catchment drainage area in a variable named \"drainage_area\" with dimensions \"station\".\n", @@ -466,9 +463,7 @@ "outputs": [], "source": [ "print(\n", - " \"There are a total of \"\n", - " + str(len(observation_stations))\n", - " + \" selected observation stations.\"\n", + " f\"There are a total of {len(observation_stations)} selected observation stations.\"\n", ")\n", "print(observation_stations)" ] @@ -532,7 +527,7 @@ "max_cores = 1\n", "\n", "# However, if leave_one_out_cv is set to False, then a simple operational application is performed and the model will estimate flows\n", - "# at all of the \"qsim\" simulation sites. Here we set to \"True\" to generate a Leave-One-Out Cross-Validation and thus get flows that can\n", + "# at all \"qsim\" simulation sites. Here we set to \"True\" to generate a Leave-One-Out Cross-Validation and thus get flows that can\n", "# be evaluated and compared to actual observations.\n", "leave_one_out_cv = True" ] @@ -654,7 +649,7 @@ "plt.plot(raw_simulated_flow_select, label=\"Raw simulation\")\n", "plt.plot(interpolated_flow_select, label=\"Interpolated simulation\")\n", "plt.xlabel(\"Simulation day\")\n", - "plt.ylabel(\"Streamflow (m³/s\")\n", + "plt.ylabel(\"Streamflow (m³/s)\")\n", "plt.legend()\n", "plt.show()" ] diff --git a/docs/notebooks/pmp.ipynb b/docs/notebooks/pmp.ipynb index fb9e6738..283de8c8 100644 --- a/docs/notebooks/pmp.ipynb +++ b/docs/notebooks/pmp.ipynb @@ -21,7 +21,6 @@ "outputs": [], "source": [ "from pathlib import Path\n", - "from zipfile import ZipFile\n", "\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", @@ -36,7 +35,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Open data \n", + "## Open data\n", "\n", "This example uses a sample of 2-years and 3x3 grid cells from the CMIP model which can be accessed from the xhydro-testdata repository. 
It should be noted that this example seeks to show the functionality of the package and not to provide a complete analysis of the PMP, which requires a longer data time period." ] @@ -49,16 +48,13 @@ "source": [ "import xhydro as xh\n", "\n", - "downloader = pooch.HTTPDownloader(headers={\"User-Agent\": f\"xHydro-{xh.__version__}\"})\n", "path_day_zip = deveraux().fetch(\n", " \"pmp/CMIP.CCCma.CanESM5.historical.r1i1p1f1.day.gn.zarr.zip\",\n", " pooch.Unzip(),\n", - " downloader=downloader,\n", ")\n", "path_fx_zip = deveraux().fetch(\n", " \"pmp/CMIP.CCCma.CanESM5.historical.r1i1p1f1.fx.gn.zarr.zip\",\n", " pooch.Unzip(),\n", - " downloader=downloader,\n", ")\n", "\n", "path_day_zarr = (\n", @@ -78,9 +74,9 @@ "For this example, the CMIP simulations on an daily scale were used since it contains the variables necessary for the computing of the PMP:\n", "\n", "ds_day\n", - "* pr --> Precipitation_flux \n", - "* snw --> Snow water equivalent \n", - "* hus --> Specific humidity \n", + "* pr --> Precipitation_flux\n", + "* snw --> Snow water equivalent\n", + "* hus --> Specific humidity\n", "* zg --> Geopotential height\n", "\n", "ds_fx\n", @@ -350,7 +346,7 @@ "plt.plot(np.arange(len(sm_agg)), sm_agg.values, \"o\", label=\"Summer\")\n", "plt.xticks(ticks=np.arange(len(sp_agg)), labels=sp_agg.conf.values)\n", "plt.ylabel(\"PMP\")\n", - "plt.xlabel(\"Storm cofiguration\")\n", + "plt.xlabel(\"Storm configuration\")\n", "plt.legend()" ] } diff --git a/src/xhydro/testing/helpers.py b/src/xhydro/testing/helpers.py index b80620bf..a5496a53 100644 --- a/src/xhydro/testing/helpers.py +++ b/src/xhydro/testing/helpers.py @@ -3,13 +3,18 @@ import importlib.resources as ilr import logging import os +from collections.abc import Callable +from functools import wraps from pathlib import Path +from typing import IO from urllib.error import HTTPError, URLError -from urllib.parse import urlparse +from urllib.parse import urljoin, urlparse from urllib.request import urlretrieve import pooch 
+from xhydro import __version__ as __xhydro_version__ + __all__ = [ "TESTDATA_BRANCH", "TESTDATA_CACHE_DIR", @@ -27,7 +32,7 @@ """Default version of the testing data to use when fetching datasets.""" default_testdata_repo_url = ( - "https://raw.githubusercontent.com/hydrologie/xhydro-testdata" + "https://raw.githubusercontent.com/hydrologie/xhydro-testdata/" ) """Default URL of the testing data repository to use when fetching datasets.""" @@ -47,13 +52,13 @@ .. code-block:: console - $ export XHYDRO_TESTDATA_REPO_URL="https://github.com/my_username/xhydro-testdata" + $ export XHYDRO_TESTDATA_REPO_URL="https://github.com/my_username/xhydro-testdata/" or setting the variable at runtime: .. code-block:: console - $ env XHYDRO_TESTDATA_REPO_URL="https://github.com/my_username/xhydro-testdata" pytest + $ env XHYDRO_TESTDATA_REPO_URL="https://github.com/my_username/xhydro-testdata/" pytest """ TESTDATA_BRANCH = os.getenv("XHYDRO_TESTDATA_BRANCH", default_testdata_version) @@ -113,9 +118,28 @@ def load_registry( dict Dictionary of filenames and hashes. 
""" - remote_registry = audit_url(f"{repo}/{branch}/data/registry.txt") + if not repo.endswith("/"): + repo = f"{repo}/" + remote_registry = audit_url( + urljoin( + urljoin(repo, branch if branch.endswith("/") else f"{branch}/"), + "data/registry.txt", + ) + ) - if branch != default_testdata_version: + if repo != default_testdata_repo_url: + external_repo_name = urlparse(repo).path.split("/")[-2] + external_branch_name = branch.split("/")[-1] + registry_file = Path( + str( + ilr.files("xhydro").joinpath( + f"testing/registry.{external_repo_name}.{external_branch_name}.txt" + ) + ) + ) + urlretrieve(remote_registry, registry_file) # noqa: S310 + + elif branch != default_testdata_version: custom_registry_folder = Path( str(ilr.files("xhydro").joinpath(f"testing/{branch}")) ) @@ -123,11 +147,9 @@ def load_registry( registry_file = custom_registry_folder.joinpath("registry.txt") urlretrieve(remote_registry, registry_file) # noqa: S310 - elif repo != default_testdata_repo_url: + else: registry_file = Path(str(ilr.files("xhydro").joinpath("testing/registry.txt"))) - urlretrieve(remote_registry, registry_file) # noqa: S310 - registry_file = Path(str(ilr.files("xhydro").joinpath("testing/registry.txt"))) if not registry_file.exists(): raise FileNotFoundError(f"Registry file not found: {registry_file}") @@ -189,9 +211,13 @@ def deveraux( # noqa: PR01 "The `pooch` package is required to fetch the xhydro testing data. " "You can install it with `pip install pooch` or `pip install xhydro[dev]`." 
) + if not repo.endswith("/"): + repo = f"{repo}/" + remote = audit_url( + urljoin(urljoin(repo, branch if branch.endswith("/") else f"{branch}/"), "data") + ) - remote = audit_url(f"{repo}/{branch}/data") - return pooch.create( + _devereaux = pooch.create( path=cache_dir, base_url=remote, version=default_testdata_version, @@ -200,6 +226,34 @@ def deveraux( # noqa: PR01 registry=load_registry(branch=branch, repo=repo), ) + # Add a custom fetch method to the Pooch instance + # Needed to address: https://github.com/readthedocs/readthedocs.org/issues/11763 + _devereaux.fetch_diversion = _devereaux.fetch + + # Overload the fetch method to add user-agent headers + @wraps(_devereaux.fetch_diversion) + def _fetch(*args: str, **kwargs: bool | Callable) -> str: # numpydoc ignore=GL08 + + def _downloader( + url: str, + output_file: str | IO, + poocher: pooch.Pooch, + check_only: bool | None = False, + ) -> None: + """Download the file from the URL and save it to the save_path.""" + headers = {"User-Agent": f"xhydro ({__xhydro_version__})"} + downloader = pooch.HTTPDownloader(headers=headers) + return downloader(url, output_file, poocher, check_only=check_only) + + # default to our http/s downloader with user-agent headers + kwargs.setdefault("downloader", _downloader) + return _devereaux.fetch_diversion(*args, **kwargs) + + # Replace the fetch method with the custom fetch method + _devereaux.fetch = _fetch + + return _devereaux + def populate_testing_data( temp_folder: Path | None = None, diff --git a/tests/conftest.py b/tests/conftest.py index 8d4c97d5..bd4d7372 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,7 +1,6 @@ """Pytest configuration for xHydro tests.""" # noqa: D100 -from os import PathLike from os.path import commonpath from pathlib import Path