diff --git a/.gitignore b/.gitignore index 80012883..2c1c8d72 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,4 @@ __pycache__ .ruff-cache .venv +.vscode/settings.json diff --git a/src/ewatercycle/analysis/__init__.py b/src/ewatercycle/analysis/__init__.py index 473cafa8..6d39d4b9 100644 --- a/src/ewatercycle/analysis/__init__.py +++ b/src/ewatercycle/analysis/__init__.py @@ -1,159 +1,5 @@ -"""Analysis methods for eWaterCycle.""" +"""ewatercycle analysis module.""" -import os +from . import hydrograph -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -from hydrostats import metrics -from matplotlib.axes import Axes -from matplotlib.dates import DateFormatter -from matplotlib.figure import Figure - - -def _downsample(df, nrows=100, agg="mean"): - """Resample dataframe with datetimeindex to a fixed number of rows.""" - if len(df) <= nrows: - return df, df.index[1] - df.index[0] - - grouper = np.arange(len(df)) // (len(df) / nrows) - new_df = df.groupby(grouper).agg(agg) - new_df.index = pd.date_range(df.index[0], df.index[-1], periods=nrows) - - new_period = (df.index[-1] - df.index[0]) / nrows - - return new_df, new_period - - -def hydrograph( - discharge: pd.DataFrame, - *, - reference: str, - precipitation: pd.DataFrame | None = None, - dpi: int | None = None, - title: str = "Hydrograph", - discharge_units: str = "m$^3$ s$^{-1}$", - precipitation_units: str = "mm day$^{-1}$", - figsize: tuple[float, float] = (10, 10), - filename: os.PathLike | str | None = None, - nbars: int | None = None, - **kwargs, -) -> tuple[Figure, tuple[Axes, Axes]]: - """Plot a hydrograph. - - This utility function makes it convenient to create a hydrograph from - a set of discharge data from a `pandas.DataFrame`. A column must be marked - as the reference, so that the agreement metrics can be calculated. - - Optionally, the corresponding precipitation data can be plotted for - comparison. - - Args: - discharge: Dataframe containing time series of discharge data to be plotted. - reference: Name of the reference data, must correspond to a column - in the discharge dataframe. Metrics are calculated between - the reference column and each of the other columns. - precipitation: Optional dataframe containing time series of precipitation data - to be plotted from the top of the hydrograph. - dpi: DPI for the plot. - title: Title of the hydrograph. - discharge_units: Units for the discharge data. - precipitation_units: Units for the precipitation data. - figsize: With, height of the plot in inches. - filename: If specified, a copy of the plot will be saved to this path. - nbars: Number of bars to use for downsampling precipitation. - **kwargs: Options to pass to the matplotlib plotting function - - Returns: - First tuple member is a matplotlib figure, the second is a tuple of axes. - """ - discharge_cols = discharge.columns.drop(reference) - y_obs = discharge[reference] - y_sim = discharge[discharge_cols] - - fig, (ax, ax_tbl) = plt.subplots( - nrows=2, - ncols=1, - dpi=dpi, - figsize=figsize, - gridspec_kw={"height_ratios": [3, 1]}, - ) - - ax.set_title(title) - ax.set_ylabel(f"Discharge ({discharge_units})") - - y_sim.plot(ax=ax, **kwargs) - y_obs.plot(ax=ax, **kwargs) - - handles, labels = ax.get_legend_handles_labels() - - # Add precipitation as bar plot to the top if specified - if precipitation is not None: - if nbars is not None: - precipitation, barwidth = _downsample( - precipitation, nrows=nbars, agg="mean" - ) - else: - barwidth = 0.8 # default value for matplotlib barplot - - ax_pr = ax.twinx() - ax_pr.invert_yaxis() - ax_pr.set_ylabel(f"Precipitation ({precipitation_units})") - - for pr_label, pr_timeseries in precipitation.items(): - ax_pr.bar( - pr_timeseries.index.values, - pr_timeseries.values, - width=barwidth, - alpha=0.4, - label=pr_label, - ) - - # tweak ylim to make space at bottom and top - ax_pr.set_ylim(ax_pr.get_ylim()[0] * (7 / 2), 0) - ax.set_ylim(0, ax.get_ylim()[1] * (7 / 5)) - - # prepend handles/labels so they appear at the top - handles_pr, labels_pr = ax_pr.get_legend_handles_labels() - handles = handles_pr + handles - labels = labels_pr + labels - - # Put the legend outside the plot - ax.legend(handles, labels, bbox_to_anchor=(1.10, 1), loc="upper left") - - # set formatting for xticks - date_fmt = DateFormatter("%Y-%m") - ax.xaxis.set_major_formatter(date_fmt) - ax.tick_params(axis="x", rotation=30) - - # calculate metrics for data table underneath plot - def calc_metric(metric) -> float: - return y_sim.apply(metric, observed_array=y_obs) - - metrs = pd.DataFrame( - { - "nse": calc_metric(metrics.nse), - "kge_2009": calc_metric(metrics.kge_2009), - "sa": calc_metric(metrics.sa), - "me": calc_metric(metrics.me), - } - ) - - # convert data in dataframe to strings - cell_text = [[f"{item:.2f}" for item in row[1]] for row in metrs.iterrows()] - - table = ax_tbl.table( - cellText=cell_text, - rowLabels=metrs.index, - colLabels=metrs.columns, - loc="center", - ) - ax_tbl.set_axis_off() - - # give more vertical space in cells - table.scale(1, 1.5) - - if filename is not None: - fig.savefig(filename, bbox_inches="tight", dpi=dpi) - - return fig, (ax, ax_tbl) +__all__ = ["hydrograph"] diff --git a/src/ewatercycle/analysis/hydrograph.py b/src/ewatercycle/analysis/hydrograph.py new file mode 100644 index 00000000..fedf21bd --- /dev/null +++ b/src/ewatercycle/analysis/hydrograph.py @@ -0,0 +1,281 @@ +"""Analysis methods for eWaterCycle.""" + +import os + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import xarray as xr +from HydroErr.HydroErr import function_list +from hydrostats import metrics # noqa: F401 +from matplotlib.axes import Axes +from matplotlib.dates import AutoDateLocator, DateFormatter +from matplotlib.figure import Figure + +# metrics from https://hydroerr.readthedocs.io/en/stable/list_of_metrics.html callable + +metric_map = {func.name: func for func in function_list} # full names +metric_map.update({func.abbr: func for func in function_list}) # abbreviations +metric_map.update({func.__name__: func for func in function_list}) # function names +metric_map.update({k.lower(): v for k, v in metric_map.items()}) # lowercase + + +def _downsample(df, nrows=100, agg="mean"): + """Resample dataframe with datetimeindex to a fixed number of rows.""" + if len(df) <= nrows: + return df, df.index[1] - df.index[0] + + grouper = np.arange(len(df)) // (len(df) / nrows) + new_df = df.groupby(grouper).agg(agg) + new_df.index = pd.date_range(df.index[0], df.index[-1], periods=nrows) + + new_period = (df.index[-1] - df.index[0]) / nrows + + return new_df, new_period + + +def _to_pandas(data_in): + """Convert input to pandas DataFrame if it is not already. + + Args: + data_in : supported data types: pd.Dataframe, xr.Dataset + + Returns: + pd.Dataframe + + Raises: + Typerror if the input is not supported + """ + # already a DataFrame + if isinstance(data_in, pd.DataFrame): + return data_in + + # Single series + if isinstance(data_in, pd.Series): + msg = "A panda series contains only a single timeseries, please provide a pandas DataFrame or xr.Dataset." # noqa: E501 + raise TypeError(msg) + + # xarray Dataset + if isinstance(data_in, xr.Dataset): + return data_in.to_pandas() + + # xarray DataArray + if isinstance(data_in, xr.DataArray): + if data_in.ndim == 1: + msg = "A DataArray with a single timeseries is not supported, please provide a DataFrame or xr.Dataset." # noqa: E501 + raise TypeError(msg) + else: # noqa: RET506 + msg = "DataArray with more than one dimension is not supported, please provide a DataFrame or xr.Dataset." # noqa: E501 + raise TypeError(msg) + + # unsupported type + msg = f"Unsupported data type: {type(data_in)}, please provide a DataFrame or xr.Dataset" # noqa: E501 + raise TypeError(msg) + + +def _prepare_discharge(discharge, reference: str, selected_year=None): + """Prepare discharge data for hydrograph.""" + discharge = _to_pandas(discharge) + + # Slice by selected_year if provided + if selected_year is not None: + if not isinstance(discharge.index, pd.DatetimeIndex): + msg = "Discharge index must be a DatetimeIndex to select a year." + raise ValueError(msg) + discharge = discharge[discharge.index.year == selected_year] + y_obs = discharge[reference] + y_sim = discharge.drop(columns=[reference]) + return y_obs, y_sim + + +def _prepare_precipitation(precipitation, nbars=None, selected_year=None): + if precipitation is None: + return None, None + precipitation = _to_pandas(precipitation) + if nbars is not None: + precipitation, barwidth = _downsample(precipitation, nrows=nbars, agg="mean") + else: + barwidth = 0.8 # default value for matplotlib barplot + + if selected_year is not None: + if not isinstance(precipitation.index, pd.DatetimeIndex): + msg = "Precipitation index must be a DatetimeIndex to select a year." + raise ValueError(msg) + precipitation = precipitation[precipitation.index.year == selected_year] + + return precipitation, barwidth + + +def _plot_discharge(ax, y_obs, y_sim, **kwargs): + if hasattr(y_sim, "shape") and y_sim.shape[1] > 1: + y_obs.plot(ax=ax, linewidth=2.5, zorder=10, **kwargs) + y_sim.plot(ax=ax, alpha=0.7, linewidth=1.25, **kwargs) + else: + y_obs.plot(ax=ax, **kwargs, zorder=10) + y_sim.plot(ax=ax, **kwargs) + ax.grid(True) + return ax + + +def _plot_precipitation(ax, precipitation, barwidth, precipitation_units): + ax_pr = ax.twinx() + ax_pr.invert_yaxis() + ax_pr.set_ylabel(f"Precipitation ({precipitation_units})") + + for pr_label, pr_timeseries in precipitation.items(): + ax_pr.bar( + pr_timeseries.index.values, + pr_timeseries.values, + width=barwidth, + alpha=0.4, + label=pr_label, + ) + + # adjust ylim + ax_pr.set_ylim(ax_pr.get_ylim()[0] * (7 / 2), 0) + ax.set_ylim(0, ax.get_ylim()[1] * (7 / 5)) + return ax_pr + + +def _calculate_metrics(y_obs, y_sim, metrics_list=None): + if metrics_list is None: + metrics_list = ["NSE", "KGE (2009)", "SA", "ME"] + + metrics_objs = [] + for m in metrics_list: + if isinstance(m, str): + if m in metric_map: + metrics_objs.append(metric_map[m]) + elif m.lower() in metric_map: + metrics_objs.append(metric_map[m.lower()]) + else: + msg = f"Metric '{m}' not found in hydroerr metrics." + raise ValueError(msg) + else: + metrics_objs.append(m) + + def calc_metric(metric) -> float: + return y_sim.apply(metric, observed_array=y_obs) + + df_metrics = pd.DataFrame( + {metric.name: calc_metric(metric) for metric in metrics_objs} + ) + return df_metrics, metrics_objs + + +def _create_metrics_table(ax_tbl, df_metrics, metrics_objs): + col_labels = [f"{metric.name}\n({metric.abbr})" for metric in metrics_objs] + metrs_rounded = df_metrics.round(2) + cell_text = [[f"{item:.2f}" for item in row[1]] for row in metrs_rounded.iterrows()] + table = ax_tbl.table( + cellText=cell_text, + rowLabels=metrs_rounded.index, + colLabels=col_labels, + loc="center", + fontsize=15, + ) + ax_tbl.set_axis_off() + for (i, j), cell in table.get_celld().items(): + if i == 0: # headers + cell.set_fontsize(12) + cell.set_text_props(weight="bold") + cell.set_height(0.15) + elif j == -1: # row labels + cell.set_fontsize(11) + cell.set_text_props(weight="bold") + else: # table values + cell.set_fontsize(10) + table.scale(1, 1.5) + return table + + +def hydrograph( + discharge: pd.DataFrame | pd.Series | xr.DataArray | xr.Dataset, + *, + reference: str, + precipitation: pd.DataFrame | pd.Series | xr.DataArray | xr.Dataset | None = None, + dpi: int | None = None, + title: str | None = None, + discharge_units: str = "m$^3$ s$^{-1}$", + precipitation_units: str = "mm day$^{-1}$", + figsize: tuple[float, float] = (10, 10), + filename: os.PathLike | str | None = None, + nbars: int | None = None, + metrics_list: list[object] | None = None, + selected_year: int | None = None, + **kwargs, +) -> tuple[Figure, tuple[Axes, Axes]]: + """Plot a hydrograph. + + This utility function makes it convenient to create a hydrograph from + a set of discharge data from a `pandas.DataFrame` or `xarray.Dataset`. A column must + be marked as the reference, so that the agreement metrics can be calculated. + + Optionally, the corresponding precipitation data can be plotted for + comparison. + + Args: + discharge: Data containing time series of discharge data to be plotted. + reference: Name of the reference data, must correspond to a column + in the discharge dataframe. Metrics are calculated between + the reference column and each of the other columns. + precipitation: Optional dataframe containing time series of precipitation data + to be plotted from the top of the hydrograph. + dpi: DPI for the plot. + title: Title of the hydrograph. + discharge_units: Units for the discharge data. + precipitation_units: Units for the precipitation data. + figsize: With, height of the plot in inches. + filename: If specified, a copy of the plot will be saved to this path. + nbars: Number of bars to use for downsampling precipitation. + metrics_list: List of metrics to calculate and display in the table below + the hydrograph. If not specified, a default set of metrics is used. + selected_year: Slices a single year from the data + **kwargs: Options to pass to the matplotlib plotting function + + Returns: + First tuple member is a matplotlib figure, the second is a tuple of axes. + """ + y_obs, y_sim = _prepare_discharge(discharge, reference, selected_year) + precipitation, barwidth = _prepare_precipitation( + precipitation, nbars, selected_year + ) + + fig, (ax, ax_tbl) = plt.subplots( + nrows=2, + ncols=1, + dpi=dpi, + figsize=figsize, + gridspec_kw={"height_ratios": [3, 1]}, + ) + fig.subplots_adjust(bottom=0.05, top=0.95, hspace=0.3) + + ax.set_title( + title or f"$\\mathbf{{Hydrograph\\ with\\ Metrics}}$\nReference: {reference}" + ) + + ax.set_ylabel(f"Discharge ({discharge_units})") + + _plot_discharge(ax, y_obs, y_sim, **kwargs) + + if precipitation is not None: + ax_pr = _plot_precipitation(ax, precipitation, barwidth, precipitation_units) + handles, labels = ax_pr.get_legend_handles_labels() + else: + handles, labels = ax.get_legend_handles_labels() + + ax.legend(handles, labels, bbox_to_anchor=(1.10, 1), loc="upper left") + + locator = AutoDateLocator(minticks=5, maxticks=12) # adjust min/max ticks + ax.xaxis.set_major_locator(locator) + ax.xaxis.set_major_formatter(DateFormatter("%Y-%m")) + ax.tick_params(axis="x", rotation=30) + ax.set_xlabel(None) # no xlabel, ticks make it clear + + df_metrics, metrics_objs = _calculate_metrics(y_obs, y_sim, metrics_list) + _create_metrics_table(ax_tbl, df_metrics, metrics_objs) + + if filename: + fig.savefig(filename, bbox_inches="tight", dpi=dpi) + + return fig, (ax, ax_tbl) diff --git a/tests/src/baseline_images/test_analysis/hydrograph_DataFrame.png b/tests/src/baseline_images/test_analysis/hydrograph_DataFrame.png new file mode 100644 index 00000000..9084bf55 Binary files /dev/null and b/tests/src/baseline_images/test_analysis/hydrograph_DataFrame.png differ diff --git a/tests/src/baseline_images/test_analysis/hydrograph_xarray.png b/tests/src/baseline_images/test_analysis/hydrograph_xarray.png new file mode 100644 index 00000000..1b183436 Binary files /dev/null and b/tests/src/baseline_images/test_analysis/hydrograph_xarray.png differ diff --git a/tests/src/baseline_images/test_analysis/hydrograph_xarray_single_comparison.png b/tests/src/baseline_images/test_analysis/hydrograph_xarray_single_comparison.png new file mode 100644 index 00000000..c1859932 Binary files /dev/null and b/tests/src/baseline_images/test_analysis/hydrograph_xarray_single_comparison.png differ diff --git a/tests/src/baseline_images/test_analysis/hydrograph_xarray_single_year.png b/tests/src/baseline_images/test_analysis/hydrograph_xarray_single_year.png new file mode 100644 index 00000000..b4d7c31c Binary files /dev/null and b/tests/src/baseline_images/test_analysis/hydrograph_xarray_single_year.png differ diff --git a/tests/src/test_analysis.py b/tests/src/test_analysis.py deleted file mode 100644 index b289d8d1..00000000 --- a/tests/src/test_analysis.py +++ /dev/null @@ -1,35 +0,0 @@ -import numpy as np -import pandas as pd -from matplotlib.testing.decorators import image_comparison - -from ewatercycle.analysis import hydrograph - - -@image_comparison( - baseline_images=["hydrograph"], - extensions=["png"], - savefig_kwarg={"bbox_inches": "tight"}, -) -def test_hydrograph(): - ntime = 3000 - - dti = pd.date_range("2018-01-01", periods=ntime, freq="d") - - np.random.seed(20210416) - - discharge = { - "discharge_a": pd.Series(np.linspace(0, 2, ntime), index=dti), - "discharge_b": pd.Series(3 * np.random.random(ntime) ** 2, index=dti), - "discharge_c": pd.Series(2 * np.random.random(ntime) ** 2, index=dti), - "reference": pd.Series(np.random.random(ntime) ** 2, index=dti), - } - - df_q = pd.DataFrame(discharge) - - precipitation = { - "precipitation_a": pd.Series(np.random.random(ntime) / 20, index=dti), - "precipitation_b": pd.Series(np.random.random(ntime) / 30, index=dti), - } - - df_pr = pd.DataFrame(precipitation) - hydrograph(df_q, reference="reference", precipitation=df_pr, nbars=100) diff --git a/tests/src/test_analysis_hydrograph.py b/tests/src/test_analysis_hydrograph.py new file mode 100644 index 00000000..77078e4b --- /dev/null +++ b/tests/src/test_analysis_hydrograph.py @@ -0,0 +1,131 @@ +from pathlib import Path + +import numpy as np +import pandas as pd +import xarray as xr + +from ewatercycle.analysis.hydrograph import hydrograph + + +def _create_data(): + """Create sample data for testing.""" + ntime = 3000 + + dti = pd.date_range("2018-01-01", periods=ntime, freq="d") + + np.random.seed(20210416) + t = np.arange(ntime) + + discharge_wave = { + "discharge_a": pd.Series( + 1 + np.sin(2 * np.pi * (t - 60) / 365), # sinus wave 0-2 centered at 1 + index=dti, + ), + "discharge_b": pd.Series( + 1.1 + np.sin(2 * np.pi * t / 365), # sinus wave offset ~2 months + index=dti, + ), + "discharge_c": pd.Series( + 1 + np.cos(2 * np.pi * t / 365), # cosine wave + index=dti, + ), + "reference": pd.Series( + 1 + + np.sin(2 * np.pi * t / 365) + + 0.03 * np.random.randn(ntime), # sinus + noise + index=dti, + ), + } + + df_q_wave = pd.DataFrame(discharge_wave) + + precipitation = { + "precipitation_a": pd.Series(np.random.random(ntime) / 20, index=dti), + "precipitation_b": pd.Series(np.random.random(ntime) / 30, index=dti), + } + + df_pr = pd.DataFrame(precipitation) + return df_q_wave, df_pr + + +def _save_figure(fig, fname): + """Save figure to baseline directory.""" + baseline_dir = "tests/src/baseline_images/test_analysis" + fig_path = Path(baseline_dir) / fname + fig.savefig(fig_path, bbox_inches="tight") + + +def test_hydrograph(): + """Test hydrograph with pandas DataFrame input.""" + df_q, df_pr = _create_data() + fig, (ax, ax_tbl) = hydrograph( + df_q, reference="reference", precipitation=df_pr, nbars=100 + ) + + _save_figure(fig, "hydrograph_DataFrame.png") + + # + assert len(ax.lines) == 4 # 3 discharge + 1 reference + assert ax_tbl.tables + + +def test_hydrograph_xarray(): + """Test hydrograph with xarray Dataset input.""" + df_q = _create_data()[0] + ds_q = xr.Dataset.from_dataframe(df_q) + + fig, (ax, ax_tbl) = hydrograph( + ds_q, reference="reference", metrics_list=["kge_2009", "nse_mod", "male"] + ) + + _save_figure(fig, "hydrograph_xarray.png") + + # + assert len(ax.lines) == 4 # 3 discharge + 1 reference + assert ax_tbl.tables + + +def test_hydrograph_xarray_single_year(): + """Test hydrograph with xarray Dataset input and selecting a single year.""" + df_q = _create_data()[0] + ds_q = xr.Dataset.from_dataframe(df_q) + + fig, (ax, ax_tbl) = hydrograph(ds_q, reference="reference", selected_year=2020) + + _save_figure(fig, "hydrograph_xarray_single_year.png") + + # + assert len(ax.lines) == 4 # 3 discharge + 1 reference + assert ax_tbl.tables + + +def test_hydrograph_xarray_single_hydrograph(): + """Test hydrograph with xarray Dataset input and only one discharge to commpare.""" + df_q = _create_data()[0] + ds_q = xr.Dataset.from_dataframe(df_q) + ds_q = ds_q.drop_vars(["discharge_b", "discharge_c"]) + + fig, (ax, ax_tbl) = hydrograph(ds_q, reference="reference") + + _save_figure(fig, "hydrograph_xarray_single_comparison.png") + + # + assert len(ax.lines) == 2 # 3 discharge + 1 reference + assert ax_tbl.tables + + +def test_hydrograph_series_error(): + """Test hydrograph raises error with pandas Series input.""" + df_q, df_pr = _create_data() + ser_q = df_q["discharge_a"] + + try: + hydrograph(ser_q, reference="discharge_a", precipitation=df_pr, nbars=100) + except TypeError as e: + assert ( + str(e) + == "A panda series contains only a single timeseries, please provide a pandas DataFrame or xr.Dataset." + ) + else: + msg = "TypeError not raised" + raise AssertionError(msg)